From 58956b3ffc8e173dce40848ceba18e89789717c0 Mon Sep 17 00:00:00 2001
From: Thomas Goyne <plorkyeran@aegisub.org>
Date: Sat, 6 Sep 2014 09:16:44 -0700
Subject: [PATCH] Optimize Extradata handling a bit

---
 src/ass_file.cpp            | 97 +++++++++++++++++++++++--------------
 src/ass_file.h              | 12 +++--
 src/ass_parser.cpp          |  2 +-
 src/auto4_lua_assfile.cpp   |  7 +--
 src/subs_controller.cpp     |  2 +-
 src/subtitle_format_ass.cpp | 14 +++---
 6 files changed, 81 insertions(+), 53 deletions(-)

diff --git a/src/ass_file.cpp b/src/ass_file.cpp
index bd605d9bc..75c44c8f9 100644
--- a/src/ass_file.cpp
+++ b/src/ass_file.cpp
@@ -28,6 +28,8 @@
 #include <boost/algorithm/string/predicate.hpp>
 #include <boost/filesystem/path.hpp>
 #include <cassert>
+#include <unordered_map>
+#include <unordered_set>
 
 AssFile::AssFile() { }
 
@@ -232,58 +234,79 @@ void AssFile::Sort(EntryList<AssDialogue> &lst, CompFunc comp, std::set<AssDialo
 uint32_t AssFile::AddExtradata(std::string const& key, std::string const& value) {
 	for (auto const& data : Extradata) {
 		// perform brute-force deduplication by simple key and value comparison
-		if (key == data.second.first && value == data.second.second) {
-			return data.first;
+		if (key == data.key && value == data.value) {
+			return data.id;
 		}
 	}
-	// next_extradata_id must not exist
-	assert(Extradata.find(next_extradata_id) == Extradata.end());
-	Extradata[next_extradata_id] = {key, value};
+	Extradata.push_back(ExtradataEntry{next_extradata_id, key, value});
 	return next_extradata_id++; // return old value, then post-increment
 }
 
-std::map<std::string, std::string> AssFile::GetExtradata(std::vector<uint32_t> const& id_list) const {
-	// If multiple IDs have the same key name, the last ID wins
-	std::map<std::string, std::string> result;
+namespace {
+struct extradata_id_cmp {
+	bool operator()(ExtradataEntry const& e, uint32_t id) {
+		return e.id < id;
+	}
+	bool operator()(uint32_t id, ExtradataEntry const& e) {
+		return id < e.id;
+	}
+};
+
+template<typename ExtradataType, typename Func>
+void enumerate_extradata(ExtradataType&& extradata, std::vector<uint32_t> const& id_list, Func&& f) {
+	auto begin = extradata.begin(), end = extradata.end();
 	for (auto id : id_list) {
-		auto it = Extradata.find(id);
-		if (it != Extradata.end())
-			result[it->second.first] = it->second.second;
+		auto it = lower_bound(begin, end, id, extradata_id_cmp{});
+		if (it != end) {
+			f(*it);
+			begin = it;
+		}
 	}
+}
+
+template<typename K, typename V>
+using reference_map = std::unordered_map<std::reference_wrapper<const K>, V, std::hash<K>, std::equal_to<K>>;
+}
+
+std::vector<ExtradataEntry> AssFile::GetExtradata(std::vector<uint32_t> const& id_list) const {
+	std::vector<ExtradataEntry> result;
+	enumerate_extradata(Extradata, id_list, [&](ExtradataEntry const& e) {
+		result.push_back(e);
+	});
 	return result;
 }
 
 void AssFile::CleanExtradata() {
-	// Collect all IDs existing in the database
-	// Then remove all IDs found to be in use from this list
-	// Remaining is then all garbage IDs
-	std::vector<uint32_t> ids;
-	for (auto& it : Extradata)
-		ids.push_back(it.first);
-	if (ids.empty()) return;
-
-	// For each line, find which IDs it actually uses and remove them from the unused-list
+	if (Extradata.empty()) return;
+
+	std::unordered_set<uint32_t> ids_used;
 	for (auto& line : Events) {
+		if (line.ExtradataIds.get().empty()) continue;
+
 		// Find the ID for each unique key in the line
-		std::map<std::string, uint32_t> key_ids;
-		for (auto id : line.ExtradataIds.get()) {
-			auto ed_it = Extradata.find(id);
-			if (ed_it == Extradata.end())
-				continue;
-			key_ids[ed_it->second.first] = id;
-		}
-		// Update the line's ID list to only contain the actual ID for any duplicate keys
-		// Also mark found IDs as used in the cleaning list
-		std::vector<uint32_t> new_ids;
-		for (auto& keyid : key_ids) {
-			new_ids.push_back(keyid.second);
-			ids.erase(remove(begin(ids), end(ids), keyid.second), end(ids));
+		reference_map<std::string, uint32_t> keys_used;
+		enumerate_extradata(Extradata, line.ExtradataIds.get(), [&](ExtradataEntry const& e) {
+			keys_used[e.key] = e.id;
+		});
+
+		for (auto const& used : keys_used)
+			ids_used.insert(used.second);
+
+		// If any keys were duplicated or missing, update the id list
+		if (keys_used.size() != line.ExtradataIds.get().size()) {
+			std::vector<uint32_t> ids;
+			ids.reserve(keys_used.size());
+			for (auto const& used : keys_used)
+				ids.push_back(used.second);
+			std::sort(begin(ids), end(ids));
+			line.ExtradataIds = std::move(ids);
 		}
-		line.ExtradataIds = new_ids;
 	}
 
-	// The ids list should contain only unused IDs now
-	for (auto id : ids) {
-		Extradata.erase(id);
+	if (ids_used.size() != Extradata.size()) {
+		// Erase all no-longer-used extradata entries
+		Extradata.erase(std::remove_if(begin(Extradata), end(Extradata), [&](ExtradataEntry const& e) {
+			return !ids_used.count(e.id);
+		}), end(Extradata));
 	}
 }
diff --git a/src/ass_file.h b/src/ass_file.h
index bc098c727..f291c2ea3 100644
--- a/src/ass_file.h
+++ b/src/ass_file.h
@@ -33,6 +33,7 @@
 #include <libaegisub/signal.h>
 
 #include <boost/intrusive/list.hpp>
+#include <map>
 #include <set>
 #include <vector>
 
@@ -45,7 +46,11 @@ class wxString;
 template<typename T>
 using EntryList = typename boost::intrusive::make_list<T, boost::intrusive::constant_time_size<false>, boost::intrusive::base_hook<AssEntryListHook>>::type;
 
-using AegisubExtradataMap = std::map<uint32_t, std::pair<std::string, std::string>>;
+struct ExtradataEntry {
+	uint32_t id;
+	std::string key;
+	std::string value;
+};
 
 struct AssFileCommit {
 	wxString const& message;
@@ -83,7 +88,7 @@ public:
 	EntryList<AssStyle> Styles;
 	EntryList<AssDialogue> Events;
 	std::vector<AssAttachment> Attachments;
-	AegisubExtradataMap Extradata;
+	std::vector<ExtradataEntry> Extradata;
 	ProjectProperties Properties;
 
 	uint32_t next_extradata_id = 0;
@@ -127,7 +132,7 @@ public:
 	/// @return ID of the created entry
 	uint32_t AddExtradata(std::string const& key, std::string const& value);
 	/// Fetch all extradata entries from a list of IDs
-	std::map<std::string, std::string> GetExtradata(std::vector<uint32_t> const& id_list) const;
+	std::vector<ExtradataEntry> GetExtradata(std::vector<uint32_t> const& id_list) const;
 	/// Remove unreferenced extradata entries
 	void CleanExtradata();
 
@@ -199,4 +204,3 @@ public:
 	/// @param limit If non-empty, only lines in this set are sorted
 	static void Sort(EntryList<AssDialogue>& lst, CompFunc comp = CompStart, std::set<AssDialogue*> const& limit = std::set<AssDialogue*>());
 };
-
diff --git a/src/ass_parser.cpp b/src/ass_parser.cpp
index 8ba47d0d4..c64950fef 100644
--- a/src/ass_parser.cpp
+++ b/src/ass_parser.cpp
@@ -219,7 +219,7 @@ void AssParser::ParseExtradataLine(std::string const &data) {
 
 		// ensure next_extradata_id is always at least 1 more than the largest existing id
 		target->next_extradata_id = std::max(id+1, target->next_extradata_id);
-		target->Extradata[id] = {key, value};
+		target->Extradata.push_back(ExtradataEntry{id, std::move(key), std::move(value)});
 	}
 }
 
diff --git a/src/auto4_lua_assfile.cpp b/src/auto4_lua_assfile.cpp
index 008389315..e4856016e 100644
--- a/src/auto4_lua_assfile.cpp
+++ b/src/auto4_lua_assfile.cpp
@@ -179,8 +179,8 @@ namespace Automation4 {
 			// create extradata table
 			lua_newtable(L);
 			for (auto const& ed : ass->GetExtradata(dia->ExtradataIds)) {
-				push_value(L, ed.first);
-				push_value(L, ed.second);
+				push_value(L, ed.key);
+				push_value(L, ed.value);
 				lua_settable(L, -3);
 			}
 			lua_setfield(L, -2, "extra");
@@ -309,7 +309,8 @@ namespace Automation4 {
 					get_string_or_default(L, -2),
 					get_string_or_default(L, -1)));
 			});
-			dia->ExtradataIds = new_ids;
+			std::sort(begin(new_ids), end(new_ids));
+			dia->ExtradataIds = std::move(new_ids);
 		}
 		else {
 			error(L, "Found line with unknown class: %s", lclass.c_str());
diff --git a/src/subs_controller.cpp b/src/subs_controller.cpp
index 5aff32e18..051a75fc0 100644
--- a/src/subs_controller.cpp
+++ b/src/subs_controller.cpp
@@ -57,7 +57,7 @@ struct SubsController::UndoInfo {
 	std::vector<AssStyle> styles;
 	std::vector<AssDialogueBase> events;
 	std::vector<AssAttachment> attachments;
-	AegisubExtradataMap extradata;
+	std::vector<ExtradataEntry> extradata;
 
 	mutable std::vector<int> selection;
 	int active_line_id = 0;
diff --git a/src/subtitle_format_ass.cpp b/src/subtitle_format_ass.cpp
index 77184df4e..be4c31356 100644
--- a/src/subtitle_format_ass.cpp
+++ b/src/subtitle_format_ass.cpp
@@ -122,8 +122,8 @@ struct Writer {
 			file.WriteLineToFile(key + std::to_string(n));
 	}
 
-	void WriteExtradata(AegisubExtradataMap const& extradata) {
-		if (extradata.size() == 0)
+	void WriteExtradata(std::vector<ExtradataEntry> const& extradata) {
+		if (extradata.empty())
 			return;
 
 		group = AssEntryGroup::EXTRADATA;
@@ -131,16 +131,16 @@ struct Writer {
 		file.WriteLineToFile("[Aegisub Extradata]");
 		for (auto const& edi : extradata) {
 			std::string line = "Data: ";
-			line += std::to_string(edi.first);
+			line += std::to_string(edi.id);
 			line += ",";
-			line += inline_string_encode(edi.second.first);
+			line += inline_string_encode(edi.key);
 			line += ",";
-			std::string encoded_data = inline_string_encode(edi.second.second);
-			if (4*edi.second.second.size() < 3*encoded_data.size()) {
+			std::string encoded_data = inline_string_encode(edi.value);
+			if (4*edi.value.size() < 3*encoded_data.size()) {
 				// the inline_string encoding grew the data by more than uuencoding would
 				// so base64 encode it instead
 				line += "u"; // marker for uuencoding
-				line += agi::ass::UUEncode(edi.second.second, false);
+				line += agi::ass::UUEncode(edi.value, false);
 			} else {
 				line += "e"; // marker for inline_string encoding (escaping)
 				line += encoded_data;
-- 
GitLab