Commit e310706

New L2 normalization in TF-IDF document aligner
Before, infrequent and very frequent n-grams were not taken into account in the L2 normalization of the document score calculation. Now we follow these rules:

- If the n-gram is in `df`, use the normal TF-IDF.
- If the n-gram is not in `df`, calculate the TF-IDF assuming a document frequency of 1.
- If the n-gram is not in `df` but is a frequent one (as defined by the parameter `max_count`), do not calculate a TF-IDF at all; the n-gram is ignored.
1 parent 3805f4e commit e310706
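
To make the new rules concrete, here is a minimal self-contained sketch of the per-n-gram decision. The helper name `ngram_score` and the `NGram` alias are hypothetical, and the `tf * log(N / df)` form of `tfidf` is an assumption (the real helper lives in document-aligner/src/document.cpp and its body is not part of this diff):

	#include <cmath>
	#include <cstddef>
	#include <cstdint>
	#include <optional>
	#include <unordered_map>
	#include <unordered_set>

	using NGram = std::uint64_t; // stand-in for the project's hashed n-gram type

	// Assumed tf * log(N / df) form; the real tfidf() body is not shown in this commit.
	inline float tfidf(std::size_t tf, std::size_t doc_count, std::size_t df) {
		return tf * std::log(static_cast<float>(doc_count) / df);
	}

	// Score for a single n-gram, or std::nullopt when it must be ignored.
	std::optional<float> ngram_score(NGram ngram, std::size_t tf, std::size_t doc_count,
	                                 std::unordered_map<NGram, std::size_t> const &df,
	                                 std::unordered_set<NGram> const &max_ngram_pruned) {
		auto it = df.find(ngram);
		if (it != df.end())
			return tfidf(tf, doc_count, it->second); // rule 1: n-gram in df, normal TF-IDF
		if (max_ngram_pruned.count(ngram) == 0)
			return tfidf(tf, doc_count, 1);          // rule 2: unseen, assume df = 1
		return std::nullopt;                         // rule 3: too frequent, ignore
	}

Every score returned here feeds the L2 norm; previously any n-gram missing from `df` was skipped outright, so it never contributed to the normalization.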

6 files changed: +48 -39 lines

document-aligner/docalign.cpp (+24 -19)

@@ -1,11 +1,10 @@
 #include <iostream>
 #include <iomanip>
 #include <fstream>
-#include <map>
 #include <unordered_map>
+#include <unordered_set>
 #include <thread>
 #include <memory>
-#include <mutex>
 #include <vector>
 #include <cmath>
 #include <boost/program_options.hpp>
@@ -64,16 +63,16 @@ template <typename T> void stop(blocking_queue<unique_ptr<T>> &queue, vector<thr
 
 ostream &operator<<(ostream &out, queue_performance const &performance) {
 	return out << "  underflow: " << performance.underflow << '\n'
-	           << "   overflow: " << performance.overflow << '\n';
+		<< "   overflow: " << performance.overflow << '\n';
 }
 
 void print_score(float score, size_t left_id, size_t right_id)
 {
 	cout << fixed << setprecision(5)
-	     << score
-	     << '\t' << left_id
-	     << '\t' << right_id
-	     << '\n';
+		<< score
+		<< '\t' << left_id
+		<< '\t' << right_id
+		<< '\n';
 }
 
 size_t queue_lines(util::FilePiece &fin, blocking_queue<unique_ptr<vector<Line>>> &queue, size_t skip_rate = 1)
@@ -97,7 +96,7 @@ size_t queue_lines(util::FilePiece &fin, blocking_queue<unique_ptr<vector<Line>>
 			break;
 		}
 
-		queue.push(move(line_batch));
+		queue.push(std::move(line_batch));
 	}
 
 	return document_count;
@@ -163,8 +162,8 @@ int main(int argc, char *argv[])
 
 	if (vm.count("help") || !vm.count("translated-tokens") || !vm.count("english-tokens")) {
 		cout << "Usage: " << argv[0]
-		     << " TRANSLATED-TOKENS ENGLISH-TOKENS\n\n"
-		     << generic_desc << endl;
+			<< " TRANSLATED-TOKENS ENGLISH-TOKENS\n\n"
+			<< generic_desc << endl;
 		return 1;
 	}
 
@@ -185,6 +184,7 @@ int main(int argc, char *argv[])
 	// that parse documents and keep a local hash table for counting. At the
 	// end these tables are merged into df.
 	unordered_map<NGram,size_t> df;
+	unordered_set<NGram> max_ngram_pruned;
 	size_t in_document_cnt, en_document_cnt, document_cnt;
 
 	{
@@ -243,14 +243,19 @@ int main(int argc, char *argv[])
 			if (entry.second < min_ngram_cnt)
 				continue;
 
-			if (entry.second > max_ngram_cnt)
+			if (entry.second > max_ngram_cnt) {
+				max_ngram_pruned.insert(entry.first);
 				continue;
+			}
 
 			pruned_df[entry.first] = entry.second;
 		}
 
-		if (verbose)
-			cerr << "Pruned " << df.size() - pruned_df.size() << " (" << 100.0 - 100.0 * pruned_df.size() / df.size() << "%) entries from DF" << endl;
+		if (verbose) {
+			cerr << "Pruned " << df.size() - pruned_df.size() << " (" << 100.0 - 100.0 * pruned_df.size() / df.size()
+				<< "%) entries from DF" << endl;
+			cerr << "Very frequent ngram set is now " << max_ngram_pruned.size() << " long." << endl;
+		}
 
 		swap(df, pruned_df);
 	}
@@ -262,7 +267,7 @@ int main(int argc, char *argv[])
 	mutex ref_index_mutex;
 
 	blocking_queue<unique_ptr<vector<Line>>> queue(n_load_threads * QUEUE_SIZE_PER_THREAD);
-	vector<thread> workers(start(n_load_threads, [&queue, &ref_index, &ref_index_mutex, &df, &document_cnt, &ngram_size]() {
+	vector<thread> workers(start(n_load_threads, [&queue, &ref_index, &ref_index_mutex, &df, &max_ngram_pruned, &document_cnt, &ngram_size]() {
 		unordered_map<NGram, vector<DocumentNGramScore>> local_ref_index;
 
 		while (true) {
@@ -280,7 +285,7 @@ int main(int argc, char *argv[])
 				// so there should be no concurrency issue.
 				// DF is accessed read-only. N starts counting at 1.
 				DocumentRef ref;
-				calculate_tfidf(doc, ref, document_cnt, df);
+				calculate_tfidf(doc, ref, document_cnt, df, max_ngram_pruned);
 
 				for (auto const &entry : ref.wordvec) {
 					local_ref_index[entry.hash].push_back(DocumentNGramScore{
@@ -330,7 +335,7 @@ int main(int argc, char *argv[])
 
 	blocking_queue<unique_ptr<vector<DocumentRef>>> score_queue(n_score_threads * QUEUE_SIZE_PER_THREAD);
 
-	vector<thread> read_workers(start(n_read_threads, [&read_queue, &score_queue, &document_cnt, &df, &ngram_size]() {
+	vector<thread> read_workers(start(n_read_threads, [&read_queue, &score_queue, &document_cnt, &df, &max_ngram_pruned, &ngram_size]() {
 		while (true) {
 			unique_ptr<vector<Line>> line_batch(read_queue.pop());
 
@@ -346,10 +351,10 @@ int main(int argc, char *argv[])
 				ReadDocument(line.str, doc, ngram_size);
 
 				ref_batch->emplace_back();
-				calculate_tfidf(doc, ref_batch->back(), document_cnt, df);
+				calculate_tfidf(doc, ref_batch->back(), document_cnt, df, max_ngram_pruned);
 			}
 
-			score_queue.push(move(ref_batch));
+			score_queue.push(std::move(ref_batch));
 		}
 	}));
 
@@ -459,7 +464,7 @@ int main(int argc, char *argv[])
 
 	if (verbose)
 		cerr << "Read queue performance (Note: blocks when score queue fills up):\n" << read_queue.performance()
-		     << "Score queue performance:\n" << score_queue.performance();
+			<< "Score queue performance:\n" << score_queue.performance();
 	}
 
 	return 0;
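
One subtlety in the pruning hunk above: only n-grams above `max_ngram_cnt` are remembered in `max_ngram_pruned`. N-grams below `min_ngram_cnt` are dropped from `df` without being remembered, so at scoring time they fall under the assume-a-document-frequency-of-1 rule. A sketch of the three fates, with an illustrative enum and helper that do not exist in the codebase:

	#include <cstddef>

	enum class Fate {
		Keep,            // stays in pruned_df and is scored with its real df
		DropSilently,    // below min_ngram_cnt: later treated as unseen (df assumed 1)
		DropAndRemember, // above max_ngram_cnt: later ignored via max_ngram_pruned
	};

	// Illustrative restatement of the pruning rules in the hunk above.
	Fate classify(std::size_t count, std::size_t min_ngram_cnt, std::size_t max_ngram_cnt) {
		if (count < min_ngram_cnt)
			return Fate::DropSilently;
		if (count > max_ngram_cnt)
			return Fate::DropAndRemember;
		return Fate::Keep;
	}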

document-aligner/src/blocking_queue.h (+1 -1)

@@ -14,7 +14,7 @@ struct queue_performance {
 template <typename T> class blocking_queue
 {
 public:
-	explicit blocking_queue(size_t capacity);
+	explicit blocking_queue(size_t size);
 
 	void push(T const &item);
 	void push(T &&item);

document-aligner/src/document.cpp (+20 -15)

@@ -1,8 +1,6 @@
 #include "document.h"
 #include "base64.h"
 #include "ngram.h"
-#include <sstream>
-#include <iostream>
 #include <cmath>
 
 using namespace std;
@@ -32,32 +30,39 @@ inline float tfidf(size_t tf, size_t dc, size_t df) {
  * across all documents. Only terms that are seen in this document and in the document frequency table are
  * counted. All other terms are ignored.
  */
-void calculate_tfidf(Document const &document, DocumentRef &document_ref, size_t document_count, unordered_map<NGram, size_t> const &df) {
+void calculate_tfidf(Document const &document, DocumentRef &document_ref, size_t document_count, unordered_map<NGram, size_t> const &df, unordered_set<NGram> const &max_ngram_pruned) {
 	document_ref.id = document.id;
 
 	document_ref.wordvec.clear();
 	document_ref.wordvec.reserve(document.vocab.size());
 
 	float total_tfidf_l2 = 0;
-	
+
 	for (auto const &entry : document.vocab) {
 		// How often does the term occur in the whole dataset?
 		auto it = df.find(entry.first);
 
-		// Skip words that are not in the document frequency map entirely.
-		// (Matches Python implementation)
-		if (it == df.end())
-			continue;
-
-		float document_tfidf = tfidf(entry.second, document_count, it->second);
+		float document_tfidf;
+
+		if (it == df.end()) {
+			if (max_ngram_pruned.find(entry.first) == max_ngram_pruned.end()) {
+				document_tfidf = tfidf(entry.second, document_count, 1);
+			}
+			else {
+				continue;
+			}
+		}
+		else {
+			document_tfidf = tfidf(entry.second, document_count, it->second);
+
+			document_ref.wordvec.push_back(WordScore{
+				.hash = entry.first,
+				.tfidf = document_tfidf
+			});
+		}
 
 		// Keep track of the squared sum of all values for L2 normalisation
 		total_tfidf_l2 += document_tfidf * document_tfidf;
-
-		document_ref.wordvec.push_back(WordScore{
-			.hash = entry.first,
-			.tfidf = document_tfidf
-		});
 	}
 
 	// Normalize
document-aligner/src/document.h (+2 -1)

@@ -3,6 +3,7 @@
 #include "ngram.h"
 #include <istream>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 namespace bitextor {
@@ -31,6 +32,6 @@ struct DocumentRef {
 // Assumes base64 encoded still.
 void ReadDocument(const util::StringPiece &encoded, Document &to, size_t ngram_size);
 
-void calculate_tfidf(Document const &document, DocumentRef &document_ref, size_t document_count, std::unordered_map<NGram, size_t> const &df);
+void calculate_tfidf(Document const &document, DocumentRef &document_ref, size_t document_count, std::unordered_map<NGram, size_t> const &df, std::unordered_set<NGram> const &max_ngram_pruned);
 
 } // namespace bitextor

document-aligner/src/ngram.cpp (-2)

@@ -1,7 +1,5 @@
 #include "ngram.h"
 #include "murmur_hash.h"
-#include <sstream>
-#include <iostream>
 
 using namespace std;
 
document-aligner/src/ngram.h (+1 -1)

@@ -22,7 +22,7 @@ class NGramIter : public boost::iterator_facade<NGramIter, const NGram, boost::f
 		return end_;
 	}
 
-	inline operator bool() const {
+	inline explicit operator bool() const {
 		return !end_;
 	}

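Making the conversion operator `explicit` keeps contextual uses such as `if (iter)` or `while (iter)` compiling while rejecting the silent conversions that a plain `operator bool()` allows. A quick illustration with a stand-in type:

	#include <cassert>

	// Stand-in for NGramIter; not the project's real class.
	struct Iter {
		bool end_ = false;
		explicit operator bool() const { return !end_; }
	};

	int main() {
		Iter it;
		if (it) { /* fine: contextual conversion to bool */ }
		bool ok = static_cast<bool>(it); // fine: explicit cast
		// bool b = it;  // no longer compiles: copy-initialisation is rejected
		// int n = it;   // no longer compiles: no implicit conversion chain
		assert(ok);
		return 0;
	}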