Commit e310706

New L2 normalization in TF-IDF document aligner
Before, infrequent and very frequent n-grams were not taken into account in the L2 normalization of the document score calculation. Now we follow these rules:

- If the n-gram is in `df`, use the normal TF-IDF.
- If the n-gram is not in `df`, calculate the TF-IDF assuming a document frequency of 1.
- If the n-gram is not in `df` but is a frequent one (as defined by the parameter `max_count`), do not calculate a TF-IDF at all; the n-gram is ignored.
1 parent 3805f4e commit e310706
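
To make the new rules concrete, here is a minimal self-contained sketch of the per-n-gram decision. The helper name `ngram_score` and the `NGram` alias are hypothetical, and the `tf * log(N / df)` form of `tfidf` is an assumption (the real helper lives in document-aligner/src/document.cpp and its body is not part of this diff):

	#include <cmath>
	#include <cstddef>
	#include <cstdint>
	#include <optional>
	#include <unordered_map>
	#include <unordered_set>

	using NGram = std::uint64_t; // stand-in for the project's hashed n-gram type

	// Assumed tf * log(N / df) form; the real tfidf() body is not shown in this commit.
	inline float tfidf(std::size_t tf, std::size_t doc_count, std::size_t df) {
		return tf * std::log(static_cast<float>(doc_count) / df);
	}

	// Score for a single n-gram, or std::nullopt when it must be ignored.
	std::optional<float> ngram_score(NGram ngram, std::size_t tf, std::size_t doc_count,
	                                 std::unordered_map<NGram, std::size_t> const &df,
	                                 std::unordered_set<NGram> const &max_ngram_pruned) {
		auto it = df.find(ngram);
		if (it != df.end())
			return tfidf(tf, doc_count, it->second); // rule 1: n-gram in df, normal TF-IDF
		if (max_ngram_pruned.count(ngram) == 0)
			return tfidf(tf, doc_count, 1);          // rule 2: unseen, assume df = 1
		return std::nullopt;                         // rule 3: too frequent, ignore
	}

Every score returned here feeds the L2 norm; previously any n-gram missing from `df` was skipped outright, so it never contributed to the normalization.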

6 files changed: +48 -39 lines

document-aligner/docalign.cpp (+24 -19)

@@ -1,11 +1,10 @@
 #include <iostream>
 #include <iomanip>
 #include <fstream>
-#include <map>
 #include <unordered_map>
+#include <unordered_set>
 #include <thread>
 #include <memory>
-#include <mutex>
 #include <vector>
 #include <cmath>
 #include <boost/program_options.hpp>
@@ -64,16 +63,16 @@ template <typename T> void stop(blocking_queue<unique_ptr<T>> &queue, vector<thr
 
 ostream &operator<<(ostream &out, queue_performance const &performance) {
 	return out << "  underflow: " << performance.underflow << '\n'
-	           << "   overflow: " << performance.overflow << '\n';
+		<< "   overflow: " << performance.overflow << '\n';
 }
 
 void print_score(float score, size_t left_id, size_t right_id)
 {
 	cout << fixed << setprecision(5)
-	     << score
-	     << '\t' << left_id
-	     << '\t' << right_id
-	     << '\n';
+		<< score
+		<< '\t' << left_id
+		<< '\t' << right_id
+		<< '\n';
 }
 
 size_t queue_lines(util::FilePiece &fin, blocking_queue<unique_ptr<vector<Line>>> &queue, size_t skip_rate = 1)
@@ -97,7 +96,7 @@ size_t queue_lines(util::FilePiece &fin, blocking_queue<unique_ptr<vector<Line>>
 			break;
 		}
 
-		queue.push(move(line_batch));
+		queue.push(std::move(line_batch));
 	}
 
 	return document_count;
@@ -163,8 +162,8 @@ int main(int argc, char *argv[])
 
 	if (vm.count("help") || !vm.count("translated-tokens") || !vm.count("english-tokens")) {
 		cout << "Usage: " << argv[0]
-		     << " TRANSLATED-TOKENS ENGLISH-TOKENS\n\n"
-		     << generic_desc << endl;
+			<< " TRANSLATED-TOKENS ENGLISH-TOKENS\n\n"
+			<< generic_desc << endl;
 		return 1;
 	}
 
@@ -185,6 +184,7 @@ int main(int argc, char *argv[])
 	// that parse documents and keep a local hash table for counting. At the
 	// end these tables are merged into df.
 	unordered_map<NGram,size_t> df;
+	unordered_set<NGram> max_ngram_pruned;
 	size_t in_document_cnt, en_document_cnt, document_cnt;
 
 	{
@@ -243,14 +243,19 @@ int main(int argc, char *argv[])
 			if (entry.second < min_ngram_cnt)
 				continue;
 
-			if (entry.second > max_ngram_cnt)
+			if (entry.second > max_ngram_cnt) {
+				max_ngram_pruned.insert(entry.first);
 				continue;
+			}
 
 			pruned_df[entry.first] = entry.second;
 		}
 
-		if (verbose)
-			cerr << "Pruned " << df.size() - pruned_df.size() << " (" << 100.0 - 100.0 * pruned_df.size() / df.size() << "%) entries from DF" << endl;
+		if (verbose) {
+			cerr << "Pruned " << df.size() - pruned_df.size() << " (" << 100.0 - 100.0 * pruned_df.size() / df.size()
+				<< "%) entries from DF" << endl;
+			cerr << "Very frequent ngram set is now " << max_ngram_pruned.size() << " long." << endl;
+		}
 
 		swap(df, pruned_df);
 	}
@@ -262,7 +267,7 @@ int main(int argc, char *argv[])
 	mutex ref_index_mutex;
 
 	blocking_queue<unique_ptr<vector<Line>>> queue(n_load_threads * QUEUE_SIZE_PER_THREAD);
-	vector<thread> workers(start(n_load_threads, [&queue, &ref_index, &ref_index_mutex, &df, &document_cnt, &ngram_size]() {
+	vector<thread> workers(start(n_load_threads, [&queue, &ref_index, &ref_index_mutex, &df, &max_ngram_pruned, &document_cnt, &ngram_size]() {
 		unordered_map<NGram, vector<DocumentNGramScore>> local_ref_index;
 
 		while (true) {
@@ -280,7 +285,7 @@ int main(int argc, char *argv[])
 				// so there should be no concurrency issue.
 				// DF is accessed read-only. N starts counting at 1.
 				DocumentRef ref;
-				calculate_tfidf(doc, ref, document_cnt, df);
+				calculate_tfidf(doc, ref, document_cnt, df, max_ngram_pruned);
 
 				for (auto const &entry : ref.wordvec) {
 					local_ref_index[entry.hash].push_back(DocumentNGramScore{
@@ -330,7 +335,7 @@ int main(int argc, char *argv[])
 
 	blocking_queue<unique_ptr<vector<DocumentRef>>> score_queue(n_score_threads * QUEUE_SIZE_PER_THREAD);
 
-	vector<thread> read_workers(start(n_read_threads, [&read_queue, &score_queue, &document_cnt, &df, &ngram_size]() {
+	vector<thread> read_workers(start(n_read_threads, [&read_queue, &score_queue, &document_cnt, &df, &max_ngram_pruned, &ngram_size]() {
 		while (true) {
 			unique_ptr<vector<Line>> line_batch(read_queue.pop());
 
@@ -346,10 +351,10 @@ int main(int argc, char *argv[])
 				ReadDocument(line.str, doc, ngram_size);
 
 				ref_batch->emplace_back();
-				calculate_tfidf(doc, ref_batch->back(), document_cnt, df);
+				calculate_tfidf(doc, ref_batch->back(), document_cnt, df, max_ngram_pruned);
 			}
 
-			score_queue.push(move(ref_batch));
+			score_queue.push(std::move(ref_batch));
 		}
 	}));
 
@@ -459,7 +464,7 @@ int main(int argc, char *argv[])
 
 	if (verbose)
 		cerr << "Read queue performance (Note: blocks when score queue fills up):\n" << read_queue.performance()
-		     << "Score queue performance:\n" << score_queue.performance();
+			<< "Score queue performance:\n" << score_queue.performance();
 	}
 
 	return 0;
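
One subtlety in the pruning hunk above: only n-grams above `max_ngram_cnt` are remembered in `max_ngram_pruned`. N-grams below `min_ngram_cnt` are dropped from `df` without being remembered, so at scoring time they fall under the assume-a-document-frequency-of-1 rule. A sketch of the three fates, with an illustrative enum and helper that do not exist in the codebase:

	#include <cstddef>

	enum class Fate {
		Keep,            // stays in pruned_df and is scored with its real df
		DropSilently,    // below min_ngram_cnt: later treated as unseen (df assumed 1)
		DropAndRemember, // above max_ngram_cnt: later ignored via max_ngram_pruned
	};

	// Illustrative restatement of the pruning rules in the hunk above.
	Fate classify(std::size_t count, std::size_t min_ngram_cnt, std::size_t max_ngram_cnt) {
		if (count < min_ngram_cnt)
			return Fate::DropSilently;
		if (count > max_ngram_cnt)
			return Fate::DropAndRemember;
		return Fate::Keep;
	}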

document-aligner/src/blocking_queue.h (+1 -1)

@@ -14,7 +14,7 @@ struct queue_performance {
 template <typename T> class blocking_queue
 {
 public:
-	explicit blocking_queue(size_t capacity);
+	explicit blocking_queue(size_t size);
 
 	void push(T const &item);
 	void push(T &&item);

document-aligner/src/document.cpp (+20 -15)

@@ -1,8 +1,6 @@
 #include "document.h"
 #include "base64.h"
 #include "ngram.h"
-#include <sstream>
-#include <iostream>
 #include <cmath>
 
 using namespace std;
@@ -32,32 +30,39 @@ inline float tfidf(size_t tf, size_t dc, size_t df) {
  * across all documents. Only terms that are seen in this document and in the document frequency table are
  * counted. All other terms are ignored.
  */
-void calculate_tfidf(Document const &document, DocumentRef &document_ref, size_t document_count, unordered_map<NGram, size_t> const &df) {
+void calculate_tfidf(Document const &document, DocumentRef &document_ref, size_t document_count, unordered_map<NGram, size_t> const &df, unordered_set<NGram> const &max_ngram_pruned) {
 	document_ref.id = document.id;
 
 	document_ref.wordvec.clear();
 	document_ref.wordvec.reserve(document.vocab.size());
 
 	float total_tfidf_l2 = 0;
-	
+
 	for (auto const &entry : document.vocab) {
 		// How often does the term occur in the whole dataset?
 		auto it = df.find(entry.first);
 
-		// Skip words that are not in the document frequency map entirely.
-		// (Matches Python implementation)
-		if (it == df.end())
-			continue;
-
-		float document_tfidf = tfidf(entry.second, document_count, it->second);
+		float document_tfidf;
+
+		if (it == df.end()) {
+			if (max_ngram_pruned.find(entry.first) == max_ngram_pruned.end()) {
+				document_tfidf = tfidf(entry.second, document_count, 1);
+			}
+			else {
+				continue;
+			}
+		}
+		else {
+			document_tfidf = tfidf(entry.second, document_count, it->second);
+
+			document_ref.wordvec.push_back(WordScore{
+				.hash = entry.first,
+				.tfidf = document_tfidf
+			});
+		}
 
 		// Keep track of the squared sum of all values for L2 normalisation
 		total_tfidf_l2 += document_tfidf * document_tfidf;
-
-		document_ref.wordvec.push_back(WordScore{
-			.hash = entry.first,
-			.tfidf = document_tfidf
-		});
 	}
 
 	// Normalize
document-aligner/src/document.h (+2 -1)

@@ -3,6 +3,7 @@
 #include "ngram.h"
 #include <istream>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 namespace bitextor {
@@ -31,6 +32,6 @@ struct DocumentRef {
 // Assumes base64 encoded still.
 void ReadDocument(const util::StringPiece &encoded, Document &to, size_t ngram_size);
 
-void calculate_tfidf(Document const &document, DocumentRef &document_ref, size_t document_count, std::unordered_map<NGram, size_t> const &df);
+void calculate_tfidf(Document const &document, DocumentRef &document_ref, size_t document_count, std::unordered_map<NGram, size_t> const &df, std::unordered_set<NGram> const &max_ngram_pruned);
 
 } // namespace bitextor

document-aligner/src/ngram.cpp (-2)

@@ -1,7 +1,5 @@
 #include "ngram.h"
 #include "murmur_hash.h"
-#include <sstream>
-#include <iostream>
 
 using namespace std;
 
document-aligner/src/ngram.h (+1 -1)

@@ -22,7 +22,7 @@ class NGramIter : public boost::iterator_facade<NGramIter, const NGram, boost::f
 		return end_;
 	}
 
-	inline operator bool() const {
+	inline explicit operator bool() const {
 		return !end_;
 	}

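Making the conversion operator `explicit` keeps contextual uses such as `if (iter)` or `while (iter)` compiling while rejecting the silent conversions that a plain `operator bool()` allows. A quick illustration with a stand-in type:

	#include <cassert>

	// Stand-in for NGramIter; not the project's real class.
	struct Iter {
		bool end_ = false;
		explicit operator bool() const { return !end_; }
	};

	int main() {
		Iter it;
		if (it) { /* fine: contextual conversion to bool */ }
		bool ok = static_cast<bool>(it); // fine: explicit cast
		// bool b = it;  // no longer compiles: copy-initialisation is rejected
		// int n = it;   // no longer compiles: no implicit conversion chain
		assert(ok);
		return 0;
	}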