@@ -1,11 +1,10 @@
 #include <iostream>
 #include <iomanip>
 #include <fstream>
-#include <map>
 #include <unordered_map>
+#include <unordered_set>
 #include <thread>
 #include <memory>
-#include <mutex>
 #include <vector>
 #include <cmath>
 #include <boost/program_options.hpp>
@@ -64,16 +63,16 @@ template <typename T> void stop(blocking_queue<unique_ptr<T>> &queue, vector<thr
 
 ostream &operator<<(ostream &out, queue_performance const &performance) {
 	return out << "underflow: " << performance.underflow << '\n'
-		<< " overflow: " << performance.overflow << '\n';
+	           << " overflow: " << performance.overflow << '\n';
 }
 
 void print_score(float score, size_t left_id, size_t right_id)
 {
 	cout << fixed << setprecision(5)
-		<< score
-		<< '\t' << left_id
-		<< '\t' << right_id
-		<< '\n';
+	     << score
+	     << '\t' << left_id
+	     << '\t' << right_id
+	     << '\n';
 }
 
 size_t queue_lines(util::FilePiece &fin, blocking_queue<unique_ptr<vector<Line>>> &queue, size_t skip_rate = 1)
@@ -97,7 +96,7 @@ size_t queue_lines(util::FilePiece &fin, blocking_queue<unique_ptr<vector<Line>>
 			break;
 		}
 
-		queue.push(move(line_batch));
+		queue.push(std::move(line_batch));
 	}
 
 	return document_count;
@@ -163,8 +162,8 @@ int main(int argc, char *argv[])
 
 	if (vm.count("help") || !vm.count("translated-tokens") || !vm.count("english-tokens")) {
 		cout << "Usage: " << argv[0]
-			<< " TRANSLATED-TOKENS ENGLISH-TOKENS\n\n"
-			<< generic_desc << endl;
+		     << " TRANSLATED-TOKENS ENGLISH-TOKENS\n\n"
+		     << generic_desc << endl;
 		return 1;
 	}
 
@@ -185,6 +184,7 @@ int main(int argc, char *argv[])
 	// that parse documents and keep a local hash table for counting. At the
 	// end these tables are merged into df.
 	unordered_map<NGram,size_t> df;
+	unordered_set<NGram> max_ngram_pruned;
 	size_t in_document_cnt, en_document_cnt, document_cnt;
 
 	{
@@ -243,14 +243,19 @@ int main(int argc, char *argv[])
 			if (entry.second < min_ngram_cnt)
 				continue;
 
-			if (entry.second > max_ngram_cnt)
+			if (entry.second > max_ngram_cnt) {
+				max_ngram_pruned.insert(entry.first);
 				continue;
+			}
 
 			pruned_df[entry.first] = entry.second;
 		}
 
-		if (verbose)
-			cerr << "Pruned " << df.size() - pruned_df.size() << " (" << 100.0 - 100.0 * pruned_df.size() / df.size() << "%) entries from DF" << endl;
+		if (verbose) {
+			cerr << "Pruned " << df.size() - pruned_df.size() << " (" << 100.0 - 100.0 * pruned_df.size() / df.size()
+			     << "%) entries from DF" << endl;
+			cerr << "Very frequent ngram set is now " << max_ngram_pruned.size() << " long." << endl;
+		}
 
 		swap(df, pruned_df);
 	}
@@ -262,7 +267,7 @@ int main(int argc, char *argv[])
 	mutex ref_index_mutex;
 
 	blocking_queue<unique_ptr<vector<Line>>> queue(n_load_threads * QUEUE_SIZE_PER_THREAD);
-	vector<thread> workers(start(n_load_threads, [&queue, &ref_index, &ref_index_mutex, &df, &document_cnt, &ngram_size]() {
+	vector<thread> workers(start(n_load_threads, [&queue, &ref_index, &ref_index_mutex, &df, &max_ngram_pruned, &document_cnt, &ngram_size]() {
 		unordered_map<NGram, vector<DocumentNGramScore>> local_ref_index;
 
 		while (true) {
@@ -280,7 +285,7 @@ int main(int argc, char *argv[])
 			// so there should be no concurrency issue.
 			// DF is accessed read-only. N starts counting at 1.
 			DocumentRef ref;
-			calculate_tfidf(doc, ref, document_cnt, df);
+			calculate_tfidf(doc, ref, document_cnt, df, max_ngram_pruned);
 
 			for (auto const &entry : ref.wordvec) {
 				local_ref_index[entry.hash].push_back(DocumentNGramScore{
@@ -330,7 +335,7 @@ int main(int argc, char *argv[])
 
 	blocking_queue<unique_ptr<vector<DocumentRef>>> score_queue(n_score_threads * QUEUE_SIZE_PER_THREAD);
 
-	vector<thread> read_workers(start(n_read_threads, [&read_queue, &score_queue, &document_cnt, &df, &ngram_size]() {
+	vector<thread> read_workers(start(n_read_threads, [&read_queue, &score_queue, &document_cnt, &df, &max_ngram_pruned, &ngram_size]() {
 		while (true) {
 			unique_ptr<vector<Line>> line_batch(read_queue.pop());
 
@@ -346,10 +351,10 @@ int main(int argc, char *argv[])
 				ReadDocument(line.str, doc, ngram_size);
 
 				ref_batch->emplace_back();
-				calculate_tfidf(doc, ref_batch->back(), document_cnt, df);
+				calculate_tfidf(doc, ref_batch->back(), document_cnt, df, max_ngram_pruned);
 			}
 
-			score_queue.push(move(ref_batch));
+			score_queue.push(std::move(ref_batch));
 		}
 	}));
 
@@ -459,7 +464,7 @@ int main(int argc, char *argv[])
 
 	if (verbose)
 		cerr << "Read queue performance (Note: blocks when score queue fills up):\n" << read_queue.performance()
-			<< "Score queue performance:\n" << score_queue.performance();
+		     << "Score queue performance:\n" << score_queue.performance();
 	}
 
 	return 0;
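
Note on the new fifth argument to calculate_tfidf: the function body lies outside the hunks above, so below is only a sketch of how the pruned set is presumably consumed. The signature, df, document_cnt and ref.wordvec come from the diff; the Document::vocab member, the wordvec element shape ({hash, tfidf}, consistent with entry.hash used earlier), and the exact TF/IDF weighting are assumptions. The point of passing max_ngram_pruned is to let the scorer tell "ngram dropped for being too frequent" apart from "ngram simply absent from df", so an over-frequent ngram is skipped instead of being scored like a rare, high-IDF one.

// Sketch only, not the actual implementation in this file. Assumes
// Document exposes a `vocab` map from NGram to in-document count.
void calculate_tfidf(Document const &doc, DocumentRef &ref, size_t document_cnt,
                     unordered_map<NGram, size_t> const &df,
                     unordered_set<NGram> const &max_ngram_pruned)
{
	for (auto const &entry : doc.vocab) {
		// Pruned for exceeding --max-ngram-cnt: skip outright. Without
		// this check the ngram would simply miss in df below and be
		// treated as rare, i.e. maximally informative.
		if (max_ngram_pruned.count(entry.first))
			continue;

		// Assumed default: ngrams unseen in the (possibly sampled) DF
		// pass are given a document frequency of 1.
		auto it = df.find(entry.first);
		size_t doc_freq = (it == df.end()) ? 1 : it->second;

		float tf = static_cast<float>(entry.second);
		float idf = log(static_cast<float>(document_cnt) / doc_freq);
		ref.wordvec.push_back({entry.first, tf * idf});
	}
}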