32
32
*/
33
33
package com .orange .labs .comparison ;
34
34
35
+ import java .util .ArrayList ;
36
+ import java .util .LinkedHashMap ;
35
37
import java .util .List ;
36
38
import java .util .Map ;
37
39
@@ -51,6 +53,9 @@ public class Analyser implements Runnable {
51
53
private int feats ;
52
54
private int deprel ;
53
55
Map <String , ConlluComparator .Signatures > csents ; // unique id (filename#id#number): sentence
56
+ //Map<String, List<ConlluComparator.Result>> results; // store similar sentences: {id1: [(id2,dist)]}
57
+ List <String []> results ; // store similar sentences: "column <TAB> dist <TAB> id1 <TAB> id2"
58
+ private boolean aggregate ; // collect results and return
54
59
55
60
public Analyser (int modulo , int totalthreads , List <String > keys , Map <String , ConlluComparator .Signatures > csents ,
56
61
int form , int lemma , int upos , int xpos , int feats , int deprel ) {
@@ -66,13 +71,18 @@ public Analyser(int modulo, int totalthreads, List<String> keys, Map<String, Con
66
71
this .xpos = xpos ;
67
72
this .feats = feats ;
68
73
this .deprel = deprel ;
69
-
74
+ aggregate = true ;
75
+ if (aggregate ) {
76
+ results = new ArrayList <>();
77
+ }
70
78
}
71
79
72
80
@ Override
73
81
public void run () {
74
82
for (int i = 0 ; i < len ; ++i ) {
75
- //System.err.println(modulo + ": Checking " + i);
83
+ if (modulo == 0 && i % 7 == 0 ) {
84
+ System .err .format ("Checking %d/%d\r " , i , len );
85
+ }
76
86
ConlluComparator .Signatures cursent = csents .get (keys .get (i ));
77
87
for (int j = i + 1 ; j < len ; ++j ) {
78
88
if (j % totalthreads != modulo ) {
@@ -90,7 +100,7 @@ public void run() {
90
100
int dist = calculateDistance (cursent .sentence , othersent .sentence );
91
101
if (dist == 0 ) {
92
102
identical ("FORM" , cursent , othersent );
93
- } else if (dist <= form ) {
103
+ } else if (dist <= form ) {
94
104
similar ("FORM" , dist , cursent , othersent );
95
105
}
96
106
}
@@ -166,6 +176,7 @@ public void run() {
166
176
}
167
177
}
168
178
}
179
+ if (modulo == 0 ) System .err .println ();
169
180
}
170
181
171
182
// inspired by https://github.com/crwohlfeil/damerau-levenshtein
@@ -209,19 +220,34 @@ private int calculateDistance(List<? extends Object> source, List<? extends Obje
209
220
return dist [sourceLength ][targetLength ];
210
221
}
211
222
223
+ public List <String []> getResults () {
224
+ return results ;
225
+ }
212
226
213
227
private void identical (String column , ConlluComparator .Signatures s1 , ConlluComparator .Signatures s2 ) {
228
+ if (aggregate ) {
229
+ //results.add(String.format("%s\t0\t%s\t%s\t%s", column, s1.id, s2.id, s1.sent));
230
+ String [] e = { column , "0" , s1 .id , s2 .id , s1 .sent };
231
+ results .add (e );
232
+ } else {
214
233
System .out .format ("%s identical\t %s\t %s\n " , column , s1 .id , s2 .id );
215
234
System .out .format ("# %s\n " , s1 .sent );
216
235
printColumn (column , s1 );
236
+ }
217
237
}
218
238
219
239
private void similar (String column , int dist , ConlluComparator .Signatures s1 , ConlluComparator .Signatures s2 ) {
220
- System .out .format ("%s similar %d\t %s\t %s\n " , column , dist , s1 .id , s2 .id );
221
- System .out .format ("# %s\n " , s1 .sent );
222
- printColumn (column , s1 );
223
- System .out .format ("# %s\n " , s2 .sent );
224
- printColumn (column , s2 );
240
+ if (aggregate ) {
241
+ //results.add(String.format("%s\t%d\t%s\t%s\t%s\t%s", column, dist, s1.id, s2.id, s1.sent, s2.sent));
242
+ String [] e = {column , "" +dist , s1 .id , s2 .id , s1 .sent , s2 .sent };
243
+ results .add (e );
244
+ } else {
245
+ System .out .format ("%s similar %d\t %s\t %s\n " , column , dist , s1 .id , s2 .id );
246
+ System .out .format ("# %s\n " , s1 .sent );
247
+ printColumn (column , s1 );
248
+ System .out .format ("# %s\n " , s2 .sent );
249
+ printColumn (column , s2 );
250
+ }
225
251
}
226
252
227
253
0 commit comments