Skip to content

Commit e752abd

Browse files
committed
more readable output
1 parent b525e73 commit e752abd

File tree

2 files changed

+74
-10
lines changed

2 files changed

+74
-10
lines changed

src/main/java/com/orange/labs/comparison/Analyser.java

+34-8
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
*/
3333
package com.orange.labs.comparison;
3434

35+
import java.util.ArrayList;
36+
import java.util.LinkedHashMap;
3537
import java.util.List;
3638
import java.util.Map;
3739

@@ -51,6 +53,9 @@ public class Analyser implements Runnable {
5153
private int feats;
5254
private int deprel;
5355
Map<String, ConlluComparator.Signatures> csents; // unique id (filename#id#number): sentence
56+
//Map<String, List<ConlluComparator.Result>> results; // store similar sentences: {id1: [(id2,dist)]}
57+
List<String []> results; // store identical/similar sentences, one array per pair: {column, dist, id1, id2, sentence1[, sentence2]}
58+
private boolean aggregate; // collect results and return
5459

5560
public Analyser(int modulo, int totalthreads, List<String> keys, Map<String, ConlluComparator.Signatures> csents,
5661
int form, int lemma, int upos, int xpos, int feats, int deprel) {
@@ -66,13 +71,18 @@ public Analyser(int modulo, int totalthreads, List<String> keys, Map<String, Con
6671
this.xpos = xpos;
6772
this.feats = feats;
6873
this.deprel = deprel;
69-
74+
aggregate = true;
75+
if (aggregate) {
76+
results = new ArrayList<>();
77+
}
7078
}
7179

7280
@Override
7381
public void run() {
7482
for (int i = 0; i < len; ++i) {
75-
//System.err.println(modulo + ": Checking " + i);
83+
if (modulo == 0 && i % 7 == 0) {
84+
System.err.format("Checking %d/%d\r", i, len);
85+
}
7686
ConlluComparator.Signatures cursent = csents.get(keys.get(i));
7787
for (int j = i + 1; j < len; ++j) {
7888
if (j % totalthreads != modulo) {
@@ -90,7 +100,7 @@ public void run() {
90100
int dist = calculateDistance(cursent.sentence, othersent.sentence);
91101
if (dist == 0) {
92102
identical("FORM", cursent, othersent);
93-
} else if (dist <= form) {
103+
} else if (dist <= form) {
94104
similar("FORM", dist, cursent, othersent);
95105
}
96106
}
@@ -166,6 +176,7 @@ public void run() {
166176
}
167177
}
168178
}
179+
if (modulo == 0) System.err.println();
169180
}
170181

171182
// inspired by https://github.com/crwohlfeil/damerau-levenshtein
@@ -209,19 +220,34 @@ private int calculateDistance(List<? extends Object> source, List<? extends Obje
209220
return dist[sourceLength][targetLength];
210221
}
211222

223+
public List<String []> getResults() {
224+
return results;
225+
}
212226

213227
private void identical(String column, ConlluComparator.Signatures s1, ConlluComparator.Signatures s2) {
228+
if (aggregate) {
229+
//results.add(String.format("%s\t0\t%s\t%s\t%s", column, s1.id, s2.id, s1.sent));
230+
String [] e = { column, "0", s1.id, s2.id, s1.sent};
231+
results.add(e);
232+
} else {
214233
System.out.format("%s identical\t%s\t%s\n", column, s1.id, s2.id);
215234
System.out.format("# %s\n", s1.sent);
216235
printColumn(column, s1);
236+
}
217237
}
218238

219239
private void similar(String column, int dist, ConlluComparator.Signatures s1, ConlluComparator.Signatures s2) {
220-
System.out.format("%s similar %d\t%s\t%s\n", column, dist, s1.id, s2.id);
221-
System.out.format("# %s\n", s1.sent);
222-
printColumn(column, s1);
223-
System.out.format("# %s\n", s2.sent);
224-
printColumn(column, s2);
240+
if (aggregate) {
241+
//results.add(String.format("%s\t%d\t%s\t%s\t%s\t%s", column, dist, s1.id, s2.id, s1.sent, s2.sent));
242+
String [] e = {column, ""+dist, s1.id, s2.id, s1.sent, s2.sent};
243+
results.add(e);
244+
} else {
245+
System.out.format("%s similar %d\t%s\t%s\n", column, dist, s1.id, s2.id);
246+
System.out.format("# %s\n", s1.sent);
247+
printColumn(column, s1);
248+
System.out.format("# %s\n", s2.sent);
249+
printColumn(column, s2);
250+
}
225251
}
226252

227253

src/main/java/com/orange/labs/comparison/ConlluComparator.java

+40-2
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
import java.util.List;
4444
import java.util.Map;
4545
import java.util.LinkedHashMap;
46+
import java.util.LinkedHashSet;
47+
import java.util.Set;
4648

4749

4850
/**
@@ -81,7 +83,8 @@ public ConlluComparator(List<? extends Object> objects, int numberOfThreads) thr
8183
for (ConllFile cf : cdocs) {
8284
for (ConllSentence csent : cf.getSentences()) {
8385
ct += 1;
84-
String id = String.format("%s#%s#%d", cf.getFile(), csent.getSentid(), ct);
86+
//String id = String.format("%s#%s#%d", cf.getFile(), csent.getSentid(), ct);
87+
String id = String.format("%s#%s", cf.getFile(), csent.getSentid());
8588
csents.put(id, new Signatures(csent, id));
8689
}
8790
}
@@ -100,20 +103,55 @@ public void analyse(int form, int lemma, int upos, int xpos, int feats, int depr
100103
for(int x = 1; x< keys.size(); ++x) comps += x;
101104
System.err.println(comps + " comparisons needed");
102105
List<Thread> thrs = new ArrayList<>();
106+
List<Analyser>analysers = new ArrayList<>();
103107

104108
for (int th = 0; th < numberOfThreads; ++th) {
105109
Analyser a = new Analyser(th, numberOfThreads, keys, csents, form, lemma, upos, xpos, feats, deprel);
106110
Thread thr = new Thread(a);
107111
thr.start();
108112
thrs.add(thr);
113+
analysers.add(a);
109114
}
110115

111116
for(Thread thr : thrs) {
112117
thr.join();
113118
}
114119

120+
Map<String, Set<String>>identical = new LinkedHashMap<>(); // sentence: [ids]
121+
List<String []>similar = new ArrayList<>();
122+
// aggregate identical
123+
for(Analyser a : analysers) {
124+
for (String [] elems : a.getResults()) {
125+
if (elems[0].equals("FORM") && elems[1].equals("0")) {
126+
Set<String>ids = identical.get(elems[4]);
127+
if (ids == null) {
128+
ids = new LinkedHashSet<>();
129+
identical.put(elems[4], ids);
130+
}
131+
ids.add(elems[2]);
132+
ids.add(elems[3]);
133+
} else {
134+
similar.add(elems);
135+
}
136+
}
137+
}
138+
// output identical sentences
139+
for (String sentence : identical.keySet()) {
140+
System.out.format("FORM\t0\t%s\t%s\n", sentence, String.join("\t", identical.get(sentence)));
141+
}
142+
for (String [] sim: similar) {
143+
System.out.println(String.join("\t", sim));
144+
}
145+
}
146+
/** comparison results */
147+
class Result {
148+
public String id;
149+
public int dist;
150+
public String col;
115151
}
116152

153+
154+
/** preprocess the sentences to speed up the comparison */
117155
class Signatures {
118156

119157
public ConllSentence cs;
@@ -136,7 +174,7 @@ public Signatures(ConllSentence cs, String id) {
136174
xposs = new ArrayList<>();
137175
deprels = new ArrayList<>();
138176
feats = new ArrayList<>();
139-
sent = cs.getSentence();
177+
sent = cs.getSentence().strip();
140178
sentence = new ArrayList<>();
141179
for (char c : sent.toCharArray()) {
142180
sentence.add(c);

0 commit comments

Comments
 (0)