Skip to content

Commit 11d43f1

Browse files
committed
add some stats
1 parent 952901d commit 11d43f1

File tree

2 files changed

+39
-7
lines changed

2 files changed

+39
-7
lines changed

src/main/java/com/orange/labs/comparison/Analyser.java

+9-7
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434

3535
import static java.lang.Math.abs;
3636
import java.util.ArrayList;
37-
import java.util.LinkedHashMap;
3837
import java.util.List;
3938
import java.util.Map;
4039

@@ -46,7 +45,7 @@ public class Analyser implements Runnable {
4645
private int modulo; // current thread number
4746
private int totalthreads; // number of threads used (here we process one out of totalthreads)
4847
private List<String> keys;
49-
private int len;
48+
private int len; // number of sentences
5049
private int form;
5150
private int lemma;
5251
private int upos;
@@ -177,7 +176,9 @@ public void run() {
177176
}
178177
}
179178
}
180-
if (modulo == 0) System.err.println();
179+
if (modulo == 0) {
180+
System.err.format("Checked %d \n", len);
181+
}
181182
}
182183

183184
// inspired by https://github.com/crwohlfeil/damerau-levenshtein
@@ -228,16 +229,17 @@ private int calculateDistance(List<? extends Object> source, List<? extends Obje
228229
public List<String []> getResults() {
229230
return results;
230231
}
231-
232+
233+
232234
private void identical(String column, ConlluComparator.Signatures s1, ConlluComparator.Signatures s2) {
233235
if (aggregate) {
234236
//results.add(String.format("%s\t0\t%s\t%s\t%s", column, s1.id, s2.id, s1.sent));
235237
String [] e = { column, "0", s1.id, s2.id, s1.sent};
236238
results.add(e);
237239
} else {
238-
System.out.format("%s identical\t%s\t%s\n", column, s1.id, s2.id);
239-
System.out.format("# %s\n", s1.sent);
240-
printColumn(column, s1);
240+
System.out.format("%s identical\t%s\t%s\n", column, s1.id, s2.id);
241+
System.out.format("# %s\n", s1.sent);
242+
printColumn(column, s1);
241243
}
242244
}
243245

src/main/java/com/orange/labs/comparison/ConlluComparator.java

+30
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import java.io.IOException;
4141
import java.util.ArrayList;
4242
import java.util.Arrays;
43+
import java.util.TreeMap;
4344
import java.util.List;
4445
import java.util.Map;
4546
import java.util.LinkedHashMap;
@@ -58,6 +59,7 @@ public class ConlluComparator {
5859

5960
private Map<String, Signatures> csents; // unique id (filename#sentid): sentence signatures
6061
private int numberOfThreads;
62+
Map<Integer, Integer>sentencelengths; // sentencelength: number of sentences
6163

6264
public ConlluComparator(List<? extends Object> objects, int numberOfThreads) throws ConllException, IOException {
6365
//public ConlluComparator(List<String> objects) throws ConllException, IOException {
@@ -79,15 +81,25 @@ public ConlluComparator(List<? extends Object> objects, int numberOfThreads) thr
7981

8082
this.numberOfThreads = numberOfThreads;
8183
csents = new LinkedHashMap<>();
84+
sentencelengths = new TreeMap<>(); // sentencelength: number of sentences
8285
int ct = 0;
8386
for (ConllFile cf : cdocs) {
8487
for (ConllSentence csent : cf.getSentences()) {
8588
ct += 1;
8689
//String id = String.format("%s#%s#%d", cf.getFile(), csent.getSentid(), ct);
8790
String id = String.format("%s#%s", cf.getFile(), csent.getSentid());
8891
csents.put(id, new Signatures(csent, id));
92+
int tokens = csent.getAllWords().size();
93+
Integer occ = sentencelengths.get(tokens);
94+
if (occ == null) {
95+
sentencelengths.put(tokens, 1);
96+
} else {
97+
sentencelengths.put(tokens, occ+1);
98+
}
99+
89100
}
90101
}
102+
91103
}
92104

93105
/**
@@ -120,6 +132,8 @@ public void analyse(int form, int lemma, int upos, int xpos, int feats, int depr
120132
Map<String, Set<String>>identical = new LinkedHashMap<>(); // sentence: [ids]
121133
List<String []>similar = new ArrayList<>();
122134
// aggregate identical
135+
136+
123137
for(Analyser a : analysers) {
124138
for (String [] elems : a.getResults()) {
125139
if (elems[0].equals("FORM") && elems[1].equals("0")) {
@@ -135,6 +149,22 @@ public void analyse(int form, int lemma, int upos, int xpos, int feats, int depr
135149
}
136150
}
137151
}
152+
153+
154+
155+
System.out.println("# sentence lenghts");
156+
for (int slen : sentencelengths.keySet()) {
157+
System.out.format("# %3d tokens: %4d sentences\n", slen, sentencelengths.get(slen));
158+
}
159+
160+
if (form >= 0) {
161+
int identical_form = 0;
162+
for (String sentence : identical.keySet()) {
163+
identical_form += identical.get(sentence).size();
164+
}
165+
System.out.format("# identical sentences (Form) %d/%d %.1f%%\n", identical_form, keys.size(), (100.0*identical_form/keys.size()));
166+
}
167+
138168
// output identical sentences
139169
for (String sentence : identical.keySet()) {
140170
System.out.format("FORM\t0\t%s\t%d\t%s\n", sentence, identical.get(sentence).size(), String.join("\t", identical.get(sentence)));

0 commit comments

Comments
 (0)