40
40
import java .io .IOException ;
41
41
import java .util .ArrayList ;
42
42
import java .util .Arrays ;
43
+ import java .util .TreeMap ;
43
44
import java .util .List ;
44
45
import java .util .Map ;
45
46
import java .util .LinkedHashMap ;
@@ -58,6 +59,7 @@ public class ConlluComparator {
58
59
59
60
private Map <String , Signatures > csents ; // unique id (filename#sentid, as built in the constructor): Signatures object of that sentence
60
61
private int numberOfThreads ;
62
+ Map <Integer , Integer >sentencelengths ; // sentence length (size of csent.getAllWords()): number of sentences having that length; a TreeMap, so iteration is in ascending length
61
63
62
64
public ConlluComparator (List <? extends Object > objects , int numberOfThreads ) throws ConllException , IOException {
63
65
//public ConlluComparator(List<String> objects) throws ConllException, IOException {
@@ -79,15 +81,25 @@ public ConlluComparator(List<? extends Object> objects, int numberOfThreads) thr
79
81
80
82
this .numberOfThreads = numberOfThreads ;
81
83
csents = new LinkedHashMap <>();
84
+ sentencelengths = new TreeMap <>(); // sentencelength: number of sentences
82
85
int ct = 0 ;
83
86
for (ConllFile cf : cdocs ) {
84
87
for (ConllSentence csent : cf .getSentences ()) {
85
88
ct += 1 ;
86
89
//String id = String.format("%s#%s#%d", cf.getFile(), csent.getSentid(), ct);
87
90
String id = String .format ("%s#%s" , cf .getFile (), csent .getSentid ());
88
91
csents .put (id , new Signatures (csent , id ));
92
+ int tokens = csent .getAllWords ().size ();
93
+ Integer occ = sentencelengths .get (tokens );
94
+ if (occ == null ) {
95
+ sentencelengths .put (tokens , 1 );
96
+ } else {
97
+ sentencelengths .put (tokens , occ +1 );
98
+ }
99
+
89
100
}
90
101
}
102
+
91
103
}
92
104
93
105
/**
@@ -120,6 +132,8 @@ public void analyse(int form, int lemma, int upos, int xpos, int feats, int depr
120
132
Map <String , Set <String >>identical = new LinkedHashMap <>(); // sentence: [ids]
121
133
List <String []>similar = new ArrayList <>();
122
134
// aggregate identical
135
+
136
+
123
137
for (Analyser a : analysers ) {
124
138
for (String [] elems : a .getResults ()) {
125
139
if (elems [0 ].equals ("FORM" ) && elems [1 ].equals ("0" )) {
@@ -135,6 +149,22 @@ public void analyse(int form, int lemma, int upos, int xpos, int feats, int depr
135
149
}
136
150
}
137
151
}
152
+
153
+
154
+
155
+ System .out .println ("# sentence lenghts" );
156
+ for (int slen : sentencelengths .keySet ()) {
157
+ System .out .format ("# %3d tokens: %4d sentences\n " , slen , sentencelengths .get (slen ));
158
+ }
159
+
160
+ if (form >= 0 ) {
161
+ int identical_form = 0 ;
162
+ for (String sentence : identical .keySet ()) {
163
+ identical_form += identical .get (sentence ).size ();
164
+ }
165
+ System .out .format ("# identical sentences (Form) %d/%d %.1f%%\n " , identical_form , keys .size (), (100.0 *identical_form /keys .size ()));
166
+ }
167
+
138
168
// output identical sentences
139
169
for (String sentence : identical .keySet ()) {
140
170
System .out .format ("FORM\t 0\t %s\t %d\t %s\n " , sentence , identical .get (sentence ).size (), String .join ("\t " , identical .get (sentence )));
0 commit comments