Skip to content

Commit 952901d

Browse files
committed
accelerate comparison
1 parent e752abd commit 952901d

File tree

2 files changed

+14
-9
lines changed

2 files changed

+14
-9
lines changed

src/main/java/com/orange/labs/comparison/Analyser.java

+12-7
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
*/
3333
package com.orange.labs.comparison;
3434

35+
import static java.lang.Math.abs;
3536
import java.util.ArrayList;
3637
import java.util.LinkedHashMap;
3738
import java.util.List;
@@ -97,7 +98,7 @@ public void run() {
9798
identical("FORM", cursent, othersent);
9899
}
99100
} else if (form > 0) {
100-
int dist = calculateDistance(cursent.sentence, othersent.sentence);
101+
int dist = calculateDistance(cursent.sentence, othersent.sentence, form);
101102
if (dist == 0) {
102103
identical("FORM", cursent, othersent);
103104
} else if (dist <= form) {
@@ -111,7 +112,7 @@ public void run() {
111112
identical("LEMMA", cursent, othersent);
112113
}
113114
} else if (lemma > 0) {
114-
int dist = calculateDistance(cursent.lemmas, othersent.lemmas);
115+
int dist = calculateDistance(cursent.lemmas, othersent.lemmas, lemma);
115116
if (dist == 0) {
116117
identical("LEMMA", cursent, othersent);
117118
} else if (dist <= lemma) {
@@ -125,7 +126,7 @@ public void run() {
125126
identical("UPOS", cursent, othersent);
126127
}
127128
} else if (upos > 0) {
128-
int dist = calculateDistance(cursent.uposs, othersent.uposs);
129+
int dist = calculateDistance(cursent.uposs, othersent.uposs, upos);
129130
if (dist == 0) {
130131
identical("UPOS", cursent, othersent);
131132
} else if (dist <= upos) {
@@ -139,7 +140,7 @@ public void run() {
139140
identical("XPOS", cursent, othersent);
140141
}
141142
} else if (xpos > 0) {
142-
int dist = calculateDistance(cursent.xposs, othersent.xposs);
143+
int dist = calculateDistance(cursent.xposs, othersent.xposs, xpos);
143144
if (dist == 0) {
144145
identical("XPOS", cursent, othersent);
145146
} else if (dist <= xpos) {
@@ -153,7 +154,7 @@ public void run() {
153154
identical("FEATS", cursent, othersent);
154155
}
155156
} else if (feats > 0) {
156-
int dist = calculateDistance(cursent.feats, othersent.feats);
157+
int dist = calculateDistance(cursent.feats, othersent.feats, feats);
157158
if (dist == 0) {
158159
identical("FEATS", cursent, othersent);
159160
} else if (dist <= feats) {
@@ -167,7 +168,7 @@ public void run() {
167168
identical("DEPREL", cursent, othersent);
168169
}
169170
} else if (deprel > 0) {
170-
int dist = calculateDistance(cursent.deprels, othersent.deprels);
171+
int dist = calculateDistance(cursent.deprels, othersent.deprels, deprel);
171172
if (dist == 0) {
172173
identical("DEPREL", cursent, othersent);
173174
} else if (dist <= deprel) {
@@ -182,16 +183,20 @@ public void run() {
182183
// inspired by https://github.com/crwohlfeil/damerau-levenshtein
183184
/**
184185
* Calculate the Damerau-Levenshtein distance between two lists of objects (characters or strings).
186+
* Note: levenshtein_distance(a, b) >= |len(a) - len(b)|, i.e. the length difference is a lower bound on the distance.
185187
* @param source
186188
* @param target
187189
* @return
188190
*/
189-
private int calculateDistance(List<? extends Object> source, List<? extends Object> target) {
191+
private int calculateDistance(List<? extends Object> source, List<? extends Object> target, int maxdist) {
190192
//if (source == null || target == null) {
191193
// throw new IllegalArgumentException("Parameter must not be null");
192194
//}
193195
int sourceLength = source.size();
194196
int targetLength = target.size();
197+
// if the lengths of the two sentences differ by more than maxdist, we stop here
198+
if (abs(sourceLength - targetLength) > maxdist) return abs(sourceLength - targetLength);
199+
195200
if (sourceLength == 0) {
196201
return targetLength;
197202
}

src/main/java/com/orange/labs/comparison/ConlluComparator.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ public ConlluComparator(List<? extends Object> objects, int numberOfThreads) thr
9999
*/
100100
public void analyse(int form, int lemma, int upos, int xpos, int feats, int deprel) throws InterruptedException {
101101
List<String> keys = Arrays.asList(csents.keySet().toArray(new String[0]));
102-
int comps = 0;
102+
long comps = 0;
103103
for(int x = 1; x< keys.size(); ++x) comps += x;
104104
System.err.println(comps + " comparisons needed");
105105
List<Thread> thrs = new ArrayList<>();
@@ -137,7 +137,7 @@ public void analyse(int form, int lemma, int upos, int xpos, int feats, int depr
137137
}
138138
// output identical sentences
139139
for (String sentence : identical.keySet()) {
140-
System.out.format("FORM\t0\t%s\t%s\n", sentence, String.join("\t", identical.get(sentence)));
140+
System.out.format("FORM\t0\t%s\t%d\t%s\n", sentence, identical.get(sentence).size(), String.join("\t", identical.get(sentence)));
141141
}
142142
for (String [] sim: similar) {
143143
System.out.println(String.join("\t", sim));

0 commit comments

Comments
 (0)