-
-
Notifications
You must be signed in to change notification settings - Fork 2.4k
/
Copy pathmultiple_longest_common_subsequence.rs
492 lines (436 loc) · 17.1 KB
/
multiple_longest_common_subsequence.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
use std::cmp::max;
use std::cmp::Ordering;
use std::collections::HashMap;
const IMPOSSIBLE_NB: usize = 999_999_999_999;
// saves the precalculations
// will be moved around a lot
// alphabet : the common alphabet
// chains : the strings among which the common subsequence is
// d : the number of strings
// f : for each point, an heuristic function
// g : for each point, the number of ancestors
// ms : the table of suffix tables
// mt : the lookup table
// parents : the ancestor tree
struct Context {
alphabet: Vec<char>,
chains: Vec<Vec<char>>,
d: usize,
f: HashMap<Vec<usize>, u64>,
g: HashMap<Vec<usize>, u64>,
ms: Vec<Vec<Vec<u64>>>,
mt: Vec<Vec<Vec<usize>>>,
parents: HashMap<Vec<usize>, Option<Vec<usize>>>,
}
impl Context {
pub fn new(strings: &[&str]) -> Self {
// cast to ease [index] accessibily
let chains: Vec<Vec<char>> = strings.iter().map(|s| s.chars().collect()).collect();
let d = strings.len();
let mut alphabet: Vec<char> = get_alphabet(&chains);
let ms: Vec<Vec<Vec<u64>>> = matrices_score(&chains);
// an impossible to reach point, father of all points
let p0 = vec![IMPOSSIBLE_NB; d];
let mut parents: HashMap<_, Option<Vec<usize>>> = HashMap::new();
parents.insert(p0.clone(), None);
let mut g = HashMap::new();
g.insert(p0.clone(), 0);
let mut f: HashMap<Vec<usize>, u64> = HashMap::new();
f.insert(p0, 0);
let mt = mt_table(&chains, &mut alphabet);
Context {
alphabet,
chains,
d,
f,
g,
ms,
mt,
parents,
}
}
}
// ascend back up the parent tree to form the common subsequence
fn common_seq(ctx: &Context, p: &Vec<usize>) -> String {
let ref_str: &Vec<char> = &ctx.chains[0];
let mut common_subsequence: Vec<char> = vec![];
// Gaining mutability
let mut p = p;
while ctx.parents[p].is_some() {
common_subsequence.push(ref_str[p[0]]);
// getting the parent of current point
p = ctx.parents[p].as_ref().unwrap();
}
common_subsequence.iter().rev().collect::<String>()
}
/// Heuristic to find the smallest common alphabet among the strings
/// gets the shortest string and remove duplicates
///
/// # Arguments
/// # 'chains' The strings among wich the mlcs is
///
/// # Returns
/// A vector
fn get_alphabet(chains: &[Vec<char>]) -> Vec<char> {
let mut alphabet: Vec<char> = chains
.iter()
.min_by_key(|s| s.len())
.expect("No minimum found")
.to_vec();
alphabet.sort();
alphabet.dedup();
alphabet
}
/// CF Initqueue
fn get_starting_p(ctx: &Context) -> Vec<Vec<usize>> {
let mut successors: Vec<Vec<usize>> = vec![];
// for each alphabet letter, finds the next match
// meaning the a point where all strings share a character
// example: In ["AB", "BC", "CB", "BF"],
// A match for the letter B would be p = (1, 0, 1, 0)
for (ch_idx, _) in ctx.alphabet.iter().enumerate() {
// for each string, finds the next position of that letter
let mut succ: Vec<usize> = vec![];
for i in 0..(ctx.chains.len()) {
// gets the next position of the current letter
let next_ch_idx = ctx.mt[ch_idx][i][0];
succ.push(next_ch_idx);
}
// once the vector is complete, we add it to the successors
successors.push(succ);
}
successors
}
/// Finds all succcesors of the point p
/// A successor of p = (p_1, p_2, etc, p_n) is a point q = (q_1, q_2, etc, q_n)
/// such that q_1 > p_1, q_2 > p_2, etc, q_n > p_n
/// [Documentation](https://github.com/epita-rs/MLCS/blob/main/paper.pdf)
///
/// # Arguments
/// # 'Context' A struct containing informations
/// # 'p' The point under examination
///
/// # Returns
/// An array of the successors
fn get_successors(ctx: &Context, p: &[usize]) -> Vec<Vec<usize>> {
let mut successors: Vec<Vec<usize>> = vec![];
// for all alphabet letters
for (ch_idx, _) in ctx.alphabet.iter().enumerate() {
// for each string, finds the next position of that letter
let mut succ: Vec<usize> = vec![];
for (i, p_ith_elt) in p.iter().enumerate().take(ctx.chains.len()) {
let next_ch_idx = ctx.mt[ch_idx][i][p_ith_elt + 1];
// in case the letter is not rechable in the string
if next_ch_idx == IMPOSSIBLE_NB {
break;
}
succ.push(next_ch_idx);
}
// the vector is complete, hence we add it to the successors
if succ.len() == ctx.chains.len() {
successors.push(succ);
}
// else we discard it and move on to the next letter
}
successors
}
/// Computes the heuristic function given a point
/// min ( { M_ij[ p[i] ][ p[j] ] | (i,j) in [0 ; d] } )
/// [Documentation](https://github.com/epita-rs/MLCS/blob/main/paper.pdf)
fn heuristic(ctx: &Context, p: &[usize]) -> u64 {
let mut similarity: Vec<u64> = vec![];
for i in 0..ctx.d {
for j in 0..ctx.d {
if i != j {
similarity.push(ctx.ms[translate(i, j, ctx.d)][p[i]][p[j]]);
}
}
}
*similarity.iter().min().unwrap()
}
/// Add the first matches to the queue
/// For each starting point found, sets an impossible point as parent
/// [Documentation](https://github.com/epita-rs/MLCS/blob/main/paper.pdf)
///
/// # Arguments
///
/// * `ctx' - A structure containing informations
/// * 'queue' - The priority queue of points
fn init_queue(ctx: &mut Context, queue: &mut Vec<Vec<usize>>) {
*queue = get_starting_p(ctx);
for q in queue.clone() {
update_suc(ctx, vec![IMPOSSIBLE_NB; ctx.d], q.clone());
}
reorder_queue(ctx, queue);
}
/// Computes the suffix tables between each pair of string
/// used by the MLCS-Astar heuristic function
/// [Documentation](https://github.com/epita-rs/MLCS/blob/main/paper.pdf)
///
/// # Arguments
///
/// * `chains` - A slice of collected strings
/// - from which the suffix tables are computed.
fn matrices_score(chains: &[Vec<char>]) -> Vec<Vec<Vec<u64>>> {
let mut scores: Vec<Vec<Vec<u64>>> = vec![];
for s1 in chains.iter() {
for s2 in chains.iter() {
scores.push(score_matrix(s1, s2));
}
}
scores
}
/// Builds the lookup table used for accessing the index of the next char
/// updates the alphabet to be the alphabet of the letters common to all strings
///
/// # Arguments
/// # 'chains' the strings as a matrix of char
/// # 'alphabet' the letters in the strings
///
/// # Returns
/// An array of matrices.
/// Each matrix is tied to a string and can indicate, given a letter,
/// the next position of that letter in the string.
fn mt_table(chains: &Vec<Vec<char>>, alphabet: &mut Vec<char>) -> Vec<Vec<Vec<usize>>> {
let mut mt: Vec<Vec<Vec<usize>>> = vec![];
for ch in alphabet.clone() {
let mut chain: Vec<Vec<usize>> = vec![];
for s in chains {
let mut v: Vec<usize> = vec![IMPOSSIBLE_NB; s.len()];
let mut lpos = IMPOSSIBLE_NB;
// iterating backwards on the string
for i in (0..(s.len())).rev() {
if s[i] == ch {
lpos = i;
}
// pushing the index of the last encounter with the current letter
v[i] = lpos;
}
chain.push(v);
// if the letter was never seen in the current string
// then it can't part of the common alphabet
if lpos == IMPOSSIBLE_NB {
// removing that letter
alphabet.retain(|&x| x != ch);
chain = vec![];
break;
}
}
// the letter was seen at leat once
if !chain.is_empty() {
// pushing an array or array
// example on ["AB", "ABAA"]
// string1 => {
// 'A' => {0, IMPOSSIBLE_NB}
// 'B' => {1, 1}
// }
// string2 => {
// 'A' => {0, 2, 2, 3}
// 'B' => {1, 1, IMPOSSIBLE_NB, IMPOSSIBLE_NB}
// }
mt.push(chain);
}
}
mt
}
/// Finds one of the longest_common_subsequence among multiple strings
/// using a similar approach to the A* algorithm in graph theory
/// [Documentation](https://github.com/epita-rs/MLCS/blob/main/paper.pdf)
/// # Arguments
///
/// * `S` - Array of strings.
///
/// # Returns
///
/// * `String` if a Longest Common Subsequence exists
/// * `String' if no LCS was found
pub fn multiple_longest_common_subsequence(chains: &Vec<&str>) -> String {
const C: u64 = 20;
// Preprocessing
let mut ctx = Context::new(chains);
// queue
let mut queue: Vec<Vec<usize>> = vec![];
init_queue(&mut ctx, &mut queue);
while !queue.is_empty() {
// y = max( {f(p) | p in queue} )
let mut y = ctx.f[queue.last().unwrap()];
// y = y - c // without overflow
if y > C {
y -= C;
}
// R = { p | p in queue and y <= f(p) }
let second_queue = queue
.clone()
.into_iter()
.filter(|p| y <= ctx.f[p])
.collect::<Vec<Vec<usize>>>();
queue.clear();
for p in second_queue {
if heuristic(&ctx, &p) == 0 {
// An MLCS match was found
return common_seq(&ctx, &p);
}
// inserting all succesors in the queue
let succs = get_successors(&ctx, &p);
for q in succs {
// basically saying if the queue queue does not already
// contain the point q
if !queue.contains(&q) {
update_suc(&mut ctx, p.clone(), q.clone());
queue.push(q);
}
}
}
// sorting the queue
reorder_queue(&ctx, &mut queue);
}
String::from("")
}
// sorts the queue
fn reorder_queue(ctx: &Context, queue: &mut [Vec<usize>]) {
queue.sort_unstable_by(|p, q| {
if (ctx.f.get(p) > ctx.f.get(q))
|| (ctx.f.get(p) == ctx.f.get(q) && heuristic(ctx, p) > heuristic(ctx, q))
{
Ordering::Greater
} else {
Ordering::Less
}
});
}
/// Computes the suffix table
fn score_matrix(s1: &[char], s2: &[char]) -> Vec<Vec<u64>> {
let m = s1.len();
let n = s2.len();
let mut matrix: Vec<Vec<u64>> = vec![vec![0; n + 1]; m + 1];
if n > 0 && m > 0 {
for i in (0..(m - 1)).rev() {
for j in (0..(n - 1)).rev() {
matrix[i][j] = if s1[i + 1] == s2[j + 1] {
matrix[i + 1][j + 1] + 1
} else {
max(matrix[i][j + 1], matrix[i + 1][j])
};
}
}
}
matrix
}
//given given 2D coordinates, translates into 1D coordinates
fn translate(i: usize, j: usize, d: usize) -> usize {
i * d + j
}
// given a point p and his successor q, computes necessary informations
// point p is marked PARENT of q
fn update_suc(ctx: &mut Context, p: Vec<usize>, q: Vec<usize>) {
// g(q) = g(p) + 1
let nb = &ctx.g[&p] + 1;
ctx.g.insert(q.clone(), nb);
// saves the cost function for point p : h(p) + g(p)
ctx.f.insert(q.clone(), heuristic(ctx, &q) + nb);
// saves the fact that p is the parent of q
ctx.parents.insert(q, Some(p));
}
#[cfg(test)]
mod tests {
use super::*;
macro_rules! astar_tests {
($($name:ident: ($input:expr, $expected:expr),)*) => {
$(
#[test]
fn $name() {
let input: Vec<&str> = $input.iter().map(|s| *s).collect();
let expected: String = String::from($expected);
let result = multiple_longest_common_subsequence(&input);
assert_eq!(result, expected);
}
)*
};
}
astar_tests! {
all_empty_strings: (["", ""], ""),
all_same: (["abcdef", "abcdef", "abcdef", "abcdef"], "abcdef"),
empty_strings: (["", "ABC"], ""),
long: (
[
"qwertyuiop$asd$fgh$jkl;zxcvbnmqwert|yuiop1234567890-0",
"qwertyuiopasdfghj$kl;zx$cvbnmqwe$rtyu|iop,1234567890-0"
],
"qwertyuiopasdfghjkl;zxcvbnmqwert|iop1234567890-0"
),
medium_case: (
[
"gxt#xayb",
"abgt#ab",
"gyayt#ahjb",
"gyayjjjt#ab",
"gyayt#ahhhhb",
"ygaytp#pppahjb",
"ylllgaytm#8765majb",
],
"gt#ab"
),
medium_plus: (
[
"=串-用2于测试2展示测中测中0shgksjklkjlj测测🚀测测串文|",
"=串-串用2于测2试测中ss中0展示测测l中🚀文|串",
"=串-用2于测试2展67中中0xs中中中kkljhkkh示中测🚀测|测文|",
"=串-|用2于ss串试056u展xx🚀示中lj测ggk测|ss文|",
"=串-用2于-测22中中中uyty试串lj展gkks中示🚀测测s|测中文|b",
"=串用2于测s-试2中中0中hgtihlkk展串🚀中示s中|文|",
"=2串2中2中2中s用-于0t测🚀j试展示测s测hkkkg测中中串文|l",
"=2串2中2中2中s用-于0测🚀试展示测s中k中l串文|",
"=2串2中2中2中s用ur-于0测🚀试展示测jkjljkkllkskg中串文|;",
"=2串2中2中2中s用u-ur于0测🚀试展jll示测gks中中串文|0",
"=2串2中2中2中s用-uurr于0测🚀试kl展示测s测中中串文|8",
"=2中2中s用-于0测🚀试展示测jsjhg测测中串文|",
"=2串2中2中2中s用-于0rttru测ljjgjh🚀试示测s测测中中串文|",
"=2串2中2中2中s用-于0gjg测lu🚀试展示测s测测中中串文|6",
"=2中22s-于0测🚀展测j测ljy中中串文|",
"=2串2中2中2中s用-jklkjll于hgj0测🚀试展示测s测中中串文|",
"=2串2中2中2中s用-于0g🚀试展示测s测中中lj串文|",
"=2串2中2中2中s用-于hj0试展示测sghhjjhgjl测测中串文|",
"=2串2中2中2中s用-于0h🚀试展示测sj测中jkl中串文|",
"=2串2中2中2中s用-于0j🚀试展示测gjgjsjk测串文|",
"=2串2中2中2中s用-于kj0🚀试展示测jjjlks中串文|",
"=2串2中2中2中s用-于0l🚀试展示fdj测l测中中串文|",
"=2串2中2中2中s用-于0🚀kl试展测测djkhdd中文|",
"=2串2中2中2中s用-于0试展示测s测fdljh中中串文|",
"=2串2中2中2中s用-于0测l🚀l试展示lshd测测中中串文|",
"=2串2中2中2中s用-于0测🚀jk试展示sf测测中中串文|",
"=串用2串2中🚀2-中于0测试中lk展中ks中23文|串",
],
"=2于测文|"
),
mix: (
[
"=串-用2于测试2展示测中测中0ss测测🚀测测串文|",
"=串-串用2于测2试测中ss中0展示测测中🚀文|串",
"=串-用2于测试2展中中0xs中中中示中测🚀测|测文",
"=串-|用2于ss串试0展xx🚀示中测测|ss文",
"=串-用2于-测22中中中试串展s中示🚀测测s|测中文",
"=串用2于测s-试2中中0中展串🚀中示s中|文",
"=2串2中2中2中s用-于0测🚀试展示测s测测中中串文|",
"=串用2串2中🚀2-中于0测试中展中示s中文|串",
"=串2🚀用1于-2测2中20中试s中展s示中文测|测测测测串",
],
"=串用于试展示中文"
),
no_match: (["ABC", "DEF"], ""),
simple_case: (["ABC", "AC", "BAC"], "AC"),
unicode: (
[
"串用于测试展示测中测中测测🚀测测串文",
"串串用于测试测中中展示测测中🚀文串",
"串用于测试展中中中中中示中测🚀测测文",
"串用于测串试展示中测测文",
"串用于测中中中试串展中示🚀测测测中文",
"串用于测试中中中展串🚀中示中文",
"串中中中用于测🚀试展示测测测中中串文",
"串用串中🚀中于测试中展中示中文串",
"串🚀用于测中中试中展示中文测测测测测串",
],
"串用于测试展示中文"
),
}
}