Skip to content

Commit e54c85b

Browse files
committed
Merge branch 'issues/69-broken-surrogate-pairs'
Stop breaking surrogate pairs in toDelta()/fromDelta() Resolves google#69 for the following languages: - Objective-C - Java - JavaScript - Python2 - Python3 Sometimes we can find a common prefix that runs into the middle of a surrogate pair and we split that pair when building our diff groups. This is fine as long as we are operating on UTF-16 code units. It becomes problematic when we start trying to treat those substrings as valid Unicode (or UTF-8) sequences. When we pass these split groups into `toDelta()` we do just that and the library crashes. In this patch we're post-processing the diff groups before encoding them to make sure that we un-split the surrogate pairs. The post-processed diffs should produce the same output when applying the diffs. The diff string itself will be different but should change that much - only by a single character at surrogate boundaries. Alternative approaches: ========= - The [`dissimilar`](https://docs.rs/dissimilar/latest/dissimilar/) library in Rust takes a more comprehensive approach with its `cleanup_char_boundary()` method. Since that approach resolves the issue everywhere and not just in to/from Delta, it's worth exploring as a replacement for this patch. Remaining work to do: ======== -[ ] Fix CPP or verify not a problem -[ ] Fix CSharp or verify not a problem -[ ] Fix Dart or verify not a problem -[ ] Fix Lua or verify not a problem -[x] Refactor to use cleanupSplitSurrogates in JavaScript -[x] Refactor to use cleanupSplitSurrogates in Java -[ ] Refactor to use cleanupSplitSurrogates in Objective C -[ ] Refactor to use cleanupSplitSurrogates in Python2 -[ ] Refactor to use cleanupSplitSurrogates in Python3 -[ ] Refactor to use cleanupSplitSurrogates in CPP -[ ] Refactor to use cleanupSplitSurrogates in CSharp -[ ] Refactor to use cleanupSplitSurrogates in Dart -[ ] Refactor to use cleanupSplitSurrogates in Lua -[x] Fix patch_toText in JavaScript -[ ] Fix patch_toText in Java -[ ] Fix patch_toText in Objective C -[ ] Fix patch_toText in Python2 -[ ] Fix patch_toText in Python3 -[ ] Fix patch_toText in CPP -[ ] Fix patch_toText in CSharp -[ ] Fix patch_toText in Dart -[ ] Fix patch_toText in Lua -[ ] Figure out a "minimal" set of unit tests so we can get rid of the big chunk currently in the PR, then carry it around to all the libraries. The triggers are well understood, so we can write targeted tests instead of broad ones.
2 parents 62f2e68 + 50f1542 commit e54c85b

File tree

11 files changed

+992
-73
lines changed

11 files changed

+992
-73
lines changed

java/src/name/fraser/neil/plaintext/diff_match_patch.java

+141-8
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package name.fraser.neil.plaintext;
2020

2121
import java.io.UnsupportedEncodingException;
22+
import java.lang.Character;
2223
import java.net.URLDecoder;
2324
import java.net.URLEncoder;
2425
import java.util.*;
@@ -1293,6 +1294,46 @@ public void diff_cleanupMerge(LinkedList<Diff> diffs) {
12931294
}
12941295
}
12951296

1297+
/**
1298+
* Rearrange diff boudnaries that split Unicode surrogate pairs.
1299+
* @param diffs Linked list of diff objects
1300+
*/
1301+
public void diff_cleanupSplitSurrogates(List<Diff> diffs) {
1302+
char lastEnd = 0;
1303+
boolean isFirst = true;
1304+
HashSet<Diff> toRemove = new HashSet<Diff>();
1305+
1306+
for (Diff aDiff : diffs) {
1307+
if (aDiff.text.isEmpty()) {
1308+
toRemove.add(aDiff);
1309+
continue;
1310+
}
1311+
1312+
char thisTop = aDiff.text.charAt(0);
1313+
char thisEnd = aDiff.text.charAt(aDiff.text.length() - 1);
1314+
1315+
if (Character.isHighSurrogate(thisEnd)) {
1316+
lastEnd = thisEnd;
1317+
aDiff.text = aDiff.text.substring(0, aDiff.text.length() - 1);
1318+
}
1319+
1320+
if (!isFirst && Character.isHighSurrogate(lastEnd) && Character.isLowSurrogate(thisTop)) {
1321+
aDiff.text = lastEnd + aDiff.text;
1322+
}
1323+
1324+
isFirst = false;
1325+
1326+
if ( aDiff.text.isEmpty() ) {
1327+
toRemove.add(aDiff);
1328+
continue;
1329+
}
1330+
}
1331+
1332+
for (Diff aDiff : toRemove) {
1333+
diffs.remove(aDiff);
1334+
}
1335+
}
1336+
12961337
/**
12971338
* loc is a location in text1, compute and return the equivalent location in
12981339
* text2.
@@ -1429,6 +1470,7 @@ public int diff_levenshtein(List<Diff> diffs) {
14291470
*/
14301471
public String diff_toDelta(List<Diff> diffs) {
14311472
StringBuilder text = new StringBuilder();
1473+
this.diff_cleanupSplitSurrogates(diffs);
14321474
for (Diff aDiff : diffs) {
14331475
switch (aDiff.operation) {
14341476
case INSERT:
@@ -1457,6 +1499,103 @@ public String diff_toDelta(List<Diff> diffs) {
14571499
return delta;
14581500
}
14591501

1502+
private int digit16(char b) throws IllegalArgumentException {
1503+
switch (b) {
1504+
case '0': return 0;
1505+
case '1': return 1;
1506+
case '2': return 2;
1507+
case '3': return 3;
1508+
case '4': return 4;
1509+
case '5': return 5;
1510+
case '6': return 6;
1511+
case '7': return 7;
1512+
case '8': return 8;
1513+
case '9': return 9;
1514+
case 'A': case 'a': return 10;
1515+
case 'B': case 'b': return 11;
1516+
case 'C': case 'c': return 12;
1517+
case 'D': case 'd': return 13;
1518+
case 'E': case 'e': return 14;
1519+
case 'F': case 'f': return 15;
1520+
default:
1521+
throw new IllegalArgumentException();
1522+
}
1523+
}
1524+
1525+
private String decodeURI(String text) throws IllegalArgumentException {
1526+
int i = 0;
1527+
StringBuilder decoded = new StringBuilder(text.length());
1528+
1529+
while (i < text.length()) {
1530+
if (text.charAt(i) != '%') {
1531+
decoded.append(text.charAt(i++));
1532+
continue;
1533+
}
1534+
1535+
// start a percent-sequence
1536+
int byte1 = (digit16(text.charAt(i + 1)) << 4) + digit16(text.charAt(i + 2));
1537+
if ((byte1 & 0x80) == 0) {
1538+
decoded.append(Character.toChars(byte1));
1539+
i += 3;
1540+
continue;
1541+
}
1542+
1543+
if ( text.charAt(i + 3) != '%') {
1544+
throw new IllegalArgumentException();
1545+
}
1546+
1547+
int byte2 = (digit16(text.charAt(i + 4)) << 4) + digit16(text.charAt(i + 5));
1548+
if ((byte2 & 0xC0) != 0x80) {
1549+
throw new IllegalArgumentException();
1550+
}
1551+
byte2 = byte2 & 0x3F;
1552+
if ((byte1 & 0xE0) == 0xC0) {
1553+
decoded.append(Character.toChars(((byte1 & 0x1F) << 6) | byte2));
1554+
i += 6;
1555+
continue;
1556+
}
1557+
1558+
if (text.charAt(i + 6) != '%') {
1559+
throw new IllegalArgumentException();
1560+
}
1561+
1562+
int byte3 = (digit16(text.charAt(i + 7)) << 4) + digit16(text.charAt(i + 8));
1563+
if ((byte3 & 0xC0) != 0x80) {
1564+
throw new IllegalArgumentException();
1565+
}
1566+
byte3 = byte3 & 0x3F;
1567+
if ((byte1 & 0xF0) == 0xE0) {
1568+
// unpaired surrogate are fine here
1569+
decoded.append(Character.toChars(((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3));
1570+
i += 9;
1571+
continue;
1572+
}
1573+
1574+
if (text.charAt(i + 9) != '%') {
1575+
throw new IllegalArgumentException();
1576+
}
1577+
1578+
int byte4 = (digit16(text.charAt(i + 10)) << 4) + digit16(text.charAt(i + 11));
1579+
if ((byte4 & 0xC0) != 0x80) {
1580+
throw new IllegalArgumentException();
1581+
}
1582+
byte4 = byte4 & 0x3F;
1583+
if ((byte1 & 0xF8) == 0xF0) {
1584+
int codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4;
1585+
if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
1586+
decoded.append(Character.toChars((codePoint & 0xFFFF) >>> 10 & 0x3FF | 0xD800));
1587+
decoded.append(Character.toChars(0xDC00 | (codePoint & 0xFFFF) & 0x3FF));
1588+
i += 12;
1589+
continue;
1590+
}
1591+
}
1592+
1593+
throw new IllegalArgumentException();
1594+
}
1595+
1596+
return decoded.toString();
1597+
}
1598+
14601599
/**
14611600
* Given the original text1, and an encoded string which describes the
14621601
* operations required to transform text1 into text2, compute the full diff.
@@ -1483,10 +1622,7 @@ public LinkedList<Diff> diff_fromDelta(String text1, String delta)
14831622
// decode would change all "+" to " "
14841623
param = param.replace("+", "%2B");
14851624
try {
1486-
param = URLDecoder.decode(param, "UTF-8");
1487-
} catch (UnsupportedEncodingException e) {
1488-
// Not likely on modern system.
1489-
throw new Error("This system does not support UTF-8.", e);
1625+
param = this.decodeURI(param);
14901626
} catch (IllegalArgumentException e) {
14911627
// Malformed URI sequence.
14921628
throw new IllegalArgumentException(
@@ -2269,10 +2405,7 @@ public List<Patch> patch_fromText(String textline)
22692405
line = text.getFirst().substring(1);
22702406
line = line.replace("+", "%2B"); // decode would change all "+" to " "
22712407
try {
2272-
line = URLDecoder.decode(line, "UTF-8");
2273-
} catch (UnsupportedEncodingException e) {
2274-
// Not likely on modern system.
2275-
throw new Error("This system does not support UTF-8.", e);
2408+
line = this.decodeURI(line);
22762409
} catch (IllegalArgumentException e) {
22772410
// Malformed URI sequence.
22782411
throw new IllegalArgumentException(

java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java

+36
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,42 @@ public static void testDiffDelta() {
424424

425425
assertEquals("diff_fromDelta: Unicode.", diffs, dmp.diff_fromDelta(text1, delta));
426426

427+
diffs = diffList(new Diff(EQUAL, "\ud83d\ude4b\ud83d"), new Diff(INSERT, "\ude4c\ud83d"), new Diff(EQUAL, "\ude4b"));
428+
delta = dmp.diff_toDelta(diffs);
429+
assertEquals("diff_toDelta: Surrogate Pairs.", "=2\t+%F0%9F%99%8C\t=2", delta);
430+
431+
assertEquals(
432+
"diff_toDelta: insert surrogate pair between similar high surrogates",
433+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd70"), new Diff(EQUAL, "\ud83c\udd71"))),
434+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70\ud83c"), new Diff(INSERT, "\udd70\ud83c"), new Diff(EQUAL, "\udd71")))
435+
);
436+
437+
assertEquals(
438+
"diff_toDelta: swap surrogate pairs delete/insert",
439+
dmp.diff_toDelta(diffList(new Diff(DELETE, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd71"))),
440+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(DELETE, "\udd70"), new Diff(INSERT, "\udd71")))
441+
);
442+
443+
assertEquals(
444+
"diff_toDelta: swap surrogate pairs insert/delete",
445+
dmp.diff_toDelta(diffList(new Diff(INSERT, "\ud83c\udd70"), new Diff(DELETE, "\ud83c\udd71"))),
446+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(INSERT, "\udd70"), new Diff(DELETE, "\udd71")))
447+
);
448+
449+
assertEquals(
450+
"diff_toDelta: empty diff groups",
451+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(DELETE, ""), new Diff(INSERT, "ghijk"))),
452+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(INSERT, "ghijk")))
453+
);
454+
455+
// Different versions of the library may have created deltas with
456+
// half of a surrogate pair encoded as if it were valid UTF-8
457+
assertEquals(
458+
"diff_toDelta: surrogate half encoded as UTF8",
459+
dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "-2\t+%F0%9F%85%B1")),
460+
dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "=1\t-1\t+%ED%B5%B1"))
461+
);
462+
427463
// Verify pool of unchanged characters.
428464
diffs = diffList(new Diff(INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # "));
429465
String text2 = dmp.diff_text2(diffs);

0 commit comments

Comments
 (0)