Skip to content

Commit fdb2b05

Browse files
authored
Merge pull request #2 from hackmdio/fix/pathc-stop-breaking-surrogate-pairs
fix/stop breaking surrogate pairs
2 parents c2f8fb9 + 1964735 commit fdb2b05

File tree

2 files changed

+309
-2
lines changed

2 files changed

+309
-2
lines changed

index.js

+177-1
Original file line numberDiff line numberDiff line change
@@ -1176,6 +1176,166 @@ diff_match_patch.prototype.diff_cleanupMerge = function(diffs) {
11761176
}
11771177
};
11781178

1179+
/**
 * Rearrange diff boundaries that split Unicode surrogate pairs.
 *
 * A character-level diff works on UTF-16 code units, so a boundary between two
 * tuples can fall between the high and the low half of one character. Such
 * halves cannot be URI-encoded on their own. This pass repairs every run of
 * edits (DELETE/INSERT tuples between two equalities) that touches a split:
 *
 *   1. a high surrogate ending the preceding equality and a low surrogate
 *      starting the following equality are moved into BOTH the deleted and
 *      the inserted text of the run, so both sides hold whole characters;
 *   2. whole characters that the deleted and inserted text then share at
 *      their start/end are handed back to the neighbouring equalities.
 *
 * The old and the new text described by the diff are never changed; only the
 * tuple boundaries move. Runs that do not touch a surrogate boundary are left
 * exactly as they were (they are not merged). Empty tuples are removed.
 * Surrogates that are genuinely unpaired in the input are kept in place.
 *
 * Tuples are plain `[operation, text]` arrays. The array passed in is
 * modified in place (callers such as patch_obj.toString rely on that) and is
 * also returned.
 *
 * @param {!Array.<!diff_match_patch.Diff>} diffs Array of diff tuples.
 * @return {!Array.<!diff_match_patch.Diff>} The same array, cleaned up.
 */
diff_match_patch.prototype.diff_cleanupSplitSurrogates = function(diffs) {
  var self = this;
  var k;

  var startsWithLow = function(s) {
    return s.length > 0 && self.isLowSurrogate(s.charAt(0));
  };
  var endsWithHigh = function(s) {
    return s.length > 0 && self.isHighSurrogate(s.charAt(s.length - 1));
  };
  // True when cutting `s` at `index` separates a high surrogate from the low
  // surrogate that directly follows it.
  var splitsPair = function(s, index) {
    return index > 0 && index < s.length &&
        self.isHighSurrogate(s.charAt(index - 1)) &&
        self.isLowSurrogate(s.charAt(index));
  };

  // Drop empty tuples first so that every run of edits is maximal.
  for (k = diffs.length - 1; k >= 0; k--) {
    if (diffs[k][1].length === 0) {
      diffs.splice(k, 1);
    }
  }

  var result = [];
  var x = 0;
  while (x < diffs.length) {
    var current = diffs[x];

    if (current[1].length === 0) {
      // An equality that gave its only character to the preceding run.
      x++;
      continue;
    }

    if (current[0] === DIFF_EQUAL) {
      var before = result.length ? result[result.length - 1] : null;
      if (before && before[0] === DIFF_EQUAL &&
          endsWithHigh(before[1]) && startsWithLow(current[1])) {
        // Two adjacent equalities that split a pair: join them.
        before[1] += current[1];
      } else {
        result.push(current);
      }
      x++;
      continue;
    }

    // Collect the maximal run of edits starting here.
    var run = [];
    while (x < diffs.length && diffs[x][0] !== DIFF_EQUAL) {
      run.push(diffs[x++]);
    }
    var prevEqual =
        (result.length && result[result.length - 1][0] === DIFF_EQUAL) ?
        result[result.length - 1] : null;
    var nextEqual = x < diffs.length ? diffs[x] : null;
    var head = prevEqual ? prevEqual[1] : '';
    var tail = nextEqual ? nextEqual[1] : '';

    var damaged = endsWithHigh(head) || startsWithLow(tail);
    for (k = 0; k < run.length && !damaged; k++) {
      damaged = startsWithLow(run[k][1]) || endsWithHigh(run[k][1]);
    }
    if (!damaged) {
      // Nothing near this run touches a surrogate boundary: keep it verbatim.
      for (k = 0; k < run.length; k++) {
        result.push(run[k]);
      }
      continue;
    }

    // Merge the run into one deleted and one inserted text, remembering
    // whether the caller listed an insertion before the first deletion.
    var del = '';
    var ins = '';
    var sawDelete = false;
    var sawInsert = false;
    var insertFirst = false;
    for (k = 0; k < run.length; k++) {
      if (run[k][0] === DIFF_DELETE) {
        if (!sawDelete && sawInsert) {
          insertFirst = true;
        }
        sawDelete = true;
        del += run[k][1];
      } else {
        sawInsert = true;
        ins += run[k][1];
      }
    }

    // Step 1: pull the stray halves out of the equalities. They are part of
    // both texts, so they go onto both sides of the edit.
    if (endsWithHigh(head)) {
      var high = head.charAt(head.length - 1);
      head = head.slice(0, -1);
      del = high + del;
      ins = high + ins;
    }
    if (startsWithLow(tail)) {
      var low = tail.charAt(0);
      tail = tail.slice(1);
      del += low;
      ins += low;
    }

    // Step 2: give shared whole characters back to the equalities, never
    // cutting between the two halves of a pair.
    var limit = Math.min(del.length, ins.length);
    var prefixLength = 0;
    while (prefixLength < limit &&
           del.charAt(prefixLength) === ins.charAt(prefixLength)) {
      prefixLength++;
    }
    if (splitsPair(del, prefixLength) || splitsPair(ins, prefixLength)) {
      prefixLength--;
    }
    head += del.slice(0, prefixLength);
    del = del.slice(prefixLength);
    ins = ins.slice(prefixLength);

    limit = Math.min(del.length, ins.length);
    var suffixLength = 0;
    while (suffixLength < limit &&
           del.charAt(del.length - 1 - suffixLength) ===
           ins.charAt(ins.length - 1 - suffixLength)) {
      suffixLength++;
    }
    if (splitsPair(del, del.length - suffixLength) ||
        splitsPair(ins, ins.length - suffixLength)) {
      suffixLength--;
    }
    tail = del.slice(del.length - suffixLength) + tail;
    del = del.slice(0, del.length - suffixLength);
    ins = ins.slice(0, ins.length - suffixLength);

    // Write the repaired pieces back.
    if (prevEqual) {
      prevEqual[1] = head;
      if (head.length === 0) {
        result.pop();
      }
    } else if (head.length > 0) {
      result.push([DIFF_EQUAL, head]);
    }

    var edits = insertFirst ?
        [[DIFF_INSERT, ins], [DIFF_DELETE, del]] :
        [[DIFF_DELETE, del], [DIFF_INSERT, ins]];
    for (k = 0; k < edits.length; k++) {
      if (edits[k][1].length > 0) {
        result.push(edits[k]);
      }
    }

    if (nextEqual) {
      // Visited by the next loop iteration (and skipped there if now empty).
      nextEqual[1] = tail;
    } else if (tail.length > 0) {
      result.push([DIFF_EQUAL, tail]);
    }
  }

  // Replace the contents of the caller's array in place.
  diffs.length = 0;
  for (k = 0; k < result.length; k++) {
    diffs.push(result[k]);
  }
  return diffs;
};
1213+
1214+
/**
 * Tell whether a string begins with a high (leading) surrogate code unit,
 * i.e. one in the range U+D800..U+DBFF.
 *
 * @param {string} c String whose first UTF-16 code unit is inspected.
 * @return {boolean} True for a high surrogate; false otherwise, including
 *     for the empty string.
 */
diff_match_patch.prototype.isHighSurrogate = function(c) {
  // All high surrogates share the top six bits 110110.
  var codeUnit = c.charCodeAt(0);
  return (codeUnit & 0xFC00) === 0xD800;
};
1218+
1219+
/**
 * Tell whether a string begins with a low (trailing) surrogate code unit,
 * i.e. one in the range U+DC00..U+DFFF.
 *
 * @param {string} c String whose first UTF-16 code unit is inspected.
 * @return {boolean} True for a low surrogate; false otherwise, including
 *     for the empty string.
 */
diff_match_patch.prototype.isLowSurrogate = function(c) {
  // All low surrogates share the top six bits 110111.
  var codeUnit = c.charCodeAt(0);
  return (codeUnit & 0xFC00) === 0xDC00;
};
1223+
1224+
/**
 * Convert one hexadecimal digit character to its numeric value.
 *
 * @param {string} c A single character: '0'-'9', 'a'-'f' or 'A'-'F'.
 * @return {number} The value 0..15.
 * @throws {Error} 'Invalid hex-code' for anything else (including
 *     undefined, which is what indexing past the end of a string yields).
 */
diff_match_patch.prototype.digit16 = function(c) {
  if (typeof c === 'string' && c.length === 1) {
    var code = c.charCodeAt(0);
    if (code >= 48 && code <= 57) {    // '0'..'9'
      return code - 48;
    }
    if (code >= 65 && code <= 70) {    // 'A'..'F'
      return code - 55;
    }
    if (code >= 97 && code <= 102) {   // 'a'..'f'
      return code - 87;
    }
  }
  throw new Error('Invalid hex-code');
};
1245+
1246+
/**
 * Decode a URI-encoded string, but allow for encoded surrogate halves.
 *
 * diff_match_patch needs this relaxation of the requirements because
 * not all libraries and versions produce valid URI strings in toDelta
 * and we don't want to crash this code when the input is valid input
 * but at the same time invalid UTF-8.
 *
 * The native decodeURI() is tried first. Only when it throws is the text
 * decoded by the hand-written UTF-8 decoder below, which additionally accepts
 * a three-byte sequence that encodes a lone surrogate.
 *
 * NOTE: the two paths differ for escapes of reserved characters. The native
 * decodeURI() leaves e.g. '%3A' untouched, whereas the fallback decodes
 * every escaped ASCII byte, so '%3A' becomes ':' there.
 *
 * @example: decodeURI( 'abcd %F0%9F%85%B0' ) = 'abcd \ud83c\udd70'
 * @example: decodeURI( 'abcd %ED%A0%BC' ) = 'abcd \ud83c'
 *
 * @cite: @mathiasbynens utf8.js at https://github.com/mathiasbynens/utf8.js
 *
 * @param {String} text input string encoded by encodeURI() or equivalent
 * @return {String}
 * @throws {URIError} When a percent-sequence is truncated, a continuation
 *     byte is missing or malformed, or the lead byte is not valid.
 * @throws {Error} 'Invalid hex-code' (from digit16) when a '%' is not
 *     followed by two hexadecimal digits.
 */
diff_match_patch.prototype.decodeURI = function(text) {
  try {
    return decodeURI(text);
  } catch ( e ) {
    // Fall through to the lenient decoder.
  }

  var self = this;

  // Value of the two hex digits following the '%' at position `at`.
  var readByte = function(at) {
    return (self.digit16(text[at + 1]) << 4) + self.digit16(text[at + 2]);
  };

  // Payload (low six bits) of the UTF-8 continuation byte escaped at `at`.
  var readContinuation = function(at) {
    if ('%' !== text[at]) {
      throw new URIError('URI malformed');
    }
    var value = readByte(at);
    if ((value & 0xC0) !== 0x80) {
      throw new URIError('URI malformed');
    }
    return value & 0x3F;
  };

  var decoded = '';
  var i = 0;

  while (i < text.length) {
    if (text[i] !== '%') {
      decoded += text[i++];
      continue;
    }

    // start a percent-sequence
    var byte1 = readByte(i);
    if ((byte1 & 0x80) === 0) {
      // Single byte: plain ASCII.
      decoded += String.fromCharCode(byte1);
      i += 3;
      continue;
    }

    var byte2 = readContinuation(i + 3);
    if ((byte1 & 0xE0) === 0xC0) {
      // Two-byte sequence.
      decoded += String.fromCharCode(((byte1 & 0x1F) << 6) | byte2);
      i += 6;
      continue;
    }

    var byte3 = readContinuation(i + 6);
    if ((byte1 & 0xF0) === 0xE0) {
      // Three-byte sequence; unpaired surrogates are fine here.
      decoded += String.fromCharCode(((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3);
      i += 9;
      continue;
    }

    var byte4 = readContinuation(i + 9);
    if ((byte1 & 0xF8) === 0xF0) {
      // Four-byte sequence: a supplementary code point, stored as a pair.
      var codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4;
      if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
        // UTF-16 encodes (codePoint - 0x10000) as two 10-bit halves. Masking
        // with 0xFFFF instead is only right for U+10000..U+1FFFF.
        var offset = codePoint - 0x10000;
        decoded += String.fromCharCode(
            0xD800 | (offset >>> 10),
            0xDC00 | (offset & 0x3FF));
        i += 12;
        continue;
      }
    }

    throw new URIError('URI malformed');
  }

  return decoded;
};
11791339

11801340
/**
11811341
* loc is a location in text1, compute and return the equivalent location in
@@ -1219,6 +1379,7 @@ diff_match_patch.prototype.diff_xIndex = function(diffs, loc) {
12191379
* @return {string} HTML representation.
12201380
*/
12211381
diff_match_patch.prototype.diff_prettyHtml = function(diffs) {
1382+
diffs = this.diff_cleanupSplitSurrogates(diffs);
12221383
var html = [];
12231384
var pattern_amp = /&/g;
12241385
var pattern_lt = /</g;
@@ -1319,6 +1480,7 @@ diff_match_patch.prototype.diff_levenshtein = function(diffs) {
13191480
* @return {string} Delta text.
13201481
*/
13211482
diff_match_patch.prototype.diff_toDelta = function(diffs) {
1483+
diffs = this.diff_cleanupSplitSurrogates(diffs);
13221484
var text = [];
13231485
for (var x = 0; x < diffs.length; x++) {
13241486
switch (diffs[x][0]) {
@@ -1361,7 +1523,7 @@ diff_match_patch.prototype.diff_fromDelta = function(text1, delta) {
13611523
switch (tokens[x].charAt(0)) {
13621524
case '+':
13631525
try {
1364-
diffs[diffsLength++] = [DIFF_INSERT, decodeURI(param)];
1526+
diffs[diffsLength++] = [DIFF_INSERT, this.decodeURI(param)];
13651527
} catch (ex) {
13661528
// Malformed URI sequence.
13671529
throw new Error('Illegal escape in diff_fromDelta: ' + param);
@@ -1597,11 +1759,23 @@ diff_match_patch.prototype.patch_addContext_ = function(patch, text) {
15971759
padding += this.Patch_Margin;
15981760

15991761
// Add the prefix.
1762+
if (
1763+
patch.start2 - padding > 0 &&
1764+
diff_match_patch.prototype.isLowSurrogate(text[patch.start2 - padding])
1765+
) {
1766+
padding++;
1767+
}
16001768
var prefix = text.substring(patch.start2 - padding, patch.start2);
16011769
if (prefix) {
16021770
patch.diffs.unshift([DIFF_EQUAL, prefix]);
16031771
}
16041772
// Add the suffix.
1773+
if (
1774+
patch.start2 + patch.length1 + padding < text.length &&
1775+
diff_match_patch.prototype.isHighSurrogate(text[patch.start2 + patch.length1 + padding])
1776+
) {
1777+
padding++;
1778+
}
16051779
var suffix = text.substring(patch.start2 + patch.length1,
16061780
patch.start2 + patch.length1 + padding);
16071781
if (suffix) {
@@ -1675,6 +1849,7 @@ diff_match_patch.prototype.patch_make = function(a, opt_b, opt_c) {
16751849
if (diffs.length === 0) {
16761850
return []; // Get rid of the null case.
16771851
}
1852+
diffs = this.diff_cleanupSplitSurrogates(diffs);
16781853
var patches = [];
16791854
var patch = new diff_match_patch.patch_obj();
16801855
var patchDiffLength = 0; // Keeping our own length var is faster in JS.
@@ -2171,6 +2346,7 @@ diff_match_patch.patch_obj.prototype.toString = function() {
21712346
var text = ['@@ -' + coords1 + ' +' + coords2 + ' @@\n'];
21722347
var op;
21732348
// Escape the body of the patch with %xx notation.
2349+
diff_match_patch.prototype.diff_cleanupSplitSurrogates(this.diffs);
21742350
for (var x = 0; x < this.diffs.length; x++) {
21752351
switch (this.diffs[x][0]) {
21762352
case DIFF_INSERT:

test/index.js

+132-1
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,137 @@ function testDiffDelta() {
483483
// Convert delta string into a diff.
484484
assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta));
485485

486+
487+
diffs = [[DIFF_EQUAL, '\ud83d\ude4b\ud83d'], [DIFF_INSERT, '\ude4c\ud83d'], [DIFF_EQUAL, '\ude4b']];
488+
try {
489+
delta = dmp.diff_toDelta(diffs);
490+
assertEquals('=2\t+%F0%9F%99%8C\t=2', delta);
491+
} catch ( e ) {
492+
assertEquals(false, true);
493+
}
494+
495+
(function(){
496+
const originalText = `U+1F17x 🅰️ 🅱️ 🅾️ 🅿️ safhawifhkw
497+
U+1F18x 🆎
498+
0 1 2 3 4 5 6 7 8 9 A B C D E F
499+
U+1F19x 🆑 🆒 🆓 🆔 🆕 🆖 🆗 🆘 🆙 🆚
500+
U+1F20x 🈁 🈂️ sfss.,_||saavvvbbds
501+
U+1F21x 🈚
502+
U+1F22x 🈯
503+
U+1F23x 🈲 🈳 🈴 🈵 🈶 🈷️ 🈸 🈹 🈺
504+
U+1F25x 🉐 🉑
505+
U+1F30x 🌀 🌁 🌂 🌃 🌄 🌅 🌆 🌇 🌈 🌉 🌊 🌋 🌌 🌍 🌎 🌏
506+
U+1F31x 🌐 🌑 🌒 🌓 🌔 🌕 🌖 🌗 🌘 🌙 🌚 🌛 🌜 🌝 🌞 `;
507+
508+
/**
 * Applies a few random edits to a string and returns the new, edited string.
 *
 * One kind of edit is chosen per call, each with probability 1/3:
 *   - swap:   exchange two randomly chosen characters,
 *   - remove: delete a randomly chosen character,
 *   - add:    append a copy of one random character to another.
 * The edit is repeated 0 to 4 times. The text is handled per code point, so
 * an edit never lands between the two halves of a surrogate pair.
 *
 * @param {string} text The text to edit.
 * @param {function(): number} [random=Math.random] Source of numbers in
 *     [0, 1). Pass a deterministic function to reproduce a failing case.
 * @return {string} The edited text; an empty text is returned unchanged.
 */
function applyRandomTextEdit(text, random = Math.random) {
  const textArr = [...text];
  if (textArr.length === 0) {
    // Nothing to pick from; the "add" edit would otherwise produce "NaN".
    return text;
  }

  const randomIndex = () => Math.floor(random() * textArr.length);
  const r = random();
  const editCount = Math.floor(random() * 5);

  for (let i = 0; i < editCount; i++) {
    if (r < 1 / 3) { // swap
      const swapPos1 = randomIndex();
      const swapPos2 = randomIndex();
      const char1 = textArr[swapPos1];
      textArr[swapPos1] = textArr[swapPos2];
      textArr[swapPos2] = char1;
    } else if (r < 2 / 3) { // remove
      textArr[randomIndex()] = "";
    } else { // add
      const addPos = randomIndex();
      const addFromPos = randomIndex();
      textArr[addPos] = textArr[addPos] + textArr[addFromPos];
    }
  }

  return textArr.join("");
}
538+
539+
for(let i = 0; i < 1000; i++) {
540+
const newText = applyRandomTextEdit(originalText);
541+
dmp.diff_toDelta(dmp.diff_main(originalText, newText));
542+
}
543+
})();
544+
545+
// Unicode - splitting surrogates
546+
try {
547+
assertEquivalent(
548+
dmp.diff_toDelta([[DIFF_INSERT,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]),
549+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd71\ud83c\udd70\ud83c\udd71'))
550+
);
551+
} catch ( e ) {
552+
assertEquals('Inserting similar surrogate pair at beginning', 'crashed');
553+
}
554+
555+
try {
556+
assertEquivalent(
557+
dmp.diff_toDelta([[DIFF_EQUAL,'\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd70'], [DIFF_EQUAL, '\ud83c\udd71']]),
558+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd70\ud83c\udd71'))
559+
);
560+
} catch ( e ) {
561+
assertEquals('Inserting similar surrogate pair in the middle', 'crashed');
562+
}
563+
564+
try {
565+
assertEquivalent(
566+
dmp.diff_toDelta([[DIFF_DELETE,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]),
567+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd71\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
568+
);
569+
} catch ( e ) {
570+
assertEquals('Deleting similar surrogate pair at the beginning', 'crashed');
571+
}
572+
573+
try {
574+
assertEquivalent(
575+
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c\udd70'], [DIFF_DELETE,'\ud83c\udd72'], [DIFF_EQUAL, '\ud83c\udd71']]),
576+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd72\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
577+
);
578+
} catch ( e ) {
579+
assertEquals('Deleting similar surrogate pair in the middle', 'crashed');
580+
}
581+
582+
try {
583+
assertEquivalent(
584+
dmp.diff_toDelta([[DIFF_DELETE, '\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd71']]),
585+
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_DELETE, '\udd70'], [DIFF_INSERT, '\udd71']]),
586+
);
587+
} catch ( e ) {
588+
assertEquals('Swap surrogate pair', 'crashed');
589+
}
590+
591+
try {
592+
assertEquivalent(
593+
dmp.diff_toDelta([[DIFF_INSERT, '\ud83c\udd70'], [DIFF_DELETE, '\ud83c\udd71']]),
594+
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_INSERT, '\udd70'], [DIFF_DELETE, '\udd71']]),
595+
);
596+
} catch ( e ) {
597+
assertEquals('Swap surrogate pair', 'crashed');
598+
}
599+
600+
// Empty diff groups
601+
assertEquivalent(
602+
dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_DELETE, ''], [DIFF_INSERT, 'ghijk']]),
603+
dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_INSERT, 'ghijk']]),
604+
);
605+
606+
// Different versions of the library may have created deltas with
607+
// half of a surrogate pair encoded as if it were valid UTF-8
608+
try {
609+
assertEquivalent(
610+
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '-2\t+%F0%9F%85%B1')),
611+
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '=1\t-1\t+%ED%B5%B1'))
612+
);
613+
} catch ( e ) {
614+
assertEquals('Decode UTF8-encoded surrogate half', 'crashed');
615+
}
616+
486617
// Verify pool of unchanged characters.
487618
diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']];
488619
var text2 = dmp.diff_text2(diffs);
@@ -1019,4 +1150,4 @@ var tests = [
10191150

10201151
for (var x = 0; x < tests.length; x++) {
10211152
test(tests[x], eval(tests[x]))
1022-
}
1153+
}

0 commit comments

Comments
 (0)