Skip to content
This repository was archived by the owner on Aug 5, 2024. It is now read-only.

Commit 461b008

Browse files
committed
JavaScript: Stop breaking surrogate pairs in toDelta()
Resolves #69 for JavaScript. Sometimes we can find a common prefix that runs into the middle of a surrogate pair, and we split that pair when building our diff groups. This is fine as long as we are operating on UTF-16 code units. It becomes problematic when we start trying to treat those substrings as valid Unicode (or UTF-8) sequences. When we pass these split groups into `toDelta()` we do just that, and the library crashes. In this patch we post-process the diff groups before encoding them to make sure that we un-split the surrogate pairs. The post-processed diffs should produce the same output when the diffs are applied. The diff string itself will be different but should not change that much - only by a single character at surrogate boundaries.
1 parent 62f2e68 commit 461b008

File tree

2 files changed

+83
-0
lines changed

2 files changed

+83
-0
lines changed

javascript/diff_match_patch_uncompressed.js

+25
Original file line numberDiff line numberDiff line change
@@ -1339,6 +1339,15 @@ diff_match_patch.prototype.diff_levenshtein = function(diffs) {
13391339
return levenshtein;
13401340
};
13411341

1342+
/**
 * Test whether a string starts with a UTF-16 high (leading) surrogate.
 * Only the first code unit of the argument is examined.
 * @param {string} c String whose first code unit is checked.
 * @return {boolean} True if that code unit lies in 0xD800..0xDBFF.
 */
diff_match_patch.prototype.isHighSurrogate = function(c) {
  var codeUnit = c.charCodeAt(0);
  return 0xD800 <= codeUnit && codeUnit <= 0xDBFF;
};
1346+
1347+
/**
 * Test whether a string starts with a UTF-16 low (trailing) surrogate.
 * Only the first code unit of the argument is examined.
 * @param {string} c String whose first code unit is checked.
 * @return {boolean} True if that code unit lies in 0xDC00..0xDFFF.
 */
diff_match_patch.prototype.isLowSurrogate = function(c) {
  var codeUnit = c.charCodeAt(0);
  return 0xDC00 <= codeUnit && codeUnit <= 0xDFFF;
};
13421351

13431352
/**
13441353
* Crush the diff into an encoded string which describes the operations
@@ -1350,7 +1359,23 @@ diff_match_patch.prototype.diff_levenshtein = function(diffs) {
13501359
*/
13511360
diff_match_patch.prototype.diff_toDelta = function(diffs) {
13521361
var text = [];
1362+
var lastEnd;
13531363
for (var x = 0; x < diffs.length; x++) {
1364+
1365+
var thisDiff = diffs[x];
1366+
var thisTop = thisDiff[1][0];
1367+
var thisEnd = thisDiff[1][thisDiff[1].length - 1];
1368+
1369+
if (thisEnd && this.isHighSurrogate(thisEnd)) {
1370+
thisDiff[1] = thisDiff[1].slice(0, -1);
1371+
}
1372+
1373+
if (lastEnd && thisTop && this.isHighSurrogate(lastEnd) && this.isLowSurrogate(thisTop)) {
1374+
thisDiff[1] = lastEnd + thisDiff[1];
1375+
}
1376+
1377+
lastEnd = thisEnd;
1378+
13541379
switch (diffs[x][0]) {
13551380
case DIFF_INSERT:
13561381
text[x] = '+' + encodeURI(diffs[x][1]);

javascript/tests/diff_match_patch_test.js

+58
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,64 @@ function testDiffDelta() {
492492
// Convert delta string into a diff.
493493
assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta));
494494

495+
diffs = [[DIFF_EQUAL, '\ud83d\ude4b\ud83d'], [DIFF_INSERT, '\ude4c\ud83d'], [DIFF_EQUAL, '\ude4b']];
496+
try {
497+
delta = dmp.diff_toDelta(diffs);
498+
assertEquals('=2\t+%F0%9F%99%8C\t=2', delta);
499+
} catch ( e ) {
500+
assertEquals(false, true);
501+
}
502+
503+
(function(){
504+
const originalText = `U+1F17x 🅰️ 🅱️ 🅾️ 🅿️ safhawifhkw
505+
U+1F18x 🆎
506+
0 1 2 3 4 5 6 7 8 9 A B C D E F
507+
U+1F19x 🆑 🆒 🆓 🆔 🆕 🆖 🆗 🆘 🆙 🆚
508+
U+1F20x 🈁 🈂️ sfss.,_||saavvvbbds
509+
U+1F21x 🈚
510+
U+1F22x 🈯
511+
U+1F23x 🈲 🈳 🈴 🈵 🈶 🈷️ 🈸 🈹 🈺
512+
U+1F25x 🉐 🉑
513+
U+1F30x 🌀 🌁 🌂 🌃 🌄 🌅 🌆 🌇 🌈 🌉 🌊 🌋 🌌 🌍 🌎 🌏
514+
U+1F31x 🌐 🌑 🌒 🌓 🌔 🌕 🌖 🌗 🌘 🌙 🌚 🌛 🌜 🌝 🌞 `;
515+
516+
// applies some random edits to string and returns new, edited string
517+
/**
 * Apply one randomly chosen kind of edit to a string and return the result.
 *
 * The text is split with the string iterator, so a well-formed surrogate
 * pair is always moved, removed or duplicated as a single unit. One of three
 * edit kinds is picked with equal probability and repeated 0 to 4 times:
 *   - swap:   exchange the elements at two random positions,
 *   - remove: blank out the element at a random position,
 *   - add:    append a copy of a random element to another random element.
 *
 * @param {string} text Text to edit.
 * @return {string} Edited copy of the text. An empty input yields ''.
 */
function applyRandomTextEdit(text) {
  const textArr = [...text];
  if (textArr.length === 0) {
    // Nothing to edit. Without this guard the "add" branch would evaluate
    // undefined + undefined and the function would return the string "NaN".
    return '';
  }

  // Uniformly random valid index into textArr.
  const randomIndex = () => Math.floor(Math.random() * textArr.length);

  const r = Math.random();
  const editCount = Math.floor(Math.random() * 5);

  if (r < 1 / 3) { // swap
    for (let i = 0; i < editCount; i++) {
      const swapPos1 = randomIndex();
      const swapPos2 = randomIndex();
      [textArr[swapPos1], textArr[swapPos2]] =
          [textArr[swapPos2], textArr[swapPos1]];
    }
  } else if (r < 2 / 3) { // remove
    for (let i = 0; i < editCount; i++) {
      // Blank the slot instead of splicing so later indices stay valid.
      textArr[randomIndex()] = "";
    }
  } else { // add
    for (let i = 0; i < editCount; i++) {
      const addPos = randomIndex();
      const addFromPos = randomIndex();
      textArr[addPos] = textArr[addPos] + textArr[addFromPos];
    }
  }
  return textArr.join("");
}
546+
547+
for(let i = 0; i < 1000; i++) {
548+
newText = applyRandomTextEdit(originalText);
549+
dmp.patch_toText(dmp.patch_make(originalText, newText));
550+
}
551+
});
552+
495553
// Verify pool of unchanged characters.
496554
diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']];
497555
var text2 = dmp.diff_text2(diffs);

0 commit comments

Comments
 (0)