Skip to content
This repository was archived by the owner on Aug 5, 2024. It is now read-only.

JS: Handle surrogate pairs correctly #69

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions javascript/diff_match_patch_uncompressed.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,16 @@
* @author [email protected] (Neil Fraser)
*/

/**
* Determine if the index is inside a surrogate pair.
* @param {string} str The string
* @param {numer} idx The index
*/
function insideSurrogate(str, idx) {
var code = str.charCodeAt(idx);
return code >= 0xDC00 && code <= 0xDFFF;
}

/**
* Class containing the diff, match and patch methods.
* @constructor
Expand Down Expand Up @@ -361,6 +371,11 @@ diff_match_patch.prototype.diff_bisect_ = function(text1, text2, deadline) {
x1++;
y1++;
}
if (insideSurrogate(text1, x1)) {
x1--;
y1--;
}

v1[k1_offset] = x1;
if (x1 > text1_length) {
// Ran off the right of the graph.
Expand Down Expand Up @@ -569,6 +584,9 @@ diff_match_patch.prototype.diff_commonPrefix = function(text1, text2) {
}
pointermid = Math.floor((pointermax - pointermin) / 2 + pointermin);
}
if (insideSurrogate(text1, pointermid)) {
pointermid--;
}
return pointermid;
};

Expand Down Expand Up @@ -601,6 +619,9 @@ diff_match_patch.prototype.diff_commonSuffix = function(text1, text2) {
}
pointermid = Math.floor((pointermax - pointermin) / 2 + pointermin);
}
if (insideSurrogate(text1, text1.length - pointermid)) {
pointermid--;
}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I came up with this instead, but I don't have a test that fails in either case so obviously I'm missing a test case

if (pointer mid < length - 1 && this.isLowSurrogate(text1[poitnermid])) {
  pointermid++;
}

my reasoning was that if the common suffix starts on a low surrogate then we have to bump it to the right so that the surrogate half doesn't end up in the common part - backwards from commonPrefix where we want to shift to the left in order to avoid the same for high surrogates

return pointermid;
};

Expand Down
1 change: 1 addition & 0 deletions javascript/tests/diff_match_patch_test.html
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@
'testPatchObj',
'testPatchFromText',
'testPatchToText',
'testPatchSurrogates',
'testPatchAddContext',
'testPatchMake',
'testPatchSplitMax',
Expand Down
51 changes: 51 additions & 0 deletions javascript/tests/diff_match_patch_test.js
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,57 @@ function testPatchToText() {
strp = '@@ -1,9 +1,9 @@\n-f\n+F\n oo+fooba\n@@ -7,9 +7,9 @@\n obar\n-,\n+.\n tes\n';
p = dmp.patch_fromText(strp);
assertEquals(strp, dmp.patch_toText(p));

}

function testPatchSurrogates() {
var p, p2, strp;

// These share the same high surrogate prefix
p = dmp.patch_make('\u{1F30D}', '\u{1F308}');
strp = dmp.patch_toText(p);
p2 = dmp.patch_fromText(strp);
assertEquivalent(p, p2);

// These share the same low surrogate suffix
p = dmp.patch_make('\u{10120}', '\u{10520}');
strp = dmp.patch_toText(p);
p2 = dmp.patch_fromText(strp);
assertEquivalent(p, p2);

// No common prefix, but later there's the same high surrogate char
p = dmp.patch_make('abbb\u{1F30D}', 'cbbb\u{1F308}');
strp = dmp.patch_toText(p);
p2 = dmp.patch_fromText(strp);
assertEquivalent(p, p2);

// No common suffix, but earlier there's the same low surrogate char
p = dmp.patch_make('\u{10120}aaac', '\u{10520}aaab');
strp = dmp.patch_toText(p);
p2 = dmp.patch_fromText(strp);
assertEquivalent(p, p2);

// No common suffix, but earlier there's the same low surrogate char
p = dmp.patch_make('abbb\u{10120}aaac', '\u{10520}aaab');
strp = dmp.patch_toText(p);
p2 = dmp.patch_fromText(strp);
assertEquivalent(p, p2);

var padding1 = "";
while (padding1.length < 100) {
padding1 += String.fromCharCode(50 + padding1.length);
}

var padding2 = "";
while (padding2.length < 100) {
padding2 += String.fromCharCode(200 + padding2.length);
}

// Add some random padding
p = dmp.patch_make(padding1+'\u{10120}'+padding2, padding2+'\u{10520}'+padding1);
strp = dmp.patch_toText(p);
p2 = dmp.patch_fromText(strp);
assertEquivalent(p, p2);
}

function testPatchAddContext() {
Expand Down