Skip to content

Commit 50f1542

Browse files
committed
Python3: Stop breaking surrogate pairs in toDelta()
Resolves google#69 for Python3. Sometimes we can find a common prefix that runs into the middle of a surrogate pair and we split that pair when building our diff groups. This is fine as long as we are operating on UTF-16 code units. It becomes problematic when we start trying to treat those substrings as valid Unicode (or UTF-8) sequences. When we pass these split groups into `toDelta()` we do just that and the library crashes. In this patch we're post-processing the diff groups before encoding them to make sure that we un-split the surrogate pairs. The post-processed diffs should produce the same output when applying the diffs. The diff string itself will be different but shouldn't change that much - only by a single character at surrogate boundaries.
1 parent db1cbba commit 50f1542

File tree

2 files changed

+82
-7
lines changed

2 files changed

+82
-7
lines changed

python3/diff_match_patch.py

+12-7
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
__author__ = '[email protected] (Neil Fraser)'
2727

2828
import re
29+
import struct
2930
import sys
3031
import time
3132
import urllib.parse
@@ -1147,14 +1148,17 @@ def diff_toDelta(self, diffs):
11471148
"""
11481149
text = []
11491150
for (op, data) in diffs:
1151+
if 0 == len(data):
1152+
continue
1153+
11501154
if op == self.DIFF_INSERT:
11511155
# High ascii will raise UnicodeDecodeError. Use Unicode instead.
11521156
data = data.encode("utf-8")
11531157
text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))
11541158
elif op == self.DIFF_DELETE:
1155-
text.append("-%d" % len(data))
1159+
text.append("-%d" % (len(data.encode('utf-16-be')) // 2))
11561160
elif op == self.DIFF_EQUAL:
1157-
text.append("=%d" % len(data))
1161+
text.append("=%d" % (len(data.encode('utf-16-be')) // 2))
11581162
return "\t".join(text)
11591163

11601164
def diff_fromDelta(self, text1, delta):
@@ -1172,7 +1176,8 @@ def diff_fromDelta(self, text1, delta):
11721176
ValueError: If invalid input.
11731177
"""
11741178
diffs = []
1175-
pointer = 0 # Cursor in text1
1179+
as_utf16 = text1.encode('utf-16-be')
1180+
pointer = 0 # Cursor in as_utf16
11761181
tokens = delta.split("\t")
11771182
for token in tokens:
11781183
if token == "":
@@ -1191,8 +1196,8 @@ def diff_fromDelta(self, text1, delta):
11911196
raise ValueError("Invalid number in diff_fromDelta: " + param)
11921197
if n < 0:
11931198
raise ValueError("Negative number in diff_fromDelta: " + param)
1194-
text = text1[pointer : pointer + n]
1195-
pointer += n
1199+
text = as_utf16[pointer : pointer + n * 2].decode('utf-16-be')
1200+
pointer += n * 2
11961201
if token[0] == "=":
11971202
diffs.append((self.DIFF_EQUAL, text))
11981203
else:
@@ -1201,10 +1206,10 @@ def diff_fromDelta(self, text1, delta):
12011206
# Anything else is an error.
12021207
raise ValueError("Invalid diff operation in diff_fromDelta: " +
12031208
token[0])
1204-
if pointer != len(text1):
1209+
if pointer != len(as_utf16):
12051210
raise ValueError(
12061211
"Delta length (%d) does not equal source text length (%d)." %
1207-
(pointer, len(text1)))
1212+
(pointer, len(as_utf16)))
12081213
return diffs
12091214

12101215
# MATCH FUNCTIONS

python3/tests/diff_match_patch_test.py

+70
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"""
1919

2020
import imp
21+
import json
2122
import os
2223
import sys
2324
import time
@@ -444,6 +445,12 @@ def testDiffDelta(self):
444445
# Convert delta string into a diff.
445446
self.assertEqual(diffs, self.dmp.diff_fromDelta(text1, delta))
446447

448+
diffs = self.dmp.diff_main("\U0001F64B\U0001F64B", "\U0001F64B\U0001F64C\U0001F64B")
449+
delta = self.dmp.diff_toDelta(diffs)
450+
self.assertEqual("=2\t+%F0%9F%99%8C\t=2", delta)
451+
452+
self.assertEqual(diffs, self.dmp.diff_fromDelta("\U0001F64B\U0001F64B", "=2\t+%F0%9F%99%8C\t=2"))
453+
447454
# Verify pool of unchanged characters.
448455
diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")]
449456
text2 = self.dmp.diff_text2(diffs)
@@ -455,6 +462,69 @@ def testDiffDelta(self):
455462
# Convert delta string into a diff.
456463
self.assertEqual(diffs, self.dmp.diff_fromDelta("", delta))
457464

465+
# Unicode: split surrogates
466+
self.assertEqual(
467+
self.dmp.diff_toDelta([
468+
(self.dmp.DIFF_INSERT, '\U0001F171'),
469+
(self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171')
470+
]),
471+
self.dmp.diff_toDelta(self.dmp.diff_main(
472+
'\U0001F170\U0001F171',
473+
'\U0001F171\U0001F170\U0001F171'
474+
)),
475+
'Inserting similar surrogate pair at beginning'
476+
)
477+
478+
self.assertEqual(
479+
self.dmp.diff_toDelta([
480+
(self.dmp.DIFF_EQUAL, '\U0001F170'),
481+
(self.dmp.DIFF_INSERT, '\U0001F172'),
482+
(self.dmp.DIFF_EQUAL, '\U0001F171')
483+
]),
484+
self.dmp.diff_toDelta(self.dmp.diff_main(
485+
'\U0001F170\U0001F171',
486+
'\U0001F170\U0001F172\U0001F171'
487+
)),
488+
'Inserting similar surrogate pair in the middle'
489+
)
490+
491+
self.assertEqual(
492+
self.dmp.diff_toDelta([
493+
(self.dmp.DIFF_DELETE, '\U0001F171'),
494+
(self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171')
495+
]),
496+
self.dmp.diff_toDelta(self.dmp.diff_main(
497+
'\U0001F171\U0001F170\U0001F171',
498+
'\U0001F170\U0001F171'
499+
)),
500+
'Deleting similar surogate pair at the beginning'
501+
)
502+
503+
self.assertEqual(
504+
self.dmp.diff_toDelta([
505+
(self.dmp.DIFF_EQUAL, '\U0001F170'),
506+
(self.dmp.DIFF_DELETE, '\U0001F172'),
507+
(self.dmp.DIFF_EQUAL, '\U0001F171')
508+
]),
509+
self.dmp.diff_toDelta(self.dmp.diff_main(
510+
'\U0001F170\U0001F172\U0001F171',
511+
'\U0001F170\U0001F171'
512+
)),
513+
'Deleting similar surogate pair in the middle'
514+
)
515+
516+
self.assertEqual(
517+
self.dmp.diff_toDelta([
518+
(self.dmp.DIFF_DELETE, '\U0001F170'),
519+
(self.dmp.DIFF_INSERT, '\U0001F171')
520+
]),
521+
self.dmp.diff_toDelta(self.dmp.diff_main(
522+
'\U0001F170',
523+
'\U0001F171'
524+
)),
525+
'Swap surrogate pair'
526+
)
527+
458528
# 160 kb string.
459529
a = "abcdefghij"
460530
for i in range(14):

0 commit comments

Comments
 (0)