Fix: Second update to diff-match-patch fix

dmsnell · dmsnell · commit d6ab8872f5ab · 2020-02-05T12:28:32.000-07:00
In 1.0.2 we applied an update from google/diff-match-patch#80 to resolve the surrogate pair encoding issue. Since that time we found a bug in the fix and resolved that and therefore this patch brings those additional updates to Simeprium. There previously remained issues when decoding broken patches that already existed or which came from unpatched versions of the iOS library and also remained issues when unexpectedly receiving empty diff groups in `toDelta`
diff --git a/RELEASE-NOTES.txt b/RELEASE-NOTES.txt
@@ -1,5 +1,9 @@
 # Changelog
 
+## 1.0.4
+
+ - Update diff-match-patch to newer revision of surrogate-pair encoding fix
+
 ## 1.0.2
 
  - Update diff-match-patch to fix problem when encoding consecutive surrogate pairs
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "simperium",
-  "version": "1.0.3",
+  "version": "1.0.4",
   "description": "A simperium client for node.js",
   "main": "./lib/simperium/index.js",
   "repository": {
diff --git a/src/simperium/jsondiff/diff_match_patch.js b/src/simperium/jsondiff/diff_match_patch.js
@@ -1361,39 +1361,159 @@ diff_match_patch.prototype.diff_toDelta = function(diffs) {
   var text = [];
   var lastEnd;
   for (var x = 0; x < diffs.length; x++) {
-
     var thisDiff = diffs[x];
     var thisTop = thisDiff[1][0];
     var thisEnd = thisDiff[1][thisDiff[1].length - 1];
 
+    if (0 === thisDiff[1].length) {
+      continue;
+    }
+
+    // trap a trailing high-surrogate so we can
+    // distribute it to the successive edits
     if (thisEnd && this.isHighSurrogate(thisEnd)) {
+      lastEnd = thisEnd;
       thisDiff[1] = thisDiff[1].slice(0, -1);
     }
 
     if (lastEnd && thisTop && this.isHighSurrogate(lastEnd) && this.isLowSurrogate(thisTop)) {
       thisDiff[1] = lastEnd + thisDiff[1];
     }
 
-    lastEnd = thisEnd;
-    if ( 0 === thisDiff[1].length ) {
+    if (0 === thisDiff[1].length) {
       continue;
     }
 
-    switch (diffs[x][0]) {
+    switch (thisDiff[0]) {
       case DIFF_INSERT:
-        text[x] = '+' + encodeURI(diffs[x][1]);
+        text.push('+' + encodeURI(thisDiff[1]));
         break;
       case DIFF_DELETE:
-        text[x] = '-' + diffs[x][1].length;
+        text.push('-' + thisDiff[1].length);
         break;
       case DIFF_EQUAL:
-        text[x] = '=' + diffs[x][1].length;
+        text.push('=' + thisDiff[1].length);
         break;
     }
   }
   return text.join('\t').replace(/%20/g, ' ');
 };
 
+diff_match_patch.prototype.digit16 = function(c) {
+  switch (c) {
+    case '0': return 0;
+    case '1': return 1;
+    case '2': return 2;
+    case '3': return 3;
+    case '4': return 4;
+    case '5': return 5;
+    case '6': return 6;
+    case '7': return 7;
+    case '8': return 8;
+    case '9': return 9;
+    case 'A': case 'a': return 10;
+    case 'B': case 'b': return 11;
+    case 'C': case 'c': return 12;
+    case 'D': case 'd': return 13;
+    case 'E': case 'e': return 14;
+    case 'F': case 'f': return 15;
+    default: throw new Error('Invalid hex-code');
+  }
+};
+
+/**
+ * Decode URI-encoded string but allow for encoded surrogate halves
+ *
+ * diff_match_patch needs this relaxation of the requirements because
+ * not all libraries and versions produce valid URI strings in toDelta
+ * and we don't want to crash this code when the input is valid input
+ * but at the same time invalid utf-8
+ *
+ * @example: decodeURI( 'abcd%3A %F0%9F%85%B0' ) = 'abcd: \ud83c\udd70'
+ * @example: decodeURI( 'abcd%3A %ED%A0%BC' ) = 'abcd: \ud83c'
+ *
+ * @cite: @mathiasbynens utf8.js at https://github.com/mathiasbynens/utf8.js
+ *
+ * @param {String} text input string encoded by encodeURI() or equivalent
+ * @return {String}
+ */
+diff_match_patch.prototype.decodeURI = function(text) {
+  try {
+    return decodeURI(text);
+  } catch ( e ) {
+    var i = 0;
+    var decoded = '';
+
+    while (i < text.length) {
+      if ( text[i] !== '%' ) {
+        decoded += text[i++];
+        continue;
+      }
+
+      // start a percent-sequence
+      var byte1 = (this.digit16(text[i + 1]) << 4) + this.digit16(text[i + 2]);
+      if ((byte1 & 0x80) === 0) {
+        decoded += String.fromCharCode(byte1);
+        i += 3;
+        continue;
+      }
+
+      if ('%' !== text[i + 3]) {
+        throw new URIError('URI malformed');
+      }
+
+      var byte2 = (this.digit16(text[i + 4]) << 4) + this.digit16(text[i + 5]);
+      if ((byte2 & 0xC0) !== 0x80) {
+        throw new URIError('URI malformed');
+      }
+      byte2 = byte2 & 0x3F;
+      if ((byte1 & 0xE0) === 0xC0) {
+        decoded += String.fromCharCode(((byte1 & 0x1F) << 6) | byte2);
+        i += 6;
+        continue;
+      }
+
+      if ('%' !== text[i + 6]) {
+        throw new URIError('URI malformed');
+      }
+
+      var byte3 = (this.digit16(text[i + 7]) << 4) + this.digit16(text[i + 8]);
+      if ((byte3 & 0xC0) !== 0x80) {
+        throw new URIError('URI malformed');
+      }
+      byte3 = byte3 & 0x3F;
+      if ((byte1 & 0xF0) === 0xE0) {
+        // unpaired surrogate are fine here
+        decoded += String.fromCharCode(((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3);
+        i += 9;
+        continue;
+      }
+
+      if ('%' !== text[i + 9]) {
+        throw new URIError('URI malformed');
+      }
+
+      var byte4 = (this.digit16(text[i + 10]) << 4) + this.digit16(text[i + 11]);
+      if ((byte4 & 0xC0) !== 0x80) {
+        throw new URIError('URI malformed');
+      }
+      byte4 = byte4 & 0x3F;
+      if ((byte1 & 0xF8) === 0xF0) {
+        var codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4;
+        if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
+          decoded += String.fromCharCode((codePoint & 0xFFFF) >>> 10 & 0x3FF | 0xD800);
+          decoded += String.fromCharCode(0xDC00 | (codePoint & 0xFFFF) & 0x3FF);
+          i += 12;
+          continue;
+        }
+      }
+
+      throw new URIError('URI malformed');
+    }
+
+    return decoded;
+  }
+};
 
 /**
  * Given the original text1, and an encoded string which describes the
@@ -1416,7 +1536,7 @@ diff_match_patch.prototype.diff_fromDelta = function(text1, delta) {
       case '+':
         try {
           diffs[diffsLength++] =
-              new diff_match_patch.Diff(DIFF_INSERT, decodeURI(param));
+              new diff_match_patch.Diff(DIFF_INSERT, this.decodeURI(param));
         } catch (ex) {
           // Malformed URI sequence.
           throw new Error('Illegal escape in diff_fromDelta: ' + param);

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "simperium",`
`3`		`- "version": "1.0.3",`
	`3`	`+ "version": "1.0.4",`
`4`	`4`	`"description": "A simperium client for node.js",`
`5`	`5`	`"main": "./lib/simperium/index.js",`
`6`	`6`	`"repository": {`