Handle surrogate pairs and combining characters correctly.

author Joe Wreschnig <joe.wreschnig@gmail.com>

Tue, 13 May 2014 21:08:54 +0000 (23:08 +0200)

committer Joe Wreschnig <joe.wreschnig@gmail.com>

Tue, 13 May 2014 21:08:54 +0000 (23:08 +0200)
author Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 13 May 2014 21:08:54 +0000 (23:08 +0200)
committer Joe Wreschnig <joe.wreschnig@gmail.com>
Tue, 13 May 2014 21:08:54 +0000 (23:08 +0200)
diff --git a/string-lerp.js b/string-lerp.js

index 3a304886374373c3471aaf530a92f2615ea5544d..40d9fd0c30655daaaac0e83a5fb648ae267cd8c1 100644 (file)
--- a/string-lerp.js
+++ b/string-lerp.js
@@ -61,24 +61,47 @@
  
      function patch(edits, s) {
          /** Apply the list of edits to s */
+        var edit;
          var i;
-        for (i = 0; i < edits.length; ++i) {
-            var edit = edits[i];
-            switch (edit[0]) {
-            case "sub":
-                s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1] + 1);
-                break;
-            case "ins":
-                s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1]);
-                break;
-            case "del":
-                s = s.slice(0, edit[1]) + s.slice(edit[1] + 1);
-                break;
+
+        if (Array.isArray(s)) {
+            for (i = 0; i < edits.length; ++i) {
+                edit = edits[i];
+                switch (edit[0]) {
+                case "sub":
+                    s[edit[1]] = edit[2];
+                    break;
+                case "ins":
+                    s.splice(edit[1], 0, edit[2]);
+                    break;
+                case "del":
+                    s.splice(edit[1], 1);
+                    break;
+                }
+            }
+        } else {
+            for (i = 0; i < edits.length; ++i) {
+                edit = edits[i];
+                switch (edit[0]) {
+                case "sub":
+                    s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1] + 1);
+                    break;
+                case "ins":
+                    s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1]);
+                    break;
+                case "del":
+                    s = s.slice(0, edit[1]) + s.slice(edit[1] + 1);
+                    break;
+                }
              }
          }
          return s;
      }
  
+    var MULTI = /[\uD800-\uDBFF][\uDC00-\uDFFF]|[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/;
+
+    var GLYPH = /([\0-\u02FF\u0370-\u1DBF\u1E00-\u20CF\u2100-\uD7FF\uDC00-\uFE1F\uFE30-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF])([\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]*)/g;
+
      function diffLerp(a, b, p) {
          /** Interpolate between two strings based on edit distance
  
@@ -89,6 +112,19 @@
              longer than a few hundred characters.
           */
  
+        // If given strings with astral codepoints or combining
+        // characters, split them into arrays of "glyphs" first,
+        // do the edit on the list of "glyphs", and rejoin them.
+        //
+        // This split is not perfect for all languages, but at least
+        // it won't create invalid surrogate pairs or orphaned
+        // combining characters.
+        if (a.match && a.match(MULTI) || b.match && b.match(MULTI)) {
+            var ca = a.match(GLYPH) || [];
+            var cb = b.match(GLYPH) || [];
+            return diffLerp(ca, cb, p).join("");
+        }
+
          // The edit path works from the string end, forwards, because
          // that's how Levenshtein edits work. To match LTR reading
          // direction (and the behavior of fastLerp), swap the strings
@@ -149,9 +185,19 @@
              front of one string with another. This approach is fast
              but does not look good when the strings are similar.
          */
-        var alen = Math.round(a.length * p);
-        var blen = Math.round(b.length * p);
-        return b.substring(0, blen) + a.substring(alen, a.length);
+        if (a.match(MULTI) || b.match(MULTI)) {
+            var ca = a.match(GLYPH) || [];
+            var cb = b.match(GLYPH) || [];
+            var calen = Math.round(ca.length * p);
+            var cblen = Math.round(cb.length * p);
+            var r = cb.slice(0, cblen);
+            r.push.apply(r, ca.slice(calen, ca.length));
+            return r.join("");
+        } else {
+            var alen = Math.round(a.length * p);
+            var blen = Math.round(b.length * p);
+            return b.substring(0, blen) + a.substring(alen, a.length);
+        }
      }
  
      function lerp(a, b, p) {
diff --git a/tests/string-lerp.js b/tests/string-lerp.js

index 921e199d59b7d0b8c2a7f9cb4757559e6fc3583b..5728e7f9ece4844cfea0f3a17050e6bf405b3999 100644 (file)
--- a/tests/string-lerp.js
+++ b/tests/string-lerp.js
@@ -1,6 +1,9 @@
  var JS = this.JS || require('jstest');
  var m = require('../string-lerp');
  
+var HALF_POOS = /\uD83D\uD83D|\uDCA9\uDCA9|\uD83D$/;
+var UNUSUAL_Q = 'q\u0307\u0323';
+
  JS.Test.describe('fast lerp', function () { with (this) {
      var lerp = m.fastLerp;
      var A = "Do you like green eggs and ham?";
@@ -38,6 +41,22 @@ JS.Test.describe('fast lerp', function () { with (this) {
          assertEqual("I do not like tn eggs and ham?", lerp(A, B, 0.5));
          assertEqual("I do not like them, Sam-I-am?", lerp(A, B, 0.98));
      }});
+
+    it("doesn't make half a poo", function () { with (this) {
+        var poos = "\uD83D\uDCA9\uD83D\uDCA9\uD83D\uDCA9\uD83D\uDCA9";
+        assertEqual("\uD83D\uDCA9\uD83D\uDCA9", lerp("", poos, 0.5));
+        assertEqual("\uD83D\uDCA9", lerp("", poos, 0.35));
+
+        for (var i = 0; i <= 1; i += 1/256)
+            assertNot(lerp("", poos, i).match(HALF_POOS));
+    }});
+
+    it("doesn't misplace combining marks", function () { with (this) {
+        for (var i = 0; i <= 1; i += 1/256) {
+            var r = lerp("a", UNUSUAL_Q, i);
+            assert(r === "a" || r === UNUSUAL_Q);
+        }
+    }});
  }});
  
  JS.Test.describe('diff lerp', function () { with (this) {
@@ -79,6 +98,22 @@ JS.Test.describe('diff lerp', function () { with (this) {
          assertEqual("I do not like them, Sam-Iham?", lerp(A, B, 0.9));
          assertEqual(B, lerp(A, B, 0.99));
      }});
+
+    it("doesn't make half a poo", function () { with (this) {
+        var poos = "\uD83D\uDCA9\uD83D\uDCA9\uD83D\uDCA9\uD83D\uDCA9";
+        assertEqual("\uD83D\uDCA9\uD83D\uDCA9", lerp("", poos, 0.5));
+        assertEqual("\uD83D\uDCA9", lerp("", poos, 0.35));
+
+        for (var i = 0; i <= 1; i += 1/256)
+            assertNot(lerp("", poos, i).match(HALF_POOS));
+    }});
+
+    it("doesn't misplace combining marks", function () { with (this) {
+        for (var i = 0; i <= 1; i += 1/256) {
+            var r = lerp("a", UNUSUAL_Q, i);
+            assert(r === "a" || r === UNUSUAL_Q);
+        }
+    }});
  }});
  
  JS.Test.describe('numeric lerp', function () { with (this) {
author	Joe Wreschnig <joe.wreschnig@gmail.com>
	Tue, 13 May 2014 21:08:54 +0000 (23:08 +0200)
committer	Joe Wreschnig <joe.wreschnig@gmail.com>
	Tue, 13 May 2014 21:08:54 +0000 (23:08 +0200)
string-lerp.js		patch \| blob \| history
tests/string-lerp.js		patch \| blob \| history