From: Joe Wreschnig Date: Tue, 13 May 2014 21:08:54 +0000 (+0200) Subject: Handle surrogate pairs and combining characters correctly. X-Git-Tag: 1.0.0~23 X-Git-Url: https://git.yukkurigames.com/?p=string-lerp.git;a=commitdiff_plain;h=245005bdfa792a76d55b0fafd2255c4c8325d28c Handle surrogate pairs and combining characters correctly. --- diff --git a/string-lerp.js b/string-lerp.js index 3a30488..40d9fd0 100644 --- a/string-lerp.js +++ b/string-lerp.js @@ -61,24 +61,47 @@ function patch(edits, s) { /** Apply the list of edits to s */ + var edit; var i; - for (i = 0; i < edits.length; ++i) { - var edit = edits[i]; - switch (edit[0]) { - case "sub": - s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1] + 1); - break; - case "ins": - s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1]); - break; - case "del": - s = s.slice(0, edit[1]) + s.slice(edit[1] + 1); - break; + + if (Array.isArray(s)) { + for (i = 0; i < edits.length; ++i) { + edit = edits[i]; + switch (edit[0]) { + case "sub": + s[edit[1]] = edit[2]; + break; + case "ins": + s.splice(edit[1], 0, edit[2]); + break; + case "del": + s.splice(edit[1], 1); + break; + } + } + } else { + for (i = 0; i < edits.length; ++i) { + edit = edits[i]; + switch (edit[0]) { + case "sub": + s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1] + 1); + break; + case "ins": + s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1]); + break; + case "del": + s = s.slice(0, edit[1]) + s.slice(edit[1] + 1); + break; + } } } return s; } + var MULTI = /[\uD800-\uDBFF][\uDC00-\uDFFF]|[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/; + + var GLYPH = /([\0-\u02FF\u0370-\u1DBF\u1E00-\u20CF\u2100-\uD7FF\uDC00-\uFE1F\uFE30-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF])([\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]*)/g; + function diffLerp(a, b, p) { /** Interpolate between two strings based on edit distance @@ -89,6 +112,19 @@ longer than a few hundred characters. */ + // If given strings with astral codepoints or combining + // characters, split them into arrays of "glyphs" first, + // do the edit on the list of "glyphs", and rejoin them. + // + // This split is not perfect for all languages, but at least + // it won't create invalid surrogate pairs or orphaned + // combining characters. + if (a.match && a.match(MULTI) || b.match && b.match(MULTI)) { + var ca = a.match(GLYPH) || []; + var cb = b.match(GLYPH) || []; + return diffLerp(ca, cb, p).join(""); + } + // The edit path works from the string end, forwards, because // that's how Levenshtein edits work. To match LTR reading // direction (and the behavior of fastLerp), swap the strings @@ -149,9 +185,19 @@ front of one string with another. This approach is fast but does not look good when the strings are similar. */ - var alen = Math.round(a.length * p); - var blen = Math.round(b.length * p); - return b.substring(0, blen) + a.substring(alen, a.length); + if (a.match(MULTI) || b.match(MULTI)) { + var ca = a.match(GLYPH) || []; + var cb = b.match(GLYPH) || []; + var calen = Math.round(ca.length * p); + var cblen = Math.round(cb.length * p); + var r = cb.slice(0, cblen); + r.push.apply(r, ca.slice(calen, ca.length)); + return r.join(""); + } else { + var alen = Math.round(a.length * p); + var blen = Math.round(b.length * p); + return b.substring(0, blen) + a.substring(alen, a.length); + } } function lerp(a, b, p) { diff --git a/tests/string-lerp.js b/tests/string-lerp.js index 921e199..5728e7f 100644 --- a/tests/string-lerp.js +++ b/tests/string-lerp.js @@ -1,6 +1,9 @@ var JS = this.JS || require('jstest'); var m = require('../string-lerp'); +var HALF_POOS = /\uD83D\uD83D|\uDCA9\uDCA9|\uD83D$/; +var UNUSUAL_Q = 'q\u0307\u0323'; + JS.Test.describe('fast lerp', function () { with (this) { var lerp = m.fastLerp; var A = "Do you like green eggs and ham?"; @@ -38,6 +41,22 @@ JS.Test.describe('fast lerp', function () { with (this) { assertEqual("I do not like tn eggs and ham?", lerp(A, B, 0.5)); assertEqual("I do not like them, Sam-I-am?", lerp(A, B, 0.98)); }}); + + it("doesn't make half a poo", function () { with (this) { + var poos = "\uD83D\uDCA9\uD83D\uDCA9\uD83D\uDCA9\uD83D\uDCA9"; + assertEqual("\uD83D\uDCA9\uD83D\uDCA9", lerp("", poos, 0.5)); + assertEqual("\uD83D\uDCA9", lerp("", poos, 0.35)); + + for (var i = 0; i <= 1; i += 1/256) + assertNot(lerp("", poos, i).match(HALF_POOS)); + }}); + + it("doesn't misplace combining marks", function () { with (this) { + for (var i = 0; i <= 1; i += 1/256) { + var r = lerp("a", UNUSUAL_Q, i); + assert(r === "a" || r === UNUSUAL_Q); + } + }}); }}); JS.Test.describe('diff lerp', function () { with (this) { @@ -79,6 +98,22 @@ JS.Test.describe('diff lerp', function () { with (this) { assertEqual("I do not like them, Sam-Iham?", lerp(A, B, 0.9)); assertEqual(B, lerp(A, B, 0.99)); }}); + + it("doesn't make half a poo", function () { with (this) { + var poos = "\uD83D\uDCA9\uD83D\uDCA9\uD83D\uDCA9\uD83D\uDCA9"; + assertEqual("\uD83D\uDCA9\uD83D\uDCA9", lerp("", poos, 0.5)); + assertEqual("\uD83D\uDCA9", lerp("", poos, 0.35)); + + for (var i = 0; i <= 1; i += 1/256) + assertNot(lerp("", poos, i).match(HALF_POOS)); + }}); + + it("doesn't misplace combining marks", function () { with (this) { + for (var i = 0; i <= 1; i += 1/256) { + var r = lerp("a", UNUSUAL_Q, i); + assert(r === "a" || r === UNUSUAL_Q); + } + }}); }}); JS.Test.describe('numeric lerp', function () { with (this) {