From f037d43269bdb28acf28500ce24d2bef1e011a75 Mon Sep 17 00:00:00 2001 From: Joe Wreschnig Date: Wed, 14 May 2014 00:15:18 +0200 Subject: [PATCH] Tweak to Levenshtein costs to prefer ins/del to sub/sub. --- string-lerp.js | 26 +++++++++++++++----------- tests/string-lerp.js | 16 ++++++++++++++++ 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/string-lerp.js b/string-lerp.js index 40d9fd0..6e0b109 100644 --- a/string-lerp.js +++ b/string-lerp.js @@ -3,10 +3,10 @@ var MAX_MATRIX_SIZE = 256 * 256; - function levenshteinMatrix(s, t) { + function levenshteinMatrix(s, t, ins, del, sub) { /** Calculate the Levenshtein edit distance matrix for two strings - The matrix is returned as a flat unsigned typed array. + The matrix is returned as a flat typed array. Following http://en.wikipedia.org/wiki/Levenshtein_distance */ @@ -23,9 +23,9 @@ if (s[i - 1] === t[j - 1]) d[n * i + j] = d[n * (i - 1) + j - 1]; else - d[n * i + j] = 1 + Math.min(d[n * (i - 1) + j ], - d[n * i + j - 1], - d[n * (i - 1) + j - 1]); + d[n * i + j] = Math.min(del + d[n * (i - 1) + j ], + ins + d[n * i + j - 1], + sub + d[n * (i - 1) + j - 1]); return d; } @@ -56,7 +56,7 @@ function diff(s, t) { /** Create a diff between string s and t */ - return editPath(levenshteinMatrix(s, t), t); + return editPath(levenshteinMatrix(s, t, 2, 2, 3), t); } function patch(edits, s) { @@ -103,7 +103,7 @@ var GLYPH = /([\0-\u02FF\u0370-\u1DBF\u1E00-\u20CF\u2100-\uD7FF\uDC00-\uFE1F\uFE30-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF])([\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]*)/g; function diffLerp(a, b, p) { - /** Interpolate between two strings based on edit distance + /** Interpolate between two strings based on edit operations This interpolation algorithm applys a partial edit of one string into the other. This produces nice looking results, @@ -159,10 +159,10 @@ is clamped to an integer. For example, numericLerp("0.0", "100", 0.123) === "12.3" - because the "." in "0.0" is intepreted as a decimal point. - But numericLerp("0.", "100.", 0.123) === "12." because the - strings are interpreted as integers followed by a full - stop. + because the "." in "0.0" is interpreted as a decimal + point. But numericLerp("0.", "100.", 0.123) === "12." + because the strings are interpreted as integers followed + by a full stop. Calling this functions on strings that differ in more than numerals gives undefined results. @@ -185,6 +185,10 @@ front of one string with another. This approach is fast but does not look good when the strings are similar. */ + + // TODO: Consider fast-pathing this even more for very large + // strings, e.g. in the megabyte range. These are large enough + // that if (a.match(MULTI) || b.match(MULTI)) { var ca = a.match(GLYPH) || []; var cb = b.match(GLYPH) || []; diff --git a/tests/string-lerp.js b/tests/string-lerp.js index 5728e7f..7cee7d1 100644 --- a/tests/string-lerp.js +++ b/tests/string-lerp.js @@ -114,6 +114,22 @@ JS.Test.describe('diff lerp', function () { with (this) { assert(r === "a" || r === UNUSUAL_Q); } }}); + + it("prefers ins/del to sub/sub", function () { with (this) { + // When the cost is uniform this string can be transformed by + // rewriting the whole thing for the same cost as deleting the + // front and adding to the back. But visually, we'd rather do + // the latter. + assertEqual("core", lerp("hard core", "core dump", 0.50)); + }}); + + it("weights ins/del cheaper than sub", function () { with (this) { + // When the cost is uniform it is cheaper to rewrite the + // former into the latter. But we'd rather keep the "core" for + // visual reasons, so we need to make sure we have unequal + // costs. + assertEqual("core", lerp("apple core", "core dump", 0.51)); + }}); }}); JS.Test.describe('numeric lerp', function () { with (this) { -- 2.30.2