X-Git-Url: https://git.yukkurigames.com/?p=string-lerp.git;a=blobdiff_plain;f=string-lerp.js;h=1aa89b0552d39891f6290aed1495fbd805a33481;hp=40d9fd0c30655daaaac0e83a5fb648ae267cd8c1;hb=13d986bfdc0f90b6a84cdb1662f318864e177d99;hpb=245005bdfa792a76d55b0fafd2255c4c8325d28c diff --git a/string-lerp.js b/string-lerp.js index 40d9fd0..1aa89b0 100644 --- a/string-lerp.js +++ b/string-lerp.js @@ -1,17 +1,41 @@ +/* string-lerp - progressively turn one string into another + Copyright 2014 Joe Wreschnig + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. +*/ + +/* @license Copyright 2014 Joe Wreschnig - GNU GPL v2 or later */ + (function (exports) { "use strict"; var MAX_MATRIX_SIZE = 256 * 256; - function levenshteinMatrix(s, t) { - /** Calculate the Levenshtein edit distance matrix for two strings + function costMatrix(source, target, ins, del, sub) { + /** Calculate the Levenshtein cost matrix for source and target + + If source and target are strings, they cannot contain any + astral or combining codepoints. Such data must be passed + as arrays of strings with one element per glyph. + + ins, del, and sub are the costs for insertion, deletion, + and substition respectively. Their default value is 1. If + only ins is passed, del and sub are set to the same cost. + If ins and del are passed, sub is set to the more + expensive of the two. + + The matrix is returned as a flat typed array. - The matrix is returned as a flat unsigned typed array. - Following http://en.wikipedia.org/wiki/Levenshtein_distance */ - var m = s.length + 1; - var n = t.length + 1; + ins = ins === undefined ? 1 : (ins | 0); + del = (del | 0) || ins; + sub = (sub | 0) || Math.max(ins, del); + var m = source.length + 1; + var n = target.length + 1; var d = new Uint32Array(m * n); var i, j; for (i = 1; i < m; ++i) @@ -20,82 +44,73 @@ d[j] = j; for (j = 1; j < n; ++j) for (i = 1; i < m; ++i) - if (s[i - 1] === t[j - 1]) + if (source[i - 1] === target[j - 1]) d[n * i + j] = d[n * (i - 1) + j - 1]; else - d[n * i + j] = 1 + Math.min(d[n * (i - 1) + j ], - d[n * i + j - 1], - d[n * (i - 1) + j - 1]); + d[n * i + j] = Math.min(del + d[n * (i - 1) + j ], + ins + d[n * i + j - 1], + sub + d[n * (i - 1) + j - 1]); return d; } - function editPath(d, t) { - /** Given a Levenshtein matrix and target, create an edit list */ + // First, note that deletion is just substition with nothing, so + // any DEL operation can be replaced by a SUB. Second, the + // operation code *is* the necessary slice offset for applying the + // diff. + var INS = 0, SUB = 1; + + function editPath(costs, target) { + /** Given a cost matrix and a target, create an edit list */ var path = []; - var j = t.length; + var j = target.length; var n = j + 1; - var i = d.length / n - 1; + var i = costs.length / n - 1; while (i || j) { - var sub = (i && j) ? d[n * (i - 1) + j - 1] : Infinity; - var del = i ? d[n * (i - 1) + j] : Infinity; - var ins = j ? d[n * i + j - 1] : Infinity; + var sub = (i && j) ? costs[n * (i - 1) + j - 1] : Infinity; + var del = i ? costs[n * (i - 1) + j] : Infinity; + var ins = j ? costs[n * i + j - 1] : Infinity; if (sub <= ins && sub <= del) { - if (d[n * i + j] !== d[n * (i - 1) + j - 1]) - path.push(["sub", i - 1, t[j - 1]]); + if (costs[n * i + j] !== costs[n * (i - 1) + j - 1]) + path.push([SUB, i - 1, target[j - 1]]); --i; --j; } else if (ins <= del) { - path.push(["ins", i, t[j - 1]]); + path.push([INS, i, target[j - 1]]); --j; } else { - path.push(["del", i - 1]); + path.push([SUB, i - 1, ""]); --i; } } return path; } - function diff(s, t) { - /** Create a diff between string s and t */ - return editPath(levenshteinMatrix(s, t), t); + function diff(source, target, ins, del, sub) { + /** Create a diff between string source and target + + ins, del, and sub are as passed to levenshtein + */ + return editPath(costMatrix(source, target, ins, del, sub), target); } - function patch(edits, s) { + function patch(diff, source) { /** Apply the list of edits to s */ var edit; var i; - if (Array.isArray(s)) { - for (i = 0; i < edits.length; ++i) { - edit = edits[i]; - switch (edit[0]) { - case "sub": - s[edit[1]] = edit[2]; - break; - case "ins": - s.splice(edit[1], 0, edit[2]); - break; - case "del": - s.splice(edit[1], 1); - break; - } + if (Array.isArray(source)) { + for (i = 0; i < diff.length; ++i) { + edit = diff[i]; + source.splice(edit[1], edit[0], edit[2]); } } else { - for (i = 0; i < edits.length; ++i) { - edit = edits[i]; - switch (edit[0]) { - case "sub": - s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1] + 1); - break; - case "ins": - s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1]); - break; - case "del": - s = s.slice(0, edit[1]) + s.slice(edit[1] + 1); - break; - } + for (i = 0; i < diff.length; ++i) { + edit = diff[i]; + var head = source.slice(0, edit[1]); + var tail = source.slice(edit[1] + edit[0]); + source = head + edit[2] + tail; } } - return s; + return source; } var MULTI = /[\uD800-\uDBFF][\uDC00-\uDFFF]|[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/; @@ -103,7 +118,7 @@ var GLYPH = /([\0-\u02FF\u0370-\u1DBF\u1E00-\u20CF\u2100-\uD7FF\uDC00-\uFE1F\uFE30-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF])([\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]*)/g; function diffLerp(a, b, p) { - /** Interpolate between two strings based on edit distance + /** Interpolate between two strings based on edit operations This interpolation algorithm applys a partial edit of one string into the other. This produces nice looking results, @@ -129,7 +144,7 @@ // that's how Levenshtein edits work. To match LTR reading // direction (and the behavior of fastLerp), swap the strings // and invert the parameter when editing. - var edits = diff(b, a); + var edits = diff(b, a, 2, 2, 3); var partial = edits.slice(0, Math.round((1 - p) * edits.length)); return patch(partial, b); } @@ -159,10 +174,10 @@ is clamped to an integer. For example, numericLerp("0.0", "100", 0.123) === "12.3" - because the "." in "0.0" is intepreted as a decimal point. - But numericLerp("0.", "100.", 0.123) === "12." because the - strings are interpreted as integers followed by a full - stop. + because the "." in "0.0" is interpreted as a decimal + point. But numericLerp("0.", "100.", 0.123) === "12." + because the strings are interpreted as integers followed + by a full stop. Calling this functions on strings that differ in more than numerals gives undefined results. @@ -185,6 +200,10 @@ front of one string with another. This approach is fast but does not look good when the strings are similar. */ + + // TODO: Consider fast-pathing this even more for very large + // strings, e.g. in the megabyte range. These are large enough + // that if (a.match(MULTI) || b.match(MULTI)) { var ca = a.match(GLYPH) || []; var cb = b.match(GLYPH) || []; @@ -230,7 +249,7 @@ return ((n && n < MAX_MATRIX_SIZE) ? diffLerp : fastLerp)(a, b, p); } - exports.levenshteinMatrix = levenshteinMatrix; + exports.costMatrix = costMatrix; exports.patch = patch; exports.diff = diff; exports.fastLerp = fastLerp;