X-Git-Url: https://git.yukkurigames.com/?p=string-lerp.git;a=blobdiff_plain;f=string-lerp.js;h=1aa89b0552d39891f6290aed1495fbd805a33481;hp=40d9fd0c30655daaaac0e83a5fb648ae267cd8c1;hb=13d986bfdc0f90b6a84cdb1662f318864e177d99;hpb=245005bdfa792a76d55b0fafd2255c4c8325d28c

diff --git a/string-lerp.js b/string-lerp.js
index 40d9fd0..1aa89b0 100644
--- a/string-lerp.js
+++ b/string-lerp.js
@@ -1,17 +1,41 @@
+/* string-lerp - progressively turn one string into another
+   Copyright 2014 Joe Wreschnig
+  
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+*/
+
+/* @license Copyright 2014 Joe Wreschnig - GNU GPL v2 or later */
+
 (function (exports) {
     "use strict";
 
     var MAX_MATRIX_SIZE = 256 * 256;
 
-    function levenshteinMatrix(s, t) {
-        /** Calculate the Levenshtein edit distance matrix for two strings
+    function costMatrix(source, target, ins, del, sub) {
+        /** Calculate the Levenshtein cost matrix for source and target
+
+            If source and target are strings, they cannot contain any
+            astral or combining codepoints. Such data must be passed
+            as arrays of strings with one element per glyph.
+
+            ins, del, and sub are the costs for insertion, deletion,
+            and substition respectively. Their default value is 1. If
+            only ins is passed, del and sub are set to the same cost.
+            If ins and del are passed, sub is set to the more
+            expensive of the two.
+
+            The matrix is returned as a flat typed array.
 
-            The matrix is returned as a flat unsigned typed array.
-            
             Following http://en.wikipedia.org/wiki/Levenshtein_distance
         */
-        var m = s.length + 1;
-        var n = t.length + 1;
+        ins = ins === undefined ? 1 : (ins | 0);
+        del = (del | 0) || ins;
+        sub = (sub | 0) || Math.max(ins, del);
+        var m = source.length + 1;
+        var n = target.length + 1;
         var d = new Uint32Array(m * n);
         var i, j;
         for (i = 1; i < m; ++i)
@@ -20,82 +44,73 @@
             d[j] = j;
         for (j = 1; j < n; ++j)
             for (i = 1; i < m; ++i)
-                if (s[i - 1] === t[j - 1])
+                if (source[i - 1] === target[j - 1])
                     d[n * i + j] = d[n * (i - 1) + j - 1];
                 else
-                    d[n * i + j] = 1 + Math.min(d[n * (i - 1) + j    ],
-                                                d[n *    i    + j - 1],
-                                                d[n * (i - 1) + j - 1]);
+                    d[n * i + j] = Math.min(del + d[n * (i - 1) + j    ],
+                                            ins + d[n *    i    + j - 1],
+                                            sub + d[n * (i - 1) + j - 1]);
         return d;
     }
 
-    function editPath(d, t) {
-        /** Given a Levenshtein matrix and target, create an edit list */
+    // First, note that deletion is just substition with nothing, so
+    // any DEL operation can be replaced by a SUB. Second, the
+    // operation code *is* the necessary slice offset for applying the
+    // diff.
+    var INS = 0, SUB = 1;
+
+    function editPath(costs, target) {
+        /** Given a cost matrix and a target, create an edit list */
         var path = [];
-        var j = t.length;
+        var j = target.length;
         var n = j + 1;
-        var i = d.length / n - 1;
+        var i = costs.length / n - 1;
         while (i || j) {
-            var sub = (i && j) ? d[n * (i - 1) + j - 1] : Infinity;
-            var del = i ? d[n * (i - 1) + j] : Infinity;
-            var ins = j ? d[n * i + j - 1] : Infinity;
+            var sub = (i && j) ? costs[n * (i - 1) + j - 1] : Infinity;
+            var del = i ? costs[n * (i - 1) + j] : Infinity;
+            var ins = j ? costs[n * i + j - 1] : Infinity;
             if (sub <= ins && sub <= del) {
-                if (d[n * i + j] !== d[n * (i - 1) + j - 1])
-                    path.push(["sub", i - 1, t[j - 1]]);
+                if (costs[n * i + j] !== costs[n * (i - 1) + j - 1])
+                    path.push([SUB, i - 1, target[j - 1]]);
                 --i; --j;
             } else if (ins <= del) {
-                path.push(["ins", i, t[j - 1]]);
+                path.push([INS, i, target[j - 1]]);
                 --j;
             } else {
-                path.push(["del", i - 1]);
+                path.push([SUB, i - 1, ""]);
                 --i;
             }
         }
         return path;
     }
 
-    function diff(s, t) {
-        /** Create a diff between string s and t */
-        return editPath(levenshteinMatrix(s, t), t);
+    function diff(source, target, ins, del, sub) {
+        /** Create a diff between string source and target
+
+            ins, del, and sub are as passed to levenshtein
+         */
+        return editPath(costMatrix(source, target, ins, del, sub), target);
     }
 
-    function patch(edits, s) {
+    function patch(diff, source) {
         /** Apply the list of edits to s */
         var edit;
         var i;
 
-        if (Array.isArray(s)) {
-            for (i = 0; i < edits.length; ++i) {
-                edit = edits[i];
-                switch (edit[0]) {
-                case "sub":
-                    s[edit[1]] = edit[2];
-                    break;
-                case "ins":
-                    s.splice(edit[1], 0, edit[2]);
-                    break;
-                case "del":
-                    s.splice(edit[1], 1);
-                    break;
-                }
+        if (Array.isArray(source)) {
+            for (i = 0; i < diff.length; ++i) {
+                edit = diff[i];
+                source.splice(edit[1], edit[0], edit[2]);
             }
         } else {
-            for (i = 0; i < edits.length; ++i) {
-                edit = edits[i];
-                switch (edit[0]) {
-                case "sub":
-                    s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1] + 1);
-                    break;
-                case "ins":
-                    s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1]);
-                    break;
-                case "del":
-                    s = s.slice(0, edit[1]) + s.slice(edit[1] + 1);
-                    break;
-                }
+            for (i = 0; i < diff.length; ++i) {
+                edit = diff[i];
+                var head = source.slice(0, edit[1]);
+                var tail = source.slice(edit[1] + edit[0]);
+                source = head + edit[2] + tail;
             }
         }
-        return s;
+        return source;
     }
 
     var MULTI = /[\uD800-\uDBFF][\uDC00-\uDFFF]|[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/;
@@ -103,7 +118,7 @@
     var GLYPH = /([\0-\u02FF\u0370-\u1DBF\u1E00-\u20CF\u2100-\uD7FF\uDC00-\uFE1F\uFE30-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF])([\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]*)/g;
 
     function diffLerp(a, b, p) {
-        /** Interpolate between two strings based on edit distance
+        /** Interpolate between two strings based on edit operations
 
             This interpolation algorithm applys a partial edit of one
             string into the other. This produces nice looking results,
@@ -129,7 +144,7 @@
         // that's how Levenshtein edits work. To match LTR reading
         // direction (and the behavior of fastLerp), swap the strings
         // and invert the parameter when editing.
-        var edits = diff(b, a);
+        var edits = diff(b, a, 2, 2, 3);
         var partial = edits.slice(0, Math.round((1 - p) * edits.length));
         return patch(partial, b);
     }
@@ -159,10 +174,10 @@
             is clamped to an integer.
 
             For example, numericLerp("0.0", "100", 0.123) === "12.3"
-            because the "." in "0.0" is intepreted as a decimal point.
-            But numericLerp("0.", "100.", 0.123) === "12." because the
-            strings are interpreted as integers followed by a full
-            stop.
+            because the "." in "0.0" is interpreted as a decimal
+            point. But numericLerp("0.", "100.", 0.123) === "12."
+            because the strings are interpreted as integers followed
+            by a full stop.
 
             Calling this functions on strings that differ in more than
             numerals gives undefined results.
@@ -185,6 +200,10 @@
             front of one string with another. This approach is fast
             but does not look good when the strings are similar.
         */
+
+        // TODO: Consider fast-pathing this even more for very large
+        // strings, e.g. in the megabyte range. These are large enough
+        // that 
         if (a.match(MULTI) || b.match(MULTI)) {
             var ca = a.match(GLYPH) || [];
             var cb = b.match(GLYPH) || [];
@@ -230,7 +249,7 @@
         return ((n && n < MAX_MATRIX_SIZE) ? diffLerp : fastLerp)(a, b, p);
     }
 
-    exports.levenshteinMatrix = levenshteinMatrix;
+    exports.costMatrix = costMatrix;
     exports.patch = patch;
     exports.diff = diff;
     exports.fastLerp = fastLerp;