Rename levenshteinMatrix to costMatrix. Better argument names for costMatrix, editPat...
[string-lerp.git] / string-lerp.js
1 /* string-lerp - progressively turn one string into another
2 Copyright 2014 Joe Wreschnig
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8 */
9
10 /* @license Copyright 2014 Joe Wreschnig - GNU GPL v2 or later */
11
12 (function (exports) {
13 "use strict";
14
15 var MAX_MATRIX_SIZE = 256 * 256;
16
17 function costMatrix(source, target, ins, del, sub) {
18 /** Calculate the Levenshtein cost matrix for source and target
19
20 If source and target are strings, they cannot contain any
21 astral or combining codepoints. Such data must be passed
22 as arrays of strings with one element per glyph.
23
24 ins, del, and sub are the costs for insertion, deletion,
25 and substition respectively. Their default value is 1. If
26 only ins is passed, del and sub are set to the same cost.
27 If ins and del are passed, sub is set to the more
28 expensive of the two.
29
30 The matrix is returned as a flat typed array.
31
32 Following http://en.wikipedia.org/wiki/Levenshtein_distance
33 */
34 ins = ins === undefined ? 1 : (ins | 0);
35 del = (del | 0) || ins;
36 sub = (sub | 0) || Math.max(ins, del);
37 var m = source.length + 1;
38 var n = target.length + 1;
39 var d = new Uint32Array(m * n);
40 var i, j;
41 for (i = 1; i < m; ++i)
42 d[n * i] = i;
43 for (j = 1; j < n; ++j)
44 d[j] = j;
45 for (j = 1; j < n; ++j)
46 for (i = 1; i < m; ++i)
47 if (source[i - 1] === target[j - 1])
48 d[n * i + j] = d[n * (i - 1) + j - 1];
49 else
50 d[n * i + j] = Math.min(del + d[n * (i - 1) + j ],
51 ins + d[n * i + j - 1],
52 sub + d[n * (i - 1) + j - 1]);
53 return d;
54 }
55
56 function editPath(costs, target) {
57 /** Given a cost matrix and a target, create an edit list */
58 var path = [];
59 var j = target.length;
60 var n = j + 1;
61 var i = costs.length / n - 1;
62 while (i || j) {
63 var sub = (i && j) ? costs[n * (i - 1) + j - 1] : Infinity;
64 var del = i ? costs[n * (i - 1) + j] : Infinity;
65 var ins = j ? costs[n * i + j - 1] : Infinity;
66 if (sub <= ins && sub <= del) {
67 if (costs[n * i + j] !== costs[n * (i - 1) + j - 1])
68 path.push(["sub", i - 1, target[j - 1]]);
69 --i; --j;
70 } else if (ins <= del) {
71 path.push(["ins", i, target[j - 1]]);
72 --j;
73 } else {
74 path.push(["del", i - 1]);
75 --i;
76 }
77 }
78 return path;
79 }
80
81 function diff(source, target, ins, del, sub) {
82 /** Create a diff between string source and target
83
84 ins, del, and sub are as passed to levenshtein
85 */
86 return editPath(costMatrix(source, target, ins, del, sub), target);
87 }
88
89 function patch(edits, s) {
90 /** Apply the list of edits to s */
91 var edit;
92 var i;
93
94 if (Array.isArray(s)) {
95 for (i = 0; i < edits.length; ++i) {
96 edit = edits[i];
97 switch (edit[0]) {
98 case "sub":
99 s[edit[1]] = edit[2];
100 break;
101 case "ins":
102 s.splice(edit[1], 0, edit[2]);
103 break;
104 case "del":
105 s.splice(edit[1], 1);
106 break;
107 }
108 }
109 } else {
110 for (i = 0; i < edits.length; ++i) {
111 edit = edits[i];
112 switch (edit[0]) {
113 case "sub":
114 s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1] + 1);
115 break;
116 case "ins":
117 s = s.slice(0, edit[1]) + edit[2] + s.slice(edit[1]);
118 break;
119 case "del":
120 s = s.slice(0, edit[1]) + s.slice(edit[1] + 1);
121 break;
122 }
123 }
124 }
125 return s;
126 }
127
128 var MULTI = /[\uD800-\uDBFF][\uDC00-\uDFFF]|[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/;
129
130 var GLYPH = /([\0-\u02FF\u0370-\u1DBF\u1E00-\u20CF\u2100-\uD7FF\uDC00-\uFE1F\uFE30-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF])([\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]*)/g;
131
132 function diffLerp(a, b, p) {
133 /** Interpolate between two strings based on edit operations
134
135 This interpolation algorithm applys a partial edit of one
136 string into the other. This produces nice looking results,
137 but can take a significant amount of time and memory to
138 compute the edits. It is not recommended for strings
139 longer than a few hundred characters.
140 */
141
142 // If given strings with astral codepoints or combining
143 // characters, split them into arrays of "glyphs" first,
144 // do the edit on the list of "glyphs", and rejoin them.
145 //
146 // This split is not perfect for all languages, but at least
147 // it won't create invalid surrogate pairs or orphaned
148 // combining characters.
149 if (a.match && a.match(MULTI) || b.match && b.match(MULTI)) {
150 var ca = a.match(GLYPH) || [];
151 var cb = b.match(GLYPH) || [];
152 return diffLerp(ca, cb, p).join("");
153 }
154
155 // The edit path works from the string end, forwards, because
156 // that's how Levenshtein edits work. To match LTR reading
157 // direction (and the behavior of fastLerp), swap the strings
158 // and invert the parameter when editing.
159 var edits = diff(b, a, 2, 2, 3);
160 var partial = edits.slice(0, Math.round((1 - p) * edits.length));
161 return patch(partial, b);
162 }
163
164 var NUMBERS = /(-?\d+(?:\.\d+)?)/g;
165
166 function areNumericTwins(a, b) {
167 /** Check if a and b differ only in numerals
168
169 A leading "-" counts as part of numbers; a leading "+"
170 does not. Numbers may contain a single ".", but no other
171 floating point syntax.
172 */
173 return a.replace(NUMBERS, "0") === b.replace(NUMBERS, "0");
174 }
175
176 function nlerp(a, b, p) {
177 return a + (b - a) * p;
178 }
179
180 function numericLerp(a, b, p) {
181 /** Interpolate numerically between two strings containing numbers
182
183 Numbers may have a leading "-" and a single "." to mark
184 the decimal point, but something must be after the ".".
185 If both of the numbers in a pair are integers, the result
186 is clamped to an integer.
187
188 For example, numericLerp("0.0", "100", 0.123) === "12.3"
189 because the "." in "0.0" is interpreted as a decimal
190 point. But numericLerp("0.", "100.", 0.123) === "12."
191 because the strings are interpreted as integers followed
192 by a full stop.
193
194 Calling this functions on strings that differ in more than
195 numerals gives undefined results.
196 */
197 var aParts = a.split(NUMBERS);
198 var bParts = b.split(NUMBERS);
199 for (var i = 1; i < aParts.length; i += 2) {
200 var part = nlerp(+aParts[i], +bParts[i], p);
201 if (aParts[i].indexOf(".") === -1 && bParts[i].indexOf(".") === -1)
202 part = Math.round(part);
203 aParts[i] = part.toString();
204 }
205 return aParts.join("");
206 }
207
208 function fastLerp(a, b, p) {
209 /** Interpolate between two strings based on length
210
211 This interpolation algorithm progressively replaces the
212 front of one string with another. This approach is fast
213 but does not look good when the strings are similar.
214 */
215
216 // TODO: Consider fast-pathing this even more for very large
217 // strings, e.g. in the megabyte range. These are large enough
218 // that
219 if (a.match(MULTI) || b.match(MULTI)) {
220 var ca = a.match(GLYPH) || [];
221 var cb = b.match(GLYPH) || [];
222 var calen = Math.round(ca.length * p);
223 var cblen = Math.round(cb.length * p);
224 var r = cb.slice(0, cblen);
225 r.push.apply(r, ca.slice(calen, ca.length));
226 return r.join("");
227 } else {
228 var alen = Math.round(a.length * p);
229 var blen = Math.round(b.length * p);
230 return b.substring(0, blen) + a.substring(alen, a.length);
231 }
232 }
233
234 function lerp(a, b, p) {
235 /** Interpolate between two strings as best as possible
236
237 If the strings are identical aside from numbers in them,
238 they are passed through numericLerp.
239
240 If the strings are not numbers and short, they are passed
241 through diffLerp.
242
243 Otherwise, they are passed through fastLerp.
244 */
245 a = a.toString();
246 b = b.toString();
247
248 // Fast path for boundary cases.
249 if (p === 0) return a;
250 if (p === 1) return b;
251
252 if (areNumericTwins(a, b))
253 return numericLerp(a, b, p);
254
255 // Numeric lerps should over- and under-shoot when fed numbers
256 // outside 0 to 1, but other types cannot.
257 if (p < 0) return a;
258 if (p > 1) return b;
259
260 var n = a.length * b.length;
261 return ((n && n < MAX_MATRIX_SIZE) ? diffLerp : fastLerp)(a, b, p);
262 }
263
264 exports.costMatrix = costMatrix;
265 exports.patch = patch;
266 exports.diff = diff;
267 exports.fastLerp = fastLerp;
268 exports.diffLerp = diffLerp;
269 exports.numericLerp = numericLerp;
270 exports.lerp = lerp;
271
272 })(typeof exports === "undefined" ? (this.stringLerp = {}) : exports);