Handle surrogate pairs and combining characters correctly.
[string-lerp.git] / string-lerp.js
1 (function (exports) {
2 "use strict";
3
// Threshold on a.length * b.length used by lerp(): strictly below
// this product it picks diffLerp, otherwise it falls back to
// fastLerp. Equivalent to a 256 x 256 edit-distance matrix.
var MAX_MATRIX_SIZE = Math.pow(256, 2);
5
function levenshteinMatrix(s, t) {
    /** Build the Levenshtein edit distance table for s and t

        s and t may be strings or arrays; items are compared with
        ===. The table has (s.length + 1) rows and (t.length + 1)
        columns and is returned flattened row by row into a
        Uint32Array, so the cell for row r, column c is stored at
        index (t.length + 1) * r + c.

        Following http://en.wikipedia.org/wiki/Levenshtein_distance
    */
    var rows = s.length + 1;
    var cols = t.length + 1;
    var table = new Uint32Array(rows * cols);
    var r, c, here, diagonal;

    // Borders: the first column holds the row number and the
    // first row holds the column number.
    for (r = 1; r < rows; ++r)
        table[cols * r] = r;
    for (c = 1; c < cols; ++c)
        table[c] = c;

    // Each cell only reads its upper, left and upper-left
    // neighbours, all of which are already filled when walking
    // the table row by row.
    for (r = 1; r < rows; ++r) {
        for (c = 1; c < cols; ++c) {
            here = cols * r + c;
            diagonal = here - cols - 1;
            if (s[r - 1] === t[c - 1]) {
                table[here] = table[diagonal];
            } else {
                table[here] = 1 + Math.min(table[here - cols],
                                           table[here - 1],
                                           table[diagonal]);
            }
        }
    }
    return table;
}
31
function editPath(d, t) {
    /** Turn a Levenshtein matrix and its target into an edit list

        d is a flat matrix laid out as levenshteinMatrix produces
        it (t.length + 1 columns) and t is the target sequence.
        The walk starts at the last cell and moves back to the
        first one, so edits are listed from the end of the
        sequences towards the start. Each edit is one of

            ["sub", index, item]   replace the item at index
            ["ins", index, item]   insert item before index
            ["del", index]         remove the item at index
    */
    var edits = [];
    var col = t.length;
    var width = col + 1;
    var row = d.length / width - 1;
    var diagonal, above, left;

    function cell(r, c) {
        return d[width * r + c];
    }

    while (row || col) {
        // Neighbours that would fall outside the matrix are
        // treated as infinitely expensive.
        diagonal = (row && col) ? cell(row - 1, col - 1) : Infinity;
        above = row ? cell(row - 1, col) : Infinity;
        left = col ? cell(row, col - 1) : Infinity;

        if (diagonal <= left && diagonal <= above) {
            // An unchanged cost along the diagonal means the
            // items matched and no edit is needed.
            if (cell(row, col) !== cell(row - 1, col - 1))
                edits.push(["sub", row - 1, t[col - 1]]);
            --row;
            --col;
            continue;
        }

        if (left <= above) {
            edits.push(["ins", row, t[col - 1]]);
            --col;
        } else {
            edits.push(["del", row - 1]);
            --row;
        }
    }
    return edits;
}
56
function diff(s, t) {
    /** Compute the list of edits between s and t

        Builds the Levenshtein matrix for the pair and converts
        it into an edit list with editPath.
    */
    var matrix = levenshteinMatrix(s, t);
    return editPath(matrix, t);
}
61
function patch(edits, s) {
    /** Apply a list of edits to s and return the result

        Each edit is ["sub", index, item], ["ins", index, item]
        or ["del", index]; entries with any other tag are
        ignored. Edits are applied in list order.

        When s is an array it is modified in place and the same
        array is returned. Otherwise s is treated as a string and
        a new string is built for every edit.
    */
    var inPlace = Array.isArray(s);
    var k, op, at, item;

    for (k = 0; k < edits.length; ++k) {
        op = edits[k][0];
        at = edits[k][1];
        item = edits[k][2];

        if (op === "sub") {
            if (inPlace)
                s[at] = item;
            else
                s = s.slice(0, at) + item + s.slice(at + 1);
        } else if (op === "ins") {
            if (inPlace)
                s.splice(at, 0, item);
            else
                s = s.slice(0, at) + item + s.slice(at);
        } else if (op === "del") {
            if (inPlace)
                s.splice(at, 1);
            else
                s = s.slice(0, at) + s.slice(at + 1);
        }
    }
    return s;
}
100
// Matches when a string contains at least one surrogate pair or
// one combining mark from the ranges U+0300-036F, U+1DC0-1DFF,
// U+20D0-20FF or U+FE20-FE2F, i.e. when splitting by UTF-16 code
// unit could tear a visible character apart.
var MULTI = /[\uD800-\uDBFF][\uDC00-\uDFFF]|[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/;

// Splits a string into "glyphs" when used with String#match:
//   - a base (any non-combining code unit, a surrogate pair, or a
//     lone high surrogate) followed by any number of combining
//     marks, or
//   - a run of combining marks that has no base in front of it
//     (only possible at the very start of the string).
// The second alternative guarantees that every code unit belongs
// to some match, so joining the matches reproduces the input;
// without it, leading combining marks would be dropped.
var GLYPH = /([\0-\u02FF\u0370-\u1DBF\u1E00-\u20CF\u2100-\uD7FF\uDC00-\uFE1F\uFE30-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF])([\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]*)|[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]+/g;
104
function diffLerp(a, b, p) {
    /** Interpolate between two strings based on edit distance

        This interpolation algorithm applies a partial edit of
        one string into the other. This produces nice looking
        results, but can take a significant amount of time and
        memory to compute the edits. It is not recommended for
        strings longer than a few hundred characters.

        a and b may also be arrays, in which case an array is
        returned; the inputs are never modified. p is the
        interpolation parameter: values at or below 0 give a,
        values at or above 1 give b.
    */

    // If given strings with astral codepoints or combining
    // characters, split them into arrays of "glyphs" first,
    // do the edit on the list of "glyphs", and rejoin them.
    //
    // This split is not perfect for all languages, but at least
    // it won't create invalid surrogate pairs or orphaned
    // combining characters.
    if (a.match && a.match(MULTI) || b.match && b.match(MULTI)) {
        var ca = a.match(GLYPH) || [];
        var cb = b.match(GLYPH) || [];
        return diffLerp(ca, cb, p).join("");
    }

    // The edit path works from the string end, forwards, because
    // that's how Levenshtein edits work. To match LTR reading
    // direction (and the behavior of fastLerp), swap the strings
    // and invert the parameter when editing.
    var edits = diff(b, a);

    // Keep the number of applied edits inside [0, edits.length].
    // A negative count would otherwise be read by slice() as an
    // offset from the end and apply a prefix of the edits for
    // p > 1 instead of none.
    var count = Math.round((1 - p) * edits.length);
    count = Math.max(0, Math.min(edits.length, count));
    var partial = edits.slice(0, count);

    // patch() edits arrays in place, so hand it a copy to leave
    // the caller's array untouched.
    return patch(partial, Array.isArray(b) ? b.slice() : b);
}
136
// An optional leading "-", digits, and optionally a "." followed
// by more digits. The capturing group makes String#split keep
// the matched numbers in its result.
var NUMBERS = /(-?\d+(?:\.\d+)?)/g;

function areNumericTwins(a, b) {
    /** Tell whether a and b are identical apart from their numbers

        Every number in each string is replaced by "0" and the
        results are compared. A leading "-" counts as part of a
        number; a leading "+" does not. Numbers may contain a
        single ".", but no other floating point syntax.
    */
    var skeletonA = a.replace(NUMBERS, "0");
    var skeletonB = b.replace(NUMBERS, "0");
    return skeletonA === skeletonB;
}
148
function nlerp(a, b, p) {
    /** Linearly interpolate between the numbers a and b

        p = 0 gives a and p = 1 gives b; values outside that
        range extrapolate.
    */
    var span = b - a;
    return a + span * p;
}
152
function numericLerp(a, b, p) {
    /** Interpolate numerically between two strings containing numbers

        Numbers may have a leading "-" and a single "." to mark
        the decimal point, but something must be after the ".".
        If both of the numbers in a pair are integers, the result
        is rounded to an integer.

        For example, numericLerp("0.0", "100", 0.123) === "12.3"
        because the "." in "0.0" is interpreted as a decimal
        point. But numericLerp("0.", "100.", 0.123) === "12."
        because the strings are interpreted as integers followed
        by a full stop.

        The text between the numbers is taken from a. Calling
        this function on strings that differ in more than
        numerals gives undefined results.
    */
    var fromParts = a.split(NUMBERS);
    var toParts = b.split(NUMBERS);
    var k, fromText, toText, value, bothIntegers;

    // Splitting on a regex with a capturing group leaves the
    // captured numbers at the odd indices.
    for (k = 1; k < fromParts.length; k += 2) {
        fromText = fromParts[k];
        toText = toParts[k];
        value = nlerp(+fromText, +toText, p);
        bothIntegers = fromText.indexOf(".") === -1 &&
                       toText.indexOf(".") === -1;
        if (bothIntegers)
            value = Math.round(value);
        fromParts[k] = value.toString();
    }
    return fromParts.join("");
}
180
function fastLerp(a, b, p) {
    /** Interpolate between two strings based on length

        This interpolation algorithm progressively replaces the
        front of one string with another. This approach is fast
        but does not look good when the strings are similar.

        a and b must be strings. p is the interpolation
        parameter: values at or below 0 give a, values at or
        above 1 give b.
    */

    // Clamp the parameter so both branches agree outside 0..1.
    // String#substring already clamps its arguments, but
    // Array#slice reads negative indices as offsets from the
    // end, which used to mix the two strings for p < 0 in the
    // glyph branch.
    var t = Math.min(1, Math.max(0, p));

    if (a.match(MULTI) || b.match(MULTI)) {
        // Work on whole "glyphs" so surrogate pairs and
        // combining marks are never split.
        var ca = a.match(GLYPH) || [];
        var cb = b.match(GLYPH) || [];
        var calen = Math.round(ca.length * t);
        var cblen = Math.round(cb.length * t);
        var r = cb.slice(0, cblen);
        r.push.apply(r, ca.slice(calen, ca.length));
        return r.join("");
    } else {
        var alen = Math.round(a.length * t);
        var blen = Math.round(b.length * t);
        return b.substring(0, blen) + a.substring(alen, a.length);
    }
}
202
function lerp(a, b, p) {
    /** Interpolate between two strings as best as possible

        Both arguments are converted with toString() first.

        p === 0 returns a and p === 1 returns b directly.

        If the strings are identical aside from numbers in them,
        they are passed through numericLerp, which may over- or
        under-shoot for p outside 0 to 1.

        Otherwise p is limited to 0..1 (a below, b above). If
        both strings are non-empty and the product of their
        lengths is below MAX_MATRIX_SIZE, they are passed through
        diffLerp.

        Otherwise, they are passed through fastLerp.
    */
    var from = a.toString();
    var to = b.toString();

    // Fast path for the exact end points.
    if (p === 0 || p === 1)
        return p ? to : from;

    if (areNumericTwins(from, to))
        return numericLerp(from, to, p);

    // Only numeric interpolation extrapolates; everything else
    // stops at the end points.
    if (p < 0)
        return from;
    if (p > 1)
        return to;

    var interpolate = fastLerp;
    var cells = from.length * to.length;
    if (cells && cells < MAX_MATRIX_SIZE)
        interpolate = diffLerp;
    return interpolate(from, to, p);
}
232
233 exports.levenshteinMatrix = levenshteinMatrix;
234 exports.patch = patch;
235 exports.diff = diff;
236 exports.fastLerp = fastLerp;
237 exports.diffLerp = diffLerp;
238 exports.numericLerp = numericLerp;
239 exports.lerp = lerp;
240
241 })(typeof exports === "undefined" ? (this.stringLerp = {}) : exports);