1 // Taken from: https://github.com/walling/unorm/blob/master/lib/unorm.js
4 * UnicodeNormalizer 1.0.0
5 * Copyright (c) 2008 Matsuza
6 * Dual licensed under the MIT (MIT-LICENSE.txt) and
7 * GPL (GPL-LICENSE.txt) licenses.
8 * $Date: 2008-06-05 16:44:17 +0200 (Thu, 05 Jun 2008) $
14 var primitiveSet = require('../../../object/primitive-set')
15 , validValue = require('../../../object/valid-value')
16 , data = require('./_data')
19 , forms = primitiveSet('NFC', 'NFD', 'NFKC', 'NFKD')
21 , DEFAULT_FEATURE = [null, 0, {}], CACHE_THRESHOLD = 10, SBase = 0xAC00
22 , LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7, LCount = 19, VCount = 21
23 , TCount = 28, NCount = VCount * TCount, SCount = LCount * NCount
24 , UChar, cache = {}, cacheCounter = [], i, fromCache, fromData, fromCpOnly
25 , fromRuleBasedJamo, fromCpFilter, strategies, UCharIterator
26 , RecursDecompIterator, DecompIterator, CompIterator, createIterator
29 UChar = function (cp, feature) {
31 this.feature = feature;
// Start every per-bucket hit counter at zero. Buckets are indexed 0x00..0xFF,
// the same range produced by the `(cp >> 8) & 0xFF` lookup in fromCache.
i = 0;
while (i <= 0xFF) {
	cacheCounter[i] = 0;
	++i;
}
37 fromCache = function (next, cp, needFeature) {
40 ret = next(cp, needFeature);
41 if (!!ret.feature && ++cacheCounter[(cp >> 8) & 0xFF] > CACHE_THRESHOLD) {
48 fromData = function (next, cp, needFeature) {
49 var hash = cp & 0xFF00, dunit = UChar.udata[hash] || {}, f = dunit[cp];
50 return f ? new UChar(cp, f) : new UChar(cp, DEFAULT_FEATURE);
52 fromCpOnly = function (next, cp, needFeature) {
53 return !!needFeature ? next(cp, needFeature) : new UChar(cp, null);
56 fromRuleBasedJamo = function (next, cp, needFeature) {
57 var c, base, i, arr, SIndex, TIndex, feature, j;
58 if (cp < LBase || (LBase + LCount <= cp && cp < SBase) ||
59 (SBase + SCount < cp)) {
60 return next(cp, needFeature);
62 if (LBase <= cp && cp < LBase + LCount) {
64 base = (cp - LBase) * VCount;
65 for (i = 0; i < VCount; ++i) {
66 c[VBase + i] = SBase + TCount * (i + base);
70 return new UChar(cp, arr);
74 TIndex = SIndex % TCount;
77 feature[0] = [SBase + SIndex - TIndex, TBase + TIndex];
79 feature[0] = [LBase + floor(SIndex / NCount), VBase +
80 floor((SIndex % NCount) / TCount)];
82 for (j = 1; j < TCount; ++j) {
83 feature[2][TBase + j] = cp + j;
86 return new UChar(cp, feature);
89 fromCpFilter = function (next, cp, needFeature) {
90 return (cp < 60) || ((13311 < cp) && (cp < 42607))
91 ? new UChar(cp, DEFAULT_FEATURE) : next(cp, needFeature);
// Lookup strategies, listed outermost first. They are folded together with
// reduceRight, so each entry receives the one after it as its `next` fallback.
strategies = [
	fromCpFilter,
	fromCache,
	fromCpOnly,
	fromRuleBasedJamo,
	fromData
];
96 UChar.fromCharCode = strategies.reduceRight(function (next, strategy) {
97 return function (cp, needFeature) { return strategy(next, cp, needFeature); };
/**
 * Reports whether a UTF-16 code unit is a high (leading) surrogate.
 *
 * @param {number} cp - code unit value to classify
 * @returns {boolean} true when cp lies within 0xD800..0xDBFF, inclusive
 */
UChar.isHighSurrogate = function (cp) {
	return 0xD800 <= cp && cp <= 0xDBFF;
};
/**
 * Reports whether a UTF-16 code unit is a low (trailing) surrogate.
 *
 * @param {number} cp - code unit value to classify
 * @returns {boolean} true when cp lies within 0xDC00..0xDFFF, inclusive
 */
UChar.isLowSurrogate = function (cp) {
	return 0xDC00 <= cp && cp <= 0xDFFF;
};
103 UChar.prototype.prepFeature = function () {
105 this.feature = UChar.fromCharCode(this.codepoint, true).feature;
109 UChar.prototype.toString = function () {
111 if (this.codepoint < 0x10000) return String.fromCharCode(this.codepoint);
112 x = this.codepoint - 0x10000;
113 return String.fromCharCode(floor(x / 0x400) + 0xD800, x % 0x400 + 0xDC00);
116 UChar.prototype.getDecomp = function () {
118 return this.feature[0] || null;
121 UChar.prototype.isCompatibility = function () {
123 return !!this.feature[1] && (this.feature[1] & (1 << 8));
125 UChar.prototype.isExclude = function () {
127 return !!this.feature[1] && (this.feature[1] & (1 << 9));
129 UChar.prototype.getCanonicalClass = function () {
131 return !!this.feature[1] ? (this.feature[1] & 0xff) : 0;
133 UChar.prototype.getComposite = function (following) {
136 if (!this.feature[2]) return null;
137 cp = this.feature[2][following.codepoint];
138 return cp ? UChar.fromCharCode(cp) : null;
141 UCharIterator = function (str) {
145 UCharIterator.prototype.next = function () {
146 if (!!this.str && this.cursor < this.str.length) {
147 var cp = this.str.charCodeAt(this.cursor++), d;
148 if (UChar.isHighSurrogate(cp) && this.cursor < this.str.length &&
149 UChar.isLowSurrogate((d = this.str.charCodeAt(this.cursor)))) {
150 cp = (cp - 0xD800) * 0x400 + (d - 0xDC00) + 0x10000;
153 return UChar.fromCharCode(cp);
159 RecursDecompIterator = function (it, cano) {
161 this.canonical = cano;
165 RecursDecompIterator.prototype.next = function () {
166 var recursiveDecomp, uchar;
167 recursiveDecomp = function (cano, uchar) {
168 var decomp = uchar.getDecomp(), ret, i, a, j;
169 if (!!decomp && !(cano && uchar.isCompatibility())) {
171 for (i = 0; i < decomp.length; ++i) {
172 a = recursiveDecomp(cano, UChar.fromCharCode(decomp[i]));
// ret.concat(a) cannot be used here: concat returns a new array and leaves
// ret unchanged, so the elements of a are pushed onto ret one by one instead.
175 for (j = 0; j < a.length; ++j) ret.push(a[j]);
181 if (this.resBuf.length === 0) {
182 uchar = this.it.next();
183 if (!uchar) return null;
184 this.resBuf = recursiveDecomp(this.canonical, uchar);
186 return this.resBuf.shift();
189 DecompIterator = function (it) {
194 DecompIterator.prototype.next = function () {
195 var cc, uchar, inspt, uchar2, cc2;
196 if (this.resBuf.length === 0) {
198 uchar = this.it.next();
200 cc = uchar.getCanonicalClass();
201 inspt = this.resBuf.length;
203 for (inspt; inspt > 0; --inspt) {
204 uchar2 = this.resBuf[inspt - 1];
205 cc2 = uchar2.getCanonicalClass();
206 if (cc2 <= cc) break;
209 this.resBuf.splice(inspt, 0, uchar);
212 return this.resBuf.shift();
215 CompIterator = function (it) {
219 this.lastClass = null;
222 CompIterator.prototype.next = function () {
223 var uchar, starter, composite, cc;
224 while (this.resBuf.length === 0) {
225 uchar = this.it.next();
227 this.resBuf = this.procBuf;
231 if (this.procBuf.length === 0) {
232 this.lastClass = uchar.getCanonicalClass();
233 this.procBuf.push(uchar);
235 starter = this.procBuf[0];
236 composite = starter.getComposite(uchar);
237 cc = uchar.getCanonicalClass();
238 if (!!composite && (this.lastClass < cc || this.lastClass === 0)) {
239 this.procBuf[0] = composite;
242 this.resBuf = this.procBuf;
246 this.procBuf.push(uchar);
250 return this.resBuf.shift();
253 createIterator = function (mode, str) {
256 return new DecompIterator(
257 new RecursDecompIterator(new UCharIterator(str), true)
260 return new DecompIterator(
261 new RecursDecompIterator(new UCharIterator(str), false)
264 return new CompIterator(new DecompIterator(
265 new RecursDecompIterator(new UCharIterator(str), true)
268 return new CompIterator(new DecompIterator(
269 new RecursDecompIterator(new UCharIterator(str), false)
272 throw mode + " is invalid";
274 normalize = function (mode, str) {
275 var it = createIterator(mode, str), ret = "", uchar;
276 while (!!(uchar = it.next())) ret += uchar.toString();
283 module.exports = function (/*form*/) {
284 var str = String(validValue(this)), form = arguments[0];
285 if (form === undefined) form = 'NFC';
286 else form = String(form);
287 if (!forms[form]) throw new RangeError('Invalid normalization form: ' + form);
288 return normalize(form, str);