Coverage

45%
652
297
355

/Users/deNULL/dev/web/az/dist/az.js

45%
652
297
355
LineHitsSource
11;(function (global, factory) {
21 typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() :
3 typeof define === 'function' && define.amd ? define('Az', factory) :
4 global.Az = factory()
51}(this, function () { 'use strict';
61 if (typeof require != 'undefined' && typeof exports === 'object' && typeof module !== 'undefined') {
71 var fs = require('fs');
8 }
9
// Core namespace object; exposes the resource loader used by the other modules.
var Az = {
  /**
   * Loads a resource and passes the decoded result to `callback`.
   *
   * Uses `fs.readFile` when the enclosing scope obtained `fs` through
   * `require`, otherwise falls back to XMLHttpRequest.
   *
   * @param {string} url File path (fs) or URL (XHR) of the resource.
   * @param {string} responseType Either 'json' or 'arraybuffer'.
   * @param {function(?Error, *=)} [callback] Node-style callback; it is
   *   invoked at most once, with an error or with `null` and the data.
   */
  load: function(url, responseType, callback) {
    var settled = false;

    // Single exit point: tolerates a missing callback and never reports twice.
    function done(err, result) {
      if (settled) {
        return;
      }
      settled = true;
      if (callback) {
        callback(err, result);
      }
    }

    if (fs) {
      fs.readFile(url, { encoding: responseType == 'json' ? 'utf8' : null }, function (err, data) {
        if (err) {
          done(err);
          return;
        }

        if (responseType == 'json') {
          var parsed;
          // Malformed JSON is reported through the callback instead of
          // throwing from inside the asynchronous handler.
          try {
            parsed = JSON.parse(data);
          } catch (parseError) {
            done(parseError);
            return;
          }
          done(null, parsed);
        } else if (responseType == 'arraybuffer') {
          // A Buffer can be a view into a larger ArrayBuffer; hand out exactly
          // the file's bytes, copying only when the view is a real sub-range.
          var bytes = data.buffer;
          if (data.byteOffset !== 0 || data.byteLength !== bytes.byteLength) {
            bytes = bytes.slice(data.byteOffset, data.byteOffset + data.byteLength);
          }
          done(null, bytes);
        } else {
          done(new Error('Unknown responseType'));
        }
      });
      return;
    }

    var xhr = new XMLHttpRequest();
    xhr.open('GET', url, true);
    xhr.responseType = responseType;

    xhr.onload = function () {
      // Status 0 is kept as success (e.g. file:// requests report it).
      if (xhr.response && xhr.status < 400) {
        done(null, xhr.response);
      } else {
        done(new Error('Failed to load ' + url + ' (status ' + xhr.status + ')'));
      }
    };

    xhr.onerror = function () {
      done(new Error('Network error while loading ' + url));
    };

    xhr.send(null);
  }
};
44
451 return Az;
46}));
471;(function (global, factory) {
481 typeof exports === 'object' && typeof module !== 'undefined' ? (module.exports = module.exports || {}) && (module.exports.DAWG = factory(module.exports)) :
49 typeof define === 'function' && define.amd ? define('Az.DAWG', ['Az'], factory) :
50 (global.Az = global.Az || {}) && (global.Az.DAWG = factory(global.Az))
511}(this, function (Az) { 'use strict';
// Index of the unit every lookup starts from.
var ROOT = 0;
// Sentinel returned when a transition does not exist.
var MISSING = -1;
// Applied after bit operations; in JavaScript `x & 0xFFFFFFFF` coerces x to a
// signed 32-bit integer.
var PRECISION_MASK = 0xFFFFFFFF;
// Bit flags and limits of a unit.
var HAS_LEAF_BIT = 1 << 8;
var EXTENSION_BIT = 1 << 9;
var OFFSET_MAX = 1 << 21;
var IS_LEAF_BIT = 1 << 31;
59
// Maps a character code (as returned by String.prototype.charCodeAt) to the
// single byte used for it in the CP1251 table. Codes without an entry are
// not representable.
var CP1251 = (function () {
  var table = {};
  var code, i;

  // Codes 0..127 map onto themselves.
  for (code = 0; code < 128; code++) {
    table[code] = code;
  }

  // Codes above 127 that also keep their numeric value.
  var unchanged = [160, 164, 166, 167, 169, 171, 172, 173, 174, 176, 177, 181, 182, 183, 187];
  for (i = 0; i < unchanged.length; i++) {
    table[unchanged[i]] = unchanged[i];
  }

  // 1040 ('А') .. 1103 ('я') occupy bytes 192..255 in the same order.
  for (code = 1040; code <= 1103; code++) {
    table[code] = code - 848;
  }

  // Remaining entries, listed by ascending byte value.
  var special = {
    1026: 128, 1027: 129, 8218: 130, 1107: 131, 8222: 132, 8230: 133, 8224: 134, 8225: 135,
    8364: 136, 8240: 137, 1033: 138, 8249: 139, 1034: 140, 1036: 141, 1035: 142, 1039: 143,
    1106: 144, 8216: 145, 8217: 146, 8220: 147, 8221: 148, 8226: 149, 8211: 150, 8212: 151,
    8482: 153, 1113: 154, 8250: 155, 1114: 156, 1116: 157, 1115: 158, 1119: 159,
    1038: 161, 1118: 162, 1032: 163, 1168: 165, 1025: 168, 1028: 170, 1031: 175,
    1030: 178, 1110: 179, 1169: 180, 1105: 184, 8470: 185, 1108: 186, 1112: 188,
    1029: 189, 1109: 190, 1111: 191
  };
  for (var key in special) {
    table[key] = special[key];
  }

  return table;
})();
77
// For every key: the characters most likely to be typed instead of it.
// Based on all common ЙЦУКЕН keyboards (both Windows and Apple variations).
var COMMON_TYPOS = {
  // Upper letter row
  'й': 'ёцыф',
  'ц': 'йфыву',
  'у': 'цывак',
  'к': 'увапе',
  'е': 'капрн',
  'н': 'епрог',
  'г': 'нролш',
  'ш': 'голдщ',
  'щ': 'шлджз',
  'з': 'щджэх-',
  'х': 'зжэъ-',
  'ъ': 'хэ-ё',
  // Middle letter row
  'ф': 'йцычяё',
  'ы': 'йцувсчяф',
  'в': 'цукамсчы',
  'а': 'укепимсв',
  'п': 'кенртима',
  'р': 'енгоьтип',
  'о': 'нгшлбьтр',
  'л': 'гшщдюбьо',
  'д': 'шщзжюбл',
  'ж': 'щзхэюд',
  'э': 'зхъжё',
  // Lower letter row (and ё)
  'ё': 'йфяъэ',
  'я': 'ёфыч',
  'ч': 'яфывс',
  'с': 'чывам',
  'м': 'свапи',
  'и': 'мапрт',
  'т': 'ипроь',
  'ь': 'тролб',
  'б': 'ьолдю',
  'ю': 'блдж',
  // Digits and punctuation
  '1': 'ёйц',
  '2': 'йцу',
  '3': 'цук',
  '4': 'уке',
  '5': 'кен',
  '6': 'енг',
  '7': 'нгш',
  '8': 'гшщ',
  '9': 'шщз',
  '0': 'щзх-',
  '-': 'зхъ',
  '=': '-хъ',
  '\\': 'ъэ',
  '.': 'южэ'
};
85
// Extracts the offset field of a unit: the bits above bit 9, shifted left by
// 8 more positions when EXTENSION_BIT is set.
function offset(base) {
  var extraShift = (base & EXTENSION_BIT) >> 6; // 8 if the bit is set, else 0
  var raw = base >> 10;
  return (raw << extraShift) & PRECISION_MASK;
}
89
// Extracts the label of a unit: its low byte together with the leaf bit, so a
// unit with the leaf bit set never compares equal to a plain byte value.
function label(base) {
  var labelMask = IS_LEAF_BIT | 0xFF;
  return base & labelMask & PRECISION_MASK;
}
93
// Tells whether HAS_LEAF_BIT is set in the unit.
function hasLeaf(base) {
  var flag = base & HAS_LEAF_BIT & PRECISION_MASK;
  return flag != 0;
}
97
// Returns the unit with IS_LEAF_BIT cleared, i.e. the stored value.
function value(base) {
  var withoutLeafBit = base & ~IS_LEAF_BIT;
  return withoutLeafBit & PRECISION_MASK;
}
101
/**
 * Dictionary wrapper around two arrays.
 *
 * @param units Array of units, indexed by unit index.
 * @param guide Guide array, read at positions 2*index and 2*index + 1.
 * @param format Payload format tag ('words', 'probs', 'int', ...).
 */
var DAWG = function(units, guide, format) {
  var self = this;
  self.units = units;
  self.guide = guide;
  self.format = format;
};
107
/**
 * Builds a DAWG from a binary image laid out as:
 *   uint32 (little-endian) unit count, the units (4 bytes each),
 *   uint32 (little-endian) guide length, the guide (2 bytes per entry).
 *
 * The returned arrays are views over `data`, not copies.
 *
 * @param {ArrayBuffer} data Binary image.
 * @param format Payload format tag stored on the instance.
 */
DAWG.fromArrayBuffer = function(data, format) {
  var view = new DataView(data);
  var unitsLength = view.getUint32(0, true);
  var guideHeader = unitsLength * 4 + 4; // byte offset of the guide length field
  var guideLength = view.getUint32(guideHeader, true);

  var units = new Uint32Array(data, 4, unitsLength);
  var guide = new Uint8Array(data, guideHeader + 4, guideLength * 2);
  return new DAWG(units, guide, format);
};
117
/**
 * Loads a binary image through Az.load and wraps it into a DAWG.
 *
 * @param {string} url Location passed to Az.load.
 * @param format Payload format tag.
 * @param {function} callback Receives (err, dawg); dawg is null on error.
 */
DAWG.load = function(url, format, callback) {
  function onData(err, data) {
    if (err) {
      callback(err, null);
      return;
    }
    callback(err, DAWG.fromArrayBuffer(data, format));
  }

  Az.load(url, 'arraybuffer', onData);
};
123
/**
 * Follows one transition labelled with byte `c` from unit `index`.
 *
 * @returns The index of the target unit, or MISSING when the unit found at
 *   the computed position does not carry that label.
 */
DAWG.prototype.followByte = function(c, index) {
  var byte = c & 0xFF;
  var target = (index ^ offset(this.units[index]) ^ byte) & PRECISION_MASK;
  return label(this.units[target]) != byte ? MISSING : target;
};
134
/**
 * Follows the transitions for every character of `str`, starting at `index`
 * (ROOT when omitted or falsy).
 *
 * @returns The reached unit index, or MISSING if a character has no CP1251
 *   byte or a transition does not exist.
 */
DAWG.prototype.followString = function(str, index) {
  var pos = 0;
  var code;

  index = index || ROOT;
  while (pos < str.length) {
    code = str.charCodeAt(pos++);
    if (!(code in CP1251)) {
      return MISSING;
    }
    index = this.followByte(CP1251[code], index);
    if (index == MISSING) {
      return MISSING;
    }
  }
  return index;
};
149
// Tells whether the unit at `index` has the has-leaf flag set.
DAWG.prototype.hasValue = function(index) {
  var unit = this.units[index];
  return hasLeaf(unit);
};
153
// Reads the value attached to the unit at `index`: it is stored in the unit
// located at `index ^ offset(unit)`.
DAWG.prototype.value = function(index) {
  var valueIndex = (index ^ offset(this.units[index])) & PRECISION_MASK;
  return value(this.units[valueIndex]);
};
159
/**
 * Exact lookup of `str`.
 *
 * @returns The stored value, or MISSING when the string cannot be followed
 *   or the reached unit carries no value.
 */
DAWG.prototype.find = function(str) {
  var index = this.followString(str);
  if (index == MISSING || !this.hasValue(index)) {
    return MISSING;
  }
  return this.value(index);
};
170
// NOTE: the number at the start of each line below is the coverage report's
// line number fused with its hit count (see the "LineHitsSource" header); it
// is not part of the program.
//
// Enumerates every key reachable from unit `index` and returns the decoded
// entries as an array.
//
// `stack` holds the unit indices along the current path and `key` the labels
// followed to get there. The guide array is read at two positions per unit:
// guide[2 * i] is followed from unit i itself, guide[2 * i + 1] is followed
// from the unit below it on the stack after unit i has been popped. A zero
// label means there is nothing to follow.
//
// Returns `results` as soon as the stack runs empty or a guided transition is
// missing.
1711 DAWG.prototype.iterateAll = function(index) {
1720 var results = [];
1730 var stack = [index];
1740 var key = [];
// `last` is the unit at which the previous entry was emitted; ROOT means
// nothing has been emitted yet.
1750 var last = ROOT;
1760 var label;
177
1780 while (true) {
1790 index = stack[stack.length - 1];
180
// After the first entry: advance to the next key before descending again.
1810 if (last != ROOT) {
1820 label = this.guide[index << 1];
1830 if (label) {
// Go one level deeper.
1840 index = this.followByte(label, index);
1850 if (index == MISSING) {
1860 return results;
187 }
1880 key.push(label);
1890 stack.push(index);
190 } else {
// Nothing deeper: pop units until one of them names a label to follow from
// the unit below it.
1910 do {
1920 label = this.guide[(index << 1) + 1];
1930 key.pop();
1940 stack.pop();
1950 if (!stack.length) {
1960 return results;
197 }
1980 index = stack[stack.length - 1];
1990 if (label) {
2000 index = this.followByte(label, index);
2010 if (index == MISSING) {
2020 return results;
203 }
2040 key.push(label);
2050 stack.push(index);
206 }
207 } while (!label);
208 }
209 }
210
// Descend along guide[2 * i] labels until a unit with a value is reached.
2110 while (!this.hasValue(index)) {
2120 var label = this.guide[index << 1];
2130 index = this.followByte(label, index);
2140 if (index == MISSING) {
2150 return results;
216 }
2170 key.push(label);
2180 stack.push(index);
219 }
220
// Decode the collected key bytes: each pair of bytes yields one number
// ((first ^ 1) << 6) + (second >> 1). 'words' produces two numbers, 'probs'
// three; any other format gets a copy of the raw bytes.
221 // Only three formats supported
2220 if (this.format == 'words') {
2230 results.push([
224 ((key[0] ^ 1) << 6) + (key[1] >> 1),
225 ((key[2] ^ 1) << 6) + (key[3] >> 1)
226 ]);
227 } else
2280 if (this.format == 'probs') {
2290 results.push([
230 ((key[0] ^ 1) << 6) + (key[1] >> 1),
231 ((key[2] ^ 1) << 6) + (key[3] >> 1),
232 ((key[4] ^ 1) << 6) + (key[5] >> 1)
233 ]);
234 } else {
235 // Raw bytes
2360 results.push(key.slice());
237 }
2380 last = index;
239 }
240 }
241
242 // Features:
243 // replaces (е -> ё) (DONE)
244 // stutter (ннет -> нет, гоол -> гол, д-да -> да)
245 // typos (count-limited):
246 // swaps (солво -> слово)
247 // extra letters (свлово -> слово)
248 // missing letters (сово -> слово)
249 // wrong letters (сково -> слово)
// NOTE: the number at the start of each line below is the coverage report's
// line number fused with its hit count (see the "LineHitsSource" header); it
// is not part of the program.
//
// Fuzzy lookup: finds every dictionary key that `str` can be turned into by
// the enabled corrections.
//
//   str      - the string to look up
//   replaces - map from a character to its replacement string (only the first
//              character of the replacement is followed); falsy to disable
//   mstutter - maximum number of stutter merges (falsy is treated as 0)
//   mtypos   - maximum number of typos (falsy is treated as 0)
//
// Returns an array of matches. For format 'int' a match is
// [correctedString, value]; otherwise it is
// [correctedString, this.iterateAll(...), stutterCount, typoCount].
//
// The search keeps a stack of partial matches, each of the form
// [correctedPrefix, charsConsumedFromStr, typos, stutter, unitIndex].
2501 DAWG.prototype.findAll = function(str, replaces, mstutter, mtypos) {
2511 mtypos = mtypos || 0;
2521 mstutter = mstutter || 0;
2531 var results = [],
254 prefixes = [['', 0, 0, 0, ROOT]],
255 prefix, index, len, code, cur, typos, stutter;
256
2571 while (prefixes.length) {
2581 prefix = prefixes.pop();
// Unpack the entry; `prefix` is reassigned last because the other fields are
// read from it.
2591 index = prefix[4], stutter = prefix[3], typos = prefix[2], len = prefix[1], prefix = prefix[0];
260
261 // Done
2621 if (len == str.length) {
2630 if (this.format == 'int') {
2640 if (this.hasValue(index)) {
2650 results.push([prefix, this.value(index)]);
266 }
2670 continue;
268 }
269 // Find all payloads
// For 'words' and 'probs' the payload is reached through byte 1.
2700 if (this.format == 'words' || this.format == 'probs') {
2710 index = this.followByte(1, index); // separator
2720 if (index == MISSING) {
2730 continue;
274 }
275 }
2760 results.push([prefix, this.iterateAll(index), stutter, typos]);
2770 continue;
278 }
279
280 // Follow a replacement path
// A replacement does not count as a typo or a stutter.
2811 if (replaces && str[len] in replaces) {
2820 code = replaces[str[len]].charCodeAt(0);
2830 if (code in CP1251) {
2840 cur = this.followByte(CP1251[code], index);
2850 if (cur != MISSING) {
2860 prefixes.push([ prefix + replaces[str[len]], len + 1, typos, stutter, cur ]);
287 }
288 }
289 }
290
291 // Follow typos path (if not over limit)
// Typos are never combined with stutter on the same path.
2921 if (typos < mtypos && !stutter) {
293 // Skip a letter entirely (extra letter)
2940 prefixes.push([ prefix, len + 1, typos + 1, stutter, index ]);
295
296 // Add a letter (missing) - or - replace a letter
297 // TODO: iterate all childs?
298 // Now it checks only most probable typos (located near to each other on keyboards)
2990 var possible = COMMON_TYPOS[str[len]];
3000 if (possible) {
3010 for (var i = 0; i < possible.length; i++) {
3020 code = possible.charCodeAt(i);
3030 if (code in CP1251) {
3040 cur = this.followByte(CP1251[code], index);
3050 if (cur != MISSING) {
306 // for missing letter we need to iterate all childs, not only COMMON_TYPOS
307 // prefixes.push([ prefix + possible[i], len, typos + 1, stutter, cur ]);
// Only the "wrong letter" case is pushed: the input character is consumed and
// replaced by its keyboard neighbour.
3080 prefixes.push([ prefix + possible[i], len + 1, typos + 1, stutter, cur ]);
309 }
310 }
311 }
312 }
313
314 // Swapped two letters
315 // TODO: support for replacements?
// Follows str[len + 1] and then str[len]; both input characters are consumed.
3160 if (len < str.length - 1) {
3170 code = str.charCodeAt(len + 1);
3180 if (code in CP1251) {
3190 cur = this.followByte(CP1251[code], index);
3200 if (cur != MISSING) {
3210 code = str.charCodeAt(len);
3220 if (code in CP1251) {
3230 cur = this.followByte(CP1251[code], cur);
3240 if (cur != MISSING) {
3250 prefixes.push([ prefix + str[len + 1] + str[len], len + 2, typos + 1, stutter, cur ]);
326 }
327 }
328 }
329 }
330 }
331 }
332
333 // Follow base path
3341 code = str.charCodeAt(len);
3351 if (code in CP1251) {
3361 cur = this.followByte(CP1251[code], index);
3371 if (cur != MISSING) {
3380 prefixes.push([ prefix + str[len], len + 1, typos, stutter, cur ]);
339
// Stutter handling: while the following input repeats the same letter, push
// an entry that skips the repetition. `len` and `stutter` are advanced
// locally; this is safe because the next loop iteration reloads both from the
// popped entry.
3400 while (stutter < mstutter && !typos && len < str.length - 1) {
341 // Follow a simple stutter path (merge two equal letters into one)
3420 if (str[len] == str[len + 1]) {
3430 prefixes.push([ prefix + str[len], len + 2, typos, stutter + 1, cur ]);
3440 len++;
345 } else
346 // Follow a stutter with a dash (д-да)
3470 if (len < str.length - 2 && str[len + 1] == '-' && str[len] == str[len + 2]) {
3480 prefixes.push([ prefix + str[len], len + 3, typos, stutter + 1, cur ]);
3490 len += 2;
350 } else {
3510 break;
352 }
3530 stutter++;
354 }
355 }
356 }
357 }
3581 return results;
359 }
360
3611 return DAWG;
362}));
3631;(function (global, factory) {
3641 typeof exports === 'object' && typeof module !== 'undefined' ? (module.exports = module.exports || {}) && (module.exports.Morph = factory(module.exports)) :
365 typeof define === 'function' && define.amd ? define('Az.Morph', ['Az', 'Az.DAWG'], factory) :
366 (global.Az = global.Az || {}) && (global.Az.Morph = factory(global.Az))
3671}(this, function (Az) { 'use strict';
// Module state. Variables declared without a value are filled in by
// Morph.init.
var words;
var probabilities;
var predictionSuffixes = new Array(3);
var prefixes = [ '', 'по', 'наи' ];
var suffixes;
var grammemes;
var paradigms;
var tags;

// Default analysis options.
var defaults = {
  // Replacements (they work like in pymorphy2).
  // Set to false to disable.
  replacements: { 'е': 'ё' },

  // "Stutter": removes repeated letters, both with a dash ("не-е-ет") and
  // without one ("нееет").
  // Infinity puts no limit on the number of repetitions (counted over the
  // whole word).
  // 0 or false to disable.
  stutter: Infinity,

  // Typos: the maximum number of typos in a word.
  // A typo is:
  //  - an extra letter in the word
  //  - (a missing letter in the word) (TODO: does not work yet)
  //  - a wrong letter (when the right one is next to it on the keyboard)
  //  - two neighbouring letters swapped
  // 0 or false to disable.
  // 'auto':
  //  - 0 if the word is shorter than 5 letters
  //  - 1 if the word is shorter than 10 letters (but only if no parse
  //    without typos was found)
  //  - 2 otherwise (but only if no parse with zero or one typo was found)
  typos: 0

  // Typos and "stutter" are never combined in one parse, because that leads
  // to a large number of variants, especially for words with "stutter".
};
398
// Adapted from
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Object/freeze
//
// Recursively freezes `obj` and every object reachable through its own
// properties, then returns `obj`.
// When Object.freeze is not available the object is returned untouched, so
// callers that write `x = deepFreeze(x)` keep their data.
function deepFreeze(obj) {
  if (!('freeze' in Object)) {
    return obj;
  }

  var propNames = Object.getOwnPropertyNames(obj);
  propNames.forEach(function(name) {
    var prop = obj[name];

    // typeof null is 'object', hence the explicit null check.
    if (typeof prop == 'object' && prop !== null) {
      deepFreeze(prop);
    }
  });

  return Object.freeze(obj);
}
415
//
// Tag instances may be fairly large, because they are shared by all words.
// In return, any modification of these instances is forbidden; in modern
// browsers Object.freeze() is used to enforce that.
// Every grammeme is stored inside a tag in several places:
//   tag[grammeme] = true | false
//   tag[parent] = grammeme
//   tag.stat = [grammeme1, grammeme2, ...]   // immutable grammemes
//   tag.flex = [grammeme1, grammeme2, ...]   // inflected grammemes
//
//   tag.ext[grammemeCyr] = true | false
//   tag.ext[parentCyr] = grammemeCyr
//   tag.ext.stat = [grammemeCyr1, grammemeCyr2, ...]   // immutable grammemes
//   tag.ext.flex = [grammemeCyr1, grammemeCyr2, ...]   // inflected grammemes
//
// Here grammeme is the Latin notation of a grammeme and grammemeCyr the
// Cyrillic one; parent and parentCyr are the parent grammeme.
//
// `str` has the form "stat1,stat2 flex1,flex2"; the second part is optional.
var Tag = function(str) {
  var self = this;
  var parts = str.split(' ');

  // Sets the flag for each grammeme and, when the grammeme has a parent,
  // records it under the parent's name.
  function register(list) {
    for (var n = 0; n < list.length; n++) {
      var name = list[n];
      self[name] = true;
      var parent = grammemes[name].parent;
      if (parent) {
        self[parent] = name;
      }
    }
  }

  this.stat = parts[0].split(',');
  register(this.stat);

  this.flex = parts[1] ? parts[1].split(',') : [];
  register(this.flex);

  // POS is an alias of POST.
  if ('POST' in this) {
    this.POS = this.POST;
  }
};
// Serializes the tag back to "stat1,stat2 flex1,flex2" (no trailing space
// when there are no inflected grammemes).
Tag.prototype.toString = function() {
  var text = this.stat.join(',') + ' ' + this.flex.join(',');
  return text.trim();
};
// Checks agreement with specific grammeme values, or with a list of grammemes
// taken from another tag (or word).
//
//   tag.matches({ 'POS' : 'NOUN', 'GNdr': ['masc', 'neut'] })
//     Keys are grammeme names; values are child grammemes, arrays of
//     acceptable grammemes, or true/false.
//   tag.matches(otherTag, ['POS', 'GNdr'])
//     A tag (or word) plus the list of grammemes whose values must be equal
//     in both tags.
//
// Returns true when every requested grammeme agrees, false otherwise.
Tag.prototype.matches = function(tag, grammemes) {
  if (!grammemes) {
    // Match to map
    for (var k in tag) {
      var expected = tag[k];
      var actual = this[k];

      if (Object.prototype.toString.call(expected) === '[object Array]') {
        // Any element of the array is acceptable, including the first one.
        if (expected.indexOf(actual) === -1) {
          return false;
        }
      } else if (expected === false) {
        // An absent grammeme is stored as undefined, which counts as false.
        if (actual) {
          return false;
        }
      } else if (expected != actual) {
        return false;
      }
    }
    return true;
  }

  if (tag instanceof Word) {
    tag = tag.tag;
  }

  // Match to another tag
  for (var i = 0; i < grammemes.length; i++) {
    if (tag[grammemes[i]] != this[grammemes[i]]) {
      return false;
    }
  }
  return true;
};
492
/**
 * One parse of a word.
 *
 * A paradigm is an array of three equally long sections; the tag index of
 * form `formIdx` is read from the second section.
 *
 * @param val The word form as a string.
 * @param paradigmIdx Index into `paradigms`.
 * @param formIdx Index of this form within the paradigm.
 * @param stutterCnt Stored as-is on the instance.
 * @param typosCnt Stored as-is on the instance.
 */
var Word = function(val, paradigmIdx, formIdx, stutterCnt, typosCnt) {
  var paradigm = paradigms[paradigmIdx];
  var formsCount = paradigm.length / 3;

  this.val = val;
  this.paradigmIdx = paradigmIdx;
  this.paradigm = paradigm;
  this.formIdx = formIdx;
  this.tag = tags[paradigm[formsCount + formIdx]];
  this.stutterCnt = stutterCnt;
  this.typosCnt = typosCnt;
};
// Returns the stem of the word: `val` without this form's prefix and suffix.
// The result is cached in `_base`.
Word.prototype.base = function() {
  if (!this._base) {
    var formsCount = this.paradigm.length / 3;
    var prefixLength = prefixes[this.paradigm[(formsCount << 1) + this.formIdx]].length;
    var suffixLength = suffixes[this.paradigm[this.formIdx]].length;
    this._base = this.val.substring(prefixLength, this.val.length - suffixLength);
  }
  return this._base;
};
// Converts the word to its initial form. Pass keepPOS=true when the part of
// speech must not change during normalization (for example, to avoid turning
// a participle into an infinitive).
// TODO: some part-of-speech changes should probably happen in any case
// (comparatives and the short forms of participles and adjectives are kept
// apart, and the infinitive is separate from the verb).
Word.prototype.normalize = function(keepPOS) {
  if (keepPOS) {
    return this.inflect({ POS: this.tag.POS });
  }
  return this.inflect(0);
};
// Inflects the word so that it agrees with the grammemes of another word or
// tag, or with specific grammemes (see Tag.prototype.matches for details).
// The first suitable form always wins.
// When `tag` is a number and `grammemes` is omitted, that form index is used
// directly.
// Returns [formString, formTag], or false when no form matches.
Word.prototype.inflect = function(tag, grammemes) {
  var self = this;
  var formsCount = this.paradigm.length / 3;

  // Assembles prefix + stem + suffix for the given form together with its tag.
  function build(idx) {
    return [
      prefixes[self.paradigm[(formsCount << 1) + idx]] + self.base() + suffixes[self.paradigm[idx]],
      tags[self.paradigm[formsCount + idx]]
    ];
  }

  if (!grammemes && typeof tag === 'number') {
    // Inflect to specific formIdx
    return build(tag);
  }

  for (var idx = 0; idx < formsCount; idx++) {
    if (tags[this.paradigm[formsCount + idx]].matches(tag, grammemes)) {
      return build(idx);
    }
  }

  return false;
};
// Same as Tag.prototype.matches, applied to this word's tag.
Word.prototype.matches = function(tag, grammemes) {
  var ownTag = this.tag;
  return ownTag.matches(tag, grammemes);
};
// Prints information about the word to the console: its counters, the
// prefix|stem|suffix split, its tag, both normal forms and, in a collapsed
// group, every form of the paradigm.
Word.prototype.log = function() {
  var formsCount = this.paradigm.length / 3;

  // Renders an inflect() result as "form (tag)".
  function describe(form) {
    return form[0] + ' (' + form[1].ext.toString() + ')';
  }

  console.group(this.val);
  console.log('Stutter?', this.stutterCnt, 'Typos?', this.typosCnt);
  console.log(
    prefixes[this.paradigm[(formsCount << 1) + this.formIdx]] + '|' +
    this.base() + '|' +
    suffixes[this.paradigm[this.formIdx]]);
  console.log(this.tag.ext.toString());
  console.log('=> ', describe(this.normalize()));
  console.log('=> ', describe(this.normalize(true)));

  console.groupCollapsed('Все формы: ' + formsCount);
  for (var idx = 0; idx < formsCount; idx++) {
    console.log(describe(this.inflect(idx)));
  }
  console.groupEnd();

  console.groupEnd();
};
556
/**
 * Analyses a word and returns all of its parses.
 *
 * @param {string} word The word to analyse.
 * @param {Object} [config] Options overriding `defaults` (replacements,
 *   stutter, typos). The object is only read, never modified.
 * @returns {Array} Word instances, one per parse; empty when nothing is found.
 */
var Morph = function(word, config) {
  // Effective options: a key present in `config` wins, anything else comes
  // from `defaults`. They are collected in a fresh object so that neither the
  // caller's config nor the shared defaults are written to.
  var opts = {};
  for (var k in defaults) {
    opts[k] = (config && k in config) ? config[k] : defaults[k];
  }

  var found;
  if (opts.typos === 'auto') {
    // Allow more typos only when stricter searches found nothing and the word
    // is long enough.
    found = words.findAll(word, opts.replacements, opts.stutter, 0);
    if (!found.length && word.length > 4) {
      found = words.findAll(word, opts.replacements, opts.stutter, 1);
      if (!found.length && word.length > 9) {
        found = words.findAll(word, opts.replacements, opts.stutter, 2);
      }
    }
  } else {
    found = words.findAll(word, opts.replacements, opts.stutter, opts.typos);
  }

  // Each match is [string, [[paradigmIdx, formIdx], ...], stutter, typos].
  var parses = [];
  for (var i = 0; i < found.length; i++) {
    for (var j = 0; j < found[i][1].length; j++) {
      parses.push(new Word(found[i][0], found[i][1][j][0], found[i][1][j][1], found[i][2], found[i][3]));
    }
  }
  return parses;
};
590
// Replaces the module-wide default options with `config`. The object is
// stored as-is, not copied.
Morph.setDefaults = function(config) {
  defaults = config;
};
594
/**
 * Loads every dictionary file from `path` and prepares the module state.
 *
 * @param {string} path Directory (or URL prefix) containing the dictionary
 *   files.
 * @param {function(?Error, Function=)} [callback] Called exactly once: with
 *   (null, Morph) when everything is loaded, or with the first error reported
 *   by a loader or raised while processing the loaded data.
 */
Morph.init = function(path, callback) {
  var tagsInt, tagsExt;
  // Starts at 1 so that the count cannot reach zero before every request has
  // been issued, even if a loader calls back synchronously.
  var pending = 1;
  var finished = false;

  // Reports the outcome once; later completions are ignored.
  function finish(err) {
    if (finished) {
      return;
    }
    finished = true;
    if (callback) {
      if (err) {
        callback(err);
      } else {
        callback(null, Morph);
      }
    }
  }

  // Builds the shared, frozen Tag instances from both gramtab files.
  function buildTags() {
    var list = Array(tagsInt.length);
    for (var i = 0; i < tagsInt.length; i++) {
      list[i] = new Tag(tagsInt[i]);
      list[i].ext = new Tag(tagsExt[i]);
    }
    tags = list;
    deepFreeze(tags);
  }

  // Marks one unit of work as done; the last one finalizes the module.
  function settle() {
    if (finished || --pending) {
      return;
    }
    try {
      buildTags();
    } catch (e) {
      finish(e);
      return;
    }
    finish();
  }

  // Issues one load through owner.load and stores its result via `store`.
  function request(owner, file, type, store) {
    pending++;
    owner.load(path + '/' + file, type, function(err, data) {
      if (finished) {
        return;
      }
      if (err) {
        finish(err);
        return;
      }
      try {
        store(data);
      } catch (e) {
        finish(e);
        return;
      }
      settle();
    });
  }

  request(Az.DAWG, 'words.dawg', 'words', function(dawg) {
    words = dawg;
  });

  for (var prefix = 0; prefix < 3; prefix++) {
    (function(prefix) {
      request(Az.DAWG, 'prediction-suffixes-' + prefix + '.dawg', 'probs', function(dawg) {
        predictionSuffixes[prefix] = dawg;
      });
    })(prefix);
  }

  request(Az.DAWG, 'p_t_given_w.intdawg', 'int', function(dawg) {
    probabilities = dawg;
  });

  request(Az, 'grammemes.json', 'json', function(json) {
    // Each row is [internal, parent, external, externalFull]; the record is
    // registered under both the internal and the external name.
    grammemes = {};
    for (var i = 0; i < json.length; i++) {
      grammemes[json[i][0]] = grammemes[json[i][2]] = {
        parent: json[i][1],
        internal: json[i][0],
        external: json[i][2],
        externalFull: json[i][3]
      };
    }
  });

  request(Az, 'gramtab-opencorpora-int.json', 'json', function(json) {
    tagsInt = json;
  });

  request(Az, 'gramtab-opencorpora-ext.json', 'json', function(json) {
    tagsExt = json;
  });

  request(Az, 'suffixes.json', 'json', function(json) {
    suffixes = json;
  });

  request(Az, 'paradigms.array', 'arraybuffer', function(data) {
    // Uint16 layout: paradigm count, then for each paradigm its size followed
    // by that many values.
    var list = new Uint16Array(data),
        count = list[0],
        pos = 1;

    paradigms = [];
    for (var i = 0; i < count; i++) {
      var size = list[pos++];
      paradigms.push(list.subarray(pos, pos + size));
      pos += size;
    }
  });

  // Release the initial hold taken when `pending` was initialised.
  settle();
};
679
6801 return Morph;
681}));
6821;(function (global, factory) {
6831 typeof exports === 'object' && typeof module !== 'undefined' ? (module.exports = module.exports || {}) && (module.exports.Syntax = factory(module.exports)) :
684 typeof define === 'function' && define.amd ? define('Az.Syntax', ['Az'], factory) :
685 (global.Az = global.Az || {}) && (global.Az.Syntax = factory(global.Az))
6861}(this, function (Az) { 'use strict';
// TBD: syntax analyzer. For now this is an empty placeholder constructor.
var Syntax = function() {};
691
6921 return Syntax;
693}));
6941;(function (global, factory) {
6951 typeof exports === 'object' && typeof module !== 'undefined' ? (module.exports = module.exports || {}) && (module.exports.Tokens = factory()) :
696 typeof define === 'function' && define.amd ? define('Az.Tokens', ['Az'], factory) :
697 (global.Az = global.Az || {}) && (global.Az.Tokens = factory())
6981}(this, function () { 'use strict';
6991 var TLDs = 'ac|ad|ae|aero|af|ag|ai|al|am|ao|aq|ar|arpa|as|asia|at|au|aw|ax|az|ba|bb|be|bf|bg|bh|bi|biz|bj|bm|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci|cl|cm|cn|co|com|coop|cr|cu|cv|cw|cx|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|es|et|eu|fi|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|info|int|io|iq|ir|is|it|je|jo|jobs|jp|kg|ki|km|kn|kp|kr|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum|mv|mw|mx|my|na|name|nc|ne|net|nf|ng|nl|no|nr|nu|nz|om|org|pa|pe|pf|ph|pk|pl|pm|pn|post|pr|pro|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sx|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tr|travel|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|yt|امارات|հայ|বাংলা|бел|中国|中國|الجزائر|مصر|ею|გე|ελ|香港|भारत|بھارت|భారత్|ભારત|ਭਾਰਤ|ভারত|இந்தியா|ایران|ايران|عراق|الاردن|한국|қаз|ලංකා|இலங்கை|المغرب|мкд|мон|澳門|澳门|مليسيا|عمان|پاکستان|پاكستان|فلسطين|срб|рф|قطر|السعودية|السعودیة|السعودیۃ|السعوديه|سودان|新加坡|சிங்கப்பூர்|سورية|سوريا|ไทย|تونس|台灣|台湾|臺灣|укр|اليمن|xxx|zm|aaa|aarp|abarth|abb|abbott|abbvie|abc|able|abogado|abudhabi|academy|accenture|accountant|accountants|aco|active|actor|adac|ads|adult|aeg|aetna|afamilycompany|afl|africa|africamagic|agakhan|agency|aig|aigo|airbus|airforce|airtel|akdn|alfaromeo|alibaba|alipay|allfinanz|allstate|ally|alsace|alstom|americanexpress|americanfamily|amex|amfam|amica|amsterdam|analytics|android|anquan|anz|aol|apartments|app|apple|aquarelle|arab|aramco|archi|army|art|arte|asda|associates|athleta|attorney|auction|audi|audible|audio|auspost|author|auto|autos|avianca|aws|axa|azure|baby|baidu|banamex|bananarepublic|band|bank|bar|barcelona|barclaycard|barclays|barefoot|bargains|baseball|basketball|bauhaus|bayern|bbc|bbt|bbva|bcg|bcn|beats|beauty|beer|bentley|berlin|best|bestbuy|bet|bharti|bible|bid|bike|bing|bingo|bio|black|blackfriday|blanco|blockbuster|blog|bloomberg|blue|bms|bmw|bnl|bnpparibas|boats|boehringer|b
ofa|bom|bond|boo|book|booking|boots|bosch|bostik|boston|bot|boutique|box|bradesco|bridgestone|broadway|broker|brother|brussels|budapest|bugatti|build|builders|business|buy|buzz|bzh|cab|cafe|cal|call|calvinklein|camera|camp|cancerresearch|canon|capetown|capital|capitalone|car|caravan|cards|care|career|careers|cars|cartier|casa|case|caseih|cash|casino|catering|catholic|cba|cbn|cbre|cbs|ceb|center|ceo|cern|cfa|cfd|chanel|channel|chase|chat|cheap|chintai|chloe|christmas|chrome|chrysler|church|cipriani|circle|cisco|citadel|citi|citic|city|cityeats|claims|cleaning|click|clinic|clinique|clothing|cloud|club|clubmed|coach|codes|coffee|college|cologne|comcast|commbank|community|company|compare|computer|comsec|condos|construction|consulting|contact|contractors|cooking|cookingchannel|cool|corsica|country|coupon|coupons|courses|credit|creditcard|creditunion|cricket|crown|crs|cruise|cruises|csc|cuisinella|cymru|cyou|dabur|dad|dance|date|dating|datsun|day|dclk|dds|deal|dealer|deals|degree|delivery|dell|deloitte|delta|democrat|dental|dentist|desi|design|dev|dhl|diamonds|diet|digital|direct|directory|discount|discover|dish|diy|dnp|docs|dodge|dog|doha|domains|dot|download|drive|dstv|dtv|dubai|duck|dunlop|duns|dupont|durban|dvag|dwg|earth|eat|edeka|education|email|emerck|emerson|energy|engineer|engineering|enterprises|epost|epson|equipment|ericsson|erni|esq|estate|esurance|etisalat|eurovision|eus|events|everbank|exchange|expert|exposed|express|extraspace|fage|fail|fairwinds|faith|family|fan|fans|farm|farmers|fashion|fast|fedex|feedback|ferrari|ferrero|fiat|fidelity|fido|film|final|finance|financial|fire|firestone|firmdale|fish|fishing|fit|fitness|flickr|flights|flir|florist|flowers|flsmidth|fly|foo|foodnetwork|football|ford|forex|forsale|forum|foundation|fox|free|fresenius|frl|frogans|frontdoor|frontier|ftr|fujitsu|fujixerox|fun|fund|furniture|futbol|fyi|gal|gallery|gallo|gallup|game|games|gap|garden|gbiz|gdn|gea|gent|genting|george|ggee|gift|gifts|gives|giving|glade|glass|gle|global|
globo|gmail|gmbh|gmo|gmx|godaddy|gold|goldpoint|golf|goo|goodhands|goodyear|goog|google|gop|got|gotv|grainger|graphics|gratis|green|gripe|group|guardian|gucci|guge|guide|guitars|guru|hair|hamburg|hangout|haus|hbo|hdfc|hdfcbank|health|healthcare|help|helsinki|here|hermes|hgtv|hiphop|hisamitsu|hitachi|hiv|hkt|hockey|holdings|holiday|homedepot|homegoods|homes|homesense|honda|honeywell|horse|host|hosting|hot|hoteles|hotmail|house|how|hsbc|htc|hughes|hyatt|hyundai|ibm|icbc|ice|icu|ieee|ifm|iinet|ikano|imamat|imdb|immo|immobilien|industries|infiniti|ing|ink|institute|insurance|insure|intel|international|intuit|investments|ipiranga|irish|iselect|ismaili|ist|istanbul|itau|itv|iveco|iwc|jaguar|java|jcb|jcp|jeep|jetzt|jewelry|jio|jlc|jll|jmp|jnj|joburg|jot|joy|jpmorgan|jprs|juegos|juniper|kaufen|kddi|kerryhotels|kerrylogistics|kerryproperties|kfh|kia|kim|kinder|kindle|kitchen|kiwi|koeln|komatsu|kosher|kpmg|kpn|krd|kred|kuokgroup|kyknet|kyoto|lacaixa|ladbrokes|lamborghini|lamer|lancaster|lancia|lancome|land|landrover|lanxess|lasalle|lat|latino|latrobe|law|lawyer|lds|lease|leclerc|lefrak|legal|lego|lexus|lgbt|liaison|lidl|life|lifeinsurance|lifestyle|lighting|like|lilly|limited|limo|lincoln|linde|link|lipsy|live|living|lixil|loan|loans|locker|locus|loft|lol|london|lotte|lotto|love|lpl|lplfinancial|ltd|ltda|lundbeck|lupin|luxe|luxury|macys|madrid|maif|maison|makeup|man|management|mango|market|marketing|markets|marriott|marshalls|maserati|mattel|mba|mcd|mcdonalds|mckinsey|med|media|meet|melbourne|meme|memorial|men|menu|meo|metlife|miami|microsoft|mini|mint|mit|mitsubishi|mlb|mls|mma|mnet|mobily|moda|moe|moi|mom|monash|money|monster|montblanc|mopar|mormon|mortgage|moscow|moto|motorcycles|mov|movie|movistar|msd|mtn|mtpc|mtr|multichoice|mutual|mutuelle|mzansimagic|nab|nadex|nagoya|naspers|nationwide|natura|navy|nba|nec|netbank|netflix|network|neustar|new|newholland|news|next|nextdirect|nexus|nfl|ngo|nhk|nico|nike|nikon|ninja|nissan|nissay|nokia|northwesternmutual|norton|now|nowruz|n
owtv|nra|nrw|ntt|nyc|obi|observer|off|office|okinawa|olayan|olayangroup|oldnavy|ollo|omega|one|ong|onl|online|onyourside|ooo|open|oracle|orange|organic|orientexpress|origins|osaka|otsuka|ott|ovh|page|pamperedchef|panasonic|panerai|paris|pars|partners|parts|party|passagens|pay|payu|pccw|pet|pfizer|pharmacy|philips|photo|photography|photos|physio|piaget|pics|pictet|pictures|pid|pin|ping|pink|pioneer|pizza|place|play|playstation|plumbing|plus|pnc|pohl|poker|politie|porn|pramerica|praxi|press|prime|prod|productions|prof|progressive|promo|properties|property|protection|pru|prudential|pub|pwc|qpon|quebec|quest|qvc|racing|raid|read|realestate|realtor|realty|recipes|red|redstone|redumbrella|rehab|reise|reisen|reit|reliance|ren|rent|rentals|repair|report|republican|rest|restaurant|review|reviews|rexroth|rich|richardli|ricoh|rightathome|ril|rio|rip|rmit|rocher|rocks|rodeo|rogers|room|rsvp|ruhr|run|rwe|ryukyu|saarland|safe|safety|sakura|sale|salon|samsclub|samsung|sandvik|sandvikcoromant|sanofi|sap|sapo|sarl|sas|save|saxo|sbi|sbs|sca|scb|schaeffler|schmidt|scholarships|school|schule|schwarz|science|scjohnson|scor|scot|seat|secure|security|seek|select|sener|services|ses|seven|sew|sex|sexy|sfr|shangrila|sharp|shaw|shell|shia|shiksha|shoes|shopping|shouji|show|showtime|shriram|silk|sina|singles|site|ski|skin|sky|skype|sling|smart|smile|sncf|soccer|social|softbank|software|sohu|solar|solutions|song|sony|soy|space|spiegel|spot|spreadbetting|srl|srt|stada|staples|star|starhub|statebank|statefarm|statoil|stc|stcgroup|stockholm|storage|store|stream|studio|study|style|sucks|supersport|supplies|supply|support|surf|surgery|suzuki|swatch|swiftcover|swiss|sydney|symantec|systems|tab|taipei|talk|taobao|target|tatamotors|tatar|tattoo|tax|taxi|tci|tdk|team|tech|technology|telecity|telefonica|temasek|tennis|teva|thd|theater|theatre|theguardian|tiaa|tickets|tienda|tiffany|tips|tires|tirol|tjmaxx|tjx|tkmaxx|tmall|today|tokyo|tools|top|toray|toshiba|total|tours|town|toyota|toys|trade|trading|trai
ning|travelchannel|travelers|travelersinsurance|trust|trv|tube|tui|tunes|tushu|tvs|ubank|ubs|uconnect|unicom|university|uno|uol|ups|vacations|vana|vanguard|vegas|ventures|verisign|versicherung|vet|viajes|video|vig|viking|villas|vin|vip|virgin|visa|vision|vista|vistaprint|viva|vivo|vlaanderen|vodka|volkswagen|volvo|vote|voting|voto|voyage|vuelos|wales|walmart|walter|wang|wanggou|warman|watch|watches|weather|weatherchannel|webcam|weber|website|wed|wedding|weibo|weir|whoswho|wien|wiki|williamhill|win|windows|wine|winners|wme|wolterskluwer|woodside|work|works|world|wow|wtc|wtf|xbox|xerox|xfinity|xihuan|xin|कॉम|セール|佛山|慈善|集团|在线|大众汽车|点看|คอม|八卦|موقع|一号店|公益|公司|香格里拉|网站|移动|我爱你|москва|католик|онлайн|сайт|联通|קום|时尚|微博|淡马锡|ファッション|орг|नेट|ストア|삼성|商标|商店|商城|дети|ポイント|新闻|工行|家電|كوم|中文网|中信|娱乐|谷歌|電訊盈科|购物|クラウド|通販|网店|संगठन|餐厅|网络|ком|诺基亚|食品|飞利浦|手表|手机|ارامكو|العليان|اتصالات|بازار|موبايلي|ابوظبي|كاثوليك|همراه|닷컴|政府|شبكة|بيتك|عرب|机构|组织机构|健康|рус|珠宝|大拿|みんな|グーグル|世界|書籍|网址|닷넷|コム|天主教|游戏|vermögensberater|vermögensberatung|企业|信息|嘉里大酒店|嘉里|广东|政务|xperia|xyz|yachts|yahoo|yamaxun|yandex|yodobashi|yoga|yokohama|you|youtube|yun|zappos|zara|zero|zip|zippo|zone|zuerich'.split('|');
// Default tokenizer options; used whenever no config object is supplied.
var defaults = {
  html: false,
  wiki: false, // TODO: check all cases
  markdown: false, // TODO: check all cases
  hashtags: true,
  mentions: true,
  emails: true,
  links: {
    protocols: true,
    www: true,
    // Filled in below: one `true` entry per known top-level domain.
    tlds: {}
  }
};

/* TODO: add more named HTML entities */
// Maps a lowercase entity name (without '&' and ';') to its replacement character.
var HTML_ENTITIES = { nbsp: ' ', quot: '"', gt: '>', lt: '<', amp: '&' };

// Register every entry of the TLD list as a recognised link suffix.
TLDs.forEach(function (tld) {
  defaults.links.tlds[tld] = true;
});
719
// Start splitting text into tokens.
// Returns a context; use the `done` method to retrieve the result.
//
// text   - initial text, passed straight to `append`.
// config - optional options object; the module-level `defaults` is used
//          when it is omitted or falsy.
//
// Works both as `new Tokens(text, config)` and as a plain call
// `Tokens(text, config)`.
var Tokens = function(text, config) {
  if (!(this instanceof Tokens)) {
    // Called without `new`: forward BOTH arguments. Previously `config` was
    // dropped here, so a plain call always fell back to the defaults.
    return new Tokens(text, config);
  }

  this.tokens = [];
  this.config = config || defaults;
  this.append(text);
  // Cursor used by nextToken/prevToken; -1 means "before the first token".
  this.index = -1;
};
732
// Adds more text content.
//
// Walks `text` one UTF-16 code unit at a time. Each character is classified
// (CYRIL / LATIN / DIGIT / SPACE / PUNCT / OTHER) and then either appended to
// the last token or used to start a new one. Previously emitted tokens may be
// merged or re-typed along the way (links, e-mails, hashtags, mentions,
// numbers, HTML tags/entities, wiki and markdown markup).
//
// text   - string to tokenize.
// config - optional options object for this call; falls back to `this.config`.
//
// Every token carries: type, optional subType, s (its text), st / en (first
// and last character index, relative to the `text` of the call that created
// or last extended it), length, idx, firstUpper and allUpper.
//
// Returns `this` so calls can be chained.
Tokens.prototype.append = function(text, config) {
  // TODO: get rid of 's' field (storing a copy of token)
  // st + length + en should be enough (check that they are always correct)
  config = config || this.config;

  // Dictionary lookups must ignore keys inherited from Object.prototype:
  // with a bare `in`, words such as "constructor" would pass as an entity
  // name or as a TLD.
  var hasOwn = Object.prototype.hasOwnProperty;

  for (var i = 0; i < text.length; i++) {
    var ch = text.charAt(i);
    var code = text.charCodeAt(i);

    var append = false;
    var last = this.tokens.length - 1;
    var token = this.tokens[last];

    // With HTML enabled, a ';' may close a character reference. A recognised
    // one is dropped from the token list and replaced by the decoded
    // character, which is then processed as if it had been read from `text`.
    if (config.html && ch == ';') {
      // &nbsp;
      if (last > 0 && token.type == 'WORD' && this.tokens[last - 1].s == '&') {
        var name = token.s.toLowerCase();
        if (hasOwn.call(HTML_ENTITIES, name)) {
          ch = HTML_ENTITIES[name];
          code = ch.charCodeAt(0);

          last -= 2;
          token = this.tokens[last];
          this.tokens.length = last + 1;
        }
      } else
      // &#x123AF5;
      // &#1234;
      if (last > 1 && (token.type == 'NUMBER' || (token.type == 'WORD' && token.s[0] == 'x')) && this.tokens[last - 1].s == '#' && this.tokens[last - 2].s == '&') {
        if (token.s[0] == 'x') {
          code = parseInt(token.s.substr(1), 16);
        } else {
          code = parseInt(token.s, 10);
        }
        ch = String.fromCharCode(code);

        last -= 3;
        token = this.tokens[last];
        this.tokens.length = last + 1;
      }
    }

    // Classify the character. The checks run in order and later ones
    // override earlier ones.
    var charType = 'OTHER';
    var charUpper = (ch.toLocaleLowerCase() != ch);
    if (code >= 0x0400 && code <= 0x04FF) charType = 'CYRIL';
    if ((code >= 0x0041 && code <= 0x005A) || (code >= 0x0061 && code <= 0x007A) || (code >= 0x00C0 && code <= 0x024F)) charType = 'LATIN';
    if (code >= 0x0030 && code <= 0x0039) charType = 'DIGIT';
    if ((code <= 0x0020) || (code >= 0x0080 && code <= 0x00A0)) charType = 'SPACE';
    if ('‐-−‒–—―.…,:;?!¿¡()[]«»"\'’‘’“”/⁄'.indexOf(ch) > -1) charType = 'PUNCT';

    // Type and subtype a NEW token would get if this character starts one.
    var tokenType = charType;
    var tokenSubType = false;
    if (charType == 'CYRIL' || charType == 'LATIN') {
      tokenType = 'WORD';
      tokenSubType = charType;
    } else
    if (charType == 'DIGIT') {
      tokenType = 'NUMBER';
    }

    var lineStart = !token || token.s[token.s.length - 1] == '\n';

    if (config.wiki) {
      if (lineStart) {
        if (':;*#~|'.indexOf(ch) > -1) {
          tokenType = 'MARKUP';
          tokenSubType = 'NEWLINE';
        }
      }
      if ('={[|]}'.indexOf(ch) > -1) {
        tokenType = 'MARKUP';
      }
    }

    if (config.markdown) {
      if (lineStart) {
        if ('=-#>+-'.indexOf(ch) > -1) {
          tokenType = 'MARKUP';
          tokenSubType = 'NEWLINE';
        }
      }
      if ('[]*~_`\\'.indexOf(ch) > -1) {
        tokenType = 'MARKUP';
      }
    }

    if (token) {
      // Wiki mode: a single apostrophe that is not followed by another one is
      // not markup, so glue it back onto the preceding word.
      if (config.wiki && ch != '\'' && token.s == '\'' && last > 0 && this.tokens[last - 1].type == 'WORD') {
        this.tokens[last - 1].s += token.s;
        this.tokens[last - 1].en = token.en;
        // Tokens store their size in `length` (set at the bottom of the loop);
        // this used to update a non-existent `len` field.
        this.tokens[last - 1].length += token.length;

        last -= 1;
        this.tokens.length = last + 1;
        token = this.tokens[last];
      }

      // Preprocess last token
      if (config.links && config.links.tlds &&
          (charType == 'PUNCT' || charType == 'SPACE') &&
          this.tokens.length > 2 &&
          this.tokens[last - 2].type == 'WORD' &&
          this.tokens[last - 1].s == '.' &&
          this.tokens[last].type == 'WORD' &&
          hasOwn.call(config.links.tlds, this.tokens[last].s)) {

        // Merge all subdomains
        while (last >= 2 &&
               this.tokens[last - 2].type == 'WORD' &&
               (this.tokens[last - 1].s == '.' || this.tokens[last - 1].s == '@' || this.tokens[last - 1].s == ':')) {
          last -= 2;
          token = this.tokens[last];
          token.s += this.tokens[last + 1].s + this.tokens[last + 2].s;
          token.allUpper = token.allUpper && this.tokens[last + 1].allUpper && this.tokens[last + 2].allUpper;
          // Keep the end position in sync with the merged text; otherwise the
          // token keeps the `en`/`length` of its first fragment whenever the
          // current character starts a new token.
          token.en = this.tokens[last + 2].en;
          token.length = (token.en - token.st) + 1;
        }

        if (config.emails && token.s.indexOf('@') > -1 && token.s.indexOf(':') == -1) {
          // URL can contain a '@' but in that case it should be in form http://user@site.com or user:pass@site.com
          // So if URL has a '@' but no ':' in it, we assume it's a email
          token.type = 'EMAIL';
        } else {
          token.type = 'LINK';

          if (ch == '/') {
            append = true;
          }
        }
        this.tokens.length = last + 1;
      } else

      // Process next char (start new token or append to the previous one)
      if (token.type == 'LINK') {
        if (charType != 'SPACE' && ch != ',') {
          append = true;
        }
      } else
      if (token.type == 'EMAIL') {
        if (charType == 'CYRIL' || charType == 'LATIN' || ch == '.') {
          append = true;
        }
      } else
      if (token.type == 'HASHTAG' || token.type == 'MENTION') {
        if (charType == 'CYRIL' || charType == 'LATIN' || charType == 'DIGIT' || ch == '_' || (ch == '@' && token.s.indexOf('@') == -1)) {
          append = true;
        }
      } else
      // Inside an unfinished tag (or inside a quoted attribute value)
      if (token.type == 'TAG' && (token.quote || token.s[token.s.length - 1] != '>')) {
        append = true;
        if (token.quote) {
          if (ch == token.quote && token.s[token.s.length - 1] != '\\') {
            delete token.quote;
          }
        } else
        if (ch == '"' || ch == '\'') {
          token.quote = ch;
        }
      } else
      // Script/style content runs until the next '<' outside of quotes
      if (token.type == 'CONTENT') {
        append = true;
        if (token.quote) {
          if (ch == token.quote && token.s[token.s.length - 1] != '\\') {
            delete token.quote;
          }
        } else
        if (ch == '"' || ch == '\'') {
          token.quote = ch;
        } else
        if (ch == '<') {
          append = false;
        }
      } else
      if (token.type == 'TAG' && ch != '<' && token.s.substr(1, 6).toLowerCase() == 'script') {
        tokenType = 'CONTENT';
        tokenSubType = 'SCRIPT';
      } else
      if (token.type == 'TAG' && ch != '<' && token.s.substr(1, 5).toLowerCase() == 'style') {
        tokenType = 'CONTENT';
        tokenSubType = 'STYLE';
      } else
      if (config.html && token.s == '<' && (charType == 'LATIN' || ch == '!' || ch == '/')) {
        append = true;
        token.type = 'TAG';
        if (ch == '!') {
          token.subType = 'COMMENT';
        } else
        if (ch == '/') {
          token.subType = 'CLOSING';
        }
      } else
      if (token.type == 'MARKUP' && token.subType == 'TEMPLATE' && (token.s[token.s.length - 1] != '}' || token.s[token.s.length - 2] != '}')) {
        append = true;
      } else
      // Markdown link target "(...)": keep appending until the closing ')'.
      // This used to test `token.type == 'LINK'`, which can never hold
      // together with `token.type == 'MARKUP'`.
      if (token.type == 'MARKUP' && token.subType == 'LINK' && token.s[token.s.length - 1] != ')') {
        append = true;
      } else
      if (token.type == 'MARKUP' && token.s[0] == '`' && token.subType == 'NEWLINE' && charType == 'LATIN') {
        append = true;
      } else
      if (charType == 'CYRIL' || charType == 'LATIN') {
        if (token.type == 'WORD') {
          append = true;
          token.subType = (token.subType == charType) ? token.subType : 'MIXED';
        } else
        if (token.type == 'NUMBER') { // Digits + ending
          append = true;
          token.subType = (token.subType && token.subType != charType) ? 'MIXED' : charType;
        } else
        if (config.hashtags && token.s == '#') { // Hashtags
          append = true;
          token.type = 'HASHTAG';
        } else
        if (config.mentions && token.s == '@' && (last == 0 || this.tokens[last - 1].type == 'SPACE')) { // Mentions
          append = true;
          token.type = 'MENTION';
        } else
        if (charType == 'LATIN' && (token.s == '\'' || token.s == '’')) {
          append = true;
          token.type = 'WORD';
          token.subType = 'LATIN';
        } else
        if (token.s == '-') { // -цать (?), 3-й
          append = true;

          if (last > 0 && this.tokens[last - 1].type == 'NUMBER') {
            token = this.tokens[last - 1];
            token.s += this.tokens[last].s;

            this.tokens.length -= 1;
          }

          token.type = 'WORD';
          token.subType = charType;
        }
      } else
      if (charType == 'DIGIT') {
        if (token.type == 'WORD') {
          append = true;
          token.subType = 'MIXED';
        } else
        if (token.type == 'NUMBER') {
          append = true;
        } else
        if (token.s == '+' || token.s == '-') {
          append = true;

          if (last > 0 && this.tokens[last - 1].type == 'NUMBER') {
            token = this.tokens[last - 1];
            token.s += this.tokens[last].s;
            token.subType = 'RANGE';

            this.tokens.length -= 1;
          }

          token.type = 'NUMBER';
        } else
        if ((token.s == ',' || token.s == '.') && this.tokens.length > 1 && this.tokens[last - 1].type == 'NUMBER') {
          append = true;

          token = this.tokens[last - 1];
          token.s += this.tokens[last].s;

          this.tokens.length -= 1;
        }
      } else
      if (charType == 'SPACE') {
        if (token.type == 'SPACE') {
          append = true;
        }
      } else
      if (token.type == 'MARKUP' && token.s[0] == ch && '=-~:*#`\'>_'.indexOf(ch) > -1) {
        append = true;
      } else
      if (ch == '.') {
        if (config.links && config.links.www && token.s.toLocaleLowerCase() == 'www') { // Links without protocol but with www
          append = true;
          token.type = 'LINK';
        }
      } else
      if (config.wiki && ch == '\'') {
        if (token.s == '\'') {
          append = true;
          token.type = 'MARKUP';
        } else {
          tokenType = 'PUNCT';
        }
      } else
      if (ch == '-' || ch == '’' || ch == '\'') {
        if (token.type == 'WORD') {
          append = true;
        }
      } else
      if (ch == '/') {
        if (config.links && config.links.protocols &&
            this.tokens.length > 2 &&
            this.tokens[last - 2].type == 'WORD' &&
            this.tokens[last - 2].subType == 'LATIN' &&
            this.tokens[last - 1].s == ':' &&
            this.tokens[last].s == '/') { // Links (with protocols)
          append = true;

          token = this.tokens[last - 2];
          token.s += this.tokens[last - 1].s + this.tokens[last].s;
          token.allUpper = token.allUpper && this.tokens[last - 1].allUpper && this.tokens[last].allUpper;
          token.type = 'LINK';

          this.tokens.length -= 2;
        }
      } else
      // Unrecognised character reference: keep it verbatim as one ENTITY token
      if (config.html && ch == ';') {
        if (last > 0 && token.type == 'WORD' && this.tokens[last - 1].s == '&') {
          append = true;

          token = this.tokens[last - 1];
          token.s += this.tokens[last].s;
          // Combine with the merged-in token (this used to AND the flag with itself)
          token.allUpper = token.allUpper && this.tokens[last].allUpper;
          token.type = 'ENTITY';

          this.tokens.length -= 1;
        } else
        if (last > 1 && (token.type == 'WORD' || token.type == 'NUMBER') && this.tokens[last - 1].s == '#' && this.tokens[last - 2].s == '&') {
          append = true;

          token = this.tokens[last - 2];
          token.s += this.tokens[last - 1].s + this.tokens[last].s;
          token.allUpper = token.allUpper && this.tokens[last - 1].allUpper && this.tokens[last].allUpper;
          token.type = 'ENTITY';

          this.tokens.length -= 2;
        }
      } else
      if (config.markdown && ch == '[' && token.s == '!') {
        append = true;
        token.type = 'MARKUP';
      } else
      if (config.markdown && ch == '(' && token.s == ']') {
        tokenType = 'MARKUP';
        tokenSubType = 'LINK';
      } else
      if (config.wiki && ch == '{' && token.s == '{') {
        append = true;
        token.type = 'MARKUP';
        token.subType = 'TEMPLATE';
      } else
      if (config.wiki && ch == '[' && token.s == '[') {
        append = true;
      } else
      if (config.wiki && ch == ']' && token.s == ']') {
        append = true;
      } else
      if (config.wiki && ch == '|' && !lineStart) {
        // Look back for the '[[' that opened this wiki link; give up at
        // another '|' or at a line break.
        var found = -1;
        for (var j = last - 1; j >= 0; j--) {
          if (this.tokens[j].s == '[[') {
            found = j;
            break;
          }
          if (this.tokens[j].s == '|' || this.tokens[j].s.indexOf('\n') > -1) {
            break;
          }
        }
        if (found > -1) {
          append = true;
          // Fold everything from '[[' up to here into a single token
          for (var j = last - 1; j >= found; j--) {
            token = this.tokens[j];
            token.s += this.tokens[j + 1].s;
            token.allUpper = token.allUpper && this.tokens[j + 1].allUpper;
          }
          last = found;
          this.tokens.length = last + 1;
          token.subType = 'LINK';
        }
      }
    }

    if (append) {
      token.s += ch;
    } else {
      token = {
        type: tokenType,
        s: ch,
        st: i,
        idx: this.tokens.length,

        firstUpper: charUpper,
        allUpper: charUpper
      };
      if (tokenSubType) {
        token.subType = tokenSubType;
      }
      this.tokens.push(token);
    }
    token.en = i;
    token.length = (token.en - token.st) + 1;
    token.allUpper = token.allUpper && charUpper;
  }
  return this;
};
1133
// Finalize tokenizing and return the list of tokens.
// For now it just returns tokens, in the future there could be some additional work.
//
// filter  - optional array of token type names. When omitted, the internal
//           token array itself is returned (not a copy).
// exclude - when truthy, tokens whose type is listed in `filter` are left
//           out; otherwise only tokens whose type is listed are returned.
//
// Returns an array of token objects.
Tokens.prototype.done = function(filter, exclude) {
  if (!filter) {
    return this.tokens;
  }
  // Normalise to a boolean: comparing a boolean with `undefined` via `==` is
  // always false, so omitting `exclude` used to yield an empty list.
  var dropListed = !!exclude;
  var list = [];
  for (var i = 0; i < this.tokens.length; i++) {
    if ((filter.indexOf(this.tokens[i].type) == -1) == dropListed) {
      list.push(this.tokens[i]);
    }
  }
  return list;
};
1148
// Counts tokens, optionally restricted by type.
//
// filter  - optional array of token type names. When omitted, every token
//           is counted.
// exclude - when truthy, tokens whose type is listed in `filter` are not
//           counted; otherwise only tokens whose type is listed are counted.
//
// Returns the number of matching tokens.
Tokens.prototype.countTokens = function(filter, exclude) {
  // The original guard referenced `skipSpace` / `skipPunct`, which are not
  // declared in this function or its signature, so every call threw a
  // ReferenceError.
  if (!filter) {
    return this.tokens.length;
  }
  // Normalise to a boolean so an omitted `exclude` behaves like `false`.
  var dropListed = !!exclude;
  var count = 0;
  for (var i = 0; i < this.tokens.length; i++) {
    if ((filter.indexOf(this.tokens[i].type) == -1) == dropListed) {
      count++;
    }
  }
  return count;
};
1161
// Returns the next token after the cursor (`this.index`), or null when there
// is none.
//
// moveIndex - when truthy, the cursor is moved to the returned token.
// filter    - optional array of token type names.
// exclude   - when truthy, tokens whose type is listed in `filter` are
//             skipped; otherwise tokens whose type is NOT listed are skipped.
Tokens.prototype.nextToken = function(moveIndex, filter, exclude) {
  // Normalise to a boolean: `(bool) == undefined` is always false, so with
  // `exclude` omitted the filter used to be silently ignored.
  var skipListed = !!exclude;
  var index = this.index + 1;
  while (index < this.tokens.length && filter && (filter.indexOf(this.tokens[index].type) != -1) == skipListed) {
    index++;
  }
  if (index < this.tokens.length) {
    if (moveIndex) {
      this.index = index;
    }
    return this.tokens[index];
  }
  return null;
};
1176
// Looks at the first non-SPACE token after the cursor without moving it.
// Returns that token when its type is 'PUNCT', `false` when it has another
// type, and null when there is no such token.
Tokens.prototype.punctAhead = function() {
  var candidate = this.nextToken(false, ['SPACE'], true);
  if (!candidate) {
    return candidate;
  }
  if (candidate.type != 'PUNCT') {
    return false;
  }
  return candidate;
};
1181
// Returns the token before the cursor (`this.index`), or null when there is
// none.
//
// moveIndex - when truthy, the cursor is moved to the returned token.
// filter    - optional array of token type names.
// exclude   - when truthy, tokens whose type is listed in `filter` are
//             skipped; otherwise tokens whose type is NOT listed are skipped.
Tokens.prototype.prevToken = function(moveIndex, filter, exclude) {
  // Normalise to a boolean: `(bool) == undefined` is always false, so with
  // `exclude` omitted the filter used to be silently ignored.
  var skipListed = !!exclude;
  var index = this.index - 1;
  while (index >= 0 && filter && (filter.indexOf(this.tokens[index].type) != -1) == skipListed) {
    index--;
  }
  if (index >= 0) {
    if (moveIndex) {
      this.index = index;
    }
    return this.tokens[index];
  }
  return null;
};
1196
// Looks at the first non-SPACE token before the cursor without moving it.
// Returns that token when its type is 'PUNCT', `false` when it has another
// type, and null when there is no such token.
Tokens.prototype.punctBehind = function() {
  var candidate = this.prevToken(false, ['SPACE'], true);
  if (!candidate) {
    return candidate;
  }
  if (candidate.type != 'PUNCT') {
    return false;
  }
  return candidate;
};
1201
// Tells whether `nextToken` would find a token after the cursor for the
// given filter/exclude pair. The cursor is not moved.
Tokens.prototype.hasTokensAhead = function(filter, exclude) {
  var upcoming = this.nextToken(false, filter, exclude);
  return upcoming != null;
};
1205
// Tells whether `prevToken` would find a token before the cursor for the
// given filter/exclude pair. The cursor is not moved.
Tokens.prototype.hasTokensBehind = function(filter, exclude) {
  var preceding = this.prevToken(false, filter, exclude);
  return preceding != null;
};
1209
12101 return Tokens;
1211}));