import { Context, Source, ContentModel, Location } from '../context';
import { Token, TokenType } from './token';
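/**
 * Lexer states. The lexer starts in INITIAL and transitions between
 * states as input is consumed; see the corresponding `tokenize*` methods
 * below for the rules applied in each state.
 */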
enum State {
INITIAL = 1,
DOCTYPE,
TEXT,
TAG,
ATTR,
CDATA,
SCRIPT,
}
type NextStateCallback = (token?: Token) => State;
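/**
 * A lexer rule as consumed by `match`: [pattern, next state, token type],
 * e.g. [MATCH_WHITESPACE, State.TEXT, TokenType.WHITESPACE]. A `false`
 * pattern is an unconditional fallback, a `false` token type suppresses
 * token emission, and the next state may be computed from the emitted
 * token via a `NextStateCallback`.
 */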
type LexerTest = [RegExp | false, State | NextStateCallback, TokenType | false];
export type TokenStream = IterableIterator<Token>;
/* eslint-disable no-useless-escape */
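/* Matching patterns: each regex is anchored at "^" and tested against the
 * beginning of the remaining input. */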
const MATCH_WHITESPACE = /^(?:[ \t]+\n?|\n)/;
const MATCH_DOCTYPE_OPEN = /^<!(?:DOCTYPE|doctype)\s/;
const MATCH_DOCTYPE_VALUE = /^[^>]+/;
const MATCH_DOCTYPE_CLOSE = /^>/;
const MATCH_XML_TAG = /^<\?xml.*?\?>\n/;
const MATCH_TAG_OPEN = /^<(\/?)([a-zA-Z0-9\-:]+)/; // https://www.w3.org/TR/html/syntax.html#start-tags
const MATCH_TAG_CLOSE = /^\/?>/;
const MATCH_TEXT = /^[^]*?(?=([ \t]*\n|<|$))/;
const MATCH_TAG_LOOKAHEAD = /^[^]*?(?=<|$)/;
const MATCH_ATTR_START = /^([^\t\n\f \/><"'=]+)/; // https://www.w3.org/TR/html/syntax.html#elements-attributes
const MATCH_ATTR_SINGLE = /^='([^']*?)(')/;
const MATCH_ATTR_DOUBLE = /^="([^"]*?)(")/;
const MATCH_ATTR_UNQUOTED = /^=([a-zA-Z0-9]+)/;
const MATCH_CDATA_BEGIN = /^<!\[CDATA\[/;
const MATCH_CDATA_END = /^[^]*?]]>/;
const MATCH_SCRIPT_DATA = /^[^]*?(?=<\/script)/;
const MATCH_SCRIPT_END = /^<(\/)(script)/;
const MATCH_COMMENT = /^<!--([^]*?)-->/;
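/**
 * Error thrown when the lexer cannot tokenize the remaining input;
 * `location` points at the offending position in the source.
 */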
export class InvalidTokenError extends Error {
public location: Location;
public constructor(location: Location, message: string){
super(message);
this.location = location;
}
}
export class Lexer {
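/**
 * Tokenize a source into a lazy token stream: tokens are produced as the
 * returned iterator is consumed, and an EOF token is always emitted last.
 *
 * Minimal usage sketch (the exact shape of `Source` is defined in
 * '../context'; the fields used here are an assumption):
 *
 *   const lexer = new Lexer();
 *   const source = { data: '<!DOCTYPE html>\n<p>lorem ipsum</p>', filename: 'inline' } as Source;
 *   for (const token of lexer.tokenize(source)) {
 *     console.log(TokenType[token.type], token.data); // assumes TokenType is a numeric enum
 *   }
 */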
*tokenize(source: Source): TokenStream {
const context = new Context(source);
context.state = State.INITIAL;
/* for the sanity check below */
let previousState: State = context.state;
let previousLength: number = context.string.length;
while (context.string.length > 0){
switch (context.state){
case State.INITIAL:
yield* this.tokenizeInitial(context);
break;
case State.DOCTYPE:
yield* this.tokenizeDoctype(context);
break;
case State.TAG:
yield* this.tokenizeTag(context);
break;
case State.ATTR:
yield* this.tokenizeAttr(context);
break;
case State.TEXT:
yield* this.tokenizeText(context);
break;
case State.CDATA:
yield* this.tokenizeCDATA(context);
break;
case State.SCRIPT:
yield* this.tokenizeScript(context);
break;
default:
this.unhandled(context);
}
/* sanity check: the state or the remaining string must change on every
 * iteration; if both are unchanged we are stuck in an endless loop. */
if (context.state === previousState && context.string.length === previousLength){
this.errorStuck(context);
}
previousState = context.state;
previousLength = context.string.length;
}
yield this.token(context, TokenType.EOF);
}
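/**
 * Create a token of the given type at the current source location. The
 * raw regex match, when available, is attached as token data.
 */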
token(context: Context, type: TokenType, data?: any): Token {
if (!type) throw new Error("TokenType must be set");
return {
type,
location: context.getLocation(),
data,
};
}
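/** Throw InvalidTokenError when the current state has no tokenizer (default switch case). */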
unhandled(context: Context){
const truncated = JSON.stringify(context.string.length > 13 ? `${context.string.slice(0, 10)}...` : context.string);
const message = `failed to tokenize ${truncated}, unhandled state ${State[context.state]}.`;
throw new InvalidTokenError(context.getLocation(), message);
}
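/** Throw InvalidTokenError when an iteration neither consumed input nor changed state. */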
errorStuck(context: Context){
const truncated = JSON.stringify(context.string.length > 13 ? `${context.string.slice(0, 10)}...` : context.string);
const message = `failed to tokenize ${truncated}, state ${State[context.state]} failed to consume data or change state.`;
throw new InvalidTokenError(context.getLocation(), message);
}
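/** Resolve the next state, invoking it with the emitted token when given as a callback. */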
evalNextState(nextState: State | ((token: Token) => State), token: Token){
if (typeof nextState === 'function'){
return nextState(token);
} else {
return nextState;
}
}
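/**
 * Try each test in order and act on the first whose pattern matches: emit
 * a token unless the token type is `false`, consume the matched input and
 * transition to the test's next state. A `false` pattern always matches
 * and consumes nothing. Throws InvalidTokenError with the given error
 * message when no test matches.
 */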
*match(context: Context, tests: Array<LexerTest>, error: string){
let match = undefined;
for (const test of tests){
let token: Token = null;
const regex = test[0];
const nextState = test[1];
const tokenType = test[2];
if (regex === false || (match = context.string.match(regex))){
if (tokenType !== false) yield (token = this.token(context, tokenType, match));
const state = this.evalNextState(nextState, token);
context.consume(match || 0, state);
this.enter(context, state, match);
return;
}
}
const truncated = JSON.stringify(context.string.length > 13 ? `${context.string.slice(0, 10)}...` : context.string);
const message = `failed to tokenize ${truncated}, ${error}.`;
throw new InvalidTokenError(context.getLocation(), message);
}
/**
 * Called when entering a new state. Currently only used to switch the
 * content model to SCRIPT when an opening <script> tag is matched.
 */
enter(context: Context, state: State, data: any){
switch (state) {
case State.TAG:
/* request script tag tokenization */
if (data && data[0][0] === '<'){
if (data[0] === '<script'){
context.contentModel = ContentModel.SCRIPT;
} else {
context.contentModel = ContentModel.TEXT;
}
}
break;
}
}
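/** INITIAL state: silently consume an optional XML declaration, emit DOCTYPE_OPEN when a doctype is present, otherwise fall through to TEXT. */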
*tokenizeInitial(context: Context){
yield* this.match(context, [
[MATCH_XML_TAG, State.INITIAL, false],
[MATCH_DOCTYPE_OPEN, State.DOCTYPE, TokenType.DOCTYPE_OPEN],
[false, State.TEXT, false],
], 'expected doctype');
}
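/** DOCTYPE state: emit the doctype value and whitespace until ">" closes the doctype and returns to TEXT. */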
*tokenizeDoctype(context: Context){
yield* this.match(context, [
[MATCH_WHITESPACE, State.DOCTYPE, TokenType.WHITESPACE],
[MATCH_DOCTYPE_VALUE, State.DOCTYPE, TokenType.DOCTYPE_VALUE],
[MATCH_DOCTYPE_CLOSE, State.TEXT, TokenType.DOCTYPE_CLOSE],
], 'expected doctype name');
}
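/** TAG state: emit attribute names and the closing ">" or "/>"; after an opening <script> tag the lexer continues in SCRIPT instead of TEXT. */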
*tokenizeTag(context: Context){
function nextState(token: Token){
switch (context.contentModel){
case ContentModel.TEXT:
return State.TEXT;
case ContentModel.SCRIPT:
if (token.data[0][0] !== '/'){
return State.SCRIPT;
} else {
return State.TEXT; /* <script/> (not legal but handle it anyway so the lexer doesn't choke on it) */
}
}
return context.contentModel !== ContentModel.SCRIPT ? State.TEXT : State.SCRIPT;
}
yield* this.match(context, [
[MATCH_TAG_CLOSE, nextState, TokenType.TAG_CLOSE],
[MATCH_ATTR_START, State.ATTR, TokenType.ATTR_NAME],
[MATCH_WHITESPACE, State.TAG, TokenType.WHITESPACE],
], 'expected attribute, ">" or "/>"');
}
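/** ATTR state: emit the attribute value (quoted or unquoted) if present; a valueless (boolean) attribute falls through back to TAG. */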
*tokenizeAttr(context: Context){
yield* this.match(context, [
[MATCH_ATTR_SINGLE, State.TAG, TokenType.ATTR_VALUE],
[MATCH_ATTR_DOUBLE, State.TAG, TokenType.ATTR_VALUE],
[MATCH_ATTR_UNQUOTED, State.TAG, TokenType.ATTR_VALUE],
[false, State.TAG, false],
], 'expected attribute, ">" or "/>"');
}
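/** TEXT state: emit whitespace, comments and text, entering CDATA or TAG when the corresponding markers are found. */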
*tokenizeText(context: Context){
yield* this.match(context, [
[MATCH_WHITESPACE, State.TEXT, TokenType.WHITESPACE],
[MATCH_CDATA_BEGIN, State.CDATA, false],
[MATCH_COMMENT, State.TEXT, TokenType.COMMENT],
[MATCH_TAG_OPEN, State.TAG, TokenType.TAG_OPEN],
[MATCH_TEXT, State.TEXT, TokenType.TEXT],
[MATCH_TAG_LOOKAHEAD, State.TEXT, TokenType.TEXT],
], 'expected text or "<"');
}
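/** CDATA state: silently consume everything up to and including "]]>". */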
*tokenizeCDATA(context: Context){
yield* this.match(context, [
[MATCH_CDATA_END, State.TEXT, false],
], 'expected ]]>');
}
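/** SCRIPT state: emit raw script data until "</script" begins the closing tag. */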
*tokenizeScript(context: Context){
yield* this.match(context, [
[MATCH_SCRIPT_END, State.TAG, TokenType.TAG_OPEN],
[MATCH_SCRIPT_DATA, State.SCRIPT, TokenType.SCRIPT],
], 'expected </script>');
}
}