Coverage report: src/lexer/lexer.ts

Statements: 90.48% (95/105)
Branches:   77.78% (35/45)
Functions:  88.24% (15/17)
Lines:      91.26% (94/103)

import { Context, Source, ContentModel, Location } from '../context';
import { Token, TokenType } from './token';
 
enum State {
	INITIAL = 1,
	DOCTYPE,
	TEXT,
	TAG,
	ATTR,
	CDATA,
	SCRIPT,
}
 
type NextStateCallback = (token?: Token) => State;
type LexerTest = [RegExp | false, State | NextStateCallback, TokenType | false];
export type TokenStream = IterableIterator<Token>;
 
/* eslint-disable no-useless-escape */
const MATCH_WHITESPACE = /^(?:[ \t]+\n?|\n)/;
const MATCH_DOCTYPE_OPEN = /^<!(?:DOCTYPE|doctype)\s/;
const MATCH_DOCTYPE_VALUE = /^[^>]+/;
const MATCH_DOCTYPE_CLOSE = /^>/;
const MATCH_XML_TAG = /^<\?xml.*?\?>\n/;
const MATCH_TAG_OPEN = /^<(\/?)([a-zA-Z0-9\-:]+)/;       // https://www.w3.org/TR/html/syntax.html#start-tags
const MATCH_TAG_CLOSE = /^\/?>/;
const MATCH_TEXT = /^[^]*?(?=([ \t]*\n|<|$))/;
const MATCH_TAG_LOOKAHEAD = /^[^]*?(?=<|$)/;
const MATCH_ATTR_START = /^([^\t\n\f \/><"'=]+)/;        // https://www.w3.org/TR/html/syntax.html#elements-attributes
const MATCH_ATTR_SINGLE = /^='([^']*?)(')/;
const MATCH_ATTR_DOUBLE = /^="([^"]*?)(")/;
const MATCH_ATTR_UNQUOTED = /^=([a-zA-Z0-9]+)/;
const MATCH_CDATA_BEGIN = /^<!\[CDATA\[/;
const MATCH_CDATA_END = /^[^]*?]]>/;
const MATCH_SCRIPT_DATA = /^[^]*?(?=<\/script)/;
const MATCH_SCRIPT_END = /^<(\/)(script)/;
const MATCH_COMMENT = /^<!--([^]*?)-->/;
 
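/**
 * Error thrown when the lexer fails to tokenize the input. Carries the
 * source location where tokenization stopped.
 */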
export class InvalidTokenError extends Error {
	public location: Location;
 
	public constructor(location: Location, message: string){
		super(message);
		this.location = location;
	}
}
 
export class Lexer {
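	/**
	 * Tokenize a source document. Yields tokens until the input is fully
	 * consumed, ending with a single EOF token. Throws InvalidTokenError if
	 * the input cannot be tokenized or the lexer fails to make progress.
	 */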
	*tokenize(source: Source): TokenStream {
		const context = new Context(source);
		context.state = State.INITIAL;
 
		/* for sanity check */
		let previousState: State = context.state;
		let previousLength: number = context.string.length;
 
		while (context.string.length > 0){
			switch (context.state){
			case State.INITIAL:
				yield* this.tokenizeInitial(context);
				break;
 
			case State.DOCTYPE:
				yield* this.tokenizeDoctype(context);
				break;
 
			case State.TAG:
				yield* this.tokenizeTag(context);
				break;
 
			case State.ATTR:
				yield* this.tokenizeAttr(context);
				break;
 
			case State.TEXT:
				yield* this.tokenizeText(context);
				break;
 
			case State.CDATA:
				yield* this.tokenizeCDATA(context);
				break;
 
			case State.SCRIPT:
				yield* this.tokenizeScript(context);
				break;
 
			default:
				this.unhandled(context);
			}
 
			/* sanity check: state or string must change, if both are intact
			 * we are stuck in an endless loop. */
			if (context.state === previousState && context.string.length === previousLength){
				this.errorStuck(context);
			}
 
			previousState = context.state;
			previousLength = context.string.length;
		}
 
		yield this.token(context, TokenType.EOF);
	}
 
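	/**
	 * Create a token of the given type at the current source location.
	 */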
	token(context: Context, type: TokenType, data?: any): Token {
		if (!type) throw Error("TokenType must be set");
		return {
			type,
			location: context.getLocation(),
			data,
		};
	}
 
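	/**
	 * Raise an error for input reached in a state with no matching rule.
	 */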
	unhandled(context: Context){
		const truncated = JSON.stringify(context.string.length > 13 ? `${context.string.slice(0, 10)}...` : context.string);
		const message = `failed to tokenize ${truncated}, unhandled state ${State[context.state]}.`;
		throw new InvalidTokenError(context.getLocation(), message);
	}
 
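	/**
	 * Raise an error when an iteration neither consumed input nor changed
	 * state, i.e. the lexer would otherwise loop forever.
	 */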
	errorStuck(context: Context){
		const truncated = JSON.stringify(context.string.length > 13 ? `${context.string.slice(0, 10)}...` : context.string);
		const message = `failed to tokenize ${truncated}, state ${State[context.state]} failed to consume data or change state.`;
		throw new InvalidTokenError(context.getLocation(), message);
	}
 
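	/**
	 * Resolve the next state, which may be given either directly or as a
	 * callback receiving the token that was just emitted.
	 */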
	evalNextState(nextState: State | ((token: Token) => State), token: Token){
		if (typeof nextState === 'function'){
			return nextState(token);
		} else {
			return nextState;
		}
	}
 
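	/**
	 * Try each [regex, nextState, tokenType] test in order. The first test
	 * whose pattern matches (or whose pattern is the unconditional `false`)
	 * emits a token (unless its token type is false), consumes the match and
	 * switches to the next state. Throws InvalidTokenError if no test matches.
	 */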
	*match(context: Context, tests: Array<LexerTest>, error: string){
		let match = undefined;
		for (const test of tests){
			let token: Token = null;
			const regex = test[0];
			const nextState = test[1];
			const tokenType = test[2];
 
			if (regex === false || (match = context.string.match(regex))){
				if (tokenType !== false) yield (token = this.token(context, tokenType, match));
				const state = this.evalNextState(nextState, token);
				context.consume(match || 0, state);
				this.enter(context, state, match);
				return;
			}
		}
 
		const truncated = JSON.stringify(context.string.length > 13 ? `${context.string.slice(0, 10)}...` : context.string);
		const message = `failed to tokenize ${truncated}, ${error}.`;
		throw new InvalidTokenError(context.getLocation(), message);
	}
 
	/**
	 * Called when entering a new state.
	 */
	enter(context: Context, state: State, data: any){
		switch (state) {
		case State.TAG:
			/* request script tag tokenization */
			if (data && data[0][0] === '<'){
				if (data[0] === '<script'){
					context.contentModel = ContentModel.SCRIPT;
				} else {
					context.contentModel = ContentModel.TEXT;
				}
			}
			break;
		}
	}
 
	*tokenizeInitial(context: Context){
		yield* this.match(context, [
			[MATCH_XML_TAG, State.INITIAL, false],
			[MATCH_DOCTYPE_OPEN, State.DOCTYPE, TokenType.DOCTYPE_OPEN],
			[false, State.TEXT, false],
		], 'expected doctype');
	}
 
	*tokenizeDoctype(context: Context){
		yield* this.match(context, [
			[MATCH_WHITESPACE, State.DOCTYPE, TokenType.WHITESPACE],
			[MATCH_DOCTYPE_VALUE, State.DOCTYPE, TokenType.DOCTYPE_VALUE],
			[MATCH_DOCTYPE_CLOSE, State.TEXT, TokenType.DOCTYPE_CLOSE],
		], 'expected doctype name');
	}
 
	*tokenizeTag(context: Context){
		function nextState(token: Token){
			switch (context.contentModel){
			case ContentModel.TEXT:
				return State.TEXT;
			case ContentModel.SCRIPT:
				if (token.data[0][0] !== '/'){
					return State.SCRIPT;
				} else {
					return State.TEXT; /* <script/> (not legal but handle it anyway so the lexer doesn't choke on it) */
				}
			}
			return context.contentModel !== ContentModel.SCRIPT ? State.TEXT : State.SCRIPT;
		}
		yield* this.match(context, [
			[MATCH_TAG_CLOSE, nextState, TokenType.TAG_CLOSE],
			[MATCH_ATTR_START, State.ATTR, TokenType.ATTR_NAME],
			[MATCH_WHITESPACE, State.TAG, TokenType.WHITESPACE],
		], 'expected attribute, ">" or "/>"');
	}
 
	*tokenizeAttr(context: Context){
		yield* this.match(context, [
			[MATCH_ATTR_SINGLE, State.TAG, TokenType.ATTR_VALUE],
			[MATCH_ATTR_DOUBLE, State.TAG, TokenType.ATTR_VALUE],
			[MATCH_ATTR_UNQUOTED, State.TAG, TokenType.ATTR_VALUE],
			[false, State.TAG, false],
		], 'expected attribute, ">" or "/>"');
	}
 
	*tokenizeText(context: Context){
		yield* this.match(context, [
			[MATCH_WHITESPACE, State.TEXT, TokenType.WHITESPACE],
			[MATCH_CDATA_BEGIN, State.CDATA, false],
			[MATCH_COMMENT, State.TEXT, TokenType.COMMENT],
			[MATCH_TAG_OPEN, State.TAG, TokenType.TAG_OPEN],
			[MATCH_TEXT, State.TEXT, TokenType.TEXT],
			[MATCH_TAG_LOOKAHEAD, State.TEXT, TokenType.TEXT],
		], 'expected text or "<"');
	}
 
	*tokenizeCDATA(context: Context){
		yield* this.match(context, [
			[MATCH_CDATA_END, State.TEXT, false],
		], 'expected ]]>');
	}
 
	*tokenizeScript(context: Context){
		yield* this.match(context, [
			[MATCH_SCRIPT_END, State.TAG, TokenType.TAG_OPEN],
			[MATCH_SCRIPT_DATA, State.SCRIPT, TokenType.SCRIPT],
		], 'expected </script>');
	}
}
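
/*
 * Usage sketch: iterate over the token stream produced by Lexer.tokenize().
 * The exact shape of Source is defined in '../context'; the fields used below
 * (data, filename, line, column) are assumptions made for this example only.
 *
 *   const lexer = new Lexer();
 *   const source = { data: '<!DOCTYPE html>\n<p>lorem ipsum</p>\n', filename: 'inline', line: 1, column: 1 } as Source;
 *   for (const token of lexer.tokenize(source)){
 *   	console.log(TokenType[token.type], token.location);
 *   }
 */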