/*!
* RegexGen.js - JavaScript Regular Expression Generator v0.2.0
* https://github.com/amobiz/regexgen.js
*
* Supports CommonJS(node.js), AMD(RequireJS) and browser global.
*
* Released under the MIT license
* http://opensource.org/licenses/MIT
*
* Date: 2014-06-11
*/
/**
* @fileOverview RegexGen.js is a JavaScript regular expression generator
* that helps to construct complex regular expressions.
* @author Amobiz(amobiz.tw+github@gmail.com)
* @version 1.2.0
* @license MIT
*
*/
(function( factory ) {
'use strict';
// Publish the generator through whichever module system is detected, checked in this order.
// CommonJS: `module.exports` exists, so export the result of the factory.
if ( typeof module !== 'undefined' && module.exports ) {
module.exports = factory();
}
// AMD: `define` is a function flagged with `define.amd`, so hand it the factory itself.
else if ( typeof define === 'function' && define.amd ) {
define( factory );
}
// Otherwise expose the result of the factory as `window.regexGen`.
else {
window.regexGen = factory();
}
})(function(){
'use strict';
////////////////////////////////////////////////////////
// regexGen
////////////////////////////////////////////////////////
/**
* The Generator
* =============
*
* The generator is exported as the `regexGen()` function, everything must be referenced from it.
*
* To generate a regular expression, pass sub-expressions as parameters to the call of `regexGen()` function.
*
* Sub-expressions are then concatenated together to form the whole regular expression.
*
* Sub-expressions can either be a `string`, a `number`, a `RegExp` object, or any combinations of the call to methods (i.e., the `sub-generators`) of the `regexGen()` function object.
*
* Strings passed to the `regexGen()`, `text()`, `maybe()`, `anyCharOf()` and `anyCharBut()` functions are always escaped as necessary, so you don't have to worry about which characters to escape.
*
* The result of calling the `regexGen()` function is a `RegExp` object. See __<a href="#user-content-the-regexp-object">The RegExp Object</a>__ section for detail.
* Since everything must be referenced from the `regexGen()` function, assigning it to a short variable is preferable to keep the code concise.
*
* @example
* var _ = regexGen;
*
* var regex = regexGen(
* _.startOfLine(),
* _.capture( 'http', _.maybe( 's' ) ), '://',
* _.capture( _.anyCharBut( ':/' ).repeat() ),
* _.group( ':', _.capture( _.digital().multiple(2,4) ) ).maybe(), '/',
* _.capture( _.anything() ),
* _.endOfLine()
* );
* var matches = regex.exec( url );
*
* @namespace regexGen
* @returns {RegExp} the generated RegExp object.
*/
function regexGen() {
    // Shared state handed to every term while the pattern is generated.
    var context = {
        captures: [ '0' ],
        warnings: []
    };
    var flags = [];
    var parts = [];
    var count = arguments.length;
    var index = 0;
    var arg, source, result;
    // Split the arguments into modifiers (regex flags) and sub-expressions.
    while ( index < count ) {
        arg = arguments[ index++ ];
        if ( ! (arg instanceof Modifier) ) {
            parts.push( arg );
        }
        else if ( flags.indexOf( arg._modifier ) === -1 ) {
            flags.push( arg._modifier );
        }
        else {
            // A flag given twice is reported and otherwise ignored.
            context.warnings.push( 'duplicated modifier: ' + arg._modifier );
        }
    }
    source = new Sequence( parts )._generate( context, 0 );
    result = new RegExp( source, flags.join( '' ) );
    // Decorate the RegExp with the generation results and the extract helpers.
    return _mixin( result, {
        warnings: context.warnings,
        captures: context.captures,
        extract: extract,
        extractAll: extractAll
    });
}
_mixin( regexGen, {
/**
* A utility function that helps using the regexGen generator:
* copies every own enumerable member of `regexGen` (the sub-generators, including this function)
* onto the given object.
* @memberof regexGen
* @param {Object} global - the target object that sub-generators will inject to.
*/
mixin: function( global ) {
_mixin( global, regexGen );
},
////////////////////////////////////////////////////
/// Modifiers
////////////////////////////////////////////////////
/**
* Case-insensitivity modifier: contributes the `i` flag
* when the returned object is passed as an argument to `regexGen()`.
* @memberof regexGen
* @static
* @returns {Modifier} the modifier object carrying the `i` flag.
*/
ignoreCase: function() {
return new Modifier( 'i' );
},
/**
* Global search modifier: contributes the `g` flag
* when the returned object is passed as an argument to `regexGen()`.
* The flag is not applied unless this modifier is given.
* @memberof regexGen
* @static
* @returns {Modifier} the modifier object carrying the `g` flag.
*/
searchAll: function() {
return new Modifier( 'g' );
},
/**
* Multiline modifier: contributes the `m` flag
* when the returned object is passed as an argument to `regexGen()`.
* @memberof regexGen
* @static
* @returns {Modifier} the modifier object carrying the `m` flag.
*/
searchMultiLine: function() {
return new Modifier( 'm' );
},
////////////////////////////////////////////////////
// Boundaries
////////////////////////////////////////////////////
/**
* Generates the start anchor: (^)
* @memberof regexGen
* @returns {Term} the start anchor expression term object.
*/
startOfLine: function() {
return new Term( '^' );
},
/**
* Generates the end anchor: ($)
* @memberof regexGen
* @returns {Term} the end anchor expression term object.
*/
endOfLine: function() {
return new Term( '$' );
},
/**
* Matches a word boundary: (\b)
* A word boundary matches the position
* where a word character is not followed or preceded by another word character.
* Note that a matched word boundary is not included in the match.
* In other words, the length of a matched word boundary is zero.
* (Not to be confused with [\b].)
*
* @memberof regexGen
* @static
* @returns {Term} the word boundary expression term object.
*/
wordBoundary: function() {
return new Term( '\\b' );
},
/**
* Matches a non-word boundary: (\B)
* This matches a position where the previous and next character
* are of the same type: Either both must be words, or both must be non-words.
* The beginning and end of a string are considered non-words.
*
* @memberof regexGen
* @static
* @returns {Term} the non-word boundary expression term object.
*/
nonWordBoundary: function() {
return new Term( '\\B' );
},
////////////////////////////////////////////////////
// Literal Characters
////////////////////////////////////////////////////
/**
* Any character sequence (abc).
* The value is passed to `Term.sanitize()`, which escapes regular expression
* metacharacters when the value is a string.
* @memberof regexGen
* @param {String} value the character sequence.
* @returns {Term} the text literal expression term object.
*/
text: function( value ) {
return Term.sanitize( value );
},
////////////////////////////////////////////////////
// Character Classes
////////////////////////////////////////////////////
/**
* Any given character ([abc])
* usage: anyCharOf( [ 'a', 'c' ], ['2', '6'], 'fgh', 'z' ): ([a-c2-6fghz])
* @memberof regexGen
* @returns {Term}
*/
anyCharOf: function() {
var warnings = [];
return new Term( '[' + Term.charClasses( arguments, true, warnings ) + ']' )._warn( warnings );
},
/**
* Anything but these characters ([^abc])
* usage: anyCharBut( [ 'a', 'c' ], ['2', '6'], 'fgh', 'z' ): ([^a-c2-6fghz])
* @memberof regexGen
* @returns {Term}
*/
anyCharBut: function() {
var warnings = [];
return new Term( '[^' + Term.charClasses( arguments, false, warnings ) + ']' )._warn( warnings );
},
////////////////////////////////////////////////////
// Character Shorthands
////////////////////////////////////////////////////
/**
* Matches any single character except the newline character (.)
* @memberof regexGen
* @returns {Term} the term object whose body is the unescaped dot.
*/
anyChar: function() {
return new Term( '.' );
},
/**
* Matches the character with the code hh (two hexadecimal digits)
* @memberof regexGen
* @returns {Term}
*/
ascii: function() {
var i, n, value, values, warning;
values = '';
warning = [];
n = arguments.length;
if ( n > 0 ) {
for ( i = 0; i < n; ++i ) {
value = arguments[ i ];
if ( typeof value === 'string' && regexCodes.hexAsciiCodes.test( value ) ) {
values += '\\x' + value;
continue;
}
else if ( typeof value === 'number' && 0 <= value && value <= 0xFF ) {
values += '\\x' + toHex( value, 2 );
continue;
}
warning.push( value.toString() );
}
return new Term( values )._warn( warning.length === 0 ? '' : 'ascii(): values are not valid 2 hex digitals ascii code(s): ', warning );
}
return new Term()._warn( 'ascii(): no values given, should provides a 2 hex digitals ascii code or any number <= 0xFF.' );
},
/**
* Matches the character with the code hhhh (four hexadecimal digits).
* @memberof regexGen
* @returns {Term}
*/
unicode: function() {
var i, n, value, values, warning;
values = '';
warning = [];
n = arguments.length;
if ( n > 0 ) {
for ( i = 0, n = arguments.length; i < n; ++i ) {
value = arguments[ i ];
if ( typeof value === 'string' && regexCodes.hexUnicodes.test( value ) ) {
values += '\\u' + value;
continue;
}
else if ( typeof value === 'number' && 0 <= value && value <= 0xFFFF ) {
values += '\\u' + toHex( value, 4 );
continue;
}
warning.push( value.toString() );
}
return new Term( values )._warn( warning.length === 0 ? '' : 'unicode(): values are not valid 2 hex digitals unicode code(s): ', warning );
}
return new Term()._warn( 'unicode(): no values given, should provides a 2 hex digitals ascii code or any number <= 0xFFFF.' );
},
/**
* Matches a NULL (U+0000) character: (\0)
* Do not follow this with another digit,
* because \0<digits> is an octal escape sequence.
* @memberof regexGen
* @returns {Term}
*/
nullChar: function() {
return new Term( '\\0' );
},
/**
* Matches a control character in a string: (\cX)
* Where X is a single letter; both upper and lower case letters are accepted.
* Any other value yields an empty term carrying a warning.
* @memberof regexGen
* @param {String} value - the letter identifying the control character.
* @returns {Term}
*/
controlChar: function( value ) {
if ( typeof value === 'string' && regexCodes.ctrlChars.test( value ) ) {
return new Term( '\\c' + value );
}
return new Term()._warn( 'controlChar(): specified character is not a valid control character: ', value );
},
/**
* Matches a backspace (U+0008): ([\b])
* You need to use square brackets if you want to match a literal backspace character.
* (Not to be confused with \b.)
* @memberof regexGen
* @returns {Term}
*/
backspace: function() {
return new Term( '[\\b]' );
},
/**
* Matches a form feed: (\f)
* @memberof regexGen
* @returns {Term}
*/
formFeed: function() {
return new Term( '\\f' );
},
/**
* Matches a line feed: (\n)
* @memberof regexGen
* @returns {Term}
*/
lineFeed: function() {
return new Term( '\\n' );
},
/**
* Matches a carriage return: (\r)
* @memberof regexGen
* @returns {Term}
*/
carriageReturn: function() {
return new Term( '\\r' );
},
/**
* Matches a single white space character, including space, tab, form feed, line feed: (\s)
* @memberof regexGen
* @returns {Term}
*/
space: function() {
return new Term( '\\s' );
},
/**
* Matches a single character other than white space: (\S)
* @memberof regexGen
* @returns {Term}
*/
nonSpace: function() {
return new Term( '\\S' );
},
/**
* Matches a tab (U+0009): (\t)
* @memberof regexGen
* @returns {Term}
*/
tab: function() {
return new Term( '\\t' );
},
/**
* Matches a vertical tab (U+000B): (\v)
* @memberof regexGen
* @returns {Term}
*/
vertTab: function() {
return new Term( '\\v' );
},
/**
* Matches a digit character: (\d)
* @memberof regexGen
* @returns {Term}
*/
digital: function() {
return new Term( '\\d' );
},
/**
* Matches any non-digit character: (\D)
* @memberof regexGen
* @returns {Term}
*/
nonDigital: function() {
return new Term( '\\D' );
},
/**
* Matches any alphanumeric character including the underscore: (\w)
* @memberof regexGen
* @returns {Term}
*/
word: function() {
return new Term( '\\w' );
},
/**
* Matches any non-word character: (\W)
* @memberof regexGen
* @returns {Term}
*/
nonWord: function() {
return new Term( '\\W' );
},
////////////////////////////////////////////////////
// Extended Character Shorthands
////////////////////////////////////////////////////
/**
* Matches any characters except the newline character: (.*)
* The term is created with the dot as body and `*` as quantifier.
* @memberof regexGen
* @returns {Term}
*/
anything: function() {
return new Term( '.', '*' );
},
/**
* Matches a single hexadecimal digit, upper or lower case: ([0-9A-Fa-f])
* @memberof regexGen
* @returns {Term}
*/
hexDigital: function() {
return new Term( '[0-9A-Fa-f]' );
},
/**
* Matches any line break, includes Unix and windows CRLF
* @memberof regexGen
* @returns {Term}
*/
lineBreak: function() {
return this.either( this.group( this.carriageReturn(), this.lineFeed() ),
this.carriageReturn(),
this.lineFeed()
);
//Term( '(?:\\r\\n|\\r|\\n)' );
},
/**
* Matches any alphanumeric character sequence including the underscore: (\w+)
* The term is created with `\w` as body and `+` as quantifier.
* @memberof regexGen
* @returns {Term}
*/
words: function() {
return new Term( '\\w', '+' );
},
////////////////////////////////////////////////////
// Quantifiers
////////////////////////////////////////////////////
/**
* occurs zero or more times (x*)
* Passes the value to `Term.sanitize()` together with the `*` quantifier.
* @memberof regexGen
* @param {Object} value - the expression to quantify.
* @returns {Term}
*/
any: function( value ) {
return Term.sanitize( value, '*' );
},
/**
* occurs one or more times (x+)
* Passes the value to `Term.sanitize()` together with the `+` quantifier.
* @memberof regexGen
* @param {Object} value - the expression to quantify.
* @returns {Term}
*/
many: function( value ) {
return Term.sanitize( value, '+' );
},
/**
* Any optional character sequence, shortcut for Term.maybe ((?:abc)?)
* Passes the value to `Term.sanitize()` together with the `?` quantifier.
* @memberof regexGen
* @param {Object} value - the expression to quantify.
* @returns {Term}
*/
maybe: function( value ) {
return Term.sanitize( value, '?' );
},
////////////////////////////////////////////////////
// Grouping and back references
////////////////////////////////////////////////////
/**
* Adds alternative expressions
* @memberof regexGen
* @returns {Sequence}
*/
either: function() {
return new Sequence( arguments, '', '', '|' )._warn(
arguments.length >= 2 ? '' : 'eidther(): this function needs at least 2 sub-expressions. given only: ', arguments[ 0 ]
);
},
/**
* Groups the specified terms into one sub-expression without remembering the match.
* No parentheses are added here unconditionally: the sequence is created without
* prefixes or suffixes, and non-capturing parentheses `(?:` `)` are emitted during
* generation only where the sequence decides wrapping is required.
* @memberof regexGen
* @returns {Sequence}
*/
group: function() {
//return new Sequence( arguments, '(?:', ')' );
return new Sequence( arguments );
},
/**
* Matches specified terms and remembers the match. The generated parentheses are called capturing parentheses.
* label 是用來供 back reference 索引 capture 的編號。
* 計算方式是由左至右,計算左括號出現的順序,也就是先深後廣搜尋。
* capture( label('cap1'), capture( label('cap2'), 'xxx' ), capture( label('cap3'), '...' ), 'something else' )
* @memberof regexGen
* @returns {Capture}
*/
capture: function() {
var label, sequence;
if ( arguments.length > 0 && arguments[0] instanceof Label ) {
label = arguments[0]._label;
sequence = Array.prototype.slice.call( arguments, 1 );
}
else {
label = '';
sequence = arguments;
}
return new Capture( label, sequence );
},
/**
* label is a reference to a capture group, and is allowed only in the capture() method
* (capture() recognizes it only as its first argument).
* @memberof regexGen
* @param {String} label - the name to give to the capture.
* @returns {Label}
*/
label: function( label ) {
return new Label( label );
},
/**
* back reference to a capture, identified by its label.
* @memberof regexGen
* @param {String|Number|Label} label - the label of the capture to refer to.
* @returns {CaptureReference}
*/
sameAs: function( label ) {
return new CaptureReference( label );
},
////////////////////////////////////////////////////
/**
* trust me, just put the value as is.
* @memberof regexGen
* @returns {Term | RegexOverwrite}
*/
regex: function( value ) {
if ( value instanceof RegExp ) {
return new RegexOverwrite( value.source );
}
else if ( typeof value === 'string' ) {
return new RegexOverwrite( value );
}
return new Term( value )._warn( 'regex(): specified regex is not a RegExp instance or is not a string: ', value );
}
});
////////////////////////////////////////////////////////
/**
 * Runs `exec()` of the receiving RegExp on the text and maps every element of
 * the match to the label stored at the same index of `this.captures`.
 *
 * @param {String} text - the text to match.
 * @returns {Object|undefined} the label-to-match map, or undefined when nothing matched.
 */
function extract( text ) {
    var found = this.exec( text ); // jshint ignore: line
    var result, labels, index;
    if ( ! found ) {
        return result;
    }
    result = {};
    labels = this.captures; // jshint ignore: line
    index = 0;
    while ( index < found.length ) {
        result[ labels[ index ] ] = found[ index ];
        index += 1;
    }
    return result;
}
/**
 * Collects the result of `extract()` for every match in the text.
 *
 * Without the global flag only the first match is searched for.
 * With the global flag the search restarts from the beginning of the text
 * and continues until no further match is found.
 *
 * @param {String} text - the text to match.
 * @returns {Array} the extracted matches, empty when nothing matched.
 */
function extractAll( text ) {
    var json, all;
    if ( ! this.global ) { // jshint ignore: line
        json = this.extract( text ); // jshint ignore: line
        return json ? [ json ] : [];
    }
    all = [];
    this.lastIndex = 0; // jshint ignore: line
    while ( (json = this.extract( text )) ) { // jshint ignore: line
        all.push( json );
        // An empty match leaves lastIndex where it was, so the same position would
        // match forever; step over it manually. The whole match is stored under the
        // label at index 0 of the captures.
        if ( json[ this.captures[ 0 ] ] === '' ) { // jshint ignore: line
            ++this.lastIndex; // jshint ignore: line
        }
    }
    return all;
}
////////////////////////////////////////////////////////
/**
 * Copies the own enumerable properties of every following argument onto `obj`.
 * Later sources overwrite earlier ones; null and undefined sources are skipped.
 *
 * @param {Object} obj - the object to extend.
 * @returns {Object} the extended `obj`.
 */
function _mixin( obj ) {
    // Borrowed from Object.prototype so that sources without that method
    // (null-prototype objects) or with a shadowing `hasOwnProperty` key still work.
    var hasOwn = Object.prototype.hasOwnProperty;
    var i, k, ext;
    for ( i = 1; i < arguments.length; ++i ) {
        ext = arguments[ i ];
        for ( k in ext ) {
            if ( hasOwn.call( ext, k ) ) {
                obj[ k ] = ext[ k ];
            }
        }
    }
    return obj;
}
// Source of the leading zeros used by toHex(); at most 8 zeros can be prepended.
var zeropad = '00000000';
/**
 * Converts a number to its hexadecimal notation, left-padded with zeros
 * up to the requested number of digits. Longer results are returned as is.
 *
 * @param {Number} value - the number to convert.
 * @param {Number} digits - the minimum number of digits wanted.
 * @returns {String} the hexadecimal string.
 */
function toHex( value, digits ) {
    var hex = value.toString( 16 );
    var missing = digits - hex.length;
    return missing > 0 ? zeropad.substring( 0, missing ) + hex : hex;
}
/**
 * Tells whether the given value is an array, judged by its `Object.prototype.toString` tag.
 *
 * @param {*} o - the value to test.
 * @returns {Boolean} true when the value is an array.
 */
function isArray( o ) {
    var tag = Object.prototype.toString.call( o );
    return tag === '[object Array]';
}
////////////////////////////////////////////////////
// Regular expressions shared by the generator.
var regexCodes = {
// Every opening parenthesis that is not followed by `?:`, `?=` or `?!`.
// Note that the preceding character is not inspected, so an escaped `\(` matches as well.
captureParentheses: /(\((?!\?[:=!]))/g,
// One item that may stand inside a character class: any single character,
// a shorthand escape, `\xhh`, `\uhhhh` or `\cX`.
characterClassChars: /^(?:.|\\[bdDfnrsStvwW]|\\x[A-Fa-f0-9]{2}|\\u[A-Fa-f0-9]{4}|\\c[A-Z])$/,
// A whole bracket expression; group 1 is the content after an optional leading `^`.
characterClassExpr: /^\[\^?(.*)]$/,
// A single letter, upper or lower case.
ctrlChars: /^[A-Za-z]$/,
// Exactly two hexadecimal digits.
hexAsciiCodes: /^[0-9A-Fa-f]{2}$/,
// Exactly four hexadecimal digits.
hexUnicodes: /^[0-9A-Fa-f]{4}$/,
//
// Regular Expressions
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
// metaChars: /([.*+?^=!:${}()|\[\]\/\\])/g,
//
// How to escape regular expression in javascript?
// http://stackoverflow.com/questions/2593637/how-to-escape-regular-expression-in-javascript
// answered by Gracenotes
// metaChars: /([.?*+^$[\]\\(){}|-])/g,
//
// using Gracenotes version plus '\/'.
// note that MDN's version includes: ':', '=', '!' and '-',
// they are metacharacters only when used in (?:), (?=), (?!) and [0-9] (character classes), respectively.
// metaChars: /([.?*+^$[\]\/\\(){}|-])/g,
//
// According to the book Regular Expression Cookbook
// (added '/' for convenience when using the /regex/ literal):
//
// The characters that get a backslash when a string is quoted outside a character class.
metaChars: /([$()*+.?[\\^{|\/])/g,
//
// What literal characters should be escaped in a regex? (corner cases)
// http://stackoverflow.com/questions/5484084/what-literal-characters-should-be-escaped-in-a-regex
//
// How to escape square brackets inside brackets in grep
// http://stackoverflow.com/questions/21635126/how-to-escape-square-brackets-inside-brackets-in-grep?rq=1
//
// The characters that get a backslash inside a character class: `-`, `]`, `\` and `^`.
metaClassChars: /([-\]\\^])/g,
// treat any single character, meta characters, character classes, back reference, unicode character, ascii character,
// control character and special escaped character in regular expression as a unit term.
unitTerms: /^(?:.|\\[bBdDfnrsStvwW]|\\x[A-Fa-f0-9]{2}|\\u[A-Fa-f0-9]{4}|\\c[A-Z]|\\[$()*+.?[\/\\^{|]|\[(?:\\\]|[^\]])*?\]|\\\d{1,2})$/
};
////////////////////////////////////////////////////////
// Term
////////////////////////////////////////////////////////
/**
* Construct a Term object.
*
* The Term object represents a valid fragment of regular expression
* that forms a small part of the whole regular expression.
* All fields are set up by `_init()`.
*
* @class Term
* @protected
* @param {Object} body - a valid regular expression unit; an empty string is stored when omitted.
* @param {String} quantifiers - the quantifiers applied on this term; an empty string is stored when omitted.
*/
function Term( body, quantifiers ) {
this._init( body, quantifiers );
}
_mixin( Term, /** @lends Term */{
/**
* Quote regular expression characters.
*
* Takes string and puts a backslash in front of every character that is part of the regular expression syntax,
* i.e. every character matched by `regexCodes.metaChars`: $ ( ) * + . ? [ \ ^ { | and /.
*
* @static
* @protected
* @param {String} value - the string to quote.
* @returns {String} the quoted string.
*/
quote: function( value ) {
return value.replace( regexCodes.metaChars, '\\$1' );
},
/**
* Quote the terms so they can be put into character classes (square brackets).
*
* @static
* @protected
* @param {Array} list - the input term(s) to convert.
* @param {Boolean} positive - treat for positive or negative character classes.
* @param {Array} warnings - [output] a collection array to keep errors / warnings while converting character classes.
* @returns a single character sequence that can fit into character classes.
*/
charClasses: function( list, positive, warnings ) {
var i, v, sets, value, hyphen, circumflex;
hyphen = circumflex = '';
sets = [];
for ( i = 0; i < list.length; ++i ) {
v = list[ i ];
// range
if ( isArray( v ) ) {
if ( v.length === 2 &&
((typeof v[0] === 'number' && 0 <= v[0] && v[0] <= 9) || (typeof v[0] === 'string' && regexCodes.characterClassChars.test( v[0] ))) &&
((typeof v[1] === 'number' && 0 <= v[1] && v[1] <= 9) || (typeof v[1] === 'string' && regexCodes.characterClassChars.test( v[1] ))) ) {
sets.push( v[0] + '-' + v[1] );
continue;
}
}
// bunch of characters
else if ( typeof v === 'string' ) {
if ( v.indexOf( '-' ) !== -1 ) {
hyphen = '-';
v = v.replace( /-/g, '' );
}
if ( v.indexOf( '^' ) !== -1 ) {
circumflex = '^';
v = v.replace( /\^/g, '' );
}
sets.push( v.replace( regexCodes.metaClassChars, '\\$1' ) );
continue;
}
else if ( v instanceof Term ) {
if ( v._quantifiers ) {
warnings.push( 'ignoring quantifier of embeded character class: ' + v._quantifiers );
}
if ( v._preLookaheads || v._lookaheads ) {
warnings.push( 'ignoring lookaheads of embeded character class: ' + v._preLookaheads + ' : ' + v._lookaheads );
}
value = v._body.match( regexCodes.characterClassExpr );
if ( value && value[1] ) {
value = value[1];
if ( value[0] === '^' ) {
warnings.push( 'ignoring negation directive of embeded character class' );
value = value.substring( 1 );
}
sets.push( value );
continue;
}
else if ( regexCodes.characterClassChars.test( v._body ) ) {
sets.push( v._body );
continue;
}
}
warnings.push( 'invalid character: ' + v );
}
value = sets.join( '' );
if ( value ) {
return (hyphen + value + circumflex);
}
value = hyphen + circumflex;
if ( value.length === 1 && positive ) {
return Term.quote( value );
}
return value;
},
/**
* Sanitation function for adding anything safely to the expression.
*
* @static
* @protected
* @param {Object} body - the expression object to sanitize.
* @param {String} quantifiers - the quantifiers applied on this term.
* @returns {Term} a new Term object with contents sanitized.
*/
sanitize: function( body, quantifiers ) {
if ( body instanceof Term ) {
return body;
}
else if ( typeof body === 'string' ) {
return new Term( Term.quote( body ), quantifiers );
}
else if ( typeof body === 'number' ) {
return new Term( body.toString(), quantifiers );
}
else if ( body instanceof RegExp ) {
return new RegexOverwrite( body.source );
}
return new Term()._warn( 'invalid regular expression: ', body );
},
/**
* Test if the given expression is a unit term.
*
* @static
* @protected
* @param {String} expression - the expression string to test.
* @returns {Boolean} true is the given expression is a unit term.
*/
isUnitTerm: function( expression ) {
return regexCodes.unitTerms.test( expression );
},
/**
* Wrap the given expression if it is not a unit term.
*
* @static
* @protected
* @param {String} body - the expression string to test.
* @returns {String} a unit expression that is properly protected.
*/
wrap: function( body ) {
if ( Term.isUnitTerm( body ) ) {
return body;
}
return '(?:' + body + ')';
}
});
////////////////////////////////////////////////////
_mixin( Term.prototype, /** @lends Term.prototype */ {
/**
* Initialize the term object, setup default values.
* Every field starts as an empty string; body and quantifiers take the given
* values unless they are falsy.
*
* @protected
* @param {String} body - the expression string.
* @param {String} quantifiers - the quantifiers applied on this term.
*/
_init: function( body, quantifiers ) {
this._body = body || '';
this._quantifiers = quantifiers || '';
// Suffix appended after the quantifier; '?' makes it lazy.
this._greedy = '';
// Lookaheads emitted before and after the body; replaced by arrays once one is added.
this._preLookaheads = '';
this._lookaheads = '';
// When set, the term generated by it replaces the output of this term.
this._overwrite = '';
},
/**
* Generates the regular expression source of this term:
* pre-lookaheads + body + quantifier (with greedy suffix) + lookaheads,
* or the output of the overwrite term when one is set.
* The warnings collected on this term are copied to the context.
*
* important: _generate and _generateBody should never modify the term object.
*
* implementation notes:
*
* termRequiresWrap tells fragile term(s) in sub-expression that if protection is required.
* There are 2 situations:
* 0.no: If there is only one term, then the terms need not protection at all.
* 1.maybe: If the sub-expression is composed with more then one term,
* and the terms will be evaluated in order, i.e., will be concatenated directly,
* then the terms need not protection, unless it is the "either" expression.
*
* @protected
* @param {Object} context - the context object of the regexGen generator.
* @param {Number} termRequiresWrap - should the term requires wrap. See possible values descripted above.
* @returns {String} the generated regular expression string literal.
*/
_generate: function( context, termRequiresWrap ) {
var i, n, body, bodyRequiresWrap;
// Lookaheads are an empty string until one is added, afterwards an array of sequences.
function lookahead( value ) {
return typeof value === 'string' ? value : new Sequence( value )._generate( context, false );
}
// 2: the term has a quantifier; 1: it has lookaheads; 0: neither.
bodyRequiresWrap = this._quantifiers ? 2 : (this._preLookaheads || this._lookaheads) ? 1 : 0;
// The stronger of the term's own need and the need announced by the caller wins.
bodyRequiresWrap = Math.max( bodyRequiresWrap, termRequiresWrap );
// let captures and labels have chances to evaluate.
body = this._generateBody( context, bodyRequiresWrap );
if ( this._warnings && this._warnings.length > 0 ) {
for ( i = 0, n = this._warnings.length; i < n; ++i ) {
context.warnings.push( this._warnings[ i ] );
}
}
// An overwrite replaces the whole output, including lookaheads and quantifiers;
// the body above has nevertheless been generated already.
if ( this._overwrite ) {
body = this._overwrite._generate( context, termRequiresWrap );
}
else {
body = lookahead( this._preLookaheads ) + body + (this._quantifiers ? (this._quantifiers + this._greedy) : '') + lookahead( this._lookaheads );
}
return body;
},
/**
* Generates the body part of this term. The body is protected by `Term.wrap()`
* only when bodyRequiresWrap is 2; otherwise it is returned unchanged.
*
* @protected
* @param {Object} context - the context object of the regexGen generator.
* @param {Number} bodyRequiresWrap - should the body of term requires wrap. See possible values descripted in {@link #_generate}.
* @returns {String} the generated regular expression string literal (body part).
*/
_generateBody: function( context, bodyRequiresWrap ) {
return bodyRequiresWrap === 2 ? Term.wrap( this._body ) : this._body;
},
/**
* @protected
* @param {String} msg -
* @param {Object} values -
*/
_warn: function( msg, values ) {
if ( msg ) {
if ( ! this._warnings ) {
this._warnings = [];
}
this._warnings.push( msg + (values ? JSON.stringify( values ) : '') );
}
return this;
},
////////////////////////////////////////////////////
// Lookahead
////////////////////////////////////////////////////
/**
* @param {...Object} terms -
*/
contains: function() {
var sequence = new Sequence( arguments, '(?=', ')' );
if ( typeof this._preLookaheads === 'string' ) {
this._preLookaheads = [ sequence ];
}
else {
this._preLookaheads.push( sequence );
}
return this;
},
/**
* @param {...Object} terms -
*/
notContains: function() {
var sequence = new Sequence( arguments, '(?!', ')' );
if ( typeof this._preLookaheads === 'string' ) {
this._preLookaheads = [ sequence ];
}
else {
this._preLookaheads.push( sequence );
}
return this;
},
/**
* Matches 'x' only if 'x' is followed by 'y'. This is called a lookahead. (x(?=y))
*
* @param {...Object} terms -
*/
followedBy: function() {
var sequence = new Sequence( arguments, '(?=', ')' );
if ( typeof this._lookaheads === 'string' ) {
this._lookaheads = [ sequence ];
}
else {
this._lookaheads.push( sequence );
}
return this;
},
/**
* Matches 'x' only if 'x' is not followed by 'y'. This is called a negated lookahead. (x(?!y))
*
* @param {...Object} terms -
*/
notFollowedBy: function() {
var sequence = new Sequence( arguments, '(?!', ')' );
if ( typeof this._lookaheads === 'string' ) {
this._lookaheads = [ sequence ];
}
else {
this._lookaheads.push( sequence );
}
return this;
},
////////////////////////////////////////////////////
// Quantifiers
////////////////////////////////////////////////////
/**
* occurs zero or more times (x*)
* Replaces any quantifier set before.
* @returns {Term} this term, to allow chaining.
*/
any: function() {
this._quantifiers = '*';
return this;
},
/**
* occurs one or more times (x+)
* Replaces any quantifier set before.
* @returns {Term} this term, to allow chaining.
*/
many: function() {
this._quantifiers = '+';
return this;
},
/**
* occurs zero or one times (x?)
* Replaces any quantifier set before.
* @returns {Term} this term, to allow chaining.
*/
maybe: function() {
this._quantifiers = '?';
return this;
},
/**
* occurs at least once or exactly specified times (+|{n})
*/
repeat: function( times ) {
if ( typeof times === 'number' ) {
this._quantifiers = '{' + times + '}';
}
else {
this._quantifiers = '+';
}
return this;
},
/**
* occurs at least min times and (optional) at most max times (?|*|+|{min,}|{min,max})
* occurs at least min times and (optional) at most max times (?|*|+|{min,}|{min,max})
*/
multiple: function( minTimes, maxTimes ) {
minTimes = (typeof minTimes === 'number' ? minTimes.toString() : '0');
maxTimes = (typeof maxTimes === 'number' ? maxTimes.toString() : '');
if ( maxTimes === '' ) {
if ( minTimes === '0' ) {
this._quantifiers = '*';
return this;
}
else if ( minTimes === '1' ) {
this._quantifiers = '+';
return this;
}
}
// 'maybe' is more clear for this situation
else if ( minTimes === '0' && maxTimes === '1' ) {
this._quantifiers = '?';
return this;
}
// note that {,n} is not valid.
this._quantifiers = '{' + minTimes + ',' + maxTimes + '}';
return this;
},
/**
* Makes a quantifier greedy. Note that quantifier are greedy by default.
* Clears the suffix that `lazy()` or `reluctant()` may have set.
*
* @example
* anyChar().any().greedy() // ==> /.\u002A/
* anyChar().many().greedy() // ==> /.+/
* anyChar().maybe().greedy() // ==> /.?/
*
* @returns {Term} this term, to allow chaining.
*/
greedy: function() {
this._greedy = '';
return this;
},
/**
* Makes a quantifier lazy, by appending `?` to it.
* The suffix is emitted only when the term has a quantifier.
*
* @example
* anyChar().any().lazy() // ==> /.*?/
* anyChar().many().lazy() // ==> /.+?/
* anyChar().maybe().lazy() // ==> /.??/
* anyChar().multiple(5,9).lazy() // ==> /.{5,9}?/
*
* @returns {Term} this term, to allow chaining.
*/
lazy: function() {
this._greedy = '?';
return this;
},
/**
* This is an alias of [`lazy()`]{@link #lazy}.
*
* @returns {Term} this term, to allow chaining.
*/
reluctant: function() {
this._greedy = '?';
return this;
},
// Term.prototype.possessive = function() {
// this._greedy = '+';
// };
////////////////////////////////////////////////////
/**
* Use the given regex, i.e., trust me, just put the value as is.
*
* @example
* regex( /\w\d/ ) // ==> /\w\d/
* regex( "\\w\\d" ) // ==> /\w\d/
*
* @param {RegExp | String} value
*/
regex: function( value ) {
if ( value instanceof RegExp ) {
this._overwrite = new RegexOverwrite( value.source );
}
else if ( typeof value === 'string' ) {
this._overwrite = new RegexOverwrite( value );
}
else {
this._warn( 'regex(): specified regex is not a RegExp instance or is not a string. given: ', value );
}
return this;
}
});
////////////////////////////////////////////////////////
// Sequence
////////////////////////////////////////////////////////
/**
* @class Sequence
* @extends Term
* @protected
*/
function Sequence( sequence, prefixes, suffixes, join ) {
    // The body of a sequence is the list of its sanitized terms.
    var terms = Sequence.normalize( sequence );
    this._init( terms );
    // Text emitted before / after the joined terms, and the separator between them.
    this._prefixes = prefixes ? prefixes : '';
    this._suffixes = suffixes ? suffixes : '';
    this._join = join ? join : '';
}
/**
* Converts every element of an array-like list into a Term by means of `Term.sanitize()`.
*
* @memberof Sequence
* @static
* @function
* @param {Object} list - the array-like list of sub-expressions; may be omitted.
* @returns {Array} the list of terms, empty when no list is given.
*/
Sequence.normalize = function( list ) {
    var result = [];
    var count = list ? list.length : 0;
    var index;
    for ( index = 0; index < count; ++index ) {
        result.push( Term.sanitize( list[ index ] ) );
    }
    return result;
};
Sequence.prototype = new Term();
_mixin( Sequence.prototype, /** @lends Sequence.prototype */{
/**
* Generates the terms of the sequence, joins them and adds the prefixes / suffixes.
*
* bodyRequiresWrap tells whether a group is needed to protect the body content.
* There are three situations:
* 0.no: the element has no siblings (it is the only child), so it needs no special protection.
* 1.maybe: the element has siblings that will be concatenated directly; it should protect itself
* as necessary (currently only the either expression needs to).
* 2.must: the element has quantifiers and should protect itself as necessary (unless it is a unit term).
* @protected
* @param context {Object} - The generator context.
* @param bodyRequiresWrap {Number} - The wrap information.
* @return {String} - The generated body expression.
*/
_generateBody: function( context, bodyRequiresWrap ) {
var i, n, term, terms, body, values, termRequiresWrap;
terms = this._body;
// In the following two situations the child terms need no special protection:
// 1. there is only one child term;
// 2. the parent expression uses the either operator (|):
// since either has a very low precedence, the child terms it separates need no protection from their siblings.
// Each child term can be regarded as already protected by a group; only the whole either expression
// has to be protected from the outside.
// (see the explanation below)
termRequiresWrap = (terms.length === 1 || this._join === '|') ? 0 : 1;
values = [];
for ( i = 0, n = terms.length; i < n; ++i ) {
term = terms[ i ];
body = term._generate( context, termRequiresWrap );
values.push( body );
}
body = values.join( this._join );
// Prefixes / suffixes already delimit the body, so no further wrapping is applied.
if ( this._prefixes || this._suffixes ) {
return this._prefixes + body + this._suffixes;
}
// In the following two situations this parent expression has to protect itself:
// 1. bodyRequiresWrap === 2: the outside demands a group; currently this only applies when the element has quantifiers.
// 2. bodyRequiresWrap === 1: this expression will be concatenated directly with other expressions, so an either expression needs protection.
// Note that bodyRequiresWrap === 0 means this expression is already properly protected and need not worry about the outside or its siblings.
//
// switch ( bodyRequiresWrap.toString() + termRequiresWrap.toString() ) {
// case '00': // /()((a))/ => /a/
// // /()((a)|(b))/ => /a|b/
// case '01': // /()((a)(b))/ => /ab/
// case '10': // /(o)((a))/ => /oa/
// // /(o)((a)|(b))/ => /o(a|b)/
// case '11': // /(o)((a)(b))/ => /oab/
// case '20': // /(o)((a))?/ => /o(a)?/
// // /(o)((a)|(b))?/ => /o(a|b)?/
// case '21': // /(o)((a)(b))?/ => /o(ab)?/
// }
//
// Original note: the commented-out condition is meant to be equivalent to the one below, but easier to understand.
// NOTE(review): they differ when there is exactly one child term - the condition below still calls Term.wrap() then.
// if ( bodyRequiresWrap === 2 || (bodyRequiresWrap === 1 && terms.length !== 1 && this._join === '|') ) {
if ( bodyRequiresWrap === 2 || (bodyRequiresWrap === 1 && ! termRequiresWrap) ) {
return Term.wrap( body );
}
return body;
}
});
////////////////////////////////////////////////////////
// Capture
////////////////////////////////////////////////////////
/**
* @class Capture
* @extends Sequence
* @protected
*/
// A capture is a sequence delimited by capturing parentheses, with a label
// (an empty label stands for "number it automatically", see _generateBody).
function Capture( label, sequence ) {
Sequence.call( this, sequence, '(', ')' );
this._label = label;
}
/**
* Returns the automatic label of the capture to be registered next:
* the number of labels registered so far, as a string.
* @memberof Capture
* @static
* @function
*/
Capture.currentLabel = function( context ) {
return Label.normalize( context.captures.length );
};
/**
* Appends the label to the captures of the context;
* the position in that list is the number of the capture.
* @memberof Capture
* @static
* @function
*/
Capture.register = function( context, captureLabel ) {
context.captures.push( captureLabel );
};
/**
* Looks up the back reference for a capture label.
*
* @memberof Capture
* @static
* @function
* @param {Object} context - the context object of the regexGen generator.
* @param {String} captureLabel - the label of the capture to refer to.
* @returns {String} the back reference expression, or null when no capture carries the label.
*/
Capture.lookup = function( context, captureLabel ) {
    var index = context.captures.indexOf( captureLabel );
    // Position 0 stands for the whole match, which cannot be referred to:
    // `\0` is the escape of the NUL character, not a back reference.
    return index > 0 ? '\\' + index : null;
};
Capture.prototype = new Sequence();
_mixin( Capture.prototype, {
/**
* Registers the label of this capture, then generates the body like any sequence.
* The label is registered before the child terms are generated, so an enclosing
* capture gets its number before the captures nested in it.
* @memberof Capture#
* @protected
* @function
*/
_generateBody: function( context, bodyRequiresWrap ) {
// note: don't assign this._label here, or the regex can't reuse.
// An empty label is substituted by the automatic label.
Capture.register( context, this._label === '' ? Capture.currentLabel( context ) : this._label );
return Sequence.prototype._generateBody.call( this, context, bodyRequiresWrap );
}
});
////////////////////////////////////////////////////////
// CaptureReference
////////////////////////////////////////////////////////
/**
* @class CaptureReference
* @extends Term
* @protected
*/
function CaptureReference( label ) {
    this._init();
    this._label = Label.normalize( label );
}
CaptureReference.prototype = new Term();
_mixin( CaptureReference.prototype, {
/**
* Generates the back reference to the capture carrying the label of this term.
* When no such capture has been registered yet, nothing is generated and a
* warning is added to the context.
*
* @memberof CaptureReference
* @protected
* @function
* @param {Object} context - the context object of the regexGen generator.
* @returns {String} the back reference expression, or an empty string.
*/
_generateBody: function( context /*, bodyRequiresWrap */ ) {
    var backreference = Capture.lookup( context, this._label );
    if ( backreference ) {
        return backreference;
    }
    // Reported on the context rather than through _warn(): generating must not
    // modify the term, or the warning would pile up each time the term is reused.
    context.warnings.push( 'sameAs(): back reference has no matching capture: ' + JSON.stringify( this._label ) );
    return '';
}
});
////////////////////////////////////////////////////////
// Label
////////////////////////////////////////////////////////
/**
* @class Label
* @protected
*/
function Label( label ) {
    this._label = label;
}
/**
* Converts the value given as a label into the label it stands for:
* a string is returned as is, a number in its decimal notation,
* a Label object yields the label it holds.
* Any other value yields a marker string that embeds the value.
*
* @memberof Label
* @static
* @function
* @protected
*/
Label.normalize = function( label ) {
    if ( typeof label === 'string' ) {
        return label;
    }
    else if ( typeof label === 'number' ) {
        return label.toString();
    }
    else if ( label instanceof Label ) {
        return label._label;
    }
    // String() rather than label.toString(): null and undefined have no methods.
    return '__invalid_label__(' + String( label ) + ')';
};
////////////////////////////////////////////////////////
// RegexOverwrite
////////////////////////////////////////////////////////
/**
* @class RegexOverwrite
* @extends Term
* @protected
*/
function RegexOverwrite( value ) {
    this._init( value );
}
RegexOverwrite.prototype = new Term();
_mixin( RegexOverwrite.prototype, {
/**
* Registers an automatic label for every capturing group of the raw expression,
* so that the captures generated after it are numbered correctly.
*
* The source is scanned character by character: escaped characters and the
* content of character classes are skipped, as a parenthesis is a literal there.
*
* @memberof RegexOverwrite
* @function
* @protected
* @param {Object} context - the context object of the regexGen generator.
*/
_registerCaptures: function( context ) {
    var i, n, ch, source, inClass;
    // Tells whether the parenthesis at the given position opens a capturing group.
    function opensCapture( at ) {
        var kind;
        if ( source.charAt( at + 1 ) !== '?' ) {
            return true;
        }
        // (?<name> is a named capture; (?<= and (?<! are lookbehinds;
        // (?: (?= (?! do not capture either.
        kind = source.charAt( at + 3 );
        return source.charAt( at + 2 ) === '<' && kind !== '=' && kind !== '!';
    }
    source = this._body;
    inClass = false;
    for ( i = 0, n = source.length; i < n; ++i ) {
        ch = source.charAt( i );
        if ( ch === '\\' ) {
            // skip the escaped character
            ++i;
        }
        else if ( inClass ) {
            inClass = (ch !== ']');
        }
        else if ( ch === '[' ) {
            inClass = true;
        }
        else if ( ch === '(' && opensCapture( i ) ) {
            Capture.register( context, Capture.currentLabel( context ) );
        }
    }
},
/**
* Registers the captures of the raw expression, then generates the body like a plain term.
*
* @memberof RegexOverwrite
* @function
* @protected
*/
_generateBody: function( context, bodyRequiresWrap ) {
    this._registerCaptures( context );
    return Term.prototype._generateBody.call( this, context, bodyRequiresWrap );
}
});
////////////////////////////////////////////////////////
// Modifier
////////////////////////////////////////////////////////
/**
* Holds one regular expression flag. `regexGen()` recognizes instances among
* its arguments and collects their flags instead of treating them as sub-expressions.
* @class Modifier
* @protected
* @param {String} modifier - the flag.
*/
function Modifier( modifier ) {
this._modifier = modifier;
}
return regexGen;
});