/*global Autolinker */
/**
 * @private
 * @class Autolinker.htmlParser.HtmlParser
 * @extends Object
 *
 * An HTML parser implementation which simply walks an HTML string and returns an array of
 * {@link Autolinker.htmlParser.HtmlNode HtmlNodes} that represent the basic HTML structure of the input string.
 *
 * Autolinker uses this to only link URLs/emails/Twitter handles within text nodes, effectively ignoring / "walking
 * around" HTML tags.
 */
Autolinker.htmlParser.HtmlParser = Autolinker.Util.extend( Object, {

	/**
	 * @private
	 * @property {RegExp} htmlRegex
	 *
	 * The regular expression used to pull out HTML tags from a string. Handles namespaced HTML tags and
	 * attribute names, as specified by http://www.w3.org/TR/html-markup/syntax.html.
	 *
	 * The regex carries the `g` flag and is shared by every instance, so its `lastIndex` is stateful
	 * between `exec()` calls.
	 *
	 * Capturing groups:
	 *
	 * 1. The "!DOCTYPE" tag name, if a tag is a <!DOCTYPE> tag.
	 * 2. If it is an end tag, this group will have the '/'. It is left unmatched (`undefined`) for start and
	 *    self-closing tags.
	 * 3. The tag name for all tags (other than the <!DOCTYPE> tag)
	 */
	htmlRegex : (function() {
		var tagNameRegex = /[0-9a-zA-Z][0-9a-zA-Z:]*/,
		    attrNameRegex = /[^\s\0"'>\/=\x01-\x1F\x7F]+/,   // the unicode range accounts for excluding control chars, and the delete char
		    attrValueRegex = /(?:"[^"]*?"|'[^']*?'|[^'"=<>`\s]+)/, // double quoted, single quoted, or unquoted attribute values
		    nameEqualsValueRegex = attrNameRegex.source + '(?:\\s*=\\s*' + attrValueRegex.source + ')?';  // optional '=[value]'

		return new RegExp( [
			// for <!DOCTYPE> tag. Ex: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
			'(?:',
				'<(!DOCTYPE)',  // *** Capturing Group 1 - If it's a doctype tag

					// Zero or more attributes following the tag name
					'(?:',
						'\\s+',  // one or more whitespace chars before an attribute

						// Either:
						// A. attr="value", or
						// B. "value" alone (To cover example doctype tag: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">)
						'(?:', nameEqualsValueRegex, '|', attrValueRegex.source + ')',
					')*',
				'>',
			')',

			'|',

			// All other HTML tags (i.e. tags that are not <!DOCTYPE>)
			'(?:',
				'<(/)?',  // Beginning of a tag. Either '<' for a start tag, or '</' for an end tag.
				          // *** Capturing Group 2: The slash ('/') for an end tag; unmatched for a start or self-closing tag.

					// *** Capturing Group 3 - The tag name
					'(' + tagNameRegex.source + ')',

					// Zero or more attributes following the tag name
					'(?:',
						'\\s+',                // one or more whitespace chars before an attribute
						nameEqualsValueRegex,  // attr="value" (with optional ="value" part)
					')*',

					'\\s*/?',  // any trailing spaces and optional '/' before the closing '>'
				'>',
			')'
		].join( "" ), 'gi' );
	} )(),


	/**
	 * @private
	 * @property {RegExp} htmlCharacterEntitiesRegex
	 *
	 * The regular expression that matches common HTML character entities: the named and numeric forms of the
	 * non-breaking space, '<', '>' and '"', plus the numeric form of the apostrophe.
	 *
	 * Ignoring &amp; as it could be part of a query string -- handling it separately.
	 */
	htmlCharacterEntitiesRegex : /(&nbsp;|&#160;|&lt;|&#60;|&gt;|&#62;|&quot;|&#34;|&#39;)/gi,


	/**
	 * Parses an HTML string and returns a simple array of {@link Autolinker.htmlParser.HtmlNode HtmlNodes} to represent
	 * the HTML structure of the input string.
	 *
	 * Nodes are returned in document order: the text/entity nodes found before a tag, then the element node for
	 * that tag, and finally the text/entity nodes for whatever follows the last tag.
	 *
	 * @param {String} html The HTML to parse.
	 * @return {Autolinker.htmlParser.HtmlNode[]}
	 */
	parse : function( html ) {
		var htmlRegex = this.htmlRegex,
		    currentResult,
		    lastIndex = 0,
		    textAndEntityNodes,
		    nodes = [];  // will be the result of the method

		// The regex is global and shared by all instances. Start from a known position so that a previous
		// parse which exited early (e.g. via an exception) cannot make this one skip the start of the input.
		htmlRegex.lastIndex = 0;

		while( ( currentResult = htmlRegex.exec( html ) ) !== null ) {
			var tagText = currentResult[ 0 ],
			    tagName = currentResult[ 1 ] || currentResult[ 3 ],  // The <!DOCTYPE> tag (ex: "!DOCTYPE"), or another tag (ex: "a" or "img")
			    isClosingTag = !!currentResult[ 2 ],
			    inBetweenTagsText = html.substring( lastIndex, currentResult.index );

			// Push TextNodes and EntityNodes for any text found between tags
			if( inBetweenTagsText ) {
				textAndEntityNodes = this.parseTextAndEntityNodes( inBetweenTagsText );
				nodes.push.apply( nodes, textAndEntityNodes );
			}

			// Push the ElementNode
			nodes.push( this.createElementNode( tagText, tagName, isClosingTag ) );

			lastIndex = currentResult.index + tagText.length;
		}

		// Process any remaining text after the last HTML element. Will process all of the text if there were no
		// HTML elements.
		if( lastIndex < html.length ) {
			textAndEntityNodes = this.parseTextAndEntityNodes( html.substring( lastIndex ) );
			nodes.push.apply( nodes, textAndEntityNodes );
		}

		return nodes;
	},


	/**
	 * Parses text and HTML entity nodes from a given string. The input string should not have any HTML tags (elements)
	 * within it.
	 *
	 * @private
	 * @param {String} text The text to parse.
	 * @return {Autolinker.htmlParser.HtmlNode[]} An array of HtmlNodes to represent the
	 *   {@link Autolinker.htmlParser.TextNode TextNodes} and {@link Autolinker.htmlParser.EntityNode EntityNodes} found.
	 */
	parseTextAndEntityNodes : function( text ) {
		var nodes = [],
		    textAndEntityTokens = Autolinker.Util.splitAndCapture( text, this.htmlCharacterEntitiesRegex );  // split at HTML entities, but include the HTML entities in the results array

		// Every even numbered token is a TextNode, and every odd numbered token is an EntityNode
		// For example: an input `text` of "Test &quot;this&quot; today" would turn into the
		//   `textAndEntityTokens`: [ 'Test ', '&quot;', 'this', '&quot;', ' today' ]
		for( var i = 0, len = textAndEntityTokens.length; i < len; i += 2 ) {
			var textToken = textAndEntityTokens[ i ],
			    entityToken = textAndEntityTokens[ i + 1 ];

			// Empty tokens (e.g. between two adjacent entities) and the missing entity slot after the final
			// text token are skipped rather than turned into empty nodes.
			if( textToken ) nodes.push( this.createTextNode( textToken ) );
			if( entityToken ) nodes.push( this.createEntityNode( entityToken ) );
		}
		return nodes;
	},


	/**
	 * Factory method to create an {@link Autolinker.htmlParser.ElementNode ElementNode}.
	 *
	 * @private
	 * @param {String} tagText The full text of the tag (element) that was matched, including its attributes.
	 * @param {String} tagName The name of the tag. Ex: An <img> tag would be passed to this method as "img".
	 *   The name is lower-cased before being handed to the node.
	 * @param {Boolean} isClosingTag `true` if it's a closing tag, false otherwise.
	 * @return {Autolinker.htmlParser.ElementNode}
	 */
	createElementNode : function( tagText, tagName, isClosingTag ) {
		return new Autolinker.htmlParser.ElementNode( {
			text    : tagText,
			tagName : tagName.toLowerCase(),
			closing : isClosingTag
		} );
	},


	/**
	 * Factory method to create a {@link Autolinker.htmlParser.EntityNode EntityNode}.
	 *
	 * @private
	 * @param {String} text The text that was matched for the HTML entity (such as '&nbsp;').
	 * @return {Autolinker.htmlParser.EntityNode}
	 */
	createEntityNode : function( text ) {
		return new Autolinker.htmlParser.EntityNode( { text: text } );
	},


	/**
	 * Factory method to create a {@link Autolinker.htmlParser.TextNode TextNode}.
	 *
	 * @private
	 * @param {String} text The text that was matched.
	 * @return {Autolinker.htmlParser.TextNode}
	 */
	createTextNode : function( text ) {
		return new Autolinker.htmlParser.TextNode( { text: text } );
	}

} );