// expose to the world (guarded so the file can also be loaded where the
// CommonJS `module` object is not defined)
if(typeof module !== "undefined" && module.exports){
    module.exports.stripHTML = stripHTML;
}

/**
 * <p>Converts an HTML string into a plaintext format that resembles Markdown</p>
 *
 * <p>Only good for simple and valid HTML, probably breaks on everything else</p>
 *
 * <p>Placeholders used internally while converting:</p>
 *
 * <ul>
 *   <li>-\u0000\u0000- for a newline</li>
 *   <li>-\u0001\u0001- for a space</li>
 *   <li>-\u0002\u0002- for temporary newlines (newlines found in the input)</li>
 * </ul>
 *
 * @param {String} str HTML string to convert (anything with a toString method
 *        is accepted, falsy values are treated as an empty string)
 * @return {String} Plaintext that resembles Markdown
 */
function stripHTML(str){
    str = (str || "").toString("utf-8").trim();

    // replace newlines of the input with temporary placeholders, so that the
    // "\n" characters generated below can be told apart from the original ones
    str = str.replace(/\r?\n|\r/g, "-\u0002\u0002-");

    // convert block element endings to linebreak markers
    // (also handles <br/>, <br />, attributes and uppercase tag names)
    str = str.replace(/<(?:\/p|br|\/tr|\/table|\/div)(?:\s[^>]*)?\/?>/gi, "-\u0000\u0000--\u0000\u0000-");

    // H1-H6, add underline or prepend with #
    str = str.replace(/<[hH](\d)[^>]*>(.*?)<\/[hH]\d[^>]*>/g, function(match, level, content){
        var line = "",
            symbol, // line symbol char
            len;

        level = Number(level) || 0;

        content = content.replace(/<[^>]*>/g, " ").
                    replace(/\s\s+/g, " ").
                    trim();

        if(!content){
            // the tag was empty or only included other tags (<img> and such), nothing to show
            return "";
        }

        // select correct symbol for the line
        switch(level){
            case 1:
                symbol = "=";
                len = content.length;
                break;
            case 2:
                symbol = "-";
                len = content.length;
                break;
            default:
                symbol = "#";
                len = level;
        }

        line = new Array(len + 1).join(symbol);

        if(symbol === "#"){
            // prepend the line:
            // ### This is a H3
            return line + " " + content + "\n\n";
        }else{
            // add underline:
            // This is a H1
            // ============
            return content + "\n" + line + "\n\n";
        }
    });

    // B
    str = str.replace(/<(?:b|strong)(?:\s[^>]*)?>(.*?)<\/(?:b|strong)>/ig, function(match, content){
        return "**" + content.trim() + "**";
    });

    // U
    str = str.replace(/<u(?:\s[^>]*)?>(.*?)<\/u>/ig, function(match, content){
        return "_" + content.trim() + "_";
    });

    // EM
    str = str.replace(/<(?:i|em)(?:\s[^>]*)?>(.*?)<\/(?:i|em)>/ig, function(match, content){
        return "*" + content.trim() + "*";
    });

    // CODE
    str = str.replace(/<code(?:\s[^>]*)?>(.*?)<\/code>/ig, function(match, content){
        return "`" + content.trim() + "`";
    });

    // A, emitted as (text)[url]; "#" is used when no href value is found
    str = str.replace(/<a ([^>]*)>(.*?)<\/a[^>]*>/ig, function(match, params, content){
        var paramMatch = params.match(/href\s*=\s*['"]([^'"]+)['"]/i),
            url = paramMatch && paramMatch[1] || "#";

        return "(" + content.trim() + ")" + "[" + url + "]";
    });

    // UL, replace with newlines
    str = str.replace(/(<\/(?:ul|ol)>)/gi, "$1-\u0000\u0000--\u0000\u0000-");

    // LI, indent by 2 spaces + *
    str = str.replace(/<li[^>]*>(.*?)<\/?(?:li|ol|ul)[^>]*>/ig, function(match, content){

        content = content.replace(/<[^>]*>/g, " ").
                    replace(/\s\s+/g, " ").
                    trim();

        if(!content){
            // the tag was empty or only included other tags (<img> and such), nothing to show
            return "";
        }

        // return with the space placeholders
        return "-\u0001\u0001--\u0001\u0001-* " + content + "\n";
    });

    // PRE, indent by 4 spaces
    str = str.replace(/<pre[^>]*>(.*?)<\/pre[^>]*>/ig, function(match, content){
        if(!content){
            return "";
        }

        // remove empty lines before and after
        content = content.replace(/^((?:[ \t]*)\-\u0002\u0002\-)+|((?:\-\u0002\u0002\-[ \t]*))+$/g, "");

        // replace tabs with 4 spaces
        content = content.replace(/\t/g, "    ");

        // replace temp. linebreak placeholders with a real newline followed by
        // 4 space placeholders
        content = content.replace(/\-\u0002\u0002\-([ ]*)/g, function(match, spaces){
            // keep spaces in the beginning of the lines
            var hiddenSpaces = spaces.replace(/ /g, "-\u0001\u0001-");

            return "\n-\u0001\u0001--\u0001\u0001--\u0001\u0001--\u0001\u0001-" + hiddenSpaces;
        });

        // add prepending 4 spaces
        return "\n-\u0001\u0001--\u0001\u0001--\u0001\u0001--\u0001\u0001-" + content.trim() + "\n\n";
    });

    // remove all remaining html tags
    str = str.replace(/<[^>]*>/g, " ");

    // turn temp. newlines into spaces first, so that the spaces they produce
    // take part in the duplicate space removal below
    str = str.replace(/-\u0002\u0002-/g, " ");

    // remove duplicate spaces
    str = str.replace(/[ ][ ]+/g, " ");

    // restore newlines
    str = str.replace(/-\u0000\u0000-/g, "\n");

    // remove spaces before and after newlines
    str = str.replace(/[ \t]*\n[ \t]*/g, "\n");

    // remove more than 2 newlines in a row
    str = str.replace(/\n\n+/g, "\n\n");

    // restore hidden spaces
    str = str.replace(/-\u0001\u0001-/g, " ");

    // decode HTML entities (&lt; and such)
    str = decodeHTMLEntities(str);

    return str.trim();
}

// Named HTML entities mapped to their unicode code points
var HTMLEntities = {
    apos:0x0027,quot:0x0022,amp:0x0026,lt:0x003C,gt:0x003E,nbsp:0x00A0,iexcl:0x00A1,cent:0x00A2,pound:0x00A3,
    curren:0x00A4,yen:0x00A5,brvbar:0x00A6,sect:0x00A7,uml:0x00A8,copy:0x00A9,ordf:0x00AA,laquo:0x00AB,
    not:0x00AC,shy:0x00AD,reg:0x00AE,macr:0x00AF,deg:0x00B0,plusmn:0x00B1,sup2:0x00B2,sup3:0x00B3,
    acute:0x00B4,micro:0x00B5,para:0x00B6,middot:0x00B7,cedil:0x00B8,sup1:0x00B9,ordm:0x00BA,raquo:0x00BB,
    frac14:0x00BC,frac12:0x00BD,frac34:0x00BE,iquest:0x00BF,Agrave:0x00C0,Aacute:0x00C1,Acirc:0x00C2,Atilde:0x00C3,
    Auml:0x00C4,Aring:0x00C5,AElig:0x00C6,Ccedil:0x00C7,Egrave:0x00C8,Eacute:0x00C9,Ecirc:0x00CA,Euml:0x00CB,
    Igrave:0x00CC,Iacute:0x00CD,Icirc:0x00CE,Iuml:0x00CF,ETH:0x00D0,Ntilde:0x00D1,Ograve:0x00D2,Oacute:0x00D3,
    Ocirc:0x00D4,Otilde:0x00D5,Ouml:0x00D6,times:0x00D7,Oslash:0x00D8,Ugrave:0x00D9,Uacute:0x00DA,Ucirc:0x00DB,
    Uuml:0x00DC,Yacute:0x00DD,THORN:0x00DE,szlig:0x00DF,agrave:0x00E0,aacute:0x00E1,acirc:0x00E2,atilde:0x00E3,
    auml:0x00E4,aring:0x00E5,aelig:0x00E6,ccedil:0x00E7,egrave:0x00E8,eacute:0x00E9,ecirc:0x00EA,euml:0x00EB,
    igrave:0x00EC,iacute:0x00ED,icirc:0x00EE,iuml:0x00EF,eth:0x00F0,ntilde:0x00F1,ograve:0x00F2,oacute:0x00F3,
    ocirc:0x00F4,otilde:0x00F5,ouml:0x00F6,divide:0x00F7,oslash:0x00F8,ugrave:0x00F9,uacute:0x00FA,ucirc:0x00FB,
    uuml:0x00FC,yacute:0x00FD,thorn:0x00FE,yuml:0x00FF,OElig:0x0152,oelig:0x0153,Scaron:0x0160,scaron:0x0161,
    Yuml:0x0178,fnof:0x0192,circ:0x02C6,tilde:0x02DC,Alpha:0x0391,Beta:0x0392,Gamma:0x0393,Delta:0x0394,
    Epsilon:0x0395,Zeta:0x0396,Eta:0x0397,Theta:0x0398,Iota:0x0399,Kappa:0x039A,Lambda:0x039B,Mu:0x039C,
    Nu:0x039D,Xi:0x039E,Omicron:0x039F,Pi:0x03A0,Rho:0x03A1,Sigma:0x03A3,Tau:0x03A4,Upsilon:0x03A5,
    Phi:0x03A6,Chi:0x03A7,Psi:0x03A8,Omega:0x03A9,alpha:0x03B1,beta:0x03B2,gamma:0x03B3,delta:0x03B4,
    epsilon:0x03B5,zeta:0x03B6,eta:0x03B7,theta:0x03B8,iota:0x03B9,kappa:0x03BA,lambda:0x03BB,mu:0x03BC,
    nu:0x03BD,xi:0x03BE,omicron:0x03BF,pi:0x03C0,rho:0x03C1,sigmaf:0x03C2,sigma:0x03C3,tau:0x03C4,
    upsilon:0x03C5,phi:0x03C6,chi:0x03C7,psi:0x03C8,omega:0x03C9,thetasym:0x03D1,upsih:0x03D2,piv:0x03D6,
    ensp:0x2002,emsp:0x2003,thinsp:0x2009,zwnj:0x200C,zwj:0x200D,lrm:0x200E,rlm:0x200F,ndash:0x2013,
    mdash:0x2014,lsquo:0x2018,rsquo:0x2019,sbquo:0x201A,ldquo:0x201C,rdquo:0x201D,bdquo:0x201E,dagger:0x2020,
    Dagger:0x2021,bull:0x2022,hellip:0x2026,permil:0x2030,prime:0x2032,Prime:0x2033,lsaquo:0x2039,rsaquo:0x203A,
    oline:0x203E,frasl:0x2044,euro:0x20AC,image:0x2111,weierp:0x2118,real:0x211C,trade:0x2122,alefsym:0x2135,
    larr:0x2190,uarr:0x2191,rarr:0x2192,darr:0x2193,harr:0x2194,crarr:0x21B5,lArr:0x21D0,uArr:0x21D1,
    rArr:0x21D2,dArr:0x21D3,hArr:0x21D4,forall:0x2200,part:0x2202,exist:0x2203,empty:0x2205,nabla:0x2207,
    isin:0x2208,notin:0x2209,ni:0x220B,prod:0x220F,sum:0x2211,minus:0x2212,lowast:0x2217,radic:0x221A,
    prop:0x221D,infin:0x221E,ang:0x2220,and:0x2227,or:0x2228,cap:0x2229,cup:0x222A,"int":0x222B,
    there4:0x2234,sim:0x223C,cong:0x2245,asymp:0x2248,ne:0x2260,equiv:0x2261,le:0x2264,ge:0x2265,
    sub:0x2282,sup:0x2283,nsub:0x2284,sube:0x2286,supe:0x2287,oplus:0x2295,otimes:0x2297,perp:0x22A5,
    sdot:0x22C5,lceil:0x2308,rceil:0x2309,lfloor:0x230A,rfloor:0x230B,lang:0x2329,rang:0x232A,loz:0x25CA,
    spades:0x2660,clubs:0x2663,hearts:0x2665,diams:0x2666
};

/**
 * <p>Decodes any HTML entities in a string into their unicode form</p>
 *
 * <p>Handles named entities listed in HTMLEntities as well as decimal
 * (&amp;#169;) and hexadecimal (&amp;#xA9;) character references. Anything
 * that is not a known entity or a valid code point is left untouched.</p>
 *
 * @param {String} text text to decode
 * @return {String} Decoded text
 */
function decodeHTMLEntities(text){
    return text.replace(/&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z][a-zA-Z0-9]*);/g, function(match, ent){
        var code;

        if(ent.charAt(0) !== "#"){
            // named entity, only own keys count (not "constructor" and such)
            if(!Object.prototype.hasOwnProperty.call(HTMLEntities, ent)){
                return match;
            }
            code = HTMLEntities[ent];
        }else if(ent.charAt(1) === "x" || ent.charAt(1) === "X"){
            code = parseInt(ent.substr(2), 16);
        }else{
            code = parseInt(ent.substr(1), 10);
        }

        if(isNaN(code) || code > 0x10FFFF){
            // not a valid unicode code point, keep the original text
            return match;
        }

        if(code > 0xFFFF){
            // String.fromCharCode works on UTF-16 code units, so code points
            // outside of the BMP need to be encoded as a surrogate pair
            code -= 0x10000;
            return String.fromCharCode(0xD800 + (code >> 10), 0xDC00 + (code & 0x3FF));
        }

        return String.fromCharCode(code);
    });
}