1 
// Expose stripHTML as a property of this module's exports object
module.exports.stripHTML = stripHTML;
  4 
  5 /**
  6  * <p>Converts a HTML stringo into plaintext format that resembles Markdown</p>
  7  * 
  8  * <p>Only good for simple and valid HTML, probably breaks on eveything else</p>
  9  *
 10  * <p>Placeholders:</p>
 11  * 
 12  * <ul>
 13  *     <li>-\u0000\u0000- for newline</li>
 14  *     <li>-\u0001\u0001- for a space</li>
 15  *     <li>-\u0002\u0002- temporary newlines</li>
 16  * </ul>
 17  * 
 18  * @param {String} str HTML string to convert
 19  * @return {String} Plaintext that resembles Markdown
 20  */
 21 function stripHTML(str){
 22     str = (str || "").toString("utf-8").trim();
 23 
 24     // replace newlines
 25     str = str.replace(/\r?\n|\r/g,"-\u0002\u0002-");
 26 
 27     // convert block element endings to linebreak markers
 28     str = str.replace(/<(?:\/p|br|\/tr|\/table|\/div)>/g,"-\u0000\u0000--\u0000\u0000-");
 29 
 30     // H1-H6, add underline or prepend with #
 31     str = str.replace(/<[hH](\d)[^>]*>(.*?)<\/[hH]\d[^>]*>/g,function(match, level, content){
 32         var line = "",
 33             symbol, // line symbol char
 34             i, len;
 35 
 36         level = Number(level) || 0;
 37 
 38         content = content.replace(/<[^>]*>/g," ").
 39                     replace(/\s\s+/g," ").
 40                     trim();
 41 
 42         if(!content){
 43             // the tag was empty or only included other tags (<img> and such), nothing to show
 44             return "";
 45         }
 46 
 47         // select correct symbol for the line
 48         switch(level){
 49             case 1:
 50                 symbol = "=";
 51                 len = content.length;
 52                 break;
 53             case 2:
 54                 symbol = "-";
 55                 len = content.length;
 56                 break;
 57             default:
 58                 symbol = "#";
 59                 len = level;
 60         }
 61 
 62         line = new Array(len+1).join(symbol);
 63 
 64         if(symbol == "#"){
 65             // prepend the line:
 66             // ### This is a H3
 67             return line + " " + content + "\n\n";
 68         }else{
 69             // add underline:
 70             // This is a H1
 71             // ============
 72             return content + "\n" + line + "\n\n";
 73         }
 74         
 75     });
 76 
 77     // B
 78     str = str.replace(/<(?:b|strong)(?: [^>])?>(.*?)<\/(?:b|strong)>/ig,function(match, content){
 79         return "**" + content.trim() + "**";
 80     });
 81 
 82     // U
 83     str = str.replace(/<u(?: [^>])?>(.*?)<\/u>/ig,function(match, content){
 84         return "_" + content.trim() + "_";
 85     });
 86 
 87     // EM
 88     str = str.replace(/<(?:i|em)(?: [^>])?>(.*?)<\/(?:i|em)>/ig,function(match, content){
 89         return "*" + content.trim() + "*";
 90     });
 91 
 92     // CODE
 93     str = str.replace(/<code(?: [^>])?>(.*?)<\/code>/ig,function(match, content){
 94         return "`" + content.trim() + "`";
 95     });
 96 
 97     // A
 98     str = str.replace(/<a ([^>]*)>(.*?)<\/a[^>]*>/ig,function(match, params, content){
 99         var paramMatch = params.match(/href\s*=\s*['"]([^'"]+)['"]/),
100             url = paramMatch && paramMatch[1] || "#";
101 
102         return "(" + content.trim() + ")" + "[" + url +"]";
103     });
104 
105     // UL, replace with newlines
106     str = str.replace(/(<\/(?:ul|ol)>)/gi,"$1-\u0000\u0000--\u0000\u0000-");
107 
108     // LI, indent by 2 spaces + *
109     str = str.replace(/<li[^>]*>(.*?)<\/?(?:li|ol|ul)[^>]*>/ig,function(match, content){
110         
111         content = content.replace(/<[^>]*>/g," ").
112                     replace(/\s\s+/g," ").
113                     trim();
114 
115         if(!content){
116             // the tag was empty or only included other tags (<img> and such), nothing to show
117             return "";
118         }
119 
120         // return with the space placeholders
121         return "-\u0001\u0001--\u0001\u0001-* " + content + "\n";
122     });
123 
124     // PRE, indent by 4 spaces
125     str = str.replace(/<pre[^>]*>(.*?)<\/pre[^>]*>/ig,function(match, content){
126         if(!content){
127             return "";
128         }
129 
130         // remove empty lines before and after
131         content = content.replace(/^((?:[ \t]*)\-\u0002\u0002\-)+|((?:\-\u0002\u0002\-[ \t]*))+$/g, "");
132 
133         // replace tabs with 4 spaces
134         content = content.replace(/\t/g, "    ");
135 
136         // replace temp. linebreak placeholders with 4 space placehorlders
137         content = content.replace(/\-\u0002\u0002\-([ ]*)/g, function(match, spaces){
138             // keep spaces in the beginning of the lines
139             var spaces = spaces.replace(/ /g, "-\u0001\u0001-");
140             
141             return "\n-\u0001\u0001--\u0001\u0001--\u0001\u0001--\u0001\u0001-" + spaces;
142         });
143 
144         content = content.replace(/</g,"<").replace(/>/g,">");
145 
146         // add prepending 4 spaces
147         return "\n-\u0001\u0001--\u0001\u0001--\u0001\u0001--\u0001\u0001-" + content.trim() + "\n\n";
148     });
149 
150     // remove all remaining html tags
151     str = str.replace(/<[^>]*>/g," ");
152 
153     // remove duplicate spaces
154     str = str.replace(/[ ][ ]+/g," ");
155 
156     // remove temp. newlines
157     str = str.replace(/-\u0002\u0002-/g," ");
158 
159     // restore newlines
160     str = str.replace(/-\u0000\u0000-/g,"\n");
161     
162     // remove spaces before and after newlines
163     str = str.replace(/[ \t]*\n[ \t]*/g,"\n");
164     
165     // remove more than 2 newlines in a row
166     str = str.replace(/\n\n+/g,"\n\n");
167     
168     // restore hidden spaces
169     str = str.replace(/-\u0001\u0001-/g," ");
170 
171     // decode HTML entities (< and such)
172     str = decodeHTMLEntities(str);
173 
174     return str.trim();
175 }
176 
// Lookup table of named HTML character references: maps the entity name
// (the text between "&" and ";", case-sensitive) to its numeric character code.
// Read by decodeHTMLEntities when it encounters a non-numeric entity.
var HTMLEntities = {
apos:0x0027,quot:0x0022,amp:0x0026,lt:0x003C,gt:0x003E,nbsp:0x00A0,iexcl:0x00A1,cent:0x00A2,pound:0x00A3,
curren:0x00A4,yen:0x00A5,brvbar:0x00A6,sect:0x00A7,uml:0x00A8,copy:0x00A9,ordf:0x00AA,laquo:0x00AB,
not:0x00AC,shy:0x00AD,reg:0x00AE,macr:0x00AF,deg:0x00B0,plusmn:0x00B1,sup2:0x00B2,sup3:0x00B3,
acute:0x00B4,micro:0x00B5,para:0x00B6,middot:0x00B7,cedil:0x00B8,sup1:0x00B9,ordm:0x00BA,raquo:0x00BB,
frac14:0x00BC,frac12:0x00BD,frac34:0x00BE,iquest:0x00BF,Agrave:0x00C0,Aacute:0x00C1,Acirc:0x00C2,Atilde:0x00C3,
Auml:0x00C4,Aring:0x00C5,AElig:0x00C6,Ccedil:0x00C7,Egrave:0x00C8,Eacute:0x00C9,Ecirc:0x00CA,Euml:0x00CB,
Igrave:0x00CC,Iacute:0x00CD,Icirc:0x00CE,Iuml:0x00CF,ETH:0x00D0,Ntilde:0x00D1,Ograve:0x00D2,Oacute:0x00D3,
Ocirc:0x00D4,Otilde:0x00D5,Ouml:0x00D6,times:0x00D7,Oslash:0x00D8,Ugrave:0x00D9,Uacute:0x00DA,Ucirc:0x00DB,
Uuml:0x00DC,Yacute:0x00DD,THORN:0x00DE,szlig:0x00DF,agrave:0x00E0,aacute:0x00E1,acirc:0x00E2,atilde:0x00E3,
auml:0x00E4,aring:0x00E5,aelig:0x00E6,ccedil:0x00E7,egrave:0x00E8,eacute:0x00E9,ecirc:0x00EA,euml:0x00EB,
igrave:0x00EC,iacute:0x00ED,icirc:0x00EE,iuml:0x00EF,eth:0x00F0,ntilde:0x00F1,ograve:0x00F2,oacute:0x00F3,
ocirc:0x00F4,otilde:0x00F5,ouml:0x00F6,divide:0x00F7,oslash:0x00F8,ugrave:0x00F9,uacute:0x00FA,ucirc:0x00FB,
uuml:0x00FC,yacute:0x00FD,thorn:0x00FE,yuml:0x00FF,OElig:0x0152,oelig:0x0153,Scaron:0x0160,scaron:0x0161,
Yuml:0x0178,fnof:0x0192,circ:0x02C6,tilde:0x02DC,Alpha:0x0391,Beta:0x0392,Gamma:0x0393,Delta:0x0394,
Epsilon:0x0395,Zeta:0x0396,Eta:0x0397,Theta:0x0398,Iota:0x0399,Kappa:0x039A,Lambda:0x039B,Mu:0x039C,
Nu:0x039D,Xi:0x039E,Omicron:0x039F,Pi:0x03A0,Rho:0x03A1,Sigma:0x03A3,Tau:0x03A4,Upsilon:0x03A5,
Phi:0x03A6,Chi:0x03A7,Psi:0x03A8,Omega:0x03A9,alpha:0x03B1,beta:0x03B2,gamma:0x03B3,delta:0x03B4,
epsilon:0x03B5,zeta:0x03B6,eta:0x03B7,theta:0x03B8,iota:0x03B9,kappa:0x03BA,lambda:0x03BB,mu:0x03BC,
nu:0x03BD,xi:0x03BE,omicron:0x03BF,pi:0x03C0,rho:0x03C1,sigmaf:0x03C2,sigma:0x03C3,tau:0x03C4,
upsilon:0x03C5,phi:0x03C6,chi:0x03C7,psi:0x03C8,omega:0x03C9,thetasym:0x03D1,upsih:0x03D2,piv:0x03D6,
ensp:0x2002,emsp:0x2003,thinsp:0x2009,zwnj:0x200C,zwj:0x200D,lrm:0x200E,rlm:0x200F,ndash:0x2013,
mdash:0x2014,lsquo:0x2018,rsquo:0x2019,sbquo:0x201A,ldquo:0x201C,rdquo:0x201D,bdquo:0x201E,dagger:0x2020,
Dagger:0x2021,bull:0x2022,hellip:0x2026,permil:0x2030,prime:0x2032,Prime:0x2033,lsaquo:0x2039,rsaquo:0x203A,
oline:0x203E,frasl:0x2044,euro:0x20AC,image:0x2111,weierp:0x2118,real:0x211C,trade:0x2122,alefsym:0x2135,
larr:0x2190,uarr:0x2191,rarr:0x2192,darr:0x2193,harr:0x2194,crarr:0x21B5,lArr:0x21D0,uArr:0x21D1,
rArr:0x21D2,dArr:0x21D3,hArr:0x21D4,forall:0x2200,part:0x2202,exist:0x2203,empty:0x2205,nabla:0x2207,
isin:0x2208,notin:0x2209,ni:0x220B,prod:0x220F,sum:0x2211,minus:0x2212,lowast:0x2217,radic:0x221A,
prop:0x221D,infin:0x221E,ang:0x2220,and:0x2227,or:0x2228,cap:0x2229,cup:0x222A,"int":0x222B,
there4:0x2234,sim:0x223C,cong:0x2245,asymp:0x2248,ne:0x2260,equiv:0x2261,le:0x2264,ge:0x2265,
sub:0x2282,sup:0x2283,nsub:0x2284,sube:0x2286,supe:0x2287,oplus:0x2295,otimes:0x2297,perp:0x22A5,
sdot:0x22C5,lceil:0x2308,rceil:0x2309,lfloor:0x230A,rfloor:0x230B,lang:0x2329,rang:0x232A,loz:0x25CA,
spades:0x2660,clubs:0x2663,hearts:0x2665,diams:0x2666
};
211 
/**
 * <p>Decodes any HTML entities in a string into their unicode form</p>
 *
 * <p>Handles named entities listed in HTMLEntities (&amp;amp;), decimal
 * references (&amp;#38;) and hexadecimal references (&amp;#x26; or &amp;#X26;).
 * Anything that is not a well-formed reference, an unknown entity name or a
 * number outside of the Unicode range is left in the text unchanged.</p>
 *
 * @param {String} text text to decode
 * @return {String} Decoded text
 */
function decodeHTMLEntities(text){
    // only well-formed references are matched, so a stray "&" followed by a
    // ";" somewhere later in the text is not mistaken for an entity
    return text.replace(/&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z][a-zA-Z0-9]*);/g, function(match, entity){
        var codePoint;

        if(entity.charAt(0) === "#"){
            codePoint = /^#x/i.test(entity) ?
                parseInt(entity.slice(2), 16) :
                parseInt(entity.slice(1), 10);
        }else if(Object.prototype.hasOwnProperty.call(HTMLEntities, entity)){
            // own properties only, "constructor" and friends are not entities
            codePoint = HTMLEntities[entity];
        }else{
            // unknown name, keep the original text
            return match;
        }

        if(!(codePoint >= 0 && codePoint <= 0x10FFFF)){
            // not a valid Unicode code point, keep the original text
            return match;
        }

        if(codePoint > 0xFFFF){
            // String.fromCharCode only takes 16 bit units, so characters
            // outside of the BMP need to be encoded as a surrogate pair
            codePoint -= 0x10000;
            return String.fromCharCode(0xD800 + (codePoint >> 10), 0xDC00 + (codePoint & 0x3FF));
        }

        return String.fromCharCode(codePoint);
    });
}