{"_id":"cheerio-soupselect","_rev":"14-8c1d30796e6ab6778e35ac6c4dbcf4f3","name":"cheerio-soupselect","description":"Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)","dist-tags":{"latest":"0.1.1"},"versions":{"0.0.1":{"name":"cheerio-soupselect","version":"0.0.1","engines":{"node":">=0.2.0"},"author":{"name":"Matt Mueller","email":"mattmuelle@gmail.com"},"url":"http://github.com/harryf/node-soupselect","dependencies":{"htmlparser2":"1.5.x"},"repository":[{"type":"git","url":"git://github.com/harryf/node-soupselect.git"}],"main":"./lib/soupselect","license":"MIT","description":"Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)","_npmUser":{"name":"mattmueller","email":"mattmuelle@gmail.com"},"_id":"cheerio-soupselect@0.0.1","contributors":[{"name":"Simon Willison https://github.com/simonw"},{"name":"Harry Fuecks https://github.com/harryf"},{"name":"Chris O'Hara https://github.com/chriso"}],"devDependencies":{},"_engineSupported":true,"_npmVersion":"1.0.104","_nodeVersion":"v0.4.11","_defaultsLoaded":true,"dist":{"shasum":"ecdd4eae8ee867d6bc9dfb384c3b207f3ee255ad","tarball":"https://registry.npmjs.org/cheerio-soupselect/-/cheerio-soupselect-0.0.1.tgz","integrity":"sha512-bQ7aoSbCl6dGznY1rrzEw7tACLhhGNZdHKcF3Vvl4ZIqEYBPFyHiclHp1bEUaVSRRNEQGFgL2otxxaIkmFN0Xg==","signatures":[{"keyid":"SHA256:jl3bwswu80PjjokCgh0o2w5c2U4LhQAE57gj9cz1kzA","sig":"MEUCID0Cpia87+PPNcon1u710cz0Fl/O7pvJ5sZPEBy/QQ9UAiEAqBC/jV39e2O5tUXJFGTBnTGG+uNjtKrEyYd4IhiUYPE="}]},"maintainers":[{"name":"mattmueller","email":"mattmuelle@gmail.com"}]},"0.0.2":{"name":"cheerio-soupselect","version":"0.0.2","engines":{"node":">=0.2.0"},"author":{"name":"Matt Mueller","email":"mattmuelle@gmail.com"},"url":"http://github.com/harryf/node-soupselect","dependencies":{"htmlparser2":"1.5.x"},"repository":[{"type":"git","url":"git://github.com/harryf/node-soupselect.git"}],"main":"./lib/soupselect","license":"MIT","description":"Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)","_npmUser":{"name":"mattmueller","email":"mattmuelle@gmail.com"},"_id":"cheerio-soupselect@0.0.2","devDependencies":{},"_engineSupported":true,"_npmVersion":"1.0.104","_nodeVersion":"v0.4.11","_defaultsLoaded":true,"dist":{"shasum":"ed81023842bd0109e616c0d044d10c5dc7e1e3ec","tarball":"https://registry.npmjs.org/cheerio-soupselect/-/cheerio-soupselect-0.0.2.tgz","integrity":"sha512-Qa9iOGL+n8DLN3rRteRZAQ1RIGmzDW/qkEy8/J0ibODE0NwXmNcSRk2YDCxPxasw6N/Xe3WA8L/UKnrrfXUSaw==","signatures":[{"keyid":"SHA256:jl3bwswu80PjjokCgh0o2w5c2U4LhQAE57gj9cz1kzA","sig":"MEUCICOynE0vRm/ZqwTI8B6JDm6LR2I4ERXLG5IoMKCkH+rvAiEApRcSf7AhE1wgmMgqoZHHPS/hpRhDa25+IzX6ARvfikA="}]},"maintainers":[{"name":"mattmueller","email":"mattmuelle@gmail.com"}]},"0.0.3":{"name":"cheerio-soupselect","version":"0.0.3","engines":{"node":">=0.2.0"},"author":{"name":"Matt Mueller","email":"mattmuelle@gmail.com"},"url":"http://github.com/harryf/node-soupselect","dependencies":{"htmlparser2":"2.x"},"repository":[{"type":"git","url":"git://github.com/harryf/node-soupselect.git"}],"main":"./lib/soupselect","license":"MIT","description":"Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)","_npmUser":{"name":"mattmueller","email":"mattmuelle@gmail.com"},"_id":"cheerio-soupselect@0.0.3","devDependencies":{},"_engineSupported":true,"_npmVersion":"1.0.103","_nodeVersion":"v0.4.11","_defaultsLoaded":true,"dist":{"shasum":"e498a5d9abc2b77f7d64e586f0849402d2398381","tarball":"https://registry.npmjs.org/cheerio-soupselect/-/cheerio-soupselect-0.0.3.tgz","integrity":"sha512-WWeM3Plnu0WSuEULDvccjgRQV4QXQzRUFA+0Xoj+DGs8WI03ggaiZVC8Gnk31MBrOEfi6doIbPvyyOouw8f8xA==","signatures":[{"keyid":"SHA256:jl3bwswu80PjjokCgh0o2w5c2U4LhQAE57gj9cz1kzA","sig":"MEUCIQCGZx8iwxpkSsfdnCMTHU8TBvlZGtfH1fDgySEq6QGSHQIgZDWcAJBTb5UG2pTRO14bDOW3oqbRtqRWmF4J0ydJodo="}]},"maintainers":[{"name":"mattmueller","email":"mattmuelle@gmail.com"}]},"0.1.0":{"name":"cheerio-soupselect","version":"0.1.0","engines":{"node":">=0.2.0"},"author":{"name":"Matt Mueller","email":"mattmuelle@gmail.com"},"url":"http://github.com/harryf/node-soupselect","dependencies":{"htmlparser2":"2.x"},"contributors":[{"name":"Siddharth Mahendraker","email":"siddharth_mahen@me.com"}],"devDependencies":{"mocha":"0.x"},"repository":[{"type":"git","url":"git://github.com/harryf/node-soupselect.git"}],"main":"./lib/soupselect","scripts":{"test":"mocha -u tdd -R list"},"license":"MIT","description":"Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)","_npmUser":{"name":"mattmueller","email":"mattmuelle@gmail.com"},"_id":"cheerio-soupselect@0.1.0","_engineSupported":true,"_npmVersion":"1.0.103","_nodeVersion":"v0.4.11","_defaultsLoaded":true,"dist":{"shasum":"22a973dab4b89ff3b98592b6caf668e921fb1660","tarball":"https://registry.npmjs.org/cheerio-soupselect/-/cheerio-soupselect-0.1.0.tgz","integrity":"sha512-bo479XQihF1OReCn3uduKilXXz48wFYoCdxdXMpKXoJamX9m1M/x/vdO4C1SUd1gTPVtGfvpPxJIIREPgGv7TQ==","signatures":[{"keyid":"SHA256:jl3bwswu80PjjokCgh0o2w5c2U4LhQAE57gj9cz1kzA","sig":"MEYCIQCAArkfZJv9KZwbsWl4NRRukFI+e02XZBCi8xjc7rNiGgIhAO+4cFWlFM7D9kaODV5IyqYsfpMxBYLFHD4oxr6I+UpB"}]},"maintainers":[{"name":"mattmueller","email":"mattmuelle@gmail.com"}]},"0.1.1":{"name":"cheerio-soupselect","version":"0.1.1","engines":{"node":">=0.2.0"},"author":{"name":"Matt Mueller","email":"mattmuelle@gmail.com"},"url":"http://github.com/harryf/node-soupselect","dependencies":{"htmlparser2":"2.x"},"contributors":[{"name":"Siddharth Mahendraker","email":"siddharth_mahen@me.com"}],"devDependencies":{"mocha":"0.x"},"repository":[{"type":"git","url":"git://github.com/harryf/node-soupselect.git"}],"main":"./lib/soupselect","scripts":{"test":"mocha -u tdd -R list"},"license":"MIT","description":"Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)","_npmUser":{"name":"mattmueller","email":"mattmuelle@gmail.com"},"_id":"cheerio-soupselect@0.1.1","_engineSupported":true,"_npmVersion":"1.0.103","_nodeVersion":"v0.4.11","_defaultsLoaded":true,"dist":{"shasum":"9baa6ab69d3b8cd223167690365a336cb9ff2359","tarball":"https://registry.npmjs.org/cheerio-soupselect/-/cheerio-soupselect-0.1.1.tgz","integrity":"sha512-/Y6HCIerEIDYjOrPIZULglbJf7CjuCEpfXyXwL7mossQoSF53bP3hzUzXyL9qfRF0xT9Y+olYHPfRy43v89bjA==","signatures":[{"keyid":"SHA256:jl3bwswu80PjjokCgh0o2w5c2U4LhQAE57gj9cz1kzA","sig":"MEUCICF0bYxmTgkyN3ZLl+6L3OlcLZ7ZhiRlneb0fw+lr0VkAiEA01lqWOXjkK9Qk2tNiN2G9wwMP0/KKJvTkrY0qZ4eoqM="}]},"maintainers":[{"name":"mattmueller","email":"mattmuelle@gmail.com"}]}},"readme":"node-soupselect\n---------------\n\nA port of Simon Willison's [soupselect](http://code.google.com/p/soupselect/) for use with node.js and node-htmlparser.\n\n    $ npm install soupselect\n\nMinimal example...\n\n    var select = require('soupselect').select;\n    // dom provided by htmlparser...\n    select(dom, \"#main a.article\").forEach(function(element) {//...});\n\nWanted a friendly way to scrape HTML using node.js. Tried using [jsdom](http://github.com/tmpvar/jsdom), prompted by [this article](http://blog.nodejitsu.com/jsdom-jquery-in-5-lines-on-nodejs) but, unfortunately, [jsdom](http://github.com/tmpvar/jsdom) takes a strict view of lax HTML making it unusable for scraping the kind of soup found in real world web pages. Luckily [htmlparser](http://github.com/tautologistics/node-htmlparser/) is more forgiving. More details on this found [here](http://www.reddit.com/r/node/comments/dm0tz/nodesoupselect_for_scraping_html_with_css/c118r23).\n\nA complete example including fetching HTML etc...;\n\n    var select = require('soupselect').select,\n        htmlparser = require(\"htmlparser\"),\n        http = require('http'),\n        sys = require('sys');\n\n    // fetch some HTML...\n    var http = require('http');\n    var host = 'www.reddit.com';\n    var client = http.createClient(80, host);\n    var request = client.request('GET', '/',{'host': host});\n\n    request.on('response', function (response) {\n        response.setEncoding('utf8');\n    \n        var body = \"\";\n        response.on('data', function (chunk) {\n            body = body + chunk;\n        });\n    \n        response.on('end', function() {\n        \n            // now we have the whole body, parse it and select the nodes we want...\n            var handler = new htmlparser.DefaultHandler(function(err, dom) {\n                if (err) {\n                    sys.debug(\"Error: \" + err);\n                } else {\n                \n                    // soupselect happening here...\n                    var titles = select(dom, 'a.title');\n                \n                    sys.puts(\"Top stories from reddit\");\n                    titles.forEach(function(title) {\n                        sys.puts(\"- \" + title.children[0].raw + \" [\" + title.attribs.href + \"]\\n\");\n                    })\n                }\n            });\n\n            var parser = new htmlparser.Parser(handler);\n            parser.parseComplete(body);\n        });\n    });\n    request.end();\n\nNotes:\n\n* Requires node-htmlparser > 1.6.2 & node.js 2+\n* Calls to select are synchronous - not worth trying to make it asynchronous IMO given the use case\n\n","maintainers":[{"name":"mattmueller","email":"mattmuelle@gmail.com"}],"time":{"modified":"2022-06-13T05:56:29.841Z","created":"2011-11-26T04:29:15.654Z","0.0.1":"2011-11-26T04:30:48.954Z","0.0.2":"2011-11-26T04:51:55.232Z","0.0.3":"2011-12-19T09:43:36.718Z","0.1.0":"2012-01-17T03:51:32.175Z","0.1.1":"2012-03-04T05:28:22.708Z"},"author":{"name":"Matt Mueller","email":"mattmuelle@gmail.com"},"repository":[{"type":"git","url":"git://github.com/harryf/node-soupselect.git"}],"users":{"mast4461":true}}