lib/thresher.js
require('shelljs/global');

// node core modules
var events = require('events');
var fs = require('fs');

// local modules
var deps = require('./dependencies.js');

SpookyJS provides our bridge to CasperJS and PhantomJS, which we use for headless page rendering.

var Spooky = require('spooky'); var file = require('./file.js') , Downloader = require('./download.js') , url = require('./url.js') , dom = require('./dom.js') , Ticker = require('./ticker.js') , request = require('request');

Create a new Thresher. A Thresher controls a scraping operation. Thresher handles rendering a page using the chosen rendering engine, passing the HTML of the rendered page back to the Node context, re-rendering it in the local Node jsdom, and running scraperJSON-defined scrapers on the rendered DOM. Thresher emits events during the scraping process: - 'error': if an error occurs - 'element': for each extracted element - 'result': the final result of a single scraping operation - 'rendered': when the HTML of the rendered DOM is returned from PhantomJS

var Thresher = function() { events.EventEmitter.call(this); }

Thresher inherits from EventEmitter

Thresher.prototype.__proto__ = events.EventEmitter.prototype;

Bubble SpookyJS errors up to our interface, providing a clear context message and the SpookyJS message as detail.

Parameters:

  • err must be a String.
    (the SpookyJS error.)
var handleInitError = function(err) { if (err) { var e = new Error('Failed to initialize SpookyJS'); e.details = err; log.error(e); log.debug(e.stack); throw e; } };

Generate SpookyJS settings.

Parameters:

  • loglevel must be a String.
    (the loglevel)

Returns an Object
(the settings)

var settings = function(loglevel) { env['PHANTOMJS_EXECUTABLE'] = deps.getbinpath('phantomjs'); return { child: { command: deps.getbinpath('casperjs') }, casper: { logLevel: loglevel, verbose: true, exitOnError: true, httpStatusHandlers: { 404: function(resource) { emit('error', resource.status + ': ' + resource.url); casper.exit(4); } }, pageSettings: { loadImages: false, loadPlugins: false }, resourceTimeout: 20000, onResourceTimeout: function(e) { emit('resourceTimeout', e.errorCode, e.errorString, e.url); casper.exit(2); }, onLoadError: function(msg, trace) { emit('log', { space: 'remote', message: msg + trace }); emit('loadError', msg, trace); casper.exit(3); } } }; }

Scrape a URL using a ScraperJSON-defined scraper.

Parameters:

  • scrapeUrl must be a String.
    (the URL to scrape)

  • definition must be an Object.
    (a dictionary defining the scraper)

  • headless must be a Boolean.
    (whether to render the page in PhantomJS/CasperJS before scraping;
    if false, the raw HTML is fetched with a plain HTTP request)

Thresher.prototype.scrape = function(scrapeUrl, definition, headless) { log.debug('function scrape: ' + scrapeUrl); var loglevel = 'debug'; // delete this and move away from logging to events

validate arguments

url.checkUrl(scrapeUrl); this.emit('scrapeStart'); var thresher = this; if (headless) {

let's get our scrape on

log.debug('creating spooky instance'); var spooky = new Spooky(settings(loglevel), function() { log.debug('spooky initialising'); spooky.start(scrapeUrl); spooky.then(function() {

in SpookyJS scope

this.emit('pageDownload', this.evaluate(function() {

in rendered page scope

return document.documentElement.outerHTML; })); }); spooky.run(); }); spooky.on('pageDownload', function(html) { thresher.emit('pageRendered', html); log.debug('page downloaded and rendered'); try { var results = thresher.scrapeHtml(html, definition, scrapeUrl); thresher.emit('scrapeResults', results); } catch(e) { log.error('problem scraping html:'); log.error(e.message); log.error(e.stack); } }); spooky.on('404', function (msg, trace) { log.error(msg); var err = new Error(msg); err.stack = trace; throw err; }); spooky.on('resourceTimeout', function (code, string, url) { log.error(code); var err = new Error(code + ' ' + string + " - " + url); throw err; }); if (loglevel === 'debug') { spooky.on('console', function (line) { var parts = line.split(' '); log.debug(parts.slice(1, parts.length).join(' ')); }); spooky.on('log', function (log) { if (log.space === 'remote') { console.log(log.message.replace(/ \- .*/, '')); } }); } } else { var conf = {url: scrapeUrl}; if (this.jar) { conf.jar = jar; } request(conf, function (error, response, body) { if (!error && response.statusCode == 200) { var results = thresher.scrapeHtml(body, definition, scrapeUrl); thresher.emit('scrapeResults', results); } }); } }; Thresher.prototype.loadCookie = function(filepath) { var cookiejson = JSON.parse(fs.readFileSync(filepath)); this.jar = new CookieJar(); jar.add(cookiejson); } module.exports = Thresher;