How to scrape web pages with PhantomJS and jQuery


JavaScript posted 9 months ago by christian

This is an example of how to scrape the web using PhantomJS and jQuery:

   1  var page = new WebPage(),
   2      url = 'http://localhost/a-search-form',
   3      stepIndex = 0;
   4 
   5  /**
   6   * From PhantomJS documentation:
   7   * This callback is invoked when there is a JavaScript console. The callback may accept up to three arguments: 
   8   * the string for the message, the line number, and the source identifier.
   9   */
  10  page.onConsoleMessage = function (msg, line, source) {
  11      console.log('console> ' + msg);
  12  };
  13 
  14  /**
  15   * From PhantomJS documentation:
  16   * This callback is invoked when there is a JavaScript alert. The only argument passed to the callback is the string for the message.
  17   */
  18  page.onAlert = function (msg) {
  19      console.log('alert!!> ' + msg);
  20  };
  21 
  22  // Callback is executed each time a page is loaded...
  23  page.open(url, function (status) {
  24    if (status === 'success') {
  25      // State is initially empty. State is persisted between page loads and can be used for identifying which page we're on.
  26      console.log('============================================');
  27      console.log('Step "' + stepIndex + '"');
  28      console.log('============================================');
  29 
  30      // Inject jQuery for scraping (you need to save jquery-1.6.1.min.js in the same folder as this file)
  31      page.injectJs('jquery-1.6.1.min.js');
  32 
  33      // Our "event loop"
  34      if(!phantom.state){
  35        initialize();
  36      } else {
  37        phantom.state();
  38      }
  39 
  40      // Save screenshot for debugging purposes
  41      page.render("step" + stepIndex++ + ".png");
  42    }
  43  });
  44 
  45  // Step 1
  46  function initialize() {
  47    page.evaluate(function() {
  48      $('form#search input.query').val('Jebus saves');
  49      $('form#search').submit();
  50      console.log('Searching...');
  51    });
  52    // Phantom state doesn't change between page reloads
  53    // We use the state to store the search result handler, ie. the next step
  54    phantom.state = parseResults;
  55  }
  56 
  57  // Step 2
  58  function parseResults() {
  59    page.evaluate(function() {
  60      $('#search-result a').each(function(index, link) {
  61        console.log($(link).attr('href'));
  62      })
  63      console.log('Parsed results');
  64    });
  65    // If there was a 3rd step we could point to another function
  66    // but we would have to reload the page for the callback to be called again
  67    phantom.exit();
  68  }
声明:本站所有文章,如无特殊说明或标注,均为本站原创发布。任何个人或组织,在未征得本站同意时,禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。如若本站内容侵犯了原著者的合法权益,可联系我们进行处理。