How to scrape web pages with PhantomJS and jQuery
JavaScript posted 9 months ago by christian
This is an example of how to scrape the web using PhantomJS and jQuery:
/**
 * Multi-step web scraper for PhantomJS that uses jQuery inside the page.
 *
 * Flow:
 *   step 0 (initialize)   - fill in the search form and submit it
 *   step 1 (parseResults) - log the href of every link in the search result
 *
 * Each step runs after a page load has finished. A step decides what runs
 * after the NEXT load by assigning `nextStep`; leaving it null ends the run.
 *
 * Written in ES5 style (var, string concatenation) like the original script.
 * NOTE(review): confirm the JavaScript level of your PhantomJS build before
 * modernising the syntax.
 * NOTE(review): newer PhantomJS documentation creates pages with
 * require('webpage').create(); `new WebPage()` is kept from the original.
 */
var page = new WebPage(),
    url = 'http://localhost/a-search-form',
    // Must be saved in the same folder as this script.
    jQueryFile = 'jquery-1.6.1.min.js',
    stepIndex = 0,
    // Step to run when the next page load finishes. Kept in a plain variable
    // instead of an ad-hoc property on the `phantom` host object.
    // (Function declarations are hoisted, so `initialize` is already defined.)
    nextStep = initialize;

/**
 * Forwards console messages emitted inside the web page to PhantomJS' own
 * console. Only the message text is printed.
 *
 * @param {string} msg    The console message.
 * @param {number} line   Line number (unused).
 * @param {string} source Source identifier (unused).
 */
page.onConsoleMessage = function (msg, line, source) {
    console.log('console> ' + msg);
};

/**
 * Logs JavaScript alerts raised inside the web page.
 *
 * @param {string} msg The alert message.
 */
page.onAlert = function (msg) {
    console.log('alert!!> ' + msg);
};

/**
 * Our "event loop": runs the pending step every time a page load finishes.
 *
 * Registered on onLoadFinished rather than passed to page.open(), because the
 * form submitted in step 0 triggers a second load that must be handled too.
 *
 * @param {string} status 'success' when the page loaded; anything else is
 *                        treated as a failure.
 */
page.onLoadFinished = function (status) {
    var step = nextStep;

    // Ignore stray load events once there is nothing left to run.
    if (typeof step !== 'function') {
        return;
    }

    if (status !== 'success') {
        // Without an explicit exit the PhantomJS process would hang forever.
        console.log('Failed to load page at step "' + stepIndex + '" (status: ' + status + ')');
        phantom.exit(1);
        return;
    }

    console.log('============================================');
    console.log('Step "' + stepIndex + '"');
    console.log('============================================');

    // Inject jQuery for scraping; every freshly loaded page needs it again.
    if (!page.injectJs(jQueryFile)) {
        console.log('Could not inject "' + jQueryFile + '"');
        phantom.exit(1);
        return;
    }

    // Clear the pending step first so that a step which does not schedule a
    // successor ends the run.
    nextStep = null;
    step();

    // Save screenshot for debugging purposes.
    page.render('step' + stepIndex + '.png');
    stepIndex += 1;

    // Exit only after the screenshot has been written.
    if (!nextStep) {
        phantom.exit();
    }
};

// Kick off the first load; onLoadFinished takes it from here.
page.open(url);

/**
 * Step 0: fills in the search form, submits it and schedules parseResults
 * for the page load triggered by the submission.
 */
function initialize() {
    page.evaluate(function () {
        $('form#search input.query').val('Jebus saves');
        $('form#search').submit();
        console.log('Searching...');
    });
    // The search result handler is the next step.
    nextStep = parseResults;
}

/**
 * Step 1: logs the href of every link inside #search-result.
 *
 * Does not assign `nextStep`, so the load handler exits PhantomJS afterwards.
 * If there was a 3rd step, assign it to `nextStep` here; it runs once the
 * page triggers another load.
 */
function parseResults() {
    page.evaluate(function () {
        $('#search-result a').each(function (index, link) {
            console.log($(link).attr('href'));
        });
        console.log('Parsed results');
    });
}
声明:本站所有文章,如无特殊说明或标注,均为本站原创发布。任何个人或组织,在未征得本站同意时,禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。如若本站内容侵犯了原著者的合法权益,可联系我们进行处理。
评论(0)