Node.js 爬虫(来源:Hacker News)[跟着狼叔学爬虫]
发布于 8 年前 作者 NextZeus 8533 次浏览 来自 分享
/**
 * Created by lixiaodong on 16/9/6.
 */
var Crawler = require("crawler");
var jsdom = require('jsdom');

// Every article scraped so far, in the order the rows were processed.
// The crawler callback below appends to this array.
var articles = [];

/**
 * Returns the part of `href` that follows its first '=' character, or the
 * whole string when it contains no '='.
 *
 * Used to pull the value out of relative links; presumably these look like
 * "from?site=<domain>" and "item?id=<id>" (the previous version skipped a
 * fixed 10 and 8 characters respectively) — verify against the live markup.
 *
 * @param {string} href - link target to inspect.
 * @returns {string} the text after the first '='.
 */
function queryValue(href) {
    return href.substring(href.indexOf('=') + 1);
}

/**
 * Normalises the text of a comments link to "<n> comments".
 *
 * @param {string} text - link text, expected to start with a number.
 * @returns {string} "<n> comments"; text with no leading number counts as 0
 *     instead of producing "NaN comments".
 */
function formatCommentCount(text) {
    var count = parseInt(text, 10);
    return (Number.isNaN(count) ? 0 : count) + ' comments';
}

var c = new Crawler({
    jQuery: jsdom,
    maxConnections : 100,
    forceUTF8:true,
    /**
     * Called once for each crawled page. Builds one article object per
     * `.athing` row and appends it to `articles`.
     *
     * @param {*} error - truthy when the request failed.
     * @param {*} result - response object (unused here).
     * @param {Function} $ - jQuery handle bound to the fetched document.
     */
    callback : function (error, result, $) {
        // Without a document there is nothing to query; report and bail out.
        if (error || !$) {
            console.error('Failed to crawl page:', error);
            return;
        }

        var rows = $('.itemlist .athing');
        // Assumes the i-th `.subtext` cell belongs to the i-th `.athing` row,
        // as the index-based pairing in the earlier version did.
        var subtexts = $('.itemlist .subtext');

        rows.each(function (index, row) {
            // Links are looked up per row, so a story without a site link
            // cannot shift the title/site pairing of the rows after it.
            var titleLinks = $(row).find('.title a');
            var titleLink = titleLinks.eq(0);

            var article = {
                // Continues numbering across pages instead of restarting at 1.
                id  :   articles.length + 1,
                title   :   titleLink.text(),
                url :   titleLink.attr('href') + ''
            };

            // The second link in the title cell, when present, is the site link.
            if (titleLinks.length > 1) {
                article.from = queryValue(titleLinks.eq(1).attr('href') + '');
            }

            var subtext = subtexts.eq(index);
            if (subtext.length) {
                // `.eq()` is applied to this row's own links. The old
                // `a:eq(n)` selector was positional over the entire page and
                // therefore matched a single element in total.
                var metaLinks = subtext.find('a');

                var scoreText = subtext.find('.score').text();
                if (scoreText) {
                    article.score = scoreText;
                }

                article.username = metaLinks.eq(0).text();
                article.comment = formatCommentCount(metaLinks.eq(3).text());

                var ageLink = subtext.find('.age a');
                if (ageLink.length) {
                    // Keep the unit from the page; it is not always hours.
                    article.age = ageLink.text().trim();
                    article.uid = queryValue(ageLink.attr('href') + '');
                }
            }

            articles.push(article);
        });

        console.log(articles[0]);
    }
});

c.queue('https://news.ycombinator.com/news?p=1');

// Sample result: the first entry of `articles` (remaining entries omitted)
[{ 
  id: 1,
  title: 'When you change the world and no one notices',
  url: 'http://www.collaborativefund.com/blog/when-you-change-the-world-and-no-one-notices/',
  from: 'collaborativefund.com',
  score: '288 points',
  username: 'waqasaday',
  comment: '99 comments',
  age: '6 hours ago',
  uid: '12433365' 
},,,,]

7 回复

学的话写成job,大家都来写就写成一个系列啦,哈哈

@luoyjx 刚开始学着写,写爬虫也是一个需要耐心的过程

@NextZeus 加油,软件都是慢慢打磨的过程,哈哈

话说HN有官方api,写啥爬虫 From NeoReader

话说HN有官方api,写啥爬虫

@nihgwu 我就是练练手 行吗

@p412726700 我就是练练手 行吗

回到顶部