Nodejs爬虫(来源Hacker News)[跟着狼叔学爬虫]
发布于 8 年前 作者 NextZeus 8977 次浏览 来自 分享
/**
 * Created by lixiaodong on 16/9/6.
 */
var Crawler = require("crawler");
var jsdom = require('jsdom');

// Accumulates one record per story across every page the crawler processes.
var articles = [];

/**
 * Crawler whose per-page callback extracts one record per listed story and
 * appends it to the module-level `articles` array.
 *
 * Each record carries: id, title, url, from, score, username, comment, age, uid.
 *
 * NOTE(review): the selectors look like they target the Hacker News front-page
 * markup — confirm against the page that is actually queued. No `c.queue(...)`
 * call is visible in this excerpt, so nothing is queued here.
 */
var c = new Crawler({
    jQuery: jsdom,
    maxConnections : 100,
    // incomingEncoding: 'gb2312',
    // This will be called for each crawled page
    callback : function (error, result, $) {
        // A failed request leaves no usable `$`; report it instead of crashing
        // on the first selector call.
        if (error || typeof $ !== 'function') {
            console.error('crawl failed:', error);
            return;
        }

        // Records for this page only. Every loop below addresses records by
        // position within the page, so they must not be offset by records
        // collected from earlier pages.
        var pageArticles = [];

        /**
         * Runs `apply(record, $node)` for the i-th match of `selector` against
         * the i-th record of this page, ignoring matches that have no record.
         */
        function eachMatch(selector, apply) {
            var nodes = $(selector);
            for (var i = 0; i < nodes.length && i < pageArticles.length; i++) {
                apply(pageArticles[i], $(nodes[i]));
            }
        }

        var urls = $('.itemlist .athing .title a');

        // Even-indexed anchors supply the title and url of each record.
        for (var i = 0, j = 0; i < urls.length; i += 2, j++) {
            var titleLink = $(urls[i]);
            pageArticles.push({
                id      :   j + 1,
                title   :   titleLink.text(),
                url     :   titleLink.attr('href') + ""
            });
        }

        // Odd-indexed anchors supply `from`; the first 10 characters of the
        // href are dropped (presumably a "from?site=" prefix — verify).
        for (var k = 1, n = 0; k < urls.length && n < pageArticles.length; k += 2, n++) {
            var fromHref = $(urls[k]).attr('href') + "";
            pageArticles[n].from = fromHref.substring(10);
        }

        eachMatch('.itemlist .subtext .score', function (article, node) {
            article.score = node.text();
        });

        eachMatch('.itemlist .subtext a:eq(0)', function (article, node) {
            article.username = node.text();
        });

        eachMatch('.itemlist .subtext a:eq(3)', function (article, node) {
            // Explicit radix so the leading number is always read as decimal.
            article.comment = parseInt(node.text(), 10) + ' comments';
        });

        eachMatch('.itemlist .age a', function (article, node) {
            article.age = parseInt(node.text(), 10) + ' hours ago';
            // The first 8 characters of the href are dropped (presumably an
            // "item?id=" prefix — verify), leaving the item id.
            var ageHref = node.attr('href') + '';
            article.uid = ageHref.substring(8);
        });

        Array.prototype.push.apply(articles, pageArticles);
    }
});



{
  id: 1,
  title: 'When you change the world and no one notices',
  url: '',
  from: '',
  score: '288 points',
  username: 'waqasaday',
  comment: '99 comments',
  age: '6 hours ago',
  uid: '12433365'
}

7 回复


@luoyjx 刚开始学着写,写爬虫也是一个需要耐心的过程

@NextZeus 加油,软件都是慢慢打磨的过程,哈哈

话说HN有官方api,写啥爬虫 From NeoReader


@nihgwu 我就是练练手 行吗

@p412726700 我就是练练手 行吗
