新手用 Node.js 写的一个小爬虫,用来更新已采集的网盘 URL 链接地址,大家一起来优化一下
发布于 7 年前 作者 xinhua51 6928 次浏览 来自 分享

先上代码

var cheerio = require("cheerio");
var http = require('http');
var mysql = require('mysql');
var async = require("async");	
var request = require('request');
// Concurrency cap handed to async.mapLimit: how many rows are passed to
// fetchUrl at the same time.
var LIMIT = 30;
// Maximum number of pan_record rows selected per run of task (SQL LIMIT).
var sqllimit =1000;
// Interval between runs of task, in milliseconds (1000*60*1 = one minute);
// passed to setInterval.
var time =1000*60*1;
// Number of fetchUrl calls currently in flight; fetchUrl increments it on
// entry and decrements it right before invoking its callback.
var concurrencyCount = 0;

// Single MySQL connection shared by every query in this script.
// NOTE(review): host and credentials are hard-coded here — consider reading
// them from configuration or the environment.
var connection = mysql.createConnection({
host: '127.0.0.1',
user: 'root',
password: 'root',
database:'111',
// NOTE(review): `useConnectionPooling` does not appear to be an option of
// mysql.createConnection (pooling is done with mysql.createPool) — confirm
// whether this line has any effect.
useConnectionPooling: true
});
/**
 * Fetches the share page of one pan_record row from pan.baidu.com, extracts
 * its <title> and upserts a matching row into pan_check.
 *
 * Steps: a HEAD request to "/" obtains a session cookie, a GET request to
 * "/s/<shorturl>" downloads the page, and the title is written with an
 * INSERT ... ON DUPLICATE KEY UPDATE statement.
 *
 * The callback fires exactly once, after BOTH of these have happened:
 *   - the network + database work has finished (successfully, with an error,
 *     or by hitting the timeout), and
 *   - a random pause of 0-1999 ms has elapsed (throttles the crawl).
 * Tying completion to the real work is what makes the caller's concurrency
 * limit meaningful. Failures are logged and never reported through the
 * callback, so one bad URL does not abort the whole batch.
 *
 * @param {Object} row - record with `shorturl`, `rid` and `shareid` fields.
 * @param {Function} callback - node-style callback, always invoked as
 *     callback(null, '/s/<shorturl> html content').
 */
var fetchUrl = function (row, callback) {
    var FETCH_TIMEOUT_MS = 15000;
    var USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36';
    var delay = parseInt((Math.random() * 10000000) % 2000, 10);
    var url = '/s/' + row.shorturl;
    var outstanding = 2;        // the fetch/store work + the throttle timer
    var workSettled = false;    // guards against settling the work twice
    var activeRequest = null;   // request to tear down if the watchdog fires
    var watchdog = null;

    concurrencyCount++;
    console.log('现在的并发数是', concurrencyCount, ',正在抓取的是', url, ',耗时' + delay + '毫秒');

    // Called once by the throttle timer and once by the work; the second
    // call frees the concurrency slot.
    function release() {
        outstanding--;
        if (outstanding === 0) {
            concurrencyCount--;
            callback(null, url + ' html content');
        }
    }

    // Marks the fetch/store work as finished (idempotent).
    function settleWork() {
        if (workSettled) {
            return;
        }
        workSettled = true;
        clearTimeout(watchdog);
        release();
    }

    // Logs a failure and finishes the work; later errors for the same row
    // (e.g. the socket error caused by destroying a timed-out request) are
    // ignored because the failure has already been reported.
    function fail(err) {
        if (workSettled) {
            return;
        }
        console.log(err.message);
        settleWork();
    }

    // Parses the complete page body and stores the title.
    function savePage(html) {
        if (workSettled) {
            return;
        }
        var title = cheerio.load(html)('title').text();
        // Placeholders let the driver escape every value, not just the title.
        var query = connection.query(
            'INSERT INTO pan_check (`crid`,`share_id`,`count`, `status`,`is_updated`,`title`) VALUES (?,?,?,?,?,?)' +
            ' ON DUPLICATE KEY UPDATE count=count+1, is_updated=0 ',
            [row['rid'], row['shareid'], 0, 0, 0, title],
            function (err) {
                if (err) {
                    console.log(err);
                }
                settleWork();
            }
        );
        console.log('BODY: ' + query.sql);
    }

    // Downloads the share page, buffering the whole body before parsing so
    // a <title> split across TCP chunks is still found.
    function fetchPage(cookie) {
        var headers = {
            'User-Agent': USER_AGENT,
            'Accept': '*/*'
        };
        if (cookie) {
            headers['Cookie'] = cookie;
        }
        activeRequest = http.get({
            host: 'pan.baidu.com',
            port: 80,
            // encodeURI keeps whitespace/control characters in shorturl from
            // corrupting the request line.
            path: encodeURI(url),
            headers: headers
        }, function (response) {
            var chunks = [];
            response.on('data', function (chunk) {
                chunks.push(chunk);
            });
            response.on('end', function () {
                savePage(Buffer.concat(chunks).toString());
            });
            response.on('error', fail);
        });
        activeRequest.on('error', fail);
    }

    // Step 1: HEAD request whose only purpose is to pick up a cookie.
    activeRequest = http.request({
        host: 'pan.baidu.com',
        port: 80,
        path: '/',
        method: 'HEAD'
    }, function (response1) {
        var setCookie = response1.headers['set-cookie'];
        var cookie = '';
        if (response1.statusCode === 200 && setCookie) {
            // Send back only "name=value", not the cookie's attributes.
            cookie = setCookie[0].split(';')[0];
        }
        response1.on('error', function (e) {
            console.log(e.message);
        });
        response1.resume();
        if (!workSettled) {
            fetchPage(cookie);
        }
    });
    activeRequest.on('error', fail);
    activeRequest.end();

    // Overall deadline so a stalled server can never hold a slot forever.
    watchdog = setTimeout(function () {
        var stalled = activeRequest;
        fail(new Error('fetch timed out: ' + url));
        if (stalled) {
            stalled.destroy();
        }
    }, FETCH_TIMEOUT_MS);

    setTimeout(release, delay);
};

// Open the shared MySQL connection once at startup. No callback is passed,
// so a failure to connect is not handled here.
connection.connect();
// Query

// True while a batch started by task() has not finished yet. The scheduler
// fires on a fixed interval, so without this flag a slow batch would overlap
// with the next one and process the same rows twice.
var taskRunning = false;

/**
 * Runs one crawl batch: finds the highest crid already present in pan_check,
 * selects up to `sqllimit` pan_record rows with rid >= that value, and passes
 * every row that has a non-empty shorturl to fetchUrl, at most `LIMIT` at a
 * time.
 *
 * Query errors are logged and end the batch; the next scheduled run retries.
 * If the previous batch is still in progress the call is a no-op.
 */
var task = function () {
    if (taskRunning) {
        console.log('previous batch still running, skipping this run');
        return;
    }
    taskRunning = true;

    var finish = function () {
        taskRunning = false;
    };

    connection.query('select max(crid) as max_id from pan_check', function (err, rows) {
        if (err) {
            // Do not call connection.connect() again here: the connection
            // object is already connected (or fatally broken), and `rows`
            // is undefined, so there is nothing to continue with.
            console.log(err);
            finish();
            return;
        }

        // MAX() yields NULL on an empty table; start from 0 in that case
        // because "rid >= NULL" matches no rows at all.
        var maxId = (rows && rows[0] && rows[0]['max_id']) || 0;
        var selectSql = ' SELECT  * FROM pan_record  where rid >=? order by rid asc limit ?';

        connection.query(selectSql, [maxId, sqllimit], function (err, rows) {
            if (err) {
                console.log(err);
                finish();
                return;
            }

            var urls = (rows || []).filter(function (row) {
                return row.shorturl != undefined && row.shorturl != '';
            });

            async.mapLimit(urls, LIMIT, function (url, callback) {
                fetchUrl(url, callback);
            }, function (err, result) {
                console.log('final:');
                finish();
            });
        });
    });
};

// Run one batch immediately, then start another every `time` milliseconds.
task();
setInterval(task , time);

目前数据库中已经采集了几千万条数据,需要定期对全部或部分数据进行扫描,检验链接的有效性。怎么优化程序才能让执行效率更高?这里做个小广告:大家可以用小白盘来搜 Node.js 的学习资料 http://www.xiaobaipan.com 百度网盘搜索

4 回复

能贴上完整的代码吗

这个就是所有的代码啊

回到顶部