新手用 Node.js 写的一个小爬虫,用于定期采集并更新网盘分享链接(URL)的有效性,贴出来请大家一起帮忙优化。
先上代码
// HTML parser with a jQuery-like API; used below to read <title> from fetched pages.
var cheerio = require("cheerio");
var http = require('http');
var mysql = require('mysql');
// Provides mapLimit() for bounded-concurrency crawling.
var async = require("async");
// NOTE(review): `request` is imported but never used in this file — confirm before removing.
var request = require('request');
// Upper bound on links fetched concurrently via async.mapLimit.
var LIMIT = 30;
// Rows pulled from pan_record per scan pass.
var sqllimit =1000;
// Interval between scan passes: 1 minute, in milliseconds.
var time =1000*60*1;
// Count of fetches currently in flight (used for logging only).
var concurrencyCount = 0;
// Single shared MySQL connection used by all queries in this script.
var connection = mysql.createConnection({
host: '127.0.0.1',
user: 'root',
password: 'root',
database:'111',
// NOTE(review): `useConnectionPooling` is not a documented createConnection()
// option of the `mysql` package — presumably ignored; verify or use createPool().
useConnectionPooling: true
});
/**
 * Fetch one pan.baidu.com share page and upsert its title into pan_check.
 *
 * @param {Object} row - a pan_record row; reads row.shorturl, row.rid, row.shareid.
 * @param {Function} callback - async.mapLimit callback; invoked exactly once,
 *   when the fetch+insert actually finishes (or errors/times out), so LIMIT
 *   genuinely bounds concurrency. Always called with (null, message) — errors
 *   are logged, not propagated, so one bad link never aborts the whole batch.
 */
var fetchUrl = function (row, callback) {
    concurrencyCount++;
    var url = '/s/' + row.shorturl;
    console.log('现在的并发数是', concurrencyCount, ',正在抓取的是', url);
    // Guarantee the callback fires exactly once no matter which code path
    // (success, socket error, request error, timeout) reaches it first.
    var finished = false;
    var timer = null;
    var done = function (message) {
        if (finished) { return; }
        finished = true;
        if (timer) { clearTimeout(timer); }
        concurrencyCount--;
        callback(null, message);
    };
    // Safety net: release the concurrency slot if the request hangs.
    timer = setTimeout(function () {
        done(url + ' timeout');
    }, 30000);
    // Step 1: HEAD the front page just to obtain a session cookie.
    var options1 = {
        host: 'pan.baidu.com',
        port: 80,
        path: '/',
        method: 'HEAD'
    };
    var req1 = http.request(options1, function (response1) {
        var cookie = '';
        if (response1.statusCode == 200 && response1.headers['set-cookie']) {
            cookie = response1.headers['set-cookie'][0];
        }
        // Step 2: raw-socket GET of the share page, carrying the cookie.
        var net = require('net');
        var client = new net.Socket();
        var body = '';
        client.connect(80, 'pan.baidu.com', function () {
            // `Connection: close` makes the server end the response, so the
            // 'close' event below marks "full body received".
            var h = 'GET ' + url + ' HTTP/1.1\r\nHost: pan.baidu.com\r\nConnection: close\r\nCookie: ' + cookie + '\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36\r\nAccept: */*\r\n\r\n';
            client.write(h);
        });
        client.on('error', function (err) {
            console.log(err);
            client.destroy();
            done(url + ' socket error');
        });
        // 'data' fires once per TCP chunk — accumulate instead of parsing the
        // first chunk (which would hand cheerio a truncated document).
        client.on('data', function (chunk) {
            body += chunk;
        });
        client.on('close', function () {
            var $ = cheerio.load(body);
            row['title'] = $("title").text();
            row['count'] = 0;
            row['status'] = 0;
            row['is_updated'] = 0;
            // Parameterized query: the driver escapes every value, so the
            // title (and any other field) cannot break or inject into the SQL.
            var sql1 = "INSERT INTO pan_check (`crid`,`share_id`,`count`, `status`,`is_updated`,`title`) VALUES (?,?,?,?,?,?) ON DUPLICATE KEY UPDATE count=count+1, is_updated=0 ";
            var values = [row['rid'], row['shareid'], row['count'], row['status'], row['is_updated'], row['title']];
            connection.query(sql1, values, function (err) {
                if (err) {
                    console.log(err);
                }
                done(url + ' html content');
            });
        });
    }).on('error', function (e) {
        console.log(e.message);
        done(url + ' request error');
    });
    req1.end();
};
// Connect eagerly and log a clear message if MySQL is unreachable, instead of
// letting the failure surface obscurely on the first query.
connection.connect(function (err) {
    if (err) {
        console.log('MySQL connect error: ' + err.message);
    }
});
/**
 * One scan pass: resume from the highest crid already checked, pull the next
 * `sqllimit` pan_record rows, and fetch each one with bounded concurrency.
 * On any query error it logs and returns — the next setInterval tick retries.
 * (The original called connection.connect() again on error, which is invalid
 * on an already-connected or fatally-errored connection object.)
 */
var task = function () {
    // IFNULL(...,0): with an empty pan_check table, MAX(crid) is NULL and
    // `rid >= NULL` would match nothing; start from 0 instead.
    connection.query('select IFNULL(max(crid), 0) as max_id from pan_check', function (err, rows) {
        if (err) {
            console.log(err);
            return;
        }
        var max_id = rows[0]['max_id'];
        // Placeholders let the driver escape both values safely.
        var select_sql = 'SELECT * FROM pan_record where rid >= ? order by rid asc limit ?';
        connection.query(select_sql, [max_id, sqllimit], function (err2, records) {
            if (err2) {
                console.log(err2);
                return;
            }
            if (!records) {
                return;
            }
            // Keep only rows that actually have a short URL to check.
            var urls = [];
            for (var i = 0; i < records.length; i++) {
                if (records[i].shorturl != undefined && records[i].shorturl != '') {
                    urls.push(records[i]);
                }
            }
            // At most LIMIT fetches in flight at once.
            async.mapLimit(urls, LIMIT, function (record, cb) {
                fetchUrl(record, cb);
            }, function (err3, result) {
                console.log('final:');
            });
        });
    });
};
// Run the first scan immediately, then repeat one pass every `time` ms.
task();
setInterval(task, time);
目前数据库中已经采集了几千万条数据,需要定期对全部数据或部分数据进行扫描,检验链接的有效性。怎么优化程序才能让执行效率更高? 这里做个小广告: 大家可以用小白盘来搜 nodejs 的学习资料 http://www.xiaobaipan.com 百度网盘搜索
4 回复
能贴上完整的代码吗
这个就是所有的代码啊
dd