Sharing some web crawler code
Seeing how many people here are interested in crawlers, I'm posting the crawler code I wrote a while back for a scraping site (the one from my earlier post 发布一个用nodejs建的小站, "announcing a small site built with nodejs"). The site runs on the geddy framework, but it is offline for the moment because of domain ICP-filing issues.

The crawler uses async for task scheduling, iconv-lite to decode the gb2312 pages, request to fetch page content, cheerio to parse the fetched markup, and gm to generate thumbnails. I deleted some of the processing code but kept the complete flow, so it will not run as-is. The code is very light on comments; I like to think good code is the best comment. Not that I am claiming mine is good, I am a JS novice and simply never picked up the habit.

While I am at it, a question: how do you generate a thumbnail entirely in memory? I have read the gm docs many times without finding a way, and the other image-processing libraries on GitHub would not work under Windows, so for now I save to a temp file and delete it afterwards.
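If newer versions of the gm module really do expose a toBuffer() method (I have not been able to verify this on my setup), the temp file could go away entirely. A minimal, unverified sketch; thumbnailInMemory is just an illustrative name:

var gm=require('gm');

// unverified sketch: resize to the given width and get the result back as
// a Buffer instead of going through a temp file on disk
function thumbnailInMemory(data,width,done){
gm(data)
.resize(width) // height omitted, so the aspect ratio is preserved
.toBuffer('JPG',function(err,buf){
if(err) return done(err);
done(null,buf); // buf could be stored as image.thumbnail directly
});
}

The full crawler follows.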
var async = require('async');
var iconv=require('iconv-lite');
var request=require('request');
var cheerio=require('cheerio');
var querystring=require('querystring');
var Buffer=require('buffer').Buffer;
var gm=require('gm');
var fs=require('fs');
// timestamped console logger (leans on geddy's date helper)
var log=function(str){
var time=geddy.date.strftime(new Date(), '%Y.%m.%d %H:%M:%S');
console.log(time+': '+str);
};
// task queues: posts and users are fetched one at a time, images five at
// a time, with a pause after each task so the crawl stays polite
var postQueue=async.queue(function(task,callback){
log('get post ---> board: ' + task.board + ' file: ' +task.file);
getPost(task.board,task.file,task.replyCount);
setTimeout(callback,5000);
},1);
var userQueue=async.queue(function(task,callback){
log('get user ---> userid: ' + task.userid);
getUser(task.userid);
setTimeout(callback,5000);
},1);
var imageQueue=async.queue(function(task,callback){
log('get image ---> url: ' + task.url );
getImage(task.id,task.url);
setTimeout(callback,2000);
},5);
var cookie='';
// log in and keep the session cookie for later authenticated requests;
// my credentials and the site's base URL are stripped from this code
function login(){
var qs=querystring.stringify({
id:'',
pw:'',
xml:1
});
request.get('bbslogin?'+qs,{encoding: null},function(err,res,data){
if(!err&&res.statusCode==200){
var xml=iconv.decode(data,'gb2312');
var $=cheerio.load(xml);
var utmpnum= $('utmpnum').text();
var utmpuserid=$('utmpuserid').text();
var utmpkey=$('utmpkey').text();
cookie='utmpnum='+utmpnum+'; utmpuserid='+utmpuserid+'; utmpkey='+utmpkey;
log("login success");
}
})
}
// poll the top-10 list and queue every post that is new or whose reply
// count has changed since the last crawl
function getPosts(){
request.get('posttop10.xml',{ encoding: null },function(err,res,data){
if(!err&&res.statusCode==200){
var xml=iconv.decode(data,'gb2312');
var $=cheerio.load(xml);
var updatetime=$('updatetime').text();
$('post').each(function(i,item){
if(i>2)return; // only look at the first three of the top ten
var board=$(this).children('board').text();
var file=$(this).children('file').text();
var replyCount=parseInt( $(this).children('reply_count').text());
geddy.model.Post.first({board:board, file:file},function (err, post) {
if(null == post || (post.replyCount!=null&&post.replyCount!=replyCount)){
log('add post task ---> board: '+board + ', file: ' +file);
postQueue.push({
board:board,
file:file,
replyCount:replyCount
})
}else{
// nothing new: just refresh the crawl time
post.crawlTime=new Date();
post.save();
}
});
})
}
});
}
// the real processing logic was removed before posting; these stubs keep
// the overall flow intact
function replaceEmotion(content) {
return content;
}
function purifyContent(content) {
return content;
}
function getPostTime(content) {
var postTime = new Date();
return postTime;
}
function replace(content) {
return content;
}
// fetch one post: the first <article> is the post itself, the rest are
// replies; image links get rewritten and images and authors get queued
function getPost(board,file,replyCount){
var qs=querystring.stringify({
board:board,
file:file,
xml:1
});
var r = request.defaults({ encoding: null, headers: { cookie: cookie} });
r.get('bbsnewtcon?'+qs,function(err,res,data){
if(err)
{
log(err);
return;
}
if(res.statusCode==403){
//login();
return;
}
if(res.statusCode==200){
var xml=iconv.decode(data,'gb2312');
if(xml.indexOf('<error>')!=-1){ // the server sent an error page; re-login and drop this task
login();
return;
}
var $=cheerio.load(xml);
var postid,posttitle; // set by floor 0, referenced by the replies
var floor=0;
async.eachSeries($('article'),function(item,callback1){
//log($(item).text());
var title=$(item).children('title').html();
var content=$(item).children('content').text();
var userid=$(item).children('owner').text();
var filename=$(item).children('filename').text();
var crawlTime=new Date();
var replyTo=null;
if(floor>0){
// strip the quoted-reply header and remember who is being replied to
var reg=/【 在 (\w+) [\w\W]*的大作中提到: 】/;
var r=content.match(reg);
if(r!=null){
replyTo=r[1];
content=content.replace(reg,'');
}
}
content = replace(content);
var postTime = getPostTime(content);
var $$=cheerio.load(content);
// rewrite each image link to the local /images/ route and queue the
// original URL for download
async.each($$('a'),function(item,callback){
var tmp= $$(item).html().toLowerCase();
if (tmp.indexOf(".gif") != -1
|| tmp.indexOf(".jpg") != -1
|| tmp.indexOf(".jpeg") != -1
|| tmp.indexOf(".bmp") != -1
|| tmp.indexOf(".png") != -1) {
var url=$$(item).html();
var crawlTime=new Date();
geddy.model.Image.first({url:url} , function(err,image){
if(null==image){
image = geddy.model.Image.create({
url:url
})
}
image.save(function(err){
log('add image task ---> id: '+image.id + ', url: ' +url);
imageQueue.push({
id:image.id,
url:url
});
var src='/images/'+ image.id;
$$(item).html('<div/><img class="img" src="'+ src +'" />');
$$(item).attr('href',src+'.jpg').attr('alt',src+'.jpg');
callback();
});
});
}
else{
callback();
}
},function(err){
content=$$.html();
content = purifyContent(content);
if(floor==0){
//replyCount=$('article').length-1;
geddy.model.Post.first({board:board,file:file} , function(err,post){
if(null==post){
post = geddy.model.Post.create({
board : board ,
file : file ,
title : title
});
geddy.model.User.first({userid:userid},function (err, user) {
if(err) return;
if(null!=user && user.crawlTime.getDate()== new Date().getDate()) return; // already crawled today
userQueue.push({
userid:userid
});
});
}
post.userid = userid;
post.postTime = postTime;
post.crawlTime=crawlTime;
post.title = title;
post.content = content;
post.replyCount = replyCount;
post.save(function(err){
postid=post.id;
posttitle=post.title;
floor++;
callback1();
})
});
}
else{
geddy.model.Comment.first({board:board,file:filename} , function(err,comment){
if(null==comment){
comment = geddy.model.Comment.create({
board : board ,
file : filename
});
geddy.model.User.first({userid:userid},function (err, user) {
if(err) return;
if(null!=user && user.crawlTime.getDate()== new Date().getDate()) return; // already crawled today
userQueue.push({
userid:userid
});
});
}
comment.postid=postid;
comment.posttitle=posttitle;
comment.replyTo=replyTo;
comment.userid = userid;
comment.postTime = postTime;
comment.crawlTime=crawlTime;
comment.title = title;
comment.content = content;
comment.floor = floor;
comment.save(function(err){
floor++;
callback1();
});
});
}
})
},function(err){
if(!err)
log('got post ---> board: ' + board + ' file: ' +file);
else
log('got post error ---> board: ' + board + ' file: ' +file);
})
}
})
}
// fetch a user's profile, upsert it, then queue their avatar image
function getUser(userid){
var qs=querystring.stringify({
userid:userid,
xml:1
});
request.get('bbsqry?'+qs,{ encoding: null },function(err,res,data){
if(err || res.statusCode!=200){
//userQueue.push({
// userid:userid
//});
return;
}
if(!err && res.statusCode==200){
var xml=iconv.decode(data,'gb2312');
if(xml.indexOf('<error>')!=-1) return;
var $=cheerio.load(xml);
$=cheerio.load($('userinfo').html()); // re-parse just the userinfo fragment
var userid=$('userid').text();
var nick=$('nick').text();
nick=replaceEmotion(nick);
var horoscope=$('horoscope').text();
// rewrite the Chinese date (年/月/日) into a parseable form
var lastloginstr=$('lastlogin').text().substring(0,19).replace('年', '-').replace('月', '-').replace('日', ' ');
var lastlogin=geddy.date.parse(lastloginstr);
var strposts=$('strposts').text();
var strnetage=$('strnetage').text();
var strexp=$('strexp').text();
var strmoney=$('strmoney').text();
var strmedals=$('strmedals').text();
var duty=$('duty').text();
var individual=$('individual').text();
var plans=$('plans').text();
plans=replace(plans);
var numlogins=parseInt( $('numlogins').text());
var gender=parseInt( $('gender').text());
var newmail=parseInt( $('newmail').text());
var numposts=parseInt( $('numposts').text());
var netage=parseInt( $('netage').text());
var life=parseInt( $('life').text());
var exp=parseInt( $('exp').text());
var money=parseInt( $('money').text());
var medals=parseInt( $('medals').text());
var crawlTime=new Date();
geddy.model.User.first({userid:userid},function (err, user) {
if(null==user){
user = geddy.model.User.create({
userid : userid
})
}
user.nick = nick ;
user.horoscope= horoscope;
user.lastlogin = lastlogin;
user.strposts =strposts;
user.strnetage = strnetage;
user.strexp = strexp;
user.strmoney = strmoney;
user.strmedals =strmedals;
user.duty = duty;
user.individual = individual;
user.plans = plans;
user.numlogins = numlogins;
user.gender = gender;
user.newmail = newmail;
user.numposts = numposts;
user.netage = netage;
user.life = life;
user.exp = exp;
user.money = money;
user.medals = medals;
user.crawlTime = crawlTime;
user.save(function(err){
log('got user ---> userid: '+userid);
// queue the avatar; no id, so getImage stores it by url instead
var url='faceimg/'+userid.substring(0,1).toUpperCase()+'/'+userid+'.jpg';
imageQueue.push({
url:url
})
});
});
}
});
}
// download an image: with an id it is a post image, without one it is a
// user avatar keyed by url
function getImage(id,url){
var r = request.defaults({ encoding: null, headers: { cookie: cookie} });
r.get(url,function(err,res,data){
if(err)
{
log(err); /*
imageQueue.push({
id:id,
url:url
}) */
return;
}
if(res.statusCode==200){
if(id){
geddy.model.Image.first({id:id} , function(err,image){
image.data=data;
image.crawlTime=new Date();
image.save();
log('saved image ---> id: '+id +' url: '+url);
// the resized image has to go through a temp file on disk; this is the
// in-memory thumbnail problem mentioned at the top
var tmpfile=id;
gm(data).size({bufferStream:true},function(err,size){
if(!err&&size.width>600){
this.resize(600,size.height*600/size.width)
.write(tmpfile,function(err){
if(!err){
fs.exists(tmpfile,function(exists){
if(!exists) return;
try{
var buf=fs.readFileSync(tmpfile);
image.thumbnail=buf;
image.save();
log('thumbnailed image ---> id: '+id +' url: '+url);
fs.unlink(tmpfile,function(){}); // best-effort cleanup
}
catch(ex){
}
})
}
})
}
})
});
}
else{
var tmpfile=url.substring(url.lastIndexOf('/')+1);
var userid=tmpfile.substring(0,tmpfile.lastIndexOf('.')); // filename without the extension
// avatars: build a 100x100 thumbnail via a temp file, then upsert by url
async.waterfall([
function(callback){
gm(data).thumb(100,100,tmpfile,100,function(err,stdout,stderr){
if(!err&&fs.existsSync(tmpfile)){
try{
var buf=fs.readFileSync(tmpfile);
fs.unlink(tmpfile,function(){});
callback(null,data,buf);
}
catch(ex) {
callback(null,data,null)
}
}else{
callback(null,data,null)
}
})
},function(data,thumbnail,callback){
geddy.model.Image.first({url:url},function(err,image){
if(null == image){
image=geddy.model.Image.create({
data:data,
thumbnail:thumbnail,
url:url,
crawlTime:new Date()
})
}else{
image.data=data;
image.thumbnail=thumbnail;
image.url=url;
image.crawlTime=new Date();
}
image.save(callback); // let the waterfall finish so the final log fires
})
}
],function(err,result){
if(!err)
log('saved gravatar ---> userid: '+userid );
})
}
}
});
}
exports.getCookie=function(){return cookie;};
exports.postQueue=postQueue;
exports.userQueue=userQueue;
exports.imageQueue=imageQueue;
// main loop: log in once, then re-check the top-10 list every five
// minutes whenever the post queue has drained
exports.run=function(){
login();
async.whilst(
function () { return true; },
function (callback) {
if(postQueue.length()==0){ // don't queue new work while posts are still pending
log('request top10 posts');
getPosts();
}
setTimeout(callback, 1000*60*5);
},
function (err) {
log(err);
}
);
};
6 Replies
Keep nodejs focused on the crawling and hand the images to the cloud; that is how my little site handles it. I use Qiniu cloud storage for the images: http://www.17qingsong.com
Looks pretty good, but why can't I find a price list on their official site?
@neavo It has not officially launched yet; I would guess it goes live this month. Right now there is a three-month trial period.
@xieren58 I took a quick look at the API. For a crawler-style site, do you have to fetch images to your own server first and then upload them to the cloud?
@neavo There is an API, so the whole thing can be automated.
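For the curious, a rough sketch of that fetch-then-upload flow, assuming the current (v7-style) qiniu Node SDK; the keys, the bucket name, and the uploadImage helper are placeholders, not anything confirmed by Qiniu:

var qiniu=require('qiniu');

// all of these values are placeholders
var accessKey='your-access-key';
var secretKey='your-secret-key';
var bucket='crawler-images';

// sign an upload token for the bucket
var mac=new qiniu.auth.digest.Mac(accessKey,secretKey);
var putPolicy=new qiniu.rs.PutPolicy({scope:bucket});
var uploadToken=putPolicy.uploadToken(mac);

var formUploader=new qiniu.form_up.FormUploader(new qiniu.conf.Config());
var putExtra=new qiniu.form_up.PutExtra();

// hypothetical helper: push a crawled image buffer into the bucket under key
function uploadImage(key,buffer,done){
formUploader.put(uploadToken,key,buffer,putExtra,function(err,body,info){
if(err) return done(err);
if(info.statusCode==200) return done(null,body);
done(new Error('upload failed: '+info.statusCode));
});
}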
Would Ruby be a better fit for crawlers?