一个简单的爬虫【2】,promise 的,并求教 promise 写法。
发布于 10 年前 作者 zysam 4963 次浏览 最后一次编辑是 8 年前 来自 分享

好吧,事件没什么人看。于是我看完 promise , 重写。 用 promise 就两件事 :

  1. 写 promise 风格的函数,返回 promise 的;
  2. 用 promise 链 then , catch 组织。

本地测试通过了,网络测试未完成。点评禁了,返回的是 !200 : Forbidden ,这下不知怎么办了。

错误能正常 catch,还不清楚会不会中断其它流,不知道 promise 这么组织对不对 ?

希望有人指教下…

代码如下:

http = require 'http'
cheerio = require 'cheerio'
fs = require 'fs'
Model = require './model'
mongoose = require 'mongoose'
STATUS_CODES = http.STATUS_CODES

# Download a page over HTTP and resolve with its body as a string.
#
# url - whatever `http.get` accepts as its first argument (the script below
#       passes an options object with host/path/headers).
#
# Returns a Promise that resolves with the concatenated response body once the
# response emits 'end'. It rejects with an Error when the status code is not
# 200 or when the request emits 'error'.
loadWebPage = (url) ->
	promise = new Promise (resolve,reject) ->
		console.log 'loading : %s',url
		req = http.get url,(res) ->
			if res.statusCode isnt 200
				# Drain the unwanted body so the response does not sit unread.
				res.resume()
				# reject() keeps only its first argument, so the message has to be
				# assembled here instead of passing a format string plus a value.
				return reject new Error "!200 : #{STATUS_CODES[res.statusCode]}"
			body = ''
			res.on 'data',(chunk) ->
				body += chunk
			res.on 'end', ->
				console.log 'body:' + body
				resolve body
		req.on 'error',(err) ->
			failure = new Error "req error : #{err.message}"
			# Keep the underlying error reachable for callers that want details.
			failure.cause = err
			reject failure

# Extract the shop entries from a listing page.
#
# html - the page markup, handed to cheerio.load.
#
# Returns a Promise that resolves with an array holding one record per
# `#shop-all-list ul li` element. Each record has the keys shopName, link,
# pic, addr, cate.life, cate.buss and comment.
parsePage = (html) ->
	new Promise (resolve,reject) ->
		console.log 'parse...'
		$ = cheerio.load html
		shops = []

		$('#shop-all-list ul li').each ->
			titleLink = $('.txt .tit a',@)
			shop =
				shopName : titleLink.attr('title')
				link : titleLink.attr('href')
				pic : $('.pic a img',@).attr('data-src')
				addr : ''
				cate :
					life : []
					buss : []
				comment : []

			# Each category list receives the label text followed by its href.
			$('.txt .tag-addr',@).each ->
				labels = $('a span',@)
				anchors = $('a',@)
				shop.addr = $('.addr',@).text()
				shop.cate.life.push labels.eq(0).text(),anchors.eq(0).attr('href')
				shop.cate.buss.push labels.eq(1).text(),anchors.eq(1).attr('href')

			# First comment entry is the span's title attribute, then the text
			# of the children of every link inside `.comment`.
			shop.comment.push $('.txt .comment span',@).attr('title')
			$('.txt .comment a',@).each ->
				shop.comment.push $(@).children().text()

			shops.push shop

		resolve shops
# Store the parsed records through the project's Model.
#
# docs - value passed straight to Model.create.
#
# Returns a Promise that resolves with no value when the create callback
# reports no error, and rejects with the callback's error otherwise.
db = (docs) ->
	new Promise (fulfil,fail) ->
		console.log 'db runing...'
		Model.create docs,(err) ->
			return fail err if err
			fulfil()

# Promise wrapper around fs.readFile with 'utf8' decoding.
#
# path - file to read.
#
# Returns a Promise that resolves with the file contents, or rejects with the
# error reported by fs.readFile.
readFile = (path) ->
	new Promise (fulfil,fail) ->
		console.log 'readding...'
		fs.readFile path,'utf8',(err,data) ->
			return fail err if err
			fulfil data

# Promise wrapper around fs.writeFile that serialises `data` as JSON first.
#
# path - destination file.
# data - value passed through JSON.stringify before writing.
#
# Returns a Promise that resolves with the string 'done!', or rejects with the
# error reported by fs.writeFile.
writeFile = (path,data) ->
	new Promise (fulfil,fail) ->
		console.log 'writing...'
		serialised = JSON.stringify(data)
		fs.writeFile path,serialised,(err) ->
			return fail err if err
			fulfil 'done!'

# Log that the run is over and disconnect mongoose.
#
# Returns whatever mongoose.disconnect() returns.
closeDB = ->
	console.log('\nall run.\nclose db.')
	return mongoose.disconnect()

# Log a rejection reason; used as the `.catch` handler of the promise chains.
#
# err - the value the chain was rejected with.
handleErr = (err) ->
	console.log('has some error : %s', err)

# Build the list of page URLs to crawl.
#
# url    - address of the first page; always the first entry of the result.
# prefix - text placed between `url` and the page number for later pages.
#          May be omitted: when the second argument is a number it is taken
#          as `limit` and the prefix becomes ''.
# limit  - number of the last page.
#
# Returns an array: `url`, followed by `url + prefix + n` for n = 2..limit.
# When limit is below 2 only `url` is returned.
generateUrls = (url,prefix,limit) ->
	if typeof prefix is 'number'
		limit = prefix
		prefix = ''
	urls = [url]
	# `by 1` pins the loop direction. A bare [2..limit] counts DOWN when limit
	# is below 2 and would add bogus entries for pages 2 and 1.
	urls.push url + prefix + i for i in [2..limit] by 1

	urls

# Worker that drains the shared `Urls` queue one page at a time.
#
# Urls - array of pending URLs; it is mutated (entries are shifted off), so
#        several workers may share the same array.
#
# Returns a Promise that resolves (with no value) only after this worker's
# page AND every page it picks up afterwards have been loaded, parsed and
# stored. It rejects with the first failure from either branch; a failed
# download also stops this worker from taking further URLs.
wizard = (Urls) ->
	unless Urls.length
		console.log 'all urls have run.'
		# Always hand back a promise so callers can chain or pass it to Promise.all.
		return Promise.resolve()
	url = Urls.shift()
	page = loadWebPage url
	# The download feeds two branches: one starts the next URL right away,
	# the other parses and stores the page that just arrived.
	remaining = page.then -> wizard Urls
	stored = page.then(parsePage).then(db)
	# Joining both branches keeps the returned promise pending until the whole
	# chain is done and gives each branch a rejection handler.
	Promise.all([stored, remaining]).then -> undefined

# Single-page example: request options for one listing page, sent with a
# browser-like User-Agent header.
opts =
	host : 'www.dianping.com'
	path : '/search/category/100/10/g132'
	headers :
		'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
#url = 'http://www.dianping.com'

# Load -> parse -> store. A failure in any step is logged by handleErr, and
# closeDB is chained after the catch so it still runs once the error is logged.
loadWebPage(opts)
	.then(parsePage)
	.then(db)
	.catch(handleErr)
	.then(closeDB)

# Concurrent crawl (disabled).
# NOTE(review): in the disabled snippet below the postfix `for` wraps the whole
# assignment, so `promises` is reassigned on every pass instead of collecting
# an array; the comprehension needs parentheses to produce a list for
# Promise.all. The range [2..numberOfParallelRequests] also runs one pass
# fewer than numberOfParallelRequests.
#Urls = generateUrls url,'p',50
###
numberOfParallelRequests = 5
promises = wizard Urls for i in [2..numberOfParallelRequests]

Promise.all(promises)
	.then closeDB
	.catch (err)->
		wizard(Urls)
		handleErr(err)
###

#
# Local example (disabled): read a saved HTML file, parse it, and write the
# parsed records to a JSON file.
#
###
filepath = './test/gz_movie_p1.html'
destpath = './test/test_promise.json'

readFile(filepath)
	.then (data) ->
		parsePage(data)
	.then (data) ->
		writeFile destpath,data
	.catch handleErr
	.then closeDB
###
2 回复

我觉着吧,爬不到的原因可能还是装的不像浏览器,你看我这请求这么多header呢,你就来个useragent 人家一看就不是浏览器: GET / HTTP/1.1 Host: www.dianping.com Connection: keep-alive Pragma: no-cache Cache-Control: no-cache Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2288.6 Safari/537.36 DNT: 1 Accept-Encoding: gzip, deflate, sdch Accept-Language: en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4 Cookie: cookie is a secret lol. 另外如果用node 本身的http.get 如果我没记错的话自己应该还要处理302跟踪这种情况吧,要不拿到302后面就全完了。我如果直接连点评反正是拿到302了。

@superobin 实际爬过第二天,我用浏览器隐私模式上点评,页面需要输入验证码才能正常跳转。的确我只处理 200 。我主要想看看并发后,Promise.all 处理得对不对,或者有其它更简单的写法。

回到顶部