好吧,事件没什么人看。于是我看完 promise , 重写。 用 promise 就两件事 :
- 写 promise 风格的函数,返回 promise 的;
- 用 promise 链 then、catch 组织。
本地测试通过了,网络测试未完成。点评禁了,返回的是 !200 : Forbidden,这下不知道怎么办了。
错误能正常 catch,还不清楚会不会中断其它流,也不知道 promise 这么组织对不对?
希望有人指教下…
代码如下:
http = require 'http'
cheerio = require 'cheerio'
fs = require 'fs'
Model = require './model'
mongoose = require 'mongoose'
STATUS_CODES = http.STATUS_CODES
# Fetch a page over HTTP and resolve with its body as a string.
#
# url - whatever `http.get` accepts: a URL string or a request-options object.
#
# Returns a Promise that resolves with the full response body, or rejects with
# an Error when the status code is not 200 or when the request itself fails.
loadWebPage = (url) ->
  new Promise (resolve, reject) ->
    console.log 'loading : %s', url
    req = http.get url, (res) ->
      if res.statusCode isnt 200
        # Drain the unwanted body so the socket is released, and stop here
        # instead of buffering a response nobody will read.
        res.resume()
        # `reject` takes a single reason, so the message is built up front
        # rather than passed as a printf-style format plus argument.
        return reject new Error "!200 : #{STATUS_CODES[res.statusCode]}"
      # Decode inside the stream so a multi-byte character split across two
      # chunks is not corrupted by per-chunk Buffer-to-string conversion.
      res.setEncoding 'utf8'
      body = ''
      res.on 'data', (chunk) ->
        body += chunk
      res.on 'end', ->
        console.log 'body:' + body
        resolve body
    req.on 'error', (err) ->
      reject new Error "req error : #{err}"
# Turn the HTML of a shop-listing page into an array of shop records.
#
# html - HTML source text of the page.
#
# Returns a Promise that resolves with one plain object per
# `#shop-all-list ul li` element, shaped as
# { shopName, link, pic, addr, cate: { life, buss }, comment }.
# An exception thrown while parsing rejects the promise.
parsePage = (html) ->
  new Promise (resolve, reject) ->
    console.log 'parse...'
    $ = cheerio.load html
    shops = []
    # Every `each` callback below ends with a `push`, whose numeric return
    # value is never `false`, so no iteration is cut short.
    $('#shop-all-list ul li').each (index, item) ->
      titleLink = $('.txt .tit a', item)
      shop =
        shopName: titleLink.attr 'title'
        link: titleLink.attr 'href'
        pic: $('.pic a img', item).attr 'data-src'
        addr: ''
        cate:
          life: []
          buss: []
        comment: []
      $('.txt .tag-addr', item).each (j, tag) ->
        shop.addr = $('.addr', tag).text()
        labels = $('a span', tag)
        anchors = $('a', tag)
        # Each category list receives a label followed by its link.
        shop.cate.life.push labels.eq(0).text(), anchors.eq(0).attr('href')
        shop.cate.buss.push labels.eq(1).text(), anchors.eq(1).attr('href')
      shop.comment.push $('.txt .comment span', item).attr('title')
      $('.txt .comment a', item).each (k, anchor) ->
        shop.comment.push $(anchor).children().text()
      shops.push shop
    resolve shops
# Persist parsed records through `Model.create`.
#
# docs - the records to store, passed straight to `Model.create`.
#
# Returns a Promise that resolves with no value once the callback reports
# success, or rejects with the error the callback received.
db = (docs) ->
  new Promise (resolve, reject) ->
    console.log 'db runing...'
    Model.create docs, (err) ->
      return reject err if err
      resolve()
# Promise wrapper around `fs.readFile` that reads the file as UTF-8 text.
#
# path - location of the file to read.
#
# Returns a Promise resolving with the file contents, or rejecting with the
# error reported by `fs.readFile`.
readFile = (path) ->
  new Promise (resolve, reject) ->
    console.log 'readding...'
    fs.readFile path, 'utf8', (err, data) ->
      return reject err if err
      resolve data
# Promise wrapper around `fs.writeFile` that stores `data` serialized as JSON.
#
# path - destination file.
# data - value to serialize with `JSON.stringify`.
#
# Returns a Promise resolving with 'done!' on success, or rejecting with the
# error reported by `fs.writeFile`.
writeFile = (path, data) ->
  new Promise (resolve, reject) ->
    console.log 'writing...'
    serialized = JSON.stringify data
    fs.writeFile path, serialized, (err) ->
      return reject err if err
      resolve 'done!'
# Final step of a promise chain: announce completion and disconnect mongoose.
#
# Returns whatever `mongoose.disconnect()` returns.
closeDB = ->
  farewell = '\nall run.\nclose db.'
  console.log farewell
  return mongoose.disconnect()
# Rejection handler for the promise chains: logs the error and does not
# rethrow, so any `.then` chained after it still runs.
#
# err - the rejection reason.
handleErr = (err) -> console.log('has some error : %s', err)
# Build the list of page URLs for a paginated listing.
#
# url    - URL of the first page; always the first entry of the result.
# prefix - text inserted between `url` and the page number. May be omitted,
#          in which case the second argument is taken as `limit`.
# limit  - number of the last page to include.
#
# Returns [url, url + prefix + 2, ..., url + prefix + limit]. When `limit` is
# below 2 (or missing) only the first page is returned.
generateUrls = (url, prefix, limit) ->
  if typeof prefix is 'number'
    limit = prefix
    prefix = ''
  urls = [url]
  # `by 1` forces an ascending range; a bare [2..limit] counts DOWN when
  # limit < 2 and would emit bogus pages such as "...2" and "...1".
  urls.push url + prefix + page for page in [2..limit] by 1
  urls
# Worker that drains a shared URL queue: takes the next URL, loads it, then
# stores the parsed page while the next URL is already being fetched.
#
# Urls - array used as a queue; it is mutated (shifted) by every worker, so
#        several workers can share one array.
#
# Returns a Promise that resolves (with no value) only after this page AND
# every page this worker picks up afterwards have been stored, so
# `Promise.all` over several workers settles when the crawl is really done.
# It rejects with the first load/parse/db error encountered. An empty queue
# yields an already-resolved promise.
wizard = (Urls) ->
  unless Urls.length
    console.log 'all urls have run.'
    return Promise.resolve()
  url = Urls.shift()
  loadWebPage(url).then (html) ->
    # The loaded page drives two things: start on the next URL right away,
    # and process the page that just arrived.
    remaining = wizard Urls
    current = parsePage(html).then(db)
    # Joining both branches keeps the recursive call from floating, so its
    # errors propagate to the caller instead of going unhandled.
    Promise.all([current, remaining]).then -> undefined
# Single-page example: fetch one listing page, parse it, store the records,
# log any failure, and close the database connection either way.
opts =
  host: 'www.dianping.com'
  path: '/search/category/100/10/g132'
  headers:
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'

#url = 'http://www.dianping.com'
loadWebPage(opts)
  .then(parsePage)
  .then(db)
  .catch(handleErr)
  .then(closeDB)
# Concurrent crawl (disabled): several `wizard` workers share one URL queue.
# NOTE(review): `promises = wizard Urls for i in [...]` parses as
# `(promises = wizard Urls) for i in [...]`, so `promises` ends up holding
# only the last call's result, not an array. Wrap the comprehension in
# parentheses before enabling: `promises = (wizard Urls for i in [...])`.
# NOTE(review): the range [2..numberOfParallelRequests] starts 4 workers,
# not 5. Confirm which count is intended.
#Urls = generateUrls url,'p',50
###
numberOfParallelRequests = 5
promises = wizard Urls for i in [2..numberOfParallelRequests]
Promise.all(promises)
.then closeDB
.catch (err)->
wizard(Urls)
handleErr(err)
###
#
# Local example (disabled): parse a saved HTML file and write the result as
# JSON, without touching the network.
#
###
filepath = './test/gz_movie_p1.html'
destpath = './test/test_promise.json'
readFile(filepath)
.then (data) ->
parsePage(data)
.then (data) ->
writeFile destpath,data
.catch handleErr
.then closeDB
###
我觉着吧,爬不到的原因可能还是装得不像浏览器。你看我这请求有这么多 header 呢,你就只带个 User-Agent,人家一看就不是浏览器: GET / HTTP/1.1 Host: www.dianping.com Connection: keep-alive Pragma: no-cache Cache-Control: no-cache Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2288.6 Safari/537.36 DNT: 1 Accept-Encoding: gzip, deflate, sdch Accept-Language: en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4 Cookie: cookie is a secret lol. 另外,如果用 node 本身的 http.get,如果我没记错的话,自己应该还要处理 302 跟踪这种情况吧,要不拿到 302 后面就全完了。我如果直接连点评,反正是拿到 302 了。
@superobin 实际爬过第二天,我用浏览器隐私模式上点评,页面需要输入验证码才能正常跳转。的确我只处理 200 。我主要想看看并发后,Promise.all 处理得对不对,或者有其它更简单的写法。