最近需要跟踪竞争对手网站上的信息,但公开的接口带了 sign 加密验证,无奈只能使用 puppeteer 来获取网页上所有加密的接口列表。在网上找了一些“滚动加载更多”的实现,都不是很理想,于是自己翻了一遍官网 API 写了一个出来,或许可以帮助到别人。

```
const puppeteer = require('puppeteer')
const EventProxy = require('eventproxy')
const ep = new EventProxy()
let browser
// Start the crawl; report a failed run instead of leaving the promise floating.
main().catch(err => console.error(err))

/**
 * Entry point: launches the shared browser and starts one crawler per page URL.
 *
 * Each crawler emits 'pageFinished' on the shared EventProxy once it has
 * captured its last batch; after all of them have reported, the browser is
 * closed.
 *
 * @returns {Promise<void>} Resolves once every page has been opened (the
 *   crawlers keep running; shutdown is driven by the 'pageFinished' events).
 */
async function main () {
  const PAGE_COUNT = 10

  browser = await openBrowser()

  // Register the completion counter BEFORE any crawler starts: a
  // 'pageFinished' emitted while no listener is attached would not be
  // counted, and the browser would then never be closed.
  ep.after('pageFinished', PAGE_COUNT, async () => {
    await browser.close()
  })

  const ps = []
  for (let index = PAGE_COUNT; index >= 1; index--) {
    // NOTE(review): 'demoUrl' is the article's placeholder — substitute the
    // real page URL (spiderPage expects it to contain a '?' query part).
    const request_url = `demoUrl=${index}`
    ps.push(spiderPage(request_url))
  }
  await Promise.all(ps)
}
/**
 * Launches the Chromium instance shared by all crawler tabs.
 *
 * The browser is started headful with DevTools open, HTTPS errors ignored,
 * and a 375x812 default viewport for every new page.
 *
 * @returns {Promise<object>} The browser returned by `puppeteer.launch`.
 */
async function openBrowser () {
  const args = [
    '--no-sandbox',
    '--disable-infobars', // don't show information bar
    '--window-size=1920,1080', // resize window view port size
    '--lang=zh-CN',
    '--disable-dev-shm-usage'
  ]

  return puppeteer.launch({
    defaultViewport: { width: 375, height: 812 },
    ignoreHTTPSErrors: true,
    headless: false,
    devtools: true,
    timeout: 0, // per Puppeteer docs, 0 disables the launch timeout
    args
  })
}
/**
 * Opens `request_url` in a new tab and scrolls it to the bottom every time
 * the list API answers, until a response marks the last batch.
 *
 * Each `${request_url}_response` event (emitted by openPageToUrl) triggers
 * one more scroll. A batch whose length differs from PAGE_SIZE is treated as
 * the final one: the page is screenshotted (file named after the part of the
 * URL following '?'), closed, and 'pageFinished' is emitted exactly once.
 *
 * @param {string} request_url - URL of the page to crawl.
 * @returns {Promise<void>} Resolves once the page has been opened — NOT when
 *   crawling is done; completion is signalled through 'pageFinished'.
 */
async function spiderPage (request_url) {
  // NOTE(review): 20 is presumably the API's page size — confirm against the target site.
  const PAGE_SIZE = 20
  let finished = false

  // Start opening the page, then attach the listener synchronously so that a
  // response arriving while the page is still loading is not dropped.
  const pageReady = openPageToUrl(request_url)

  ep.on(`${request_url}_response`, async rs => {
    if (finished) return
    try {
      const page = await pageReady
      await sleep(200)
      // Another overlapping response may have closed the page meanwhile.
      if (finished) return
      await pageScroll(page)
      if (finished || rs.data.productList.list.length === PAGE_SIZE) return

      finished = true
      console.log(request_url, 'finished')
      try {
        await page.screenshot({ path: `${request_url.split('?')[1]}.png`, fullPage: true })
        await page.close()
      } finally {
        // Always report completion, otherwise the browser is never closed.
        ep.emit('pageFinished')
      }
    } catch (err) {
      // A rejection inside an event callback would otherwise be unhandled.
      console.log(request_url, err)
    }
  })

  await pageReady
}
/**
 * Opens a new tab in the shared browser, attaches network listeners, and
 * navigates to `request_url`.
 *
 * Every response whose URL contains 'yourTargetUrl' is parsed as JSON and
 * re-emitted on the shared EventProxy as `${request_url}_response`.
 *
 * @param {string} request_url - URL to navigate to; also used as the event-name prefix.
 * @returns {Promise<object>} The opened page. Navigation errors are logged,
 *   not thrown, so the page is returned even if `goto` failed.
 */
async function openPageToUrl (request_url) {
  const page = await browser.newPage()

  // Listeners are attached BEFORE navigating so that requests and responses
  // issued during the initial load are observed as well.
  page.on('request', request => {
    // Hook for reacting to outgoing list requests; intentionally a no-op.
    if (request.url().includes('yourTargetUrl')) {
      // console.log('request', request.url(), 'page stop scroll')
    }
  })

  page.on('response', async response => {
    // Use the public url() accessor rather than the private `_url` field.
    const url = response.url()
    if (!url.includes('yourTargetUrl')) return
    try {
      const rs = await response.json()
      console.log('response', url, 'page start scroll', rs.data.productList.list.length)
      ep.emit(`${request_url}_response`, rs)
    } catch (err) {
      // Non-JSON or unexpectedly shaped bodies must not become unhandled rejections.
      console.log(err)
    }
  })

  await page.goto(request_url, { waitUntil: 'domcontentloaded' }).catch(err => console.log(err))
  return page
}
/**
 * Scrolls the page to the bottom of the document, then waits 300 ms.
 *
 * @param {object} page - Page-like object exposing `evaluate(fn)`, which runs
 *   `fn` inside the page.
 * @returns {Promise<void>}
 */
async function pageScroll (page) {
  await page.evaluate(() => {
    // Take the tallest measurement: when <body> is height-constrained its
    // clientHeight alone can be smaller than the scrollable document.
    // scrollTo clamps to the maximum scroll position, so overshooting is safe.
    const scrollY = Math.max(
      document.body.clientHeight,
      document.body.scrollHeight,
      document.documentElement.scrollHeight
    )
    window.scrollTo(0, scrollY)
  })
  // Pause after scrolling before returning to the caller.
  await sleep(300)
}
/**
 * Delay helper.
 *
 * @param {number} delay - Time to wait, in milliseconds.
 * @returns {Promise<number>} Resolves with 1 once the delay has elapsed.
 */
function sleep (delay) {
  return new Promise(resolve => {
    setTimeout(() => resolve(1), delay)
  })
}