puppeteer 滚动控制研究
发布于 6 年前 作者 yhc-yhc 3904 次浏览 来自 分享

最近需要跟踪竞争对手网站上的信息,公开的接口带了 sign 加密验证,无奈只能使用 puppeteer 来获取网页上所有加密的接口列表。在网上找了一下滚动加载更多的实现,都不是很理想,于是自己翻了一遍官网 API 写了一个出来,或许可以帮助到别人。

···
const puppeteer = require('puppeteer')
const EventProxy = require('eventproxy')
const ep = new EventProxy()
let browser

main().catch(err => {
  // Surface launch/crawl failures instead of leaving an unhandled rejection.
  console.error(err)
})

/**
 * Entry point: launches the browser, starts one crawl per demo URL
 * (index 10 down to 1) and closes the browser once every crawl has
 * emitted 'pageFinished'.
 *
 * @returns {Promise<void>} resolves once every page has been opened and its
 *   response listener is installed (not when the crawls are finished).
 */
async function main () {
  browser = await openBrowser()

  const pageCount = 10

  // Subscribe BEFORE any page is opened: a fast page may emit 'pageFinished'
  // while slower pages are still loading, and an event emitted before the
  // subscription exists would never be counted, leaving the browser open.
  ep.after('pageFinished', pageCount, async () => {
    await browser.close()
  })

  const ps = []
  for (let index = pageCount; index >= 1; index--) {
    const request_url = `demoUrl=${index}`
    ps.push(spiderPage(request_url))
  }
  await Promise.all(ps)
}

/**
 * Launches a headed Chromium instance with DevTools open.
 *
 * @returns {Promise<object>} the Puppeteer browser returned by `puppeteer.launch`.
 */
async function openBrowser () {
  const args = [
    '--no-sandbox',
    // No trailing space inside the literal: the switch is passed verbatim,
    // so '--disable-infobars ' would be a different (unknown) switch name.
    '--disable-infobars', // don't show information bar
    '--window-size=1920,1080', // size of the browser window itself
    '--lang=zh-CN',
    '--disable-dev-shm-usage'
  ]

  return puppeteer.launch({
    // Page viewport (375x812), set independently of the --window-size switch above.
    defaultViewport: { width: 375, height: 812 },
    ignoreHTTPSErrors: true,
    headless: false,
    devtools: true,
    timeout: 0, // NOTE(review): per Puppeteer docs 0 disables the launch timeout — confirm for the installed version
    args
  })
}

/**
 * Opens `request_url` and keeps scrolling the page each time a target API
 * response is reported on the `${request_url}_response` event. When a
 * response carries a list whose length differs from the page size, the page
 * is treated as exhausted: it is screenshotted, closed, and 'pageFinished'
 * is emitted.
 *
 * @param {string} request_url - URL to crawl; the text after '?' names the screenshot file.
 * @returns {Promise<void>} resolves once the page is open and the listener is registered.
 */
async function spiderPage (request_url) {
  // A full list means more data may follow; any other length ends the crawl.
  const PAGE_SIZE = 20
  const page = await openPageToUrl(request_url)
  let finished = false

  ep.on(`${request_url}_response`, async rs => {
    // Ignore responses that arrive after the page has been closed.
    if (finished) return

    try {
      await sleep(200)
      await pageScroll(page)

      // Re-check: another response may have finished the page while we slept.
      if (finished || rs.data.productList.list.length === PAGE_SIZE) return
      finished = true

      console.log(request_url, 'finished')
      try {
        await page.screenshot({
          path: `${request_url.split('?')[1]}.png`,
          fullPage: true
        })
        await page.close()
      } finally {
        // Always report completion so the 'pageFinished' counter can reach its target.
        ep.emit('pageFinished')
      }
    } catch (err) {
      // Log instead of leaving an unhandled rejection inside the event listener.
      console.log(request_url, err)
    }
  })
}

/**
 * Opens a new tab, navigates it to `request_url`, and forwards every JSON
 * response whose URL contains 'yourTargetUrl' as a `${request_url}_response`
 * event on the shared EventProxy.
 *
 * @param {string} request_url - URL to navigate to; also used as the event-name prefix.
 * @returns {Promise<object>} the Puppeteer page.
 */
async function openPageToUrl (request_url) {
  const page = await browser.newPage()

  // Navigation failures are logged rather than rethrown, so the page is still returned.
  await page.goto(request_url, {
    waitUntil: 'domcontentloaded'
  }).catch(err => console.log(err))

  page.on('request', request => {
    // Use the public accessor; `_url` is a private field of the request object.
    const url = request.url()
    if (url.includes('yourTargetUrl')) {
      // Hook for reacting to an outgoing target request; nothing is done here.
    }
  })

  page.on('response', async response => {
    const url = response.url()
    if (!url.includes('yourTargetUrl')) return

    try {
      const rs = await response.json()
      console.log('response', url, 'page start scroll', rs.data.productList.list.length)
      ep.emit(`${request_url}_response`, rs)
    } catch (err) {
      // A non-JSON body or an unexpected payload shape must not become an
      // unhandled rejection; log it and skip this response.
      console.log('response', url, err)
    }
  })

  return page
}

/**
 * Scrolls the page vertically to an offset equal to the body's client height,
 * then pauses 300 ms.
 *
 * @param {object} page - Puppeteer page to scroll.
 * @returns {Promise<void>}
 */
async function pageScroll (page) {
  await page.evaluate(() => {
    // Uses the body height as the offset (window.innerHeight was the
    // alternative the author tried and left disabled).
    const scrollY = document.body.clientHeight
    window.scrollTo(0, scrollY)
  })
  await sleep(300)
}

/**
 * Delay helper.
 *
 * @param {number} delay - time to wait, in milliseconds.
 * @returns {Promise<number>} resolves with 1 after `delay` ms; never rejects.
 */
function sleep (delay) {
  return new Promise(resolve => {
    setTimeout(() => resolve(1), delay)
  })
}

回到顶部