node写爬虫的问题 - CNode技术社区

数据量不是特别大，但有很多层循环嵌套，程序很多时候跑到一半终端就未响应了。。。不报错，也不再继续执行，，，

const https = require('https')
const cheerio = require('cheerio');
const fs = require('fs');

const whttps = {
    gets: async function(url, encodeType, timeout) {
        return new Promise((resolve, reject) => {
            if (typeof encodeType == 'number' && !timeout) {
                timeout = encodeType;
            }
            timeout || (timeout = 40000)
            let req = https.get(url, (res) => {
                if (encodeType == 'binary') {
                    res.setEncoding('binary')
                };
                let body = '';
                clearTimeout(_t)
                res.on('data', (chunk) => {
                    body += chunk
                }).on('end', () => {
                    resolve(body)
                });
            });
            let _t = setTimeout(() => {
                req.emit('timeout', { message: 'have been timeout...' });
            }, timeout);
            req.on('error', (error) => { resolve({ code: -2, codeInfo: error }) });
            req.on('timeout', (timeout) => { resolve({ code: -1, codeInfo: 'timeout' }) });

            req.end()
        })
    }
};
async function init(_s, _e) {
    for (let _i = _s; _i < _e; _i++) {
        let ppath = `${__dirname}/page_${_i}`;
        fs.existsSync(ppath) || fs.mkdirSync(ppath);
        console.log(`>>正在分析页面-${_i}`)
        let data = await whttps.gets('https://www.thefashionisto.com/fashion/campaign/?vpage=' + _i)
        if (checkHttpReturn) {
            childPage(data, ppath);
        }
    }
}
// 检验返回结果是否ok
function checkHttpReturn(data) {
    try {
        let _d = JSON.parse(data);
        if (_d.code == -1 || _d.code == -2) {
            // todo 记录错误
            return 0
        } else {
            return _d;
        }
    } catch (error) {
        return 1
    }
}

// 得到图片的url,保存图片到文件 img_url 为 图片网络地址，cpath为目录
async function savePicFromUrl(img_url, cpath) {
    let img_name = img_url.replace(/(.*\/)|(\..*)/g, ''),
        file_path = `${cpath}/${img_name}.jpg`;
    if (fs.existsSync(file_path)) {
        console.log('>> 文件存在，跳过该图片');
    } else {
        let img_data = await whttps.gets(img_url, 'binary');
        if (checkHttpReturn(img_data)) {
            fs.writeFile(file_path, img_data, 'binary', function(err) {
                try {
                    if (err) throw err;
                    console.log(`>> 图片${img_name}保存成功`)
                } catch (error) {
                    console.log(`>> 图片${img_name}保存失败`)
                }
            })
        }
    }
}
// 处理二级url
async function childPage(data, ppath) {
    $ = cheerio.load(data);
    $('._self').each(async function(i, item) {
        if ($(this).html() == 'Read More') {
            let curl = $(this).attr('href'),
                cdata = await whttps.gets(curl);
            if (checkHttpReturn(cdata)) {
                let name = curl.replace(/(https:\/\/.*?\/)|(\/)/g, '');
                let cpath = `${ppath}/${name}`;
                fs.existsSync(`${cpath}`) || fs.mkdirSync(`${cpath}`);
                c$ = cheerio.load(cdata);
                c$('.entry-content figure img').each(async function(i) {
                    let img_url = c$(this).attr('src');
                    savePicFromUrl(img_url, cpath); //存储文件
                });
                let relation_img = await whttps.gets(curl + '?relatedposts=1');
                relation_img = checkHttpReturn(relation_img)
                if (relation_img) {
                    try {
                        relation_img.items.forEach(async(element, i) => {
                            let img_url = element.img.src;
                            savePicFromUrl(img_url, cpath);
                        });
                    } catch (error) {
                        // todo 记录错误
                    }
                }
            }
        }
    });
}
init(0, 4000);

最新优化后的代码，望批评；来自酷炫的 CNodeMD

yinsu 1楼•8 年前

上代码

sodawy 2楼•8 年前

这么少的信息很难帮到你

wjiban 3楼•8 年前作者

@soda-wy 好的，稍等，谢谢

来自酷炫的 CNodeMD

wjiban 4楼•8 年前作者

@yinsu 好的，谢谢

来自酷炫的 CNodeMD

wjiban 5楼•8 年前作者

@yinsu 好了，麻烦看看，先谢谢了

wjiban 6楼•8 年前作者

@soda-wy 哥，我把代码上传了

sodawy 7楼•8 年前

@wjiban

1、原来的真会有人把代码写成这么乱。。 2、我觉得不是不报错，应该是哪里把错接住后吞了。没在你代码里看到对FileIO和NetworkIO该有的容错处理 3、whttps是啥不知道，但看上去你并没有处理 超时和请求异常的情况。 4、你这代码完全并行，不限制速度，对方有可能会封你出口IP。你自己可以抓包感受下，一下子发出去了多少请求。也许你客户端先瓶颈了 5、已经在代码里加了这么多console.log，建议自己分析一下原因 6、有很多重复的代码可以提取出来