ICode9

精准搜索请尝试: 精确搜索
首页 > 其他分享 > 文章详细

JS爬虫 利用axios和cheerio爬取好大夫病历并生成xlsx

2021-05-16 20:32:32  阅读:225  来源: 互联网

标签:5.0 xlsx axios Windows data Mozilla cheerio var NT


var axios = require("axios")
var cheerio = require("cheerio")
var xlsx = require('node-xlsx');
var fs = require('fs');

// Pool of browser User-Agent strings; each request sends the entry at
// index userAgentFlag, and the request functions wrap that index around.
const userAgentPool = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
];

// Index into userAgentPool of the User-Agent to send with the next request.
let userAgentFlag = 0;

// Links collected from the listing pages.
const urls = [];

// Shared counter. It starts at 1 and getUrls appends it to the listing URL
// as the `p` query value; getDatas later resets it to 0 for the second phase.
let num = 1;

// Workbook description handed to xlsx.build(): a single sheet whose first
// row is the header; result rows are pushed after it.
const data = [{
    name: 'sheet1',
    data: [['', '疾病描述', '疾病', '病历概要']]
}];

// Handle of the interval that currently drives the crawl. It starts out
// calling getUrls once per second and is reassigned by getDatas.
let si1 = setInterval(getUrls, 1000);

console.log('开始爬取病症链接');

/**
 * Fetch one listing page and collect the detail-page links found on it.
 *
 * Called repeatedly by the `si1` interval. Each call requests the page whose
 * number is the current value of `num` (then advances `num`) and sends the
 * User-Agent at `userAgentPool[userAgentFlag]`. Once `num` exceeds 100 the
 * interval is cleared and `getDatas()` starts the second phase.
 *
 * Reads/writes the module-level `userAgentFlag`, `num`, `urls` and `si1`.
 */
function getUrls() {
    // Wrap the User-Agent index back to the start of the pool.
    if(userAgentFlag === userAgentPool.length) userAgentFlag = 0;
    if(num > 100) {
        clearInterval(si1);
        console.log('病症链接爬取完成,爬取数量:' + urls.length);
        getDatas();
        return;
    }

    // Capture the page number now: the shared counter may already have moved
    // on by the time the response arrives, which would mislabel the log line.
    var page = num++;

    axios.get("https://zixun.haodf.com/dispatched/45001000.htm?p=" + page,
    {
        headers: {
            'User-Agent': userAgentPool[userAgentFlag]
        }
    }).then(resp => {
        var $ = cheerio.load(resp.data)
        var lis = $('.clearfix li');
        for (var i = 0; i < lis.length; i++) {
            // Look the link up once; list items without a link are skipped.
            var href = lis.eq(i).find(".fl a").attr("href");
            if(href) {
                urls.push(href);
            }
        }
        console.log('已爬取第', page, '页', '总爬取数量:', 'urls:', urls.length, '该页末位链接:', urls[urls.length - 1]);
    }).catch(err => {
        // A single failed page must not abort the whole crawl with an
        // unhandled promise rejection; report it and carry on.
        console.error('爬取第', page, '页失败:', err && err.message);
    })
    userAgentFlag++;
}

/**
 * Start the second crawl phase, which fetches the individual detail pages.
 *
 * Resets the shared `num` counter to 0 (getItem uses it to index `urls`)
 * and stores a new interval handle in `si1` that calls `getItem` every
 * 100 milliseconds.
 */
function getDatas() {
    var itemIntervalMs = 100;

    num = 0;
    console.log('开始爬取具体数据');
    si1 = setInterval(getItem, itemIntervalMs);
}

// Number of detail-page requests that have been sent but have not settled.
var pendingItems = 0;

/**
 * Fetch one detail page, extract its fields and append them as a sheet row.
 *
 * Called repeatedly by the `si1` interval. Each call claims the next entry
 * of `urls` (index `num`, which is advanced immediately so that no URL is
 * requested twice) and sends the User-Agent at `userAgentPool[userAgentFlag]`.
 * When every URL has been claimed and every in-flight request has settled,
 * the interval is cleared and the collected rows are written to './res.xls'.
 *
 * Reads/writes the module-level `userAgentFlag`, `num`, `urls`, `data`
 * and `si1`.
 */
function getItem() {
    // Wrap the User-Agent index back to the start of the pool.
    if(userAgentFlag === userAgentPool.length) userAgentFlag = 0;
    if(num >= urls.length) {
        // Everything has been dispatched; keep ticking until the remaining
        // responses have arrived so that no row is missing from the file.
        if(pendingItems > 0) return;
        clearInterval(si1);

        // Responses arrive out of order; put the rows back in URL order
        // (column 0 holds the 1-based row number) below the header row.
        var header = data[0].data[0];
        var rows = data[0].data.slice(1).sort(function (a, b) {
            return a[0] - b[0];
        });
        data[0].data = [header].concat(rows);

        // NOTE(review): the buffer comes from node-xlsx but the file is saved
        // with an .xls extension - confirm which extension is intended.
        var buffer = xlsx.build(data);
        fs.writeFile('./res.xls', buffer, function (err)
        {
            if (err)
                throw err;
            console.log('Write to xls has finished');
        })

        return;
    }

    // Claim this URL before the request goes out. Advancing the counter only
    // in the response callback let the 100 ms timer re-request the same URL
    // and could push the counter past urls.length.
    var index = num++;
    pendingItems++;

    axios.get(urls[index],
    {
        headers: {
            'User-Agent': userAgentPool[userAgentFlag]
        },
        // Bound each request so a stalled connection cannot keep
        // pendingItems above zero forever and block the final write.
        timeout: 15000
    }).then(resp => {
        var $ = cheerio.load(resp.data)
        var section = $('.bccard section').eq(0).find('.info3-value p');

        data[0].data.push([
            index + 1,
            section.eq(0).text().trim(),
            section.eq(2).text().trim(),
            $('.suggestions-content .suggestions-text-value').text().trim()
        ])

        console.log('爬取数据:', data[0].data[data[0].data.length - 1]);
    }).catch(err => {
        // Report the failure and continue instead of crashing on an
        // unhandled promise rejection.
        console.error('爬取数据失败:', urls[index], err && err.message);
    }).then(() => {
        // Runs after success and failure alike.
        pendingItems--;
    })
    userAgentFlag++;
}

 

标签:5.0,xlsx,axios,Windows,data,Mozilla,cheerio,var,NT
来源: https://www.cnblogs.com/shiningmage/p/14774711.html

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有