首页 > 语言 > JavaScript > 正文

node.js 基于cheerio的爬虫工具的实现(需要登录权限的爬虫工具)

2024-05-06 15:40:52
字体:
来源:转载
供稿:网友

公司有过一个需求,需要拿一个网页的表格数据,数据量达到30w左右。为了提高工作效率,

结合自身经验和网上资料,写了一套符合自己需求的nodejs爬虫工具。也许也会适合你。

先上代码,再做讲解。

'use strict';

// Load modules: HTTP client, HTML parser, and Excel writer.
const superagent = require('superagent');
const cheerio = require('cheerio');
const Excel = require('exceljs');

// Spoofed session cookie, sent with every request because the target pages
// require a logged-in session.
const Cookies = 'PHPSESSID=1c948cafb361cb5dce87122846e649cd';

// List pages 1 .. limit - 1 are crawled (the upper bound is exclusive).
const limit = 3;

// Every row collected by the crawl, in page order.
let pageDatas = [];

/**
 * Crawls one list page.
 *
 * Thin wrapper kept for backward compatibility; it now returns the promise
 * from getPageLoad so callers can wait for the page to finish.
 *
 * @param {string} baseUrl - URL of the list page.
 * @returns {Promise<Object[]>} The rows scraped from that page.
 */
function loadPage(baseUrl) {
  return getPageLoad(baseUrl);
}

/**
 * Downloads one list page, then the detail page of every row in it, and
 * builds one record per row.
 *
 * Failures are logged rather than thrown: the rows collected before the
 * failure are still returned, so one bad page does not abort the whole crawl.
 *
 * @param {string} baseUrl - URL of the list page.
 * @returns {Promise<Object[]>} Records for the rows that were read successfully.
 */
async function getPageLoad(baseUrl) {
  const rows = [];
  try {
    const body = await superagent.get(baseUrl).set('Cookie', Cookies);
    const $ = cheerio.load(body.text);
    const trList = $('#tableList').children('tr');

    for (let i = 0; i < trList.length; i++) {
      const tdArr = trList.eq(i).find('td');
      const id = tdArr.eq(0).text();
      const item = {
        sortId: id,
        policyId: tdArr.eq(1).text(),
        policyProductName: tdArr.eq(2).text(),
        policyName: tdArr.eq(3).text(),
        policyMoney: tdArr.eq(4).text(),
      };

      // Detail pages are fetched one after another within a page.
      const detailUrl = `http://bxjd.henoo.com/policy/view?id=${id}`;
      const detailBody = await superagent.get(detailUrl).set('Cookie', Cookies);
      const $$ = cheerio.load(detailBody.text);
      const detailRows = $$('.table-view').find('tr');
      // The value lives in the second cell of a fixed row of the detail table.
      const detailValue = (rowIndex) => detailRows.eq(rowIndex).find('td').eq(1).text();

      // Policy holder ID number
      item.policyIdNum = detailValue(11);
      // Policy holder phone number
      item.policyPhone = detailValue(10);
      // Insured person phone number
      item.bePoliciedPhone = detailValue(16);
      // Insured person name
      item.bePoliciedName = detailValue(13);
      console.log(item.bePoliciedName);
      // Insured person ID number
      item.bePoliciedIdNum = detailValue(17);

      rows.push(item);
    }
  } catch (error) {
    // Report the failure instead of swallowing it; keep what was collected.
    console.error(`Failed to crawl ${baseUrl}:`, error);
  }
  return rows;
}

/**
 * Writes the collected records to ./projects.xlsx.
 *
 * @param {Object[]} pageDatas - Records keyed by the column keys below.
 * @returns {Promise<void>} Settles once the file is written; write errors are
 *   logged, not rethrown.
 */
function writeXLS(pageDatas) {
  const workbook = new Excel.Workbook();
  const sheet = workbook.addWorksheet('My Sheet');
  const reColumns = [
    { header: '序号', key: 'sortId' },
    { header: '投保单号', key: 'policyId' },
    { header: '产品名称', key: 'policyProductName' },
    { header: '投保人姓名', key: 'policyName' },
    { header: '投保人手机号', key: 'policyPhone' },
    { header: '投保人证件号', key: 'policyIdNum' },
    { header: '被保人姓名', key: 'bePoliciedName' },
    { header: '被保人手机号', key: 'bePoliciedPhone' },
    { header: '被保人证件号', key: 'bePoliciedIdNum' },
    { header: '保费', key: 'policyMoney' },
  ];
  sheet.columns = reColumns;
  for (const trData of pageDatas) {
    sheet.addRow(trData);
  }
  const filename = './projects.xlsx';
  return workbook.xlsx.writeFile(filename)
    .then(function () {
      console.log('ok');
    })
    .catch(function (error) {
      console.error(error);
    });
}

/**
 * Crawls every list page concurrently, then writes the workbook once.
 *
 * Waiting on all page promises replaces the old "rows / 15 === pages" check,
 * which never fired when a page held fewer than 15 rows or a request failed.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const pageTasks = [];
  for (let count = 1; count < limit; count++) {
    const baseUrl = `http://bxjd.henoo.com/policy/policyList?page=${count}`;
    pageTasks.push(loadPage(baseUrl));
  }

  // Promise.all keeps the results in page order whichever page finishes first.
  const pages = await Promise.all(pageTasks);
  for (const rows of pages) {
    pageDatas = pageDatas.concat(rows);
  }
  await writeXLS(pageDatas);
}

main().catch(function (error) {
  console.error(error);
  process.exitCode = 1;
});
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表

图片精选