爬取300首古诗

2022-09-09 20:32:39 阅读：277 来源： 互联网

import time

import requests
from lxml import etree
from multiprocessing import Pool




def zxc(url='https://so.gushiwen.cn/shiwenv_45c396367f59.aspx', headers=None):
    """Download one poem page and append its title, author and verses to a file.

    The page is parsed with XPath; the result is appended to
    ``lp/<title>.txt`` (UTF-8) as: title line, author line, then one line per
    verse text node with ASCII spaces removed.

    Args:
        url: Address of the poem detail page. Defaults to the single page the
            original parameterless version fetched, so ``zxc()`` still works.
        headers: Optional HTTP headers forwarded to ``requests.get``
            (e.g. a ``User-agent``).

    Raises:
        requests.HTTPError: The server answered with an error status.
        IndexError: The title or author element was not found on the page.
    """
    # Local import: the file's top-level import block does not provide ``os``.
    import os

    # A timeout keeps a stalled connection from blocking a pool worker forever.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    page = etree.HTML(response.text)

    # XPath of the poem title (page heading).
    titles = page.xpath('//*[@id="sonsyuanwen"]/div[1]/h1/text()')
    # XPath of the author name.
    authors = page.xpath('//*[@id="sonsyuanwen"]/div[1]/p/a[1]/text()')
    # XPath of the verse text nodes.
    verses = page.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/div[2]/text()')

    # Fail with a message that names the page instead of a bare
    # "list index out of range" when the layout does not match.
    if not titles or not authors:
        raise IndexError('poem title or author not found at {}'.format(url))

    title = str(titles[0])
    author = str(authors[0])

    # open() does not create missing directories; exist_ok makes this safe
    # when several worker processes reach this point at the same time.
    os.makedirs('lp', exist_ok=True)
    with open('lp/' + title + '.txt', 'a+', encoding='utf-8') as out:
        out.write('{}\n'.format(title))
        out.write('{}\n'.format(author))
        for verse in verses:
            out.write('{}\n'.format(str(verse).replace(' ', '')))


# Scrape all poems in parallel with a process pool.
if __name__ == '__main__':
    started = time.time()
    hercx = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.25'
    }

    # Fetch the index page; a timeout avoids hanging forever on a dead connection.
    index_text = requests.get(
        'https://so.gushiwen.cn/gushi/tangshi.aspx/', headers=hercx, timeout=10
    ).text
    index_page = etree.HTML(index_text)

    # XPath that collects the relative address of every poem on the index page.
    # NOTE(review): relies on an element with id="html" wrapping <body> - confirm
    # against the live page if no links are returned.
    qwe_300 = '//*[@id="html"]/body/div/div/div/div/span/a/@href'
    poem_paths = index_page.xpath(qwe_300)  # list of relative poem addresses

    def _report_failure(exc):
        # apply_async otherwise discards exceptions raised inside a worker.
        print('failed: {!r}'.format(exc))

    adp = 'https://so.gushiwen.cn'
    pool = Pool()
    try:
        for path in poem_paths:
            poem_url = adp + str(path)
            pool.apply_async(zxc, args=(poem_url, hercx),
                             error_callback=_report_failure)
    finally:
        # Always stop accepting work and wait for the workers to finish.
        pool.close()
        pool.join()

    # Take the end time only after join() so the figure covers the actual
    # scraping, not just the time needed to queue the tasks.
    finished = time.time()
    print('{}秒'.format(str(finished - started)))

标签：xpath,300,text,qwe,爬取,str,古诗,div
来源： https://www.cnblogs.com/xxh12/p/16673878.html

本站声明： 1. iCode9 技术分享网（下文简称本站）提供的所有内容，仅供技术学习、探讨和分享；
2. 关于本站的所有留言、评论、转载及引用，纯属内容发起人的个人观点，与本站观点和立场无关；
3. 关于本站的所有言论和文字，纯属内容发起人的个人观点，与本站观点和立场无关；
4. 本站文章均是网友提供，不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属；如您发现该文章侵犯了您的权益，可联系我们第一时间进行删除；
5. 本站为非营利性的个人网站，所有内容不会用来进行牟利，也不会利用任何形式的广告来间接获益，纯粹是为广大技术爱好者提供技术内容和技术思想的分享性交流网站。

ICode9

爬取300首古诗