标签:img url text 爬取 简易 print div page asyncio
asyncio简易爬取图片
1.实现功能
爬取明星写真图片,网址: http://www.521609.com/tuku/mxxz/index.html.
2.简单实现
# -*- coding: utf-8 -*-
# @Time : 2021-04-15 08:52
# @Author : Guo
# @File : main.py
# @Software : PyCharm
"""Synchronously crawl celebrity photo listing pages and save every image locally."""
import os
import time

import requests
from lxml import etree

SITE_ROOT = 'http://www.521609.com'
# Page 1 has a different filename from pages 2+ on this site.
FIRST_PAGE_URL = 'http://www.521609.com/tuku/mxxz/index.html'
LIST_URL_TEMPLATE = 'http://www.521609.com/tuku/mxxz/index_{}.html'
# Save under a relative folder so the script works on any machine
# (the original wrote to a hard-coded /Users/... path that only
# existed on the author's laptop, while *creating* ./写真图片 anyway).
SAVE_ROOT = './写真图片'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
}


def list_page_url(page_num):
    """Return the listing-page URL for the 1-based page number *page_num*."""
    if page_num == 1:
        return FIRST_PAGE_URL
    return LIST_URL_TEMPLATE.format(page_num)


def fetch_html(url):
    """GET *url* and decode the body as UTF-8 (avoids mojibake in Chinese text)."""
    return requests.get(url=url, headers=HEADERS).content.decode('utf-8')


def download_album(album_name, page_url):
    """Download every image on one detail page into SAVE_ROOT/album_name/.

    *album_name* has already been sanitized (no spaces or slashes).
    """
    tree = etree.HTML(fetch_html(page_url))
    album_dir = os.path.join(SAVE_ROOT, album_name)
    # makedirs(exist_ok=True) is race-free, unlike the exists()+mkdir pair.
    os.makedirs(album_dir, exist_ok=True)
    for node in tree.xpath('/html/body/div[5]/div[1]/div[5]/div/div/div/div'):
        img_url = SITE_ROOT + node.xpath('.//img/@src')[0]
        file_name = img_url.split('/')[-1]
        img_data = requests.get(url=img_url, headers=HEADERS).content
        with open(os.path.join(album_dir, file_name), 'wb') as fp:
            fp.write(img_data)
        print(album_name + '下载完成!!!')


def main():
    """Crawl the first two listing pages and download every linked album."""
    os.makedirs(SAVE_ROOT, exist_ok=True)
    start_time = time.time()
    for page_num in range(1, 3):  # only the first two pages, as in the write-up
        tree = etree.HTML(fetch_html(list_page_url(page_num)))
        for li in tree.xpath('/html/body/div[4]/div[3]/ul/li'):
            # Strip spaces and slashes so the name is safe as a directory name.
            album_name = li.xpath('./a/p/text()')[0].replace(' ', '').replace('/', '')
            detail_url = SITE_ROOT + li.xpath('./a/@href')[0]
            download_album(album_name, detail_url)
        print('==================第' + str(page_num) + '页下载完成!!!===================')
    print('共耗时:' + str(time.time() - start_time) + '秒')


if __name__ == '__main__':
    main()
这里只爬取了前两页,文件大小为84.1M,耗时554.74s
(原文此处为结果截图——居中并且带尺寸的图片,抓取时未能保留。)
3.asyncio异步爬取
# -*- coding: utf-8 -*-
"""Asyncio/aiohttp version: listing pages are collected synchronously, then
every detail page (and its images) is fetched concurrently."""
import asyncio
import os
import time

import aiohttp
import requests
from lxml import etree

SITE_ROOT = 'http://www.521609.com'
FIRST_PAGE_URL = 'http://www.521609.com/tuku/mxxz/index.html'
LIST_URL_TEMPLATE = 'http://www.521609.com/tuku/mxxz/index_{}.html'
# Relative save folder: the original created ./写真图片 but then wrote to a
# hard-coded /Users/... path that only existed on the author's machine.
SAVE_ROOT = './写真图片'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
}


def collect_detail_urls(page_count=2):
    """Synchronously walk the first *page_count* listing pages and return the
    detail-page URLs found on them."""
    urls = []
    for page_num in range(1, page_count + 1):
        url = FIRST_PAGE_URL if page_num == 1 else LIST_URL_TEMPLATE.format(page_num)
        # .content.decode('utf-8') avoids mojibake in the Chinese page text.
        page_text = requests.get(url=url, headers=HEADERS).content.decode('utf-8')
        tree = etree.HTML(page_text)
        for li in tree.xpath('/html/body/div[4]/div[3]/ul/li'):
            urls.append(SITE_ROOT + li.xpath('./a/@href')[0])
    return urls


async def get_detail_page(session, url):
    """Fetch one detail page via *session* and download all of its images.

    Fixes two defects of the original coroutine: the misspelled name
    (get_dtail_page) and, more importantly, the use of blocking
    requests.get() for the image bodies, which stalled the event loop and
    serialized the downloads.
    """
    async with session.get(url) as response:
        # The response body must be awaited before the connection is released.
        detail_page_text = await response.text()
    tree = etree.HTML(detail_page_text)
    # The page title is per-page, so resolve it once instead of once per image.
    page_name = tree.xpath('/html/body/div[5]/div[1]/h1/text()')[0]
    album_dir = os.path.join(SAVE_ROOT, page_name)
    os.makedirs(album_dir, exist_ok=True)  # race-free, unlike exists()+mkdir
    for node in tree.xpath('/html/body/div[5]/div[1]/div[5]/div/div/div/div'):
        img_url = SITE_ROOT + node.xpath('.//img/@src')[0]
        async with session.get(img_url) as img_response:
            img_data = await img_response.read()
        with open(os.path.join(album_dir, img_url.split('/')[-1]), 'wb') as fp:
            fp.write(img_data)
        print(page_name + '下载完成!!!')


async def main():
    """Gather all detail pages concurrently over one shared HTTP session."""
    os.makedirs(SAVE_ROOT, exist_ok=True)
    urls = collect_detail_urls()
    async with aiohttp.ClientSession(headers=HEADERS) as session:
        await asyncio.gather(*(get_detail_page(session, u) for u in urls))


if __name__ == '__main__':
    start_time = time.time()
    # asyncio.run replaces the deprecated get_event_loop/run_until_complete pair.
    asyncio.run(main())
    print('共耗时:' + str(time.time() - start_time) + '秒')
耗时374.79s,大大减少了爬取数据所需的时间。
标签:img,url,text,爬取,简易,print,div,page,asyncio 来源: https://blog.csdn.net/qq_39382777/article/details/115725412
本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享; 2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关; 3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关; 4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除; 5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。