ICode9

精准搜索请尝试: 精确搜索
首页 > 编程语言> 文章详细

Python批量下载壁纸,保存壁纸,flask搭建壁纸网站

2022-05-21 19:34:31  阅读:180  来源: 互联网

标签:img title Python detail url HTML flask print 壁纸


小提示:需更换新的请求头,下载时同目录创建img文件夹存图片

 

1. 获取壁纸数据

"""
思路
1. 请求网页,拿到源代码 ok   requests.get(网址)
2. 将源代码解析成网页        lxml的etree.HTML(源代码)
3. 从网页中提取数据         HTML.xpath('复制来的xpath/text()')
4. 翻页,全站数据爬取       翻页一般修改url就可以实现
5. 数据保存
"""
from pprint import pprint
import csv

"""
1. 获取小图片页面的大图页面的地址
2. 将域名拼接上去,构成大图片的页面地址
3. 请求大图片的地址,将大图片的网址保存
4. 下载图片
"""

import requests
from lxml import etree

baseUrl = 'http://www.netbian.com'

# Masquerade as a browser.  NOTE(review): these session cookies are
# timestamped and expire — replace them with fresh values captured from
# your own browser session before running (see the tip at the top of the
# article: "需更换新的请求头").
cookies = {
    '__yjs_duid': '1_33e223172d0308c509f12b4f304f2d491651476976719',
    'yjs_js_security_passport': 'f75d121a02abe7650a974a503fabb1b5f24977f8_1652958780_js',
    'Hm_lvt_0f461eb489c245a31c209d36e41fcc0f': '1652672709,1652774098,1652870042,1652958781',
    'Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f': '1652958781',
}

# Standard desktop-Chrome request headers so the site serves normal HTML.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Referer': 'http://www.netbian.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
}


def page(url):
    """Scrape one wallpaper list page and collect (title, small, big) records.

    For each thumbnail on the list page, follows the link to the detail page
    and extracts the full-size image URL.  Appends one dict per wallpaper to
    the module-level ``data`` list, tagged with the module-level ``category``.

    :param url: absolute URL of a list page, e.g. ``.../dongman/index_2.htm``
    """
    res = requests.get(url, headers=headers, cookies=cookies)
    HTML = etree.HTML(res.text)

    # The list shows 20 thumbnails per page; slot 3 is an ad <li>, skip it.
    for i in range(1, 21):
        if i == 3:
            continue

        href = HTML.xpath('//*[@id="main"]/div[3]/ul/li[{}]/a/@href'.format(i))
        # Thumbnail (small image) URL for the same slot.
        small = HTML.xpath('//*[@id="main"]/div[3]/ul/li[{}]/a/img/@src'.format(i))
        if not href or not small:
            # Slot missing (short last page or layout change) — the original
            # indexed [0] unconditionally and crashed with IndexError here.
            continue

        # Prepend the domain: hrefs on the page are site-relative.
        detail_url = baseUrl + href[0]

        # Fetch the detail page, which carries the full-size image.
        detail = requests.get(detail_url, headers=headers, cookies=cookies)
        detail.encoding = "gbk"  # pages declare GBK encoding
        detail_HTML = etree.HTML(detail.text)

        # Full-size image URL and its title.
        big = detail_HTML.xpath('//*[@id="main"]/div[3]/div/p/a/img/@src')
        title = detail_HTML.xpath('//*[@id="main"]/div[3]/div/p/a/img/@title')
        if not big or not title:
            continue

        print(title, small, big)
        data.append(
            {"title": title[0], "small": small[0], "big": big[0], "category": category}
        )


if __name__ == '__main__':
    category = "动漫"
    data = []

    # Crawl list pages 2..4 of the anime section; page() fills ``data``.
    for p in range(2, 5):
        url = "http://www.netbian.com/dongman/" + 'index_{}.htm'.format(p)
        page(url)

    pprint(data)

    # Persist the scraped records as CSV.
    fieldnames = ["title", "small", "big", "category"]
    # newline="" keeps csv from doubling line endings on Windows; utf-8-sig
    # writes a BOM so Excel auto-detects the encoding.
    with open("img_data_a.csv", 'w', encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames)
        # Header row (when re-running in append mode, comment this out so the
        # header is written only once).
        writer.writeheader()
        writer.writerows(data)
获取数据

2. 批量下载

# 导入 csv 库
import csv
import requests
import time

# Masquerade as a browser.  NOTE(review): these session cookies are
# timestamped and expire — replace them with fresh values from your own
# browser session before running.
cookies = {
    '__yjs_duid': '1_33e223172d0308c509f12b4f304f2d491651476976719',
    'yjs_js_security_passport': 'f75d121a02abe7650a974a503fabb1b5f24977f8_1652958780_js',
    'Hm_lvt_0f461eb489c245a31c209d36e41fcc0f': '1652672709,1652774098,1652870042,1652958781',
    'Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f': '1652958781',
}

# Standard desktop-Chrome request headers so the site serves the real files.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Referer': 'http://www.netbian.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
}


def progressbar(url, filepath='./必须加上扩展名'):
    """Stream-download *url* to *filepath*, printing a console progress bar.

    :param url: direct download URL of the file.
    :param filepath: destination path — must include the file extension.
    """
    start = time.time()  # download start time
    # stream=True is required so the body is fetched chunk by chunk instead
    # of being loaded into memory at once.
    response = requests.get(url, stream=True, headers=headers, cookies=cookies)
    size = 0  # bytes downloaded so far
    chunk_size = 1024  # bytes per chunk
    # Use .get() with a default: servers may omit Content-Length.  The
    # original indexed the header BEFORE the try block, so a missing header
    # raised an uncaught KeyError.
    content_size = int(response.headers.get('content-length', 0))
    try:
        if response.status_code == 200:  # only write the file on success
            print('开始下载,[文件大小]:{size:.2f} MB'.format(
                size=content_size / chunk_size / 1024))
            with open(filepath, 'wb') as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    size += len(data)
                    if content_size:  # unknown total ⇒ no percentage (avoids ZeroDivisionError)
                        print('\r' + '[下载进度]:%s%.2f%%' % (
                            '>' * int(size * 50 / content_size), float(size / content_size * 100)), end=' ')
        end = time.time()  # download end time
        print('完成!用时: %.2f秒' % (end - start))
    except Exception as err:
        # Report the failure instead of silently swallowing it
        # (the original did ``except Exception: pass``).
        print('下载出错:', err)


# Replay the CSV produced by step 1 and download every full-size image.
with open("img_data_a.csv", encoding="utf-8-sig", mode="r") as f:
    # DictReader yields one dict per row keyed by the header line.
    reader = csv.DictReader(f)

    for row in reader:
        title = row.get("title")
        big_url = row.get("big")
        print("下载:", title)
        # Progress-bar download; an img/ folder must already exist in the
        # current directory to receive the files.
        progressbar(url=big_url, filepath="img/{}.jpg".format(title))
下载图片

3. 线程池批量获取全站壁纸地址

import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
import csv

baseUrl = 'http://www.netbian.com'

# Masquerade as a browser.  NOTE(review): these session cookies expire —
# replace them with fresh values from your own browser session.

cookies = {
    '__yjs_duid': '1_33e223172d0308c509f12b4f304f2d491651476976719',
    'Hm_lvt_0f461eb489c245a31c209d36e41fcc0f': '1652672709,1652774098,1652870042',
    'trenvecookieclassrecord': '^%^2C4^%^2C',
    'trenvecookieinforecord': '^%^2C4-14978^%^2C',
    'yjs_js_security_passport': '9376682f8d181fc1c094828cbcf9858097ffe69e_1652876557_js',
    'Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f': '1652876558',
}

# Standard desktop-Chrome request headers.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Referer': 'http://www.netbian.com/fengjing/index_3.htm',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
}


def home_page(url):
    """Parse one list page and dispatch each thumbnail to detail().

    Raises when the page has no wallpaper <li> items (past the last page),
    which is how the caller's pagination loop in job() terminates.

    :param url: absolute URL of a category list page.
    """
    res = requests.get(url, headers=headers, cookies=cookies)
    HTML = etree.HTML(res.text)
    lis = HTML.xpath('//*[@id="main"]/div[3]/ul//li')
    if not lis:
        # Empty list ⇒ no such page; raise so job() stops paginating.
        raise ValueError('no wallpapers on page: ' + url)

    # XPath li positions are 1-based, so iterate 1..len(lis) inclusive.
    # BUG FIX: the original hard-coded li[1] and li[4], scraping the same two
    # items on every iteration, and range(1, len(lis)) dropped the last slot.
    for i in range(1, len(lis) + 1):
        if i == 3:  # slot 3 is an ad <li>
            continue

        href = HTML.xpath('//*[@id="main"]/div[3]/ul/li[{}]/a/@href'.format(i))
        small = HTML.xpath('//*[@id="main"]/div[3]/ul/li[{}]/a/img/@src'.format(i))
        if not href or not small:
            # Slot without a link/thumbnail (layout quirk) — skip safely.
            continue

        # Prepend the domain: hrefs on the page are site-relative.
        detail_url = baseUrl + href[0]
        print(detail_url)
        detail_date = {'small': small[0]}

        detail(detail_url, detail_date)


def detail(detail_url, detail_date):
    """Fetch a detail page and complete *detail_date* with title/big/category.

    Appends the finished record to the module-level ``data`` list
    (list.append is atomic in CPython, so concurrent workers are safe here).

    :param detail_url: absolute URL of the wallpaper detail page.
    :param detail_date: partially-filled record dict (already has 'small').
    """
    res = requests.get(detail_url, headers=headers, cookies=cookies)
    res.encoding = "gbk"  # detail pages are GBK-encoded
    HTML = etree.HTML(res.text)
    imgs = HTML.xpath('//*[@id="main"]/div[3]/div/p/a/img/@src')
    titles = HTML.xpath('//*[@id="main"]/div[3]/div/p/a/img/@title')
    if not imgs or not titles:
        # Layout change or blocked request — skip this record instead of
        # crashing with IndexError as the original did.
        return
    detail_date["title"] = titles[0]
    detail_date["big"] = imgs[0]
    # NOTE(review): ``current`` is a module-level global that the main loop
    # keeps reassigning while worker threads run, so records can be tagged
    # with the wrong category — pass the category in explicitly to fix.
    detail_date['category'] = current
    print(current)
    print(titles[0])
    data.append(detail_date)


def job(url):
    """Crawl every page of one category until the site runs out of pages.

    :param url: category root URL, e.g. ``http://www.netbian.com/dongman/``
    """
    print(url)
    for page_no in range(1, 10000):
        try:
            # Page 1 lives at the category root; later pages at index_N.htm.
            if page_no == 1:
                home_page(url)
            else:
                home_page(url + 'index_{}.htm'.format(page_no))
        # Narrowed from a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit and made Ctrl-C useless.
        except Exception:
            print("没有了")
            return


def save():
    """Write all records in the module-level ``data`` list to img_data_all.csv."""
    # CSV column order.
    header_list = ["title", "category", "small", 'big']
    # newline="" stops the csv module from emitting blank rows on Windows
    # (the original omitted it); utf-8-sig adds a BOM so Excel detects the
    # encoding, matching the step-1 script's output file.
    with open("img_data_all.csv", 'w', encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, header_list)
        writer.writeheader()

        # Write all scraped rows at once.
        writer.writerows(data)


if __name__ == '__main__':
    # Every category path on the site, including the /s/ special-topic pages.
    category = [
        "/rili/",
        "/dongman/",
        "/fengjing/",
        "/meinv/",
        "/youxi/",
        "/yingshi/",
        "/dongtai/",
        "/weimei/",
        "/sheji/",
        "/keai/",
        "/qiche/",
        "/huahui/",
        "/dongwu/",
        "/jieri/",
        "/renwu/",
        "/meishi/",
        "/shuiguo/",
        "/jianzhu/",
        "/tiyu/",
        "/junshi/",
        "/feizhuliu/",
        "/qita/",
        "/s/wangzherongyao/",
        "/s/huyan/",
        "/s/lol/", ]
    data = []
    # 50 worker threads — the crawl is network-bound, so threads overlap I/O.
    pool = ThreadPoolExecutor(50)
    for lei in category:
        # NOTE(review): ``current`` is a single shared global read by the
        # worker threads in detail() while this loop keeps reassigning it,
        # so records may be tagged with the wrong category.  Fixing it needs
        # the category passed through job()/detail() explicitly — confirm.
        current = lei
        url = baseUrl + lei
        # job(url)
        pool.submit(job, url)
    # Blocks until every submitted job has finished.
    pool.shutdown()
    print("-- 爬取结束 --".center(20, "*"))
    print("开始写入")
    save()
    print("写入完成")
全站/多线程

4. 存数据库

 

5. 搭建壁纸网站

标签:img,title,Python,detail,url,HTML,flask,print,壁纸
来源: https://www.cnblogs.com/zwnsyw/p/16295765.html

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有