
Scraping Novels from txt80

2021-11-27 11:04:00


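The script below crawls the txt80.com novel index pages letter by letter, follows each novel's detail page to its download page, and saves the text file to a local folder, pausing briefly between downloads.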

import os
import random
import re
import time

import lxml.etree
import requests
import faker

fake = faker.Faker()

# Build a small pool of fake user agents and pick one for this run.
uaList = [fake.user_agent() for _ in range(10)]

headers = {
    "User-Agent": random.choice(uaList)
}


def request_view(response):
    # Debug helper: inject a <base> tag so relative links still resolve,
    # save the page to tmp.html, and open it in the default browser.
    import webbrowser
    request_url = response.url
    base_url = ('<head><base href="%s">' % request_url).encode()
    content = response.content.replace(b"<head>", base_url)
    with open('tmp.html', 'wb') as tem_html:
        tem_html.write(content)
    webbrowser.open_new_tab('tmp.html')
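
# Example (hypothetical usage, not part of the crawl flow): preview one index
# page while tuning the XPath expressions used below.
#
#     resp = requests.get("http://www.txt80.com/B.html", headers=headers)
#     request_view(resp)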


class Crawl:
    def __init__(self, host, headers):
        self.host = host
        self.headers = headers

    def get_content(self, url):
        # headers must be passed as a keyword argument; passing it
        # positionally would make requests.get treat it as query parameters.
        resp = requests.get(url, headers=self.headers)
        if resp.status_code != 200:
            print("crawl url error " + url + " " + str(resp.status_code))
            return None
        return resp.content

    def get_novel_list(self, content):
        # Each index page lists novels as <li> items; follow every detail link.
        html = lxml.etree.HTML(content)
        items = html.xpath('//div[@class="searchlist_l_box"]/ul//li')
        for li in items:
            for href in li.xpath("./a/@href"):
                detail_url = self.join_url(href)
                self.get_download_url(detail_url)

    def join_url(self, url):
        # The hrefs are root-relative, so prepend the scheme and host.
        return "http://" + self.host + url

    def get_download_url(self, detail_url):
        content = self.get_content(detail_url)
        if content is None:
            return
        html = lxml.etree.HTML(content)
        title = html.xpath('//dd[@class="bt"]/h2/text()')
        download_url = html.xpath('//div[@class="downlinks"]//a/@href')

        if len(title) == 1 and len(download_url) >= 1:
            # Take the first download link on the detail page.
            self.download_url(self.join_url(download_url[0]), title[0])

    def download_url(self, url, title):
        # Strip characters that are illegal in Windows file names.
        title = re.sub(r'[?\\*|"<>:/]', '', title)
        content = self.get_content(url)
        if content is None:
            return
        html = lxml.etree.HTML(content)
        txt_url = html.xpath('//div[@class="downlist"][1]/li/strong/a/@href')
        if len(txt_url) == 1:
            self.download_txt(txt_url[0], title)

    def download_txt(self, url, title):
        content = self.get_content(url)
        if content is None:
            return
        path = "E:\\xiaoshuo"
        if not os.path.exists(path):
            os.makedirs(path)
        file = path + "\\" + title
        with open(file, "wb") as f:
            f.write(content)
        print("download success " + title)
        # Be polite: pause between downloads.
        time.sleep(1)

    def start(self):
        # The site groups novels by initial letter, one index page per letter.
        list_code = ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                     'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
                     'X', 'Y', 'Z']

        for code in list_code:
            url = "http://" + self.host + "/" + code + ".html"
            content = self.get_content(url)
            if content is not None:
                print("crawl url success:" + url)
                self.get_novel_list(content)

if __name__ == "__main__":
    host = "www.txt80.com"
    crawl = Crawl(host, headers)
    try:
        crawl.start()
    except Exception as e:
        print(str(e))
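
Note that random.choice fixes a single User-Agent for the whole run. A minimal variant (my sketch, not in the original) that rotates the agent on every request would generate it inside get_content instead:

    def get_content(self, url):
        # Variant: pick a fresh fake User-Agent for each request.
        resp = requests.get(url, headers={"User-Agent": fake.user_agent()})
        if resp.status_code != 200:
            print("crawl url error " + url + " " + str(resp.status_code))
            return None
        return resp.content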


Tags: content, title, url, self, crawl, html, download, 80, novel
Source: https://www.cnblogs.com/brady-wang/p/15611046.html
