ICode9

精准搜索请尝试: 精确搜索
首页 > 其他分享> 文章详细

爬取豆瓣影评2--完整代码

2021-11-27 19:03:35  阅读:176  来源: 互联网

标签:comment __ xpath douban -- text 爬取 豆瓣 print


# -*-coding:utf-8-*-
# @Time :2021/11/20 13:58
# @Author:shuaichao
# @File :.py
# @Software: PyCharm
import urllib.request
from bs4 import BeautifulSoup  # 网页解析,获悉数据.231
import urllib.request, urllib.error  # 制定URL,获取网页数据
import time
import os
import requests
from lxml import etree
import json
from urllib.request import Request
from urllib.request import urlopen


def askUrl(url):
    """Fetch *url* with Douban session headers and return the page HTML.

    Returns an empty string when the request fails; on failure the HTTP
    status code and/or the failure reason are printed for diagnostics.
    """
    headers = {
        "Cookie": 'bid=ySWyT3eWKHI; ll="118088"; __utma=30149280.292149151.1637469049.1637469049.1637469049.1; __utmc=30149280; __utmz=30149280.1637469049.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.1.10.1637469049; ap_v=0,6.0; __utma=223695111.1326316524.1637469080.1637469080.1637469080.1; __utmb=223695111.0.10.1637469080; __utmc=223695111; __utmz=223695111.1637469080.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1637469080%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _vwo_uuid_v2=D84C2319507104E7EA8DA14C2D366B708|08f1b95ebe80ed5b6c33ac030c3151e7; dbcl2="250389712:+jECS9wlK5g"; ck=ieh6; _pk_id.100001.4cf6=13045fc7b4b26386.1637469080.1.1637469126.1637469080.; push_noty_num=0; push_doumail_num=0',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            # Fixed: was `e.reasen`, which raised AttributeError whenever
            # this branch was reached.
            print(e.reason)
    return html


# 爬取网页信息
def get_info(baseurl):
    """Download *baseurl* via askUrl and return it parsed as BeautifulSoup."""
    page_html = askUrl(baseurl)
    return BeautifulSoup(page_html, "html.parser")


# soup处理并转换成字符串
def transport(bs, info):
    """Collect all elements whose CSS class equals *info*.

    Returns a tuple ``(elements, text)``: ``elements`` is the raw result of
    ``bs.find_all(class_=info)`` and ``text`` is its ``str()`` rendering.
    """
    matched = bs.find_all(class_=info)
    return matched, str(matched)


def getImg(url, imgName):
    """Download the image at *url* and save it as ./imgs/<imgName>.jpg.

    Creates the ./imgs directory on first use.  Network or filesystem
    failures are reported (printed) instead of raised.
    """
    headers = {
        "Cookie": 'bid=ySWyT3eWKHI; ll="118088"; __utma=30149280.292149151.1637469049.1637469049.1637469049.1; __utmc=30149280; __utmz=30149280.1637469049.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.1.10.1637469049; ap_v=0,6.0; __utma=223695111.1326316524.1637469080.1637469080.1637469080.1; __utmb=223695111.0.10.1637469080; __utmc=223695111; __utmz=223695111.1637469080.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1637469080%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _vwo_uuid_v2=D84C2319507104E7EA8DA14C2D366B708|08f1b95ebe80ed5b6c33ac030c3151e7; dbcl2="250389712:+jECS9wlK5g"; ck=ieh6; _pk_id.100001.4cf6=13045fc7b4b26386.1637469080.1.1637469126.1637469080.; push_noty_num=0; push_doumail_num=0',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    try:
        req_timeout = 5
        req = Request(url=url, headers=headers)
        f = urlopen(req, None, req_timeout)
        pic = f.read()
        f.close()
        # Make sure the target directory exists before writing the image.
        os.makedirs('./imgs', exist_ok=True)
        imgPath = './imgs/%s.jpg' % (imgName)
        with open(imgPath, 'wb') as fp:
            fp.write(pic)
    # Fixed: the original caught `Request.exceptions.ConnectionError`, which
    # does not exist (`urllib.request.Request` has no `exceptions` attribute)
    # and itself raised AttributeError on any failure.  `urllib.error.URLError`
    # is a subclass of OSError, so OSError covers network and file errors.
    except OSError:
        print(u'链接失败')  ##再写一个爬去豆瓣登录页面的代码,并调用上述所写的方法


'''
    TODO:获取豆瓣电影ID
'''
if __name__ == '__main__':
    print("开始")
    headers = {
        "Cookie": 'bid=ySWyT3eWKHI; ll="118088"; __utma=30149280.292149151.1637469049.1637469049.1637469049.1; __utmc=30149280; __utmz=30149280.1637469049.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.1.10.1637469049; ap_v=0,6.0; __utma=223695111.1326316524.1637469080.1637469080.1637469080.1; __utmb=223695111.0.10.1637469080; __utmc=223695111; __utmz=223695111.1637469080.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1637469080%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _vwo_uuid_v2=D84C2319507104E7EA8DA14C2D366B708|08f1b95ebe80ed5b6c33ac030c3151e7; dbcl2="250389712:+jECS9wlK5g"; ck=ieh6; _pk_id.100001.4cf6=13045fc7b4b26386.1637469080.1.1637469126.1637469080.; push_noty_num=0; push_doumail_num=0',
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    # 获取一千个电影ID
    # 热门类型的
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=1300&page_start=0'
    # 国产类型的
    url_guochan = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%9B%BD%E4%BA%A7%E5%89%A7&page_limit=150&page_start=0'
    # 豆瓣高分
    url_douban='https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&page_limit=300&page_start=0'
    # 美剧
    url_meiju='https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%BE%8E%E5%89%A7&page_limit=300&page_start=0'

    res = get_info(url_meiju)
    response_data = json.loads(res.text)
    # 存放评论
    comment_high = []
    comment_middle = []
    comment_low = []
    try:
        for index, k in enumerate(response_data['subjects']):
            # if index <= 1000:
            #     print(index)
            #     continue
            # 存放评论
            comment_high = []
            comment_middle = []
            comment_low = []
            print(index)

            if index % 2 == 0:
                time.sleep(5)
            id = k['id']
            highUrl = "https://movie.douban.com/subject/%s/comments?percent_type=h&limit=20&status=P&sort=new_score" % (
                id)
            middleUrl = "https://movie.douban.com/subject/%s/comments?percent_type=m&limit=20&status=P&sort=new_score" % (
                id)
            lowUrl = "https://movie.douban.com/subject/%s/comments?percent_type=l&limit=20&status=P&sort=new_score" % (
                id)
            print(highUrl)
            '''
                获取高评价评论
            '''
            # 循环请求接口
            for i in range(0, 10):
                time.sleep(2)
                urlTmp = highUrl + "&start=" + str(i * 20)
                re = requests.get(url=urlTmp, headers=headers).text
                # 构造了一个XPath解析对象并对HTML文本进行自动修正
                html = etree.HTML(re)
                # XPath使用路径表达式来选取用户名
                comment = html.xpath('//div[@class="comment"]')
                print("开始好评")
                for content in comment:
                    names = content.xpath('.//a[@class=""]')
                    grades = content.xpath('.//span[contains(@class,"rating")]')
                    texts = content.xpath('.//span[@class="short"]')
                    name = names[0].xpath('./text()')[0]
                    if len(grades) > 0:
                        grade = grades[0].xpath('./@class')[0][7:8] + '星'
                    else:
                        grade = '暂无评价'
                    text = texts[0].xpath('./text()')[0]
                    comment_high.append(text)
                    print(text)
                    print(len(comment_high))
            '''
                获取中评价评论
            '''
            for i in range(0, 10):
                time.sleep(2)
                urlTmp = middleUrl + "&start=" + str(i * 20)
                re = requests.get(url=urlTmp, headers=headers).text
                # 构造了一个XPath解析对象并对HTML文本进行自动修正
                html = etree.HTML(re)
                # XPath使用路径表达式来选取用户名
                print("开始中评")
                comment = html.xpath('//div[@class="comment"]')
                for content in comment:
                    names = content.xpath('.//a[@class=""]')
                    grades = content.xpath('.//span[contains(@class,"rating")]')
                    texts = content.xpath('.//span[@class="short"]')
                    name = names[0].xpath('./text()')[0]
                    if len(grades) > 0:
                        grade = grades[0].xpath('./@class')[0][7:8] + '星'
                    else:
                        grade = '暂无评价'
                    text = texts[0].xpath('./text()')[0]
                    print(text)
                    comment_middle.append(text)
                    print(len(comment_middle))

            '''
                获取低评价评论
            '''

            for i in range(0, 10):
                time.sleep(2)
                urlTmp = lowUrl + "&start=" + str(i * 20)
                re = requests.get(url=urlTmp, headers=headers).text
                # 构造了一个XPath解析对象并对HTML文本进行自动修正
                html = etree.HTML(re)
                # XPath使用路径表达式来选取用户名
                comment = html.xpath('//div[@class="comment"]')
                print("开始差评")
                for content in comment:
                    names = content.xpath('.//a[@class=""]')
                    grades = content.xpath('.//span[contains(@class,"rating")]')
                    texts = content.xpath('.//span[@class="short"]')
                    name = names[0].xpath('./text()')[0]
                    if len(grades) > 0:
                        grade = grades[0].xpath('./@class')[0][7:8] + '星'
                    else:
                        grade = '暂无评价'
                    text = texts[0].xpath('./text()')[0]
                    comment_low.append(text)
                    print(text)
                    print(len(comment_low))
                # 文件夹不存在,则创建文件夹
            save_path = './douban'
            folder = os.path.exists(save_path)
            if not folder:
                os.makedirs(save_path)
            print("开始写入文件")
            with open('./douban/comments_high.txt', 'a+', encoding='utf-8') as f:
                for v in comment_high:
                    print(v)
                    f.write('%s high\n' % v)
            with open('./douban/comments_middle.txt', 'a+', encoding='utf-8') as f:
                for v in comment_middle:
                    print(v)
                    f.write('%s middle\n' % v)
            with open('./douban/comments_low.txt', 'a+', encoding='utf-8') as f:
                for v in comment_low:
                    print(v)
                    f.write('%s low\n' % v)
    except:
        with open('./douban/comments_high.txt', 'a+', encoding='utf-8') as f:
            for v in comment_high:
                print(v)
                f.write('%s high\n' % v)
        with open('./douban/comments_middle.txt', 'a+', encoding='utf-8') as f:
            for v in comment_middle:
                print("写入文件")
                f.write('%s middle\n' % v)
        with open('./douban/comments_low.txt', 'a+', encoding='utf-8') as f:
            for v in comment_low:
                print("写入文件")
                f.write('%s low\n' % v)

 

标签:comment,__,xpath,douban,--,text,爬取,豆瓣,print
来源: https://www.cnblogs.com/chaogehahaha/p/15612691.html

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有