Scraping the Douban Top 250 Movies

2021-10-30



Scraping approach:
1. Use Selenium to fetch the page source and handle pagination.
2. From each list page's source, extract the URL of every film's detail page.
3. Request each film's detail page and save the information we need.
Code:

import requests
from selenium import webdriver 
from bs4 import BeautifulSoup
import os
import time 

class Spide_douban():
    def __init__(self):
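        # request headers, list start URL, Selenium driver handle, and containers
        # for film URLs, film names and the indices of films that need a retry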
        self.headers = {
            "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"
        }                
        self.url = "https://movie.douban.com/top250?"
        self.driver = None
        self.film_url = []
        self.film_name = []
        self.film_seq = []
        
        
        
    def get_film_url(self,html):
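        # Parse one list page: collect each film's detail-page URL and title.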
        soup = BeautifulSoup(html,"html.parser")
        film_url_list = soup.find("ol",attrs = {"class":"grid_view"}).find_all("div",attrs = {"class":"pic"})
        for div in film_url_list:
            self.film_url.append(div.find("a").attrs["href"])
            self.film_name.append(div.find("img").attrs["alt"])
    def deal_actor_img(self,soup_img):
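        # Extract an actor's photo URL, name and role from one <li> in the cast list.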
        img_url = soup_img.find("div").attrs["style"]
        img_name = soup_img.find("span").text
        img_name_role = soup_img.find("span",attrs = {"class":"role"}).text
        return img_url[22:-1],img_name,img_name_role  # strip the "background-image: url(" prefix and the trailing ")"
    
    def deal_actor_role(self,img_name,img_name_role,num):
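        # Append one actor's name and role to the film's summary text file.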
        file = open("D:\\项目案例\\{}\\{}的简介.txt".format(self.film_name[num],self.film_name[num]),"a",encoding=("utf-8"))
        file.write("\n")
        file.write(img_name)
        file.write("  ")
        file.write(img_name_role)
        file.close()
    
        
    def deal_url(self,url,num):
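        # Fetch one film's detail page and save its rating, runtime, genres,
        # release dates, summary, poster and cast photos under a per-film folder.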
        try:
            os.mkdir("D:\\项目案例\\{}".format(self.film_name[num]))
            print("正在处理电影  {}".format(self.film_name[num]))
        except Exception as e:
            # the directory already exists; just continue
            print("正在处理电影  {}".format(self.film_name[num]))
        
        
        res = requests.get(url,headers = self.headers)
        
        time.sleep(2)
        
        soup = BeautifulSoup(res.text,"html.parser")
        file = open("D:\\项目案例\\{}\\{}的简介.txt".format(self.film_name[num],self.film_name[num]),"w",encoding=("utf-8"))
        
        # write the Douban rating and vote count
        soup_comment = soup.find("div",attrs = {"id":"interest_sectl"})
        grad = soup_comment.find("strong",attrs = {"class":"ll rating_num"}).string
        vote = soup_comment.find("span",attrs = {"property":"v:votes"}).string
        file.write("电影豆瓣评分:{}".format(grad))
        file.write("   {}人评价\n".format(vote))
        
        F = soup.find("div",attrs = {"id":"info"})
        
        # write the runtime
        time1 = F.find("span",attrs = {"property":"v:runtime"})
        file.write("电影时长:{}\n\n".format(time1.string))
        
        # write the genres
        
        label_list = F.find_all("span",attrs = {"property":"v:genre"})
        l= len(label_list)
        file.write("电影类型:")
        for i in range(l):
            file.write(label_list[i].string)
            if(i+1 != l):
                file.write("、")
        file.write("\n")
        
        
        # write the release dates
        data_list = F.find_all("span",attrs = {"property":"v:initialReleaseDate"})
        l = len(data_list)
        file.write("电影上映日期:")
        for i in range(l):
            file.write(data_list[i].string)
            if(i+1 != l):
                file.write("、")
        file.write("\n")
        
        
        
        
        # write the plot summary, wrapped to roughly 30 characters per line
        file.write("电影简介:\n")
        soup_text = soup.find("div",attrs = {"id":"link-report"}).find("span",attrs = {"property":"v:summary"}).text.strip()
        soup_text_list = soup_text.split()
        text = ""
        for str_i in soup_text_list:
            number = 0
            for j in str_i:
                number +=1
                if(number %30==0):
                    text+='\n'
                text+=j
        file.write(text)
        file.write("\n")
        file.close()
        

        
        
        # save the movie poster
        file = open("D:\\项目案例\\{}\\{}.jpg".format(self.film_name[num],self.film_name[num]),"wb")
        
        img_url = soup.find("div",attrs = {"id":"mainpic"}).find("img").attrs["src"]
        res_img = requests.get(img_url,headers = self.headers)
        file.write(res_img.content)
        file.close()
        
        
            
        
        
        # save each lead actor's photo and record their role
        soup_actor_list = soup.find("div",attrs = {"id":"celebrities"}).find_all("li")
        for actor in soup_actor_list:
            img_url,img_name,img_name_role = self.deal_actor_img(actor)
            file = open("D:\\项目案例\\{}\\{}.jpg".format(self.film_name[num],img_name),"wb")
            res = requests.get(img_url,headers = self.headers)
            file.write(res.content)
            file.close()
            self.deal_actor_role(img_name,img_name_role,num)

    def move(self):
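        # Scroll down the page in small steps so lazily loaded content gets rendered.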
        for i in range(30):
            js = "var q=document.documentElement.scrollTop={}".format(i*100)  # JavaScript: scroll the page down step by step
            self.driver.execute_script(js)
            time.sleep(0.25)
    
    def run(self):        
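        # Main entry point: open the Top 250 list, page through it, then fetch every film.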
        self.driver = webdriver.Chrome(executable_path=r"D:\chromedriver.exe")
        self.driver.get(self.url)
        # scroll through the current page so it loads fully
        time.sleep(2)
        self.move()
        # Paginate through the list: collect the film URLs from each page's HTML,
        # then click the next-page link. When the "next" span no longer contains
        # an <a> tag, this is the last page and the loop stops.
        while True:
            html = self.driver.page_source
            self.get_film_url(html)
            soup = BeautifulSoup(html,"html.parser")
            next_soup = soup.find("span",attrs = {"class":"next"})
            if next_soup.find("a") is None:
                break
            self.driver.find_element_by_css_selector('#content > div > div.article > div.paginator > span.next > a').click()
            self.move()
        # All detail-page URLs are collected; now process each film in turn.
        for num,url in enumerate(self.film_url):
            try:
                self.deal_url(url,num)
            except Exception as E:
                print("{} 电影信息获取出错,等待进行二次获取".format(self.film_name[num]))
                self.film_seq.append(num)
        for i in self.film_seq:
            try:
                print("尝试二次获取电影:{}的信息".format(self.film_name[i]))
                self.deal_url(self.film_url[i], i)
            except Exception as e:
                print("{} 电影二次获取信息仍然失败".format(self.film_name[i]))
    
Sd = Spide_douban()
Sd.run()

Douban's anti-scraping mechanism works by banning your IP if you access the site too frequently; after that, you have to log in before you can visit it again.
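
The easiest mitigation is simply to slow the crawl down and reuse one connection. Below is a minimal sketch of that idea (the helper name polite_get, the delay bounds and the timeout are illustrative assumptions, not part of the original script): a single requests.Session carries the same user-agent header, and every request waits a random interval first so the access pattern is less bursty.

import random
import time

import requests

session = requests.Session()
session.headers.update({
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"
})

def polite_get(url, min_delay=2.0, max_delay=5.0):
    # Illustrative delay bounds: sleep a random interval before each request
    # so the crawl does not hit Douban in rapid bursts.
    time.sleep(random.uniform(min_delay, max_delay))
    res = session.get(url, timeout=10)
    res.raise_for_status()  # surface HTTP errors (e.g. 403 after a ban) right away
    return res

Swapping the plain requests.get calls in deal_url for a helper like this makes the crawl slower, but far less likely to trigger the ban.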

Source: https://blog.csdn.net/qq_44805233/article/details/121055945
