
Python crawler: scrape Sohu Video movies and store them in a MySQL database

2021-05-15 19:35:53



Code:

import time
import traceback
import requests
from bs4 import BeautifulSoup
import pymysql
# Connect to the database and return the connection and a cursor
def get_conn():
    """
    :return: connection, cursor
    """
    # Create the connection
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="000429",
                           db="movierankings",
                           charset="utf8")
    # Create the cursor; result sets are returned as tuples by default
    cursor = conn.cursor()
    if conn is not None and cursor is not None:
        print("Database connected and cursor created!")
    else:
        print("Database connection failed!")
    return conn, cursor

# Close the database connection and cursor
def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()
    return 1

def get_souhu():
    # Top rated
    url = 'https://film.sohu.com/list_0_0_0_2_2_1_60.html?channeled=1200100000'
    # Newest releases
    new_url = 'https://film.sohu.com/list_0_0_0_2_1_1_60.html?channeled=1200100000'
    # Hot this week
    week_url = 'https://film.sohu.com/list_0_0_0_2_0_1_60.html?channeled=1200100000'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }

    # Initialize the lists
    templist = []
    dataRes = []
    # Top rated
    for i in range(1, 31):
        url_1 = 'https://film.sohu.com/list_0_0_0_2_2_'
        auto = str(i)
        url_2 = '_60.html?channeled=1200100000'
        url = url_1 + auto + url_2
        # Pass headers as a keyword argument (the second positional
        # argument of requests.get is params, not headers)
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        page_text = response.text
        # Collect all the <li> elements
        soup = BeautifulSoup(page_text, 'lxml')
        # CSS child selector on the movie list
        li_list = soup.select('.movie-list>li')
        print(len(li_list))
        if len(li_list) == 0:
            print("Finished scraping the top-rated list!")
            if len(dataRes) != 0:
                return dataRes
        for li in li_list:
            # li is already a Tag, so search it directly instead of
            # re-parsing str(li) with a second BeautifulSoup pass
            name = li.find('div', class_="v_name_info").text
            # Add the name
            templist.append(name)
            # Add the score, keeping only the numeric part of the text
            score = li.find('span', class_='v_score').text
            score = score[-4:-1]
            templist.append(score)
            # Add the detail-page path
            path = li.find('a', target="_blank")['href']
            templist.append(path)
            # Add the playback state
            state = "VIP"
            templist.append(state)
            print(templist)
            dataRes.append(templist)
            templist = []
        print("-------------------------------------------")
    # The "newest releases" and "hot this week" categories follow the same
    # pattern as the loop above; only the URL prefix and the end-of-scrape
    # message change:
    #   newest releases: https://film.sohu.com/list_0_0_0_2_1_
    #   hot this week:   https://film.sohu.com/list_0_0_0_2_0_

    # Optional de-duplication of dataRes:
    # new_list = []
    # for item in dataRes:
    #     if item not in new_list:
    #         new_list.append(item)
    # dataRes = new_list
    return dataRes

# Insert into the database
def insert_souhu():
    cursor = None
    conn = None
    try:
        count = 0
        movies = get_souhu()  # avoid shadowing the built-in name `list`
        print(f"{time.asctime()} Start inserting Sohu movie data")
        conn, cursor = get_conn()
        sql = "insert into moviesohu (id,name,score,path,state) values(%s,%s,%s,%s,%s)"
        for item in movies:
            print(item)
            count = count + 1
            # Catch the exception so a duplicate primary key does not abort the batch
            try:
                # id is passed as 0 so MySQL assigns the auto-increment value
                cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])
            except pymysql.err.IntegrityError:
                print("Duplicate! Skipping!")
        conn.commit()  # commit the transaction (needed for update/delete/insert)
        print(f"{time.asctime()} Finished inserting Sohu movie data")
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

if __name__ == '__main__':
    insert_souhu()
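
Since the three category loops differ only in their URL prefix, they can be folded into a single helper. A minimal sketch (the function name scrape_category and its parameters are illustrative, not part of the original script; it reuses the imports, headers dict, and page structure from above):

def scrape_category(prefix, label, headers, max_pages=30):
    # Scrape one Sohu film category whose list pages share a URL prefix,
    # e.g. 'https://film.sohu.com/list_0_0_0_2_2_' for the top-rated list
    dataRes = []
    for i in range(1, max_pages + 1):
        url = prefix + str(i) + '_60.html?channeled=1200100000'
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')
        li_list = soup.select('.movie-list>li')
        if len(li_list) == 0:
            print("Finished scraping " + label + "!")
            break
        for li in li_list:
            name = li.find('div', class_="v_name_info").text
            score = li.find('span', class_='v_score').text[-4:-1]
            path = li.find('a', target="_blank")['href']
            dataRes.append([name, score, path, "VIP"])
    return dataRes

Calling it three times with the prefixes list_0_0_0_2_2_, list_0_0_0_2_1_, and list_0_0_0_2_0_ covers all three categories without the duplicated loops.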

Run screenshot

Database screenshot
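
In place of a screenshot, the inserted rows can also be checked programmatically. A minimal sketch reusing get_conn and close_conn from the script above:

# Quick sanity check of the scraped rows
conn, cursor = get_conn()
cursor.execute("SELECT COUNT(*) FROM moviesohu")
print("rows:", cursor.fetchone()[0])
cursor.execute("SELECT id, name, score, path, state FROM moviesohu LIMIT 5")
for row in cursor.fetchall():
    print(row)
close_conn(conn, cursor)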

 

Table creation statement

CREATE TABLE `moviesohu` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `name` VARCHAR(45) COLLATE utf8_bin NOT NULL,
  `score` VARCHAR(45) COLLATE utf8_bin NOT NULL,
  `path` VARCHAR(100) COLLATE utf8_bin NOT NULL,
  `state` VARCHAR(10) COLLATE utf8_bin NOT NULL,
  PRIMARY KEY (`name`),
  KEY `id` (`id`)
) ENGINE=INNODB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
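
Because the primary key is `name`, re-running the crawler triggers the IntegrityError that insert_souhu catches and skips. If you would rather refresh existing rows on each run, MySQL's INSERT ... ON DUPLICATE KEY UPDATE can replace the try/except. A sketch of the variant statement for the insert loop (an alternative, not the original code):

# Upsert variant: update score/path/state when the movie name already exists
sql = ("insert into moviesohu (id,name,score,path,state) "
       "values(%s,%s,%s,%s,%s) "
       "on duplicate key update "
       "score=values(score), path=values(path), state=values(state)")
cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])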
