Scraping Sohu Entertainment News

2021-06-03 09:05:18  Views: 230  Source: Internet



#-*-coding:utf-8-*-
# @Time :2021/4/22 7:08
# @Author:shuaichao
# @File :.py
# @Software: PyCharm
from bs4 import BeautifulSoup        # HTML parsing and data extraction
import re                            # regular expressions
import urllib.request, urllib.error  # build requests and fetch page data
import pymysql
import traceback
import time
import requests
import json
def askUrl(url):
    head={
        # "Cookie": "pgv_pvid = 2445437098;RK = IWJFENCj / 2;ptcz = 0dc31e9c452a0701259378ea4d93881f2a4d4ab7d29d637d6da1b0b24d857f4c;Qs_lvt_323937 = 1588214559;Qs_pv_323937 = 3783410537228747000;pgv_pvi = 5491528704;eas_sid = t196y05258V4B6g478m7t073P2;luin = o0775929901;lskey = 000100001264ed0bece633b72b741fb54e5137a729bfa3647db8a18c0ee96579fd05aff03206e6cafbeb0f88",
        # "Connection": "keep-alive",
        # "Cache-Control": "max-age = 0",
        # "Accept-Language": "zh - CN, zh;q = 0.9",
        # "Accept-Encoding": "gzip, deflate, br",
        # "Accept": "text / html, application / xhtml + xml, application / xml;q = 0.9, image / webp, image / apng, * / *;q = 0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
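
# --- Added sketch (not part of the original post) ----------------------------
# The script already imports `requests`; the same fetch could be written with
# it. This helper is an illustrative alternative to askUrl, not the author's
# code; the User-Agent string is simply reused from the dict above.
def askUrlWithRequests(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    try:
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        res.encoding = "utf-8"
        return res.text
    except requests.RequestException as exc:
        print(exc)
        return ""
# ------------------------------------------------------------------------------
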
def get_info(baseurl):
    html = askUrl(baseurl)
    bs = BeautifulSoup(html, "html.parser")
    return bs
# find elements by class and return both the result set and its string form
def transport(bs, info):
    ex_info = bs.find_all(class_=info)
    info = str(ex_info)
    return ex_info, info
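
# --- Added usage note (not part of the original post) ------------------------
# transport() is only referenced by the commented-out image-extraction code in
# __main__ below; a minimal usage sketch, assuming a page whose images sit in
# <p class="ql-align-center"> blocks:
#
#   ex_info, info = transport(bs, "ql-align-center")
#   imgs = re.findall(r'<img max-width="600" src="(.*?)"/>', info)
# ------------------------------------------------------------------------------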
def get_conn():
    conn = pymysql.connect(
        host="localhost",
        user="root",
        passwd="qwer1234",
        db="news",
        charset="utf8mb4"
    )
    cursor = conn.cursor()
    return conn, cursor
# close the database connection
def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()
# insert the scraped news into the database
def update_news(allinfo):
    cursor = None
    conn = None
    try:
        conn, cursor = get_conn()
        sql = "insert into new(title, article, type) values(%s,%s,%s)"
        print(f"{time.asctime()}开始更新最新数据")
        for item in allinfo:
            cursor.execute(sql, item)
        conn.commit()
        print(f"{time.asctime()}更新最新数据完毕")
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
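
# --- Added note (assumption, not from the original post) ---------------------
# update_news expects a `news` database containing a table roughly like the
# one below; the post does not show the schema, so the column types are a
# guess based on the INSERT statement and the length check in __main__:
#
#   CREATE TABLE new (
#       id      INT AUTO_INCREMENT PRIMARY KEY,
#       title   VARCHAR(255),
#       article TEXT,
#       type    VARCHAR(32)
#   ) DEFAULT CHARSET = utf8mb4;
# ------------------------------------------------------------------------------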

if __name__=="__main__":
    head = {
        # "Cookie": "pgv_pvid = 2445437098;RK = IWJFENCj / 2;ptcz = 0dc31e9c452a0701259378ea4d93881f2a4d4ab7d29d637d6da1b0b24d857f4c;Qs_lvt_323937 = 1588214559;Qs_pv_323937 = 3783410537228747000;pgv_pvi = 5491528704;eas_sid = t196y05258V4B6g478m7t073P2;luin = o0775929901;lskey = 000100001264ed0bece633b72b741fb54e5137a729bfa3647db8a18c0ee96579fd05aff03206e6cafbeb0f88",
        # "Connection": "keep-alive",
        # "Cache-Control": "max-age = 0",
        # "Accept-Language": "zh - CN, zh;q = 0.9",
        # "Accept-Encoding": "gzip, deflate, br",
        # "Accept": "text / html, application / xhtml + xml, application / xml;q = 0.9, image / webp, image / apng, * / *;q = 0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    # all article URLs
    linkall = []
    # the JSON feed URLs that list the news
    linkJQ = []
    # article ids taken from the feeds
    Linkid = []
    # author ids taken from the feeds
    LinkAid = []
    # all titles
    allTitle = []
    # all article bodies
    allArticle = []
    # all image links
    allImg = []
    # the rows to be inserted into MySQL
    allinfo = []
    # build the URL of each JSON feed page
    for i in range(1,9):
        linkJQ.append(
            'https://cis.sohu.com/cis/feeds?clientType=3&pvId=1619446844595FQymAut&requestId=2011032041009993_1619446844595&refererSpm=smpc.ch19.fd&sceneParam=[{%22spm%22:%22smpc.ch19.fd%22,%22page%22:' + str(
                i) + ',%22size%22:25}]')
        res = requests.get(linkJQ[i - 1], headers=head)
        response_data = json.loads(res.text)
        # store each article's id and authorId
        for index, value in enumerate(response_data['smpc.ch19.fd']['data']):
            if int(response_data['smpc.ch19.fd']['data'][index]['resourceData']['id']) > 1000000:
                Linkid.append(response_data['smpc.ch19.fd']['data'][index]['resourceData']['id'])
                LinkAid.append(str(response_data['smpc.ch19.fd']['data'][index]['resourceData']['contentData']['authorId']))
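
    # --- Added note (assumption, not from the original post) -----------------
    # Based on the keys accessed above, each feed response is expected to look
    # roughly like:
    #   {"smpc.ch19.fd": {"data": [
    #       {"resourceData": {"id": 466123456,
    #                         "contentData": {"authorId": 120123456}}},
    #       ...
    #   ]}}
    # The id > 1000000 check appears to filter out entries that are not
    # regular articles.
    # --------------------------------------------------------------------------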

    # build the full URL for every article
    for index,value in enumerate(Linkid):
        linkall.append('https://www.sohu.com/a/'+str(Linkid[index])+'_'+str(LinkAid[index])+'?scm=1004.768163804164063232.0.0.4162&spm=smpc.travel-home.feed.5.1619267001122I92VC4c')
    # the last link is an ad, so drop it
    linkall.pop()
    # scrape the title and body from each article page
    for index, value in enumerate(linkall):
        bs = get_info(value)
        title = bs.select("h1")
        article = bs.select("article > p")
        if title and article:
            content = ''
            # build the article body (concatenating from the second <p> onward)
            for item in range(1, len(article)):
                content += article[item].get_text()
            if len(content) * 4 > 21000:
                print("article too long to store, skipping")
                continue
            # article = article[0].get_text().replace("返回搜狐,查看更多", "").replace("责任编辑:", "").replace(r"\n", "")
            allArticle.append(content.replace("返回搜狐,查看更多", "").replace("责任编辑:", ""))
            # append the title
            allTitle.append(title[0].get_text().strip().replace("原创", "").replace("\n", ""))
            print(index)
            print(value)
            print(title[0].get_text().strip().replace("原创", ""))
            # append image links to the overall image list
            # ex_info, info = transport(bs, "ql-align-center")
            # findImg = re.compile(r'<p class="ql-align-center"><img max-width="600" src="(.*?)"/></p>')
            # Img = re.findall(findImg, info)
            # if Img:
            #     allImg.append(Img)
            # else:
            #     allImg.append("")
        else:
            print(index)
            print(value)
    # for item in linkall:
    #     allinfo.append([item])
    for index, value in enumerate(allTitle):
        allinfo.append([value])
        allinfo[index].append(allArticle[index])
        allinfo[index].append('娱乐')
    update_news(allinfo)

 

Source: https://www.cnblogs.com/chaogehahaha/p/14843715.html
