爬虫_scrapy_某瓜视频

2022-06-02 13:31:24 阅读：126 来源： 互联网

标签：ixigua self settings 爬虫某瓜 item scrapy https

1.settings.py

# Scrapy settings for scrapy_ixigua project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# Name of the bot implemented by this Scrapy project.
BOT_NAME = 'scrapy_ixigua'

# Modules in which Scrapy looks for spiders, and the module in which
# `scrapy genspider` creates new ones.
SPIDER_MODULES = ['scrapy_ixigua.spiders']
NEWSPIDER_MODULE = 'scrapy_ixigua.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# The User-Agent can be set here or in DEFAULT_REQUEST_HEADERS (whichever you prefer).
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# With COOKIES_ENABLED = False Scrapy's own cookie handling is switched off, so
# the raw 'cookie' header defined in DEFAULT_REQUEST_HEADERS is sent as-is.
# Rule of thumb:
#   - cookies supplied per request / managed by Scrapy -> COOKIES_ENABLED = True
#   - fixed cookie string taken from these settings     -> COOKIES_ENABLED = False
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

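# Override the default request headers:
# NOTE(review): the 'cookie' value below looks like a session cookie captured
# from a browser; it will presumably expire and should not be committed to a
# public repository -- refresh it (or load it from the environment) before
# crawling.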
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  'Accept-Language': 'zh-CN,zh;q=0.9',
  'cookie': 'ttcid=bcb6f1be446c4cd582bb5e23d160443d30; MONITOR_WEB_ID=aa2d700f-6664-4ab9-8a72-fa92af0e86d2; s_v_web_id=verify_l3lbglna_w7Mnw0NE_fqKL_4EkF_8q0i_zGaLOrhiCfb0; _tea_utm_cache_1300=undefined; ixigua-a-s=1; BD_REF=1; support_webp=true; support_avif=true; __ac_signature=_02B4Z6wo00f01j1kb-QAAIDCvWaVpRGIALo9QGtAAO3PkbI7CARXRmXsKOtUBp5FiKR-ArNnnFR4Ezbs2vWvO5pjf6IOKkDAboJ8FvY6BaIKH4b19oW9uJWNAdjll476BJginerDQG2.16UH1b; msToken=bn5b2TKGS6jRsq0K1lR8Pbrt4H29ghi0bSrvVtA0vq2jAFpFmo_SPzLJarFJSfseQPaQU6n-nZh2MVkE8SSMg8kq-yDOFhx6Ymkiazf7S8TzCmp6ujj1ctKptJ1w0G-Z; tt_scid=EpID885-K-4MtEOFevGSniewf1jbhDnQcY--yonCIQp.ou2Q4jqV9AyHq4rXyDtGf0f9; ttwid=1|EXTC5VvDMV8v4FXyn9V_k2BQi6I_NEtP38oaZVfrlN4|1654134611|625dc6664875869927cd8af09d64e49589b6108f556cde317de886b3c88e3633',
  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapy_ixigua.middlewares.ScrapyIxiguaSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'scrapy_ixigua.middlewares.ScrapyIxiguaDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Database connection settings read by MysqlPipeline.
# The setting names must be written in upper case.
DB_HOST = '127.0.0.1'
DB_PORT = 3306
# Placeholders: replace with the real database account and password.
DB_USER = '数据库账户'
DB_PASSWORD = '数据库密码'
DB_NAME = 'spider01'
# The charset must be written without the hyphen ('utf8', not 'utf-8'),
# otherwise connecting raises an error.
# NOTE(review): MySQL's 'utf8' is the 3-byte utf8mb3 encoding; consider
# 'utf8mb4' if titles may contain emoji -- confirm against the table schema.
DB_CHARSET = 'utf8'

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# The integer is the pipeline's order value: per the Scrapy documentation
# items pass through the pipelines from lower to higher values, so the
# thumbnail download (300) runs before the MySQL insert (301).
ITEM_PIPELINES = {
   'scrapy_ixigua.pipelines.ScrapyIxiguaPipeline': 300,
   'scrapy_ixigua.pipelines.MysqlPipeline': 301
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

2.pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


import os
import urllib.request

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
# Pipeline: download each item's thumbnail image.
class ScrapyIxiguaPipeline:
    """Save the cover image referenced by ``item['src']`` below ``./src/``.

    The file is named after the item's title, with characters that are not
    allowed in file names replaced by underscores.
    """

    # Directory (relative to the working directory) that receives the images.
    image_dir = './src'
    # Path separators, characters Windows forbids in file names, and
    # whitespace control characters are all mapped to '_'.
    _unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|\r\n\t'})

    def process_item(self, item, spider):
        """Download the thumbnail of *item* and pass the item on unchanged.

        Args:
            item: Mapping-like item providing ``src`` (image URL) and
                ``title`` (used as the file name).
            spider: The running spider (unused).

        Returns:
            The same *item*, so later pipelines still receive it.

        Items lacking ``src`` or ``title`` are passed through without a
        download. Errors raised by the download itself are not caught.
        """
        url = item.get('src')
        title = item.get('title')
        if not url or not title:
            return item

        # The target directory is not guaranteed to exist on the first run.
        os.makedirs(self.image_dir, exist_ok=True)

        # A raw title may contain '/' or other characters that would point
        # the path somewhere else or make it invalid.
        safe_title = title.translate(self._unsafe_chars).strip() or 'untitled'
        filename = self.image_dir + '/' + safe_title + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item

#加载settings文件
from scrapy.utils.project import get_project_settings
#导入pymysql
import pymysql
# Pipeline: persist items to MySQL.
class MysqlPipeline:
    """Insert every scraped item into the ``ixigua`` MySQL table.

    The connection parameters are read from the ``DB_*`` project settings
    when the spider opens; the connection is closed when it closes.
    """

    # Parameterized statement: the driver escapes the values, so quotes in a
    # title or user name can neither break the statement nor inject SQL.
    insert_sql = (
        'insert into ixigua(src,title,user,play,mv,userimg,duration) '
        'values(%s,%s,%s,%s,%s,%s,%s)'
    )

    def open_spider(self, spider):
        """Load the database settings and open the connection."""
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.database = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.connect()

    def connect(self):
        """Create ``self.conn`` and ``self.cursor`` from the loaded settings."""
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            # `db` is a deprecated alias of `database` in PyMySQL.
            database=self.database,
            charset=self.charset,
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert *item* as one row, commit, and return the item unchanged.

        The item's ``img`` field is stored in the ``userimg`` column.
        """
        params = (
            item['src'],
            item['title'],
            item['user'],
            item['play'],
            item['mv'],
            item['img'],
            item['duration'],
        )
        # Values are passed separately from the SQL text so the driver
        # performs the quoting.
        self.cursor.execute(self.insert_sql, params)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection."""
        self.cursor.close()
        self.conn.close()

3.items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyIxiguaItem(scrapy.Item):
    """Fields collected for one video card."""

    src = scrapy.Field()       # URL of the video's preview image
    title = scrapy.Field()     # video title
    user = scrapy.Field()      # author name
    play = scrapy.Field()      # play count
    mv = scrapy.Field()        # URL of the video itself
    img = scrapy.Field()       # URL of the author's avatar
    duration = scrapy.Field()  # video duration

4.ixigua.py

import scrapy
from scrapy_ixigua.items import ScrapyIxiguaItem

class IxiguaSpider(scrapy.Spider):
    """Scrape the video cards listed on the ixigua.com start page."""

    name = 'ixigua'
    allowed_domains = ['www.ixigua.com']
    # Alternative entry point (a search-result page):
    # start_urls = ['https://www.ixigua.com/search/广场舞/?logTag=da15511cf98419ebe3b8&tab_name=search']
    start_urls = ['https://www.ixigua.com/']

    def parse(self, response):
        """Yield one ``ScrapyIxiguaItem`` per feed card found in *response*.

        Cards that lack a video link, a title or a cover image are skipped
        instead of aborting the whole page. All URLs are resolved against
        the response URL, so relative, protocol-relative and already
        absolute values are handled the same way.
        """
        cards = response.xpath(
            '//div[@class="FeedContainer__items"]'
            '/div[@class="FeedContainer__itemWrapper"]'
        )

        for card in cards:
            # Anchor wrapping the cover: carries link, title, image, duration.
            cover_link = card.xpath(
                './/div[@class="HorizontalFeedCard__coverContainer"]/a'
            )
            # Wrapper holding the author and statistics.
            content = card.xpath(
                './/div[@class="HorizontalFeedCard__contentWrapper"]'
            )

            href = cover_link.xpath('./@href').get()
            title = cover_link.xpath('./@title').get()
            cover_src = cover_link.xpath('.//img/@src').get()
            if not (href and title and cover_src):
                # A missing node yields None; concatenating it to a URL
                # prefix would raise TypeError and stop the generator.
                self.logger.debug(
                    'Skipping feed card without link, title or cover image'
                )
                continue

            avatar_src = content.xpath('.//img/@src').get()

            # Hand the item over to the pipelines.
            yield ScrapyIxiguaItem(
                mv=response.urljoin(href),
                title=title,
                src=response.urljoin(cover_src),
                duration=cover_link.xpath('.//span/text()').get(),
                img=response.urljoin(avatar_src) if avatar_src else None,
                user=content.xpath(
                    './/a[@class="user__name"]/text()'
                ).get(),
                play=content.xpath(
                    './/span[@class="HorizontalFeedCard-accessories-bottomInfo__statistics"]/text()'
                ).get(),
            )

建立一个启动文件，用于定时执行爬虫

import time
import os

# Seconds to wait between two crawls. 10 s is convenient while testing;
# use 24 * 60 * 60 (86400) to run once a day.
CRAWL_INTERVAL_SECONDS = 10


def main():
    """Run ``scrapy crawl ixigua`` forever, pausing between the runs.

    After each run the outcome and the number of runs so far are printed;
    the exit status of the command decides whether the run counts as
    successful.
    """
    print('tuwner_spider running')
    run_count = 0
    while True:
        exit_status = os.system("scrapy crawl ixigua")
        # Count the run that just finished, so the first report shows 1.
        run_count += 1
        if exit_status == 0:
            print('Crawl successful,【' + str(run_count) + '】')
        else:
            print('Crawl failed with exit status ' + str(exit_status)
                  + ',【' + str(run_count) + '】')
        time.sleep(CRAWL_INTERVAL_SECONDS)


if __name__ == '__main__':
    main()

代码地址：https://gitee.com/heating-cloud/python_spider.git

标签：ixigua,self,settings,爬虫,某瓜,item,scrapy,https
来源： https://www.cnblogs.com/ckfuture/p/16336947.html

本站声明： 1. iCode9 技术分享网（下文简称本站）提供的所有内容，仅供技术学习、探讨和分享；
2. 关于本站的所有留言、评论、转载及引用，纯属内容发起人的个人观点，与本站观点和立场无关；
3. 关于本站的所有言论和文字，纯属内容发起人的个人观点，与本站观点和立场无关；
4. 本站文章均是网友提供，不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属；如您发现该文章侵犯了您的权益，可联系我们第一时间进行删除；
5. 本站为非盈利性的个人网站，所有内容不会用来进行牟利，也不会利用任何形式的广告来间接获益，纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

ICode9

爬虫_scrapy_某瓜视频

1.settings.py

2.pipelines.py

3.items.py

4.ixigua.py