ICode9

精准搜索请尝试: 精确搜索
首页 > 其他分享> 文章详细

爬取汽车网站汽车数据

2021-12-01 09:02:19  阅读:189  来源: 互联网

标签:网站 tw li 爬取 汽车 car div jsx class


from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import datetime
import openpyxl
import re
import time
import os


def get_connect():
    """Launch a headless Firefox, open the dongchedi car-library page, and return the driver.

    Returns:
        selenium.webdriver.Firefox: a driver already navigated to the listing
        page, with a 5-second implicit wait configured for element lookups.
    """
    firefox_options = Options()
    # Headless mode via a CLI argument: the `Options.headless` attribute is
    # deprecated and was removed in Selenium 4.10+, while "--headless" works
    # on both Selenium 3 and 4.
    firefox_options.add_argument("--headless")
    # `options=` replaces the `firefox_options=` keyword, which was deprecated
    # in Selenium 3.8 and removed in Selenium 4.
    browser = webdriver.Firefox(options=firefox_options)
    browser.get("https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x")
    # Implicit wait: every find_element* call polls up to 5 s before failing.
    browser.implicitly_wait(5)
    return browser


def parse_car_data():
    """Crawl dongchedi's car library and persist every brand/series row to xlsx.

    Walks the alphabet index (A, B, C, ...), clicks into each brand, then each
    body type (sedan / SUV / MPV), and scrapes every car series shown: name,
    price, and detail-page link. Collected rows are handed to save_car_data().

    Side effects: drives a live Firefox session (see get_connect) and writes
    an Excel file via save_car_data.
    """
    browser = get_connect()
    # Accumulator: one row per car series, appended as
    # [brand_id, series_id, brand, series, type, price, link, date].
    car_data = []
    # Sequential brand id, incremented once per brand crawled.
    car_brand_id = 1
    # Sequential car-series id, incremented once per series row.
    car_bank_id = 1
    # Letter tabs (A, B, C, ...) from the first <ul>; [2:] drops the first two
    # entries ("不限"/unlimited and "热门"/popular), which are not letters.
    lis = browser.find_elements_by_xpath("//div[@class='wrap tw-bg-white']//"
                                         "div[@class='jsx-1042301898 item-wrap']//"
                                         "div[@class='jsx-1042301898 item-list']//"
                                         "ul[@class='jsx-975855502 tw-flex md:tw-flex-none']//"
                                         "li")[2:]
    # Body-type filter tabs: sedan (轿车), SUV, MPV, etc.
    # NOTE(review): these elements are captured once and re-clicked after page
    # updates — assumes the DOM nodes stay attached; verify no stale-element
    # errors on long runs.
    car_type_spans = browser.find_elements_by_xpath("//div[@class='wrap tw-bg-white']//"
                                                    "section//"
                                                    "div[@class='jsx-964070570 tw-flex']//"
                                                    "ul[@class='jsx-964070570 tw-flex-1']//"
                                                    "li//"
                                                    "a[@class='jsx-964070570']//"
                                                    "span[@class='jsx-964070570 series-type_car-name__3pZLx']")
    # 1-based index of the brand <ul> that belongs to the current letter tab.
    index = 1
    for li in lis:
        li.click()
        # All brand entries listed under the current letter; the ul[index]
        # predicate selects the dropdown list matching this letter.
        brand_lis = browser.find_elements_by_xpath("//div[@class='wrap tw-bg-white']//"
                                                   "div[@class='jsx-1042301898 item-wrap']//"
                                                   "div[@class='jsx-1042301898 item-list']//"
                                                   "div[@class='jsx-1207899626 more-list-wrap']//"
                                                   "ul[" + str(index) + "]//li")
        index += 1
        for brand_li in brand_lis:
            brand_li.click()
            brand_name = brand_li.text
            print("{}品牌数据开始爬取---------->".format(brand_name))
            for car_type_span in car_type_spans:
                car_type_span.click()
                # Work around lazy loading: enlarging the window to a very
                # tall viewport forces all list items to render (alternative
                # to scrolling), then wait for the content to settle.
                browser.set_window_size(1000, 30000)
                time.sleep(3)

                car_type = car_type_span.text
                # Car-series cards for the current brand + body-type filter.
                car_bank_lis = browser.find_elements_by_xpath("//div[@class='wrap tw-bg-white']//"
                                                              "section//"
                                                              "div[@class='jsx-3448462877 list-wrap']//"
                                                              "ul[@class='jsx-3448462877 list tw-grid tw-grid-cols-12 tw-gap-x-12 tw-gap-y-16']//"
                                                              "li")
                car_bank_lis_len = len(car_bank_lis)
                if car_bank_lis_len == 0:
                    continue
                else:
                    # XPath li[...] positions are 1-based, hence range(1, len+1).
                    for car_bank_li in range(1, car_bank_lis_len + 1):
                        print("第{}个车系数据开始爬取---------->".format(car_bank_id))
                        # Series name from the card's title paragraph.
                        bank_name = browser.find_element_by_xpath("//div[@class='wrap tw-bg-white']//"
                                                                  "section//"
                                                                  "div[@class='jsx-3448462877 list-wrap']//"
                                                                  "ul[@class='jsx-3448462877 list tw-grid tw-grid-cols-12 tw-gap-x-12 tw-gap-y-16']//"
                                                                  "li[" + str(car_bank_li) + "]//"
                                                                  "a[@class='jsx-2744368201 item-link']//"
                                                                  "p[@class='jsx-2744368201 car-name']").text
                        # Price text as displayed (not parsed to a number).
                        car_price = browser.find_element_by_xpath("//div[@class='wrap tw-bg-white']//"
                                                                  "section//"
                                                                  "div[@class='jsx-3448462877 list-wrap']//"
                                                                  "ul[@class='jsx-3448462877 list tw-grid tw-grid-cols-12 tw-gap-x-12 tw-gap-y-16']//"
                                                                  "li[" + str(car_bank_li) + "]//"
                                                                  "a[@class='jsx-2744368201 item-link']//"
                                                                  "p[@class='jsx-2744368201 price']").text
                        # href of the card's second button link.
                        # NOTE(review): variable is named "image_src" but this
                        # reads an <a> href, not an <img> src — likely a link
                        # to the series picture page; confirm against the site.
                        car_image_src = browser.find_element_by_xpath("//div[@class='wrap tw-bg-white']//"
                                                                      "section//"
                                                                      "div[@class='jsx-3448462877 list-wrap']//"
                                                                      "ul[@class='jsx-3448462877 list tw-grid tw-grid-cols-12 tw-gap-x-12 tw-gap-y-16']//"
                                                                      "li[" + str(car_bank_li) + "]//"
                                                                      "div[@class='jsx-2682525847 button-wrap tw-grid tw-grid-cols-12 tw-gap-x-3']//"
                                                                      "a[2]").get_attribute("href")
                        car_data.append([car_brand_id, car_bank_id, brand_name, bank_name, car_type, car_price, car_image_src,get_time()])
                        car_bank_id += 1
            print("{}品牌数据爬取结束---------->".format(brand_name))
            car_brand_id += 1
    print("数据开始保存---------->")
    save_car_data(car_data)
    print("数据保存成功---------->")


def format_car_data(data):
    """Normalize a scraped text field.

    Removes every space character, then strips any parenthesized, braced, or
    bracketed segment — e.g. "宝马X5 (进口)" becomes "宝马X5".
    """
    compact = "".join(data.split(" "))
    # Non-greedy match inside each (), {} and [] pair so multiple
    # bracketed segments are each removed independently.
    return re.sub(r"\(.*?\)|\{.*?}|\[.*?]", "", compact)


def save_car_data(car_data):
    """Append scraped rows to today's xlsx file, creating it if needed.

    Args:
        car_data: list of row lists, each matching the header columns
            (brand id, series id, brand, series, type, price, link, date).

    The file is named ``../dataset/YYYY_MM_DD_car_data.xlsx``; a header row is
    written once on creation, and subsequent calls on the same day append.
    """
    path = "../dataset/" + get_time() + "_car_data.xlsx"
    # Fix: without this, the first save crashes with FileNotFoundError when
    # the dataset directory has not been created yet.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if not os.path.exists(path):
        wk = openpyxl.Workbook()
        sheet = wk.active
        header = ('品牌id', '车系id', '品牌', '车系', '类型', '价格', '图片链接', '截止时间')
        sheet.append(header)
        wk.save(path)
    # Reload-and-append keeps the header-only file intact when car_data is
    # empty, and supports multiple scrape runs on the same day.
    if car_data:
        wk = openpyxl.load_workbook(path)
        sheet = wk.active
        for item in car_data:
            sheet.append(item)
        wk.save(path)


def get_time():
    """Return today's local date as a ``YYYY_MM_DD`` string.

    Used both as the per-day xlsx filename prefix and as the
    "截止时间" (as-of date) column in each scraped row.
    """
    return format(datetime.datetime.now(), "%Y_%m_%d")


def start():
    """Entry point: run the full scrape-and-save pipeline once."""
    parse_car_data()


if __name__ == '__main__':
    # Run the scraper only when executed as a script, not when imported.
    start()

 

标签:网站,tw,li,爬取,汽车,car,div,jsx,class
来源: https://www.cnblogs.com/MoooJL/p/15627402.html

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有