使用Python抓取Google Play应用程序

2022-10-29 13:52:00 阅读：315 来源： 互联网

完整代码

如果不需要说明，请查看联机 IDE 中的完整代码示例。

import time, json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from parsel import Selector

# Accumulator for all scraped results. The 'Top charts' entry is pre-seeded
# with one empty list per chart; the scraping functions append app records
# to these lists and add further top-level keys.
google_play_apps = {
    'Top charts': {
        chart_name: [] for chart_name in ('Top free', 'Top grossing', 'Top paid')
    },
}


def scroll_page(url, scroll_timeout=300):
    """Open *url* in headless Chrome, expand the page and scrape the top charts.

    The page is scrolled to the bottom repeatedly until the element matching
    ``.snByac`` can be clicked. The three top-chart tabs are then scraped with
    ``scrape_top_charts`` and the final page source is returned for parsing.

    Args:
        url: Address of the page to load.
        scroll_timeout: Maximum number of seconds to keep scrolling while the
            ``.snByac`` element cannot be clicked.

    Returns:
        A ``parsel.Selector`` built from the page source after all clicks.

    Raises:
        TimeoutError: If the ``.snByac`` element could not be clicked within
            ``scroll_timeout`` seconds.
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import block.
    from selenium.common.exceptions import WebDriverException

    service = Service(ChromeDriverManager().install())

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--lang=en")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")
    options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)

        # NOTE(review): WebDriverWait takes its timeout in seconds; 10000 was
        # possibly meant as milliseconds -- confirm before lowering it.
        deadline = time.monotonic() + scroll_timeout
        while True:
            try:
                driver.execute_script("document.querySelector('.snByac').click();")
                WebDriverWait(driver, 10000).until(EC.visibility_of_element_located((By.TAG_NAME, 'body')))
                break
            except WebDriverException as err:
                # Only Selenium errors mean "not clickable yet"; anything else
                # (including Ctrl-C) must propagate instead of looping forever.
                if time.monotonic() >= deadline:
                    raise TimeoutError(
                        f"'.snByac' could not be clicked within {scroll_timeout} seconds"
                    ) from err
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                WebDriverWait(driver, 10000).until(EC.visibility_of_element_located((By.TAG_NAME, 'body')))

        # Raw strings: the selectors contain a literal backslash before '|'.
        scrape_top_charts(driver=driver, chart='Top free', button_selector=r'#ct\|apps_topselling_free .ypTNYd')
        scrape_top_charts(driver=driver, chart='Top grossing', button_selector=r'#ct\|apps_topgrossing .ypTNYd')
        scrape_top_charts(driver=driver, chart='Top paid', button_selector=r'#ct\|apps_topselling_paid .ypTNYd')

        return Selector(driver.page_source)
    finally:
        # Always shut the browser down, even when scraping raised.
        driver.quit()


def scrape_top_charts(driver, chart, button_selector):
    """Click one top-chart button and record the apps listed afterwards.

    Each app found under ``.itIJzb`` is appended as a dict to
    ``google_play_apps['Top charts'][chart]``. Fields whose element is missing
    from the page are stored as ``None`` instead of raising.

    Args:
        driver: Selenium WebDriver with the page already loaded.
        chart: Key inside ``google_play_apps['Top charts']`` to append to.
        button_selector: CSS selector of the button to click.
    """
    button = driver.find_element(By.CSS_SELECTOR, button_selector)
    # The click is dispatched through JavaScript rather than WebElement.click().
    driver.execute_script("arguments[0].click();", button)
    time.sleep(2)  # fixed pause before the page source is read
    selector = Selector(driver.page_source)

    for result in selector.css('.itIJzb'):
        href = result.css('::attr(href)').get()
        rating = result.css('.CKzsaf .w2kbF::text').get()
        srcset = result.css('.stzEZd::attr(srcset)').get()

        google_play_apps['Top charts'][chart].append({
            'title': result.css('.OnEJge::text').get(),
            'link': ('https://play.google.com' + href) if href is not None else None,
            'category': result.css('.ubGTjb .sT93pb.w2kbF:not(.K4Wkre)::text').get(),
            # Same guard as scrape_all_sections: keep the empty value if unrated.
            'rating': float(rating) if rating else rating,
            # Drop the ' 2x' density descriptor so only the image URL remains.
            'thumbnail': srcset.replace(' 2x', '') if srcset is not None else None,
        })


def scrape_all_sections(selector):
    """Collect the apps of every ``section`` element and print all results.

    For each ``section`` a list is stored in ``google_play_apps`` under the
    section title, holding one dict per ``.UVEnyf`` app. Fields whose element
    is missing are stored as ``None`` instead of raising. Finally the whole
    ``google_play_apps`` dict is printed as indented JSON.

    Args:
        selector: ``parsel.Selector`` of the fully loaded page.
    """
    for section in selector.css('section'):
        section_title = section.css('.kcen6d span::text').get()
        # NOTE(review): this assignment replaces any existing entry with the
        # same key (including 'Top charts'), and a missing title yields the
        # key None -- confirm neither can occur on the target page.
        google_play_apps[section_title] = []

        for app in section.css('.UVEnyf'):
            href = app.css('.Si6A0c::attr(href)').get()
            rating = app.css('.LrNMN::text').get()
            srcset = app.css('.Q8CSx::attr(srcset)').get()

            google_play_apps[section_title].append({
                'title': app.css('.Epkrse::text').get(),
                'link': ('https://play.google.com' + href) if href is not None else None,
                'rating': float(rating) if rating else rating,
                # Drop the ' 2x' density descriptor so only the image URL remains.
                'thumbnail': srcset.replace(' 2x', '') if srcset is not None else None,
            })

    print(json.dumps(google_play_apps, indent=2, ensure_ascii=False))


def scrape_google_play_apps():
    """Build the Google Play apps URL, load the page and scrape every section.

    The query string is generated from the whole ``params`` dict, so adding
    or changing an entry there is enough to change the request.
    """
    from urllib.parse import urlencode

    params = {
        'device': 'phone',  # device filter passed to the store page
        'hl': 'en_GB',      # language
        'gl': 'US',         # country of the search
    }

    # urlencode also percent-encodes values, which the hand-built string did not.
    url = f"https://play.google.com/store/apps?{urlencode(params)}"

    result = scroll_page(url)
    scrape_all_sections(result)


# Start scraping only when this file is executed directly, not when imported.
if __name__ == "__main__":
    scrape_google_play_apps()

准备工作

安装库：

pip install parsel selenium webdriver webdriver_manager

减少被阻止的机会

确保在请求标头中使用 `user-agent`，以充当“真实”用户访问。因为 `requests` 默认的 `user-agent` 是 `python-requests`，网站能够识别出这很可能是脚本发送的请求。请检查您的 `user-agent` 是什么。

有一篇介绍如何减少网络抓取时被阻止几率的博客文章，可以让您熟悉基本方法和更高级的方法。

代码说明

导入库：

import time, json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from parsel import Selector

库	用途
`time`	在 Python 中处理时间。
`json`	将提取的数据转换为 JSON 对象。
`webdriver`	像用户一样在本地或使用 Selenium 服务器的远程计算机上本地驱动浏览器。
`Service`	用于管理 ChromeDriver 的启动和停止。
`By`	提供一组受支持的定位器策略（By.ID、By.TAG_NAME、By.XPATH 等）。
`WebDriverWait`	仅在需要时等待。
`expected_conditions`	包含一组用于 WebDriverWait 的预定义条件。
`Selector`	具有完整 XPath 和 CSS 选择器的 XML/HTML 解析器。

定义字典结构：

# Accumulator for all scraped results. The 'Top charts' entry is pre-seeded
# with one empty list per chart; the scraping functions append app records
# to these lists and add further top-level keys.
google_play_apps = {
    'Top charts': {
        chart_name: [] for chart_name in ('Top free', 'Top grossing', 'Top paid')
    },
}

顶级代码环境

在函数的开头，定义了用于生成 `URL` 的参数。如果要将其他参数传递给 URL，可以使用 `params` 字典执行此操作。参数会影响输出结果：

# Query parameters that are placed into the Google Play URL.
params = {
    'device': 'phone',  # device filter passed to the store page
    'hl': 'en_GB',      # language
    'gl': 'US',         # country of the search
}

接下来，将 URL 传递给 `scroll_page(URL)` 函数以滚动页面并获取所有数据。此函数返回的结果将传递给 `scrape_all_sections(result)` 函数以提取必要的数据。这些函数的说明见下面的相应标题。

此代码遵循普遍接受的惯例，即使用 `if __name__ == "__main__"` 结构：

def scrape_google_play_apps():
    """Build the Google Play apps URL, load the page and scrape every section.

    The query string is generated from the whole ``params`` dict, so adding
    or changing an entry there is enough to change the request.
    """
    from urllib.parse import urlencode

    params = {
        'device': 'phone',  # device filter passed to the store page
        'hl': 'en_GB',      # language
        'gl': 'US',         # country of the search
    }

    # urlencode also percent-encodes values, which the hand-built string did not.
    url = f"https://play.google.com/store/apps?{urlencode(params)}"

    result = scroll_page(url)
    scrape_all_sections(result)


# Start scraping only when this file is executed directly, not when imported.
if __name__ == "__main__":
    scrape_google_play_apps()

仅当用户直接运行此文件时，该条件才成立，函数才会被调用。如果用户将此文件作为模块导入到另一个文件中，则该条件不成立，抓取不会自动执行。

标签：python,GooglePlay,应用程序,代码,网站,数据
来源：

本站声明： 1. iCode9 技术分享网（下文简称本站）提供的所有内容，仅供技术学习、探讨和分享；
2. 关于本站的所有留言、评论、转载及引用，纯属内容发起人的个人观点，与本站观点和立场无关；
3. 关于本站的所有言论和文字，纯属内容发起人的个人观点，与本站观点和立场无关；
4. 本站文章均是网友提供，不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属；如您发现该文章侵犯了您的权益，可联系我们第一时间进行删除；
5. 本站为非盈利性的个人网站，所有内容不会用来进行牟利，也不会利用任何形式的广告来间接获益，纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

ICode9

使用Python抓取Google Play应用程序

完整代码

准备工作

代码说明

顶级代码环境