Scraping Data from a Certain APP

2021-10-15



It's been a long time since I last wrote a blog post; maybe I'm still in a daze, drifting through the days. Recently I wrote a crawler script to scrape user/vendor data from a certain APP. There are 120K+ records, and with the sleep intervals added a full run takes a very long time, so I split the work into two scripts: the first grabs the request parameters from the first-level pages, the second grabs the detailed data from the second-level pages. The two could also be merged, with the captured request parameters kept in a list that the second stage loops over.

Frequent scraping at this volume will inevitably trigger anti-crawling measures, so we need a proxy IP pool. Frankly, scraping free proxy IPs is a pure waste of time: the usable IPs are pitifully few, and that approach is only good for toy projects. In real work, if you are scraping data for a company, buy a proxy plan. The proxy vendor's API (highlighted in red in the original screenshot; it appears as gurl in the code below) is called to fetch proxy IPs; the fetched IPs go into the pool, and the status code of a test page is used to weed out the dead ones and keep the usable ones, as sketched right below.

(I forgot to mention: to capture the APP's traffic you need Fiddler to find the request-header parameters. Install and register the APP on your own phone, connect the phone to the same Wi-Fi network as your computer, and point the Wi-Fi proxy settings at your computer's IP; when you open the APP, its requests will then show up in Fiddler.)
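A minimal sketch of that fetch-and-validate idea, assuming a vendor API that returns one ip:port per call; PROXY_API and TEST_URL here are placeholders, not the real endpoints:

import requests

PROXY_API = 'https://proxy.example.com/api/get'  # placeholder: your vendor's "give me one proxy" endpoint
TEST_URL = 'https://httpbin.org/ip'              # any stable page works as a liveness probe


def fetch_valid_proxy(pool, max_tries=5):
    """Pull proxies from the vendor API until one passes a status-code check."""
    for _ in range(max_tries):
        ip_port = requests.get(PROXY_API, timeout=10).text.strip()
        proxies = {'http': 'http://' + ip_port, 'https': 'http://' + ip_port}
        try:
            # a 200 from the probe page means the proxy is usable; keep it
            if requests.get(TEST_URL, proxies=proxies, timeout=5).status_code == 200:
                pool.append(ip_port)
                return ip_port
        except requests.RequestException:
            continue  # dead or slow proxy, fetch the next one
    return None

The first of the two scripts follows; it collects the vendor IDs from the listing API.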

import requests
import urllib3
import socket
import json
import pandas as pd
import os
import ssl
import time

aplist = []                      # vendor ids collected from the listing pages
requests.adapters.DEFAULT_RETRIES = 5
timeout = 120
socket.setdefaulttimeout(timeout)
urllib3.disable_warnings()       # silence InsecureRequestWarning from verify=False
ssl._create_default_https_context = ssl._create_unverified_context
proxy_pool_url = []              # proxy IPs fetched from the paid proxy API



def main():
    os.chdir(r'E:\eclipse-workspace\day23\weixiuzhan\venv')
    url1 = 'http://47.106.123.30:8070/app/api/usercompany/finsCompanyListAndVipCompany'
    df = pd.read_json('city.json')
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        # Content-Length is omitted: requests computes it from the body
        'User-Agent': 'lanaer/15.1.8 (iPhone; iOS 14.4.2; Scale/2.00)'
    }
    sk = 0
    for i in range(len(df.dataes)):
        for Num in range(1, 20):
            try:
                # build the form body from each city's coordinates in city.json
                params = "locationX=" + str(df['dataes'][i]['latitude']) + "&locationY=" \
                         + str(df['dataes'][i]['longitude']) \
                         + "&oneselfType=0&pageId=" + str(Num) \
                         + "&pageCount=25&pageSize=20&userType=3"
            except KeyError:
                continue

            # fetch one fresh proxy per request and append it to the pool;
            # index sk only advances when the current proxy gets blocked
            gurl = 'https://proxyapi.horocn.com/api/v2/proxies?order_id=ZILH1713559550939697&num=1&format=text&line_separator=win&can_repeat=no&user_token=8a61d42fdd4041c67145cf6a44f51d69'
            rep = requests.get(url=gurl)
            proxy_pool_url.append(rep.text)
            time.sleep(1)
            proxies = {'https': 'https://' + proxy_pool_url[sk]}
            response = requests.post(url=url1, allow_redirects=False, proxies=proxies, data=params, headers=headers,
                                     verify=False)
            result = response.json()
            code_status = result['msg']
            if code_status == '您的操作过于频繁,请休息一下吧~':
                # the current proxy got rate-limited: switch to the next one and retry once
                sk = sk + 1
                proxies1 = {'https': 'https://' + proxy_pool_url[sk]}
                response1 = requests.post(url=url1, allow_redirects=False, proxies=proxies1, data=params,
                                          headers=headers, verify=False)
                result2 = response1.json()
            else:
                result2 = result


            try:
                result2['data']['result'][0]
            except (IndexError, KeyError, TypeError):
                continue
            # use a fresh index j here: i already indexes the outer city loop
            for j in range(len(result2['data']['result'])):
                try:
                    aplist.append(result2['data']['result'][j]['_id'])
                except (IndexError, KeyError):
                    aplist.append(' ')
            time.sleep(2)

    print(aplist)
    # write real JSON in the {"dataes": [{"id": ...}, ...]} shape,
    # so the second script can load it with pd.read_json
    with open('id.json', 'w', encoding='utf-8') as output:
        json.dump({'dataes': [{'id': x} for x in aplist]}, output, ensure_ascii=False)

if __name__ == '__main__':
    main()
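The fetch-proxy / post / rotate-on-block sequence above is repeated almost verbatim in the second script; it could be factored into a helper along these lines (the name post_with_rotation and the (result, sk) return convention are mine, not from the original):

import requests

BLOCK_MSG = '您的操作过于频繁,请休息一下吧~'  # the APP's rate-limit message


def post_with_rotation(url, data, headers, pool, sk):
    """POST through pool[sk]; if the APP answers with the rate-limit message,
    advance sk to the next proxy in the pool and retry once."""
    proxies = {'https': 'https://' + pool[sk]}
    result = requests.post(url, data=data, headers=headers, proxies=proxies,
                           allow_redirects=False, verify=False).json()
    if result['msg'] == BLOCK_MSG:
        sk += 1  # safe: the caller appends one fresh proxy to the pool per request
        proxies = {'https': 'https://' + pool[sk]}
        result = requests.post(url, data=data, headers=headers, proxies=proxies,
                               allow_redirects=False, verify=False).json()
    return result, sk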

The second script reads the id.json produced above, requests each vendor's detail endpoint, and writes the seven extracted fields to MySQL:

import requests
import urllib3
import socket
import pymysql
import pandas as pd
import os
import ssl
import time

datapage = []                    # one seven-field row per vendor
requests.adapters.DEFAULT_RETRIES = 5
timeout = 10
socket.setdefaulttimeout(timeout)
urllib3.disable_warnings()
ssl._create_default_https_context = ssl._create_unverified_context
proxy_pool_url = []

def main():
    os.chdir(r'E:\eclipse-workspace\day23\weixiuzhan\venv')
    df = pd.read_json('id.json')
    head = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'User-Agent': 'lanaer/15.1.20 (iPhone; iOS 13.4.2; Scale/2.00)',
        'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
        # Content-Length is omitted: requests computes it from the body
        'Accept-Encoding': 'gzip, deflate'
    }
    url2 = 'http://47.106.123.30:8070/app/api/usercompany/v1/getById'
    sk = 0
    for i in range(len(df.dataes)):
        # the key parameter is the session token captured from the APP's traffic
        param = "companyType=3&id=" + str(df['dataes'][i]['id']) + "&key=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJBUFAiLCJpc3MiOiJTZXJ2aWNlIiwiZXhwIjoxNjk3MjYzNzkwLCJ1c2VySWQiOiI0ODIwMWUzNy02ODlhLTRlNjEtYmZjYy1mMzVlMmQwYWRlMjEiLCJpYXQiOjE2MzQxOTE3OTAsInRva2VuIjoiMTYzNDE5MTc5MDIzOCJ9.60gGl6hJbvpKcHtwfxRSMQveZ8O-moWrLEBEpVn-PYo"

        # fetch one fresh proxy per request; index sk only advances on a block
        gurl = 'https://proxyapi.horocn.com/api/v2/proxies?order_id=ZILH1713559550939697&num=1&format=text&line_separator=win&can_repeat=no&user_token=8a61d42fdd4041c67145cf6a44f51d69'
        rep = requests.get(url=gurl)
        proxy_pool_url.append(rep.text)
        time.sleep(1)
        proxies = {'https': 'https://' + proxy_pool_url[sk]}
        response = requests.post(url=url2, allow_redirects=False, proxies=proxies, data=param, headers=head,
                                 verify=False)
        result = response.json()
        code_status = result['msg']
        if code_status == '您的操作过于频繁,请休息一下吧~':
            # proxy got rate-limited: rotate to the next one and retry once
            sk = sk + 1
            proxies1 = {'https': 'https://' + proxy_pool_url[sk]}
            response1 = requests.post(url=url2, allow_redirects=False, proxies=proxies1, data=param, headers=head,
                                      verify=False)
            result2 = response1.json()
        else:
            result2 = result
        print(proxies)
        print(result2)
        addert = []              # the seven fields for this vendor, in column order

        # pull each field, defaulting to ' ' when it is missing from the response
        for field in ('companyName', 'repairTypeName', 'contacts', 'mobile',
                      'workDescribe', 'address', 'location'):
            try:
                value = result2['data'][field]
                if field == 'workDescribe':
                    value = value.strip('\n')
            except (IndexError, KeyError):
                value = ' '
            addert.append(value)

        datapage.append(addert)
        time.sleep(1)
    dbpath = pymysql.connect(host='192.168.1.202', port=3306, user='root', password='Password@123', database='wxzhan')
    saveData(datapage, dbpath)


# create the target table (skipped if it already exists)
def init_db(dbpath):
    c = dbpath.cursor()  # grab a cursor
    sql = '''
        CREATE TABLE IF NOT EXISTS `weixiuz` (id int unsigned not null auto_increment primary key,
        `company`  mediumtext NULL ,
        `type`  mediumtext NULL ,
        `contact`  mediumtext NULL ,
        `mobile`   mediumtext NULL ,
        `describe`  longtext NULL ,
        `address`  mediumtext NULL ,
        `location`  longtext NULL
)
    '''
    dbpath.ping(reconnect=True)
    c.execute(sql)  # run the DDL
    dbpath.commit()
    # note: do not close the connection here; saveData() still needs it

# save the scraped rows with a parameterized INSERT; string concatenation
# breaks on embedded quotes and invites SQL injection
def saveData(datapage, dbpath):
    init_db(dbpath)
    cur = dbpath.cursor()
    sql = '''
    insert into `weixiuz` (company, `type`, contact, `mobile`, `describe`, `address`, `location`)
    values (%s, %s, %s, %s, %s, %s, %s)'''
    for page in datapage:
        print(page)
        dbpath.ping(reconnect=True)
        cur.execute(sql, [str(v) for v in page])  # stringify values; pymysql handles escaping
        dbpath.commit()
    cur.close()
    dbpath.close()

if __name__ == '__main__':
    main()
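saveData commits once per row; with 120K+ rows that gets slow, and the inserts can be batched instead. A sketch using pymysql's executemany (same table and placeholders as above; save_batch is my naming, not from the original):

def save_batch(rows, conn):
    """Insert all rows in one round trip and commit once.
    conn is an open pymysql connection; rows is a list of 7-item lists."""
    sql = '''insert into `weixiuz`
             (company, `type`, contact, `mobile`, `describe`, `address`, `location`)
             values (%s, %s, %s, %s, %s, %s, %s)'''
    with conn.cursor() as cur:
        cur.executemany(sql, [[str(v) for v in row] for row in rows])  # multi-row INSERT
    conn.commit()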


Source: https://www.cnblogs.com/FireLL/p/15411338.html
