Web Scraping Projects



1. 12306 login with Selenium

# The code below is the sample client provided by Chaojiying (超级鹰), a paid captcha-recognition service
import requests
from hashlib import md5

class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password =  password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: 图片字节
        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id:报错题目的图片ID
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()

# chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')    # username, password, software ID (generated under User Center >> Software ID)
# im = open('12306.jpg', 'rb').read()                                           # local image file path; on Windows it sometimes needs // separators
# print(chaojiying.PostPic(im, 9004)['pic_str'])
# End of the Chaojiying sample code
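# For coordinate captchas such as codetype 9004, PostPic returns a JSON dict whose
# 'pic_str' field holds the click points; a sketch of the shape (values illustrative):
# {'err_no': 0, 'err_str': 'OK', 'pic_id': '1110123456789', 'pic_str': '120,85|260,140', 'md5': '...'}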

# Use selenium to open the login page
from selenium import webdriver
import time
from PIL import Image
from selenium.webdriver import Chrome
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options


'''
While dragging the slider the check kept failing, even when dragging by hand. The server had detected that selenium was in use.
How can selenium evade this detection?
The server checks window.navigator.webdriver: in a normal browser it reads false, but under selenium it reads true.
Two fixes:
1. If the Chrome version is below 88, inject JS into the page at browser start-up (before any page content has loaded) to strip the webdriver flag:
web = Chrome()
web.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument",{
    "source":"""
    window.navigator.webdriver = undefined
    Object.defineProperty(navigator,'webdriver',{
        get:()=>undefined
    })
    """
})
web.get(xxxxxx)

2. If the Chrome version is 88 or above, use this instead:
option = Options()
# option.add_experimental_option('excludeSwitches',['enable-automation'])  # optional; makes little difference here
option.add_argument('--disable-blink-features=AutomationControlled')

bro = webdriver.Chrome(executable_path='./chromedriver_win32/chromedriver',options=option)
bro.get('https://kyfw.12306.cn/otn/resources/login.html')

'''

option = Options()
option.add_argument('--disable-blink-features=AutomationControlled')

bro = webdriver.Chrome(executable_path='./chromedriver_win32/chromedriver',options=option)
bro.get('https://kyfw.12306.cn/otn/resources/login.html')
time.sleep(1)
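# A quick sanity check (a sketch): read the flag back; it should come out as None
# once the AutomationControlled evasion above has taken effect
print(bro.execute_script('return window.navigator.webdriver'))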
a_tag = bro.find_element_by_xpath("/html/body/div[2]/div[2]/ul/li[2]/a")
a_tag.click()
# save_screenshot takes a screenshot of the current page and saves it
bro.save_screenshot('aa.png')

# Determine the top-left and bottom-right coordinates of the captcha image (this fixes the crop region)
code_img_ele = bro.find_element_by_xpath('//*[@id="J-loginImg"]')
location = code_img_ele.location  # x,y of the captcha image's top-left corner
print('location:',location)
size = code_img_ele.size  # width and height of the captcha element
print('size:',size)
# top-left and bottom-right coordinates
rangle = (int(location['x']), int(location['y']),
          int(location['x'] + size['width']), int(location['y'] + size['height']))
# the captcha image region is now fixed
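# Caveat (a sketch, assuming the browser reports its scale factor): on HiDPI or
# zoomed displays the screenshot has more pixels than the CSS coordinates above,
# so the crop box must be scaled by window.devicePixelRatio
scale = bro.execute_script('return window.devicePixelRatio')
if scale and scale != 1:
    rangle = tuple(int(v * scale) for v in rangle)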

i = Image.open('./aa.png')
code_img_name = './code.png'
# crop cuts the screenshot down to the given region
frame = i.crop(rangle)
frame.save(code_img_name)

# Submit the captcha image to Chaojiying for recognition
chaojiying = Chaojiying_Client('ziyouzheyan3', 'liuyanyan03', '914163')    # username, password, software ID (User Center >> Software ID)
im = open('code.png', 'rb').read()                                         # local image file path
result = chaojiying.PostPic(im, 9004)['pic_str']  # call PostPic once: every call is billed and may return a different result
print(result)
all_list = [] # will hold the coordinates of the points to click: [[x1,y1],[x2,y2]]
if '|' in result:
    list_1 = result.split('|')
    count_1 = len(list_1)
    for i in range(count_1):
        xy_list = []
        x = int(list_1[i].split(',')[0])
        y = int(list_1[i].split(',')[1])
        xy_list.append(x)
        xy_list.append(y)
        all_list.append(xy_list)
else:
    x = int(result.split(',')[0])
    y = int(result.split(',')[1])
    xy_list = []
    xy_list.append(x)
    xy_list.append(y)
    all_list.append(xy_list)
print(all_list)
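# Equivalent one-liner (a sketch): split('|') on a string without '|' already
# returns a one-element list, so both branches above collapse into this
all_list = [[int(v) for v in point.split(',')] for point in result.split('|')]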
# Iterate over the list and use an action chain to click at each (x, y) offset inside the captcha image
for l in all_list:
    x = l[0]
    y = l[1]
    ActionChains(bro).move_to_element_with_offset(code_img_ele, x, y).click().perform()
    time.sleep(0.5)

bro.find_element_by_id('J-userName').send_keys('18769756237')
time.sleep(2)
bro.find_element_by_id('J-password').send_keys('liuyanyan03')
time.sleep(2)
bro.find_element_by_id('J-login').click()
time.sleep(5)
# Slider verification
span = bro.find_element_by_xpath('//*[@id="nc_1_n1z"]')
ActionChains(bro).drag_and_drop_by_offset(span,300,0).perform()

2. Douban login with Selenium

# Use selenium to open the login page
from selenium import webdriver
import time
from PIL import Image
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import ChromeOptions
def get_tracks(distance):
    """Build a human-looking slider track: accelerate over the first 4/5 of the
    distance, then decelerate, sampling the displacement every t seconds."""
    v = 0        # current velocity
    t = 0.3      # time step per sample
    tracks = []  # per-step pixel moves
    current = 0  # distance covered so far
    mid = distance*4/5  # switch from acceleration to deceleration here
    while current < distance:
        if current < mid:
            a = 2   # accelerate
        else:
            a = -3  # decelerate
        v0 = v
        s = v0*t + 0.5*a*(t**2)  # displacement in this step: s = v0*t + a*t^2/2
        current += s
        tracks.append(round(s))
        v = v0 + a*t
    return tracks
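# Because each step is rounded, sum(tracks) can drift a few pixels from the
# target; a sketch of a correction wrapper (a hypothetical helper, not part of
# the original script):
def get_tracks_exact(distance):
    tracks = get_tracks(distance)
    diff = distance - sum(tracks)  # residual left over after rounding
    if diff:
        tracks.append(diff)        # one final move to land exactly on target
    return tracks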
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

# Evade selenium detection (see the notes in section 1)
bro = webdriver.Chrome(executable_path='./chromedriver_win32/chromedriver',options=option)
# bro.maximize_window()
bro.get('https://accounts.douban.com/passport/login')

time.sleep(1)
a_tag = bro.find_element_by_xpath("//*[@id='account']/div[2]/div[2]/div/div[1]/ul[1]/li[2]")
a_tag.click()
# save_screenshot takes a screenshot of the current page and saves it
bro.save_screenshot('aa.png')

bro.find_element_by_id('username').send_keys('18769756237')
time.sleep(2)
bro.find_element_by_id('password').send_keys('liuyanyan33')
time.sleep(2)
bro.find_element_by_xpath('//*[@id="account"]/div[2]/div[2]/div/div[2]/div[1]/div[4]/a').click()
time.sleep(3)

bro.switch_to.frame('tcaptcha_iframe')
span = bro.find_element_by_xpath('//*[@id="slideBlock"]')

ActionChains(bro).click_and_hold(on_element=span).perform()   # grab the slider
ActionChains(bro).move_to_element_with_offset(to_element=span,xoffset=200,yoffset=0).perform()   # coarse jump toward the gap
tracks = get_tracks(40)
print(tracks)
for track in tracks:
    ActionChains(bro).move_by_offset(xoffset=track,yoffset=0).perform()   # human-like micro-moves
time.sleep(1)
ActionChains(bro).release().perform()   # drop the slider

3. Scraping proxy IPs from Kuaidaili

import requests
from lxml import etree
import time
'''
1. Loop over the free-proxy listing pages:
https://www.kuaidaili.com/free/inha/1
2. Check the quality of the scraped proxy IPs
'''

header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
base_url = 'https://www.kuaidaili.com/free/inha/%d'

def getHttp():
    proxy_lst = []
    for i in range(1, 10):
        url = base_url % i
        response = requests.get(url,headers = header)
        data = response.text
        tree = etree.HTML(data)
        tr_list = tree.xpath('//*[@id="list"]/table//tr')
        for tr in tr_list:
            dic = {}
            type_list = tr.xpath('./td[@data-title="类型"]/text()')
            IP_list = tr.xpath('./td[@data-title="IP"]/text()')
            PORT_list = tr.xpath('./td[@data-title="PORT"]/text()')
            if type_list and IP_list and PORT_list:
                type = type_list[0].lower()  # requests matches proxy keys as lowercase schemes ('http'/'https')
                IP = IP_list[0]
                PORT = PORT_list[0]
                dic[type] = IP + ":" + PORT
                proxy_lst.append(dic)
        time.sleep(0.5)  # pause after each page; without it only part of the data gets scraped
    return proxy_lst
def check_ip(lst):
    act_lst = []
    for proxy in lst:
        # timeout=0.1 caps each request at 0.1 s; a slower proxy raises an exception and is discarded (a deliberately strict filter)
        try:
            res = requests.get('http://www.baidu.com',headers = header,proxies = proxy,timeout = 0.1)
            if res.status_code == 200:
                act_lst.append(proxy)
        except Exception as e:
            print(e)
    return act_lst
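# For reference (a sketch, address illustrative): requests matches proxies by
# lowercase scheme keys, which is why type.lower() above matters
# proxies = {'http': 'http://1.2.3.4:8888'}
# requests.get('http://www.baidu.com', headers=header, proxies=proxies, timeout=3)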

if __name__ == "__main__":
    # Scrape the proxies, then check how many of them are actually usable
    proxy_lst = getHttp()
    print(proxy_lst)
    can_use = check_ip(proxy_lst)
    print('usable proxy IPs:', can_use)
    print('number of usable proxy IPs:', len(can_use))
    print('fraction of usable proxy IPs:', len(can_use)/len(proxy_lst))

 
