selenium爬虫

2022-05-06 02:31:40 阅读：201 来源： 互联网

标签：webdriver get selenium 爬虫 element bro find

介绍

selenium最初是一个自动化测试工具,而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题

selenium本质是通过驱动浏览器，完全模拟浏览器的操作，比如跳转、输入、点击、下拉等，来拿到网页渲染之后的结果，可支持多种浏览器

from selenium import webdriver
browser=webdriver.Chrome()  # 推荐使用
browser=webdriver.Firefox()
browser=webdriver.PhantomJS()
browser=webdriver.Safari()
browser=webdriver.Edge()

安装

有界面的浏览器

#安装：selenium+chromedriver
pip3 install selenium
下载chromedriver.exe放到python安装路径的scripts目录中即可
国内镜像网站地址：http://npm.taobao.org/mirrors/chromedriver
最新的版本去官网找:https://sites.google.com/a/chromium.org/chromedriver/downloads
        
 
# 注意，下载的驱动要和本机浏览器版本对应
# 下载谷歌浏览器驱动：http://npm.taobao.org/mirrors/chromedriver/
#安装使用
from selenium import webdriver

bro = webdriver.Chrome(executable_path='./chromedriver')  # 弹出浏览器，要给浏览器驱动的地址
bro.get('https://www.baidu.com')
print(bro.page_source)  # 获取页面返回的html代码
bro.close()  # 关闭浏览器

#注意：
selenium3默认支持的webdriver是Firefox，而Firefox需要安装geckodriver
下载链接：https://github.com/mozilla/geckodriver/releases
selenium+chromedriver

无界面浏览器

# 5 无界面浏览器（驱动谷歌，驱动其他浏览器）
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000') #指定浏览器分辨率
chrome_options.add_argument('--disable-gpu') #谷歌文档提到需要加上这个属性来规避bug
chrome_options.add_argument('--hide-scrollbars') #隐藏滚动条, 应对一些特殊页面
chrome_options.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度
chrome_options.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
bro=webdriver.Chrome(executable_path='./chromedriver',options=chrome_options)
bro.get("https://www.baidu.com")
print(bro.get_cookies())
bro.close()

开发者模式

options = webdriver.ChromeOptions()
# 开发者模式
options.add_experimental_option('excludeSwitches', ['enable-automation'])
bro = webdriver.Chrome(executable_path=chromedriver_path, options=options)


def login():
    """Open Taobao's login form with the first account that works.

    Iterates over ``setting.user``; each entry is indexed as
    ``res[0]`` (username) and ``res[1]`` (password). For every entry a new
    Chrome session is started, the credentials are typed into the login form,
    and the function then blocks on ``input()`` so a human can finish the
    login by hand.

    Returns:
        The live ``webdriver.Chrome`` instance for the first account whose
        flow completed without raising, or ``None`` when every account failed.
    """
    for res in setting.user:
        bro = None  # stays None if the driver itself fails to start
        try:
            username = res[0]
            password = res[1]
            options = webdriver.ChromeOptions()
            # Exclude Chrome's "enable-automation" switch (what the article
            # calls "developer mode").
            options.add_experimental_option('excludeSwitches', ['enable-automation'])
            bro = webdriver.Chrome(executable_path=chromedriver_path, options=options)

            bro.implicitly_wait(10)
            bro.get('https://www.taobao.com/')
            # Click the login link in the site navigation bar.
            bro.find_element_by_css_selector('#J_SiteNavLogin > div.site-nav-menu-hd > div.site-nav-sign > a.h').click()
            input_username = bro.find_element_by_css_selector('#fm-login-id')
            input_username.send_keys(username)
            input_password = bro.find_element_by_css_selector('#fm-login-password')
            input_password.send_keys(password)
            # Pause for manual steps before handing the session back.
            input("人工操作")
            return bro
        except Exception as e:
            # Report why this account failed instead of hiding it, then fall
            # through to the next account.
            print(e)
            if bro is not None:
                # Best-effort cleanup so a failed attempt does not leave a
                # browser/driver process behind; a failure while quitting must
                # not stop the remaining accounts from being tried.
                try:
                    bro.quit()
                except Exception:
                    pass
    return None

window.navigator.webdriver为true的情况

window.navigator.webdriver为true

def selenium(js):
    """Load toutiao.com with ``navigator.webdriver`` masked, then run ``js``.

    Starts Chrome with the automation extension disabled and the
    ``enable-automation`` switch excluded, registers a script that redefines
    ``navigator.webdriver`` to return ``undefined`` on every new document,
    opens the page, prints its HTML, executes ``js`` and finally waits for the
    user to press Enter. The browser is always shut down on the way out.

    Args:
        js: JavaScript source passed to ``execute_script`` once the page has
            been loaded.
    """
    import time  # local import: the function must not depend on file order

    option = webdriver.ChromeOptions()
    # option.add_argument('--headless')
    option.add_experimental_option('useAutomationExtension', False)
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # The driver binary is expected next to the script.
    bro = webdriver.Chrome(executable_path='./chromedriver', options=option)
    try:
        # Register JS that runs before any page script on each new document.
        bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
            Object.defineProperty(navigator, 'webdriver', {
              get: () => undefined
            })
          """
        })

        bro.implicitly_wait(10)
        bro.get('https://www.toutiao.com/')
        time.sleep(5)
        print(bro.page_source)  # HTML of the page as currently rendered
        bro.execute_script(js)
        input()  # keep the window open until the user presses Enter
    finally:
        # Without this the Chrome/chromedriver processes outlive the call,
        # both on normal return and when any step above raises.
        bro.quit()

selenium高级用法

.send_keys('')  # 写入
.click()  # 点击
.clear()  # 清空

selenium选择器选择

# 1、find_element_by_css_selector    # css选择器找
# 2、find_element_by_xpath           #xpath选择器找
# 3、find_element_by_partial_link_text # a标签上的文字模糊
# 4、find_element_by_tag_name        # 根据标签名字找
# 5、find_element_by_id  # id找
# 6、find_element_by_name            # name='xx' 根据name属性找
# 7、find_element_by_class_name      # 根据类名字找
# 8、find_element_by_link_text   # a标签上的文字找

常用用法

# 常用用法(在输入框中输入美女，搜索)
bro=webdriver.Chrome(executable_path='./chromedriver')
bro.get("https://www.baidu.com")
bro.implicitly_wait(10)  # 添加隐士等待，最多等待10秒


#  一、在输入框中输入美女（自带的解析器，查找输入框空间）
# 1.找到输入框
input_search=bro.find_element_by_xpath('//*[@id="kw"]')  # xpath选择器
input_search=bro.find_element_by_css_selector('#kw')  # css选择器
# 2.写文字
input_search.send_keys("美女")
# 3.查找搜索按钮
enter=bro.find_element_by_id('su')
time.sleep(3)
# 4.点击按钮
enter.click()
time.sleep(5)
bro.close()

模拟百度登录

# 二、模拟百度登录
import time
bro=webdriver.Chrome(executable_path='./chromedriver')
bro.get("https://www.baidu.com")
# 隐式等待(最多等待10s)
# 只有控件没有加载出来，才会等，控件一旦加载出来，直接就取到
bro.implicitly_wait(10)

# 1.找到登录标签
submit_button=bro.find_element_by_link_text('登录')
# 2.点击登录
submit_button.click()  
# 3.找到用户名登录
user_button=bro.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
# 4.点击登录
user_button.click()
# 5.找到用户名框的标签
user_input=bro.find_element_by_id('TANGRAM__PSP_10__userName')
# 6.输入用户名
user_input.send_keys("jeff@qq.com")  
# 7.找到密码输入框
pwd_input=bro.find_element_by_id('TANGRAM__PSP_10__password')
# 8.输入密码
pwd_input.send_keys("123456")
# 9.找到登录按钮标签
submit_input=bro.find_element_by_id('TANGRAM__PSP_10__submit')
# 10.点击登录
submit_input.click()
time.sleep(5)
bro.close()

获取cookie

# 搭建cookie池和代理池的作用是什么？应对网站封ip、封账号（准备一堆小号和一堆cookie）

# 三 获取cookie
# 登录之后，拿到cookie：就可以自己搭建cookie池（requests模块发请求，携带着cookie）
import time
bro=webdriver.Chrome(executable_path='./chromedriver')
bro.get("https://www.baidu.com")
print(bro.get_cookies())
bro.close()

获取标签属性、获取文本、标签ID、位置、大小

# 6 获取标签属性
# (重点：获取属性)
print(tag.get_attribute('src'))  # 获取属性
print(tag.get_attribute('href'))  # 获取属性
print(tag.text)  # 获取文本


# #获取标签ID，位置，名称，大小（了解）
print(tag.id)  # 标签ID
print(tag.location)  # 位置      #{'x': 312, 'y': 213}
print(tag.tag_name)  # 标签名称  #input
print(tag.size)  # 大小

显式等待、隐式等待

# 7 显式等待和隐式等待
# 隐式等待(最多等待10s)
# 只有控件没有加载出来，才会等，控件一旦加载出来，直接就取到
bro.implicitly_wait(10)
# 显式等待（每个控件都要单独写等待），不要使用

执行JS代码

简单使用

from selenium import webdriver
import time
bro=webdriver.Chrome(executable_path='./chromedriver')
bro.get("https://www.baidu.com")

# 执行js代码
bro.execute_script('alert(1)')
time.sleep(5)
bro.close()

js屏幕上下滚动

# js
window.scrollTo(0,100)  # 向下滑动100
window.scrollTo(0,500)  # 向下滑动500
window.scrollTo(0,document.body.scrollHeight)  # 滑到底部
window.scrollTo(0,document.body.scrollHeight-500) # 滑到-500

# 执行js
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')

# 完整代码
from selenium import webdriver
bro=webdriver.Chrome(executable_path='./chromedriver')
bro.get("https://www.cnblogs.com")
# 执行js代码
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')

模拟浏览器前进后退

# bro.back()   # 后退
# bro.forward()  # 前进


from selenium import webdriver
import time
bro=webdriver.Chrome(executable_path='./chromedriver')


bro.get("https://www.cnblogs.com")  # 第一个页面
time.sleep(1)
bro.get("https://www.baidu.com")  # 第二个页面
time.sleep(1)
bro.get("https://www.jd.com")  # 第三个页面
time.sleep(1)
bro.back()  # 后退
time.sleep(1)
bro.forward()  # 前进

选项卡管理(新窗口跳转)

原理：都是js在操作，执行Js代码

from selenium import webdriver
import time
# Demo: open a second tab with JavaScript and move the driver between tabs.
browser=webdriver.Chrome(executable_path='./chromedriver')

browser.get('https://www.baidu.com')
browser.execute_script('window.open()')  # open a new blank tab via JS

print(browser.window_handles)  # handles of every open tab/window
# switch_to_window() is deprecated and no longer exists in Selenium 4;
# switch_to.window() is the supported spelling in both Selenium 3 and 4.
# Index 1 is the tab opened above (assuming handles are listed in creation
# order).
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')  # navigate inside the new tab
time.sleep(2)
browser.switch_to.window(browser.window_handles[0])  # back to the original tab
browser.get('https://www.sina.com.cn')  # navigate inside the original tab
# browser.close()

异常处理

from selenium import webdriver

# Demo: make sure the browser is released whether or not an error occurs.
browser = None  # lets ``finally`` tell whether the driver ever started
try:
    browser = webdriver.Chrome(executable_path='./chromedriver')
    browser.get('http://www.baidu.com')
    # Presumably a non-existent id, used here to trigger the error path.
    browser.find_element_by_id("xxx")
except Exception as e:
    print(e)
finally:  # runs whether or not an exception was raised
    # If webdriver.Chrome() itself failed there is nothing to shut down;
    # touching an unbound name here would raise NameError and hide the
    # real error.
    if browser is not None:
        # quit() ends the whole session, including the driver process,
        # whereas close() only closes the current window.
        browser.quit()

模拟键盘操作

from selenium.webdriver.common.keys import Keys
#模拟键盘操作(模拟键盘敲回车)
input_search.send_keys(Keys.ENTER) 


# 案例，打开百度，输入美女，键盘敲回车
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
bro=webdriver.Chrome(executable_path='./chromedriver')
bro.get("https://www.baidu.com")
bro.implicitly_wait(10)   # 隐士等待
 
input_search=bro.find_element_by_css_selector('#kw')  # 找到输入框
input_search.send_keys("美女")  # 输入美女

#模拟键盘操作(模拟键盘敲回车)
input_search.send_keys(Keys.ENTER)

标签：webdriver,get,selenium,爬虫,element,bro,find
来源： https://www.cnblogs.com/haiqinai/p/16227136.html

本站声明： 1. iCode9 技术分享网（下文简称本站）提供的所有内容，仅供技术学习、探讨和分享；
2. 关于本站的所有留言、评论、转载及引用，纯属内容发起人的个人观点，与本站观点和立场无关；
3. 关于本站的所有言论和文字，纯属内容发起人的个人观点，与本站观点和立场无关；
4. 本站文章均是网友提供，不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属；如您发现该文章侵犯了您的权益，可联系我们第一时间进行删除；
5. 本站为非盈利性的个人网站，所有内容不会用来进行牟利，也不会利用任何形式的广告来间接获益，纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

ICode9