标签:selenium content name excel list anchor 爬取 斗鱼 page
斗鱼直播主播信息采集
from selenium import webdriver
import time
from lxml import etree
from excel_utils.excel_utils import write_to_excel,append_to_excel
import os
# Load the page in the browser and parse the resulting HTML
def get_page_content_by_selenium(url):
    """Open *url* in the shared browser and return its parsed HTML tree.

    Uses the module-level ``driver``. After navigating, it pauses for two
    seconds, maximizes the browser window, and then parses the driver's
    current page source with ``etree.HTML``.

    Args:
        url: Address of the page to load.

    Returns:
        The object produced by ``etree.HTML`` for the page source.
    """
    driver.get(url)
    # Fixed pause before the page source is read.
    time.sleep(2)
    driver.maximize_window()
    return etree.HTML(driver.page_source)
def main():
    """Crawl the Douyu LOL listing page by page and save the rows to Excel.

    The start page is loaded through ``get_page_content_by_selenium``. For
    each page, three XPath queries collect the ``h3`` title text, the
    ``DyListCover-userName`` text and the ``DyListCover-hot`` text; they are
    combined into dicts with the keys ``title``, ``anchor`` and ``focus``.
    The rows go to ``anchor.xls``: the file is written if it does not exist
    yet and appended to otherwise. Paging continues by clicking the last
    ``li`` of the ``ListFooter`` pager while its ``aria-disabled`` attribute
    equals ``'false'``.

    Relies on the module-level ``driver`` being a live WebDriver instance.
    """
    start_url = 'https://www.douyu.com/g_LOL'
    list_xpath = '//section[@id="listAll"]//ul[@class="layout-Cover-list"]'
    next_btn_xpath = '//div[@class="ListFooter"]/ul/li[last()]'
    file_name = 'anchor.xls'

    page_content = get_page_content_by_selenium(start_url)
    # The generic find_element('xpath', ...) form is used because
    # find_element_by_xpath was removed in Selenium 4.3; the generic form is
    # accepted by Selenium 3 as well.
    next_btn = driver.find_element('xpath', next_btn_xpath)
    print(next_btn.tag_name)

    n = 1
    while True:
        print(f'爬取第{n}页')
        titles = page_content.xpath(f'{list_xpath}//h3/text()')
        anchor = page_content.xpath(
            f'{list_xpath}//h2/div[@class="DyListCover-userName"]/text()'
        )
        focus = page_content.xpath(
            f'{list_xpath}//span[@class="DyListCover-hot"]/text()'
        )

        # The three queries are independent, so their lengths can differ.
        # Pair them with zip instead of indexing so a short list no longer
        # raises IndexError and discards the whole page.
        if not (len(titles) == len(anchor) == len(focus)):
            print(
                f'warning: page {n} field counts differ '
                f'(title={len(titles)}, anchor={len(anchor)}, '
                f'focus={len(focus)}); unmatched entries are skipped'
            )
        anchor_list = [
            {'title': title, 'anchor': name, 'focus': hot}
            for title, name, hot in zip(titles, anchor, focus)
        ]

        if not os.path.exists(file_name):
            write_to_excel(anchor_list, file_name)
        else:
            append_to_excel(anchor_list, file_name)

        # Look the button up again on every pass: if the pager was re-rendered
        # after the previous click, the old handle would be stale.
        next_btn = driver.find_element('xpath', next_btn_xpath)
        if next_btn.get_attribute('aria-disabled') != 'false':
            break

        next_btn.click()
        # NOTE(review): fixed short pause; presumably enough for the next page
        # to render before the source is re-read — confirm on slow networks.
        time.sleep(0.5)
        page_content = etree.HTML(driver.page_source)
        n += 1
if __name__ == '__main__':
    # `driver` is intentionally bound at module level: the crawler functions
    # in this file read it as a global.
    driver = webdriver.Chrome()
    try:
        main()
    finally:
        # Close the browser and end the WebDriver session even when the
        # crawl raises, so no browser/driver process is left running.
        driver.quit()
标签:selenium,content,name,excel,list,anchor,爬取,斗鱼,page 来源: https://www.cnblogs.com/childheart/p/14238274.html
本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享; 2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关; 3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关; 4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除; 5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。