标签:39 span string Python detail 爬取 url data find
39健康网
https://jbk.39.net/bw/jizhenke/
Python源码
from bs4 import BeautifulSoup
import xlwt
import requests
import re
def ask_url(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    This is a best-effort helper: any failure reported by ``requests``
    (connection error, timeout, non-success HTTP status, malformed URL)
    results in an empty string instead of an exception, so callers can
    simply skip the page.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The page text, or ``""`` when the request failed.
    """
    headers = {
        # Browser-like User-Agent; presumably needed so the site serves the
        # regular HTML page to the script -- TODO confirm.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47"
    }
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        # Only request-level failures are swallowed; KeyboardInterrupt and
        # programming errors propagate instead of being silently hidden.
        return ""
    response.encoding = 'utf-8'
    return response.text
# Detail-page labels whose value is the plain text of the row's <span>.
_TEXT_FIELDS = {
    '别名:': 'diseaseAlias',
    '传染性:': 'infectivity',
    '多发人群:': 'multiplePopulation',
}

# Detail-page labels whose value is the list of link texts in the row's <span>.
_LINK_FIELDS = {
    '发病部位:': 'siteOfOnset',
    '并发症:': 'complication',
    '挂号科室:': 'registrationDepartment',
    '临床检查:': 'clinicalExamination',
}


def _link_texts(node):
    """Return the ``.string`` of every <a> below *node*, skipping empty ones."""
    if node is None:
        return []
    return [a.string for a in node.find_all('a') if a.string is not None]


def _detail_rows(soup, css_class):
    """Yield ``(label, span)`` for each <li> of the first <ul> with *css_class*."""
    info_list = soup.find('ul', class_=css_class)
    if info_list is None:
        return
    for row in info_list.find_all('li'):
        # Rows without both an <i> label and a <span> value carry no data.
        if row.i is None or row.span is None:
            continue
        yield row.i.string, row.span


def _fill_from_detail_page(sub_soup, data, symptoms):
    """Copy the labelled rows of a disease detail page into *data*/*symptoms*."""
    for label, span in _detail_rows(sub_soup, "information_ul"):
        if label in _TEXT_FIELDS:
            data[_TEXT_FIELDS[label]] = span.string
        elif label in _LINK_FIELDS:
            data[_LINK_FIELDS[label]] = _link_texts(span)
        elif label == '典型症状:':
            symptoms.extend(_link_texts(span))
    for label, span in _detail_rows(
            sub_soup, "information_ul information_ul_bottom"):
        if label == '常用药品:':
            data['commonDrugs'] = _link_texts(span)


def _scrape_result_item(item):
    """Build the record for one search-result entry, or None to skip it."""
    try:
        heading = item.div.p
        kind = heading.span.string
        link = heading.a
    except AttributeError:
        # The entry does not have the expected div > p > span layout.
        return None
    if kind != "疾病" or link is None:
        return None
    sub_url = link.get("href")
    if not sub_url:
        return None

    data = {'diseaseName': link.string}
    symptoms = _link_texts(item.find('p', class_='result_item_content_label'))

    sub_html = ask_url(sub_url)
    if sub_html == "":
        # Detail page unavailable: the entry is dropped entirely.
        return None
    _fill_from_detail_page(BeautifulSoup(sub_html, 'html.parser'),
                           data, symptoms)

    # Symptoms get their own key.  They were previously stored under
    # 'commonDrugs', where the drug list overwrote them (or where they were
    # mistaken for drugs when the page listed none).
    data['symptom'] = symptoms
    return data


def get_data(base_url, max_pages=40):
    """Scrape disease records from the paginated listing at *base_url*.

    Page URLs are formed as ``base_url + str(page)`` for pages
    ``1..max_pages``.  For every result entry tagged "疾病" the linked detail
    page is downloaded with ``ask_url`` and its labelled rows are copied into
    a dict.  Pages or detail pages that cannot be fetched are skipped, and
    entries or rows that lack the expected tags are ignored instead of
    raising ``AttributeError``.

    Args:
        base_url: Listing URL prefix; the page number is appended to it.
        max_pages: Number of listing pages to visit (default 40).

    Returns:
        A list of dicts.  Each has 'diseaseName' and 'symptom' and, when the
        detail page provides them, 'diseaseAlias', 'siteOfOnset',
        'infectivity', 'multiplePopulation', 'complication',
        'registrationDepartment', 'clinicalExamination' and 'commonDrugs'.
    """
    # NOTE(review): the original listing had lost its indentation; records
    # are collected only for entries tagged "疾病" -- confirm that matches
    # the intended behaviour.
    data_list = []
    for page in range(1, max_pages + 1):
        html = ask_url(base_url + str(page))
        if html == "":
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_="result_item"):
            record = _scrape_result_item(item)
            if record is not None:
                data_list.append(record)
    return data_list
def save_data(data_list, save_path):
    """Write the scraped records to an .xls workbook at *save_path*.

    Row 0 holds the column titles; record ``k`` (0-based) goes to row
    ``k + 1``.  Only keys that appear in the column tuple are written, each
    value being handed to ``sheet.write`` unchanged.  A progress percentage
    is printed on a single console line while rows are written.

    Args:
        data_list: Sequence of record dicts as produced by ``get_data``.
        save_path: Destination path of the workbook.

    Returns:
        Always the empty string.
    """
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    worksheet = workbook.add_sheet("智能诊断数据集", cell_overwrite_ok=True)
    headers = (
        "diseaseName", "diseaseAlias", "siteOfOnset", "infectivity",
        "multiplePopulation", "earlySymptom", "advancedSymptom",
        "complication", "registrationDepartment", "clinicalExamination",
        "commonDrugs",
    )

    # Title row.
    for column, title in enumerate(headers):
        worksheet.write(0, column, title)

    # One spreadsheet row per record, shifted down by the title row.
    total = len(data_list)
    for row, record in enumerate(data_list, start=1):
        print("\r当前进度:{:.2f}%".format(row * 100 / total), end="")
        for column, key in enumerate(headers):
            if key in record:
                worksheet.write(row, column, record[key])

    workbook.save(save_path)
    return ""
if __name__ == "__main__":
    # Script entry point: scrape the listing, then export it to a workbook.
    # The page number is appended to this prefix by get_data.
    listing_prefix = "https://jbk.39.net/bw/jizhenke_p"
    output_file = ".\\智能诊断数据集.xls"
    save_data(get_data(listing_prefix), output_file)
实验结果
标签:39,span,string,Python,detail,爬取,url,data,find 来源: https://blog.csdn.net/weixin_45953673/article/details/122798537
本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享; 2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关; 3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关; 4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除; 5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。