标签:搜狗 plt headers python 爬取 url html print re
import requests
import re
import os
def getHTMLtext(url):
    """Fetch *url* and return its decoded body text, or "" on failure.

    Sends a browser-like User-Agent because Sogou rejects the default
    python-requests one. The encoding is replaced with the content-sniffed
    `apparent_encoding`, since the server's declared charset is often wrong
    for Chinese pages.

    Parameters:
        url: the page URL to download.
    Returns:
        The page text, or the empty string when the request fails.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        # Prefer the sniffed encoding over the (often missing/wrong) header.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP errors are expected here; a bare except would
        # also hide programming errors such as typos.
        print("cannot scrape the url")
        return ""
def HTMLparse(link, html):
    """Parse Sogou image-search html and collect jpg thumbnail URLs.

    Parameters:
        link: list that receives the "http://..." jpg URLs (mutated in place).
        html: raw page text; may be "" when the fetch failed (yields no URLs).
    """
    # Each result carries a field like "thumbUrl":"http://host/path.jpg";
    # capture everything between the scheme and the closing quote.
    # NOTE: the original code then re.sub-stripped "thumbUrl", ":" and '"'
    # from the capture — all no-ops at best (the capture cannot contain
    # them) and URL-corrupting at worst (":" appears in a port number),
    # so those passes are removed.
    for tail in re.findall(r'"thumbUrl":"http://(.*?)"', html):
        # Keep jpg images only; endswith() is also safe for captures
        # shorter than 3 chars, where indexing [-3] would raise IndexError.
        if tail.endswith("jpg"):
            link.append("http://" + tail)
def main():
    """Prompt for a keyword, search Sogou images, and download the jpg hits.

    Files are saved under a fixed local directory; existing files are
    skipped so the script can be re-run incrementally.
    """
    source = input("请输入要查找的图片:")
    link = []
    # NOTE(review): `source` is concatenated raw; non-ASCII keywords rely on
    # requests' implicit quoting — consider urllib.parse.quote if this breaks.
    url = ("http://pic.sogou.com/pics?pid=sogou-site-3b24156ad560a696&query="
           + source)
    print(url)
    html = getHTMLtext(url)
    HTMLparse(link, html)

    root = "d://ai//sogou//打架//"
    headers = {'user-agent': 'Mozilla/5.0'}
    count = 0
    # Create the target directory (and parents) once, up front, instead of
    # re-checking inside the download loop.
    os.makedirs(root, exist_ok=True)
    for url in link:
        filename = url.split('/')[-1]
        path = root + filename
        try:
            if os.path.exists(path):
                print(filename + " has already existed")
                continue
            r = requests.get(url, headers=headers, timeout=30)
            r.raise_for_status()
            # The with-statement closes the file; no explicit close() needed.
            with open(path, 'wb') as file:
                file.write(r.content)
            print("successfully saved: " + filename)
            count += 1
        except (requests.RequestException, OSError):
            # Best-effort: a single bad URL or disk error must not abort
            # the remaining downloads.
            print("cannot save: " + filename)
    print("total count = ", count)
if __name__ == "__main__":
    # Run the crawler only when executed as a script, not on import.
    main()
标签:搜狗,plt,headers,python,爬取,url,html,print,re 来源: https://blog.csdn.net/wym2011aaj/article/details/98376962
本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享; 2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关; 3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关; 4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除; 5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。