ICode9

精准搜索请尝试: 精确搜索
首页 > 编程语言> 文章详细

python异步爬虫+数据分析+数据可视化

2021-07-25 10:00:53  阅读:146  来源: 互联网

标签:异步 plt python text price mi 爬虫 print data


python异步爬虫+数据分析+数据可视化

import csv
import pandas as pd
import numpy as np
import asyncio
import aiohttp
from pandas import Series, DataFrame
# import matplotlib as mpl
import matplotlib.pyplot as plt
from lxml import etree

# Browser-like User-Agent string sent with every scraping request.
# The trailing space is part of the value and is kept as-is.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.4071 SLBChan/30 '
)

# Request headers shared by all page fetches.
headers = {'User-Agent': _USER_AGENT}


def _chunk_layouts(text_nodes, width=6):
    """Rebuild one layout string per listing from the flattened text nodes.

    All nodes except single-space separators are concatenated and the result
    is cut into consecutive `width`-character pieces; a trailing partial
    piece is discarded.
    """
    joined = ''.join(node for node in text_nodes if node != ' ')
    return [joined[k * width:(k + 1) * width] for k in range(len(joined) // width)]


def _join_address_parts(parts, per_listing=3):
    """Join every `per_listing` consecutive address fragments with '-'.

    Fragments left over after the last complete group are discarded.
    """
    return ['-'.join(parts[k * per_listing:(k + 1) * per_listing])
            for k in range(len(parts) // per_listing)]


async def get_page(url):
    """Download one listing page and append its listings to the CSV.

    The page is fetched with the module-level ``headers``, parsed with lxml,
    and every field is pulled out with XPath as one flat list per column.
    One row per listing is then written through the module-level ``writer``,
    in the column order of the CSV header (description, price, layout, area,
    orientation, floor, build time, address, detailed address, owner name,
    rating, publisher, link).

    :param url: URL of the listing page to scrape.
    :return: None; the result is the rows written to ``writer``.
    """
    async with aiohttp.ClientSession() as session:
        # session.get() is itself an async context manager; no extra await needed.
        async with session.get(url=url, headers=headers) as response:
            # text() is a coroutine and must be awaited before the body is usable.
            page_text = await response.text()

    tree = etree.HTML(page_text)
    info = '//div[@class="property-content-info"]'
    comm = '//div[@class="property-content-info property-content-info-comm"]'
    extra = '//div[@class="property-extra"]'

    titles = tree.xpath('//div[@class="property-content-title"]/h3//text()')
    values = tree.xpath('//p[@class="property-price-total"]/span[1]/text()')
    layout = _chunk_layouts(tree.xpath(info + '/p[1]//text()'))
    mi = tree.xpath(info + '/p[2]//text()')
    location = tree.xpath(info + '/p[3]//text()')
    high = tree.xpath(info + '/p[4]//text()')
    build_times = tree.xpath(info + '/p[5]//text()')
    address = tree.xpath(comm + '/p[1]//text()')
    # The previous implementation inserted '-' at a fixed index, which only
    # behaved like an append for the first part of the list and scrambled the
    # later addresses; grouping by three and joining is the intended result.
    new_specific_address = _join_address_parts(tree.xpath(comm + '/p[2]//text()'))
    name = tree.xpath(extra + '/span[1]/text()')
    grade = tree.xpath(extra + '/span[2]/text()')
    website = tree.xpath(extra + '/span[3]/text()')
    urls = tree.xpath('//div[@class="property"]/a[1]/@href')

    # Diagnostic output: differing counts mean the columns are not aligned.
    print(len(build_times))
    print(len(titles))

    prices = [value + '万' for value in values]
    columns = (titles, prices, layout, mi, location, high, build_times, address,
               new_specific_address, name, grade, website, urls)
    # zip() stops at the shortest column, so the last listing is no longer
    # dropped unconditionally and a short column cannot raise IndexError.
    for row in zip(*columns):
        writer.writerow(row)


async def main():
    """Scrape result pages 1 through 8 concurrently.

    One ``get_page`` coroutine is scheduled per page URL and all of them are
    awaited together. A page that fails does not stop the others; its
    exception is reported on stdout instead of being silently dropped (which
    is what ``asyncio.wait`` did, since it never re-raises task exceptions).

    :return: None.
    """
    url_template = 'https://bj.58.com/ershoufang/p%d/'
    page_urls = [url_template % page_num for page_num in range(1, 9)]

    # return_exceptions=True keeps the run best-effort: every page is
    # attempted and failures come back as values rather than aborting.
    outcomes = await asyncio.gather(
        *(get_page(page_url) for page_url in page_urls),
        return_exceptions=True,
    )

    for page_url, outcome in zip(page_urls, outcomes):
        if isinstance(outcome, BaseException):
            print(f'failed to scrape {page_url}: {outcome!r}')


# Module-level setup: configure pandas/matplotlib, load the scraped CSV and
# derive the numeric columns (mi, price, year, months, grade) that the
# plot functions below read from the global `data` frame.

# Print every column and every row when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Use the SimHei font (presumably so the Chinese titles and axis labels render)
# and plain ASCII minus signs on the axes.
plt.rcParams["font.sans-serif"] = [u"SimHei"]
plt.rcParams["axes.unicode_minus"] = False

# NOTE(review): this reads room01.csv, while the __main__ block at the bottom
# of the file writes room02.csv - confirm which file is meant to be analysed.
data = pd.read_csv('room01.csv', encoding='gbk')

print(data.shape)
print(data.dtypes)
print(data.columns)
# Drop the surplus rows: those whose build-time text has nothing at
# characters 29-32 (presumably repeated header rows - TODO confirm).
index01 = data[data["建造时间"].str[29:33] == ''].index
data.drop(index01, inplace=True)
# Add column `mi`: the area text with its first 29 and last 26 characters
# removed, cast to double (the fixed offsets presumably strip scraped padding).
data['mi'] = data["房子面积"].str[29:-26].astype('double')
# Add column `price`: the price text without its last character, cast to double.
data['price'] = data["¥价格"].str[:-1].astype('double')
# Add column `year`: characters 29-32 of the build-time text, cast to int.
data['year'] = data["建造时间"].str[29:33].astype('int')
# print(data["建造时间"].str[29:33])
# print(data.dtypes)

# Add column `months`: how many months the house has been in use
# (presumably counted up to June 2021 - TODO confirm the +6).
data['months'] = (2021 - data['year']) * 12 + 6
# Drop rows with unusable ratings: keep only those whose 4th character is '分'.
index02 = data[data['评分'].str[3:4] != '分'].index
data.drop(index02, inplace=True)
# Add column `grade`: the rating text without its last character, cast to double.
data['grade'] = data['评分'].str[:-1].astype('double')


def plot01():
    """Draw a bar chart of how many listings fall into each price band.

    Prices from the module-level ``data`` frame are binned at every 100
    between the cheapest and the most expensive listing. The share of each
    band is printed and also written above its bar.

    :return: None; the chart is shown with ``plt.show()``.
    """
    prices = data['price']
    lowest, highest = prices.min(), prices.max()
    # Keep only the fixed edges that lie strictly inside the observed range,
    # otherwise the bin edges would not be increasing and pd.cut would raise.
    inner_edges = [edge for edge in range(100, 1001, 100) if lowest < edge < highest]
    # include_lowest=True closes the first interval on the left; without it
    # the cheapest listing(s) fall outside every bin and are not counted.
    price_cut = pd.cut(prices, bins=[lowest, *inner_edges, highest], include_lowest=True)

    # Number of listings per price band.
    price_count = price_cut.value_counts()
    shares = price_count / price_count.sum()

    # Print the share of each band.
    for share in shares:
        print(share)

    # Print the bands themselves.
    print(price_count.index)

    # Bar chart of the counts, one bar per band.
    X = np.arange(len(price_count))
    print(X)
    Y = price_count
    print(Y)
    plt.figure(figsize=(8, 6))
    plt.bar(X, Y, color='b', alpha=0.5)
    plt.title("二手房价格分布图")
    plt.xlabel("价格区间")
    plt.ylabel("数量")
    plt.xticks(X, price_count.index, rotation=30)
    # Leave headroom above the tallest bar for the percentage labels.
    plt.ylim([0, price_count.max() + 100])

    percents = [str(round(share * 100, 2)) + '%' for share in shares]
    for x, y, label in zip(X, Y, percents):
        plt.text(x - 0.3, y + 5, label)
    plt.show()


def plot02():
    """Plot the mean floor area of the listings in each price band.

    The bands are [min,100), [100,200), ..., [900,1000) and [1000,max]. The
    last band is closed on the right so the most expensive listing is
    counted. A band with no listings has no mean; it is left as a gap in the
    line and gets no label, instead of raising on ``int(NaN)``.

    :return: None; the chart is shown with ``plt.show()``.
    """
    prices = data['price']
    edges = [prices.min(), 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, prices.max()]
    last_band = len(edges) - 2

    means = []
    x = []
    for band, (low, high) in enumerate(zip(edges, edges[1:])):
        closed_right = band == last_band
        if closed_right:
            in_band = (prices >= low) & (prices <= high)
        else:
            in_band = (prices >= low) & (prices < high)
        mean_area = data.loc[in_band, 'mi'].mean()
        # mean() of an empty selection is NaN, which int() cannot convert.
        means.append(np.nan if pd.isna(mean_area) else int(mean_area))
        x.append(f"[{low},{high}" + ("]" if closed_right else ")"))

    X = np.arange(len(x))
    Y = means

    plt.figure(figsize=(8, 10))
    # '-..' is a dash-dot line with point markers.
    plt.plot(X, Y, '-..', color='b')
    plt.title('房子价格和面积之间的关系')
    plt.xlabel('价格区间')
    plt.ylabel('平均面积')
    plt.xticks(X, x, rotation=30)
    ax = plt.gca()
    for position, mean_area in zip(X, Y):
        if np.isnan(mean_area):
            continue
        ax.text(position + 0.2, mean_area + 4, mean_area, bbox=dict(facecolor='red', alpha=0.3))
    plt.grid(True)

    plt.show()


def plot03():
    """Scatter floor area against months in use, with marker size set by price.

    Reads the ``mi``, ``months`` and ``price`` columns of the module-level
    ``data`` frame; each marker's size is the listing price divided by 10.

    :return: None; the chart is shown with ``plt.show()``.
    """
    plt.figure(figsize=(10, 8))

    area = data['mi']
    months_in_use = data['months']
    marker_sizes = data['price'] / 10

    plt.scatter(area, months_in_use, s=marker_sizes, c='r')
    plt.xlabel("面积")
    plt.ylabel("使用月份")
    plt.show()


def plot04():
    """Scatter floor area against rating, with marker size set by price.

    Reads the ``mi``, ``grade`` and ``price`` columns of the module-level
    ``data`` frame; each marker's size is the listing price divided by 10.

    :return: None; the chart is shown with ``plt.show()``.
    """
    plt.figure(figsize=(10, 8))

    area = data['mi']
    rating = data['grade']
    marker_sizes = data['price'] / 10

    plt.scatter(area, rating, s=marker_sizes, c='r')
    plt.xlabel("面积")
    plt.ylabel("评分")
    plt.show()


if __name__ == '__main__':
    # Script entry point: scrape the listing pages into room02.csv, then show
    # the rating/area/price scatter plot.
    head = ['房子描述', '¥价格', '房子构造', '房子面积', '房子朝向', '楼房层数', '建造时间', '地址', '详细地址', '户主姓名', '评分', '发布公司', '网站地址']

    # Append mode keeps the rows collected by earlier runs; the with-block
    # closes the file, so no explicit close() is needed.
    with open('room02.csv', 'a', encoding='gbk', newline='') as f:
        # `writer` must stay a module-level name: get_page() writes through it.
        writer = csv.writer(f)
        # Write the header only into a new/empty file, so repeated runs do
        # not scatter duplicate header rows through the data.
        if f.tell() == 0:
            writer.writerow(head)
        # asyncio.run() creates and closes the event loop itself, replacing
        # the deprecated get_event_loop()/run_until_complete() pair.
        asyncio.run(main())
    plot04()

标签:异步,plt,python,text,price,mi,爬虫,print,data
来源: https://blog.csdn.net/qq_52006948/article/details/119077879

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为广大技术爱好者提供技术内容和技术思想分享交流的网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有