首页 > 其他分享> 文章详细

数据分析-Day05 pandas（2）

2022-01-28 17:06:53 阅读：195 来源： 互联网

标签：数据分析 index 1.0 df NaN Day05 print 100 pandas

1. 字符串离散化

1. 字符串离散化

1）获取字符串去重后的列表；

2）构造全为0的数组，其中colums为字符串的列表；

3）遍历，给全为0的数组赋值

#根据一组从2006年到2016年1000部最流行的电影数据，统计电影体裁(genre)的情况

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

pd.set_option('display.max_columns', 100)

df = pd.read_csv("./IMDB-Movie-Data.csv")

#获取字符串去重后的列表
temp_list = df["Genre"].str.split(",").tolist()
genre_list = list(set([i for j in temp_list for i in j]))

#构造全为0的数组
zeros_df = pd.DataFrame(np.zeros((df.shape[0],len(genre_list))),columns=genre_list)
# print(zeros_df)

#按照每个电影的体裁，给数组赋值"1"
for i in range(df.shape[0]):
    zeros_df.loc[i,temp_list[i]] = 1

#统计每种体裁的电影数量
genre_count = zeros_df.sum(axis=0)

#排序
genre_count = genre_count.sort_values(ascending=False)
# print("*"*100)
# print(genre_count)

#画图
plt.figure(figsize=(20,8),dpi=80)

_x = genre_count.index
_y = genre_count.values

plt.bar(range(len(_x)),_y)

plt.xticks(range(len(_x)),_x)

plt.savefig("./movie_genre.png")

plt.show()

2. 数据合并

2.1 join

按照index进行分组，默认情况下将行索引相同的数据合并到一起：t1.join(t2)

#测试数据合并 join

import pandas as pd
import numpy as np

df1 = pd.DataFrame(np.ones((2,4)),index=["A","B"],columns=list("abcd"))
df2 = pd.DataFrame(np.ones((3,3)),index=["A","B","C"],columns=list("xyz"))
temp1 = df1.join(df2)
print(temp1)
temp2 = df2.join(df1)
print(temp2)


'''
输出结果：
a    b    c    d    x    y    z
A  1.0  1.0  1.0  1.0  1.0  1.0  1.0
B  1.0  1.0  1.0  1.0  1.0  1.0  1.0
****************************************************************************************************
     x    y    z    a    b    c    d
A  1.0  1.0  1.0  1.0  1.0  1.0  1.0
B  1.0  1.0  1.0  1.0  1.0  1.0  1.0
C  1.0  1.0  1.0  NaN  NaN  NaN  NaN
'''

2.2 merge

按照指定的列，将数据以一定的方式合并到一起：t1.merge(t2, on="")

#测试数据合并 merge

import pandas as pd
import numpy as np

df1 = pd.DataFrame(np.ones((2,4)),index=["A","B"],columns=list("abcd"))
df2 = pd.DataFrame(np.ones((3,3)),index=["A","B","C"],columns=list("xyz"))
df3 = pd.DataFrame(np.arange(9).reshape((3,3)),columns=list("fax"))
temp3 = df1.merge(df3,on="a")     #默认为“inner”，内连接
print(temp3)                      #df1中有两行“a”列的取值都为1，所以合并后有两行
print("*"*100)

df1.loc["A","a"] = 100
temp4 = df1.merge(df3,on="a",how="outer")        
print(temp4)
print("*"*100)

temp5 = df1.merge(df3,on="a",how="left")
print(temp5)
print("*"*100)

temp6 = df1.merge(df3,on="a",how="right")
print(temp6)


'''
输出数据：
a    b    c    d  f  x
0  1.0  1.0  1.0  1.0  0  2
1  1.0  1.0  1.0  1.0  0  2
****************************************************************************************************
       a    b    c    d    f    x
0  100.0  1.0  1.0  1.0  NaN  NaN
1    1.0  1.0  1.0  1.0  0.0  2.0
2    4.0  NaN  NaN  NaN  3.0  5.0
3    7.0  NaN  NaN  NaN  6.0  8.0
****************************************************************************************************
       a    b    c    d    f    x
0  100.0  1.0  1.0  1.0  NaN  NaN
1    1.0  1.0  1.0  1.0  0.0  2.0
****************************************************************************************************
     a    b    c    d  f  x
0  1.0  1.0  1.0  1.0  0  2
1  4.0  NaN  NaN  NaN  3  5
2  7.0  NaN  NaN  NaN  6  8
'''

3. 数据分组聚合

df.groupby(by="columns_name")

grouped是一个可迭代的DataFrameGroupBy对象，其中的每一个元素是一个元组：(索引(分组的值)，分组之后的DataFrame）

#根据一组关于全球星巴克店铺的统计数据，测试pandas 分组聚合

import pandas as pd

pd.set_option('display.max_columns', 100)

df = pd.read_csv("./starbucks_store_worldwide.csv")

grouped1 = df.groupby(by="Country")    #按列分组，DataFrameGroupBy 对象，可迭代
print(grouped1)

#DataFrameGroupBy
#可以进行遍历
for i,j in grouped1:
    print(i)
    print("-"*50)
    print(j)
    print("*"*50)

#调用聚合方法
print(grouped1.count())
country_count = grouped1["Brand"].count()
print(country_count["US"])
print(country_count["CN"])

#统计中国每个省份的店铺数量
China_data = df[df["Country"] == "CN"]
grouped2 = China_data.groupby(by="State/Province").count()["Brand"]
print(grouped2)

#按照多个条件进行分组，返回Series
grouped3 = df["Brand"].groupby(by=[df["Country"],df["State/Province"]]).count()
print(grouped3)    #Series类型
grouped4 = df.groupby(by=["Country","State/Province"])["Country"].count()
grouped5 = df.groupby(by=["Country","State/Province"]).count()["Brand"]
print(grouped4)
print(grouped5)

#按照多个条件进行分组，返回DataFrame
grouped6 = df[["Brand"]].groupby(by=[df["Country"],df["State/Province"]]).count()
grouped7 = df.groupby(by=[df["Country"],df["State/Province"]])[["Brand"]].count()
grouped8 = df.groupby(by=[df["Country"],df["State/Province"]]).count()[["Brand"]]
print(grouped6)
print(grouped7)
print(grouped8)

4. 数据索引、复合索引

简单的索引操作：

• 获取 index ： df.index • 指定 index ： df.index = [' x','y '] • 重新设置 index : df.reindex ( list(" abcedf ")) • 指定某一列作为 index ： df.set_index (" Country",drop =False) • 返回 index 的唯一值： df.set_index ("Country"). index.unique ()

#测试pandas 索引和复合索引

import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 100)

df = pd.read_csv("./starbucks_store_worldwide.csv")

grouped = df[["Brand"]].groupby(by=[df["Country"],df["State/Province"]]).count()
# print(grouped)
print(grouped.index)
print("*"*100)

df1 = pd.DataFrame(np.ones((2,4)),index=["A","B"],columns=list("abcd"))
df1.loc["A","a"] = 100
df1.index = ["x","y"]
print(df1)
print("*"*100)

df2 = df1.reindex(["x","z"])   #从df1中选取“x”和“z”两行赋值给df2
print(df2)
print("*"*100)
# print(df1)
df3 = df1.set_index("a",drop=False)    #drop参数设定是否将指定列从dataframe中删除，默认为True
print(df3)
print("*"*100)
df4 = df1.set_index("b",drop=False).index.unique()
print(df4)
print("*"*100)
df5 = df1.set_index(["a","c"])
print(df5.index)
print("*"*100)

a = pd.DataFrame({'a': range(7),'b': range(7, 0, -1),'c': ['one','one','one','two','two','two', 'two'],'d': list("hjklmno")})
print(a)
print("*"*100)
b = a.set_index(["c","d"])
print(b)


'''
输出结果：
MultiIndex([('AD',  '7'),
            ('AE', 'AJ'),
            ('AE', 'AZ'),
            ('AE', 'DU'),
            ('AE', 'FU'),
            ('AE', 'RK'),
            ('AE', 'SH'),
            ('AE', 'UQ'),
            ('AR',  'B'),
            ('AR',  'C'),
            ...
            ('US', 'UT'),
            ('US', 'VA'),
            ('US', 'VT'),
            ('US', 'WA'),
            ('US', 'WI'),
            ('US', 'WV'),
            ('US', 'WY'),
            ('VN', 'HN'),
            ('VN', 'SG'),
            ('ZA', 'GT')],
           names=['Country', 'State/Province'], length=545)
****************************************************************************************************
       a    b    c    d
x  100.0  1.0  1.0  1.0
y    1.0  1.0  1.0  1.0
****************************************************************************************************
       a    b    c    d
x  100.0  1.0  1.0  1.0
z    NaN  NaN  NaN  NaN
****************************************************************************************************
           a    b    c    d
a                          
100.0  100.0  1.0  1.0  1.0
1.0      1.0  1.0  1.0  1.0
****************************************************************************************************
Float64Index([1.0], dtype='float64', name='b')
****************************************************************************************************
MultiIndex([(100.0, 1.0),
            (  1.0, 1.0)],
           names=['a', 'c'])
****************************************************************************************************
   a  b    c  d
0  0  7  one  h
1  1  6  one  j
2  2  5  one  k
3  3  4  two  l
4  4  3  two  m
5  5  2  two  n
6  6  1  two  o
****************************************************************************************************
       a  b
c   d      
one h  0  7
    j  1  6
    k  2  5
two l  3  4
    m  4  3
    n  5  2
    o  6  1
'''

#测试Series复合索引

import pandas as pd

a = pd.DataFrame({'a': range(7),'b': range(7, 0, -1),'c': ['one','one','one','two','two','two', 'two'],'d': list("hjklmno")})
b = a.set_index(["c","d"])
c = b["a"]
print(c["one"])
print("*"*100)
print(c["one"]["j"])
print("*"*100)
print(c["one","j"])
print("*"*100)

d = a.set_index(["d","c"])["a"]
d1 = a.set_index(["d","c"])[["a"]]    #DataFrame
print(d)
print("*"*100)
print(d1)
print("*"*100)

e = d.swaplevel()       #交换内外侧索引层次，可帮助取内层索引
print(e)
print("*"*100)

print(b.loc["one"].loc["h"])


'''
输出结果：
d
h    0
j    1
k    2
Name: a, dtype: int64
****************************************************************************************************
1
****************************************************************************************************
1
****************************************************************************************************
d  c  
h  one    0
j  one    1
k  one    2
l  two    3
m  two    4
n  two    5
o  two    6
Name: a, dtype: int64
****************************************************************************************************
       a
d c     
h one  0
j one  1
k one  2
l two  3
m two  4
n two  5
o two  6
****************************************************************************************************
c    d
one  h    0
     j    1
     k    2
two  l    3
     m    4
     n    5
     o    6
Name: a, dtype: int64
****************************************************************************************************
a    0
b    7
Name: h, dtype: int64
'''

标签：数据分析,index,1.0,df,NaN,Day05,print,100,pandas
来源： https://blog.csdn.net/wutongxdf/article/details/122733242

本站声明： 1. iCode9 技术分享网（下文简称本站）提供的所有内容，仅供技术学习、探讨和分享；
2. 关于本站的所有留言、评论、转载及引用，纯属内容发起人的个人观点，与本站观点和立场无关；
3. 关于本站的所有言论和文字，纯属内容发起人的个人观点，与本站观点和立场无关；
4. 本站文章均是网友提供，不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属；如您发现该文章侵犯了您的权益，可联系我们第一时间进行删除；
5. 本站为非盈利性的个人网站，所有内容不会用来进行牟利，也不会利用任何形式的广告来间接获益，纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

ICode9

数据分析-Day05 pandas（2）

1. 字符串离散化

2. 数据合并

2.1 join

2.2 merge

3. 数据分组聚合

4. 数据索引、复合索引