ICode9

精准搜索请尝试: 精确搜索
首页 > 编程语言> 文章详细

Python3数据分析处理库pandas

2021-07-09 10:35:52  阅读:264  来源: 互联网

标签:数据分析 info food series survival titanic print pandas Python3


用pandas封装函数对数据进行读取,预处理,数据分析等操作。
pandas库是基于numpy库编写的, 在命令行窗口安装完numpy后,安装pandas:pip install pandas。

相关numpy库的内容参考
http://blog.csdn.net/cymy001/article/details/78163468

通常需要pandas读取的数据文件的文本格式为.txt,.csv,.json
pandas里定义的数据类型:
(1.)object字符值(2.)int整型(3.)float浮点型(4.)datetime时间值(5.)bool布尔值

    #Python pandas introduce
    #导入数据集
    #import csv
    #food_info=csv.reader('D:\PYTHON35\idle\database\pandas\food_info.csv')
    #print(type(food_info))
    ##<class '_csv.reader'>
    
    import pandas as pd
    import os
    food_info_site = r"D:\PYTHON35\idle\database\pandas\food_info.csv"
    # pd.read_csv accepts a full path (it is not limited to a bare file name),
    # so the file is read directly.  This avoids os.chdir(), which silently
    # changes the working directory of the whole process.
    food_info = pd.read_csv(food_info_site)
    print(type(food_info))
    # Prints <class 'pandas.core.frame.DataFrame'>: read_csv returns a DataFrame.
    print(food_info.dtypes)  # dtype of every column; one column holds one dtype
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # head(n) returns the first n rows; n defaults to 5 when omitted.
    first_rows = food_info.head(n=3)
    print(first_rows)
    # .columns holds the column labels of the data set.
    print(food_info.columns)
    # .shape is the (number of rows, number of columns) tuple.
    print(food_info.shape)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # .loc selects rows by index label.  food_info was read without an index
    # column, so the labels count the data rows from 0 (the header is excluded).
    print(food_info.loc[0])
    # Columns that hold text are reported with the pandas dtype "object".
    # A label that does not exist in the index raises KeyError.
    print(food_info.loc[6])
    # Label slices include both end points: rows 3, 4, 5 and 6.
    print(food_info.loc[3:6])
    # A list of labels selects exactly those rows: 2, 5 and 10.
    two_five_ten = [2, 5, 10]
    print(food_info.loc[two_five_ten])
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # Indexing the DataFrame with one column name returns that column.
    ndb_col = food_info['NDB_No']
    print(ndb_col)
    print('_________________________________________')
    # Indexing with a list of column names returns a DataFrame holding just
    # those columns (food_info[['Zinc_(mg)', 'Copper_(mg)']] is the one-liner).
    columns = ['Zinc_(mg)', 'Copper_(mg)']
    zinc_copper = food_info[columns]
    print(zinc_copper)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # Pick out every column whose name ends with the unit suffix "(g)".
    print(food_info.columns)
    print(food_info.head(2))
    # Convert the column Index into a plain Python list of names.
    col_names = food_info.columns.tolist()
    print(col_names)
    # Keep only the names that end with "(g)".
    gram_columns = [name for name in col_names if name.endswith('(g)')]
    # Select those columns from the data set.
    gram_df = food_info[gram_columns]
    print(gram_df.head(3))
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # Arithmetic between a column and a scalar is applied to every element.
    iron_mg = food_info['Iron_(mg)']
    div_1000 = iron_mg / 1000  # milligrams -> grams
    add_100 = iron_mg + 100
    sub_100 = iron_mg - 100
    mult_2 = iron_mg * 2

    # Arithmetic between two columns combines the values at matching positions.
    water_energy = food_info['Water_(g)'] * food_info['Energ_Kcal']
    iron_grams = iron_mg / 1000
    # Assigning to a column name that does not exist yet adds a new column.
    food_info['Iron_(g)'] = iron_grams

    # Combine two derived columns into a single score.
    weighted_protein = food_info['Protein_(g)'] * 2
    weighted_fat = -0.75 * food_info['Lipid_Tot_(g)']
    initial_rating = weighted_protein + weighted_fat
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # Normalize each column by dividing it by its own maximum value.
    calories = food_info['Energ_Kcal']
    max_calories = calories.max()
    normalized_calories = calories / max_calories

    protein = food_info['Protein_(g)']
    normalized_protein = protein / protein.max()
    food_info['Normalized_Protein'] = normalized_protein  # store as a new column

    fat = food_info['Lipid_Tot_(g)']
    normalized_fat = fat / fat.max()
    food_info['Normalized_Fat'] = normalized_fat  # store as a new column
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # Sorting the rows of a DataFrame by one column.
    print(food_info['Sodium_(mg)'])
    print('_________________________________________')
    # sort_values sorts ascending by default.  inplace=True reorders food_info
    # itself instead of returning a new, sorted DataFrame.
    food_info.sort_values(by='Sodium_(mg)', inplace=True)
    print(food_info['Sodium_(mg)'])
    print('_________________________________________')
    # ascending=False switches to descending order.
    food_info.sort_values(by='Sodium_(mg)', ascending=False, inplace=True)
    print(food_info['Sodium_(mg)'])
[/code]

```code
    import pandas as pd
    import numpy as np
    import os
    titanic_survival_site = r"D:\PYTHON35\idle\database\pandas\titanic_train.csv"
    # pd.read_csv accepts a full path, so the file is read directly instead of
    # changing the process-wide working directory with os.chdir() first.
    titanic_survival = pd.read_csv(titanic_survival_site)
    print(type(titanic_survival))  # <class 'pandas.core.frame.DataFrame'>
    print(titanic_survival.head())  # first 5 rows
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # Handling missing values in the Age column.
    age = titanic_survival['Age']
    print(age.loc[0:10])  # missing entries are shown as NaN
    # pd.isnull returns a boolean Series: True where the value is missing.
    age_is_null = pd.isnull(age)
    print(age_is_null)
    # Boolean indexing keeps only the entries whose mask value is True,
    # i.e. the missing ones; their count is the number of missing ages.
    age_null_true = age[age_is_null]
    print(len(age_null_true))
    print('_________________________________________')
    # Naive mean using the builtin sum(): a single NaN makes the total NaN,
    # so this prints nan whenever Age contains missing values.
    mean_age = sum(titanic_survival['Age']) / len(titanic_survival['Age'])
    print(mean_age)
    # Keep only the ages that are present (inverted mask), then average them.
    good_ages = titanic_survival['Age'][~age_is_null]
    print(good_ages)
    correct_mean_age = sum(good_ages) / len(good_ages)
    print(correct_mean_age)
    print('_________________________________________')
    # Series.mean() skips missing values by default, so no manual filtering
    # is needed.
    correct_mean_age = titanic_survival['Age'].mean()
    print(correct_mean_age)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # Average ticket fare for each passenger class, computed by hand.
    passenger_classes = [1, 2, 3]
    fares_by_class = {}
    for this_class in passenger_classes:
        # Boolean mask selects the passengers travelling in this class.
        pclass_rows = titanic_survival[titanic_survival['Pclass'] == this_class]
        pclass_fares = pclass_rows['Fare']  # fares paid by those passengers
        fare_for_class = pclass_fares.mean()
        fares_by_class[this_class] = fare_for_class
    print(fares_by_class)
    print('_________________________________________')
    # pivot_table groups the rows by `index` and aggregates `values` with
    # `aggfunc` (mean is also the default).  The aggregation is given by its
    # string name: passing the numpy callables np.mean / np.sum raises a
    # FutureWarning on recent pandas, and the string keeps the same result.
    passenger_survival = titanic_survival.pivot_table(
        index='Pclass', values='Survived', aggfunc='mean')
    print(passenger_survival)
    # A list of value columns aggregates several columns per group at once.
    port_stats = titanic_survival.pivot_table(
        index='Embarked', values=['Fare', 'Survived'], aggfunc='sum')
    print(port_stats)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # Dropping missing data.
    # axis=1 works column-wise: every COLUMN containing at least one missing
    # value is removed.
    drop_na_columns = titanic_survival.dropna(axis=1)
    # axis=0 works row-wise; subset limits the check to the Age and Sex
    # columns, so every ROW missing either of them is removed.
    new_titanic_survival = titanic_survival.dropna(subset=['Age', 'Sex'], axis=0)
    print(new_titanic_survival)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # .loc[row_label, column_name] reads a single cell.
    row_index_83_age = titanic_survival.loc[83, 'Age']
    print(row_index_83_age)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

    # After sorting, each row keeps its original index label; reset_index
    # renumbers the rows from 0 in their new order.
    new_titanic_survival = titanic_survival.sort_values(by='Age', ascending=False)
    print(new_titanic_survival.iloc[0:10])  # first 10 rows, original labels
    # drop=True discards the old index instead of keeping it as a column.
    titanic_reindex = new_titanic_survival.reset_index(drop=True)
    print(titanic_reindex.iloc[0:10])  # same rows, now labelled 0..9
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    #自定义函数作用于Dataframe
    def hundredth_row(column):
        """Return the item at position 99 (the 100th entry) of ``column``.

        ``column`` must support positional ``.iloc`` indexing (e.g. a pandas
        Series) and hold at least 100 items.
        """
        return column.iloc[99]
    # DataFrame.apply calls the given function once per column (axis=0 is the
    # default) and collects the return values.  The result gets its own name
    # so that the hundredth_row function is not overwritten and stays callable.
    hundredth_row_values = titanic_survival.apply(hundredth_row)
    print(hundredth_row_values)
    print('_________________________________________')
    def not_null_count(column):
        """Return how many entries of ``column`` are missing (null/NaN).

        NOTE: despite its name, this counts the null entries, not the
        non-null ones.
        """
        # Summing the boolean mask counts its True entries.
        return int(pd.isnull(column).sum())
    # Run the counter on every column (axis=0 hands one column per call).
    column_null_count = titanic_survival.apply(not_null_count, axis=0)
    print(column_null_count)
    print('_________________________________________')
    def which_class(row):
        """Map the ``Pclass`` value of ``row`` to a readable class name.

        Returns 'Unknown' when ``Pclass`` is missing, the matching label for
        1, 2 or 3, and None for any other value.
        """
        pclass = row['Pclass']
        if pd.isnull(pclass):
            return 'Unknown'
        labels = {1: 'First Class', 2: 'Second Class', 3: 'Third Class'}
        # dict.get yields None for values outside 1-3.
        return labels.get(pclass)
    # axis='columns' (the same as axis=1) hands each ROW to which_class.
    classes = titanic_survival.apply(which_class, axis='columns')
    print(classes)
[/code]

**pandas的3种主要数据结构** :  
**Series** ——一些值的集合,数据元素;支持float,int,bool,
datetime,timedelta,category,object类型  
**DataFrame** ——Series的集合  
**Panel** ——DataFrame的集合(注意:Panel已在pandas 0.25中被移除,新版本中请改用带MultiIndex的DataFrame)

```code
    import pandas as pd
    import os
    fandango_site = r"D:\PYTHON35\idle\database\pandas\fandango_score_comparison.csv"
    # pd.read_csv accepts a full path, so the file is read directly instead of
    # changing the process-wide working directory with os.chdir() first.
    fandango = pd.read_csv(fandango_site)
    print(type(fandango))
    # Selecting one column of a DataFrame returns a Series.
    series_film = fandango['FILM']  # film titles
    print(series_film[0:5])  # printed as "index label  title" pairs
    series_rt = fandango['RottenTomatoes']  # RottenTomatoes scores
    print(series_rt[0:5])
    print('_________________________________________')
    # A Series can use any values as its index, here the film titles.
    from pandas import Series
    # .values exposes the underlying data as an array (its type is printed).
    film_names = series_film.values
    print(type(film_names))
    print(film_names)
    rt_scores = series_rt.values
    print(rt_scores)
    # Build a Series whose values are the scores and whose index is the titles.
    series_custom = Series(data=rt_scores, index=film_names)
    print(series_custom)
    print('_____________')
    # Look entries up by their title labels.
    print(series_custom.loc[['Ant-Man (2015)', 'The Water Diviner (2015)']])
    print('_____________')
    # Positional access still works alongside the label index.
    fiveten = series_custom.iloc[5:10]
    print(fiveten)
    print('_____________')
    # Sort the titles alphabetically, then reorder the Series to match them.
    original_index = series_custom.index.tolist()
    sorted_index = sorted(original_index)
    sorted_by_index = series_custom.reindex(sorted_index)
    print(sorted_by_index)
    print('_____________')
    sc2 = series_custom.sort_index()  # ordered by index label
    print(sc2.iloc[0:10])
    sc3 = series_custom.sort_values()  # ordered by value
    print(sc3.iloc[0:10])
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # The data behind a Series is a numpy array; this prints
    # <class 'numpy.ndarray'>.
    print(type(series_custom.values))
    import numpy as np
    # numpy functions can therefore be applied to a Series directly.
    doubled = np.add(series_custom, series_custom)
    print(doubled)
    sines = np.sin(series_custom)
    print(sines)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # Filtering: keep the scores strictly between 50 and 75.
    criteria_one = series_custom > 50  # boolean mask: score above 50
    criteria_two = series_custom < 75  # boolean mask: score below 75
    # & combines the two masks element by element.
    both_criteria = series_custom.loc[criteria_one & criteria_two]
    print(both_criteria)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # Arithmetic between two Series that share the film titles as index.
    film_index = fandango['FILM']
    rt_critics = Series(fandango['RottenTomatoes'].values, index=film_index)
    re_users = Series(fandango['RottenTomatoes_User'].values, index=film_index)
    # Average of the critic score and the user score for each film.
    rt_mean = (rt_critics + re_users) / 2
    print(rt_mean)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
[/code]

```code
    #在DataFrame结构里设置索引列
    import pandas as pd
    import os
    fandango_site = r"D:\PYTHON35\idle\database\pandas\fandango_score_comparison.csv"
    # pd.read_csv accepts a full path, so the file is read directly instead of
    # changing the process-wide working directory with os.chdir() first.
    fandango = pd.read_csv(fandango_site)
    print(type(fandango))
    # set_index makes the FILM column the row index.  drop=False keeps FILM
    # as an ordinary column too, so it can still be read as a value.
    fandango_films = fandango.set_index(keys='FILM', drop=False)
    print(fandango_films.index)

    # With titles as the index, .loc looks rows up by film title.
    movies = ['Ant-Man (2015)', 'The Water Diviner (2015)']
    print(fandango_films.loc[movies])
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    
    # Using numpy together with a DataFrame.
    # This snippet only imports pandas and os at its top, so numpy is imported
    # here; without it the np.std calls below raise NameError.
    import numpy as np

    types = fandango_films.dtypes
    print(types)
    # Labels of the columns whose dtype is float64.
    float_columns = types[types.values == 'float64'].index
    float_df = fandango_films[float_columns]  # only the float64 columns
    # Standard deviation (not variance) of every column.  np.std defaults to
    # the population form (ddof=0), unlike pandas' own .std() (ddof=1).
    deviations = float_df.apply(lambda x: np.std(x))
    print(deviations)

    # NOTE(review): 'Metacritic_user_nom' is spelled without the final "r";
    # presumably this matches the CSV header - confirm against the file.
    rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
    # axis=1 applies the function to each row, i.e. across the two columns.
    rowdeviation = rt_mt_user.apply(lambda x: np.std(x), axis=1)
    print(rowdeviation)

在这里插入图片描述

标签:数据分析,info,food,series,survival,titanic,print,pandas,Python3
来源: https://www.cnblogs.com/nigulasiximegn/p/14989425.html

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有