标签:level df Time fitness PA Fitness means pca row
PA-Fitness
一、数据集/数据预处理
1、原始数据集:姓名,年龄,性别,多久运动一次?运动对您的重要性?您当前的健康水平?买过运动器材吗?...
(https://www.kaggle.com/datasets/nithilaa/fitness-analysis)可在这个网站下载
2、处理后的数据集:男女分开(对应代码如下)
# Importing Libraries and Reading Dataset
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import category_encoders as ce
# Show every column when printing wide DataFrames.
pd.set_option('display.max_columns',None)
# Raw Kaggle fitness-analysis survey export (see link above).
df=pd.read_csv("dataset/fitness analysis.csv")
# Data Cleaning
df.info()
print("\n\nNull Values found in Dataframe: " + str(df.isna().sum().sum()) + "\n")
print(df.isna().sum()) # no missing values, so we can encode directly; if there were any, they would need to be imputed first
# Reducing the Column Heading to be more readable and easier to work with
new_cols=['Timestamp','Name','Gender','Age_range','Exercise_importance','Fitness_level','Regularity','Barriers','Exercises','Do_you','Time','Time_spent','Balanced_diet','Prevents_balanced_diet','Health_level','Recommend_fitness','Equipment','Motivation']
# column_reference=pd.DataFrame(new_cols,df.columns)
# print(column_reference)
df.columns=new_cols
# Drop columns irrelevant to the analysis (timestamp and respondent name).
df = df.drop(columns=['Timestamp','Name'],axis = 1)
print(df.head())
#Encoding Data
# Finding Unique Values in necessary Columns
# Print the raw categorical levels of each column that will be
# ordinal-encoded, so the hand-written mappings below can be checked
# against the actual data. (The original repeated the same two lines
# ten times and stored results in variables that were never used.)
_categorical_cols = ['Gender', 'Age_range', 'Fitness_level', 'Regularity',
                     'Do_you', 'Time', 'Time_spent', 'Balanced_diet',
                     'Recommend_fitness', 'Equipment']
for _col in _categorical_cols:
    print(df[_col].unique())
# Creating objects of OrdinalEncoding
# Ordinal-encode the survey answers with explicit, meaningful orderings
# (e.g. Regularity: Never < ... < Everyday) instead of letting the
# encoder assign arbitrary integers.
# BUG FIX: `cols` must be a flat list of column names; the original
# passed a nested list ([[...]]), which is not the documented format
# for category_encoders.OrdinalEncoder.
encoder = ce.OrdinalEncoder(cols=['Gender', 'Age_range', 'Fitness_level', 'Regularity',
                                  'Do_you', 'Time', 'Time_spent', 'Balanced_diet',
                                  'Recommend_fitness', 'Equipment'], return_df=True,
                            mapping=[
                                # Explicit value -> integer mappings per column.
                                {'col': 'Gender',
                                 'mapping': {'Female': 1, 'Male': 2}},
                                {'col': 'Age_range',
                                 'mapping': {'15 to 18': 1, '19 to 25': 2, '26 to 30': 3,
                                             '30 to 40': 4, '40 and above': 5}},
                                {'col': 'Fitness_level',
                                 'mapping': {'Unfit': 1, 'Average': 2, 'Good': 3,
                                             'Very good': 4, 'Perfect': 5}},
                                {'col': 'Regularity',
                                 'mapping': {'Never': 1, '1 to 2 times a week': 2, '2 to 3 times a week': 3,
                                             '3 to 4 times a week': 4, '5 to 6 times a week': 5, 'Everyday': 6}},
                                {'col': 'Do_you',
                                 'mapping': {"I don't really exercise": 1, 'Alone': 2, 'With a friend': 3,
                                             'With a group': 4, 'Within a class environment': 5}},
                                {'col': 'Time',
                                 'mapping': {'Early morning': 1, 'Afternoon': 2, 'Evening': 3}},
                                {'col': 'Time_spent',
                                 'mapping': {"I don't really exercise": 0, '30 minutes': 1, '1 hour': 2,
                                             '2 hours': 3, '3 hours and above': 4}},
                                {'col': 'Balanced_diet',
                                 'mapping': {'No': 0, 'Not always': 1, 'Yes': 2}},
                                {'col': 'Recommend_fitness',
                                 'mapping': {'No': 0, 'Yes': 1}},
                                {'col': 'Equipment',
                                 'mapping': {'No': 0, 'Yes': 1}}
                            ])
df_encode = encoder.fit_transform(df)
# Drop the free-text columns that cannot be ordinally encoded.
df1 = df_encode.drop(columns = ['Barriers','Exercises','Prevents_balanced_diet','Motivation'])
df1.to_csv('dataset/out_fitness_analysis.csv', index=False, columns=['Gender', 'Age_range', 'Exercise_importance','Fitness_level','Regularity','Do_you','Time','Time_spent','Balanced_diet','Health_level','Recommend_fitness','Equipment'])
# print(df1.head())
# df1.info()
'''
Gender Age_range Exercise_importance Fitness_level Regularity Do_you \
0 1 2 2 3 1 1
1 1 2 4 4 1 4
2 1 1 3 3 2 2
3 1 1 4 3 4 2
4 1 2 3 1 1 1
Time Time_spent Balanced_diet Health_level Recommend_fitness Equipment
0 1 0 1 3 1 0
1 1 0 1 4 1 0
2 1 1 1 4 1 1
3 3 2 2 4 1 0
4 3 0 2 4 1 0 '''
# Split the encoded data by gender and export one CSV per group.
# BUG FIX: per the ordinal mapping, Gender == 1 is Female and
# Gender == 2 is Male; the original wrote Gender == 1 rows to
# out_fitness_analysis_male.csv and Gender == 2 rows to
# out_fitness_analysis_female.csv — the two files were swapped.
# Using filtered DataFrame.to_csv also fixes the leaked file handles
# (the manual open()/write() version never closed the files) and
# removes the duplicated row-by-row loops.
out_cols = ['Gender', 'Age_range', 'Exercise_importance', 'Fitness_level',
            'Regularity', 'Do_you', 'Time', 'Time_spent', 'Balanced_diet',
            'Health_level', 'Recommend_fitness', 'Equipment']
df1.loc[df1['Gender'] == 1, out_cols].to_csv(
    'dataset/out_fitness_analysis_female.csv', index=False)
df1.loc[df1['Gender'] == 2, out_cols].to_csv(
    'dataset/out_fitness_analysis_male.csv', index=False)
以上代码将编码后的数据按性别拆分为两个数据集(注意:编码映射中 Female=1、Male=2,筛选条件必须与输出文件名一一对应,否则男女数据会写反)
二、运用PCA和k-means
1、原始的df(11个属性)
2、数据归一化(11个属性)
3、计算每个属性的贡献值,然后选择0.8为阈值,得出需要保留7个components
4、运用k-means 选取最佳目标簇的数值
5、得出5个簇心
6、分簇
7、测试,3个新用户 输入年龄范围 运动重要性... 11个属性 得出其在5类人群中的分类
对应代码如下
import sys
import spotipy
import yaml
import spotipy.util as util
from pprint import pprint
import json
import argparse
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns
from yellowbrick.cluster import KElbowVisualizer
from kneed import KneeLocator
import plotly.graph_objects as go
from plotly.subplots import make_subplots
sns.set()
# Show every column when printing wide DataFrames.
pd.set_option('display.max_columns',None)
# Encoded female subset produced by the preprocessing step above.
df = pd.read_csv("dataset/out_fitness_analysis_female.csv")
# print(df.head())
#Principal Component Analysis (PCA)
# Gender is constant within this file, so exclude it from the features.
non_features = ['Gender']
track_info = df[non_features]
df_X = df.drop(columns=non_features)
print(df_X.head())
scaler = StandardScaler()
X_std = scaler.fit_transform(df_X) # standardize features (zero mean, unit variance) before PCA
print(X_std)
# Fit a full PCA first to inspect how much variance each component explains.
pca = PCA()
pca.fit(X_std)
# The attribute shows how much variance is explained by each of the nine features
evr = pca.explained_variance_ratio_
print(evr)
# Plot the cumulative explained-variance curve to visualize the cutoff.
fig = plt.figure(figsize=(10,8))
plt.plot(range(1, len(df_X.columns)+1), evr.cumsum(), marker='o', linestyle='--')
plt.xlabel('Number of Components', fontsize=18)
plt.ylabel('Cumulative Explained Variance',fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# BUG FIX: plt.savefig returns None; the original rebound `fig` to it.
fig.savefig("dataset/Number_of_Components_PCA.png")
plt.show()
# Keep the smallest number of components explaining >= 80% of variance.
# Robustness: fall back to all components if the threshold is never
# reached (the original would leave n_comps undefined in that case).
cum_var = evr.cumsum()
if (cum_var >= 0.8).any():
    n_comps = int(np.argmax(cum_var >= 0.8)) + 1
else:
    n_comps = len(cum_var)
print("Number of components:", n_comps)
# Refit PCA with the chosen dimensionality and project the data onto it.
pca = PCA(n_components=n_comps)
pca.fit(X_std)
scores_pca = pca.transform(X_std)
#K-Means Clustering
#Finding the elbow point of the WCSS (within cluster sum of squares) curve using the YellowBrick KElbowVisualizer
visualizer = KElbowVisualizer(KMeans(init='k-means++', random_state=42), k=(1,21), timings=False)
visualizer.fit(scores_pca)
visualizer.show()
n_clusters = visualizer.elbow_value_
print("Optimal number of clusters:", n_clusters)
#Finding the elbow point of the WCSS (within cluster sum of squares) curve using the kneed KneeLocator
# (this second estimate overwrites the YellowBrick one, as in the original)
wcss = []
max_clusters = 21
for k in range(1, max_clusters):
    kmeans_pca = KMeans(k, init='k-means++', random_state=42)
    kmeans_pca.fit(scores_pca)
    wcss.append(kmeans_pca.inertia_)
# Locate the knee once and reuse it; the original recomputed the full
# KneeLocator a second time just to draw the vertical line.
knee = KneeLocator(list(range(1, max_clusters)), wcss, curve='convex', direction='decreasing').knee
n_clusters = knee
print("Optimal number of clusters", n_clusters)
fig = plt.figure(figsize=(10,8))
plt.plot(range(1, max_clusters), wcss, marker='o', linestyle='--')
plt.vlines(knee, ymin=min(wcss), ymax=max(wcss), linestyles='dashed')
plt.xlabel('Number of Clusters', fontsize=18)
plt.ylabel('Within Cluster Sum of Squares (WCSS)', fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
fig.savefig("dataset/num_clusters.png")
plt.show()
# Fit the final K-Means model on the PCA scores with the chosen k.
kmeans_pca = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
kmeans_pca.fit(scores_pca)
print(kmeans_pca.cluster_centers_)
# Attach the PCA scores and the cluster label to the original features.
df_seg_pca_kmeans = pd.concat([df_X.reset_index(drop=True), pd.DataFrame(scores_pca)], axis=1)
df_seg_pca_kmeans.columns.values[(-1*n_comps):] = ["Component " + str(i+1) for i in range(n_comps)]
df_seg_pca_kmeans['Cluster'] = kmeans_pca.labels_
print(df_seg_pca_kmeans.head())
# 2-D view of the clusters over the first two principal components.
x = df_seg_pca_kmeans['Component 2']
y = df_seg_pca_kmeans['Component 1']
fig = plt.figure(figsize=(10, 8))
# BUG FIX: seaborn >= 0.12 no longer accepts x/y as positional arguments
# to scatterplot — pass them as keywords.
# NOTE(review): the hard-coded 5-color palette assumes n_clusters == 5;
# it will raise if the elbow picks a different k — confirm.
sns.scatterplot(x=x, y=y, hue=df_seg_pca_kmeans['Cluster'],
                palette=['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple'])
plt.title('Clusters by PCA Components', fontsize=20)
plt.xlabel("Component 2", fontsize=18)
plt.ylabel("Component 1", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Save before show(): show() can clear the current figure on some
# backends, which would write a blank image (the original saved after).
fig.savefig("dataset/clusters-2d.png")
plt.show()
# Test: classify 3 new users (11 encoded attributes each) into one of
# the discovered clusters.
#Gender,Age_range,Exercise_importance,Fitness_level,Regularity,Do_you,Time,
# Time_spent,Balanced_diet,Health_level,Recommend_fitness,Equipment
data = {'Age_range': [1,2,4],
        'Exercise_importance': [1,2,3],
        'Fitness_level': [3,4,5],
        'Regularity': [4,5,6],
        'Do_you': [1,2,4],
        'Time': [1,2,3],
        'Time_spent': [0,2,3],
        'Balanced_diet': [2,1,0],
        'Health_level': [1,2,3],
        'Recommend_fitness': [0,1,0],
        'Equipment': [1,0,1],
        }
frame = pd.DataFrame(data)
print(frame)
# BUG FIX: new samples must be transformed with the scaler fitted on the
# TRAINING data. The original called scaler.fit_transform(frame), which
# refits the scaler on just these 3 rows and standardizes them against
# themselves, producing meaningless PCA coordinates and predictions.
X_std_new = scaler.transform(frame)
scores_pca_new = pca.transform(X_std_new)
print(kmeans_pca.predict(scores_pca_new))
标签:level,df,Time,fitness,PA,Fitness,means,pca,row 来源: https://www.cnblogs.com/monster-little/p/16536395.html
本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享; 2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关; 3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关; 4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除; 5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。