Crisp overview of the dataset
Problem Framework
Problem Statement
# All the necessary imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Feature names are loaded from the features.txt file shipped with the dataset.
# Each line is "<index> <name>"; we keep only the name (second token).
with open('/home/codename_sai/Desktop/1_HumanActivityRecognition/UCI HAR Dataset/features.txt') as f:
    features = [line.split()[1] for line in f]
print("No of features : {}".format(len(features)))
print(features[:10])
datapath = '/home/codename_sai/Desktop/1_HumanActivityRecognition/UCI HAR Dataset/'
# Train data is loaded from the X_train and y_train files given.
X_train = pd.read_csv(datapath + 'train/X_train.txt', sep=r'\s+', names=features)
# Add the subject id column to the dataframe.
# header=None is essential here: without it read_csv consumes the first
# subject id as a column header, mis-aligning every remaining row by one.
X_train['Subject'] = pd.read_csv(datapath + 'train/subject_train.txt',
                                 header=None, names=['Subject'])['Subject']
y_train = pd.read_csv(datapath + 'train/y_train.txt', sep=r'\s+', names=['Activity'])
# Test data is loaded from the X_test and y_test files given.
X_test = pd.read_csv(datapath + 'test/X_test.txt', sep=r'\s+', names=features)
# Add the subject id column to the dataframe (same header=None caveat).
X_test['Subject'] = pd.read_csv(datapath + 'test/subject_test.txt',
                                header=None, names=['Subject'])['Subject']
y_test = pd.read_csv(datapath + 'test/y_test.txt', sep=r'\s+', names=['Activity'])
# train data has 7352 datapoints and 562 features (561 sensor features + Subject)
X_train.shape
# UCI HAR activity codes (matches the mapping applied to ActivityNames below):
# 1 - Walking
# 2 - WalkingUpstairs
# 3 - WalkingDownstairs
# 4 - Sitting
# 5 - Standing
# 6 - Laying
y_train['Activity'].value_counts()
type(X_train), X_train.shape, type(y_train), y_train.shape
# Add class label to the X_train
X_train['Activity'] = y_train['Activity']
X_train.head()
y_train.head()
Data Cleaning - Dirtying hands
# Checking for duplicated rows
X_train.duplicated().sum(), X_test.duplicated().sum()
# Checking for NaN/null values
X_train.isnull().values.any(), X_test.isnull().values.any()
# Taking the rows which have at least one NaN
nan_rows_train = X_train[X_train.isnull().any(axis=1)]
nan_rows_test = X_test[X_test.isnull().any(axis=1)]
print('NaN rows in train')
nan_rows_train
print('NaN rows in test')
nan_rows_test
# Here we get an extra feature for train as we have added the class label to X_train
X_train.shape, X_test.shape
# Dropping the NaN/Null rows
X_train = X_train.dropna(how='any')
X_test = X_test.dropna(how='any')
# Drop the corresponding y_train and y_test rows by index alignment, so the
# labels stay in sync with the features whatever rows were dropped (the
# original hard-coded row numbers 7351 and 2946, which breaks silently if
# the data changes).
y_train = y_train.loc[X_train.index]
y_test = y_test.loc[X_test.index]
# Checking the shape after dropping NaN's
X_train.shape, X_test.shape
# Cross checking for the successful drop of NaN
X_train.isnull().values.any(), X_test.isnull().values.any()
X_train.columns
From the above output we can see that the column names contain '(', ')' and '-' characters, which hurt readability.
# Removing '()' and '-' from column names for readability.
# regex=True is required: newer pandas defaults Series.str.replace to literal
# matching, which would leave these character-class patterns untouched.
X_train.columns = X_train.columns.str.replace('[()]', '', regex=True)
X_train.columns = X_train.columns.str.replace('[-]', '', regex=True)
X_test.columns = X_test.columns.str.replace('[()]', '', regex=True)
X_test.columns = X_test.columns.str.replace('[-]', '', regex=True)
# Replacing 'BodyBody' in column names with 'Body', which is a typo in the dataset.
X_train = X_train.rename(columns=lambda x: str(x).replace('BodyBody', 'Body'))
X_test = X_test.rename(columns=lambda x: str(x).replace('BodyBody', 'Body'))
# Adding an 'ActivityNames' column for interpretability: map each activity
# code to its human-readable name in a single pass.
activity_names = {
    1: 'Walking',
    2: 'WalkingUpstairs',   # fixes the original 'WalingUpstairs' typo
    3: 'WalkingDownstairs',
    4: 'Sitting',
    5: 'Standing',
    6: 'Laying',
}
X_train['ActivityNames'] = X_train['Activity'].map(activity_names)
# Copy X_train to df so later edits to df cannot silently mutate X_train
# (plain assignment, as in the original, would only alias it).
df = X_train.copy()
Feature Engineering from Domain Knowledge - Dirtying mind
By studying how the accelerometer and gyroscope work and examining the features carefully, we arrive at the following.
After all the feature engineering and analysis we keep the following 35 features (38 columns once Subject, Activity and ActivityNames are added).
# Final hand-picked feature list (order preserved; used to slice the frames).
f = [
    'tBodyAccMagmean', 'tBodyAccMagstd',
    'tBodyAccJerkMagmean', 'tBodyAccJerkMagstd',
    'tBodyGyroMagmean', 'tBodyGyroMagstd',
    'tBodyGyroJerkMagmean', 'tBodyGyroJerkMagstd',
    'fBodyAccMagmean', 'fBodyAccMagstd',
    'fBodyAccJerkMagmean', 'fBodyAccJerkMagstd',
    'fBodyGyroMagmean', 'fBodyGyroMagstd',
    'fBodyGyroJerkMagmean', 'fBodyGyroJerkMagstd',
    'fBodyGyroMagmeanFreq', 'fBodyGyroJerkMagmeanFreq',
    'fBodyAccMagmeanFreq', 'fBodyAccJerkMagmeanFreq',
    'fBodyAccMagskewness', 'fBodyAccMagkurtosis',
    'fBodyAccJerkMagskewness', 'fBodyAccJerkMagkurtosis',
    'fBodyGyroMagskewness', 'fBodyGyroMagkurtosis',
    'fBodyGyroJerkMagskewness', 'fBodyGyroJerkMagkurtosis',
    'angletBodyAccJerkMean,gravityMean', 'angletBodyAccMean,gravity',
    'angletBodyGyroJerkMean,gravityMean', 'angletBodyGyroMean,gravityMean',
    'angleX,gravityMean', 'angleY,gravityMean', 'angleZ,gravityMean',
]
# The full reduced frame also carries the label/metadata columns.
f1 = [*f, 'Activity', 'ActivityNames', 'Subject']
dfR = df[f]
dfR_test = X_test[f]
# New dataframe with the reduced feature set plus labels.
df = df[f1]
df.shape
# Just for readability of feature names: drop 'Body' and 'Mag', and replace
# 'mean' with 'Mean' and 'std' with 'SD'.
#   tAccMean refers to tBodyAccMagMean
#   fAccMean refers to fBodyAccMagMean
def _shorten(name):
    # Apply the substitutions in the same order as the original chained renames.
    for old, new in (('Body', ''), ('Mag', ''), ('mean', 'Mean'), ('std', 'SD')):
        name = str(name).replace(old, new)
    return name
df = df.rename(columns=_shorten)

# For plotting purposes, one sub-frame per activity, paired with the
# color/label used throughout the plots below.
activity_frames = [
    (df[df['Activity'] == 1], 'red',    'Walking'),
    (df[df['Activity'] == 2], 'blue',   'Walking Up'),
    (df[df['Activity'] == 3], 'green',  'Walking down'),
    (df[df['Activity'] == 4], 'yellow', 'Sitting'),
    (df[df['Activity'] == 5], 'm',      'Standing'),
    (df[df['Activity'] == 6], 'orange', 'Laying'),
]
df1, df2, df3, df4, df5, df6 = (frame for frame, _, _ in activity_frames)

# Set the style BEFORE plotting: in the original, sns.set_style came after the
# distplot calls and therefore had no effect on the already-drawn axes.
sns.set_style("whitegrid")
# Overlaid histograms of total body-acceleration magnitude per activity.
for frame, color, label in activity_frames:
    sns.distplot(frame['tAccMean'], color=color, hist=True, kde=False, label=label)
plt.legend()
plt.tight_layout()
plt.show()
# Side-by-side distributions of the body-acceleration/gravity angle:
# moving activities on the left, static ones on the right.
_angle_col = 'angletAccMean,gravity'
plt.subplot(2, 2, 1)
for frame, color, label in ((df1, 'red', 'Walking'),
                            (df2, 'blue', 'Walking Up'),
                            (df3, 'green', 'Walking down')):
    sns.distplot(frame[_angle_col], color=color, hist=True, kde=True, label=label)
plt.legend()
plt.subplot(2, 2, 2)
for frame, color, label in ((df4, 'yellow', 'Sitting'),
                            (df5, 'm', 'Standing'),
                            (df6, 'orange', 'Laying')):
    sns.distplot(frame[_angle_col], color=color, hist=True, kde=True, label=label)
sns.set_style("whitegrid")
plt.legend()
plt.tight_layout()
plt.show()
# Box plots of selected features per activity (outliers hidden).
sns.boxplot(x='ActivityNames', y='tAccMean', data=df, showfliers=False, saturation=1)
plt.xticks(rotation=90)
plt.show()
sns.boxplot(x='ActivityNames', y='angleX,gravityMean', data=df, showfliers=False)
plt.xticks(rotation=40)
plt.show()
sns.boxplot(x='ActivityNames', y='angleY,gravityMean', data=df, showfliers=False)
plt.xticks(rotation=40)
plt.show()
Let's see whether the features we hand-picked heuristically from domain knowledge make sense.
Logistic Regression on reduced features
# imports
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a LogisticRegression model on the reduced feature set.
# .values.ravel() hands sklearn a 1-D label array, avoiding the
# DataConversionWarning raised when fitting on a single-column DataFrame.
lgr = LogisticRegression()
lgr.fit(dfR, y_train.values.ravel())
y_pred_lgr = lgr.predict(dfR_test)
print(accuracy_score(y_test, y_pred_lgr))
Random forests on reduced features
from sklearn.ensemble import RandomForestClassifier

# Random forest on the reduced features. n_jobs=-1 uses every core, and a
# fixed random_state makes the reported accuracy reproducible across runs.
clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
clf.fit(dfR, y_train.values.ravel())
pred = clf.predict(dfR_test)
print(accuracy_score(y_test, pred))
Tsne on the original dataset
X_train.shape
# Work on an explicit copy: the original aliased X_train ('dft = X_train') and
# the subsequent 'del' statements silently removed the Subject/Activity/
# ActivityNames columns from X_train itself.
dft = X_train.drop(columns=['Subject', 'Activity', 'ActivityNames'])
dft.shape
# import
from sklearn.manifold import TSNE
# Taking [2,5,10,30,50] as perplexities to see which one converges better.
perplexities = [2, 5, 10, 30, 50]
# Loop to compute and display t-SNE for each perplexity.
X_tsne_p = []
for perp in perplexities:
    # Fixed random_state pins the embedding so reruns draw the same picture.
    embedding = TSNE(perplexity=perp, random_state=42).fit_transform(dft)
    d_1 = pd.DataFrame({'x': embedding[:, 0], 'y': embedding[:, 1]})
    # .values sidesteps index alignment: d_1 has a fresh RangeIndex while
    # df kept X_train's post-dropna index.
    d_1['ActivityNames'] = df['ActivityNames'].values
    X_tsne_p.append(d_1)
    # 'height' replaces the 'size' kwarg, which was removed in newer seaborn.
    sns.FacetGrid(d_1, hue='ActivityNames', height=8).map(plt.scatter, 'x', 'y').add_legend()
    print("Tsne for perplexity = ", perp)
    plt.show()
# t-SNE plot for perplexity = 50
sns.FacetGrid(X_tsne_p[4], hue='ActivityNames', height=8).map(plt.scatter, 'x', 'y').add_legend()
print("Tsne for perplexity = 50")
plt.show()
Conclusions