-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
5 changed files
with
469 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
""" | ||
Author: Marko Arezina | ||
Email: [email protected] | ||
Date created: 10/23/2018 | ||
""" | ||
|
||
import pandas as pd | ||
import datetime | ||
|
||
import os | ||
|
||
|
||
def combine_csv(base_dir): | ||
""" | ||
Combine all csv in base dir | ||
:param base_dir: | ||
:return: | ||
""" | ||
|
||
csvs = next(os.walk(base_dir))[2] | ||
|
||
combined = None | ||
for csv in csvs: | ||
if combined is None: | ||
combined = pd.read_csv(base_dir + "/" + csv, low_memory=False) | ||
combined = combined.query('Activity.notnull() and not (Activity == "end")') | ||
else: | ||
new = pd.read_csv(base_dir + "/" + csv, low_memory=False) | ||
new = new.query('Activity.notnull() and not (Activity == "end")') | ||
combined = combined.append(new, ignore_index=True) | ||
|
||
combined = combined.drop("Unnamed: 0", axis=1) | ||
combined.to_csv('combined_biometric.csv', sep=',', encoding='utf-8') | ||
|
||
|
||
def create_series(data): | ||
""" | ||
Takes the pandas dataframe and sets an activity_session id to each row | ||
Activity session a successive series of activities | ||
:param data: | ||
:return: | ||
""" | ||
|
||
cleaned = data.copy() | ||
cleaned["session"] = None | ||
|
||
# The permitted amount differnce time for rows to be same activity | ||
same_session_threshold = datetime.timedelta(minutes=15) | ||
|
||
activity_sessions = 0 | ||
|
||
# Query for each person | ||
subjects = pd.unique(data["subject"]) | ||
days = pd.unique(data["day"]) | ||
|
||
for subject in subjects: | ||
for day in days: | ||
subject_data = data.query('subject == \'{0}\' and day == {1}'.format(subject, day)) | ||
|
||
subject_data['time'] = pd.to_datetime(subject_data.time, format="%H:%M:%S") | ||
subject_data.sort_values(by='time') | ||
|
||
subject_data = subject_data.reset_index() | ||
|
||
try: | ||
previous_time = subject_data.at[0, 'time'] | ||
for row in range(1, subject_data.shape[0] - 1): | ||
|
||
if (subject_data.at[row, 'time'] - previous_time) > same_session_threshold: | ||
activity_sessions += 1 | ||
|
||
cleaned.at[row, "session"] = activity_sessions | ||
except KeyError: | ||
print("Key error day") | ||
|
||
cleaned.to_csv('series_bio.csv', sep=',', encoding='utf-8') | ||
|
||
|
||
combine_csv("/Users/marezina/PycharmProjects/ActivityBioSpy2018/Cleaned") | ||
|
||
data = pd.read_csv("/Users/marezina/PycharmProjects/ActivityBioSpy2018/combined_biometric.csv") | ||
create_series(data) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,215 @@ | ||
""" | ||
Author: Marko Arezina | ||
Email: [email protected] | ||
Date created: 10/24/2018 | ||
""" | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import pickle | ||
|
||
from sklearn.model_selection import train_test_split | ||
from sklearn.metrics import explained_variance_score | ||
from sklearn.metrics import accuracy_score | ||
|
||
from xgboost import XGBClassifier | ||
|
||
# If not labels match | ||
default_label = 0 | ||
|
||
train_model = False | ||
|
||
# Which subject to exlude from train, and test on | ||
exclude_subject = "D" | ||
|
||
save_model = "xgmodel.pickle.dat" | ||
save_prediction = "prediction_d.csv" | ||
|
||
activities = [ | ||
{ | ||
"substring": ["walk"], | ||
"numerical": 1 | ||
}, | ||
{ | ||
"substring": ["jog", "run"], | ||
"numerical": 2 | ||
}, | ||
{ | ||
"substring": ["bike", "cycle"], | ||
"numerical": 3 | ||
}, | ||
{ | ||
"substring": ["computer", "work"], | ||
"numerical": 4 | ||
}, | ||
{ | ||
"substring": ["movie", "tele", "watch"], | ||
"numerical": 5 | ||
}, | ||
{ | ||
"substring": ["sleep"], | ||
"numerical": 6 | ||
}, | ||
{ | ||
"substring": ["eat", "breakfast", "lunch", "dinner"], | ||
"numerical": 7 | ||
} | ||
|
||
] | ||
|
||
|
||
def check_activities(activity): | ||
""" | ||
Checks if substring in activity | ||
:param activity: | ||
:return: | ||
""" | ||
|
||
for substrings in activities: | ||
for sub in substrings["substring"]: | ||
if sub in activity.lower(): | ||
return True, substrings["numerical"] | ||
|
||
return False, default_label | ||
|
||
|
||
def label_data(data): | ||
""" | ||
Used to label the activites, by substring matching | ||
:param data: | ||
:return: | ||
""" | ||
|
||
for i in data.index: | ||
activity = data.at[i, 'Activity'] | ||
|
||
# Using the general activity name | ||
is_valid, activity = check_activities(activity) | ||
data.at[i, 'Activity'] = activity | ||
|
||
return data | ||
|
||
|
||
def drop_data(data_frame): | ||
""" | ||
Drops not needed column | ||
:param data_frame: | ||
:return: | ||
""" | ||
|
||
data = data_frame.drop("time", axis=1) | ||
data = data.drop("subject", axis=1) | ||
data = data.drop("day", axis=1) | ||
data = data.drop("Unnamed: 0", axis=1) | ||
|
||
return data | ||
|
||
|
||
def prepare_arrays(data_frame, test_size=0.3): | ||
""" | ||
Creates nd arrays of data with numerical labels | ||
:param data_frame: | ||
:return: | ||
""" | ||
|
||
x = data_frame[["heart_rate", "tidal_volume_adjusted", "cadence", "step", "activity", "temperature_celcius", | ||
"systolic_pressure_adjusted", "minute_ventilation_adjusted"]] | ||
|
||
y = pd.to_numeric(data_frame['Activity'], errors='coerce').astype(float) | ||
|
||
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=test_size, random_state=42, shuffle=True) | ||
|
||
print("Train shape: {0}".format(X_train.shape[0])) | ||
|
||
X_train = X_train.values | ||
X_test = X_test.values | ||
Y_train = Y_train.values | ||
Y_test = Y_test.values | ||
|
||
return X_train, X_test, Y_train, Y_test | ||
|
||
|
||
def predict_activity(my_model, X_test, Y_test): | ||
""" | ||
Use model to make prediction and save | ||
:param my_model: | ||
:param X_test: | ||
:param Y_test: | ||
:return: | ||
""" | ||
|
||
print("Testing") | ||
predictions = my_model.predict(X_test) | ||
|
||
print("Accuracy Score: {0}".format(accuracy_score(predictions, Y_test))) | ||
print("Variance: {0}".format(explained_variance_score(predictions, Y_test))) | ||
|
||
pred = ["Prediction", "Truth", "Heart Rate"] | ||
cleaned = pd.DataFrame(columns=pred) | ||
cleaned["Prediction"] = list(predictions) | ||
cleaned["Truth"] = list(Y_test) | ||
cleaned["Heart Rate"] = [X[0] for X in X_test] | ||
|
||
cleaned.to_csv(save_prediction) | ||
|
||
|
||
data = pd.read_csv( | ||
"/Users/mnarezina/PycharmProjects/tensorflow_samples/learning_projects/PhysicalActivity/combined_biometric.csv") | ||
data = data.query("Activity.notnull()") | ||
|
||
# Data with all subjects | ||
all_subject_data = label_data(data) | ||
|
||
# Dont include default category | ||
all_subject_data = all_subject_data.query("Activity != 0") | ||
|
||
# Exlude a subject | ||
excluded_subject = all_subject_data.query("subject != \"D\"") | ||
# exlude_subject = all_subject_data.copy() | ||
|
||
X_train, X_test, Y_train, Y_test = prepare_arrays(excluded_subject) | ||
|
||
if train_model: | ||
eval_set = [(X_test, Y_test)] | ||
my_model = XGBClassifier(n_estimators=500, learning_rate=0.05, verbose=True) | ||
|
||
print("Training") | ||
# X_train = list(X_train) | ||
# Y_train = list(Y_train) | ||
# two = zip(X_train, Y_train) | ||
# for t in two: | ||
# print(t) | ||
|
||
my_model.fit(X_train, Y_train, verbose=True, eval_set=eval_set) | ||
results = my_model.evals_result() | ||
print(results) | ||
|
||
pickle.dump(my_model, open(save_model, "wb+")) | ||
else: | ||
my_model = pickle.load(open(save_model, "rb")) | ||
|
||
predict_activity(my_model, X_test, Y_test) | ||
|
||
""" | ||
Testing for particular subject | ||
""" | ||
print("Testing select: ") | ||
qeury = "subject == \"{0}\"".format(exclude_subject) | ||
data = all_subject_data.query(qeury) | ||
print("Subject test {0}".format(data.shape[0])) | ||
|
||
data = drop_data(data) | ||
|
||
X_train, X_test, Y_train, Y_test = prepare_arrays(data, test_size=0.3) | ||
predictions = my_model.predict(X_test) | ||
|
||
print("Accuracy Score: {0}".format(accuracy_score(predictions, Y_test))) | ||
print("Variance: {0}".format(explained_variance_score(predictions, Y_test))) | ||
|
||
pred = ["Prediction", "Truth", "Heart Rate"] | ||
cleaned = pd.DataFrame(columns=pred) | ||
cleaned["Prediction"] = list(predictions) | ||
cleaned["Truth"] = list(Y_test) | ||
cleaned["Heart Rate"] = [X[0] for X in X_test] | ||
|
||
cleaned.to_csv('single_' + save_prediction) |
Oops, something went wrong.