-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathREADME.Rmd
152 lines (122 loc) · 6.3 KB
/
README.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
---
title: Linear Regression for TriPASS Data Sci 20180925 Meetup
author: "Rick Pack"
date: "September 24, 2018"
output: github_document
---
```{r setup, include=FALSE}
library(rvest)
library(readr)
library(dplyr)
library(tidyr)
library(lubridate)
library(stringr)
library(ggplot2)
fig.path = "Graphs"
```
## Topic
Co-presentation with Kevin Feasel (linear regression).
### Simple Linear Regression with mtcars
```{r mtcars, warning = FALSE, error = FALSE, message = FALSE, tidy = FALSE}
mtcars_lm <- lm(mtcars$mpg ~ mtcars$wt)
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point(shape = 1) +
geom_smooth(method = lm, se = FALSE) +
labs(title = paste("Adj R2 = ", signif(summary(mtcars_lm)$adj.r.squared, 2),
"Intercept = ", signif(mtcars_lm$coefficients[[1]], 2),
" Slope = ", signif(mtcars_lm$coefficients[[2]], 2),
" P = ", signif(summary(mtcars_lm)$coefficients[2,4], 2)))
par(mfrow = c(2,2))
plot(mtcars_lm)
```
### R code for regression of Carolina Godiva Summer Track Series (2018) data.
```{r trackres, warning = FALSE, error = FALSE, message = FALSE, tidy = FALSE}
# Produced from 2018 results websites
# see SumTrack2018.Rmd in the R subfolder
track_res <- read.csv('godiva_summer_track_res_2018.csv')
track_res$Date_Meet <- ymd(track_res$Date_Meet)
track_res <- track_res %>% group_by(Name) %>%
arrange(Name, Date_Meet, ct_evt) %>%
mutate(EventNum = row_number()) %>%
group_by(Name, Date_Meet) %>%
mutate(EventDay = row_number()) %>%
group_by(Name, dist_m, Date_Meet) %>%
mutate(DistNum = row_number()) %>%
ungroup()
all_dates <- sort(unique(track_res$Date_Meet))
# downloaded from 'https://www.carolinagodiva.org/index.php?page=track-season-weather-conditions')
# see SumTrack2018.R in the R subfolder
temps_2018 <- read_csv("temps_2018.csv")
track_res <- left_join(track_res, temps_2018, by = 'Date_Meet')
track_res %>% dplyr::filter(Name=='Rick Pack' & Date_Meet==ymd('2018-08-01'))
track_res_base <- track_res %>%
mutate(Sex_num = case_when(
Sex == "M" ~ 0,
Sex == "F" ~ 1)) %>%
select(Sex_num, Age, dist_m, track_time, DistNum, EventNum, EventDay, Temp)
track_lm <- lm(track_time ~ . , track_res_base)
summary(track_lm)
# Intercept absolute value is large and P-value (testing != 0) supports
# stat significance
# Model appears to be flawed
# What rows have missing data given lm reported
# 246 'observations deleted due to missingness'?
which(! complete.cases(track_res_base))
track_res_base[c(3, 20, 27), ]
# Keep only complete cases
track_res_base_complete <- track_res_base[complete.cases(track_res_base),]
```
## Flawed model, let's look at plots (the usual first step)
```{r plot1}
plot(track_res_base_complete)
```
### No variable looks linear to the response variable track_time (column 4) but the combination of explanatory parameters (coefficients * explanatory variables) could be.
```{r plot2}
track_res_base_complete$Predicted <- fitted(track_lm)
ggplot(track_res_base_complete, aes(x = Predicted, y = track_time)) +
geom_point(shape = 1) +
geom_smooth(method = lm, se = FALSE) +
labs(title = paste("Adj R2 = ", signif(summary(track_lm)$adj.r.squared, 2),
"Intercept = ", signif(track_lm$coefficients[[1]], 2),
" P = ", signif(summary(track_lm)$coefficients[2,4], 2)),
subtitle = "CGTC Summer Track Series (2018)")
```
### Let's run the model only for the 100 meter run
```{r 100m_lm2, warning = FALSE, error = FALSE, message = FALSE, echo = FALSE, tidy = FALSE}
track_res_base_complete_100 <- track_res_base[complete.cases(track_res_base),] %>%
dplyr::filter(dist_m == 100) %>%
# no variation for these
select(-DistNum, -dist_m)
track100_lm <- lm(track_time ~ . , track_res_base_complete_100)
track_res_base_complete_100$Predicted <- fitted(track100_lm)
head(track_res_base_complete_100 %>% select(Age, track_time, Predicted))
summary(track100_lm)
ggplot(track_res_base_complete_100, aes(x = Predicted, y = track_time)) +
geom_point(shape = 1) +
geom_smooth(method = lm, se = FALSE) +
labs(title = paste("Adj R2 = ", signif(summary(track100_lm)$adj.r.squared, 2),
"Intercept = ", signif(track100_lm$coefficients[[1]], 2),
" P = ", signif(summary(track100_lm)$coefficients[2,4], 2)),
subtitle = "CGTC 100 meters (2018)")
```
### Higher times are a problem. Exploding the error and killing r-squared. Eliminate times above 28 seconds for the 100 (nine track_times)
```{r 100m_lm3, warning = FALSE, error = FALSE, message = FALSE, echo = FALSE, tidy = FALSE}
track_res_base_complete_100_fast <- track_res_base_complete_100 %>%
dplyr::filter(track_time <= 28)
track100_fast_lm <- lm(track_time ~ . , track_res_base_complete_100_fast)
track_res_base_complete_100_fast$Predicted <- fitted(track100_fast_lm)
head(track_res_base_complete_100_fast %>% select(Age, track_time, Predicted))
summary(track100_fast_lm)
ggplot(track_res_base_complete_100_fast, aes(x = Predicted, y = track_time)) +
geom_point(shape = 1) +
geom_smooth(method = lm, se = FALSE) +
labs(title = paste("Adj R2 = ", signif(summary(track100_fast_lm)$adj.r.squared, 2),
"Intercept = ", signif(track100_fast_lm$coefficients[[1]], 2),
" P = ", signif(summary(track100_fast_lm)$coefficients[2,4], 2)),
subtitle = "CGTC 100 meters (2018) - times under 28 seconds")
```
###### We might pursue more adjustments, dropping insignificant predictors, and possibly a different kind of model than a linear one. Of course, this also illustrates the importance of checking the statistical assumptions first, as Dr. Frank Harrell cautions - and this extends beyond machine learning.

```{r tweetURL, warning = FALSE, error = FALSE, message = FALSE, echo = FALSE, tidy = FALSE}
print("https://twitter.com/f2harrell/status/1043871065498357760?s=12")
```