COD_week10_1_MGK_BTE3207.Rmd

---
title: "COD_week10_1_MGK_BTE3207"
author: "Minsik Kim"
date: "2024-11-04"
output:
  rmdformats::downcute:
    downcute_theme: "chaos"
    code_folding: show
    fig_width: 6
    fig_height: 6
    df_print: paged
editor_options:
  chunk_output_type: inline
  markdown:
    wrap: 72
---
        

<!-- <style> -->
<!--   /* Default light mode styles */ -->
<!--   .reactable { -->
<!--     background-color: #ffffff !important; /* Light background */ -->
<!--     color: #000000 !important;            /* Dark text */ -->
<!--     border-color: #cccccc !important;     /* Light border */ -->
<!--   } -->

<!-- </style> -->
        
```{r warning=FALSE, message=FALSE, echo=FALSE, results='hide', setup}
#===============================================================================
#BTC.LineZero.Header.1.1.0
#===============================================================================
#R Markdown environment setup and reporting utility.
#===============================================================================
#RLB.Dependencies:
#   knitr, magrittr, pacman, rio, rmarkdown, rmdformats, tibble, yaml
#===============================================================================
#Input for document parameters, libraries, file paths, and options.
#=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=
# Default chunk options: keep package messages and warnings out of the output.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

# Machine-specific directories, chosen from the platform string of the running
# R build. `R.version$platform` holds the same value that
# `sessionInfo()[1]$R.version$platform` returned, and scalar branching uses
# switch()/if instead of the vectorised ifelse().

# Directory used as the knit root by FUN.LineZero.Boot(); the last (unnamed)
# switch() entry is the fallback for every other platform.
path_working <- switch(
        R.version$platform,
        "x86_64-pc-linux-gnu" = "/mnt/4T_samsung/Dropbox/",
        "aarch64-apple-darwin20" = "/Volumes/macdrive/Dropbox/",
        "/Users/minsikkim/Dropbox (Personal)/"
)

# Package library passed to .libPaths() by FUN.LineZero.Boot().
path_library <- if (R.version$platform == "x86_64-pc-linux-gnu") {
        "/home/bagel/R_lib/"
} else {
        "/Library/Frameworks/R.framework/Resources/library/"
}

# Packages loaded by FUN.LineZero.Boot() and listed by FUN.LineZero.Report().
str_libraries <- c("tidyverse", "pacman", "yaml", "reactable", "scatterplot3d")

# YAML header text echoed at the end of FUN.LineZero.Report().
YAML_header <-
        '---
title: "BTE3207 week 10-1"
author: "Minsik Kim"
date: "2024.11.04"
output:
    rmdformats::downcute:
        downcute_theme: "chaos"
        code_folding: hide
        fig_width: 6
        fig_height: 6
---'

# Seed handed to set.seed() in FUN.LineZero.Boot(); stored as a string.
seed <- "20241104"

#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#Loads libraries, file paths, and other document options.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Prepares the session for this document: sets the library path, loads the
# required packages, points knitr at the working directory and seeds the RNG.
#
# Reads the globals `path_library`, `path_working`, `str_libraries` and `seed`.
# Takes no arguments and is called only for its side effects.
FUN.LineZero.Boot <- function() {
        .libPaths(path_library)

        # library() stops immediately when pacman is not installed; require()
        # would only return FALSE and let the failure surface further down.
        library(pacman)

        # p_load() captures the names given through `...` unevaluated, so a
        # character vector of package names has to be supplied through `char`.
        pacman::p_load(char = c("knitr", "rmarkdown", "rmdformats", "yaml"))

        knitr::opts_knit$set(root.dir = path_working)

        # Load the document's packages once each, in alphabetical order.
        pacman::p_load(char = sort(unique(str_libraries)))

        set.seed(seed)
}
FUN.LineZero.Boot()
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#Outputs R environment report.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Prints a plain-text description of the environment behind this document:
# R version, the version of every package in `str_libraries`, operating
# system, library/working paths, seed, and the YAML header text.
#
# Reads the globals `str_libraries`, `path_library`, `path_working`, `seed`
# and `YAML_header`. Takes no arguments; output goes to the console via cat().
FUN.LineZero.Report <- function() {
        cat("Line Zero Environment:\n\n")
        cat(paste("R:", pacman::p_version(), "\n"))

        cat("Libraries:\n")
        for (lib_name in str_libraries) {
                # One indented "name: version" line per package.
                cat(paste0(
                        "    ", lib_name, ": ", pacman::p_version(package = lib_name),
                        "\n"
                ))
        }

        cat(paste("\nOperating System:", pacman::p_detectOS(), "\n"))
        cat(paste("    Library Path:", path_library, "\n"))
        cat(paste("    Working Path:", path_working, "\n"))
        cat(paste("Seed:", seed, "\n\n"))

        cat("YAML Header:\n")
        cat(YAML_header)
}
FUN.LineZero.Report()

```


# Before we begin

Let's load the SBP dataset.

```{r}

# Read the SBP dataset. The path is relative; the setup chunk sets the knit
# root directory to `path_working`.
dataset_sbp <- read.csv(file = "Inha/5_Lectures/2024/Advanced biostatistics/scripts/BTE3207_Advanced_Biostatistics/dataset/sbp_dataset_korea_2013-2014.csv")

# Random subsample of (at most) 1000 rows. Row positions are drawn from the
# actual number of rows instead of a hard-coded 1:1000000, so the sample size
# is right whatever the size of the file. sort() keeps the rows in their
# original order.
dataset_sbp_small <- dataset_sbp[
        sort(sample.int(nrow(dataset_sbp), size = min(1000L, nrow(dataset_sbp)))), ,
        drop = FALSE
]
dataset_sbp
dataset_sbp_small

# Simple linear regression of DBP on SBP: coefficients, residuals, full summary.
lm_result <- lm(DBP ~ SBP, data = dataset_sbp_small)
lm_result$coefficients
lm_result$residuals
summary(lm_result)

# FBS and BMI regressed on each other, in both directions.
lm(FBS ~ BMI, data = dataset_sbp_small)
summary(lm(BMI ~ FBS, data = dataset_sbp_small))

# Pearson correlation between FBS and BMI.
cor(x = dataset_sbp_small$FBS, y = dataset_sbp_small$BMI)
```


Making a new variable `hypertension`

```{r}

# Flag a record as hypertensive when SBP is above 130 or DBP is above 80.
# The comparison already returns a logical vector, so wrapping it in
# ifelse(..., T, F) added nothing, and T / F are ordinary variables that can be
# reassigned elsewhere.
dataset_sbp$hypertension <- dataset_sbp$SBP > 130 | dataset_sbp$DBP > 80


```

# DBP ~ SBP

With a scatter plot, we can see there is a correlation between SBP and DBP.

```{r}

# Reproducible subsample of (at most) 1000 rows. Positions are drawn from the
# real row count rather than a hard-coded 1:1000000, and sorted so the rows
# keep their original order.
set.seed(1)
dataset_sbp_small <- dataset_sbp[
        sort(sample.int(nrow(dataset_sbp), size = min(1000L, nrow(dataset_sbp)))), ,
        drop = FALSE
]

# Scatter plot of DBP against SBP.
ggplot(data = dataset_sbp_small, aes(x = SBP, y = DBP)) +
        geom_point() +
        theme_classic(base_family = "serif", base_size = 20) +
        xlab("SBP (mmHg)") +
        ylab("DBP (mmHg)")
        

```

# Data with trendline

With the `lm()` function, we can fit the linear model DBP ~ SBP using the least-squares algorithm.

```{r}

#A function that returns equation

# Builds a plotmath label for a fitted linear model.
#
# m: a fitted model for which coef() and summary()$r.squared are available
#    (here, the result of lm()).
# Returns a single character string of the form
# `italic(y) == a + b %.% italic(x) * "," ~ ~italic(r)^2 ~ "=" ~ r2`, meant to
# be drawn with `parse = TRUE`. The first two coefficients are formatted to
# 2 significant digits and R-squared to 3.
lm_eqn <- function(m) {
        model_coef <- unname(coef(m))
        label_parts <- list(
                a = format(model_coef[1], digits = 2),
                b = format(model_coef[2], digits = 2),
                r2 = format(summary(m)$r.squared, digits = 3)
        )
        # Insert the formatted numbers into the plotmath template.
        label_call <- substitute(
                italic(y) == a + b %.% italic(x) * "," ~ ~ italic(r)^2 ~ "=" ~ r2,
                label_parts
        )
        as.character(as.expression(label_call))
}


# Reproducible subsample of (at most) 1000 rows. Positions are drawn from the
# real row count rather than a hard-coded 1:1000000, and sorted so the rows
# keep their original order.
set.seed(1)

dataset_sbp_small <- dataset_sbp[
        sort(sample.int(nrow(dataset_sbp), size = min(1000L, nrow(dataset_sbp)))), ,
        drop = FALSE
]

# Scatter plot with the least-squares line and its equation.
# annotate() draws the equation once; geom_label() with constant x/y would
# draw one identical label for every row of the data.
ggplot(data = dataset_sbp_small, aes(x = SBP, y = DBP)) +
        geom_point() +
        theme_classic(base_family = "serif", base_size = 20) +
        xlab("SBP (mmHg)") +
        ylab("DBP (mmHg)") +
        geom_smooth(method = "lm") +
        annotate("label",
                 x = 130, y = 115,
                 label = lm_eqn(lm(DBP ~ SBP, data = dataset_sbp_small)),
                 parse = TRUE,
                 family = "serif",
                 size = 8, label.size = 0, color = "blue")


#rnorm(100, mean = 100, sd = 1)

```

# Calculation of residual

The difference `observed value - model's predicted value` is called the residual. It can be shown as below.

```{r}

# Fit DBP ~ SBP once; the same fit is reused for the label below.
lm_result <- lm(DBP ~ SBP, data = dataset_sbp_small)

# Plotting copy, so the helper columns are not added to dataset_sbp_small.
d <- dataset_sbp_small

# Fitted values and residuals for every sampled row.
d$predicted <- fitted(lm_result)
d$residuals <- residuals(lm_result)

# Each faded vertical segment joins an observation to its fitted value, i.e.
# it shows that observation's residual.
# annotate() draws the equation once; geom_label() with constant x/y would
# draw one identical label for every row of the data.
ggplot(d, aes(x = SBP, y = DBP)) +
        geom_smooth(method = "lm", se = FALSE, color = "blue") +  # regression line
        geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) +  # residuals
        geom_point() +
        xlab("SBP (mmHg)") +
        ylab("DBP (mmHg)") +
        theme_classic(base_family = "serif", base_size = 20) +
        annotate("label",
                 x = 130, y = 115,
                 label = lm_eqn(lm_result),
                 parse = TRUE,
                 family = "serif",
                 size = 8, label.size = 0, color = "blue")

  
```
# Intercept, slope, and their units

Remember that the size of intercept and slope is *DEPENDENT* on the units of input or outcome.

```{r}

# Blood pressure columns re-expressed in pascal (factor 133.322).
dataset_sbp_small[["SBP_pa"]] <- dataset_sbp_small[["SBP"]] * 133.322
dataset_sbp_small[["DBP_pa"]] <- dataset_sbp_small[["DBP"]] * 133.322

# The same columns re-expressed in psi (factor 0.0193368).
dataset_sbp_small[["SBP_psi"]] <- dataset_sbp_small[["SBP"]] * 0.0193368
dataset_sbp_small[["DBP_psi"]] <- dataset_sbp_small[["DBP"]] * 0.0193368
```

If we change the units of SBP or DBP,

DBP (pascal) ~ SBP (pascal)

```{r}
# Both variables in pascal.
summary(lm(DBP_pa ~ SBP_pa, data = dataset_sbp_small))
```


DBP (mmHg) ~ SBP (pascal)


```{r}
# Outcome in mmHg, predictor in pascal.
summary(lm(DBP ~ SBP_pa, data = dataset_sbp_small))
```


DBP (psi) ~ SBP (psi)

```{r}
# Both variables in psi.
summary(lm(DBP_psi ~ SBP_psi, data = dataset_sbp_small))

```

# Distance from the mean value can be drawn as below.

This is the deviation of each value from the mean of all values, i.e. $X_i - \bar{X}$. Using these deviations we can calculate the standard deviation (SD) around the mean!

```{r}

# Add the overall mean DBP next to the fitted values and residuals of the
# DBP ~ SBP model (`lm_result` and `d` come from the earlier chunks).
d$mean <- mean(dataset_sbp_small$DBP)
d$predicted <- lm_result$fitted.values
d$residuals <- residuals(lm_result)

# Red dashed segments join every observation to the mean DBP (blue line).
# annotate() draws the "Mean DBP" label once; geom_label() with constant x/y
# would draw one identical label for every row of the data.
ggplot(d, aes(x = SBP, y = DBP)) +
        #geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        geom_segment(aes(xend = SBP, yend = mean), alpha = 1, linetype = "dashed", color = "red") +
        geom_point() +
        #xlab("SBP (mmHg)") +
        ylab("DBP (mmHg)") +
        theme_classic(base_family = "serif", base_size = 20) +
        geom_hline(yintercept = mean(dataset_sbp_small$DBP), color = "blue") +
        annotate("label",
                 x = 100, y = 95,
                 label = "Mean DBP",
                 family = "serif",
                 size = 8, label.size = 0, color = "blue")


```
# Distance from a regression line can be drawn as below

With the same approach as above, we can calculate the SD of the observations around the regression line!


```{r}

# Refit DBP ~ SBP. The data frame is passed by name, so the call stored in the
# model reads `data = dataset_sbp_small` rather than `data = .`.
lm_result <- lm(DBP ~ SBP, data = dataset_sbp_small)

# Red dashed segments join every observation to the `predicted` column of `d`
# (filled in by the earlier chunks).
# annotate() draws the label once; geom_label() with constant x/y would draw
# one identical label for every row of the data.
ggplot(d, aes(x = SBP, y = DBP)) +
        geom_smooth(method = "lm", se = FALSE, color = "blue") +  # regression line
        geom_segment(aes(xend = SBP, yend = predicted), alpha = 1, linetype = "dashed", color = "red") +
        geom_point() +
        xlab("SBP (mmHg)") +
        ylab("DBP (mmHg)") +
        theme_classic(base_family = "serif", base_size = 20) +
        annotate("label",
                 x = 100, y = 90,
                 label = "Linear model",
                 family = "serif",
                 size = 8, label.size = 0, color = "blue")

```

# To draw both,

```{r}


# Regression line (blue) and mean DBP (purple) on the same plot; the blue
# dashed segments join every observation to the `predicted` column of `d`.
# annotate() draws each text label once; geom_label() with constant x/y would
# draw one identical label for every row of the data.
ggplot(d, aes(x = SBP, y = DBP)) +
        geom_smooth(method = "lm", se = FALSE, color = "blue") +  # regression line
        geom_segment(aes(xend = SBP, yend = predicted), alpha = 1, linetype = "dashed", color = "blue") +
        #geom_segment(aes(xend = SBP, yend = mean), alpha = 1, linetype = "dashed", color = "purple") +
        geom_point() +
        xlab("SBP (mmHg)") +
        ylab("DBP (mmHg)") +
        theme_classic(base_family = "serif", base_size = 20) +
        annotate("label",
                 x = 100, y = 90,
                 label = "Linear model",
                 family = "serif",
                 size = 8, label.size = 0, color = "blue") +
        geom_hline(yintercept = mean(dataset_sbp_small$DBP), color = "purple") +
        annotate("label",
                 x = 170, y = 70,
                 label = "Mean DBP",
                 family = "serif",
                 size = 8, label.size = 0, color = "purple")
        

```


# Example 2 - BMI vs. SBP

R-squared measures how well the regression line fits the data.

With weaker associations, the R-squared value will be smaller.

```{r}

# Fit BMI ~ SBP once and reuse it for the summary, the fitted values and the
# equation label (it used to be fitted three times).
BMI_model <- lm(BMI ~ SBP, data = dataset_sbp_small)
summary(BMI_model)

# Overwrite the `predicted` column of the plotting copy with the BMI fit.
d$predicted <- BMI_model$fitted.values

# Red dashed segments join every observation to its fitted BMI.
# annotate() draws each label once; geom_label() with constant x/y would draw
# one identical label for every row of the data.
ggplot(d, aes(x = SBP, y = BMI)) +
        geom_smooth(method = "lm", se = FALSE, color = "red") +  # regression line
        geom_segment(aes(xend = SBP, yend = predicted), alpha = 1, linetype = "dashed", color = "red") +
        #geom_segment(aes(xend = SBP, yend = mean), alpha = 1, linetype = "dashed", color = "purple") +
        geom_point() +
        xlab("SBP (mmHg)") +
        ylab("BMI (kg/m2)") +
        theme_classic(base_family = "serif", base_size = 20) +
        annotate("label",
                 x = 170, y = 36,
                 label = "Linear model",
                 family = "serif",
                 size = 8, label.size = 0, color = "red") +
        #geom_hline(yintercept = mean(dataset_sbp_small$DBP), color = "purple") +
        annotate("label",
                 x = 155, y = 16,
                 label = lm_eqn(BMI_model),
                 parse = TRUE,
                 family = "serif",
                 size = 8, label.size = 0, color = "red", fill = "white")
        #ylim(c(11, 30))
        #ylim(c(11, 30))


```


# Example 3 - Binary variables

Linear model, confidence intervals of each term, p-values, and R-squared value can be calculated for binary input as well.

DBP vs Gender

```{r}


# 0/1 indicator: 0 where SEX == 1, 1 otherwise. The axis title below reads
# this as Male = 0, Female = 1.
dataset_sbp_small$Female <- ifelse(dataset_sbp_small$SEX == 1, 0, 1)

# Fit DBP ~ Female once and reuse it for the summary, the fitted values and
# the equation label (it used to be fitted three times).
gender_model <- lm(DBP ~ Female, data = dataset_sbp_small)
summary(gender_model)

dataset_sbp_small$predicted <- gender_model$fitted.values

# Fixes compared with the previous version of this plot:
#   * the y axis shows DBP, so it is labelled "DBP (mmHg)" instead of BMI;
#   * Female is numeric, so ticks at 0 and 1 come from scale_x_continuous()
#     rather than a discrete scale with numeric limits;
#   * annotate() draws each label once; geom_label() with constant x/y would
#     draw one identical label for every row of the data.
# NOTE(review): y = 36 for the "Linear model" label looks carried over from
# the BMI plot - confirm the intended position on the DBP scale.
ggplot(dataset_sbp_small, aes(x = Female, y = DBP)) +
        geom_smooth(method = "lm", se = FALSE, color = "red") +  # regression line
        geom_segment(aes(xend = Female, yend = predicted), alpha = 1, linetype = "dashed", color = "red") +
        #geom_segment(aes(xend = SBP, yend = mean), alpha = 1, linetype = "dashed", color = "purple") +
        geom_point() +
        scale_x_continuous(name = "Gender (Male = 0, Female = 1)",
                           breaks = c(0, 1)) +
        ylab("DBP (mmHg)") +
        theme_classic(base_family = "serif", base_size = 20) +
        annotate("label",
                 x = 0.5, y = 36,
                 label = "Linear model",
                 family = "serif",
                 size = 8, label.size = 0, color = "red") +
        #geom_hline(yintercept = mean(dataset_sbp_small$DBP), color = "purple") +
        annotate("label",
                 x = 0.5, y = 120,
                 label = lm_eqn(gender_model),
                 parse = TRUE,
                 family = "serif",
                 size = 8, label.size = 0, color = "red", fill = "white")
        #ylim(c(11, 30))
        #ylim(c(11, 30))


```

# Pearson's correlation

Pearson's correlation can be calculated using `stats::cor()` function. 


```{r}

# Pearson's correlation coefficient between DBP and SBP.
cor(x = dataset_sbp_small$DBP, y = dataset_sbp_small$SBP)

# Confidence intervals for the coefficients of SBP ~ SEX.
confint(lm(SBP ~ SEX, data = dataset_sbp_small))


```

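Here, the absolute value of Pearson's correlation is just the square root of R-squared of the simple linear regression (the sign of the correlation follows the sign of the slope)!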

```{r}
# Square root of the R-squared of DBP ~ SBP.
sqrt(summary(lm(DBP ~ SBP, data = dataset_sbp_small))$r.squared)

```

The `stats::cor()` function can calculate multiple correlations at the same time!

```{r}

# Correlation matrix of all columns of the full dataset (Pearson, the default).
cor(x = dataset_sbp, method = "pearson")


```

Other types of correlation - Spearman's correlation (rank-based)

```{r}

# Rank-based (Spearman) correlation matrix of all columns of the full dataset.
cor(x = dataset_sbp, method = "spearman")

```

# Bibliography

```{r warning=FALSE, message=FALSE, echo=FALSE}
#===============================================================================
#BTC.LineZero.Footer.1.1.0
#===============================================================================
#R markdown citation generator.
#===============================================================================
#RLB.Dependencies:
#   magrittr, pacman, stringr
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#BTC.Dependencies:
#   LineZero.Header
#===============================================================================
#Generates citations for each explicitly loaded library.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Print one citation line for R itself ("r") and for every package listed in
# `str_libraries`.
#
# The loop uses its own variable. Reusing `str_libraries` as the loop variable
# replaced the global vector with its last element, so re-running this chunk
# (or anything after it) no longer saw the full package list.
for (lib_name in c("r", str_libraries)) {
        lib_name |>
                pacman::p_citation() |>
                print(bibtex = FALSE) |>
                capture.output() %>%
                # Drop the first three captured lines, then any empty lines.
                .[-1:-3] %>% .[. != ""] |>
                stringr::str_squish() |>
                # Remove the first underscore on each remaining line.
                stringr::str_replace("_", "") |>
                cat()
        cat("\n")
}
#===============================================================================
```