vignettes/SECOM.Rmd

---
title: "SECOM Tutorial"
author: 
  - Huang Lin$^1$
  - $^1$NIEHS, Research Triangle Park, NC 27709, USA
date: '`r format(Sys.Date(), "%B %d, %Y")`'
output: rmarkdown::html_vignette
bibliography: bibliography.bib
vignette: >
  %\VignetteIndexEntry{SECOM Tutorial}
  %\VignetteEngine{knitr::rmarkdown}
  \usepackage[utf8]{inputenc}
---

```{r setup, message = FALSE, warning = FALSE, comment = NA}
knitr::opts_chunk$set(message = FALSE, warning = FALSE, comment = NA, 
                      fig.width = 6.25, fig.height = 5)
library(ANCOMBC)
library(tidyverse)
```

```{r helper}
get_upper_tri = function(cormat){
    cormat[lower.tri(cormat)] = NA
    diag(cormat) = NA
    return(cormat)
}
```

# 1. Introduction

Sparse Estimation of Correlations among Microbiomes (SECOM) [@lin2022linear]
is a methodology that aims to detect both linear and nonlinear relationships 
between a pair of taxa within an ecosystem (e.g., gut) or across ecosystems 
(e.g., gut and tongue). SECOM corrects both sample-specific and taxon-specific 
biases and obtains a consistent estimator for the correlation matrix of 
microbial absolute abundances while maintaining the underlying true sparsity.
For more details, please refer to the 
[SECOM](https://doi.org/10.1038/s41467-022-32243-x) paper.

# 2. Installation

Download package. 

```{r getPackage, eval=FALSE}
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("ANCOMBC")
```

Load the package. 

```{r load, eval=FALSE}
library(ANCOMBC)
```

# 3. Example Data 

The HITChip Atlas dataset contains genus-level microbiota profiling with 
HITChip for 1006 western adults with no reported health complications, 
reported in [@lahti2014tipping]. The dataset is available via the 
microbiome R package [@lahti2017tools] in phyloseq [@mcmurdie2013phyloseq] 
format.

```{r}
data(atlas1006, package = "microbiome")
tse = mia::makeTreeSummarizedExperimentFromPhyloseq(atlas1006)

# subset to baseline
tse = tse[, tse$time == 0]

# Re-code the bmi group
tse$bmi = recode(tse$bmi_group,
                 obese = "obese",
                 severeobese = "obese",
                 morbidobese = "obese")
# Subset to lean, overweight, and obese subjects
tse = tse[, tse$bmi %in% c("lean", "overweight", "obese")]

# Create the region variable
tse$region = recode(as.character(tse$nationality),
                    Scandinavia = "NE", UKIE = "NE", SouthEurope = "SE", 
                    CentralEurope = "CE", EasternEurope = "EE",
                    .missing = "unknown")

# Discard "EE" as it contains only 1 subject
# Discard subjects with missing values of region
tse = tse[, ! tse$region %in% c("EE", "unknown")]

print(tse)
```

# 4. Run SECOM on a Single Ecosystem

## 4.1 Run secom functions

```{r}
set.seed(123)
# Linear relationships
res_linear = secom_linear(data = list(tse), assay_name = "counts",
                          tax_level = "Phylum", pseudo = 0, 
                          prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5, 
                          wins_quant = c(0.05, 0.95), method = "pearson", 
                          soft = FALSE, thresh_len = 20, n_cv = 10, 
                          thresh_hard = 0.3, max_p = 0.005, n_cl = 2)

# Nonlinear relationships
res_dist = secom_dist(data = list(tse), assay_name = "counts",
                      tax_level = "Phylum", pseudo = 0, 
                      prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5, 
                      wins_quant = c(0.05, 0.95), R = 1000, 
                      thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
```

## 4.2 Visualizations {.tabset}

### Pearson correlation with thresholding

```{r}
corr_linear = res_linear$corr_th
cooccur_linear = res_linear$mat_cooccur

# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0

df_linear = data.frame(get_upper_tri(corr_linear)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))

tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)

heat_linear_th = df_linear %>%
  ggplot(aes(var2, var1, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE) +
  scale_y_discrete(drop = FALSE) +
  geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "Pearson (Thresholding)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, 
                                   face = "italic"),
        axis.text.y = element_text(size = 12, face = "italic"),
        strip.text.x = element_text(size = 14),
        strip.text.y = element_text(size = 14),
        legend.text = element_text(size = 12),
        plot.title = element_text(hjust = 0.5, size = 15),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none") +
  coord_fixed()

heat_linear_th
```

### Pearson correlation with p-value filtering

```{r}
corr_linear = res_linear$corr_fl
cooccur_linear = res_linear$mat_cooccur

# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0

df_linear = data.frame(get_upper_tri(corr_linear)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))

tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)

heat_linear_fl = df_linear %>%
  ggplot(aes(var2, var1, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE) +
  scale_y_discrete(drop = FALSE) +
  geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "Pearson (Filtering)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, 
                                   face = "italic"),
        axis.text.y = element_text(size = 12, face = "italic"),
        strip.text.x = element_text(size = 14),
        strip.text.y = element_text(size = 14),
        legend.text = element_text(size = 12),
        plot.title = element_text(hjust = 0.5, size = 15),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none") +
  coord_fixed()

heat_linear_fl
```

### Distance correlation with p-value filtering

```{r}
corr_dist = res_dist$dcorr_fl
cooccur_dist = res_dist$mat_cooccur

# Filter by co-occurrence
overlap = 10
corr_dist[cooccur_dist < overlap] = 0

df_dist = data.frame(get_upper_tri(corr_dist)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))

tax_name = sort(union(df_dist$var1, df_dist$var2))
df_dist$var1 = factor(df_dist$var1, levels = tax_name)
df_dist$var2 = factor(df_dist$var2, levels = tax_name)

heat_dist_fl = df_dist %>%
  ggplot(aes(var2, var1, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE) +
  scale_y_discrete(drop = FALSE) +
  geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "Distance (Filtering)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, 
                                   face = "italic"),
        axis.text.y = element_text(size = 12, face = "italic"),
        strip.text.x = element_text(size = 14),
        strip.text.y = element_text(size = 14),
        legend.text = element_text(size = 12),
        plot.title = element_text(hjust = 0.5, size = 15),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none") +
  coord_fixed()

heat_dist_fl
```

# 5. Run SECOM on Multiple Ecosystems

## 5.1 Data manipulation

To compute correlations whithin and across different ecosystems, one needs to
make sure that there are samples in common across these ecosystems.

```{r}
# Select subjects from "CE" and "NE"
tse1 = tse[, tse$region == "CE"]
tse2 = tse[, tse$region == "NE"]

# Rename samples to ensure there is an overlap of samples between CE and NE
colnames(tse1) = paste0("Sample-", seq_len(ncol(tse1)))
colnames(tse2) = paste0("Sample-", seq_len(ncol(tse2)))

print(tse1)
print(tse2)
```

## 5.2 Run secom functions

```{r}
set.seed(123)
# Linear relationships
res_linear = secom_linear(data = list(CE = tse1, NE = tse2), 
                          assay_name = c("counts", "counts"),
                          tax_level = c("Phylum", "Phylum"), pseudo = 0, 
                          prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5, 
                          wins_quant = c(0.05, 0.95), method = "pearson", 
                          soft = FALSE, thresh_len = 20, n_cv = 10, 
                          thresh_hard = 0.3, max_p = 0.005, n_cl = 2)

# Nonlinear relationships
res_dist = secom_dist(data = list(CE = tse1, NE = tse2),
                      assay_name = c("counts", "counts"),
                      tax_level = c("Phylum", "Phylum"), pseudo = 0, 
                      prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5, 
                      wins_quant = c(0.05, 0.95), R = 1000, 
                      thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
```

## 5.3 Visualizations {.tabset}

### Pearson correlation with thresholding

```{r, fig.width=8, fig.height=8}
corr_linear = res_linear$corr_th
cooccur_linear = res_linear$mat_cooccur

# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0

df_linear = data.frame(get_upper_tri(corr_linear)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(var2 = gsub("\\...", " - ", var2),
         value = round(value, 2))

tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02")

heat_linear_th = df_linear %>%
  ggplot(aes(var2, var1, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       na.value = "grey", midpoint = 0, limit = c(-1,1), 
                       space = "Lab", name = NULL) +
  scale_x_discrete(drop = FALSE) +
  scale_y_discrete(drop = FALSE) +
  geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "Pearson (Thresholding)") +
  theme_bw() +
  geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") +
  geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, 
                                   face = "italic", color = txt_color),
        axis.text.y = element_text(size = 12, face = "italic", 
                                   color = txt_color),
        strip.text.x = element_text(size = 14),
        strip.text.y = element_text(size = 14),
        legend.text = element_text(size = 12),
        plot.title = element_text(hjust = 0.5, size = 15),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none") +
  coord_fixed()

heat_linear_th
```

### Pearson correlation with p-value filtering

```{r, fig.width=8, fig.height=8}
corr_linear = res_linear$corr_th
cooccur_linear = res_linear$mat_cooccur

# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0

df_linear = data.frame(get_upper_tri(corr_linear)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(var2 = gsub("\\...", " - ", var2),
         value = round(value, 2))

tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02")

heat_linear_fl = df_linear %>%
  ggplot(aes(var2, var1, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       na.value = "grey", midpoint = 0, limit = c(-1,1), 
                       space = "Lab", name = NULL) +
  scale_x_discrete(drop = FALSE) +
  scale_y_discrete(drop = FALSE) +
  geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "Pearson (Filtering)") +
  theme_bw() +
  geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") +
  geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, 
                                   face = "italic", color = txt_color),
        axis.text.y = element_text(size = 12, face = "italic", 
                                   color = txt_color),
        strip.text.x = element_text(size = 14),
        strip.text.y = element_text(size = 14),
        legend.text = element_text(size = 12),
        plot.title = element_text(hjust = 0.5, size = 15),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none") +
  coord_fixed()

heat_linear_fl
```

### Distance correlation with p-value filtering

```{r, fig.width=8, fig.height=8}
corr_dist = res_dist$dcorr_fl
cooccur_dist = res_dist$mat_cooccur

# Filter by co-occurrence
overlap = 10
corr_dist[cooccur_dist < overlap] = 0

df_dist = data.frame(get_upper_tri(corr_dist)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(var2 = gsub("\\...", " - ", var2),
         value = round(value, 2))

tax_name = sort(union(df_dist$var1, df_dist$var2))
df_dist$var1 = factor(df_dist$var1, levels = tax_name)
df_dist$var2 = factor(df_dist$var2, levels = tax_name)
txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02")

heat_dist_fl = df_dist %>%
  ggplot(aes(var2, var1, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       na.value = "grey", midpoint = 0, limit = c(-1,1), 
                       space = "Lab", name = NULL) +
  scale_x_discrete(drop = FALSE) +
  scale_y_discrete(drop = FALSE) +
  geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "Distance (Filtering)") +
  theme_bw() +
  geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") +
  geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1, 
                                   face = "italic", color = txt_color),
        axis.text.y = element_text(size = 12, face = "italic", 
                                   color = txt_color),
        strip.text.x = element_text(size = 14),
        strip.text.y = element_text(size = 14),
        legend.text = element_text(size = 12),
        plot.title = element_text(hjust = 0.5, size = 15),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none") +
  coord_fixed()

heat_dist_fl
```

# Session information

```{r sessionInfo, message = FALSE, warning = FALSE, comment = NA}
sessionInfo()
```

# References