-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathRmarkdown.Rmd
119 lines (97 loc) · 4.11 KB
/
Rmarkdown.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
---
title: "Hydrophobicity_predictor"
output:
pdf_document: default
html_document:
df_print: paged
word_document: default
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, error = FALSE, tidy.opts=list(width.cutoff=I(60)), tidy=TRUE)
```
```{r}
Hydrophobicity_predictor <- function(length = 20){
require(tidyverse)
#AA_values <- read.csv("AA_values")
# Prompt to select protein sequence file
# Extracts protein name from fasta title and protein sequence from file
filename <- file.choose()
Protein_name <- read_lines(filename)
Protein_name <- Protein_name[1] %>% strsplit(" ")
Protein_name <- Protein_name[[1]][2:3] %>% paste(collapse = " ")
inputseq <- read_lines(filename, skip = 1)
sequence <- unlist(strsplit(inputseq, ""))
# Charge identifier function
charge_df <- function(filename){
inputseq <- read_lines(filename, skip = 1)
sequence <- unlist(strsplit(inputseq, ""))
# For loop to create a vector of charge identifiers based on if the amino acid is positivly charged or not
charge_vector <- vector()
AA_seq <- seq(from = 1, to = length(sequence))
residue_number_vector <- vector()
for (residue in AA_seq) {
if(sequence[residue] == "K" | sequence[residue] == "R" | sequence[residue] == "H"){
charge_vector[residue] <- "positive"
} else {
charge_vector[residue] <- "uncharged"
}
residue_number_vector[residue] <- residue
}
charge_vector
}
charge_vector <- charge_df(filename)
# Function for converting AA into hydrophobicity values
AA_converter <- function(filename){
inputseq <- read_lines(filename, skip = 1)
protseq <- unlist(strsplit(inputseq, ""))
value_vector <- seq(from = 1, to = 20)
hydro_vector <- vector()
seqAA_vector <- seq(from = 1, to = length(protseq))
for (seqAA in seqAA_vector) {
for (AA in value_vector) {
if (protseq[seqAA] == AA_values[AA,1]) {
value <- AA_values[AA,2]
}
}
hydro_vector[seqAA] <- value
}
hydro_vector
}
seq_hydro_values <- AA_converter(filename)
# Rolling average function
rolling_average <- function(sequence, length){
posvector <- seq(from=1, to=length(sequence)-(length-1))
avgvector <- vector()
avg_size <- seq(from = 1, to = (length-1))
for (position in posvector) {
plusone_vector <- vector()
for (plusone in avg_size) {
plusone_vector[plusone] <- sequence[position + plusone]
}
plusone_vector <- append(plusone_vector, sequence[position])
average1 <- mean(plusone_vector)
avgvector[position] <- average1
}
avgvector
}
# Creating a data frame using the rolling average function and combining it with the charge identifier and residue numbers
df <- data_frame("Residue" = seq(from = 1, to = length(seq_hydro_values)-(length-1)),
"Hydrophobicity" = rolling_average(seq_hydro_values, length),
"Charge" = charge_vector[1:(length(charge_vector)-(length-1))])
# Plotting the data
g <- ggplot(df) +
geom_line(mapping = aes(Residue, Hydrophobicity)) +
theme_bw() +
scale_x_continuous(breaks = seq(from = 0, to = length(seq_hydro_values), by = 20)) +
scale_y_continuous(breaks = seq(from = round(min(df$Hydrophobicity), digits = 0), to = round(max(df$Hydrophobicity), digits = 0), by = 0.5)) +
geom_hline(yintercept=0) +
labs(title = paste(Protein_name, "Hydrophobicity Plot with sampling width", length)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
geom_point(mapping = aes(Residue, Hydrophobicity, color = Charge, alpha = Charge)) +
scale_alpha_discrete(limits = rev, guide = "none")
# Make a second df with the addition of the unaveraged hydrovalues for later use
hydro_df <- data_frame("Residue" = seq(from = 1, to = length(seq_hydro_values)), "Hydrophobicity" = seq_hydro_values)
df_union <- union(df[,c(1,2)], hydro_df[(1+nrow(hydro_df)-(nrow(hydro_df)-nrow(df))):nrow(hydro_df),1:2])
return(list(g, df, inputseq, df_union))
}
```