cb2edu · kimberlye73 · Dec 15, 2023
diff --git a/data/Homo_sapiens.gene_info b/data/Homo_sapiens.gene_info
diff --git a/problem_1/final_assignment_problem_1.sh b/problem_1/final_assignment_problem_1.sh
@@ -0,0 +1,42 @@
+## Problem 1
+
+# Write a bash script that takes a fasta file as an input and print out the average protein length in the file.
+# Note: You can use E. coli MG1655 proteome file to test your script. The file can be downloaded from here: https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.faa.
+# Note:
+# 1. You must use only bash commands. No other programming language is allowed.
+# 2. You may need the following commands in bash to complete this task. wget, zcat, wc, tr, bc, and grep. You are not restricted to any of these commands. You can use any or all or any other bash
+# commands in your script or command line.
+
+#!/bin/bash
+
+wget https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.faa
+
+file="NC_000913.faa"
+cat $file | head
+
+sequences=$(awk '/^>/ {next} {gsub(/ /,""); print}' "$file")
+
+readarray -t sequences_array <<< "$sequences"
+
+total_length=0
+num_sequences=0
+for sequence in "${sequences_array[@]}"; do
+    sequence_length=${#sequence}
+    total_length=$((total_length + sequence_length))
+    ((num_sequences++))
+done
+
+if [ "$num_sequences" -gt 0 ]; then
+    average_length=$((total_length / num_sequences))
+    echo "Average protein length in $file is: $average_length"
+else
+    echo "No sequences found in the input file."
+fi
+
+
+
+
+
+
+
+
diff --git a/problem_2/final_assignment_problem_2.Rmd b/problem_2/final_assignment_problem_2.Rmd
@@ -0,0 +1,92 @@
+---
+title: "problem_2"
+author: "Kim"
+date: "2023-12-14"
+output: html_document
+editor_options: 
+  chunk_output_type: console
+---
+
+## Problem 2
+
+Swissvar is a database of human gene, their variations, and disease associations. The file can be downloaded from here: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/variants/humsavar.txt. The first column of this file contains the gene name and the rest of the columns contains the other information. Using this file
+
+1. list out the top five genes that are mutated in various human disease.
+2. plot the frequency distribution of disease variants in human genome across all the genes in the file.
+3. calculate the average number disease causing mutations across all genes in human genome and mark
+this number on the previous plot as vertical red line.
+4. The 4th column of this file contains the amino acid affected by the the mutation like this: p.Gly477Arg.
+The p indicates it is protein sequence. Then the 3 letter code of the aa affected then the position in number and then three letter code for the aa that the position changed to. You should write a regular expression to extract the affected aa. Plot a graph showing the fraction of mutations affecting each 20 amino acid on the x-axis. Which amino acid has the highest probablity of getting mutated?
+Hint: Remember to skip the information lines in the file and also note that type of variant column contains both disease causing and non-disease causing variants.
+Note: Try to parse this file yourself. If you cannot do it, run the script “create_data_file.R” to create the data file humsavar.tsv.gz in data directory. A ready made file for use is also present in the data directory. Read this file using the standard R way.
+
+```{r}
+
+# Packages
+library(ggplot2) ; library(dplyr)
+
+setwd("/n/projects/ke2488/class/2023/fall/CB2-101_bioinformatics/final_assignment/CB2-101-2023-assignment/problem_2/")
+
+raw.data <- read.table("/n/projects/ke2488/class/2023/fall/CB2-101_bioinformatics/final_assignment/CB2-101-2023-assignment/problem_2/humsavar_noheader.txt", fill = T, stringsAsFactors = F, sep="", flush = T)
+
+colnames(raw.data) <- c("Main_gene_name","SwissProt_AC","FTId", "AA_change", "Variant_category", "dbSNP", "Disease_name")
+
+
+variantsPerGene <- raw.data %>% count(Main_gene_name)
+variantsPerGene <- variantsPerGene[order(variantsPerGene$n, decreasing = TRUE),]
+
+# Question 1
+top5 <- variantsPerGene %>% head(5)
+top5
+
+# Question 2
+ggplot(data = variantsPerGene) +
+  geom_bar(mapping = aes(x = Main_gene_name, y = n), stat = "identity")
+
+# Question 3
+variantsPerGene_withDisease <- raw.data[raw.data$Disease_name != "-",] %>% count(Main_gene_name)
+
+sum(variantsPerGene_withDisease$n) / length(variantsPerGene_withDisease$Main_gene_name)
+
+plot <- ggplot(data = variantsPerGene) +
+  geom_bar(mapping = aes(x = Main_gene_name, y = n), stat = "identity") + 
+  geom_hline(aes(yintercept=9.063027), color = "red")
+
+plot + scale_y_continuous(limits = c(0, 200))
+
+# Question 4
+mutations <- raw.data$AA_change
+
+regex <- "p\\.([A-Z][a-z]{2}\\d+)[A-Z][a-z]{2}"
+
+affected_aa <- character(length(mutations))
+
+for (i in seq_along(mutations)) {
+  match <- regmatches(mutations[i], gregexpr(regex, mutations[i], perl = TRUE))
+  if (length(match[[1]]) > 0) {
+    affected_aa[i] <- substr(match[[1]], 3, 5)
+  } else {
+    affected_aa[i] <- NA
+  }
+}
+
+# print(affected_aa)
+
+
+mut_freq <- table(affected_aa) / length(affected_aa) 
+
+mut_freq <- mut_freq[order(names(mut_freq))]
+
+mut_freq <- mut_freq %>% as.data.frame()
+
+ggplot(mut_freq, aes(x = affected_aa, y = Freq)) +
+  geom_bar(stat = "identity", fill = "black") +
+  labs(title = "AA mutations", 
+       x = "AA", y = "Fraction of Mutations") +
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
+
+# AA Arginine seems to be the one which is most affected 
+
+```
+
+
diff --git a/problem_2/final_assignment_problem_2.html b/problem_2/final_assignment_problem_2.html