-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update on non-imex protein list generation
- Loading branch information
1 parent
c3f25e5
commit 61faa26
Showing
18 changed files
with
225,530 additions
and
33,942 deletions.
There are no files selected for viewing
Binary file removed
BIN
-128 Bytes
IMEx/IMEx_dsgen_cache/html/unnamed-chunk-1_dc0a1de6ae26234d4c084986f9145e31.RData
Binary file not shown.
Binary file removed
BIN
-60 Bytes
IMEx/IMEx_dsgen_cache/html/unnamed-chunk-1_dc0a1de6ae26234d4c084986f9145e31.rdb
Binary file not shown.
Binary file removed
BIN
-138 Bytes
IMEx/IMEx_dsgen_cache/html/unnamed-chunk-1_dc0a1de6ae26234d4c084986f9145e31.rdx
Binary file not shown.
24,970 changes: 0 additions & 24,970 deletions
24,970
dsp_comparison/imex_non_curated/Swissprot_with_isoforms_missing_in_IntAct.txt
This file was deleted.
Oops, something went wrong.
2,993 changes: 407 additions & 2,586 deletions
2,993
dsp_comparison/imex_non_curated/Swissprot_without_isoforms_missing_in_IntAct.txt
Large diffs are not rendered by default.
Oops, something went wrong.
39,832 changes: 34,055 additions & 5,777 deletions
39,832
dsp_comparison/imex_non_curated/UniprotKB_without_isoforms_missing_in_IntAct.txt
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
--- | ||
title: "Proteome_coverage" | ||
author: "Pablo Porras" | ||
date: "05/03/2020" | ||
output: html_document | ||
--- | ||
|
||
```{r setup, include=FALSE} | ||
# Chunk default for the whole report: show the source of every chunk.
knitr::opts_chunk$set(
  echo = TRUE
)
``` | ||
|
||
### Synopsis | ||
|
||
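Coverage of the human proteome by IntAct: which human UniProtKB entries (SwissProt and TrEMBL) already have interaction data in IntAct, and which are still missing.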
|
||
#### Libraries | ||
```{r libraries,message=FALSE,warning=FALSE} | ||
library(data.table) | ||
library(VennDiagram) | ||
library(ggplot2) | ||
``` | ||
|
||
#### Data upload | ||
```{r upload} | ||
# Directory that caches every downloaded source file.
source_dir <- "./source_data"
# download.file() does not create missing directories, so make sure it exists.
dir.create(source_dir, showWarnings = FALSE, recursive = TRUE)

# Download `url` to `destfile` unless a cached copy is already present.
# A failed transfer is deleted so that the next run does not mistake a
# partial file for a valid cache. Returns `destfile` invisibly.
fetch_if_missing <- function(url, destfile) {
  if (file.exists(destfile)) {
    return(invisible(destfile))
  }
  status <- tryCatch(
    download.file(
      url = url,
      destfile = destfile,
      method = "curl",
      quiet = FALSE
    ),
    error = function(e) {
      unlink(destfile)
      stop(e)
    }
  )
  if (status != 0) {
    unlink(destfile)
    stop("Download failed (status ", status, "): ", url, call. = FALSE)
  }
  invisible(destfile)
}

# Read a tab-separated file with every column as character and drop
# duplicated rows. check.names = TRUE rewrites headers into syntactic R
# names (the later chunks rely on names such as `X.ID.s..interactor.A`).
read_tsv_as_character <- function(path) {
  unique(fread(
    path,
    header = TRUE,
    sep = "\t",
    colClasses = "character",
    check.names = TRUE,
    stringsAsFactors = FALSE
  ))
}

# IntAct
intact_zip <- file.path(source_dir, "intact.zip")
fetch_if_missing(
  "ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.zip",
  intact_zip
)
# Extract next to the archive without changing the working directory.
# -j drops archive-internal paths, -o overwrites leftovers from an aborted
# run instead of prompting (which would hang a non-interactive knit).
unzip_status <- system2(
  "unzip",
  c("-j", "-o", shQuote(intact_zip), "-d", shQuote(source_dir))
)
if (unzip_status != 0) {
  stop("Could not unzip ", intact_zip, " (status ", unzip_status, ")",
       call. = FALSE)
}
intact_extracted <- file.path(
  source_dir,
  c("intact.txt", "intact_negative.txt")
)
# The extracted tables are only needed while reading; remove them even if
# the read fails so the cache directory keeps just the zip.
intact <- tryCatch(
  read_tsv_as_character(intact_extracted[1]),
  finally = unlink(intact_extracted)
)

# UniProt
# Cached UniProt downloads are stamped with year+month (yymm), so a fresh
# copy is fetched once per calendar month.
# NOTE(review): both queries use the legacy www.uniprot.org/uniprot/ query
# endpoint; confirm it is still served before relying on a fresh download.
month_stamp <- format(Sys.time(), format = "%y%m")

## SwissProt
swissprot <- file.path(source_dir, paste0(month_stamp, "_hsapiens_sp.txt"))
fetch_if_missing(
  "https://www.uniprot.org/uniprot/?query=taxonomy%3A%22Homo+sapiens+(Human)+[9606]%22+AND+reviewed%3Ayes&compress=no&include=true&format=tab",
  swissprot
)
sp <- read_tsv_as_character(swissprot)

## TrEMBL
trembl <- file.path(source_dir, paste0(month_stamp, "_hsapiens_tr.txt"))
fetch_if_missing(
  "https://www.uniprot.org/uniprot/?query=taxonomy%3A%22Homo+sapiens+(Human)+[9606]%22+AND+reviewed%3Ano&compress=no&include=true&format=tab",
  trembl
)
tr <- read_tsv_as_character(trembl)
``` | ||
|
||
### Pre-processing IntAct human data | ||
```{r intact_preproc} | ||
# Collect the UniProtKB accessions of the human interactors found in one
# interactor column of the IntAct table.
#
# dt        data.table holding the IntAct MITAB export.
# id_col    name of the column with the interactor identifier.
# taxid_col name of the column with the interactor taxonomy.
#
# Returns a data.table with `upac_isof` (identifier without the
# "uniprotkb:" prefix) and `upac` (same, with any "-<digits>" isoform
# suffix removed).
human_uniprot_accessions <- function(dt, id_col, taxid_col) {
  ids <- dt[[id_col]]
  is_human_uniprot <- grepl("taxid:9606\\(Homo sapiens\\)", dt[[taxid_col]]) &
    grepl("uniprotkb:", ids)
  upac_isof <- gsub("uniprotkb:", "", ids[is_human_uniprot])
  # NOTE(review): only numeric suffixes are stripped, so an identifier such
  # as "<accession>-PRO_<digits>" would keep its suffix and never match a
  # UniProt entry accession downstream - confirm this is intended.
  data.table(
    upac = gsub("-[0-9]+", "", upac_isof),
    upac_isof = upac_isof
  )
}

# A protein counts as present in IntAct whether it appears as interactor A
# or as interactor B, so both sides are pooled and de-duplicated.
intact_human_prots <- unique(rbind(
  human_uniprot_accessions(intact, "X.ID.s..interactor.A", "Taxid.interactor.A"),
  human_uniprot_accessions(intact, "ID.s..interactor.B", "Taxid.interactor.B")
))
# Isoform-agnostic accession list, flagged so that the flag survives the
# outer merges with the UniProt tables.
intact_up_generic <- unique(intact_human_prots[, .(upac, intact = "yes")])
# intact_up_isof <- unique(intact_human_prots[,.(upac_isof,intact="yes")]) # No isoform info downloaded from UniProt
``` | ||
|
||
### Pre-processing UniProt entries | ||
```{r uniprot_preproc} | ||
# Keep only what the comparison needs from each UniProt table: the entry
# accession (renamed to `upac` to match the IntAct side) and its Status.
sp_lite <- unique(sp[, list(upac = Entry, Status = Status)])
tr_lite <- unique(tr[, list(upac = Entry, Status = Status)])

# Full human UniProtKB = SwissProt rows stacked on top of TrEMBL rows.
up_full_lite <- unique(rbind(sp_lite, tr_lite))
``` | ||
|
||
### Comparison datasets | ||
```{r comp} | ||
# Full outer joins on the accession. After the merge, a row with
# `intact` == NA is a UniProt entry absent from IntAct, and a row with
# `Status` == NA is an IntAct accession absent from that UniProt table.

# IntAct vs. SwissProt only
sp_comp <- unique(
  merge(intact_up_generic, sp_lite, by = "upac", all.x = TRUE, all.y = TRUE)
)

# IntAct vs. SwissProt + TrEMBL
up_full_comp <- unique(
  merge(intact_up_generic, up_full_lite, by = "upac", all.x = TRUE, all.y = TRUE)
)
``` | ||
|
||
### Saving SwissProt and full UniProtKB proteins not in IntAct / IMEx
```{r upnonimex} | ||
# UniProt entries for which the merge found no IntAct counterpart.
missing_from_intact <- up_full_comp[is.na(intact)]

# Reviewed entries only: one accession per line, no header.
spnonimex <- unique(missing_from_intact[Status == "reviewed", list(upac)])
fwrite(
  spnonimex,
  "./Swissprot_without_isoforms_missing_in_IntAct.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

# Every missing entry regardless of Status, written in the same format.
upnonimex <- unique(missing_from_intact[, list(upac)])
fwrite(
  upnonimex,
  "./UniprotKB_without_isoforms_missing_in_IntAct.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)
``` | ||
|
||
### Venn diagram overlap | ||
#### SwissProt overlap | ||
```{r venn_sp} | ||
# Region sizes of the SwissProt / IntAct comparison. In `sp_comp`, a
# non-NA `intact` marks an accession seen in IntAct and a non-NA `Status`
# marks an accession seen in SwissProt.
sp_in_intact <- !is.na(sp_comp[["intact"]])
sp_in_swissprot <- !is.na(sp_comp[["Status"]])
sp_n_shared <- sum(sp_in_intact & sp_in_swissprot)
sp_n_swissprot_only <- sum(!sp_in_intact)
sp_n_intact_only <- sum(!sp_in_swissprot)

# area1 / first category = SwissProt, area2 / second category = IntAct.
draw.pairwise.venn(
  area1 = sp_n_swissprot_only + sp_n_shared,
  area2 = sp_n_intact_only + sp_n_shared,
  cross.area = sp_n_shared,
  category = c("SwissProt", "IntAct / IMEx"),
  fill = c("blue", "red"),
  lty = rep("blank", 2),
  alpha = rep(0.5, 2),
  cat.pos = c(330, 30),
  cat.dist = rep(0.025, 2)
)
``` | ||
|
||
#### Full UniProt overlap | ||
```{r venn_fu} | ||
# Region sizes of the full UniProtKB / IntAct comparison. In
# `up_full_comp`, a non-NA `intact` marks an accession seen in IntAct and a
# non-NA `Status` marks an accession seen in UniProtKB.
up_in_intact <- !is.na(up_full_comp[["intact"]])
up_in_uniprot <- !is.na(up_full_comp[["Status"]])
up_n_shared <- sum(up_in_intact & up_in_uniprot)
up_n_uniprot_only <- sum(!up_in_intact)
up_n_intact_only <- sum(!up_in_uniprot)

# area1 / first category = UniProtKB, area2 / second category = IntAct.
draw.pairwise.venn(
  area1 = up_n_uniprot_only + up_n_shared,
  area2 = up_n_intact_only + up_n_shared,
  cross.area = up_n_shared,
  category = c("Full UniProtKB", "IntAct / IMEx"),
  fill = c("blue", "red"),
  lty = rep("blank", 2),
  alpha = rep(0.5, 2),
  cat.pos = c(330, 30),
  cat.dist = rep(0.025, 2)
)
``` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
Version: 1.0 | ||
|
||
RestoreWorkspace: Default | ||
SaveWorkspace: Default | ||
AlwaysSaveHistory: Default | ||
|
||
EnableCodeIndexing: Yes | ||
UseSpacesForTab: Yes | ||
NumSpacesForTab: 2 | ||
Encoding: UTF-8 | ||
|
||
RnwWeave: Sweave | ||
LaTeX: pdfLaTeX |
Large diffs are not rendered by default.
Oops, something went wrong.
Oops, something went wrong.