forked from ssbio/E_dermatitidis_model
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNCBI_get_ECs.pl
55 lines (44 loc) · 2.15 KB
/
NCBI_get_ECs.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#usr/bin/perl -w
#Created by Wheaton Schroeder to use protein names from table downloaded from NCBI database
#will use protein name as a search string in the Brenda Database. Takes an NCBI downloaded
#table of protein details (for E. dermatitidis the URL is https://www.ncbi.nlm.nih.gov/genome/proteins/2962?genome_assembly_id=34221&gi=-1)
#which has the following transformations:
#1) replace all commas with semicolons (necessary for csv formatting not to be thrown off)
#2)replace all tabs (\t) with commas (easily done in notepad ++)
#3) save the resultant file as a .csv
#this allows for easy of manual reading (as it may be opened in Excel) and ease of programmatic
#reading
#As this program cobers thousands of ORFs and all which are non-hypothetical are searched for in Brenda,
#the runtime of this program is by necessity long
#allows the use of searchBrendaProtein subroutine in the text common_function.pl
use LWP::UserAgent;
require 'common_functions.pl';
$headerLine = " ";
@linesToWrite = ( );
$ProtNameCol = 9; #column number where the protein name is. indexing begins at 0
#opens the file which contains the protein data from NCBI
if ( ! open PROTDATA, "<NCBIProteinDetails.csv") {
die "Could not find/access NCBIProteinDetails.csv, reason: $!"; #fatal error
}
#opens the file to write new protein data
if ( ! open PROTNEW, ">NCBIProtECno.csv") {
die "Could not create NCBIProtECno.csv, reason: $!"; #fatal error
}
#read the lines from the file
chomp(@lines = <PROTDATA>); #chomp removes white spaces at ends of each array element
#read in first line as the header
$headerLine = $lines[0];
#foreach line after the header
for (my $i = 0; $i <= $#lines - 1; $i++) {
#search Brenda for the protein in each line
$ECno = searchBrendaProtein($lines[$i+1], $ProtNameCol);
$linesToWrite[$i] = $lines[$i+1].",".$ECno;
}
#begin writing protein information to NCBIProtECno.csv
select PROTNEW;
printf "%s, Brenda EC#s\n", $headerLine;
for ($j = 0; $j <= $#linesToWrite; $j++) { #for each protein line
printf "%s\n", $linesToWrite[$j]; #write that line to the document
}
#after all lines are written, close the file
close PROTNEW;