Tue_22_apr_15_17:30

frankMusacchia · Apr 22, 2015 · d90c108 · d90c108
commit d90c108
Show file tree

Hide file tree

Showing 188 changed files with 253,102 additions and 0 deletions.
diff --git a/CONFIGURATION/config_annocript.txt b/CONFIGURATION/config_annocript.txt
@@ -0,0 +1,169 @@
+# File configuration for Annocript 0.2
+# It contains parameters that are used very often by Annocript but very rarely by the user. 
+# The parameters are grouped into RARELY CHANGED, which an expert user may decide to change,
+# and VERY RARELY CHANGED, which a user will very rarely want to change.
+# READ CAREFULLY!!!
+# This file has been written with a specific syntax.
+# The variables MUST stay in the form: variable = value
+# A series of hashes (#########) closes the parameters to read
+# When you want to execute something you have to write YES (in upper case), or NO otherwise;
+# any other string will give an error.
+# THIS FILE SHOULD BE CHANGED ONLY IF YOU KNOW PERFECTLY WHAT YOU ARE DOING!!!
+##############################
+
+
+#RARELY CHANGED PARAMETERS
+#Links to Files
+#UniProt databases are also available on ExPASy and EBI. You can use those domains instead if you prefer.
+swissprotDBLink = ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
+tremblDBLink = ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz
+uniprotVerLink = ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt
+GODBLink = ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping_selected.tab.gz
+GOTermsLink = http://www.geneontology.org/doc/GO.terms_alt_ids
+enzymeDBLink = ftp://ftp.expasy.org/databases/enzyme/enzyme.dat
+cdDBLink = ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/Cdd_LE.tar.gz
+rfamDBLink = ftp://ftp.sanger.ac.uk/pub/databases/Rfam/CURRENT/Rfam.fasta.gz
+unirefDBLink = ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref90/uniref90.fasta.gz
+unirefVerLink = ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref90/uniref90.release_note
+cdTableLink = ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/cddid_all.tbl.gz
+pathwaysTableLink = http://www.uniprot.org/docs/pathway.txt
+uniprotOrgListLink = http://www.uniprot.org/docs/speclist.txt
+pfam2GOLink = http://geneontology.org/external2go/pfam2go
+silvaLSULink = ftp://ftp.arb-silva.de/current/Exports/SILVA_119_LSUParc_tax_silva.fasta.gz
+silvaSSULink = ftp://ftp.arb-silva.de/current/Exports/SILVA_119_SSUParc_tax_silva.fasta.gz
+
+#If active (YES), keeps in memory all the tables that have been used to generate the database.
+#You can use them to check that the construction was done correctly.
+keepTempFiles = YES
+
+#This is a threshold. If you have more memory than this, Annocript will use more of it and run faster.
+bigMemory = 20
+
+#Maximum number of attempts to make when downloading or using some system function
+max_tentatives =  5
+
+#A separator for different results in the same field of the table
+separator = ]---[
+
+#Columns of Uniref ids in the idmapping.tab file
+UniRef100PosIdMap = 8
+UniRef90PosIdMap = 9
+UniRef50PosIdMap = 10
+
+#FASTA CONSTRAINTS
+#Maximum number of chars a FASTA file can have
+maxFastaSeqLen = 80
+#Regexes containing IUPAC allowed chars
+nuclIUPAC = ACGTURYSWKMBDHVN\.\-
+protIUPAC = ACDEFGHIKLMNPQRSTVWY
+
+#PORTRAIT CONSTRAINTS
+#To avoid random interruptions of Portrait, split the transcriptome into N files
+split_num = 5000
+#Minimum and maximum lengths for a transcriptome to work with Portrait
+max_port_seq_length = 100000
+min_port_seq_length = 100
+#Maximum percentage of Ns that can be present or Portrait will not work
+max_perc_ns = 20
+
+
+##VERY RARELY CHANGED 
+#Names given to the output files. Other information will be concatenated by Annocript
+outFileName = ann_out.txt
+outFiltered = filt_ann_out.txt
+ORFFastaFileName = orf_info.fasta 
+NCOutFile = lncRNAseqs.fasta
+codingOutFile = codingSeqs.fasta
+gcContentFile = out.geecee
+
+#Name of the file with non coding sequences
+ncDB = ncRNA.fa
+
+#Modify this value if you want to get plots showing the ids of a different domain database
+#Possible values: cdd, cog, kog, pfam, prk, smart, tigr
+cdName4Expression = pfam
+
+#Names of the folders used by Annocript (A change will compromise Annocript)
+DBCreationFolder = DB_CREATION
+ProgExecFolder = PROGRAMS_EXEC
+GFFAndOutputFolder = GFF3_AND_OUTPUT
+usefulFolder = USEFUL
+guideFolder = GUIDE
+configFolder = CONFIGURATION
+
+#Database info
+platform = mysql
+host = localhost
+port = 3306
+
+#Programs output names
+rpstblastnOut = rpstblastnOut
+blastxSPOut = blastxSPOut
+blastxTROut = blastxTROut
+blastxUnirefOut = blastxUnirefOut
+blastnOut = blastnOut
+portraitOut = portraitOut
+dna2pepOut = dna2pepOut
+outCount = countOut
+
+#Other names
+outHashFile = outHashFile.hash
+parsingFilePath = db_headers.txt
+filtMappingFile = filtMappingFile.txt
+R_barplot_script = tables_2_barplot.R
+R_piechart_script = tables_2_pie_chart.R
+R_seqs_stats_script = ann_seqs_stats.R
+R_log_file = R.log
+
+#Database names
+gffDB = gffDB
+
+#Stats output files names
+bpStatsFile = GO_bp
+mfStatsFile = GO_mf
+ccStatsFile = GO_cc
+bpStatsFileDom = GO_dom_bp
+mfStatsFileDom = GO_dom_mf
+ccStatsFileDom = GO_dom_cc
+cdStatsFile = CD
+closerOSFile = closer_os_table.txt
+pathwaysL1File = pwl1
+pathwaysL2File = pwl2
+pathwaysL3File = pwl3
+ATGCPercFile = ATGC_Percentages.txt
+
+#Maximum length of descriptions in plots
+maxLengthDescs = 50
+
+#BLAST databases names
+swissProtDB = uniprot_sprot.fasta
+tremblDB = uniprot_trembl.fasta
+
+#BLAST output type (you can use only: blast)
+blastOutType = blast
+
+#GFF3 Output file names
+blastxGFF3FileName = blastx_out.gff
+blastxTRGFF3FileName = blastxTremble_out.gff
+blastxSPGFF3FileName = blastxSprot_out.gff
+blastxUnirefGFF3FileName = blastxUniref_out.gff
+rpstblastnGFF3FileName = rpstblastn_out.gff
+blastnGFF3FileName = blastn_out.gff
+nonCodingAlgorithm = portrait
+nonCodingGFF3FileName = portait_out.gff
+orfAlgorithm = dna2pep
+orfGFF3FileName = dna2pep_out.gff
+
+#LOG FILES
+headParsNAValues = headParsNAValues.log
+uniprotGenesNotPresent = genesNotPresent.log
+
+#HTML PAGES
+htmlHome = index.html
+htmlAnnotStats = annot_stats.html
+htmlSeqStats = seq_stats.html
+AnnocriptLogo = a_logo.png
+
+#EXTERNAL PROGRAMS
+faSomeRecords = faSomeRecords
+#############################
diff --git a/CONFIGURATION/config_user.txt b/CONFIGURATION/config_user.txt
@@ -0,0 +1,106 @@
+# File configuration for the user of Annocript 0.2
+# READ CAREFULLY!!!
+# This file has been written with a specific syntax.
+# The variables MUST stay in the format: variable = value
+# A series of hashes (#########) closes the parameters to read
+# Parameters of BLAST programs without a value assigned will not be used (i.e. word_sizeX = )
+# When you want to execute something you have to write YES (in upper case), or NO otherwise;
+# any other string will give an error.
+##############################
+
+#Allowed characters [A-Za-z0-9\_\-]. Allowed extensions (fa|fasta). Please use a dot only to separate the extension!
+fastaSeqs = trial_transcriptome.fasta
+
+#organisms to blast ('all' means all the organisms in UniProt are taken)
+#please use 'all' or the name of a file containing organism names
+#Such file must be placed in your working directory (i.e. ann_works)
+#Selection of the organisms works only if the TrEMBL database is used
+blastedOrganism = all
+
+#How to extract GO terms: you can choose to extract 
+#for proteins ('proteins'), domains ('domains') or for both ('both')
+goTermsAss = proteins
+
+#What Annocript has to do
+doDbCreation = YES
+doExecutePrograms = YES
+doBuildOutput = YES
+extractStatistics = YES
+
+#What programs to execute
+doBlastxSP = YES
+doBlastxTRorUf = YES
+doRpstblastn = YES
+doBlastn = YES
+doPortrait = YES
+doDna2Pep = YES
+
+#Permits the use of the GFF database. Use YES only if you need it. GFF output files will always be printed.
+useGFFDB = NO
+
+#BLASTX and BLASTP PARAMETERS (use word_size 4 and threshold 18 to reduce computational time)
+#(outfmt can be only 0 with this version of Annocript)
+word_sizeX = 4
+evalueX = 1E-5
+num_descriptionsX = 5
+num_alignmentsX = 5
+max_target_seqsX =
+num_threadsX = 10
+thresholdX = 18
+matrixX = 
+
+#BLASTN PARAMETERS
+word_sizeN = 
+evalueN = 0.00001
+num_descriptionsN = 1
+num_alignmentsN = 1
+max_target_seqsN =
+num_threadsN = 4
+thresholdN =
+
+#RPSBLAST and RPSTBLASTN PARAMETERS 
+word_sizeRPS = 
+evalueRPS = 0.00001
+num_descriptionsRPS = 20
+num_alignmentsRPS = 20
+max_target_seqsRPS =
+thresholdRPS =
+
+#BLAST results with evalue lower than evalMax will be shown in the tabular output
+evalMax = 0.00001
+
+#Number of threads for parallel executions (Used only for RPSBLAST)
+threads4Parallel = 10
+
+#DNA2PEP PARAMETERS (default is 'none')
+d2pMode = none
+
+#PLOTS
+#Number of top scored elements to show in the plots (maximum is 50)
+topToShow = 20
+#Type of plot to show [piechart|barplot]
+plotType = barplot
+
+#Thresholds for a sequence to be considered non-coding. They guide the heuristic in Annocript
+#Minimum Portrait score
+NCThresh = 0.95
+#Maximum length of the ORF
+NCORFLength = 100
+#Minimum length of the transcript
+NCSeqLength = 200
+
+#FIXED PARAMETERS (You should set these only once)#
+
+#Database account info
+mySqlUser = [INSERT_MYSQL_USERID]
+mySqlPass = [INSERT_MYSQL_PASSWORD]
+
+#UNIPROT access information
+uniprotWebUser = anonymous
+uniprotWebPass = [INSERT_YOUR_EMAIL_ADDRESS]
+
+#Programs Paths
+blastPath = /home/francesco/bin/ncbi-blast-2.2.30+/bin/
+portraitPath = /home/francesco/bin/portrait-1.1/portrait-1.1.pl
+dna2pepPath = /home/francesco/bin/dna2pep-1.1/dna2pep.py
+##############################
diff --git a/CONFIGURATION/variables.txt b/CONFIGURATION/variables.txt
@@ -0,0 +1,2 @@
+ncDB;cdName4Expression;DBCreationFolder;ProgExecFolder;GFFAndOutputFolder;usefulFolder;guideFolder;configFolder;platform;host;port;rpstblastnOut;blastxSPOut;blastxTROut;blastxUnirefOut;blastnOut;portraitOut;dna2pepOut;outCount;outHashFile;parsingFilePath;filtMappingFile;R_barplot_script;R_piechart_script;R_seqs_stats_script;R_log_file;gffDB;keepTempFiles;bigMemory;separator;maxFastaSeqLen;nuclIUPAC;protIUPAC;outFileName;outFiltered;ORFFastaFileName;NCOutFile;codingOutFile;gcContentFile;bpStatsFile;mfStatsFile;ccStatsFile;bpStatsFileDom;mfStatsFileDom;ccStatsFileDom;cdStatsFile;closerOSFile;pathwaysL1File;pathwaysL2File;pathwaysL3File;UniRef100PosIdMap;UniRef90PosIdMap;UniRef50PosIdMap;split_num;max_port_seq_length;min_port_seq_length;max_perc_ns;ATGCPercFile;maxLengthDescs;swissProtDB;tremblDB;blastOutType;blastxGFF3FileName;blastxTRGFF3FileName;blastxSPGFF3FileName;blastxUnirefGFF3FileName;rpstblastnGFF3FileName;blastnGFF3FileName;nonCodingAlgorithm;nonCodingGFF3FileName;orfAlgorithm;orfGFF3FileName;swissprotDBLink;tremblDBLink;unirefDBLink;unirefVerLink;uniprotVerLink;GODBLink;GOTermsLink;enzymeDBLink;cdDBLink;rfamDBLink;cdTableLink;pathwaysTableLink;uniprotOrgListLink;pfam2GOLink;silvaLSULink;silvaSSULink;headParsNAValues;uniprotGenesNotPresent;htmlHome;htmlAnnotStats;htmlSeqStats;AnnocriptLogo;faSomeRecords;max_tentatives
+fastaSeqs;blastedOrganism;goTermsAss;doDbCreation;doExecutePrograms;doBuildOutput;mySqlUser;mySqlPass;uniprotWebUser;uniprotWebPass;doBlastxSP;doBlastxTRorUf;doRpstblastn;doBlastn;doPortrait;doDna2Pep;useGFFDB;word_sizeX;evalueX;num_descriptionsX;num_alignmentsX;max_target_seqsX;num_threadsX;thresholdX;matrixX;word_sizeN;evalueN;num_descriptionsN;num_alignmentsN;max_target_seqsN;num_threadsN;thresholdN;word_sizeRPS;evalueRPS;num_descriptionsRPS;num_alignmentsRPS;max_target_seqsRPS;thresholdRPS;threads4Parallel;evalMax;d2pMode;topToShow;plotType;extractStatistics;NCThresh;NCORFLength;NCSeqLength;blastPath;portraitPath;dna2pepPath
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		ncDB;cdName4Expression;DBCreationFolder;ProgExecFolder;GFFAndOutputFolder;usefulFolder;guideFolder;configFolder;platform;host;port;rpstblastnOut;blastxSPOut;blastxTROut;blastxUnirefOut;blastnOut;portraitOut;dna2pepOut;outCount;outHashFile;parsingFilePath;filtMappingFile;R_barplot_script;R_piechart_script;R_seqs_stats_script;R_log_file;gffDB;keepTempFiles;bigMemory;separator;maxFastaSeqLen;nuclIUPAC;protIUPAC;outFileName;outFiltered;ORFFastaFileName;NCOutFile;codingOutFile;gcContentFile;bpStatsFile;mfStatsFile;ccStatsFile;bpStatsFileDom;mfStatsFileDom;ccStatsFileDom;cdStatsFile;closerOSFile;pathwaysL1File;pathwaysL2File;pathwaysL3File;UniRef100PosIdMap;UniRef90PosIdMap;UniRef50PosIdMap;split_num;max_port_seq_length;min_port_seq_length;max_perc_ns;ATGCPercFile;maxLengthDescs;swissProtDB;tremblDB;blastOutType;blastxGFF3FileName;blastxTRGFF3FileName;blastxSPGFF3FileName;blastxUnirefGFF3FileName;rpstblastnGFF3FileName;blastnGFF3FileName;nonCodingAlgorithm;nonCodingGFF3FileName;orfAlgorithm;orfGFF3FileName;swissprotDBLink;tremblDBLink;unirefDBLink;unirefVerLink;uniprotVerLink;GODBLink;GOTermsLink;enzymeDBLink;cdDBLink;rfamDBLink;cdTableLink;pathwaysTableLink;uniprotOrgListLink;pfam2GOLink;silvaLSULink;silvaSSULink;headParsNAValues;uniprotGenesNotPresent;htmlHome;htmlAnnotStats;htmlSeqStats;AnnocriptLogo;faSomeRecords;max_tentatives
		fastaSeqs;blastedOrganism;goTermsAss;doDbCreation;doExecutePrograms;doBuildOutput;mySqlUser;mySqlPass;uniprotWebUser;uniprotWebPass;doBlastxSP;doBlastxTRorUf;doRpstblastn;doBlastn;doPortrait;doDna2Pep;useGFFDB;word_sizeX;evalueX;num_descriptionsX;num_alignmentsX;max_target_seqsX;num_threadsX;thresholdX;matrixX;word_sizeN;evalueN;num_descriptionsN;num_alignmentsN;max_target_seqsN;num_threadsN;thresholdN;word_sizeRPS;evalueRPS;num_descriptionsRPS;num_alignmentsRPS;max_target_seqsRPS;thresholdRPS;threads4Parallel;evalMax;d2pMode;topToShow;plotType;extractStatistics;NCThresh;NCORFLength;NCSeqLength;blastPath;portraitPath;dna2pepPath