Skip to content

Commit

Permalink
Tue_22_apr_15_17:30
Browse files Browse the repository at this point in the history
  • Loading branch information
frankMusacchia committed Apr 22, 2015
0 parents commit d90c108
Show file tree
Hide file tree
Showing 188 changed files with 253,102 additions and 0 deletions.
169 changes: 169 additions & 0 deletions CONFIGURATION/config_annocript.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
# File configuration for Annocript 0.2
# It contains parameters that are used very often by Annocript but very rarely by the user.
# The parameters are grouped into RARELY CHANGED, which an expert user may decide to change,
# and VERY RARELY CHANGED, which a user will very rarely want to change.
# READ CAREFULLY!!!
# This file has been written with a specific syntax.
# The variables MUST stay in the form: variable = value
# A series of hashes (#########) closes the parameters to read
# When you want to execute something you have to write YES (in upper case), or NO otherwise;
# any other string will cause an error.
# THIS FILE SHOULD BE CHANGED ONLY IF YOU KNOW PERFECTLY WHAT YOU ARE DOING!!!
##############################


#RARELY CHANGED PARAMETERS
#Links to Files
#Uniprot databases are also present on expasy and ebi. Please use these domains if you want.
swissprotDBLink = ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
tremblDBLink = ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz
uniprotVerLink = ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt
GODBLink = ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping_selected.tab.gz
GOTermsLink = http://www.geneontology.org/doc/GO.terms_alt_ids
enzymeDBLink = ftp://ftp.expasy.org/databases/enzyme/enzyme.dat
cdDBLink = ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/Cdd_LE.tar.gz
rfamDBLink = ftp://ftp.sanger.ac.uk/pub/databases/Rfam/CURRENT/Rfam.fasta.gz
unirefDBLink = ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref90/uniref90.fasta.gz
unirefVerLink = ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref90/uniref90.release_note
cdTableLink = ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/cddid_all.tbl.gz
pathwaysTableLink = http://www.uniprot.org/docs/pathway.txt
uniprotOrgListLink = http://www.uniprot.org/docs/speclist.txt
pfam2GOLink = http://geneontology.org/external2go/pfam2go
silvaLSULink = ftp://ftp.arb-silva.de/current/Exports/SILVA_119_LSUParc_tax_silva.fasta.gz
silvaSSULink = ftp://ftp.arb-silva.de/current/Exports/SILVA_119_SSUParc_tax_silva.fasta.gz

#If active (YES), all the tables that have been used to generate the database are kept
#You can use them to check that the construction has been done correctly
keepTempFiles = YES

#This is a threshold. If you have more than this memory Annocript will use more and be faster.
bigMemory = 20

#Maximum number of attempts when downloading or using some system function
max_tentatives = 5

#A separator for different results in the same field of the table
separator = ]---[

#Columns of Uniref ids in the idmapping.tab file
UniRef100PosIdMap = 8
UniRef90PosIdMap = 9
UniRef50PosIdMap = 10

#FASTA CONSTRAINTS
#Maximum number of chars a FASTA file can have
maxFastaSeqLen = 80
#Regexes containing IUPAC allowed chars
nuclIUPAC = ACGTURYSWKMBDHVN\.\-
protIUPAC = ACDEFGHIKLMNPQRSTVWY

#PORTRAIT CONSTRAINTS
#To avoid stochastic interruptions of Portrait, split the transcriptome into N files
split_num = 5000
#Minimum and maximum lengths for a transcriptome to work with Portrait
max_port_seq_length = 100000
min_port_seq_length = 100
#Maximum percentage of Ns that can be present or Portrait will not work
max_perc_ns = 20


##VERY RARELY CHANGED
#Names given to the output files. Other information will be concatenated by Annocript
outFileName = ann_out.txt
outFiltered = filt_ann_out.txt
ORFFastaFileName = orf_info.fasta
NCOutFile = lncRNAseqs.fasta
codingOutFile = codingSeqs.fasta
gcContentFile = out.geecee

#Name of the file with non coding sequences
ncDB = ncRNA.fa

#Modify this value if you want the plots to show IDs from a different domain database
#Possible values: cdd, cog, kog, pfam, prk, smart, tigr
cdName4Expression = pfam

#Names of the folders used by Annocript (A change will compromise Annocript)
DBCreationFolder = DB_CREATION
ProgExecFolder = PROGRAMS_EXEC
GFFAndOutputFolder = GFF3_AND_OUTPUT
usefulFolder = USEFUL
guideFolder = GUIDE
configFolder = CONFIGURATION

#Database info
platform = mysql
host = localhost
port = 3306

#Programs output names
rpstblastnOut = rpstblastnOut
blastxSPOut = blastxSPOut
blastxTROut = blastxTROut
blastxUnirefOut = blastxUnirefOut
blastnOut = blastnOut
portraitOut = portraitOut
dna2pepOut = dna2pepOut
outCount = countOut

#Other names
outHashFile = outHashFile.hash
parsingFilePath = db_headers.txt
filtMappingFile = filtMappingFile.txt
R_barplot_script = tables_2_barplot.R
R_piechart_script = tables_2_pie_chart.R
R_seqs_stats_script = ann_seqs_stats.R
R_log_file = R.log

#Database names
gffDB = gffDB

#Stats output files names
bpStatsFile = GO_bp
mfStatsFile = GO_mf
ccStatsFile = GO_cc
bpStatsFileDom = GO_dom_bp
mfStatsFileDom = GO_dom_mf
ccStatsFileDom = GO_dom_cc
cdStatsFile = CD
closerOSFile = closer_os_table.txt
pathwaysL1File = pwl1
pathwaysL2File = pwl2
pathwaysL3File = pwl3
ATGCPercFile = ATGC_Percentages.txt

#Maximum length of descriptions in plots
maxLengthDescs = 50

#BLAST databases names
swissProtDB = uniprot_sprot.fasta
tremblDB = uniprot_trembl.fasta

#BLAST output type (you can use only: blast)
blastOutType = blast

#GFF3 Output file names
blastxGFF3FileName = blastx_out.gff
blastxTRGFF3FileName = blastxTremble_out.gff
blastxSPGFF3FileName = blastxSprot_out.gff
blastxUnirefGFF3FileName = blastxUniref_out.gff
rpstblastnGFF3FileName = rpstblastn_out.gff
blastnGFF3FileName = blastn_out.gff
nonCodingAlgorithm = portrait
nonCodingGFF3FileName = portait_out.gff
orfAlgorithm = dna2pep
orfGFF3FileName = dna2pep_out.gff

#LOG FILES
headParsNAValues = headParsNAValues.log
uniprotGenesNotPresent = genesNotPresent.log

#HTML PAGES
htmlHome = index.html
htmlAnnotStats = annot_stats.html
htmlSeqStats = seq_stats.html
AnnocriptLogo = a_logo.png

#EXTERNAL PROGRAMS
faSomeRecords = faSomeRecords
#############################
106 changes: 106 additions & 0 deletions CONFIGURATION/config_user.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# File configuration for the user of Annocript 0.2
# READ CAREFULLY!!!
# This file has been written with a specific syntax.
# The variables MUST stay in the format: variable = value
# A series of hashes (#########) closes the parameters to read
# Parameters of BLAST programs without a value assigned will not be used (i.e. word_sizeX = )
# When you want to execute something you have to write YES (in upper case) or NO otherwise
# other strings will give error.
##############################

#Allowed characters [A-Za-z0-9\_\-]. Allowed extensions (fa|fasta). Please use a dot only to separate the extension!
fastaSeqs = trial_transcriptome.fasta

#organisms to blast ('all' means all the organisms in UniProt are taken)
#please use 'all' or a file name with organisms names
#Such file must be placed in your working directory (i.e. ann_works)
#Selection of the organisms works only if the TrEMBL database is used
blastedOrganism = all

#How to extract GO terms: you can choose to extract
#for proteins ('proteins'), domains ('domains') or for both ('both')
goTermsAss = proteins

#What Annocript has to do
doDbCreation = YES
doExecutePrograms = YES
doBuildOutput = YES
extractStatistics = YES

#What programs to execute
doBlastxSP = YES
doBlastxTRorUf = YES
doRpstblastn = YES
doBlastn = YES
doPortrait = YES
doDna2Pep = YES

#Permits the use of the GFF database. Use YES only if you need it. GFF output files will always be printed.
useGFFDB = NO

#BLASTX and BLASTP PARAMETERS (use word_size 4 and threshold 18 to reduce computational time)
#(outfmt can be only 0 with this version of Annocript)
word_sizeX = 4
evalueX = 1E-5
num_descriptionsX = 5
num_alignmentsX = 5
max_target_seqsX =
num_threadsX = 10
thresholdX = 18
matrixX =

#BLASTN PARAMETERS
word_sizeN =
evalueN = 0.00001
num_descriptionsN = 1
num_alignmentsN = 1
max_target_seqsN =
num_threadsN = 4
thresholdN =

#RPSBLAST and RPSTBLASTN PARAMETERS
word_sizeRPS =
evalueRPS = 0.00001
num_descriptionsRPS = 20
num_alignmentsRPS = 20
max_target_seqsRPS =
thresholdRPS =

#BLAST results with evalue lower than evalMax will be shown in the tabular output
evalMax = 0.00001

#Number of threads for parallel executions (Used only for RPSBLAST)
threads4Parallel = 10

#DNA2PEP PARAMETERS (default is 'none')
d2pMode = none

#PLOTS
#Number of top scored elements to show in the plots (maximum is 50)
topToShow = 20
#Type of plot to show [piechart|barplot]
plotType = barplot

#Thresholds to be non-coding. They guide the heuristic in Annocript
#Minimum Portrait score
NCThresh = 0.95
#Maximum length of the ORF
NCORFLength = 100
#Minimum length of the transcript
NCSeqLength = 200

#FIXED PARAMETERS (You should set these only once)#

#Database account info
mySqlUser = [INSERT_MYSQL_USERID]
mySqlPass = [INSERT_MYSQL_PASSWORD]

#UniProt access information
uniprotWebUser = anonymous
uniprotWebPass = [INSERT_YOUR_EMAIL_ADDRESS]

#Programs Paths
blastPath = /home/francesco/bin/ncbi-blast-2.2.30+/bin/
portraitPath = /home/francesco/bin/portrait-1.1/portrait-1.1.pl
dna2pepPath = /home/francesco/bin/dna2pep-1.1/dna2pep.py
##############################
2 changes: 2 additions & 0 deletions CONFIGURATION/variables.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ncDB;cdName4Expression;DBCreationFolder;ProgExecFolder;GFFAndOutputFolder;usefulFolder;guideFolder;configFolder;platform;host;port;rpstblastnOut;blastxSPOut;blastxTROut;blastxUnirefOut;blastnOut;portraitOut;dna2pepOut;outCount;outHashFile;parsingFilePath;filtMappingFile;R_barplot_script;R_piechart_script;R_seqs_stats_script;R_log_file;gffDB;keepTempFiles;bigMemory;separator;maxFastaSeqLen;nuclIUPAC;protIUPAC;outFileName;outFiltered;ORFFastaFileName;NCOutFile;codingOutFile;gcContentFile;bpStatsFile;mfStatsFile;ccStatsFile;bpStatsFileDom;mfStatsFileDom;ccStatsFileDom;cdStatsFile;closerOSFile;pathwaysL1File;pathwaysL2File;pathwaysL3File;UniRef100PosIdMap;UniRef90PosIdMap;UniRef50PosIdMap;split_num;max_port_seq_length;min_port_seq_length;max_perc_ns;ATGCPercFile;maxLengthDescs;swissProtDB;tremblDB;blastOutType;blastxGFF3FileName;blastxTRGFF3FileName;blastxSPGFF3FileName;blastxUnirefGFF3FileName;rpstblastnGFF3FileName;blastnGFF3FileName;nonCodingAlgorithm;nonCodingGFF3FileName;orfAlgorithm;orfGFF3FileName;swissprotDBLink;tremblDBLink;unirefDBLink;unirefVerLink;uniprotVerLink;GODBLink;GOTermsLink;enzymeDBLink;cdDBLink;rfamDBLink;cdTableLink;pathwaysTableLink;uniprotOrgListLink;pfam2GOLink;silvaLSULink;silvaSSULink;headParsNAValues;uniprotGenesNotPresent;htmlHome;htmlAnnotStats;htmlSeqStats;AnnocriptLogo;faSomeRecords;max_tentatives
fastaSeqs;blastedOrganism;goTermsAss;doDbCreation;doExecutePrograms;doBuildOutput;mySqlUser;mySqlPass;uniprotWebUser;uniprotWebPass;doBlastxSP;doBlastxTRorUf;doRpstblastn;doBlastn;doPortrait;doDna2Pep;useGFFDB;word_sizeX;evalueX;num_descriptionsX;num_alignmentsX;max_target_seqsX;num_threadsX;thresholdX;matrixX;word_sizeN;evalueN;num_descriptionsN;num_alignmentsN;max_target_seqsN;num_threadsN;thresholdN;word_sizeRPS;evalueRPS;num_descriptionsRPS;num_alignmentsRPS;max_target_seqsRPS;thresholdRPS;threads4Parallel;evalMax;d2pMode;topToShow;plotType;extractStatistics;NCThresh;NCORFLength;NCSeqLength;blastPath;portraitPath;dna2pepPath
Loading

0 comments on commit d90c108

Please sign in to comment.