Skip to content

Commit

Permalink
add swissprot as an output column for review status
Browse files Browse the repository at this point in the history
  • Loading branch information
dmx2 committed Jun 9, 2024
1 parent 9786c95 commit 4451b5c
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 13 deletions.
27 changes: 16 additions & 11 deletions pepmatch/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,23 +32,28 @@ def extract_metadata(record: SeqRecord) -> list:
record: protein SeqRecord from proteome FASTA file.
"""
regexes = {
'protein_id': re.compile(r"\|([^|]*)\|"), # between | and |
'protein_name': re.compile(r"\s(.+?)\sOS"), # between space and space before OS
'species': re.compile(r"OS=(.+?)\sOX"), # between OS= and space before OX
'taxon_id': re.compile(r"OX=(.+?)(\s|$)"), # between OX= and space
'gene': re.compile(r"GN=(.+?)(\s|$)"), # between GN= and space
'pe_level': re.compile(r"PE=(.+?)(\s|$)"), # between PE= and space
'sequence_version': re.compile(r"SV=(.+?)(\s|$)"), # between SV= and space
'gene_priority': re.compile(r"GP=(.+?)(\s|$)"), # between GP= and space
'protein_id': re.compile(r"\|([^|]*)\|"), # between | and |
'protein_name': re.compile(r"\s(.+?)\sOS"), # between space and space before OS
'species': re.compile(r"OS=(.+?)\sOX"), # between OS= and space before OX
'taxon_id': re.compile(r"OX=(.+?)(\s|$)"), # between OX= and space
'gene': re.compile(r"GN=(.+?)(\s|$)"), # between GN= and space
'pe_level': re.compile(r"PE=(.+?)(\s|$)"), # between PE= and space
'sequence_version': re.compile(r"SV=(.+?)(\s|$)"), # between SV= and space
'gene_priority': re.compile(r"GP=(.+?)(\s|$)"), # between GP= and space
'swissprot': re.compile(r"^(tr|sp)\|"), # leading sp or tr tag at the start of the description, before the first |
}
metadata = []
for key in regexes: # loop through compiled regexes to extract metadata
match = regexes[key].search(str(record.description))

if match:
metadata.append(match.group(1))
if key == 'swissprot':
metadata.append('1') if match.group(1) == 'sp' else metadata.append('0')
else:
metadata.append(match.group(1))
else:
if key == 'protein_id':
if key == 'swissprot':
metadata.append('0')
elif key == 'protein_id':
metadata.append(str(record.id)) # get record.id from FASTA header instead
elif key == 'sequence_version':
metadata.append('1')
Expand Down
5 changes: 3 additions & 2 deletions pepmatch/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@ def _create_tables(
'gene TEXT NOT NULL,'\
'pe_level INTEGER NOT NULL,'\
'sequence_version INTEGER NOT NULL,'\
'gene_priority INTEGER NOT NULL)'\
'gene_priority INTEGER NOT NULL,'\
'swissprot INTEGER NOT NULL)'\
)


Expand All @@ -200,7 +201,7 @@ def _insert_metadata(self, cursor: sqlite3.Cursor, metadata_table: str) -> None:
metadata_table: name of the metadata table."""

cursor.executemany(
f'INSERT INTO "{metadata_table}" VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
f'INSERT INTO "{metadata_table}" VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
self.all_metadata
)

Expand Down

0 comments on commit 4451b5c

Please sign in to comment.