diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py index 12bb0adc..73edc976 100644 --- a/chebai/preprocessing/datasets/go_uniprot.py +++ b/chebai/preprocessing/datasets/go_uniprot.py @@ -43,6 +43,11 @@ "IEP", "TAS", "IC", + "HTP", + "HDA", + "HMP", + "HGI", + "HEP", } # https://github.com/bio-ontology-research-group/deepgo/blob/d97447a05c108127fee97982fd2c57929b2cf7eb/aaindex.py#L8 @@ -414,7 +419,7 @@ def _get_swiss_to_go_mapping(self) -> pd.DataFrame: Quote from the DeepGo Paper: `We select proteins with annotations having experimental evidence codes - (EXP, IDA, IPI, IMP, IGI, IEP, TAS, IC) and filter the proteins by a + (see EXPERIMENTAL_EVIDENCE_CODES) and filter the proteins by a maximum length of 1002, ignoring proteins with ambiguous amino acid codes (B, O, J, U, X, Z) in their sequence.` diff --git a/chebai/preprocessing/datasets/protein_pretraining.py b/chebai/preprocessing/datasets/protein_pretraining.py index 8550db2b..f6e9d66d 100644 --- a/chebai/preprocessing/datasets/protein_pretraining.py +++ b/chebai/preprocessing/datasets/protein_pretraining.py @@ -96,8 +96,8 @@ def _download_required_data(self) -> str: def _parse_protein_data_for_pretraining(self) -> pd.DataFrame: """ Parses the Swiss-Prot data and returns a DataFrame containing Swiss-Prot proteins which does not have any valid - Gene Ontology(GO) label. A valid GO label is the one which has one of the following evidence code - (EXP, IDA, IPI, IMP, IGI, IEP, TAS, IC). + Gene Ontology(GO) label. A valid GO label is one which has one of the evidence codes defined in + `EXPERIMENTAL_EVIDENCE_CODES`. The DataFrame includes the following columns: - "swiss_id": The unique identifier for each Swiss-Prot record. 
diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py index a05b89f1..ca6148e7 100644 --- a/tests/unit/mock_data/ontology_mock_data.py +++ b/tests/unit/mock_data/ontology_mock_data.py @@ -668,8 +668,8 @@ def get_UniProt_raw_data() -> str: - **Swiss_Prot_11**: Has only Invalid GO class but lacks a sequence. Note: - A valid GO label is the one which has one of the following evidence code - (EXP, IDA, IPI, IMP, IGI, IEP, TAS, IC). + A valid GO label is one which has one of the evidence codes defined in + `EXPERIMENTAL_EVIDENCE_CODES`. Returns: str: The raw UniProt data in string format.