diff --git a/server/src/scimodom/services/__init__.py b/server/src/scimodom/services/__init__.py index 94bf4ea7..e69de29b 100644 --- a/server/src/scimodom/services/__init__.py +++ b/server/src/scimodom/services/__init__.py @@ -1,99 +0,0 @@ -from scimodom.database.database import get_session -from scimodom.services.importer.data import EUFDataImporter -from scimodom.services.importer.generic import BEDImporter -from scimodom.services.importer.header import EUFHeaderImporter - - -class Importer: - """Defines a general Importer class to handle - EU (bedRMod) formatted files. - - :param header: EU header importer - :type header: EUFHeaderImporter - :param data: EU data importer - :type data: EUFDataImporter - """ - - def __init__( - self, - header: EUFHeaderImporter, - data: EUFDataImporter | None = None, - ) -> None: - """Initializer method.""" - self._header = header - self._data = data - - def init_data_importer( - self, association: dict[str, int], seqids: list[str] - ) -> None: - """Instantiate EUFDataImporter. - - :param association: A dictionary of association IDs of the form - {name: association_id}, where name is the modification short_name. - The association ID provides information about the dataset (EUFID), - the modification, the organism, and the technology used. - :type association: dict of {str: int} - :param seqids: List of chromosomes or scaffolds. The seqid must be - one used with Ensembl, e.g. standard Ensembl chromosome name w/o - the "chr" prefix. Only records with seqid in seqids will be imported. 
- :type seqids: list of str - """ - - version = self._header._specs_ver - filen = self._header._filen - session = get_session() - if self._header._handle.closed is False: - self._header.close() - if self._data is None: - self._data = EUFDataImporter( - session=session, - filen=filen, - handle=open(filen, "r"), - association=association, - seqids=seqids, - specs_ver=version, - ) - - -def get_importer(filen: str, smid: str, eufid: str, title: str): - """Instantiate Importer. - - :param filen: File path - :type filen: str - :param smid: Sci-ModoM project ID or SMID - :type smid: str - :param eufid: EUF ID (dataset) or EUFID - :type eufid: str - :param title: Title associated with EUF/bedRMod dataset - :type title: str - """ - session = get_session() - - return Importer( - header=EUFHeaderImporter( - session=session, - filen=filen, - handle=open(filen, "r"), - smid=smid, - eufid=eufid, - title=title, - ), - data=None, - ) - - -def get_bed_importer( - filen: str, -): - """Instantiate BED Importer. - - :param filen: File path - :type filen: str - """ - session = get_session() - - return BEDImporter( - session=session, - filen=filen, - handle=open(filen, "r"), - ) diff --git a/server/src/scimodom/services/importer/__init__.py b/server/src/scimodom/services/importer/__init__.py index e69de29b..cc5d804f 100644 --- a/server/src/scimodom/services/importer/__init__.py +++ b/server/src/scimodom/services/importer/__init__.py @@ -0,0 +1,99 @@ +from scimodom.database.database import get_session +from scimodom.services.importer.data import EUFDataImporter +from scimodom.services.importer.generic import BEDImporter +from scimodom.services.importer.header import EUFHeaderImporter + + +class Importer: + """Defines a general Importer class to handle + EU (bedRMod) formatted files. 
+ + :param header: EU header importer + :type header: EUFHeaderImporter + :param data: EU data importer + :type data: EUFDataImporter + """ + + def __init__( + self, + header: EUFHeaderImporter, + data: EUFDataImporter | None = None, + ) -> None: + """Initializer method.""" + self.header = header + self.data = data + + def init_data_importer( + self, association: dict[str, int], seqids: list[str] + ) -> None: + """Instantiate EUFDataImporter. + + :param association: A dictionary of association IDs of the form + {name: association_id}, where name is the modification short_name. + The association ID provides information about the dataset (EUFID), + the modification, the organism, and the technology used. + :type association: dict of {str: int} + :param seqids: List of chromosomes or scaffolds. The seqid must be + one used with Ensembl, e.g. standard Ensembl chromosome name w/o + the "chr" prefix. Only records with seqid in seqids will be imported. + :type seqids: list of str + """ + + version = self.header._specs_ver + filen = self.header._filen + session = get_session() + if self.header._handle.closed is False: + self.header.close() + if self.data is None: + self.data = EUFDataImporter( + session=session(), + filen=filen, + handle=open(filen, "r"), + association=association, + seqids=seqids, + specs_ver=version, + ) + + +def get_importer(filen: str, smid: str, eufid: str, title: str): + """Instantiate Importer. + + :param filen: File path + :type filen: str + :param smid: Sci-ModoM project ID or SMID + :type smid: str + :param eufid: EUF ID (dataset) or EUFID + :type eufid: str + :param title: Title associated with EUF/bedRMod dataset + :type title: str + """ + session = get_session() + + return Importer( + header=EUFHeaderImporter( + session=session(), + filen=filen, + handle=open(filen, "r"), + smid=smid, + eufid=eufid, + title=title, + ), + data=None, + ) + + +def get_bed_importer( + filen: str, +): + """Instantiate BED Importer. 
+ + :param filen: File path + :type filen: str + """ + session = get_session() + + return BEDImporter( + session=session(), + filen=filen, + handle=open(filen, "r"), + ) diff --git a/server/src/scimodom/services/importer/base.py b/server/src/scimodom/services/importer/base.py index fdc14c3b..aa0db532 100644 --- a/server/src/scimodom/services/importer/base.py +++ b/server/src/scimodom/services/importer/base.py @@ -118,16 +118,16 @@ def __init__( self._buffer: BaseImporter._Buffer self._dtypes: dict[str, dict[str, Any]] = dict() self._lino: int = skiprows - if header is None: - self._header = self._get_header() - else: - self._header = header - self._num_cols: int = len(self._header) if comment is not None and len(comment) > 1: raise ValueError( f"Maximum length of 1 expected, got {len(comment)} for comment." ) self._comment = comment + if header is None: + self._header = self._get_header() + else: + self._header = header + self._num_cols: int = len(self._header) @abstractmethod def parse_record(self, record: dict[str, str]) -> dict[str, Any]: diff --git a/server/tests/unit/conftest.py b/server/tests/unit/conftest.py index 6733e3fb..dc0e735a 100644 --- a/server/tests/unit/conftest.py +++ b/server/tests/unit/conftest.py @@ -21,7 +21,7 @@ from scimodom.utils.specifications import SPECS_EUF # data path -DataPath = namedtuple("DataPath", "ASSEMBLY_PATH ANNOTATION_PATH META_PATH") +DataPath = namedtuple("DataPath", "LOC ASSEMBLY_PATH ANNOTATION_PATH META_PATH") @pytest.fixture() @@ -251,6 +251,9 @@ def project_template(): @pytest.fixture(scope="session") def data_path(tmp_path_factory): + format = SPECS_EUF["format"] + version = SPECS_EUF["versions"][-1] + loc = tmp_path_factory.mktemp("data") ASSEMBLY_PATH = loc / "assembly" ASSEMBLY_PATH.mkdir() @@ -273,4 +276,21 @@ def data_path(tmp_path_factory): with open(Path(path, chrom_file), "w") as f: f.write("1\t1000000") - yield DataPath(ASSEMBLY_PATH, ANNOTATION_PATH, META_PATH) + with open(Path(loc, "test.bed"), "w") as f: + 
f.write(f"#fileformat={format}v{version}\n") + f.write("#organism=9606\n") + f.write("#modification_type=RNA\n") + f.write("#assembly=GRCh38\n") + f.write("#annotation_source=Annotation\n") + f.write("#annotation_version=Version\n") + f.write("#sequencing_platform=Sequencing platform\n") + f.write("#basecalling=\n") + f.write("#bioinformatics_workflow=Workflow\n") + f.write("#experiment=Description of experiment.\n") + f.write("#external_source=\n") + f.write( + "#chrom\tchromstart\tchromEnd\tname\tscore\tstrand\tthickstart\tthickEnd\titermRgb\tcoverage\tfrequency\n" + ) + f.write("1\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t1\n") + + yield DataPath(loc, ASSEMBLY_PATH, ANNOTATION_PATH, META_PATH) diff --git a/server/tests/unit/services/test_data_importer.py b/server/tests/unit/services/test_data_importer.py index d06e7780..d639def3 100644 --- a/server/tests/unit/services/test_data_importer.py +++ b/server/tests/unit/services/test_data_importer.py @@ -50,6 +50,7 @@ def _get_data(EUF_specs): A\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t1 1\t0\t10\tm6A\t1000\t\t0\t10\t0,0,0\t10\t1 1\t0\t10\tm5C\t1000\t+\t0\t10\t0,0,0\t10\t1 + 1\t0\t10\tm5C\t1000\t+ 1\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t200""" return StringIO(string) @@ -76,6 +77,9 @@ def _get_data_with_header(fmt): chrom\tstart\tend\tname\tscore\tstrand\tthick_start\tthick_end\titem_rgb\tcoverage\tfrequency 1\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t1""" comment = "@" + elif fmt == "wrong": + string = """chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tcoverage\tfrequency + 1\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t1""" return skiprows, comment, StringIO(string) @@ -176,12 +180,11 @@ def test_importer_parse_records(Session, EUF_specs): "fmt", [("first"), ("second"), ("third"), ("comment")], ) -def test_base_importer(fmt, Session): +def test_base_importer_header(fmt, Session): skiprows, comment, handle = _get_data_with_header(fmt) class TestBaseImporter(BaseImporter): def __init__(self): - 
# self._comment = comment super().__init__( session=Session(), filen="filen", @@ -197,4 +200,42 @@ def parse_record(record): return record importer = TestBaseImporter() - print(importer._header) + importer._validate_columns() + expected_header = [ + "chrom", + "start", + "end", + "name", + "score", + "strand", + "thick_start", + "thick_end", + "item_rgb", + "coverage", + "frequency", + ] + assert importer._header == expected_header + + +def test_base_importer_columns_fail(Session): + skiprows, comment, handle = _get_data_with_header("wrong") + + class TestBaseImporter(BaseImporter): + def __init__(self): + super().__init__( + session=Session(), + filen="filen", + handle=handle, + model=Data, + sep="\t", + header=None, + skiprows=skiprows, + comment=comment, + ) + + def parse_record(record): + return record + + importer = TestBaseImporter() + with pytest.raises(Exception) as excinfo: + importer._validate_columns() diff --git a/server/tests/unit/services/test_importer.py b/server/tests/unit/services/test_importer.py index 7941a706..837274f7 100644 --- a/server/tests/unit/services/test_importer.py +++ b/server/tests/unit/services/test_importer.py @@ -1,445 +1,47 @@ from io import StringIO -import uuid +from pathlib import Path -import pandas as pd import pytest -import shortuuid -from sqlalchemy import select -from scimodom.database.models import Data, Dataset -import scimodom.database.queries as queries -from scimodom.services.importer import EUFImporter, SpecsError -from scimodom.services.dataset import DataService -from scimodom.services.project import ProjectService -from scimodom.utils.specifications import SPECS_EUF -import scimodom.utils.utils as utils +# from sqlalchemy import select +# from scimodom.database.models import Data, Dataset +from scimodom.services.importer import get_importer -def _get_header(EUF_version, fmt=None): - specs = SPECS_EUF.copy() - specs_format = specs.pop("format") - _ = specs.pop("header") - _ = specs.pop("delimiter") - version = 
EUF_version - expected_version = f"{specs_format}v{version}" - if fmt == "string": - string = f"completelyWrongHeaderButVersionIs Ok{version}" - elif fmt == "version": - string = "#fileformat=bedRModv0.0" - elif fmt == "EOF": - string = "" - elif fmt == "full": # add blank spaces for some lines... this should work - string = f"""#fileformat=bedRModv{EUF_version} - #organism= 9606 - #modification_type=RNA - #assembly=GRCh38 - #annotation_source= Annotation - #annotation_version=Version - #sequencing_platform=Sequencing platform - #basecalling= - #bioinformatics_workflow=Workflow - #experiment=Description of experiment. - #external_source=""" - elif fmt == "misformatted": - string = f"""#fileformat=bedRModv{EUF_version} - #organism 9606 - #modification_type=RNA - #assembly=GRCh38 - #annotation_source= Annotation - #annotation_version=Version - #sequencing_platform=Sequencing platform - #basecalling= - #bioinformatics_workflow=Workflow - #experiment=Description of experiment. - #external_source=""" - elif fmt == "missing": - string = f"""#fileformat=bedRModv{EUF_version} - #organism=9606 - #modification_type=RNA - #assembly=GRCh38 - #annotation_source= Annotation - #annotation_version= - #sequencing_platform=Sequencing platform - #basecalling= - #bioinformatics_workflow=Workflow - #experiment=Description of experiment. - #external_source=""" - elif fmt == "longer": - string = f"""#fileformat=bedRModv{EUF_version} - #organism=9606 - #modification_type=RNA - #assembly=GRCh38 - #annotation_source= Annotation - #annotation_version=Version - #sequencing_platform=Sequencing platform - #basecalling= - #bioinformatics_workflow=Workflow - #experiment=Description of experiment. 
- #external_source= - #methods=method - #references=pubmed_id:12345678""" - elif fmt == "disordered": # fileformat first - string = f"""#fileformat=bedRModv{EUF_version} - #modification_type=RNA - #sequencing_platform=Sequencing platform - #organism= 9606 - #assembly=GRCh38 - #experiment=Description of experiment. - #annotation_source= Annotation - #annotation_version=Version - #basecalling= - #bioinformatics_workflow=Workflow - #external_source=""" - elif fmt == "columns_extra": # some misformatted columns... (case-insensitive) - if EUF_version == "1.6": - string = """#fileformat=bedRModv1.6 - #chrom\tchromstart\tchromEnd\tname\tscore\tstrand\tthickstart\tthickEnd\trgb\tcoverage\tfrequency\trefBase\textra""" - elif EUF_version == "1.7": - string = """#fileformat=bedRModv1.7 - #chrom\tchromstart\tchromEnd\tname\tscore\tstrand\tthickstart\tthickEnd\trgb\tcoverage\tfrequency\textra""" - elif fmt == "columns_short": - string = f"""#fileformat=bedRModv{EUF_version} - #chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tcoverage""" - elif fmt == "data": - if EUF_version == "1.6": - string = """#fileformat=bedRModv1.6 - 1\t139219\t139220\tm6A\t100\t+\t139219\t139220\t0,0,0\t50\t10\tA""" - elif EUF_version == "1.7": - string = """#fileformat=bedRModv1.7 - 1\t139219\t139220\tm6A\t100\t+\t139219\t139220\t0,0,0\t50\t10""" - elif fmt == "data_short": - string = f"""#fileformat=bedRModv{EUF_version} - 1\t139219\t139220\tm6A\t100\t+\t139219\t139220\t0,0,0\t50""" - elif fmt == "data_type": - if EUF_version == "1.6": - string = """#fileformat=bedRModv1.6 - 1\t139219\tstring\tm6A\t100\t+\t139219\t139220\t0,0,0\t50\t10\tA""" - elif EUF_version == "1.7": - string = """#fileformat=bedRModv1.7 - 1\t139219\tstring\tm6A\t100\t+\t139219\t139220\t0,0,0\t50\t10""" - elif fmt == "data_type_float": - if EUF_version == "1.6": - string = """#fileformat=bedRModv1.6 - 1\t139219\tstring\tm6A\t100\t+\t139219\t139220\t0,0,0\t50.0\t10.1\tA""" - elif EUF_version == "1.7": 
- string = """#fileformat=bedRModv1.7 - 1\t139219\tstring\tm6A\t100\t+\t139219\t139220\t0,0,0\t50.0\t10.1""" - else: - string = expected_version - return expected_version, StringIO(string) +# import scimodom.utils.utils as utils -def _get_file(EUF_version): - if EUF_version == "1.6": - string = """#fileformat=bedRModv1.6 - #organism=9606 - #modification_type=RNA - #assembly=GRCh38 - #annotation_source=Annotation - #annotation_version=Version - #sequencing_platform=Sequencing platform - #basecalling= - #bioinformatics_workflow=Workflow - #experiment=Description of experiment. - #external_source= - #chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tcoverage\tfrequency\trefBase - 1\t139219\t139220\tm6A\t100\t+\t139219\t139220\t0,0,0\t50\t10\tA - 1\t139220\t139221\tm6A\t5\t+\t139220\t139221\t0,0,0\t100\t5\tA - 1\t139221\t139222\tm6A\t5\t+\t139221\t139222\t0,0,0\t300\t5\tA - 1\t139222\t139223\tm6A\t500\t+\t139222\t139223\t0,0,0\t250\t100\tA - 1\t139223\t139224\tm6A\t5\t+\t139223\t139224\t0,0,0\t10\t5\tA""" - elif EUF_version == "1.7": - string = """#fileformat=bedRModv1.7 - #organism=9606 - #modification_type=RNA - #assembly=GRCh38 - #annotation_source=Annotation - #annotation_version=Version - #sequencing_platform=Sequencing platform - #basecalling= - #bioinformatics_workflow=Workflow - #experiment=Description of experiment. - #external_source= - #chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tcoverage\tfrequency - 1\t139219\t139220\tm6A\t100\t+\t139219\t139220\t0,0,0\t50\t10 - 1\t139220\t139221\tm6A\t5\t+\t139220\t139221\t0,0,0\t100\t5 - 1\t139221\t139222\tm6A\t5\t+\t139221\t139222\t0,0,0\t300\t5 - 1\t139222\t139223\tm6A\t500\t+\t139222\t139223\t0,0,0\t250\t100 - 1\t139223\t139224\tm6A\t5\t+\t139223\t139224\t0,0,0\t10\t5""" - return StringIO(string) +# test structure with rollback to checkpoint in case e.g. 
data or header is wrong +# test closing or not the open connection +# WHY WAS I ABLE TO ADD ASSOCIATION ID TO DATA IF THERE IS NO VALUE IN THE DB (FK)??? +# we may have to write to a file here -@pytest.mark.parametrize( - "fmt", - [ - (None), - ("string"), - ("version"), - ("EOF"), - ], -) -def test_importer_read_version(fmt, Session, EUF_version): - version, handle = _get_header(EUF_version, fmt) - importer = EUFImporter( - Session(), - "fileformat", - handle, - "ABCDEFGH", - "123456789ABC", - "Title", - 9606, - 1, - False, - {1: "m6A"}, - ) - if fmt == "version": - with pytest.raises(SpecsError) as excinfo: - importer._read_version() - elif fmt == "EOF": - with pytest.raises(EOFError) as excinfo: - importer._read_version() - else: - importer._read_version() - assert version == importer._version - - -@pytest.mark.parametrize( - "fmt", - [ - ("full"), - ("misformatted"), - ("missing"), - ("longer"), - ("disordered"), - ], -) -def test_importer_read_header(fmt, Session, EUF_version): - _, handle = _get_header(EUF_version, fmt) - importer = EUFImporter( - Session(), - "fileformat", - handle, - "ABCDEFGH", - "123456789ABC", - "Title", - 9606, - 1, - False, - {1: "m6A"}, - ) - importer._lino = 1 - importer._read_version() - importer._validate_attributes(Dataset, importer._specs["headers"].values()) - importer._buffers["Dataset"] = EUFImporter._Buffer(session=Session(), model=Dataset) - if fmt in ["full", "longer", "disordered"]: - importer._read_header() - expected_pointer = len(importer._specs["headers"]) - if fmt == "longer": - expected_pointer += 2 - assert importer._lino == expected_pointer - with Session() as session, session.begin(): - records = session.execute(select(Dataset)).scalar() - assert records.id == "123456789ABC" - assert records.project_id == "ABCDEFGH" - assert records.title == "Title" - assert records.file_format == f"bedRModv{EUF_version}" - assert records.modification_type == "RNA" - assert records.taxa_id == 9606 - assert records.assembly_id == 1 
- assert records.lifted is False - assert records.annotation_source == "Annotation" - assert records.annotation_version == "Version" - assert records.sequencing_platform == "Sequencing platform" - assert records.basecalling is None - assert records.bioinformatics_workflow == "Workflow" - assert records.experiment == "Description of experiment." - assert records.external_source is None - else: - with pytest.raises(SpecsError) as excinfo: - importer._read_header() - - -@pytest.mark.parametrize( - "fmt", - [ - ("columns_extra"), - ("columns_short"), - ], -) -def test_importer_validate_columns(fmt, Session, EUF_version): - _, handle = _get_header(EUF_version, fmt) - importer = EUFImporter( - Session(), - "fileformat", - handle, - "ABCDEFGH", - "123456789ABC", - "Title", - 9606, - 1, - False, - {1: "m6A"}, - ) - importer._lino = 1 - importer._read_version() - if fmt == "columns_extra": - importer._validate_columns(next(importer._handle)) - else: - with pytest.raises(SpecsError) as excinfo: - importer._validate_columns(next(importer._handle)) +# def test_ +# get_importer(filen: str, smid: str, eufid: str, title: str) +# then call header methods -@pytest.mark.parametrize( - "fmt", - [ - ("data"), - ("data_short"), - ("data_type"), - ("data_type_float"), - ], -) -def test_importer_read_line(fmt, Session, caplog, EUF_version): - _, handle = _get_header(EUF_version, fmt) - importer = EUFImporter( - Session(), - "fileformat", - handle, - "ABCDEFGH", - "123456789ABC", - "Title", - 9606, - 1, - False, - {1: "m6A"}, - ) - importer._lino = 1 - importer._read_version() - importer._validate_attributes(Data, importer._specs["columns"].values()) - importer._buffers["Data"] = EUFImporter._Buffer(session=Session(), model=Data) - importer._int_types = [ - i - for i, v in enumerate(importer._specs["columns"].values()) - if v - in [ - "start", - "end", - "score", - "thick_start", - "thick_end", - "coverage", - "frequency", - ] - ] +# def init_data_importer( +# self, association: dict[str, 
int], seqids: list[str] +# then call dataimporter methods +# check - importer._chroms = ["1"] - line = next(importer._handle) - if fmt == "data": - importer._read_line(line) - importer._buffers["Data"].flush() - with Session() as session, session.begin(): - records = session.execute(select(Data)).scalar() - assert records.chrom == "1" - assert records.start == 139219 - assert records.end == 139220 - assert records.name == "m6A" - assert records.score == 100 - assert records.thick_start == 139219 - assert records.thick_end == 139220 - assert records.item_rgb == "0,0,0" - assert records.coverage == 50 - assert records.frequency == 10 - # assert records.ref_base == "A" - else: - # currently ValueError is excepted into a warning, wrong line is skipped... - # but does this includes TypeError? - importer._read_line(line) - assert "Warning: Failed to parse fileformat at row 1:" in caplog.text +# also test BED either separately, or just here -def test_importer(Session, setup, project_template, EUF_version, data_path): - # the order is that defined in the model... - columns = utils.get_table_columns(Data) +# def get_bed_importer( +# filen: str, - metadata = project_template["metadata"][0] - taxa_id = metadata["organism"]["taxa_id"] - with Session() as session, session.begin(): - session.add_all(setup) - session.flush() - query = queries.query_column_where( - "Assembly", "id", filters={"name": metadata["organism"]["assembly"]} - ) - assembly_id = session.execute(query).scalar() - session.commit() +# do we need to test also e.g. short/long lines, no cols, etc.? 
- ProjectService(Session(), project_template).create_project() - u = uuid.uuid4() - smid = shortuuid.encode(u)[: ProjectService.SMID_LENGTH] - u = uuid.uuid4() - eufid = shortuuid.encode(u)[: DataService.EUFID_LENGTH] - filen = "tabStringIO" - title = "Title" - - importer = EUFImporter( - Session(), - filen, - _get_file(EUF_version), - smid, - eufid, - title, - taxa_id, - assembly_id, - False, - {1: "m6A"}, +def test_importer(Session, data_path): + importer = get_importer( + filen=Path(data_path.LOC, "test.bed").as_posix(), + smid="12345678", + eufid="123456789ABC", + title="title", ) - importer._chroms = ["1"] - importer.parseEUF() - importer.close() - - with Session() as session, session.begin(): - records = session.execute(select(Dataset)).scalar() - assert records.id == eufid - assert records.project_id == smid - assert records.title == title - assert records.file_format == f"bedRModv{EUF_version}" - assert records.modification_type == "RNA" - assert records.taxa_id == 9606 - assert records.assembly_id == assembly_id - assert records.lifted is False - assert records.annotation_source == "Annotation" - assert records.annotation_version == "Version" - assert records.sequencing_platform == "Sequencing platform" - assert records.basecalling is None - assert records.bioinformatics_workflow == "Workflow" - assert records.experiment == "Description of experiment." 
- assert records.external_source is None - records = session.execute(select(Data)).scalars().all() - df = pd.DataFrame( - [ - ( - r.chrom, - r.start, - r.end, - r.name, - r.score, - r.strand, - r.thick_start, - r.thick_end, - r.item_rgb, - r.coverage, - r.frequency, - # r.ref_base, - ) - for r in records - ], - columns=columns[2:], - ) - expected_df = pd.read_csv( - _get_file(EUF_version), - sep="\t", - skiprows=12, - header=None, - names=columns[2:], - ) - expected_df = expected_df.astype(importer._dtypes["Data"]) - pd.testing.assert_frame_equal(df, expected_df, check_exact=True) + importer.header.parse_header() + importer.header.close()