Skip to content

Commit

Permalink
handling .z files as gzip
Browse files Browse the repository at this point in the history
improved tests

regarding #189
  • Loading branch information
seanmacavaney committed May 5, 2022
1 parent d325e24 commit 9632f32
Show file tree
Hide file tree
Showing 26 changed files with 160 additions and 6 deletions.
10 changes: 5 additions & 5 deletions ir_datasets/formats/trec.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,20 +126,20 @@ def docs_iter(self):

def _docs_iter(self, path):
if Path(path).is_file():
path_suffix = Path(path).suffix.lower()
if path_suffix == '.gz':
path_suffix = Path(path).suffix
if path_suffix.lower() == '.gz' or path_suffix == '.z':
with gzip.open(path, 'rb') as f:
yield from self._parser(f)
elif path_suffix in ['.z', '.0z', '.1z', '.2z']:
elif path_suffix in ['.Z', '.0Z', '.1Z', '.2Z']:
# unix "compress" command encoding
unlzw3 = ir_datasets.lazy_libs.unlzw3()
with io.BytesIO(unlzw3.unlzw(path)) as f:
with io.BytesIO(unlzw3.unlzw(Path(path))) as f:
yield from self._parser(f)
else:
with open(path, 'rb') as f:
yield from self._parser(f)
elif Path(path).is_dir():
for child in path.iterdir():
for child in sorted(Path(path).iterdir()):
yield from self._docs_iter(child)

def _parser_bs(self, stream):
Expand Down
Binary file added test/dummy/trecdocs/compress_uc_0z.tar.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/compress_uc_0z/F00.0Z
Binary file not shown.
Binary file added test/dummy/trecdocs/compress_uc_0z/F01.0Z
Binary file not shown.
Binary file added test/dummy/trecdocs/compress_uc_z.tar.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/compress_uc_z/F00.Z
Binary file not shown.
Binary file added test/dummy/trecdocs/compress_uc_z/F01.Z
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_gz.tar.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_gz/F00.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_gz/F01.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_uc_gz.tar.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_uc_gz/F00.GZ
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_uc_gz/F01.GZ
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_z.tar.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_z/F00.z
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_z/F01.z
Binary file not shown.
Binary file added test/dummy/trecdocs/plaintext_noext.tar.gz
Binary file not shown.
29 changes: 29 additions & 0 deletions test/dummy/trecdocs/plaintext_noext/F00
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<DOC>
<DOCNO> D100A </DOCNO>
<PARENT> Something </PARENT>
<HT> Some text </HT>

<HEADLINE>
<AU> Header Text </AU>
Daily Report

</HEADLINE>

<TEXT>
Main body text
on multiple lines

with <F P=102> some markup
</F> here. Also, some invalid <T> markup &amp;.
</TEXT>

</DOC>

<DOC>
<DOCNO> 101 </DOCNO>

<TEXT>
More body text
</TEXT>

</DOC>
11 changes: 11 additions & 0 deletions test/dummy/trecdocs/plaintext_noext/F01
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<DOC>
<DOCNO> D102 </DOCNO>
<HT> more text </HT>

<TEXT>
some very <F P=102> fun text
<!-- commented out --> markup &AMP;

</TEXT>

</DOC>
Binary file added test/dummy/trecdocs/plaintext_txt.tar.gz
Binary file not shown.
29 changes: 29 additions & 0 deletions test/dummy/trecdocs/plaintext_txt/F00.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<DOC>
<DOCNO> D100A </DOCNO>
<PARENT> Something </PARENT>
<HT> Some text </HT>

<HEADLINE>
<AU> Header Text </AU>
Daily Report

</HEADLINE>

<TEXT>
Main body text
on multiple lines

with <F P=102> some markup
</F> here. Also, some invalid <T> markup &amp;.
</TEXT>

</DOC>

<DOC>
<DOCNO> 101 </DOCNO>

<TEXT>
More body text
</TEXT>

</DOC>
11 changes: 11 additions & 0 deletions test/dummy/trecdocs/plaintext_txt/F01.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<DOC>
<DOCNO> D102 </DOCNO>
<HT> more text </HT>

<TEXT>
some very <F P=102> fun text
<!-- commented out --> markup &AMP;

</TEXT>

</DOC>
Binary file added test/dummy/trecdocs/plaintext_uc_txt.tar.gz
Binary file not shown.
29 changes: 29 additions & 0 deletions test/dummy/trecdocs/plaintext_uc_txt/F00.TXT
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<DOC>
<DOCNO> D100A </DOCNO>
<PARENT> Something </PARENT>
<HT> Some text </HT>

<HEADLINE>
<AU> Header Text </AU>
Daily Report

</HEADLINE>

<TEXT>
Main body text
on multiple lines

with <F P=102> some markup
</F> here. Also, some invalid <T> markup &amp;.
</TEXT>

</DOC>

<DOC>
<DOCNO> 101 </DOCNO>

<TEXT>
More body text
</TEXT>

</DOC>
11 changes: 11 additions & 0 deletions test/dummy/trecdocs/plaintext_uc_txt/F01.TXT
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<DOC>
<DOCNO> D102 </DOCNO>
<HT> more text </HT>

<TEXT>
some very <F P=102> fun text
<!-- commented out --> markup &AMP;

</TEXT>

</DOC>
36 changes: 35 additions & 1 deletion test/formats/test_trec.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,22 @@
import os
import shutil
import unittest
import contextlib
from ir_datasets.formats import TrecQrel, TrecQrels, TrecQuery, TrecQueries, TrecDoc, TrecDocs
from ir_datasets.util import StringFile
from ir_datasets.util import StringFile, RelativePath


class File:
    """Minimal local-path stand-in for a file source, used by these tests.

    Instances are handed to ``TrecDocs`` in ``test_docs_formats``; the class
    offers a ``path()`` accessor and a ``stream()`` context manager.
    """

    def __init__(self, path):
        # Path of the file or directory on local disk; stored unchanged.
        self._path = path

    def path(self, force=True):
        """Return the wrapped path.

        ``force`` is accepted but not used by this helper.
        """
        return self._path

    @contextlib.contextmanager
    def stream(self):
        """Yield the file opened in binary read mode.

        The handle is closed when the ``with`` block exits (normally or via
        an exception), so repeated use does not leak file descriptors.
        """
        with open(self._path, 'rb') as f:
            yield f



class TestTrec(unittest.TestCase):
Expand Down Expand Up @@ -127,6 +141,26 @@ def test_docs(self):
self.assertEqual(docs.docs_path(), 'MOCK')
self.assertEqual(list(docs.docs_iter()), expected_results)


def test_docs_formats(self):
    """Verify TrecDocs yields the same documents for every fixture encoding.

    Each fixture directory is read directly and again through a ``F*``
    glob; a subset of fixtures is additionally read from its ``.tar.gz``
    archive through a ``*/F*`` glob.
    """
    first_doc = TrecDoc(doc_id='D100A', text='\n\n Header Text \nDaily Report \n\n\n\nMain body text\non multiple lines\n\nwith some markup\n here. Also, some invalid markup &. \n\n', marked_up_doc='<HEADLINE>\n<AU> Header Text </AU>\nDaily Report \n\n</HEADLINE>\n<TEXT>\nMain body text\non multiple lines\n\nwith <F P=102> some markup\n</F> here. Also, some invalid <T> markup &amp;. \n</TEXT>\n')
    second_doc = TrecDoc(doc_id='101', text='\n\nMore body text\n\n', marked_up_doc='<TEXT>\nMore body text\n</TEXT>\n')
    third_doc = TrecDoc(doc_id='D102', text='\n\nsome very fun text\n markup &AMP;\n\n\n', marked_up_doc='<TEXT>\nsome very <F P=102> fun text\n<!-- commented out --> markup &AMP;\n\n</TEXT>\n')
    expected_results = [first_doc, second_doc, third_doc]

    # Fixtures that are also exercised through their .tar.gz archive.
    archived_sources = ['plaintext_noext', 'plaintext_txt', 'plaintext_uc_txt', 'gzip_gz']
    every_source = ['plaintext_noext', 'plaintext_txt', 'plaintext_uc_txt', 'gzip_gz', 'gzip_z', 'gzip_uc_gz', 'compress_uc_z', 'compress_uc_0z']

    def check(docs):
        # Materialise the iterator so the whole document sequence is compared.
        self.assertEqual(list(docs.docs_iter()), expected_results)

    for source in every_source:
        with self.subTest(source):
            # Plain directory, no glob filtering.
            check(TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}'))))

            # Same directory, restricted by a file-name glob.
            check(TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}')), path_globs=['F*']))

            if source not in archived_sources:
                continue
            # Tarball variant of the same fixture.
            check(TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}.tar.gz')), path_globs=['*/F*']))

def tearDown(self):
if os.path.exists('MOCK.pklz4'):
shutil.rmtree('MOCK.pklz4')
Expand Down

0 comments on commit 9632f32

Please sign in to comment.