From 7c95c4b318e86a8c4131994056c42958e74903e9 Mon Sep 17 00:00:00 2001 From: beaioun Date: Wed, 8 Nov 2023 01:10:54 -0800 Subject: [PATCH 1/2] Initial commit for adding zstd (de)compression support for workload corpora Signed-off-by: beaioun --- osbenchmark/utils/io.py | 40 +++++++++++++++++++++++++++++++++++++++- setup.py | 2 ++ tests/utils/io_test.py | 4 ++-- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/osbenchmark/utils/io.py b/osbenchmark/utils/io.py index 4ef02a62e..70be80fe6 100644 --- a/osbenchmark/utils/io.py +++ b/osbenchmark/utils/io.py @@ -31,6 +31,7 @@ import tarfile import zipfile from contextlib import suppress +import zstandard as zstd import mmap @@ -249,7 +250,7 @@ def is_archive(name): :return: True iff the given file name is an archive that is also recognized for decompression by Benchmark. """ _, ext = splitext(name) - return ext in [".zip", ".bz2", ".gz", ".tar", ".tar.gz", ".tgz", ".tar.bz2"] + return ext in [".zip", ".bz2", ".gz", ".tar", ".tar.gz", ".tgz", ".tar.bz2", ".zst"] def is_executable(name): @@ -272,6 +273,28 @@ def compress(source_directory, archive_name): _zipdir(source_directory, archive) +def compress_zstd(source_directory, archive_name): + """ + Compress a directory tree using Zstandard compression. + :param source_directory: The source directory to compress. Must be readable. + :param archive_name: The absolute path including the file name of the archive. Must have the extension .zst. + """ + zstc = zstd.ZstdCompressor() + + with open(archive_name, "wb") as archive_file: + with zstc.stream_writer(archive_file) as compressor: + for root, _, files in os.walk(source_directory): + for file in files: + file_path = os.path.join(root, file) + rel_path = os.path.relpath(file_path, source_directory) + # Write the file path (relative) to the archive to recreate the directory structure + compressor.write(rel_path.encode("utf-8")) + with open(file_path, "rb") as source_file: + # Write the content of the file to the archive + for chunk in source_file: + compressor.write(chunk) + + def decompress(zip_name, target_directory): """ Decompresses the provided archive to the target directory. The following file extensions are supported: @@ -283,6 +306,7 @@ def decompress(zip_name, target_directory): * tar.gz * tgz * tar.bz2 + * zst The decompression method is chosen based on the file extension. @@ -303,6 +327,8 @@ def decompress(zip_name, target_directory): _do_decompress_manually(target_directory, zip_name, decompressor_args, decompressor_lib) elif extension in [".tar", ".tar.gz", ".tgz", ".tar.bz2"]: _do_decompress(target_directory, tarfile.open(zip_name)) + elif extension == ".zst": + _do_decompress_zstd(target_directory, zip_name) else: raise RuntimeError("Unsupported file extension [%s]. Cannot decompress [%s]" % (extension, zip_name)) @@ -344,6 +370,18 @@ def _do_decompress_manually_with_lib(target_directory, filename, compressed_file compressed_file.close() +def _do_decompress_zstd(target_directory, filename): + path_without_extension = os.path.splitext(os.path.basename(filename))[0] + try: + with open(filename, 'rb') as compressed_file: + zstd_decompressor = zstd.ZstdDecompressor() + with open(os.path.join(target_directory, path_without_extension), "wb") as new_file: + for chunk in zstd_decompressor.read_to_iter(compressed_file.read): + new_file.write(chunk) + except Exception as e: + logging.getLogger(__name__).warning("Failed to decompress [%s] with Zstandard. Error: %s.", filename, str(e)) + + def _do_decompress(target_directory, compressed_file): try: compressed_file.extractall(path=target_directory) diff --git a/setup.py b/setup.py index f93e9d000..a25b359f7 100644 --- a/setup.py +++ b/setup.py @@ -98,6 +98,8 @@ def str_from_file(name): # jmespath: MIT # s3transfer: Apache 2.0 "boto3==1.28.62", + # Licence: BSD-3-Clause + "zstandard==0.22.0", ] tests_require = [ diff --git a/tests/utils/io_test.py b/tests/utils/io_test.py index 262b3aedb..13c399c65 100644 --- a/tests/utils/io_test.py +++ b/tests/utils/io_test.py @@ -76,7 +76,7 @@ def test_has_extension(self): class TestDecompression: def test_decompresses_supported_file_formats(self): - for ext in ["zip", "gz", "bz2", "tgz", "tar.bz2", "tar.gz"]: + for ext in ["zip", "gz", "bz2", "tgz", "tar.bz2", "tar.gz", "zst"]: tmp_dir = tempfile.mkdtemp() archive_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", f"test.txt.{ext}") decompressed_path = os.path.join(tmp_dir, "test.txt") @@ -90,7 +90,7 @@ def test_decompresses_supported_file_formats(self): @mock.patch.object(io, "is_executable", return_value=False) def test_decompresses_supported_file_formats_with_lib_as_failover(self, mocked_is_executable): - for ext in ["zip", "gz", "bz2", "tgz", "tar.bz2", "tar.gz"]: + for ext in ["zip", "gz", "bz2", "tgz", "tar.bz2", "tar.gz", "zst"]: tmp_dir = tempfile.mkdtemp() archive_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", f"test.txt.{ext}") decompressed_path = os.path.join(tmp_dir, "test.txt") From 4f37b4b080f59332a693cc7c6de84f96e9ee8575 Mon Sep 17 00:00:00 2001 From: beaioun Date: Fri, 17 Nov 2023 15:58:26 -0800 Subject: [PATCH 2/2] Let me commit the changes first and see if the error will be cleared. If not I will fix from this point Signed-off-by: beaioun --- osbenchmark/utils/io.py | 12 +++++++----- tests/utils/resources/test.txt.zst | Bin 0 -> 45 bytes 2 files changed, 7 insertions(+), 5 deletions(-) create mode 100644 tests/utils/resources/test.txt.zst diff --git a/osbenchmark/utils/io.py b/osbenchmark/utils/io.py index 70be80fe6..a487be153 100644 --- a/osbenchmark/utils/io.py +++ b/osbenchmark/utils/io.py @@ -31,10 +31,12 @@ import tarfile import zipfile from contextlib import suppress -import zstandard as zstd import mmap + +import zstandard as zstd + from osbenchmark.utils import console @@ -371,15 +373,15 @@ def _do_decompress_manually_with_lib(target_directory, filename, compressed_file def _do_decompress_zstd(target_directory, filename): - path_without_extension = os.path.splitext(os.path.basename(filename))[0] + path_without_extension = basename(splitext(filename)[0]) try: with open(filename, 'rb') as compressed_file: zstd_decompressor = zstd.ZstdDecompressor() with open(os.path.join(target_directory, path_without_extension), "wb") as new_file: - for chunk in zstd_decompressor.read_to_iter(compressed_file.read): + for chunk in zstd_decompressor.read_to_iter(compressed_file): new_file.write(chunk) - except Exception as e: - logging.getLogger(__name__).warning("Failed to decompress [%s] with Zstandard. Error: %s.", filename, str(e)) + finally: + compressed_file.close() def _do_decompress(target_directory, compressed_file): diff --git a/tests/utils/resources/test.txt.zst b/tests/utils/resources/test.txt.zst new file mode 100644 index 0000000000000000000000000000000000000000..ebe886e7758783022bfd951e78584aa618de6f20 GIT binary patch literal 45 zcmdPcs{dC(MUjyqI5D>%Csm;&wW35JEx$;?B{exex1cDsxHvOEFC?|Nq?nfr0AiXC A!vFvP literal 0 HcmV?d00001