y-scope · davemarco · Dec 28, 2024 · Dec 28, 2024 · Dec 28, 2024 · Dec 28, 2024
diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
@@ -505,6 +505,9 @@ set(SOURCE_FILES_unitTest
         src/clp/streaming_archive/reader/Segment.hpp
         src/clp/streaming_archive/reader/SegmentManager.cpp
         src/clp/streaming_archive/reader/SegmentManager.hpp
+        src/clp/streaming_archive/single_file_archive/Defs.hpp
+        src/clp/streaming_archive/single_file_archive/writer.cpp
+        src/clp/streaming_archive/single_file_archive/writer.hpp
         src/clp/streaming_archive/writer/Archive.cpp
         src/clp/streaming_archive/writer/Archive.hpp
         src/clp/streaming_archive/writer/File.cpp

diff --git a/components/core/src/clp/clp/CMakeLists.txt b/components/core/src/clp/clp/CMakeLists.txt
@@ -108,6 +108,9 @@ set(
         ../streaming_archive/reader/Segment.hpp
         ../streaming_archive/reader/SegmentManager.cpp
         ../streaming_archive/reader/SegmentManager.hpp
+        ../streaming_archive/single_file_archive/Defs.hpp
+        ../streaming_archive/single_file_archive/writer.cpp
+        ../streaming_archive/single_file_archive/writer.hpp
         ../streaming_archive/writer/Archive.cpp
         ../streaming_archive/writer/Archive.hpp
         ../streaming_archive/writer/File.cpp

diff --git a/components/core/src/clp/clp/CommandLineArguments.cpp b/components/core/src/clp/clp/CommandLineArguments.cpp
@@ -373,6 +373,10 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) {
                             ->default_value(m_schema_file_path),
                     "Path to a schema file. If not specified, heuristics are used to determine "
                     "dictionary variables. See README-Schema.md for details."
+            )(
+                    "single-file-archive",
+                    po::bool_switch(&m_single_file_archive),
+                    "Output archive as a single-file archive"
             );
 
             po::options_description all_compression_options;

diff --git a/components/core/src/clp/clp/CommandLineArguments.hpp b/components/core/src/clp/clp/CommandLineArguments.hpp
@@ -23,6 +23,7 @@ class CommandLineArguments : public CommandLineArgumentsBase {
     explicit CommandLineArguments(std::string const& program_name)
             : CommandLineArgumentsBase(program_name),
               m_show_progress(false),
+              m_single_file_archive(false),
               m_sort_input_files(true),
               m_print_archive_stats_progress(false),
               m_target_segment_uncompressed_size(1L * 1024 * 1024 * 1024),
@@ -45,6 +46,8 @@ class CommandLineArguments : public CommandLineArgumentsBase {
 
     bool show_progress() const { return m_show_progress; }
 
+    bool get_use_single_file_archive() const { return m_single_file_archive; }  // CLI switch value
+
     bool sort_input_files() const { return m_sort_input_files; }
 
     bool print_archive_stats_progress() const { return m_print_archive_stats_progress; }
@@ -92,6 +95,7 @@ class CommandLineArguments : public CommandLineArgumentsBase {
     std::string m_output_dir;
     std::string m_schema_file_path;
     bool m_show_progress;
+    bool m_single_file_archive;
     bool m_print_archive_stats_progress;
     size_t m_target_encoded_file_size;
     size_t m_target_segment_uncompressed_size;

diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp
@@ -243,7 +243,9 @@ void FileCompressor::parse_and_encode_with_heuristic(
 
     // Parse content from file
     while (m_message_parser.parse_next_message(true, reader, m_parsed_message)) {
-        if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) {
+        if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts
+            && false == archive_writer.get_use_single_file_archive())
+        {
             split_file_and_archive(
                     archive_user_config,
                     path_for_compression,
@@ -337,7 +339,9 @@ bool FileCompressor::try_compressing_as_archive(
             parent_directories.emplace(file_parent_path);
         }
 
-        if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) {
+        if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts
+            && false == archive_writer.get_use_single_file_archive())
+        {
             split_archive(archive_user_config, archive_writer);
         }
 
@@ -537,7 +541,9 @@ std::error_code FileCompressor::compress_ir_stream_by_encoding(
         }
 
         // Split archive/encoded file if necessary before writing the new event
-        if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts) {
+        if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts
+            && false == archive.get_use_single_file_archive())
+        {
             split_file_and_archive(
                     archive_user_config,
                     path,

diff --git a/components/core/src/clp/clp/compression.cpp b/components/core/src/clp/clp/compression.cpp
@@ -107,6 +107,7 @@ bool compress(
     archive_user_config.global_metadata_db = global_metadata_db.get();
     archive_user_config.print_archive_stats_progress
             = command_line_args.print_archive_stats_progress();
+    archive_user_config.use_single_file_archive = command_line_args.get_use_single_file_archive();
 
     // Open Archive
     streaming_archive::writer::Archive archive_writer;
@@ -135,7 +136,9 @@ bool compress(
         );
     }
     for (auto it = files_to_compress.cbegin(); it != files_to_compress.cend(); ++it) {
-        if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) {
+        if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries
+            && false == archive_writer.get_use_single_file_archive())
+        {
             split_archive(archive_user_config, archive_writer);
         }
         if (false
@@ -163,7 +166,9 @@ bool compress(
          file_group_id_comparator);
     // Compress grouped files
     for (auto const& file_to_compress : grouped_files_to_compress) {
-        if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) {
+        if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries
+            && false == archive_writer.get_use_single_file_archive())
+        {
             split_archive(archive_user_config, archive_writer);
         }
         if (false

diff --git a/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp b/components/core/src/clp/streaming_archive/ArchiveMetadata.hpp
@@ -4,11 +4,15 @@
 #include <cstdint>
 
 #include "../Defs.h"
+#include "../ffi/encoding_methods.hpp"
 #include "../FileReader.hpp"
 #include "../FileWriter.hpp"
 #include "Constants.hpp"
 
 namespace clp::streaming_archive {
+
+static constexpr std::string_view cCompressionTypeZstd = "ZSTD";
+
 /**
  * A class to encapsulate metadata directly relating to an archive.
  */
@@ -79,6 +83,18 @@ class ArchiveMetadata {
 
     [[nodiscard]] auto get_end_timestamp() const { return m_end_timestamp; }
 
+    [[nodiscard]] auto get_variable_encoding_methods_version() const -> std::string const& {
+        return m_variable_encoding_methods_version;  // default ffi::cVariableEncodingMethodsVersion
+    }
+
+    [[nodiscard]] auto get_variables_schema_version() const -> std::string const& {
+        return m_variables_schema_version;  // in-class default: ffi::cVariablesSchemaVersion
+    }
+
+    [[nodiscard]] auto get_compression_type() const -> std::string const& {
+        return m_compression_type;  // in-class default: cCompressionTypeZstd ("ZSTD")
+    }
+
     /**
      * Expands the archive's time range based to encompass the given time range
      * @param begin_timestamp
@@ -102,6 +118,12 @@ class ArchiveMetadata {
     // The size of the archive
     uint64_t m_compressed_size{0};
     uint64_t m_dynamic_compressed_size{0};
+    // TODO: The following fields are used in single-file archive; however, they are not
+    // currently part of multi-file archive metadata. Modifying multi-file archive metadata
+    // disk format is potentially a breaking change and not currently required.
+    std::string m_variable_encoding_methods_version{ffi::cVariableEncodingMethodsVersion};
+    std::string m_variables_schema_version{ffi::cVariablesSchemaVersion};
+    std::string m_compression_type{cCompressionTypeZstd};
 };
 }  // namespace clp::streaming_archive
 

diff --git a/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp b/components/core/src/clp/streaming_archive/single_file_archive/Defs.hpp
@@ -0,0 +1,84 @@
+#ifndef CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_DEFS_HPP
+#define CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_DEFS_HPP
+
+#include <cstdint>
+#include <string>
+
+#include "../../Defs.h"
+#include "../Constants.hpp"
+#include "msgpack.hpp"
+
+namespace clp::streaming_archive::single_file_archive {
+
+using single_file_archive_format_version_t = uint32_t;  // type of the packed cArchiveVersion
+
+// Single-file archive format version: major in bits 24-31, minor in bits 16-23, patch in bits 0-15.
+constexpr uint8_t cArchiveMajorVersion{0};
+constexpr uint8_t cArchiveMinorVersion{1};
+constexpr uint16_t cArchivePatchVersion{1};
+constexpr single_file_archive_format_version_t cArchiveVersion{
+        cArchiveMajorVersion << 24 | cArchiveMinorVersion << 16 | cArchivePatchVersion
+};  // evaluates to 0x00010001 for version 0.1.1
+
+static constexpr size_t cNumMagicNumberChars{4};  // length of SingleFileArchiveHeader::magic
+static constexpr std::array<uint8_t, cNumMagicNumberChars>
+        cUnstructuredSfaMagicNumber{'Y', 'C', 'L', 'P'};  // the ASCII bytes "YCLP"
+static constexpr std::string_view cUnstructuredSfaExtension{".clp"};
+static constexpr size_t cFileSizeWarningThreshold{100L * 1024 * 1024};  // 100 * 2^20 (bytes?)
+
+static constexpr size_t cNumStaticFiles{5};  // number of entries in cStaticArchiveFileNames
+constexpr std::array<char const*, cNumStaticFiles> cStaticArchiveFileNames{
+        cMetadataDBFileName,
+        cLogTypeDictFilename,
+        cLogTypeSegmentIndexFilename,
+        cVarDictFilename,
+        cVarSegmentIndexFilename
+};  // the five names are declared outside this header (presumably ../Constants.hpp - confirm)
+
+static constexpr size_t cNumUnused{6};  // length of SingleFileArchiveHeader::unused
+
+struct __attribute__((packed)) SingleFileArchiveHeader {  // packed: no padding between members
+    std::array<uint8_t, cNumMagicNumberChars> magic;  // NOTE(review): no direct #include <array>
+    single_file_archive_format_version_t version;  // uint32_t alias declared above
+    uint64_t metadata_size;  // presumably the byte size of the msgpack metadata - confirm
+    std::array<uint64_t, cNumUnused> unused;  // cNumUnused (6) 64-bit slots
+};  // fields total 4 + 4 + 8 + 6 * 8 = 64 bytes
+
+struct FileInfo {  // element type of SingleFileArchiveMetadata::archive_files
+    std::string n;  // presumably the file's name - confirm against the writer
+    uint64_t o;  // presumably the file's offset - confirm against the writer
+    MSGPACK_DEFINE_MAP(n, o);  // per msgpack-c docs: packs a map keyed by the member names
+};
+
+struct MultiFileArchiveMetadata {  // embedded in SingleFileArchiveMetadata as archive_metadata
+    archive_format_version_t archive_format_version;  // type is declared outside this header
+    std::string variable_encoding_methods_version;
+    std::string variables_schema_version;
+    std::string compression_type;
+    std::string creator_id;
+    epochtime_t begin_timestamp;  // epochtime_t is declared outside this header
+    epochtime_t end_timestamp;
+    uint64_t uncompressed_size;  // presumably in bytes - confirm
+    uint64_t compressed_size;  // presumably in bytes - confirm
+    MSGPACK_DEFINE_MAP(  // per msgpack-c docs: packs a map keyed by the member names
+            archive_format_version,
+            variable_encoding_methods_version,
+            variables_schema_version,
+            compression_type,
+            creator_id,
+            begin_timestamp,
+            end_timestamp,
+            uncompressed_size,
+            compressed_size
+    );
+};
+
+struct SingleFileArchiveMetadata {  // groups the file list, archive metadata and segment count
+    std::vector<FileInfo> archive_files;  // NOTE(review): no direct #include <vector>
+    MultiFileArchiveMetadata archive_metadata;
+    uint64_t num_segments;
+    MSGPACK_DEFINE_MAP(archive_files, archive_metadata, num_segments);  // keys = member names
+};
+}  // namespace clp::streaming_archive::single_file_archive
+
+#endif  // CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_DEFS_HPP