-
Notifications
You must be signed in to change notification settings - Fork 72
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(clp): Add the write path for single-file archives. #646
base: main
Are you sure you want to change the base?
Changes from all commits
a5ef64f
615fafd
b84c2da
d1ec9fe
869f1b3
d84c002
f4136f8
6bbf12e
5c75147
c1f12df
0ab6e0e
d4ed4f6
82b9802
393049b
5428403
7e261f7
0bd9b27
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -373,6 +373,10 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { | |||||
->default_value(m_schema_file_path), | ||||||
"Path to a schema file. If not specified, heuristics are used to determine " | ||||||
"dictionary variables. See README-Schema.md for details." | ||||||
)( | ||||||
"single-file-archive", | ||||||
po::bool_switch(&m_single_file_archive), | ||||||
"Output archive as a single-file" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Also, should we support an option that allows the user to specify the filename? |
||||||
); | ||||||
|
||||||
po::options_description all_compression_options; | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ class CommandLineArguments : public CommandLineArgumentsBase { | |
explicit CommandLineArguments(std::string const& program_name) | ||
: CommandLineArgumentsBase(program_name), | ||
m_show_progress(false), | ||
m_single_file_archive(false), | ||
m_sort_input_files(true), | ||
m_print_archive_stats_progress(false), | ||
m_target_segment_uncompressed_size(1L * 1024 * 1024 * 1024), | ||
|
@@ -45,6 +46,8 @@ class CommandLineArguments : public CommandLineArgumentsBase { | |
|
||
bool show_progress() const { return m_show_progress; } | ||
|
||
bool get_use_single_file_archive() const { return m_single_file_archive; } | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: we could name it "use_single_file_archive", or maybe even "single_file_archive", to match the other boolean getters. |
||
|
||
bool sort_input_files() const { return m_sort_input_files; } | ||
|
||
bool print_archive_stats_progress() const { return m_print_archive_stats_progress; } | ||
|
@@ -92,6 +95,7 @@ class CommandLineArguments : public CommandLineArgumentsBase { | |
std::string m_output_dir; | ||
std::string m_schema_file_path; | ||
bool m_show_progress; | ||
bool m_single_file_archive; | ||
bool m_print_archive_stats_progress; | ||
size_t m_target_encoded_file_size; | ||
size_t m_target_segment_uncompressed_size; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -243,7 +243,9 @@ void FileCompressor::parse_and_encode_with_heuristic( | |
|
||
// Parse content from file | ||
while (m_message_parser.parse_next_message(true, reader, m_parsed_message)) { | ||
if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { | ||
if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just thinking: would it be cleaner to add a new method, named something like "should_split", to archive_writer and move this `if` logic into that method? Right now the same `if` condition is duplicated in multiple places, which is inefficient and error-prone, since one change requires updating every copy. |
||
&& false == archive_writer.get_use_single_file_archive()) | ||
{ | ||
split_file_and_archive( | ||
archive_user_config, | ||
path_for_compression, | ||
|
@@ -337,7 +339,9 @@ bool FileCompressor::try_compressing_as_archive( | |
parent_directories.emplace(file_parent_path); | ||
} | ||
|
||
if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { | ||
if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dicts | ||
&& false == archive_writer.get_use_single_file_archive()) | ||
{ | ||
split_archive(archive_user_config, archive_writer); | ||
} | ||
|
||
|
@@ -537,7 +541,9 @@ std::error_code FileCompressor::compress_ir_stream_by_encoding( | |
} | ||
|
||
// Split archive/encoded file if necessary before writing the new event | ||
if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts) { | ||
if (archive.get_data_size_of_dictionaries() >= target_data_size_of_dicts | ||
&& false == archive.get_use_single_file_archive()) | ||
{ | ||
split_file_and_archive( | ||
archive_user_config, | ||
path, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -107,6 +107,7 @@ bool compress( | |
archive_user_config.global_metadata_db = global_metadata_db.get(); | ||
archive_user_config.print_archive_stats_progress | ||
= command_line_args.print_archive_stats_progress(); | ||
archive_user_config.use_single_file_archive = command_line_args.get_use_single_file_archive(); | ||
|
||
// Open Archive | ||
streaming_archive::writer::Archive archive_writer; | ||
|
@@ -135,7 +136,9 @@ bool compress( | |
); | ||
} | ||
for (auto it = files_to_compress.cbegin(); it != files_to_compress.cend(); ++it) { | ||
if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) { | ||
if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto |
||
&& false == archive_writer.get_use_single_file_archive()) | ||
{ | ||
split_archive(archive_user_config, archive_writer); | ||
} | ||
if (false | ||
|
@@ -163,7 +166,9 @@ bool compress( | |
file_group_id_comparator); | ||
// Compress grouped files | ||
for (auto const& file_to_compress : grouped_files_to_compress) { | ||
if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries) { | ||
if (archive_writer.get_data_size_of_dictionaries() >= target_data_size_of_dictionaries | ||
&& false == archive_writer.get_use_single_file_archive()) | ||
{ | ||
split_archive(archive_user_config, archive_writer); | ||
} | ||
if (false | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#ifndef CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_DEFS_HPP | ||
#define CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_DEFS_HPP | ||
|
||
#include <cstdint> | ||
#include <string> | ||
|
||
#include "../../Defs.h" | ||
#include "../Constants.hpp" | ||
#include "msgpack.hpp" | ||
|
||
namespace clp::streaming_archive::single_file_archive { | ||
|
||
// Integer type that holds the packed single-file archive format version.
using single_file_archive_format_version_t = uint32_t;

// Single file archive version.
// cArchiveVersion packs the three components as (most- to least-significant bits):
//   [31:24] major | [23:16] minor | [15:0] patch
constexpr uint8_t cArchiveMajorVersion{0};
constexpr uint8_t cArchiveMinorVersion{1};
constexpr uint16_t cArchivePatchVersion{1};
// Each component is widened to the unsigned 32-bit version type *before* it is shifted. Without
// the casts the `uint8_t`/`uint16_t` operands are promoted to signed `int`, so a major version of
// 128 or more would shift into the sign bit and the brace-initialization below would be rejected
// as a narrowing conversion.
constexpr single_file_archive_format_version_t cArchiveVersion{
        (static_cast<single_file_archive_format_version_t>(cArchiveMajorVersion) << 24)
        | (static_cast<single_file_archive_format_version_t>(cArchiveMinorVersion) << 16)
        | static_cast<single_file_archive_format_version_t>(cArchivePatchVersion)
};
|
||
// Length of the `magic` array in SingleFileArchiveHeader. | ||
static constexpr size_t cNumMagicNumberChars{4}; | ||
// Magic-number bytes 'Y', 'C', 'L', 'P'; per the name, these mark an unstructured single-file | ||
// archive. | ||
// NOTE(review): std::array and std::string_view are used below, but <array> and <string_view> are | ||
// not among the #includes visible in this header; presumably pulled in transitively -- confirm. | ||
static constexpr std::array<uint8_t, cNumMagicNumberChars> | ||
cUnstructuredSfaMagicNumber{'Y', 'C', 'L', 'P'}; | ||
// File extension (".clp"); per the name, used for unstructured single-file archives. | ||
static constexpr std::string_view cUnstructuredSfaExtension{".clp"}; | ||
// 100 * 1024 * 1024 (104,857,600). Presumably a file size in bytes above which a warning is | ||
// issued; the use site is not visible here -- confirm. | ||
static constexpr size_t cFileSizeWarningThreshold{100L * 1024 * 1024}; | ||
|
||
// Number of entries in cStaticArchiveFileNames. | ||
static constexpr size_t cNumStaticFiles{5}; | ||
// Fixed-order list of five archive file names. The name constants are not defined in this | ||
// header; presumably they come from "../Constants.hpp" -- confirm. | ||
constexpr std::array<char const*, cNumStaticFiles> cStaticArchiveFileNames{ | ||
cMetadataDBFileName, | ||
cLogTypeDictFilename, | ||
cLogTypeSegmentIndexFilename, | ||
cVarDictFilename, | ||
cVarSegmentIndexFilename | ||
}; | ||
|
||
// Number of 64-bit elements in SingleFileArchiveHeader::unused. | ||
static constexpr size_t cNumUnused{6}; | ||
|
||
// Header of a single-file archive. `__attribute__((packed))` (a GCC/Clang extension) removes | ||
// padding between members, so the fields total 4 + 4 + 8 + 6 * 8 = 64 bytes (assuming std::array | ||
// itself adds no padding). | ||
struct __attribute__((packed)) SingleFileArchiveHeader { | ||
// Magic number; same element type and length as cUnstructuredSfaMagicNumber. | ||
std::array<uint8_t, cNumMagicNumberChars> magic; | ||
// Format version; same type as cArchiveVersion. | ||
single_file_archive_format_version_t version; | ||
// Presumably the size in bytes of the serialized metadata -- TODO confirm against the writer. | ||
uint64_t metadata_size; | ||
// No meaning is assigned in this header; per the name, unused space. | ||
std::array<uint64_t, cNumUnused> unused; | ||
}; | ||
|
||
// Element type of SingleFileArchiveMetadata::archive_files. MSGPACK_DEFINE_MAP makes msgpack | ||
// serialize the struct as a map keyed by the member names, so `n` and `o` are the serialized keys. | ||
struct FileInfo { | ||
// Presumably the file's name -- confirm. | ||
std::string n; | ||
// Presumably the file's offset within the archive -- confirm. | ||
uint64_t o; | ||
MSGPACK_DEFINE_MAP(n, o); | ||
}; | ||
|
||
struct MultiFileArchiveMetadata { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @LinZhihao-723 @kirkrodrigues There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. https://google.github.io/styleguide/cppguide.html#Structs_vs._Classes
Do any of the fields in this struct have relationships between each other? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The relationships between the fields are weak; however, one could argue there is a relationship between the uncompressed and compressed sizes. One could also argue that the ArchiveMetadata class, where the values come from, should be responsible for enforcing the invariant, and not this struct, which is just a temporary container for msgpack serialization. Also note that class members are prefixed with "m_", which would complicate msgpack serialization since by default it just uses the variable names as keys. Lastly, I took this from the clp-s code since I thought it was more elegant than the nlohmann json class serialization interface. |
||
archive_format_version_t archive_format_version; | ||
std::string variable_encoding_methods_version; | ||
std::string variables_schema_version; | ||
std::string compression_type; | ||
std::string creator_id; | ||
epochtime_t begin_timestamp; | ||
epochtime_t end_timestamp; | ||
uint64_t uncompressed_size; | ||
uint64_t compressed_size; | ||
MSGPACK_DEFINE_MAP( | ||
archive_format_version, | ||
variable_encoding_methods_version, | ||
variables_schema_version, | ||
compression_type, | ||
creator_id, | ||
begin_timestamp, | ||
end_timestamp, | ||
uncompressed_size, | ||
compressed_size | ||
); | ||
}; | ||
|
||
// Groups a list of FileInfo entries, a MultiFileArchiveMetadata, and a segment count. | ||
// MSGPACK_DEFINE_MAP makes msgpack serialize it as a map keyed by the member names. | ||
// NOTE(review): std::vector is used here, but <vector> is not among the #includes visible in this | ||
// header; presumably pulled in transitively -- confirm. | ||
struct SingleFileArchiveMetadata { | ||
// Presumably one entry per file stored in the archive -- confirm. | ||
std::vector<FileInfo> archive_files; | ||
MultiFileArchiveMetadata archive_metadata; | ||
// Presumably the number of segments in the archive -- confirm. | ||
uint64_t num_segments; | ||
MSGPACK_DEFINE_MAP(archive_files, archive_metadata, num_segments); | ||
}; | ||
} // namespace clp::streaming_archive::single_file_archive | ||
|
||
#endif // CLP_STREAMING_ARCHIVE_SINGLE_FILE_ARCHIVE_DEFS_HPP |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
💡 Codebase verification
Missing test coverage for the new single-file archive implementation
The new single-file archive files have been properly added to the CMake configuration, but I don't see any corresponding test files for this functionality. While there are several test files in the codebase:
test-clp_s-end_to_end.cpp
Recommended actions:
writer.cpp
🔗 Analysis chain
LGTM! The new single-file archive source files are properly integrated.
The files are correctly placed in a dedicated directory and follow the project's naming conventions.
Let's verify the test coverage for these new files:
🏁 Scripts executed
The following scripts were executed for the analysis:
Script:
Length of output: 175
Script:
Length of output: 1270
Script:
Length of output: 1828