From 8809cd511a445379ced4d0b67ea6c38290e3c661 Mon Sep 17 00:00:00 2001
From: Mikhail Yakshin
Date: Mon, 24 Jul 2023 12:11:53 +0100
Subject: [PATCH 1/2] Added ICU charset conversion implementation

---
Review notes:
- Line breaks, indentation and blank context lines were reconstructed from a
  whitespace-collapsed copy of this series; the post-image blob ids on the
  `index` lines were not regenerated.
- The KS_STR_ENCODING_ICU expectations in tests/unittest.cpp are still the
  "xxx" placeholders; replace them with the messages the ICU backend
  actually produces before merging.

 .build/build            |  2 +-
 CMakeLists.txt          | 16 ++++++++++++++--
 Common.cmake            |  2 ++
 kaitai/kaitaistream.cpp | 40 +++++++++++++++++++++++++++++++++++++++-
 tests/unittest.cpp      | 10 ++++++++++
 5 files changed, 66 insertions(+), 4 deletions(-)

diff --git a/.build/build b/.build/build
index e1109b4..b832c5c 100755
--- a/.build/build
+++ b/.build/build
@@ -4,5 +4,5 @@ cd "$(dirname "$0")"/..
 
 mkdir -p build
 cd build
-cmake ..
+cmake -DSTRING_ENCODING_TYPE="${ENCODING_TYPE:-ICONV}" ..
 cmake --build .
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4c054fc..dc4c535 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,13 @@ set (CMAKE_INCLUDE_CURRENT_DIR ON)
 find_package(ZLIB)
 find_package(Iconv)
 
+find_package(ICU COMPONENTS uc io)
+
+set(ICU_FOUND FALSE)
+if(ICU_INCLUDE_DIRS AND ICU_LIBRARIES)
+    SET(ICU_FOUND TRUE)
+endif()
+
 set (HEADERS
     kaitai/kaitaistream.h
     kaitai/kaitaistruct.h
@@ -17,11 +24,11 @@ set (SOURCES
     kaitai/kaitaistream.cpp
 )
 
-set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|NONE|...)")
+set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|ICU|NONE|...)")
 
 set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
-add_library (${PROJECT_NAME} SHARED ${HEADERS} ${SOURCES})
+add_library(${PROJECT_NAME} SHARED ${HEADERS} ${SOURCES})
 set_property(TARGET ${PROJECT_NAME} PROPERTY PUBLIC_HEADER ${HEADERS})
 
 if (ZLIB_FOUND)
@@ -33,6 +40,11 @@ if(Iconv_FOUND)
     target_link_libraries(${PROJECT_NAME} PRIVATE Iconv::Iconv)
 endif()
 
+if(ICU_FOUND)
+    target_include_directories(${PROJECT_NAME} PRIVATE ${ICU_INCLUDE_DIRS})
+    target_link_libraries(${PROJECT_NAME} PRIVATE ${ICU_LIBRARIES})
+endif()
+
 include(Common.cmake)
 
 install(TARGETS ${PROJECT_NAME}
diff --git a/Common.cmake b/Common.cmake
index 9c280ee..3c32b11 100644
--- a/Common.cmake
+++ b/Common.cmake
@@ -2,6 +2,8 @@ if (STRING_ENCODING_TYPE STREQUAL "ICONV")
     target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICONV)
 elseif (STRING_ENCODING_TYPE STREQUAL "WIN32API")
     target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_WIN32API)
+elseif (STRING_ENCODING_TYPE STREQUAL "ICU")
+    target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICU)
 elseif (STRING_ENCODING_TYPE STREQUAL "NONE")
     target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_NONE)
 else()
diff --git a/kaitai/kaitaistream.cpp b/kaitai/kaitaistream.cpp
index f3d95eb..94098c5 100644
--- a/kaitai/kaitaistream.cpp
+++ b/kaitai/kaitaistream.cpp
@@ -872,6 +872,44 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) {
 
     return utf8;
 }
+#elif defined(KS_STR_ENCODING_ICU)
+#include <unicode/ucnv.h>
+
+// Converts `src` from the `src_enc` charset to KS_STR_DEFAULT_ENCODING using
+// ICU's ucnv_convert(). Throws unknown_encoding when ICU reports
+// U_FILE_ACCESS_ERROR, and bytes_to_str_error (carrying the ICU error name)
+// for any other conversion failure.
+std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) {
+    // Start with a buffer length of double the source length.
+    size_t init_dst_len = src.length() * 2;
+    std::string dst(init_dst_len, ' ');
+
+    UErrorCode err = U_ZERO_ERROR;
+    int32_t dst_len = ucnv_convert(KS_STR_DEFAULT_ENCODING, src_enc, &dst[0], init_dst_len, src.c_str(), src.length(), &err);
+
+    if (err == U_BUFFER_OVERFLOW_ERROR) {
+        // We need a bigger buffer, but at least we know how much space exactly we need now
+        dst.resize(dst_len, ' ');
+
+        // Try again with the new buffer
+        err = U_ZERO_ERROR;
+        dst_len = ucnv_convert(KS_STR_DEFAULT_ENCODING, src_enc, &dst[0], dst_len, src.c_str(), src.length(), &err);
+    } else if (!U_FAILURE(err)) {
+        // Conversion succeeded on the first try, shrink the buffer to fit
+        dst.resize(dst_len);
+    }
+
+    if (U_FAILURE(err)) {
+        // Conversion failed
+        if (err == U_FILE_ACCESS_ERROR) {
+            throw unknown_encoding(src_enc);
+        } else {
+            throw bytes_to_str_error(u_errorName(err));
+        }
+    }
+
+    return dst;
+}
 #else
-#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_NONE
+#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_ICU, KS_STR_ENCODING_NONE
 #endif
diff --git a/tests/unittest.cpp b/tests/unittest.cpp
index 0209ae5..85045dd 100644
--- a/tests/unittest.cpp
+++ b/tests/unittest.cpp
@@ -274,6 +274,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_euc_jp_too_short)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("xxx"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
@@ -291,6 +293,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_too_short)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("xxx"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
@@ -307,6 +311,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_two_bytes)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EILSEQ"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("xxx"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
@@ -324,6 +330,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf_16le_odd_bytes)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: incomplete"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("xxx"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif
@@ -342,6 +350,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf_16le_incomplete_high_surroga
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
 #elif defined(KS_STR_ENCODING_WIN32API)
         EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: WideCharToMultiByte"));
+#elif defined(KS_STR_ENCODING_ICU)
+        EXPECT_EQ(e.what(), std::string("xxx"));
 #else
 #error Unknown KS_STR_ENCODING
 #endif

From 10defe71cd6ecbf3120272e3a7062a0ae0951a3d Mon Sep 17 00:00:00 2001
From: Mikhail Yakshin
Date: Thu, 27 Jul 2023 16:59:13 +0100
Subject: [PATCH 2/2] Added ICU to build matrix on linux

---
 .github/workflows/build.yml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 159a404..f3ec90c 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -9,11 +9,22 @@ on:
 jobs:
   linux:
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        encoding:
+          - ICONV
+          - ICU
+    env:
+      ENCODING_TYPE: ${{matrix.encoding}}
     steps:
       - uses: actions/checkout@v3
       - name: restore
         run: |
           sudo apt-get install -y libgtest-dev
+      - name: restore ICU
+        run: |
+          sudo apt-get install -y libicu-dev
+        if: matrix.encoding == 'ICU'
       - name: build
         run: .build/build
       - name: unittest