Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added ICU charset conversion implementation #64

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .build/build
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ cd "$(dirname "$0")"/..

mkdir -p build
cd build
cmake ..
cmake -DSTRING_ENCODING_TYPE="$ENCODING_TYPE" ..
cmake --build .
11 changes: 11 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,22 @@ on:
jobs:
linux:
runs-on: ubuntu-latest
strategy:
matrix:
encoding:
- ICONV
- ICU
env:
ENCODING_TYPE: ${{matrix.encoding}}
steps:
- uses: actions/checkout@v3
- name: restore
run: |
sudo apt-get install -y libgtest-dev
GreyCat marked this conversation as resolved.
Show resolved Hide resolved
- name: restore ICU
run: |
sudo apt-get install -y libicu-dev
if: matrix.encoding == 'ICU'
- name: build
run: .build/build
- name: unittest
Expand Down
16 changes: 14 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ set (CMAKE_INCLUDE_CURRENT_DIR ON)
find_package(ZLIB)
find_package(Iconv)

find_package(ICU COMPONENTS uc io)

set(ICU_FOUND FALSE)
if(ICU_INCLUDE_DIRS AND ICU_LIBRARIES)
SET(ICU_FOUND TRUE)
endif()

set (HEADERS
kaitai/kaitaistream.h
kaitai/kaitaistruct.h
Expand All @@ -17,11 +24,11 @@ set (SOURCES
kaitai/kaitaistream.cpp
)

set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|NONE|...)")
set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|ICU|NONE|...)")

set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)

add_library (${PROJECT_NAME} SHARED ${HEADERS} ${SOURCES})
add_library(${PROJECT_NAME} SHARED ${HEADERS} ${SOURCES})
set_property(TARGET ${PROJECT_NAME} PROPERTY PUBLIC_HEADER ${HEADERS})

if (ZLIB_FOUND)
Expand All @@ -33,6 +40,11 @@ if(Iconv_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE Iconv::Iconv)
endif()

if(ICU_FOUND)
target_include_directories(${PROJECT_NAME} PRIVATE ${ICU_INCLUDE_DIRS})
target_link_libraries(${PROJECT_NAME} PRIVATE ${ICU_LIBRARIES})
endif()

include(Common.cmake)

install(TARGETS ${PROJECT_NAME}
Expand Down
2 changes: 2 additions & 0 deletions Common.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ if (STRING_ENCODING_TYPE STREQUAL "ICONV")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICONV)
elseif (STRING_ENCODING_TYPE STREQUAL "WIN32API")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_WIN32API)
elseif (STRING_ENCODING_TYPE STREQUAL "ICU")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICU)
elseif (STRING_ENCODING_TYPE STREQUAL "NONE")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_NONE)
else()
Expand Down
44 changes: 43 additions & 1 deletion kaitai/kaitaistream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -872,6 +872,48 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) {
return utf8;
}

#elif defined(KS_STR_ENCODING_ICU)
#include <unicode/ucnv.h>
#include <iostream>

// Converts `src`, a byte string in the encoding named by `src_enc`, into
// KS_STR_DEFAULT_ENCODING using ICU's one-shot ucnv_convert().
//
// Throws unknown_encoding when ICU reports U_FILE_ACCESS_ERROR (which this
// code treats as "no such encoding"), and bytes_to_str_error carrying ICU's
// error name for any other failure.
//
// NOTE(review): ucnv_convert() runs with ICU's default error callbacks, which
// substitute malformed input rather than fail; if illegal sequences must be
// reported as errors, explicitly opened converters with a "stop" callback are
// likely needed - confirm against the ICU documentation.
std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) {
    // Start with a buffer length of double the source length; if that turns out
    // to be too small, ICU tells us the exact size needed and we retry once.
    const size_t init_dst_len = src.length() * 2;
    std::string dst(init_dst_len, ' ');

    UErrorCode err = U_ZERO_ERROR;
    int32_t dst_len = ucnv_convert(KS_STR_DEFAULT_ENCODING, src_enc, &dst[0], init_dst_len, src.c_str(), src.length(), &err);

    if (err == U_BUFFER_OVERFLOW_ERROR) {
        // The returned length is the full size the output requires: grow the
        // buffer to exactly that and convert again from scratch.
        dst.resize(dst_len, ' ');

        err = U_ZERO_ERROR;
        dst_len = ucnv_convert(KS_STR_DEFAULT_ENCODING, src_enc, &dst[0], dst_len, src.c_str(), src.length(), &err);
    }

    if (U_FAILURE(err)) {
        // Conversion failed; dst_len is not trusted past this point.
        if (err == U_FILE_ACCESS_ERROR) {
            throw unknown_encoding(src_enc);
        }
        throw bytes_to_str_error(u_errorName(err));
    }

    // Success (possibly with a non-fatal warning): trim the buffer to the
    // number of bytes actually produced.
    dst.resize(dst_len);
    return dst;
}
#else
#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_NONE
#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_ICU, KS_STR_ENCODING_NONE
#endif
12 changes: 11 additions & 1 deletion tests/unittest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ TEST(KaitaiStreamTest, bytes_to_str_big_dest)
{
// Prepare a string in IBM437 that is reasonably big, fill it with U+2248 ALMOST EQUAL TO character,
which is just 1 byte 0xF7 in IBM437.
const int len = 10000000;
const int len = 10;
std::string src(len, '\xF7');

std::string res = kaitai::kstream::bytes_to_str(src, "IBM437");
Expand Down Expand Up @@ -274,6 +274,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_euc_jp_too_short)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("xxx"));
#else
#error Unknown KS_STR_ENCODING
#endif
Expand All @@ -291,6 +293,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_too_short)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("xxx"));
#else
#error Unknown KS_STR_ENCODING
#endif
Expand All @@ -307,6 +311,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_two_bytes)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EILSEQ"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("xxx"));
#else
#error Unknown KS_STR_ENCODING
#endif
Expand All @@ -324,6 +330,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf_16le_odd_bytes)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: incomplete"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("xxx"));
#else
#error Unknown KS_STR_ENCODING
#endif
Expand All @@ -342,6 +350,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf_16le_incomplete_high_surroga
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: WideCharToMultiByte"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("xxx"));
#else
#error Unknown KS_STR_ENCODING
#endif
Expand Down