From 7d0acf533cd6282d3712c6d9be140de4c1dda6c4 Mon Sep 17 00:00:00 2001 From: RDW Date: Thu, 16 Jan 2025 00:53:14 +0100 Subject: [PATCH 1/5] Runtime: Export the low-level iconv API as well The bindings currently only export a decently-fast but rather inflexible iconv wrapper. For some use cases (e.g., gradually converting incoming bytes or resuming failed conversions) it would be useful to have access to all interfaces. This low-level API is rather difficult to use and also pretty slow, so maybe it'd be better to create an async/continuation-style API in C++ and export that. However, the maintenance cost of just exposing the existing function is arguably lower and it's a niche use case anyway. --- Runtime/Bindings/FFI/iconv/iconv.lua | 5 ++++- Runtime/Bindings/FFI/iconv/iconv_aliases.h | 1 + Runtime/Bindings/FFI/iconv/iconv_exports.h | 3 +++ Runtime/Bindings/FFI/iconv/iconv_ffi.cpp | 3 +++ Runtime/Bindings/FFI/iconv/iconv_ffi.hpp | 1 + 5 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 Runtime/Bindings/FFI/iconv/iconv_aliases.h diff --git a/Runtime/Bindings/FFI/iconv/iconv.lua b/Runtime/Bindings/FFI/iconv/iconv.lua index d00241d18..be4e122b8 100644 --- a/Runtime/Bindings/FFI/iconv/iconv.lua +++ b/Runtime/Bindings/FFI/iconv/iconv.lua @@ -10,7 +10,7 @@ local tostring = tostring local iconv = {} iconv.cdefs = [[ - +typedef void* iconv_t; typedef struct iconv_result_t { uint8_t status_code; size_t num_bytes_written; @@ -19,6 +19,9 @@ typedef struct iconv_result_t { struct static_iconv_exports_table { iconv_result_t (*iconv_convert)(char* input, size_t input_size, const char* input_encoding, const char* output_encoding, char* output, size_t output_size); + iconv_t (*iconv_open)(const char* input_encoding, const char* output_encoding); + int (*iconv_close)(iconv_t conversion_descriptor); + size_t (*iconv)(iconv_t conversion_descriptor, char** input, size_t* input_size, char** output, size_t* output_size); }; ]] diff --git a/Runtime/Bindings/FFI/iconv/iconv_aliases.h b/Runtime/Bindings/FFI/iconv/iconv_aliases.h new file mode 100644 index 000000000..34a099071 --- /dev/null +++ b/Runtime/Bindings/FFI/iconv/iconv_aliases.h @@ -0,0 +1 @@ +typedef void* iconv_t; \ No newline at end of file diff --git a/Runtime/Bindings/FFI/iconv/iconv_exports.h b/Runtime/Bindings/FFI/iconv/iconv_exports.h index f17b7a388..d4b6b86e3 100644 --- a/Runtime/Bindings/FFI/iconv/iconv_exports.h +++ b/Runtime/Bindings/FFI/iconv/iconv_exports.h @@ -6,4 +6,7 @@ typedef struct iconv_result_t { struct static_iconv_exports_table { iconv_result_t (*iconv_convert)(char* input, size_t input_size, const char* input_encoding, const char* output_encoding, char* output, size_t output_size); + iconv_t (*iconv_open)(const char* input_encoding, const char* output_encoding); + int (*iconv_close)(iconv_t conversion_descriptor); + size_t (*iconv)(iconv_t conversion_descriptor, char** input, size_t* input_size, char** output, size_t* output_size); }; \ No newline at end of file diff --git a/Runtime/Bindings/FFI/iconv/iconv_ffi.cpp b/Runtime/Bindings/FFI/iconv/iconv_ffi.cpp index 59a791af5..3c010f6c9 100644 --- a/Runtime/Bindings/FFI/iconv/iconv_ffi.cpp +++ b/Runtime/Bindings/FFI/iconv/iconv_ffi.cpp @@ -50,6 +50,9 @@ namespace iconv_ffi { void* getExportsTable() { static struct static_iconv_exports_table exports = { .iconv_convert = &iconv_convert, + .iconv_open = &iconv_open, + .iconv_close = &iconv_close, + .iconv = &iconv, }; return &exports; diff --git a/Runtime/Bindings/FFI/iconv/iconv_ffi.hpp b/Runtime/Bindings/FFI/iconv/iconv_ffi.hpp index 6a7fbd513..72d3ae858 100644 --- a/Runtime/Bindings/FFI/iconv/iconv_ffi.hpp +++ b/Runtime/Bindings/FFI/iconv/iconv_ffi.hpp @@ -3,6 +3,7 @@ #include #include +#include #include "iconv_exports.h" namespace iconv_ffi { From 8c135128adda0d81f4bf02da4f2f5e32baf18128 Mon Sep 17 00:00:00 2001 From: RDW Date: Thu, 16 Jan 2025 02:07:49 +0100 Subject: [PATCH 2/5] Tests: Add unit tests for the low-level iconv APIs --- Runtime/Bindings/FFI/iconv/iconv.lua | 4 ++- Tests/BDD/iconv-library.spec.lua | 52 ++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/Runtime/Bindings/FFI/iconv/iconv.lua b/Runtime/Bindings/FFI/iconv/iconv.lua index be4e122b8..0c51a5b72 100644 --- a/Runtime/Bindings/FFI/iconv/iconv.lua +++ b/Runtime/Bindings/FFI/iconv/iconv.lua @@ -7,7 +7,9 @@ local ffi_string = ffi.string local tonumber = tonumber local tostring = tostring -local iconv = {} +local iconv = { + ERROR_CONVERSION_FAILED = ffi.cast("size_t", -1ULL), +} iconv.cdefs = [[ typedef void* iconv_t; diff --git a/Tests/BDD/iconv-library.spec.lua b/Tests/BDD/iconv-library.spec.lua index d1836e75a..a7a39be08 100644 --- a/Tests/BDD/iconv-library.spec.lua +++ b/Tests/BDD/iconv-library.spec.lua @@ -71,6 +71,58 @@ describe("iconv", function() assertEquals(numBytesWritten, 12) end) end) + + describe("iconv_open", function() + local descriptors = {} + before(function() + descriptors.valid = iconv.bindings.iconv_open("CP949", "UTF-8") + descriptors.invalid = iconv.bindings.iconv_open("Not-a-real-encoding", "UTF-8") + assertEquals(table.count(descriptors), 2) + end) + + after(function() + for label, descriptor in pairs(descriptors) do + iconv.bindings.iconv_close(descriptor) -- NOOP if invalid + end + end) + + it("should indicate an error if the requested conversion isn't supported", function() + local descriptor = iconv.bindings.iconv_open("Not-a-real-encoding", "UTF-8") + assertEquals(ffi.cast("size_t", descriptor), iconv.ERROR_CONVERSION_FAILED) + iconv.bindings.iconv_close(descriptor) + end) + + it("should return a valid handle if the conversion is supported", function() + local descriptor = iconv.bindings.iconv_open("CP949", "UTF-8") + assertFalse(ffi.cast("size_t", descriptor) == iconv.ERROR_CONVERSION_FAILED) + iconv.bindings.iconv_close(descriptor) + end) + end) + + describe("iconv", function() + it("should be able to convert Windows encodings to UTF-8", function() + local descriptor = iconv.bindings.iconv_open("UTF-8", "CP949") + assert(descriptor ~= iconv.ERROR_CONVERSION_FAILED, "Failed to create conversion descriptor") + + local input = "\192\175\192\250\192\206\197\205\198\228\192\204\189\186" + local inputSize = ffi.new("size_t[1]", #input) + local inputBuffer = ffi.new("char[?]", #input, input) + local inputRef = ffi.new("char*[1]", inputBuffer) + + local outputSize = ffi.new("size_t[1]", 256) + local outputBuffer = ffi.new("char[256]") + local outputRef = ffi.new("char*[1]", outputBuffer) + + local result = iconv.bindings.iconv(descriptor, inputRef, inputSize, outputRef, outputSize) + assert(result ~= iconv.ERROR_CONVERSION_FAILED, "Conversion failed") + + local convertedSize = 256 - outputSize[0] + local converted = ffi.string(outputBuffer, convertedSize) + assertEquals(converted, "유저인터페이스") + + iconv.bindings.iconv_close(descriptor) + end) + end) end) describe("convert", function() From 70b6dfbd08a8e38145b8b1a31a01d8b7b74ac1b1 Mon Sep 17 00:00:00 2001 From: RDW Date: Thu, 16 Jan 2025 03:46:34 +0100 Subject: [PATCH 3/5] Perf: Add a benchmark for the iconv FFI bindings Certainly not the most representative workload, but it'll do. --- Benchmarks/iconv-charset-conversion.lua | 74 +++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 Benchmarks/iconv-charset-conversion.lua diff --git a/Benchmarks/iconv-charset-conversion.lua b/Benchmarks/iconv-charset-conversion.lua new file mode 100644 index 000000000..9b9c98426 --- /dev/null +++ b/Benchmarks/iconv-charset-conversion.lua @@ -0,0 +1,74 @@ +local console = require("console") +local iconv = require("iconv") +local ffi = require("ffi") + +local SAMPLE_SIZE = 500000 + +local function iconv_lowlevel() + local input = "\192\175\192\250\192\206\197\205\198\228\192\204\189\186" + local descriptor = iconv.bindings.iconv_open("UTF-8", "CP949") + + local inputSize = ffi.new("size_t[1]", #input) + local inputBuffer = ffi.new("char[?]", #input, input) + local inputRef = ffi.new("char*[1]", inputBuffer) + + local worstCaseOutputSize = #input * 4 + local outputSize = ffi.new("size_t[1]", worstCaseOutputSize) + local outputBuffer = ffi.new("char[256]") + local outputRef = ffi.new("char*[1]", outputBuffer) + + local result = iconv.bindings.iconv(descriptor, inputRef, inputSize, outputRef, outputSize) + local numConversionsPerformed = worstCaseOutputSize - outputSize[0] + local converted = ffi.string(outputBuffer, numConversionsPerformed) + + iconv.bindings.iconv_close(descriptor) + return converted, result +end + +local function iconv_lua() + local input = "\192\175\192\250\192\206\197\205\198\228\192\204\189\186" + local output, message = iconv.convert(input, "CP949", "UTF-8") + return output, message +end + +local function iconv_cpp() + local inputBuffer = buffer.new() + local outputBuffer = buffer.new(1024) + local ptr, len = outputBuffer:reserve(1024) + local result = iconv.bindings.iconv_convert(inputBuffer, #inputBuffer, "CP949", "UTF-8", ptr, len) + return result +end + +math.randomseed(os.clock()) +local availableBenchmarks = { + function() + local label = "[FFI] Low-level API (tedious and slow, but the most flexible)" + console.startTimer(label) + for i = 1, SAMPLE_SIZE, 1 do + iconv_lowlevel() + end + console.stopTimer(label) + end, + function() + local label = "[FFI] One-shot C++ conversion (fast but less flexible)" + console.startTimer(label) + for i = 1, SAMPLE_SIZE, 1 do + iconv_cpp() + end + console.stopTimer(label) + end, + function() + local label = "[FFI] Lua-friendly wrapper (safer, but slower)" + console.startTimer(label) + for i = 1, SAMPLE_SIZE, 1 do + iconv_lua() + end + console.stopTimer(label) + end, +} + +table.shuffle(availableBenchmarks) + +for _, benchmark in ipairs(availableBenchmarks) do + benchmark() +end From 6e033057d97b8b3d60b17a35cd7667f87890daf6 Mon Sep 17 00:00:00 2001 From: RDW Date: Thu, 16 Jan 2025 04:50:28 +0100 Subject: [PATCH 4/5] Refactor: Introduce a shared constant for iconv errors There's no universal system for error handling in place yet, but since selene can't parse LuaJIT's ULL syntax extension, might as well move this constant to C++ already. --- Runtime/Bindings/FFI/iconv/iconv.lua | 7 ++++--- Runtime/Bindings/FFI/iconv/iconv_exports.h | 3 +++ Runtime/Bindings/FFI/iconv/iconv_ffi.cpp | 5 ++++- Runtime/Bindings/FFI/iconv/iconv_ffi.hpp | 2 ++ Tests/BDD/iconv-library.spec.lua | 8 ++++---- 5 files changed, 17 insertions(+), 8 deletions(-) diff --git a/Runtime/Bindings/FFI/iconv/iconv.lua b/Runtime/Bindings/FFI/iconv/iconv.lua index 0c51a5b72..d4f02f2d4 100644 --- a/Runtime/Bindings/FFI/iconv/iconv.lua +++ b/Runtime/Bindings/FFI/iconv/iconv.lua @@ -7,9 +7,7 @@ local ffi_string = ffi.string local tonumber = tonumber local tostring = tostring -local iconv = { - ERROR_CONVERSION_FAILED = ffi.cast("size_t", -1ULL), -} +local iconv = {} iconv.cdefs = [[ typedef void* iconv_t; @@ -24,6 +22,9 @@ struct static_iconv_exports_table { iconv_t (*iconv_open)(const char* input_encoding, const char* output_encoding); int (*iconv_close)(iconv_t conversion_descriptor); size_t (*iconv)(iconv_t conversion_descriptor, char** input, size_t* input_size, char** output, size_t* output_size); + + // Shared constants + size_t CHARSET_CONVERSION_FAILED; }; ]] diff --git a/Runtime/Bindings/FFI/iconv/iconv_exports.h b/Runtime/Bindings/FFI/iconv/iconv_exports.h index d4b6b86e3..76b0351cd 100644 --- a/Runtime/Bindings/FFI/iconv/iconv_exports.h +++ b/Runtime/Bindings/FFI/iconv/iconv_exports.h @@ -9,4 +9,7 @@ struct static_iconv_exports_table { iconv_t (*iconv_open)(const char* input_encoding, const char* output_encoding); int (*iconv_close)(iconv_t conversion_descriptor); size_t (*iconv)(iconv_t conversion_descriptor, char** input, size_t* input_size, char** output, size_t* output_size); + + // Shared constants + size_t CHARSET_CONVERSION_FAILED; }; \ No newline at end of file diff --git a/Runtime/Bindings/FFI/iconv/iconv_ffi.cpp b/Runtime/Bindings/FFI/iconv/iconv_ffi.cpp index 3c010f6c9..2e84875c5 100644 --- a/Runtime/Bindings/FFI/iconv/iconv_ffi.cpp +++ b/Runtime/Bindings/FFI/iconv/iconv_ffi.cpp @@ -18,7 +18,7 @@ iconv_result_t iconv_convert(char* input, size_t input_length, const char* input size_t num_input_bytes_left = input_length; iconv_t conversion_descriptor = iconv_open(output_encoding, input_encoding); - if(conversion_descriptor == (iconv_t)-1) { + if(reinterpret_cast(conversion_descriptor) == iconv_ffi::CHARSET_CONVERSION_FAILED) { result.message = strerror(errno); result.status_code = errno; result.num_bytes_written = 0; @@ -53,6 +53,9 @@ namespace iconv_ffi { .iconv_open = &iconv_open, .iconv_close = &iconv_close, .iconv = &iconv, + + // Shared constants + .CHARSET_CONVERSION_FAILED = CHARSET_CONVERSION_FAILED, }; return &exports; diff --git a/Runtime/Bindings/FFI/iconv/iconv_ffi.hpp b/Runtime/Bindings/FFI/iconv/iconv_ffi.hpp index 72d3ae858..03811283e 100644 --- a/Runtime/Bindings/FFI/iconv/iconv_ffi.hpp +++ b/Runtime/Bindings/FFI/iconv/iconv_ffi.hpp @@ -7,5 +7,7 @@ #include "iconv_exports.h" namespace iconv_ffi { + constexpr std::size_t CHARSET_CONVERSION_FAILED = static_cast(-1); + void* getExportsTable(); } \ No newline at end of file diff --git a/Tests/BDD/iconv-library.spec.lua b/Tests/BDD/iconv-library.spec.lua index a7a39be08..ffd8f1b81 100644 --- a/Tests/BDD/iconv-library.spec.lua +++ b/Tests/BDD/iconv-library.spec.lua @@ -88,13 +88,13 @@ describe("iconv", function() it("should indicate an error if the requested conversion isn't supported", function() local descriptor = iconv.bindings.iconv_open("Not-a-real-encoding", "UTF-8") - assertEquals(ffi.cast("size_t", descriptor), iconv.ERROR_CONVERSION_FAILED) + assertEquals(ffi.cast("size_t", descriptor), iconv.bindings.CHARSET_CONVERSION_FAILED) iconv.bindings.iconv_close(descriptor) end) it("should return a valid handle if the conversion is supported", function() local descriptor = iconv.bindings.iconv_open("CP949", "UTF-8") - assertFalse(ffi.cast("size_t", descriptor) == iconv.ERROR_CONVERSION_FAILED) + assertFalse(ffi.cast("size_t", descriptor) == iconv.bindings.CHARSET_CONVERSION_FAILED) iconv.bindings.iconv_close(descriptor) end) end) @@ -102,7 +102,7 @@ describe("iconv", function() describe("iconv", function() it("should be able to convert Windows encodings to UTF-8", function() local descriptor = iconv.bindings.iconv_open("UTF-8", "CP949") - assert(descriptor ~= iconv.ERROR_CONVERSION_FAILED, "Failed to create conversion descriptor") + assert(descriptor ~= iconv.bindings.CHARSET_CONVERSION_FAILED, "Failed to create conversion descriptor") local input = "\192\175\192\250\192\206\197\205\198\228\192\204\189\186" local inputSize = ffi.new("size_t[1]", #input) @@ -114,7 +114,7 @@ describe("iconv", function() local outputRef = ffi.new("char*[1]", outputBuffer) local result = iconv.bindings.iconv(descriptor, inputRef, inputSize, outputRef, outputSize) - assert(result ~= iconv.ERROR_CONVERSION_FAILED, "Conversion failed") + assert(result ~= iconv.bindings.CHARSET_CONVERSION_FAILED, "Conversion failed") local convertedSize = 256 - outputSize[0] local converted = ffi.string(outputBuffer, convertedSize) From 10c513576d154de1ab34ba87dda8dabaee42cd31 Mon Sep 17 00:00:00 2001 From: RDW Date: Thu, 16 Jan 2025 15:46:33 +0100 Subject: [PATCH 5/5] Runtime: Add a safeguard for closing iconv handles on MSYS2 Closing invalid handles is a NOOP on Linux/macOS, but crashes on Windows. --- Runtime/Bindings/FFI/iconv/iconv.lua | 15 ++++++++++++++- Tests/BDD/iconv-library.spec.lua | 6 +++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/Runtime/Bindings/FFI/iconv/iconv.lua b/Runtime/Bindings/FFI/iconv/iconv.lua index d4f02f2d4..932dd8558 100644 --- a/Runtime/Bindings/FFI/iconv/iconv.lua +++ b/Runtime/Bindings/FFI/iconv/iconv.lua @@ -7,7 +7,11 @@ local ffi_string = ffi.string local tonumber = tonumber local tostring = tostring -local iconv = {} +local iconv = { + errorMessages = { + INVALID_CONVERSION_HANDLE = "Cannot close an invalid iconv_t descriptor", + }, +} iconv.cdefs = [[ typedef void* iconv_t; @@ -73,4 +77,13 @@ function iconv.convert(input, inputEncoding, outputEncoding) return tostring(outputBuffer), ffi_strerror(0) end +function iconv.try_close(descriptor) + if ffi.cast("size_t", descriptor) ~= iconv.bindings.CHARSET_CONVERSION_FAILED then + -- Guard this because MINGW64's iconv can't handle closing invalid descriptors + return iconv.bindings.iconv_close(descriptor) + end + + return nil, iconv.errorMessages.INVALID_CONVERSION_HANDLE +end + return iconv diff --git a/Tests/BDD/iconv-library.spec.lua b/Tests/BDD/iconv-library.spec.lua index ffd8f1b81..a80afe4fb 100644 --- a/Tests/BDD/iconv-library.spec.lua +++ b/Tests/BDD/iconv-library.spec.lua @@ -82,20 +82,20 @@ describe("iconv", function() after(function() for label, descriptor in pairs(descriptors) do - iconv.bindings.iconv_close(descriptor) -- NOOP if invalid + iconv.try_close(descriptor) end end) it("should indicate an error if the requested conversion isn't supported", function() local descriptor = iconv.bindings.iconv_open("Not-a-real-encoding", "UTF-8") assertEquals(ffi.cast("size_t", descriptor), iconv.bindings.CHARSET_CONVERSION_FAILED) - iconv.bindings.iconv_close(descriptor) + iconv.try_close(descriptor) end) it("should return a valid handle if the conversion is supported", function() local descriptor = iconv.bindings.iconv_open("CP949", "UTF-8") assertFalse(ffi.cast("size_t", descriptor) == iconv.bindings.CHARSET_CONVERSION_FAILED) - iconv.bindings.iconv_close(descriptor) + iconv.try_close(descriptor) end) end)