diff --git a/embed.fnc b/embed.fnc index d009d37e3bd9..43b9102e8fdc 100644 --- a/embed.fnc +++ b/embed.fnc @@ -794,8 +794,12 @@ Adp |int |bytes_cmp_utf8 |NN const U8 *b \ Adp |U8 * |bytes_from_utf8|NN const U8 *s \ |NN STRLEN *lenp \ |NN bool *is_utf8p -Adp |U8 * |bytes_to_utf8 |NN const U8 *s \ +Admp |U8 * |bytes_to_utf8 |NN const U8 *s \ |NN STRLEN *lenp +Adp |U8 * |bytes_to_utf8_free_me \ + |NN const U8 *s \ + |NN STRLEN *lenp \ + |NULLOK const U8 **free_me AOdp |SSize_t|call_argv |NN const char *sub_name \ |I32 flags \ |NN char **argv diff --git a/embed.h b/embed.h index 7d42b9800ef3..e4b4bb5e1cd9 100644 --- a/embed.h +++ b/embed.h @@ -155,7 +155,8 @@ # define block_start(a) Perl_block_start(aTHX_ a) # define bytes_cmp_utf8(a,b,c,d) Perl_bytes_cmp_utf8(aTHX_ a,b,c,d) # define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c) -# define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b) +# define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX,a,b) +# define bytes_to_utf8_free_me(a,b,c) Perl_bytes_to_utf8_free_me(aTHX_ a,b,c) # define c9strict_utf8_to_uv Perl_c9strict_utf8_to_uv # define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c) # define call_atexit(a,b) Perl_call_atexit(aTHX_ a,b) diff --git a/proto.h b/proto.h index 79c10d176d1d..08ec30c5ce15 100644 --- a/proto.h +++ b/proto.h @@ -398,9 +398,12 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p); #define PERL_ARGS_ASSERT_BYTES_FROM_UTF8 \ assert(s); assert(lenp); assert(is_utf8p) +/* PERL_CALLCONV U8 * +Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp); */ + PERL_CALLCONV U8 * -Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp); -#define PERL_ARGS_ASSERT_BYTES_TO_UTF8 \ +Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, STRLEN *lenp, const U8 **free_me); +#define PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME \ assert(s); assert(lenp) /* PERL_CALLCONV bool diff --git a/utf8.c b/utf8.c index bfab721cb55b..71a4b89bee57 100644 --- a/utf8.c +++ b/utf8.c @@ -3257,39 +3257,80 @@ 
Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p) } /* -=for apidoc bytes_to_utf8 +=for apidoc bytes_to_utf8 +=for apidoc_item bytes_to_utf8_free_me -Converts a string C<s> of length C<*lenp> bytes from the native encoding into -UTF-8. -Returns a pointer to the newly-created string, and sets C<*lenp> to -reflect the new length in bytes. The caller is responsible for arranging for -the memory used by this string to get freed. +These each convert a string C<s> of length C<*lenp> bytes from the native +encoding into UTF-8 (UTF-EBCDIC on EBCDIC platforms), returning a pointer to +the UTF-8 string, and setting C<*lenp> to its length in bytes. + +C<bytes_to_utf8> always allocates new memory for the result, making sure it is +NUL-terminated. + +C<bytes_to_utf8_free_me> simply returns a pointer to the input string if the +string's UTF-8 representation is the same as its native representation. +Otherwise, it behaves like C<bytes_to_utf8>, returning a pointer to new memory +containing the conversion of the input. In other words, it returns the input +string if converting the string would be a no-op. Note that when no new string +is allocated, the function can't add a NUL to the original string if one wasn't +already there. + +In both cases, the caller is responsible for arranging for any new memory to +get freed. + +C<bytes_to_utf8_free_me> takes an extra parameter, C<free_me>, to communicate +to the caller whether or not memory was allocated. If that parameter is NULL, +C<bytes_to_utf8_free_me> acts identically to C<bytes_to_utf8>, always +allocating new memory. + +But when it is a non-NULL pointer, C<bytes_to_utf8_free_me> stores into it +either NULL if no memory was allocated; or a pointer to that new memory. This +allows the following convenient paradigm: + + U8 * free_me; + U8 * converted = bytes_to_utf8_free_me(string, &len, &free_me); + + ... + + Safefree(free_me); + +You don't have to know if memory was allocated or not. Just call C<Safefree> +unconditionally. C<free_me> will contain a suitable value to pass to +C<Safefree> for it to do the right thing, regardless. 
Upon return, the number of variants in the string can be computed by having saved the value of C<*lenp> before the call, and subtracting it from the after-call value of C<*lenp>. -A C<NUL> character will be written after the end of the string. - -If you want to convert to UTF-8 from encodings other than -the native (Latin1 or EBCDIC), -see L</sv_recode_to_utf8>(). +If you want to convert to UTF-8 from encodings other than the native (Latin1 or +EBCDIC), see L</sv_recode_to_utf8>(). =cut */ U8* -Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) +Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, Size_t *lenp, + const U8 ** free_me_ptr) { + PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME; + PERL_UNUSED_CONTEXT; + const U8 * const send = s + (*lenp); + const Size_t variant_count = variant_under_utf8_count(s, send); + + /* Return the input unchanged if the caller passed a non-NULL free_me_ptr + * and no character differs when represented in UTF-8. No NUL is added: + * the result is NUL-terminated only if the input already was */ + if (free_me_ptr != NULL && variant_count == 0) { + *free_me_ptr = NULL; + return (U8 *) s; + } + U8 *d; U8 *dst; - PERL_ARGS_ASSERT_BYTES_TO_UTF8; - PERL_UNUSED_CONTEXT; - /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */ - Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8); + Newx(d, (*lenp) + variant_count + 1, U8); dst = d; while (s < send) { @@ -3298,7 +3339,11 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) } *d = '\0'; - *lenp = d-dst; + *lenp = d - dst; + + if (free_me_ptr != NULL) { + *free_me_ptr = dst; + } return dst; } diff --git a/utf8.h b/utf8.h index 6ed7c3304e4d..f95311637c34 100644 --- a/utf8.h +++ b/utf8.h @@ -1330,6 +1330,7 @@ point's representation. #define Perl_is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end) +#define Perl_bytes_to_utf8(mTHX, s, lenp) Perl_bytes_to_utf8_free_me(aTHX_ s, lenp, NULL) typedef enum { PL_utf8_to_bytes_overwrite = 0, PL_utf8_to_bytes_new_memory,