Add new function bytes_to_utf8_free_me

This is like bytes_to_utf8, but if the representation of the input string is the same in UTF-8 as it is in native format, the allocation of new memory is skipped. This presents optimization possibilities.
Perl · Jan 8, 2025 · 992f768 · 992f768
1 parent 3121b87
commit 992f768
Show file tree

Hide file tree

Showing 5 changed files with 75 additions and 21 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -794,8 +794,12 @@ Adp	|int	|bytes_cmp_utf8 |NN const U8 *b 			\
 Adp	|U8 *	|bytes_from_utf8|NN const U8 *s 			\
 				|NN STRLEN *lenp			\
 				|NN bool *is_utf8p
-Adp	|U8 *	|bytes_to_utf8	|NN const U8 *s 			\
+Admp	|U8 *	|bytes_to_utf8	|NN const U8 *s 			\
 				|NN STRLEN *lenp
+Adp	|U8 *	|bytes_to_utf8_free_me					\
+				|NN const U8 *s 			\
+				|NN STRLEN *lenp			\
+				|NULLOK const U8 **free_me
 AOdp	|SSize_t|call_argv	|NN const char *sub_name		\
 				|I32 flags				\
 				|NN char **argv

diff --git a/embed.h b/embed.h
@@ -155,7 +155,8 @@
 # define block_start(a)                         Perl_block_start(aTHX_ a)
 # define bytes_cmp_utf8(a,b,c,d)                Perl_bytes_cmp_utf8(aTHX_ a,b,c,d)
 # define bytes_from_utf8(a,b,c)                 Perl_bytes_from_utf8(aTHX_ a,b,c)
-# define bytes_to_utf8(a,b)                     Perl_bytes_to_utf8(aTHX_ a,b)
+# define bytes_to_utf8(a,b)                     Perl_bytes_to_utf8(aTHX,a,b)
+# define bytes_to_utf8_free_me(a,b,c)           Perl_bytes_to_utf8_free_me(aTHX_ a,b,c)
 # define c9strict_utf8_to_uv                    Perl_c9strict_utf8_to_uv
 # define call_argv(a,b,c)                       Perl_call_argv(aTHX_ a,b,c)
 # define call_atexit(a,b)                       Perl_call_atexit(aTHX_ a,b)

diff --git a/proto.h b/proto.h
diff --git a/utf8.c b/utf8.c
@@ -3257,39 +3257,80 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
 }
 
 /*
-=for apidoc bytes_to_utf8
+=for apidoc      bytes_to_utf8
+=for apidoc_item bytes_to_utf8_free_me
 
-Converts a string C<s> of length C<*lenp> bytes from the native encoding into
-UTF-8.
-Returns a pointer to the newly-created string, and sets C<*lenp> to
-reflect the new length in bytes.  The caller is responsible for arranging for
-the memory used by this string to get freed.
+These each convert a string C<s> of length C<*lenp> bytes from the native
+encoding into UTF-8 (UTF-EBCDIC on EBCDIC platforms), returning a pointer to
+the UTF-8 string, and setting C<*lenp> to its length in bytes.
+
+C<bytes_to_utf8> always allocates new memory for the result, making sure it is
+NUL-terminated.
+
+C<bytes_to_utf8_free_me> simply returns a pointer to the input string if the
+string's UTF-8 representation is the same as its native representation.
+Otherwise, it behaves like C<bytes_to_utf8>, returning a pointer to new memory
+containing the conversion of the input.  In other words, it returns the input
+string if converting the string would be a no-op.  Note that when no new string
+is allocated, the function can't add a NUL to the original string if one wasn't
+already there.
+
+In both cases, the caller is responsible for arranging for any new memory to
+get freed.
+
+C<bytes_to_utf8_free_me> takes an extra parameter, C<free_me>, to communicate
+to the caller whether memory was allocated or not.  If that parameter is NULL,
+C<bytes_to_utf8_free_me> acts identically to C<bytes_to_utf8>, always
+allocating new memory.
+
+But when it is non-NULL, C<bytes_to_utf8_free_me> stores into C<*free_me>
+either NULL, if no memory was allocated, or a pointer to the new memory.  This
+allows the following convenient paradigm:
+
+ U8 * free_me;
+ U8 * converted = bytes_to_utf8_free_me(string, &len, &free_me);
+
+ ...
+
+ Safefree(free_me);
+
+You don't have to know if memory was allocated or not.  Just call C<Safefree>
+unconditionally.  C<free_me> will contain a suitable value to pass to
+C<Safefree> for it to do the right thing, regardless.
 
 Upon return, the number of variants in the string can be computed by
 having saved the value of C<*lenp> before the call, and subtracting it from the
 after-call value of C<*lenp>.
 
-A C<NUL> character will be written after the end of the string.
-
-If you want to convert to UTF-8 from encodings other than
-the native (Latin1 or EBCDIC),
-see L</sv_recode_to_utf8>().
+If you want to convert to UTF-8 from encodings other than the native (Latin1 or
+EBCDIC), see L</sv_recode_to_utf8>().
 
 =cut
 */
 
 U8*
-Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
+Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, Size_t *lenp,
+                                 const U8 ** free_me_ptr)
 {
+    PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME;
+    PERL_UNUSED_CONTEXT;
+
     const U8 * const send = s + (*lenp);
+    const Size_t variant_count = variant_under_utf8_count(s, send);
+
+    /* Return the input unchanged if the caller has opted in by passing a
+     * non-NULL free_me_ptr, and there are no characters that differ when
+     * represented in UTF-8.  No NUL is appended in this case. */
+    if (free_me_ptr != NULL && variant_count == 0) {
+        *free_me_ptr = NULL;
+        return (U8 *) s;
+    }
+
     U8 *d;
     U8 *dst;
 
-    PERL_ARGS_ASSERT_BYTES_TO_UTF8;
-    PERL_UNUSED_CONTEXT;
-
     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
-    Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
+    Newx(d, (*lenp) + variant_count + 1, U8);
     dst = d;
 
     while (s < send) {
@@ -3298,7 +3339,11 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
     }
 
     *d = '\0';
-    *lenp = d-dst;
+    *lenp = d - dst;
+
+    if (free_me_ptr != NULL) {
+        *free_me_ptr = dst;
+    }
 
     return dst;
 }

diff --git a/utf8.h b/utf8.h
@@ -1330,6 +1330,7 @@ point's representation.
 
 #define Perl_is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)
 
+#define Perl_bytes_to_utf8(mTHX, s, lenp)  Perl_bytes_to_utf8_free_me(aTHX_ s, lenp, NULL)
 typedef enum {
     PL_utf8_to_bytes_overwrite = 0,
     PL_utf8_to_bytes_new_memory,