diff --git a/embed.fnc b/embed.fnc index 930a0f33b1a8..76278cae7d23 100644 --- a/embed.fnc +++ b/embed.fnc @@ -3780,6 +3780,10 @@ CTp |bool |utf8_to_uv_msgs_helper_ \ |U32 flags \ |NULLOK U32 *errors \ |NULLOK AV **msgs +ATdip |UV |utf8_to_uv_or_die \ + |NN const U8 * const s \ + |NN const U8 *e \ + |NULLOK Size_t *advance_p CDbdp |UV |utf8_to_uvuni |NN const U8 *s \ |NULLOK STRLEN *retlen : Used in perly.y diff --git a/embed.h b/embed.h index dfcc4f4881e6..c018648047dd 100644 --- a/embed.h +++ b/embed.h @@ -870,6 +870,7 @@ # define utf8_to_uv_flags Perl_utf8_to_uv_flags # define utf8_to_uv_msgs Perl_utf8_to_uv_msgs # define utf8_to_uv_msgs_helper_ Perl_utf8_to_uv_msgs_helper_ +# define utf8_to_uv_or_die Perl_utf8_to_uv_or_die # define utf8n_to_uvchr Perl_utf8n_to_uvchr # define utf8n_to_uvchr_error Perl_utf8n_to_uvchr_error # define utf8n_to_uvchr_msgs Perl_utf8n_to_uvchr_msgs diff --git a/inline.h b/inline.h index 3c8df436dfde..5c2856060870 100644 --- a/inline.h +++ b/inline.h @@ -3138,6 +3138,22 @@ Perl_utf8_to_uv_msgs(const U8 * const s0, return utf8_to_uv_msgs_helper_(s0, e, cp_p, advance_p, flags, errors, msgs); } +/* Returns the code point encoded by the UTF-8 sequence starting at 's'; 'e' + * bounds the input. The boolean result of utf8_to_uv_flags() is deliberately + * discarded: UTF8_DIE_IF_MALFORMED is passed, and per this function's pod in + * utf8.c it throws an exception where the other utf8_to_uv functions would + * have returned false. 'advance_p' is NULLOK in embed.fnc and is forwarded + * unchanged. */ +PERL_STATIC_INLINE UV +Perl_utf8_to_uv_or_die(const U8 * const s, const U8 *e, Size_t *advance_p) +{ + PERL_ARGS_ASSERT_UTF8_TO_UV_OR_DIE; + + UV cp; + (void) utf8_to_uv_flags(s, e, &cp, advance_p, UTF8_DIE_IF_MALFORMED); + return cp; +} + PERL_STATIC_INLINE UV Perl_utf8n_to_uvchr_msgs(const U8 * const s0, STRLEN curlen, diff --git a/pod/perldelta.pod b/pod/perldelta.pod index d53738408312..f6835e32c945 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -436,9 +436,10 @@ New API functions are introduced to convert strings encoded in UTF-8 to their ordinal code point equivalent. These are safe to use by default, and generally more convenient to use than the existing ones. -L<perlapi/C<utf8_to_uv>> replaces L<perlapi/C<utf8_to_uvchr>> (which is -retained for backwards compatibility), but you should convert to use the -new form, as likely you aren't using the old one safely.
+L<perlapi/C<utf8_to_uv>> and L<perlapi/C<utf8_to_uv_or_die>> replace +L<perlapi/C<utf8_to_uvchr>> (which is retained for backwards +compatibility), but you should convert to use the new forms, as likely +you aren't using the old one safely. To convert in the opposite direction, you can now use L>. This is not a new function, but a new synonym diff --git a/proto.h b/proto.h index 98654997ea87..fe1becffba3b 100644 --- a/proto.h +++ b/proto.h @@ -10034,6 +10034,11 @@ Perl_utf8_to_uv_msgs(const U8 * const s0, const U8 *e, UV *cp_p, Size_t *advance # define PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS \ assert(s0); assert(e); assert(cp_p) +PERL_STATIC_INLINE UV +Perl_utf8_to_uv_or_die(const U8 * const s, const U8 *e, Size_t *advance_p); +# define PERL_ARGS_ASSERT_UTF8_TO_UV_OR_DIE \ + assert(s); assert(e) + PERL_STATIC_INLINE UV Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen); # define PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF \ diff --git a/utf8.c b/utf8.c index 13d699561d61..de2149fc70ad 100644 --- a/utf8.c +++ b/utf8.c @@ -1003,6 +1003,7 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s, =for apidoc_item extended_utf8_to_uv =for apidoc_item strict_utf8_to_uv =for apidoc_item c9strict_utf8_to_uv +=for apidoc_item utf8_to_uv_or_die =for apidoc_item utf8_to_uvchr_buf =for apidoc_item utf8_to_uvchr @@ -1099,6 +1100,11 @@ sequence. You can use that function or C> to exert more control over the input that is considered acceptable, and the warnings that are raised. +C<utf8_to_uv_or_die> has a simpler interface, for use when any errors are +fatal. It returns the code point instead of using an output parameter, and +throws an exception with any errors found where the other functions here would +have returned false. + Often, C<s> is an arbitrarily long string containing the UTF-8 representations of many code points in a row, and these functions are called in the course of parsing C<s> to find all those code points.
@@ -1107,8 +1113,8 @@ If your code doesn't know how to deal with illegal input, as would be typical of a low level routine, the loop could look like: while (s < e) { - UV cp; Size_t advance; + UV cp; (void) utf8_to_uv(s, e, &cp, &advance); s += advance; @@ -1118,11 +1124,24 @@ A REPLACEMENT CHARACTER will be inserted everywhere that malformed input occurs. Obviously, we aren't expecting such outcomes, but your code will be protected from attacks and many harmful effects that could otherwise occur. +If the situation is such that it would be a bug for the input to be invalid, a +somewhat simpler loop suffices: + + while (s < e) { + Size_t advance; + UV cp = utf8_to_uv_or_die(s, e, &advance); + + s += advance; + } + +This will throw an exception on invalid input, so your code doesn't have to +concern itself with that possibility. + If you do have a plan for handling malformed input, you could instead write: while (s < e) { - UV cp; Size_t advance; + UV cp; - if (UNLIKELY(! utf8_to_uv(s, e, &cp, &advance)) { + if (UNLIKELY(! utf8_to_uv(s, e, &cp, &advance))) { @@ -1142,9 +1161,10 @@ attacks against such code; and it is extra work always, as the functions have already done the equivalent work and return the correct value in C<advance>, regardless of whether the input is well-formed or not. -You must always pass a non-NULL pointer into which to store the (first) code -point C<s> represents. If you don't care about this value, you should be using -one of the C> functions instead. +Except with C<utf8_to_uv_or_die>, you must always pass a non-NULL pointer into +which to store the (first) code point C<s> represents. If you don't care about +this value, you should be using one of the C> functions +instead. =item C forms @@ -1274,8 +1294,8 @@ This flag is ignored if C is also set. =item C These reject and/or warn about UTF-8 sequences that represent surrogate -characters. The warning categories C and C control if warnings -are actually raised. +characters. The warning categories C and C control if +warnings are actually raised.
=item C @@ -1290,7 +1310,7 @@ are actually raised. =item C These reject and/or warn about UTF-8 sequences that represent code points -above 0x10FFFF. The warning categories C and C control if +above 0x10FFFF. The warning categories C and C control if warnings are actually raised. =item C @@ -1324,7 +1344,8 @@ These reject and/or warn on encountering sequences that require Perl's extension to UTF-8 to represent them. These are all for code points above 0x10FFFF, so these sequences are a subset of the ones controlled by SUPER or either of the illegal interchange sets of flags. The warning categories -C, C, and C control if warnings are actually raised. +C, C, and C control if warnings are actually +raised. Perl predates Unicode, and earlier standards allowed for code points up through 0x7FFF_FFFF (2**31 - 1). Perl, of course, would like you to be able to