Skip to content

Commit

Permalink
Rollup merge of #99544 - dylni:expose-utf8lossy, r=Mark-Simulacrum
Browse files Browse the repository at this point in the history
Expose `Utf8Lossy` as `Utf8Chunks`

This PR changes the feature for `Utf8Lossy` from `str_internals` to `utf8_lossy` and improves the API. This is done to eventually expose the API as stable.

Proposal: rust-lang/libs-team#54
Tracking Issue: #99543
  • Loading branch information
matthiaskrgr authored Aug 20, 2022
2 parents 859cde5 + f4419c7 commit e6eb6e3
Show file tree
Hide file tree
Showing 10 changed files with 273 additions and 184 deletions.
1 change: 1 addition & 0 deletions alloc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@
#![feature(unchecked_math)]
#![feature(unicode_internals)]
#![feature(unsize)]
#![feature(utf8_chunks)]
#![feature(std_internals)]
//
// Language features:
Expand Down
2 changes: 2 additions & 0 deletions alloc/src/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ pub use core::str::{RSplit, Split};
pub use core::str::{RSplitN, SplitN};
#[stable(feature = "rust1", since = "1.0.0")]
pub use core::str::{RSplitTerminator, SplitTerminator};
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub use core::str::{Utf8Chunk, Utf8Chunks};

/// Note: `str` in `Concat<str>` is not meaningful here.
/// This type parameter of the trait only exists to enable another impl.
Expand Down
16 changes: 8 additions & 8 deletions alloc/src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ use core::ops::Bound::{Excluded, Included, Unbounded};
use core::ops::{self, Index, IndexMut, Range, RangeBounds};
use core::ptr;
use core::slice;
#[cfg(not(no_global_oom_handling))]
use core::str::lossy;
use core::str::pattern::Pattern;
#[cfg(not(no_global_oom_handling))]
use core::str::Utf8Chunks;

#[cfg(not(no_global_oom_handling))]
use crate::borrow::{Cow, ToOwned};
Expand Down Expand Up @@ -628,11 +628,11 @@ impl String {
#[cfg(not(no_global_oom_handling))]
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> {
let mut iter = lossy::Utf8Lossy::from_bytes(v).chunks();
let mut iter = Utf8Chunks::new(v);

let first_valid = if let Some(chunk) = iter.next() {
let lossy::Utf8LossyChunk { valid, broken } = chunk;
if broken.is_empty() {
let valid = chunk.valid();
if chunk.invalid().is_empty() {
debug_assert_eq!(valid.len(), v.len());
return Cow::Borrowed(valid);
}
Expand All @@ -647,9 +647,9 @@ impl String {
res.push_str(first_valid);
res.push_str(REPLACEMENT);

for lossy::Utf8LossyChunk { valid, broken } in iter {
res.push_str(valid);
if !broken.is_empty() {
for chunk in iter {
res.push_str(chunk.valid());
if !chunk.invalid().is_empty() {
res.push_str(REPLACEMENT);
}
}
Expand Down
244 changes: 157 additions & 87 deletions core/src/str/lossy.rs
Original file line number Diff line number Diff line change
@@ -1,51 +1,170 @@
use crate::char;
use crate::fmt::{self, Write};
use crate::mem;
use crate::fmt;
use crate::fmt::Formatter;
use crate::fmt::Write;
use crate::iter::FusedIterator;

use super::from_utf8_unchecked;
use super::validations::utf8_char_width;

/// Lossy UTF-8 string.
#[unstable(feature = "str_internals", issue = "none")]
pub struct Utf8Lossy {
bytes: [u8],
/// An item returned by the [`Utf8Chunks`] iterator.
///
/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
/// when decoding a UTF-8 string.
///
/// # Examples
///
/// ```
/// #![feature(utf8_chunks)]
///
/// use std::str::Utf8Chunks;
///
/// // An invalid UTF-8 string
/// let bytes = b"foo\xF1\x80bar";
///
/// // Decode the first `Utf8Chunk`
/// let chunk = Utf8Chunks::new(bytes).next().unwrap();
///
/// // The first three characters are valid UTF-8
/// assert_eq!("foo", chunk.valid());
///
/// // The fourth character is broken
/// assert_eq!(b"\xF1\x80", chunk.invalid());
/// ```
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Utf8Chunk<'a> {
valid: &'a str,
invalid: &'a [u8],
}

impl Utf8Lossy {
impl<'a> Utf8Chunk<'a> {
/// Returns the next validated UTF-8 substring.
///
/// This substring can be empty at the start of the string or between
/// broken UTF-8 characters.
#[must_use]
pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
// SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required.
unsafe { mem::transmute(bytes) }
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub fn valid(&self) -> &'a str {
self.valid
}

pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
Utf8LossyChunksIter { source: &self.bytes }
/// Returns the invalid sequence that caused a failure.
///
/// The returned slice will have a maximum length of 3 and starts after the
/// substring given by [`valid`]. Decoding will resume after this sequence.
///
/// If empty, this is the last chunk in the string. If non-empty, an
/// unexpected byte was encountered or the end of the input was reached
/// unexpectedly.
///
/// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
/// CHARACTER`].
///
/// [`valid`]: Self::valid
/// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
#[must_use]
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub fn invalid(&self) -> &'a [u8] {
self.invalid
}
}

/// Iterator over lossy UTF-8 string
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[must_use]
#[unstable(feature = "str_internals", issue = "none")]
pub struct Debug<'a>(&'a [u8]);

#[unstable(feature = "str_internals", issue = "none")]
#[allow(missing_debug_implementations)]
pub struct Utf8LossyChunksIter<'a> {
impl fmt::Debug for Debug<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.write_char('"')?;

for chunk in Utf8Chunks::new(self.0) {
// Valid part.
// Here we partially parse UTF-8 again which is suboptimal.
{
let valid = chunk.valid();
let mut from = 0;
for (i, c) in valid.char_indices() {
let esc = c.escape_debug();
// If char needs escaping, flush backlog so far and write, else skip
if esc.len() != 1 {
f.write_str(&valid[from..i])?;
for c in esc {
f.write_char(c)?;
}
from = i + c.len_utf8();
}
}
f.write_str(&valid[from..])?;
}

// Broken parts of string as hex escape.
for &b in chunk.invalid() {
write!(f, "\\x{:02X}", b)?;
}
}

f.write_char('"')
}
}

/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
///
/// If you want a simple conversion from UTF-8 byte slices to string slices,
/// [`from_utf8`] is easier to use.
///
/// [byteslice]: slice
/// [`from_utf8`]: super::from_utf8
///
/// # Examples
///
/// This can be used to create functionality similar to
/// [`String::from_utf8_lossy`] without allocating heap memory:
///
/// ```
/// #![feature(utf8_chunks)]
///
/// use std::str::Utf8Chunks;
///
/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
/// for chunk in Utf8Chunks::new(input) {
/// push(chunk.valid());
///
/// if !chunk.invalid().is_empty() {
/// push("\u{FFFD}");
/// }
/// }
/// }
/// ```
///
/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[derive(Clone)]
pub struct Utf8Chunks<'a> {
source: &'a [u8],
}

#[unstable(feature = "str_internals", issue = "none")]
#[derive(PartialEq, Eq, Debug)]
pub struct Utf8LossyChunk<'a> {
/// Sequence of valid chars.
/// Can be empty between broken UTF-8 chars.
pub valid: &'a str,
/// Single broken char, empty if none.
/// Empty iff iterator item is last.
pub broken: &'a [u8],
impl<'a> Utf8Chunks<'a> {
/// Creates a new iterator to decode the bytes.
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub fn new(bytes: &'a [u8]) -> Self {
Self { source: bytes }
}

#[doc(hidden)]
#[unstable(feature = "str_internals", issue = "none")]
pub fn debug(&self) -> Debug<'_> {
Debug(self.source)
}
}

impl<'a> Iterator for Utf8LossyChunksIter<'a> {
type Item = Utf8LossyChunk<'a>;
#[unstable(feature = "utf8_chunks", issue = "99543")]
impl<'a> Iterator for Utf8Chunks<'a> {
type Item = Utf8Chunk<'a>;

fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
fn next(&mut self) -> Option<Utf8Chunk<'a>> {
if self.source.is_empty() {
return None;
}
Expand Down Expand Up @@ -130,71 +249,22 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {

// SAFETY: `valid_up_to <= i` because it is only ever assigned via
// `valid_up_to = i` and `i` only increases.
let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) };
let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };

Some(Utf8LossyChunk {
Some(Utf8Chunk {
// SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
valid: unsafe { from_utf8_unchecked(valid) },
broken,
invalid,
})
}
}

impl fmt::Display for Utf8Lossy {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// If we're the empty string then our iterator won't actually yield
// anything, so perform the formatting manually
if self.bytes.is_empty() {
return "".fmt(f);
}

for Utf8LossyChunk { valid, broken } in self.chunks() {
// If we successfully decoded the whole chunk as a valid string then
// we can return a direct formatting of the string which will also
// respect various formatting flags if possible.
if valid.len() == self.bytes.len() {
assert!(broken.is_empty());
return valid.fmt(f);
}

f.write_str(valid)?;
if !broken.is_empty() {
f.write_char(char::REPLACEMENT_CHARACTER)?;
}
}
Ok(())
}
}

impl fmt::Debug for Utf8Lossy {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_char('"')?;
#[unstable(feature = "utf8_chunks", issue = "99543")]
impl FusedIterator for Utf8Chunks<'_> {}

for Utf8LossyChunk { valid, broken } in self.chunks() {
// Valid part.
// Here we partially parse UTF-8 again which is suboptimal.
{
let mut from = 0;
for (i, c) in valid.char_indices() {
let esc = c.escape_debug();
// If char needs escaping, flush backlog so far and write, else skip
if esc.len() != 1 {
f.write_str(&valid[from..i])?;
for c in esc {
f.write_char(c)?;
}
from = i + c.len_utf8();
}
}
f.write_str(&valid[from..])?;
}

// Broken parts of string as hex escape.
for &b in broken {
write!(f, "\\x{:02x}", b)?;
}
}

f.write_char('"')
#[unstable(feature = "utf8_chunks", issue = "99543")]
impl fmt::Debug for Utf8Chunks<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
}
}
6 changes: 3 additions & 3 deletions core/src/str/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ use crate::slice::{self, SliceIndex};

pub mod pattern;

#[unstable(feature = "str_internals", issue = "none")]
#[allow(missing_docs)]
pub mod lossy;
mod lossy;
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub use lossy::{Utf8Chunk, Utf8Chunks};

#[stable(feature = "rust1", since = "1.0.0")]
pub use converts::{from_utf8, from_utf8_unchecked};
Expand Down
1 change: 1 addition & 0 deletions core/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
#![feature(waker_getters)]
#![feature(slice_flatten)]
#![feature(provide_any)]
#![feature(utf8_chunks)]
#![deny(unsafe_op_in_unsafe_fn)]

extern crate test;
Expand Down
Loading

0 comments on commit e6eb6e3

Please sign in to comment.