diff --git a/Cargo.toml b/Cargo.toml index 2654e423..b31011f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,7 @@ compact_str = "0.8.0" chrono = { version = "0.4", default-features = false, features = ["clock", "std", "wasmbind"] } delegate = "0.12.0" thiserror = "1.0" -nom = "7.1.1" +winnow = { version = "0.6", features = ["simd"] } num-integer = "0.1.44" num-traits = "0.2" arrayvec = "0.7" diff --git a/benches/read_many_structs.rs b/benches/read_many_structs.rs index d64278a3..d446b3c8 100644 --- a/benches/read_many_structs.rs +++ b/benches/read_many_structs.rs @@ -47,9 +47,8 @@ fn maximally_compact_1_1_data(num_values: usize) -> TestData_1_1 { let text_1_1_data = r#"(:event 1670446800245 418 "6" "1" "abc123" (:: "region 4" "2022-12-07T20:59:59.744000Z"))"#.repeat(num_values); - let mut binary_1_1_data = vec![0xE0u8, 0x01, 0x01, 0xEA]; // IVM #[rustfmt::skip] - let mut binary_1_1_data_body: Vec = [MacroTable::FIRST_USER_MACRO_ID as u8, // Macro ID + let binary_1_1_data: Vec = [MacroTable::FIRST_USER_MACRO_ID as u8, // Macro ID 0b10, // [NOTE: `0b`] `parameters*` arg is an arg group 0x66, // 6-byte integer (`timestamp` param) 0x75, 0x5D, 0x63, 0xEE, 0x84, 0x01, @@ -73,7 +72,6 @@ fn maximally_compact_1_1_data(num_values: usize) -> TestData_1_1 { 0x39, 0x3A, 0x35, 0x39, 0x2E, 0x37, 0x34, 0x34, 0x30, 0x30, 0x30, 0x5A].repeat(num_values); - binary_1_1_data.append(&mut binary_1_1_data_body); TestData_1_1 { name: "maximally compact".to_owned(), template_definition_text, @@ -107,9 +105,8 @@ fn moderately_compact_1_1_data(num_values: usize) -> TestData_1_1 { "#; let text_1_1_data = r#"(:event 1670446800245 418 "scheduler-thread-6" "example-client-1" "aws-us-east-5f-abc123" (:: "region 4" "2022-12-07T20:59:59.744000Z"))"#.repeat(num_values); - let mut binary_1_1_data = vec![0xE0u8, 0x01, 0x01, 0xEA]; // IVM #[rustfmt::skip] - let mut binary_1_1_data_body: Vec = [MacroTable::FIRST_USER_MACRO_ID as u8, // Macro ID + let binary_1_1_data: Vec = 
[MacroTable::FIRST_USER_MACRO_ID as u8, // Macro ID 0b10, // [NOTE: `0b` prefix] `parameters*` arg is an arg group 0x66, // 6-byte integer (`timestamp` param) 0x75, 0x5D, 0x63, 0xEE, 0x84, 0x01, @@ -142,7 +139,6 @@ fn moderately_compact_1_1_data(num_values: usize) -> TestData_1_1 { 0x2E, 0x37, 0x34, 0x34, 0x30, 0x30, 0x30, 0x5A].repeat(num_values); - binary_1_1_data.append(&mut binary_1_1_data_body); TestData_1_1 { name: "moderately compact".to_owned(), template_definition_text: template_definition_text.to_owned(), @@ -176,9 +172,8 @@ fn length_prefixed_moderately_compact_1_1_data(num_values: usize) -> TestData_1_ "#; let text_1_1_data = r#"(:event 1670446800245 418 "scheduler-thread-6" "example-client-1" "aws-us-east-5f-abc123" (:: "region 4" "2022-12-07T20:59:59.744000Z"))"#.repeat(num_values); - let mut binary_1_1_data = vec![0xE0u8, 0x01, 0x01, 0xEA]; // IVM #[rustfmt::skip] - let mut binary_1_1_data_body: Vec = [0xF5, // LP invocation + let binary_1_1_data: Vec = [0xF5, // LP invocation ((MacroTable::FIRST_USER_MACRO_ID * 2) + 1) as u8, // Macro ID 0xDF, // Length prefix: FlexUInt 111 0b10, // [NOTE: `0b` prefix] `parameters*` arg is an arg group @@ -213,7 +208,6 @@ fn length_prefixed_moderately_compact_1_1_data(num_values: usize) -> TestData_1_ 0x2E, 0x37, 0x34, 0x34, 0x30, 0x30, 0x30, 0x5A].repeat(num_values); - binary_1_1_data.append(&mut binary_1_1_data_body); TestData_1_1 { name: "moderately compact w/length-prefixed top level".to_owned(), template_definition_text: template_definition_text.to_owned(), @@ -444,12 +438,12 @@ mod benchmark { b.iter(|| { // We don't have an API for doing this with the application-level reader yet, so // for now we use a manually configured context and a raw reader. 
- let mut reader = LazyRawBinaryReader_1_1::new(binary_1_1_data); + let mut reader = LazyRawBinaryReader_1_1::new(context_ref, binary_1_1_data); let mut num_top_level_values: usize = 0; // Skip past the IVM - reader.next(context_ref).unwrap().expect_ivm().unwrap(); + reader.next().unwrap().expect_ivm().unwrap(); // Expect every top-level item to be an e-expression. - while let RawStreamItem::EExp(raw_eexp) = reader.next(context_ref).unwrap() { + while let RawStreamItem::EExp(raw_eexp) = reader.next().unwrap() { num_top_level_values += 1; // Look up the e-expression's invoked macro ID in the encoding context. let eexp = raw_eexp.resolve(context_ref).unwrap(); diff --git a/src/lazy/any_encoding.rs b/src/lazy/any_encoding.rs index 3e4e12ec..0a809b7b 100644 --- a/src/lazy/any_encoding.rs +++ b/src/lazy/any_encoding.rs @@ -1,8 +1,5 @@ #![allow(non_camel_case_types)] -use std::fmt::Debug; -use std::ops::Range; - use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator as RawBinaryAnnotationsIterator_1_0; use crate::lazy::binary::raw::r#struct::{ LazyRawBinaryFieldName_1_0, LazyRawBinaryStruct_1_0, RawBinaryStructIterator_1_0, @@ -45,28 +42,24 @@ use crate::lazy::raw_stream_item::LazyRawStreamItem; use crate::lazy::raw_value_ref::RawValueRef; use crate::lazy::span::Span; use crate::lazy::streaming_raw_reader::RawReaderState; -use crate::lazy::text::raw::r#struct::{ - LazyRawTextFieldName_1_0, LazyRawTextStruct_1_0, RawTextStructIterator_1_0, -}; +use crate::lazy::text::raw::r#struct::LazyRawTextFieldName; use crate::lazy::text::raw::reader::LazyRawTextReader_1_0; -use crate::lazy::text::raw::sequence::{ - LazyRawTextList_1_0, LazyRawTextSExp_1_0, RawTextListIterator_1_0, RawTextSExpIterator_1_0, -}; +use crate::lazy::text::raw::sequence::{RawTextList, RawTextSExp}; use crate::lazy::text::raw::v1_1::arg_group::{ EExpArg, EExpArgExpr, TextEExpArgGroup, TextEExpArgGroupIterator, }; use crate::lazy::text::raw::v1_1::reader::{ - 
LazyRawTextFieldName_1_1, LazyRawTextList_1_1, LazyRawTextReader_1_1, LazyRawTextSExp_1_1, - LazyRawTextStruct_1_1, MacroIdRef, RawTextSequenceCacheIterator_1_1, - RawTextStructCacheIterator_1_1, TextEExpression_1_1, + LazyRawTextReader_1_1, LazyRawTextStruct, MacroIdRef, RawTextSequenceCacheIterator, + RawTextStructCacheIterator, TextEExpression_1_1, }; use crate::lazy::text::value::{ LazyRawTextValue_1_0, LazyRawTextValue_1_1, LazyRawTextVersionMarker_1_0, LazyRawTextVersionMarker_1_1, RawTextAnnotationsIterator, }; use crate::symbol_table::{SystemSymbolTable, SYSTEM_SYMBOLS_1_0, SYSTEM_SYMBOLS_1_1}; -use crate::LazyRawValueKind::{Binary_1_0, Binary_1_1, Text_1_0, Text_1_1}; use crate::{try_next, Encoding, IonResult, IonType, RawStreamItem, RawSymbolRef}; +use std::fmt::Debug; +use std::ops::Range; /// An implementation of the `LazyDecoder` trait that can read any encoding of Ion. #[derive(Debug, Clone, Copy)] @@ -482,32 +475,32 @@ pub enum RawReaderKind<'data> { impl<'data> RawReaderKind<'data> { fn resume_at_offset( - data: &'data [u8], - stream_offset: usize, - encoding_hint: IonEncoding, + context: EncodingContextRef<'data>, + saved_state: RawReaderState<'data>, ) -> RawReaderKind<'data> { use IonEncoding::*; - match encoding_hint { - Text_1_0 => RawReaderKind::Text_1_0(LazyRawTextReader_1_0::resume_at_offset( - data, - stream_offset, - encoding_hint, - )), - Binary_1_0 => RawReaderKind::Binary_1_0(LazyRawBinaryReader_1_0::resume_at_offset( - data, - stream_offset, - encoding_hint, - )), - Text_1_1 => RawReaderKind::Text_1_1(LazyRawTextReader_1_1::resume_at_offset( - data, - stream_offset, - encoding_hint, - )), - Binary_1_1 => RawReaderKind::Binary_1_1(LazyRawBinaryReader_1_1::resume_at_offset( - data, - stream_offset, - encoding_hint, - )), + match saved_state.encoding() { + Text_1_0 => { + RawReaderKind::Text_1_0(LazyRawTextReader_1_0::resume(context, saved_state)) + } + Binary_1_0 => { + RawReaderKind::Binary_1_0(LazyRawBinaryReader_1_0::resume(context, 
saved_state)) + } + Text_1_1 => { + RawReaderKind::Text_1_1(LazyRawTextReader_1_1::resume(context, saved_state)) + } + Binary_1_1 => { + RawReaderKind::Binary_1_1(LazyRawBinaryReader_1_1::resume(context, saved_state)) + } + } + } + + fn context(&self) -> EncodingContextRef<'data> { + match self { + RawReaderKind::Text_1_0(r) => r.context(), + RawReaderKind::Binary_1_0(r) => r.context(), + RawReaderKind::Text_1_1(r) => r.context(), + RawReaderKind::Binary_1_1(r) => r.context(), } } } @@ -604,29 +597,28 @@ impl<'data> From> for LazyRawAnyReader<'data> { } impl<'data> LazyRawReader<'data, AnyEncoding> for LazyRawAnyReader<'data> { - fn new(data: &'data [u8]) -> Self { - Self::resume_at_offset(data, 0, IonEncoding::default()) + fn new(context: EncodingContextRef<'data>, data: &'data [u8], is_final_data: bool) -> Self { + let encoding = Self::detect_encoding(data); + let state = RawReaderState::new(data, 0, is_final_data, encoding); + LazyRawAnyReader { + new_encoding: None, + encoding_reader: RawReaderKind::resume_at_offset(context, state), + } } - fn resume_at_offset(data: &'data [u8], offset: usize, mut encoding_hint: IonEncoding) -> Self { + fn resume(context: EncodingContextRef<'data>, mut saved_state: RawReaderState<'data>) -> Self { + let offset = saved_state.offset(); + let data = saved_state.data(); if offset == 0 { - // If we're at the beginning of the stream, the provided `encoding_hint` may be a + // If we're at the beginning of the stream, the saved state's encoding may be a // default. We need to inspect the bytes to see if we should override it. 
- encoding_hint = Self::detect_encoding(data); + saved_state.set_encoding(Self::detect_encoding(data)); } - match encoding_hint { - IonEncoding::Text_1_0 => { - LazyRawTextReader_1_0::resume_at_offset(data, offset, encoding_hint).into() - } - IonEncoding::Binary_1_0 => { - LazyRawBinaryReader_1_0::resume_at_offset(data, offset, encoding_hint).into() - } - IonEncoding::Text_1_1 => { - LazyRawTextReader_1_1::resume_at_offset(data, offset, encoding_hint).into() - } - IonEncoding::Binary_1_1 => { - LazyRawBinaryReader_1_1::resume_at_offset(data, offset, encoding_hint).into() - } + match saved_state.encoding() { + IonEncoding::Text_1_0 => LazyRawTextReader_1_0::resume(context, saved_state).into(), + IonEncoding::Binary_1_0 => LazyRawBinaryReader_1_0::resume(context, saved_state).into(), + IonEncoding::Text_1_1 => LazyRawTextReader_1_1::resume(context, saved_state).into(), + IonEncoding::Binary_1_1 => LazyRawBinaryReader_1_1::resume(context, saved_state).into(), } } @@ -641,36 +633,33 @@ impl<'data> LazyRawReader<'data, AnyEncoding> for LazyRawAnyReader<'data> { // If we hit an IVM that changed the encoding but we haven't changed our reader yet, // we still want to report the new encoding. if let Some(new_encoding) = self.new_encoding { - return RawReaderState::new(reader_state.data(), reader_state.offset(), new_encoding); + return RawReaderState::new( + reader_state.data(), + reader_state.offset(), + reader_state.is_final_data(), + new_encoding, + ); } reader_state } - fn next<'top>( - &'top mut self, - context: EncodingContextRef<'top>, - ) -> IonResult> - where - 'data: 'top, - { + fn next(&mut self) -> IonResult> { // If we previously ran into an IVM that changed the stream encoding, replace our reader // with one that can read the new encoding. 
if let Some(new_encoding) = self.new_encoding.take() { - let reader_state = self.save_state(); - let new_encoding_reader = RawReaderKind::resume_at_offset( - reader_state.data(), - reader_state.offset(), - new_encoding, - ); + let mut reader_state = self.save_state(); + reader_state.set_encoding(new_encoding); + let new_encoding_reader = + RawReaderKind::resume_at_offset(self.encoding_reader.context(), reader_state); self.encoding_reader = new_encoding_reader; } use RawReaderKind::*; let item: LazyRawStreamItem<'_, AnyEncoding> = match &mut self.encoding_reader { - Text_1_0(r) => r.next(context)?.into(), + Text_1_0(r) => r.next()?.into(), Binary_1_0(r) => r.next()?.into(), - Text_1_1(r) => r.next(context)?.into(), - Binary_1_1(r) => r.next(context)?.into(), + Text_1_1(r) => r.next()?.into(), + Binary_1_1(r) => r.next()?.into(), }; // If this item is an IVM: @@ -1048,6 +1037,7 @@ impl<'top> LazyRawValue<'top, AnyEncoding> for LazyRawAnyValue<'top> { } fn is_delimited(&self) -> bool { + use LazyRawValueKind::*; match &self.encoding { Text_1_0(v) => v.is_delimited(), Binary_1_0(v) => v.is_delimited(), @@ -1166,25 +1156,26 @@ impl<'top> LazyRawAnyList<'top> { #[derive(Debug, Copy, Clone)] pub enum LazyRawListKind<'top> { - Text_1_0(LazyRawTextList_1_0<'top>), + Text_1_0(RawTextList<'top, TextEncoding_1_0>), Binary_1_0(LazyRawBinaryList_1_0<'top>), - Text_1_1(LazyRawTextList_1_1<'top>), + Text_1_1(RawTextList<'top, TextEncoding_1_1>), Binary_1_1(LazyRawBinaryList_1_1<'top>), } impl<'top> LazyContainerPrivate<'top, AnyEncoding> for LazyRawAnyList<'top> { fn from_value(value: LazyRawAnyValue<'top>) -> Self { + use LazyRawValueKind::*; match value.encoding { - LazyRawValueKind::Text_1_0(v) => LazyRawAnyList { - encoding: LazyRawListKind::Text_1_0(LazyRawTextList_1_0::from_value(v)), + Text_1_0(v) => LazyRawAnyList { + encoding: LazyRawListKind::Text_1_0(RawTextList::from_value(v)), }, - LazyRawValueKind::Binary_1_0(v) => LazyRawAnyList { + Binary_1_0(v) => LazyRawAnyList { 
encoding: LazyRawListKind::Binary_1_0(LazyRawBinaryList_1_0::from_value(v)), }, - LazyRawValueKind::Text_1_1(v) => LazyRawAnyList { - encoding: LazyRawListKind::Text_1_1(LazyRawTextList_1_1::from_value(v)), + Text_1_1(v) => LazyRawAnyList { + encoding: LazyRawListKind::Text_1_1(RawTextList::from_value(v)), }, - LazyRawValueKind::Binary_1_1(v) => LazyRawAnyList { + Binary_1_1(v) => LazyRawAnyList { encoding: LazyRawListKind::Binary_1_1(LazyRawBinaryList_1_1::from_value(v)), }, } @@ -1198,9 +1189,9 @@ pub struct RawAnyListIterator<'data> { #[derive(Debug, Copy, Clone)] pub enum RawAnyListIteratorKind<'data> { - Text_1_0(RawTextListIterator_1_0<'data>), + Text_1_0(RawTextSequenceCacheIterator<'data, TextEncoding_1_0>), Binary_1_0(RawBinarySequenceIterator_1_0<'data>), - Text_1_1(RawTextSequenceCacheIterator_1_1<'data>), + Text_1_1(RawTextSequenceCacheIterator<'data, TextEncoding_1_1>), Binary_1_1(RawBinarySequenceIterator_1_1<'data>), } @@ -1270,8 +1261,8 @@ impl<'top> LazyRawSequence<'top, AnyEncoding> for LazyRawAnyList<'top> { } } -impl<'data> From> for LazyRawAnyList<'data> { - fn from(value: LazyRawTextList_1_0<'data>) -> Self { +impl<'data> From> for LazyRawAnyList<'data> { + fn from(value: RawTextList<'data, TextEncoding_1_0>) -> Self { LazyRawAnyList { encoding: LazyRawListKind::Text_1_0(value), } @@ -1286,8 +1277,8 @@ impl<'data> From> for LazyRawAnyList<'data> { } } -impl<'data> From> for LazyRawAnyList<'data> { - fn from(value: LazyRawTextList_1_1<'data>) -> Self { +impl<'data> From> for LazyRawAnyList<'data> { + fn from(value: RawTextList<'data, TextEncoding_1_1>) -> Self { LazyRawAnyList { encoding: LazyRawListKind::Text_1_1(value), } @@ -1317,9 +1308,9 @@ impl<'top> LazyRawAnySExp<'top> { #[derive(Debug, Copy, Clone)] pub enum LazyRawSExpKind<'data> { - Text_1_0(LazyRawTextSExp_1_0<'data>), + Text_1_0(RawTextSExp<'data, TextEncoding_1_0>), Binary_1_0(LazyRawBinarySExp_1_0<'data>), - Text_1_1(LazyRawTextSExp_1_1<'data>), + Text_1_1(RawTextSExp<'data, 
TextEncoding_1_1>), Binary_1_1(LazyRawBinarySExp_1_1<'data>), } @@ -1339,13 +1330,13 @@ impl<'data> LazyContainerPrivate<'data, AnyEncoding> for LazyRawAnySExp<'data> { fn from_value(value: LazyRawAnyValue<'data>) -> Self { match value.encoding { LazyRawValueKind::Text_1_0(v) => LazyRawAnySExp { - encoding: LazyRawSExpKind::Text_1_0(LazyRawTextSExp_1_0::from_value(v)), + encoding: LazyRawSExpKind::Text_1_0(RawTextSExp::from_value(v)), }, LazyRawValueKind::Binary_1_0(v) => LazyRawAnySExp { encoding: LazyRawSExpKind::Binary_1_0(LazyRawBinarySExp_1_0::from_value(v)), }, LazyRawValueKind::Text_1_1(v) => LazyRawAnySExp { - encoding: LazyRawSExpKind::Text_1_1(LazyRawTextSExp_1_1::from_value(v)), + encoding: LazyRawSExpKind::Text_1_1(RawTextSExp::from_value(v)), }, LazyRawValueKind::Binary_1_1(v) => LazyRawAnySExp { encoding: LazyRawSExpKind::Binary_1_1(LazyRawBinarySExp_1_1::from_value(v)), @@ -1361,9 +1352,9 @@ pub struct RawAnySExpIterator<'data> { #[derive(Debug, Copy, Clone)] pub enum RawAnySExpIteratorKind<'data> { - Text_1_0(RawTextSExpIterator_1_0<'data>), + Text_1_0(RawTextSequenceCacheIterator<'data, TextEncoding_1_0>), Binary_1_0(RawBinarySequenceIterator_1_0<'data>), - Text_1_1(RawTextSequenceCacheIterator_1_1<'data>), + Text_1_1(RawTextSequenceCacheIterator<'data, TextEncoding_1_1>), Binary_1_1(RawBinarySequenceIterator_1_1<'data>), } @@ -1422,8 +1413,8 @@ impl<'top> LazyRawSequence<'top, AnyEncoding> for LazyRawAnySExp<'top> { } } -impl<'data> From> for LazyRawAnySExp<'data> { - fn from(value: LazyRawTextSExp_1_0<'data>) -> Self { +impl<'data> From> for LazyRawAnySExp<'data> { + fn from(value: RawTextSExp<'data, TextEncoding_1_0>) -> Self { LazyRawAnySExp { encoding: LazyRawSExpKind::Text_1_0(value), } @@ -1438,8 +1429,8 @@ impl<'data> From> for LazyRawAnySExp<'data> { } } -impl<'data> From> for LazyRawAnySExp<'data> { - fn from(value: LazyRawTextSExp_1_1<'data>) -> Self { +impl<'data> From> for LazyRawAnySExp<'data> { + fn from(value: RawTextSExp<'data, 
TextEncoding_1_1>) -> Self { LazyRawAnySExp { encoding: LazyRawSExpKind::Text_1_1(value), } @@ -1463,9 +1454,9 @@ pub struct LazyRawAnyStruct<'data> { #[derive(Debug, Copy, Clone)] pub enum LazyRawStructKind<'data> { - Text_1_0(LazyRawTextStruct_1_0<'data>), + Text_1_0(LazyRawTextStruct<'data, TextEncoding_1_0>), Binary_1_0(LazyRawBinaryStruct_1_0<'data>), - Text_1_1(LazyRawTextStruct_1_1<'data>), + Text_1_1(LazyRawTextStruct<'data, TextEncoding_1_1>), Binary_1_1(LazyRawBinaryStruct_1_1<'data>), } @@ -1487,9 +1478,9 @@ pub struct LazyRawAnyFieldName<'data> { #[derive(Debug, Copy, Clone)] pub enum LazyRawFieldNameKind<'data> { - Text_1_0(LazyRawTextFieldName_1_0<'data>), + Text_1_0(LazyRawTextFieldName<'data, TextEncoding_1_0>), Binary_1_0(LazyRawBinaryFieldName_1_0<'data>), - Text_1_1(LazyRawTextFieldName_1_1<'data>), + Text_1_1(LazyRawTextFieldName<'data, TextEncoding_1_1>), Binary_1_1(LazyRawBinaryFieldName_1_1<'data>), } @@ -1535,14 +1526,14 @@ impl<'top> From> for LazyRawAnyFieldName<'top> { } } -impl<'top> From> for LazyRawAnyFieldName<'top> { - fn from(value: LazyRawTextFieldName_1_0<'top>) -> Self { +impl<'top> From> for LazyRawAnyFieldName<'top> { + fn from(value: LazyRawTextFieldName<'top, TextEncoding_1_0>) -> Self { LazyRawFieldNameKind::Text_1_0(value).into() } } -impl<'top> From> for LazyRawAnyFieldName<'top> { - fn from(value: LazyRawTextFieldName_1_1<'top>) -> Self { +impl<'top> From> for LazyRawAnyFieldName<'top> { + fn from(value: LazyRawTextFieldName<'top, TextEncoding_1_1>) -> Self { LazyRawFieldNameKind::Text_1_1(value).into() } } @@ -1566,9 +1557,9 @@ pub struct RawAnyStructIterator<'data> { #[derive(Debug, Copy, Clone)] pub enum RawAnyStructIteratorKind<'data> { - Text_1_0(RawTextStructIterator_1_0<'data>), + Text_1_0(RawTextStructCacheIterator<'data, TextEncoding_1_0>), Binary_1_0(RawBinaryStructIterator_1_0<'data>), - Text_1_1(RawTextStructCacheIterator_1_1<'data>), + Text_1_1(RawTextStructCacheIterator<'data, TextEncoding_1_1>), 
Binary_1_1(RawBinaryStructIterator_1_1<'data>), } @@ -1649,13 +1640,17 @@ impl<'data> LazyContainerPrivate<'data, AnyEncoding> for LazyRawAnyStruct<'data> fn from_value(value: LazyRawAnyValue<'data>) -> Self { match value.encoding { LazyRawValueKind::Text_1_0(v) => LazyRawAnyStruct { - encoding: LazyRawStructKind::Text_1_0(LazyRawTextStruct_1_0::from_value(v)), + encoding: LazyRawStructKind::Text_1_0( + LazyRawTextStruct::::from_value(v), + ), }, LazyRawValueKind::Binary_1_0(v) => LazyRawAnyStruct { encoding: LazyRawStructKind::Binary_1_0(LazyRawBinaryStruct_1_0::from_value(v)), }, LazyRawValueKind::Text_1_1(v) => LazyRawAnyStruct { - encoding: LazyRawStructKind::Text_1_1(LazyRawTextStruct_1_1::from_value(v)), + encoding: LazyRawStructKind::Text_1_1( + LazyRawTextStruct::::from_value(v), + ), }, LazyRawValueKind::Binary_1_1(v) => LazyRawAnyStruct { encoding: LazyRawStructKind::Binary_1_1(LazyRawBinaryStruct_1_1::from_value(v)), @@ -1702,8 +1697,8 @@ impl<'top> LazyRawStruct<'top, AnyEncoding> for LazyRawAnyStruct<'top> { } } -impl<'data> From> for LazyRawAnyStruct<'data> { - fn from(value: LazyRawTextStruct_1_0<'data>) -> Self { +impl<'data> From> for LazyRawAnyStruct<'data> { + fn from(value: LazyRawTextStruct<'data, TextEncoding_1_0>) -> Self { LazyRawAnyStruct { encoding: LazyRawStructKind::Text_1_0(value), } @@ -1718,8 +1713,8 @@ impl<'data> From> for LazyRawAnyStruct<'data> { } } -impl<'data> From> for LazyRawAnyStruct<'data> { - fn from(value: LazyRawTextStruct_1_1<'data>) -> Self { +impl<'data> From> for LazyRawAnyStruct<'data> { + fn from(value: LazyRawTextStruct<'data, TextEncoding_1_1>) -> Self { LazyRawAnyStruct { encoding: LazyRawStructKind::Text_1_1(value), } @@ -1761,44 +1756,34 @@ mod tests { let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); - let mut reader = LazyRawAnyReader::new(data); - assert_eq!(reader.next(context)?.expect_ivm()?.major_minor(), (1, 0)); - let _strukt = reader - .next(context)? 
- .expect_value()? - .read()? - .expect_struct()?; - let name = reader.next(context)?.expect_value()?; + let mut reader = LazyRawAnyReader::new(context, data, true); + assert_eq!(reader.next()?.expect_ivm()?.major_minor(), (1, 0)); + let _strukt = reader.next()?.expect_value()?.read()?.expect_struct()?; + let name = reader.next()?.expect_value()?; assert_eq!( name.annotations().next().unwrap()?, RawSymbolRef::SymbolId(4) ); assert_eq!(name.read()?.expect_string()?.text(), "Gary"); assert_eq!( - reader.next(context)?.expect_value()?.read()?, + reader.next()?.expect_value()?.read()?, RawValueRef::String("foo".into()) ); assert_eq!( - reader.next(context)?.expect_value()?.read()?, + reader.next()?.expect_value()?.read()?, RawValueRef::Int(5.into()) ); assert_eq!( - reader.next(context)?.expect_value()?.read()?, + reader.next()?.expect_value()?.read()?, RawValueRef::Timestamp(Timestamp::with_year(2023).with_month(8).build()?) ); assert_eq!( - reader.next(context)?.expect_value()?.read()?, + reader.next()?.expect_value()?.read()?, RawValueRef::Bool(false) ); let mut sum = 0; - for lazy_value_result in reader - .next(context)? - .expect_value()? - .read()? - .expect_list()? - .iter() - { + for lazy_value_result in reader.next()?.expect_value()?.read()?.expect_list()?.iter() { sum += lazy_value_result?.expect_value()?.read()?.expect_i64()?; } assert_eq!(sum, 6); @@ -1807,7 +1792,7 @@ mod tests { // local symbol table and the raw reader interprets that as a different value. 
assert!(matches!( - reader.next(context)?, + reader.next()?, LazyRawStreamItem::::EndOfStream(_) )); Ok(()) @@ -1832,7 +1817,6 @@ mod tests { } fn expect_version_change( - context_ref: EncodingContextRef<'_>, reader: &mut LazyRawAnyReader<'_>, encoding_before: IonEncoding, encoding_after: IonEncoding, @@ -1840,7 +1824,7 @@ mod tests { // The reader is using the expected encoding before we hit the IVM assert_eq!(reader.encoding(), encoding_before); // The next item is an IVM - let ivm = reader.next(context_ref)?.expect_ivm()?; + let ivm = reader.next()?.expect_ivm()?; // The IVM correctly reports the expected before/after encodings assert_eq!(ivm.stream_encoding_before_marker(), encoding_before); assert_eq!(ivm.stream_encoding_after_marker()?, encoding_after); @@ -1850,12 +1834,11 @@ mod tests { } fn expect_int( - context_ref: EncodingContextRef<'_>, reader: &mut LazyRawAnyReader<'_>, expected_encoding: IonEncoding, expected_int: i64, ) -> IonResult<()> { - let value = reader.next(context_ref)?.expect_value()?; + let value = reader.next()?.expect_value()?; let actual_int = value.read()?.expect_i64()?; assert_eq!(actual_int, expected_int); assert_eq!(reader.encoding(), expected_encoding); @@ -1876,58 +1859,37 @@ mod tests { 5 "#; - let mut reader = LazyRawAnyReader::new(DATA.as_bytes()); let encoding_context = EncodingContext::empty(); - let context_ref = encoding_context.get_ref(); + let mut reader = LazyRawAnyReader::new(encoding_context.get_ref(), DATA.as_bytes(), true); - expect_int(context_ref, &mut reader, IonEncoding::Text_1_0, 1)?; + expect_int(&mut reader, IonEncoding::Text_1_0, 1)?; // This IVM doesn't change the encoding. 
- expect_version_change( - context_ref, - &mut reader, - IonEncoding::Text_1_0, - IonEncoding::Text_1_0, - )?; + expect_version_change(&mut reader, IonEncoding::Text_1_0, IonEncoding::Text_1_0)?; - expect_int(context_ref, &mut reader, IonEncoding::Text_1_0, 2)?; + expect_int(&mut reader, IonEncoding::Text_1_0, 2)?; if cfg!(not(feature = "experimental-ion-1-1")) { reader - .next(context_ref) + .next() .expect_err("Ion 1.1 IVM should return an error."); return Ok(()); } // This IVM changes the encoding from 1.0 text to 1.1 text - expect_version_change( - context_ref, - &mut reader, - IonEncoding::Text_1_0, - IonEncoding::Text_1_1, - )?; + expect_version_change(&mut reader, IonEncoding::Text_1_0, IonEncoding::Text_1_1)?; - expect_int(context_ref, &mut reader, IonEncoding::Text_1_1, 3)?; + expect_int(&mut reader, IonEncoding::Text_1_1, 3)?; // This IVM doesn't change the encoding. - expect_version_change( - context_ref, - &mut reader, - IonEncoding::Text_1_1, - IonEncoding::Text_1_1, - )?; + expect_version_change(&mut reader, IonEncoding::Text_1_1, IonEncoding::Text_1_1)?; - expect_int(context_ref, &mut reader, IonEncoding::Text_1_1, 4)?; + expect_int(&mut reader, IonEncoding::Text_1_1, 4)?; // This IVM changes the encoding from 1.1 text to 1.0 text - expect_version_change( - context_ref, - &mut reader, - IonEncoding::Text_1_1, - IonEncoding::Text_1_0, - )?; + expect_version_change(&mut reader, IonEncoding::Text_1_1, IonEncoding::Text_1_0)?; - expect_int(context_ref, &mut reader, IonEncoding::Text_1_0, 5)?; + expect_int(&mut reader, IonEncoding::Text_1_0, 5)?; Ok(()) } @@ -1945,58 +1907,53 @@ mod tests { 0x21, 0x05, // 5 ]; - let mut reader = LazyRawAnyReader::new(DATA); let encoding_context = EncodingContext::empty(); - let context_ref = encoding_context.get_ref(); + let mut reader = LazyRawAnyReader::new(encoding_context.get_ref(), DATA, true); // When the reader is constructed it peeks at the leading bytes to see if they're an IVM. 
// In this case, they were a binary Ion v1.0 IVM, so the reader is already expecting to see // binary 1.0 data. Reading the binary version marker tells the reader to switch encodings. expect_version_change( - context_ref, &mut reader, IonEncoding::Binary_1_0, IonEncoding::Binary_1_0, )?; - expect_int(context_ref, &mut reader, IonEncoding::Binary_1_0, 2)?; + expect_int(&mut reader, IonEncoding::Binary_1_0, 2)?; if cfg!(not(feature = "experimental-ion-1-1")) { reader - .next(context_ref) + .next() .expect_err("Ion 1.1 IVM should return an error."); return Ok(()); } // This IVM changes the encoding from 1.0 binary to 1.1 binary expect_version_change( - context_ref, &mut reader, IonEncoding::Binary_1_0, IonEncoding::Binary_1_1, )?; - expect_int(context_ref, &mut reader, IonEncoding::Binary_1_1, 3)?; + expect_int(&mut reader, IonEncoding::Binary_1_1, 3)?; // This IVM doesn't change the encoding. expect_version_change( - context_ref, &mut reader, IonEncoding::Binary_1_1, IonEncoding::Binary_1_1, )?; - expect_int(context_ref, &mut reader, IonEncoding::Binary_1_1, 4)?; + expect_int(&mut reader, IonEncoding::Binary_1_1, 4)?; // This IVM changes the encoding from 1.1 binary to 1.0 binary expect_version_change( - context_ref, &mut reader, IonEncoding::Binary_1_1, IonEncoding::Binary_1_0, )?; - expect_int(context_ref, &mut reader, IonEncoding::Binary_1_0, 5)?; + expect_int(&mut reader, IonEncoding::Binary_1_0, 5)?; Ok(()) } diff --git a/src/lazy/binary/raw/reader.rs b/src/lazy/binary/raw/reader.rs index fe17d280..e67402a1 100644 --- a/src/lazy/binary/raw/reader.rs +++ b/src/lazy/binary/raw/reader.rs @@ -15,22 +15,30 @@ use crate::lazy::streaming_raw_reader::RawReaderState; /// A binary Ion 1.0 reader that yields [`LazyRawBinaryValue_1_0`]s representing the top level values found /// in the provided input stream. 
pub struct LazyRawBinaryReader_1_0<'data> { + context: EncodingContextRef<'data>, data: DataSource<'data>, } impl<'data> LazyRawBinaryReader_1_0<'data> { /// Constructs a `LazyRawReader` positioned at the beginning of the provided input stream. - pub fn new(data: &'data [u8]) -> LazyRawBinaryReader_1_0<'data> { - Self::new_with_offset(data, 0) + pub fn new( + context: EncodingContextRef<'data>, + data: &'data [u8], + ) -> LazyRawBinaryReader_1_0<'data> { + Self::new_with_offset(context, data, 0) } /// Constructs a `LazyRawReader` positioned at the beginning of the provided input stream. /// The provided input stream is itself a slice starting `offset` bytes from the beginning /// of a larger data stream. This offset is used for reporting the absolute (stream-level) /// position of values encountered in `data`. - fn new_with_offset(data: &'data [u8], offset: usize) -> LazyRawBinaryReader_1_0<'data> { + fn new_with_offset( + context: EncodingContextRef<'data>, + data: &'data [u8], + offset: usize, + ) -> LazyRawBinaryReader_1_0<'data> { let data = DataSource::new(BinaryBuffer::new_with_offset(data, offset)); - Self { data } + Self { context, data } } /// Helper method called by [`Self::next`]. Reads the current stream item as an Ion version @@ -70,10 +78,8 @@ impl<'data> LazyRawBinaryReader_1_0<'data> { Ok(RawStreamItem::Value(lazy_value)) } - pub fn next<'top>(&'top mut self) -> IonResult> - where - 'data: 'top, - { + #[allow(clippy::should_implement_trait)] + pub fn next(&mut self) -> IonResult> { // Get a new buffer view starting beyond the last item we returned. 
let mut buffer = self.data.advance_to_next_item()?; if buffer.is_empty() { @@ -102,18 +108,25 @@ impl<'data> LazyRawBinaryReader_1_0<'data> { self.read_value(buffer) } + + pub fn context(&self) -> EncodingContextRef<'data> { + self.context + } } impl<'data> LazyRawReader<'data, BinaryEncoding_1_0> for LazyRawBinaryReader_1_0<'data> { - fn resume_at_offset( - data: &'data [u8], - offset: usize, - // This argument is ignored by all raw readers except LazyRawAnyReader - _encoding_hint: IonEncoding, - ) -> Self { + fn new(context: EncodingContextRef<'data>, data: &'data [u8], is_final_data: bool) -> Self { + Self::resume( + context, + RawReaderState::new(data, 0, is_final_data, IonEncoding::Binary_1_0), + ) + } + + fn resume(context: EncodingContextRef<'data>, saved_state: RawReaderState<'data>) -> Self { LazyRawBinaryReader_1_0 { + context, data: DataSource { - buffer: BinaryBuffer::new_with_offset(data, offset), + buffer: BinaryBuffer::new_with_offset(saved_state.data(), saved_state.offset()), bytes_to_skip: 0, }, } @@ -124,17 +137,14 @@ impl<'data> LazyRawReader<'data, BinaryEncoding_1_0> for LazyRawBinaryReader_1_0 RawReaderState::new( &self.data.buffer.bytes()[self.data.bytes_to_skip..], stream_offset, + // The binary readers do not care whether the data is final because they can detect + // incomplete values in any case. They always report `false` for simplicity. 
+ false, IonEncoding::Binary_1_0, ) } - fn next<'top>( - &'top mut self, - _context: EncodingContextRef<'top>, - ) -> IonResult> - where - 'data: 'top, - { + fn next(&mut self) -> IonResult> { self.next() } @@ -244,7 +254,7 @@ mod tests { use crate::lazy::decoder::{LazyRawFieldName, RawVersionMarker}; use crate::lazy::raw_stream_item::RawStreamItem; use crate::raw_symbol_ref::AsRawSymbolRef; - use crate::{IonResult, IonType, RawSymbolRef}; + use crate::{EncodingContext, IonResult, IonType, RawSymbolRef}; #[test] fn test_struct() -> IonResult<()> { @@ -254,7 +264,8 @@ mod tests { {name:"hi", name: "hello"} "#, )?; - let mut reader = LazyRawBinaryReader_1_0::new(data); + let context = EncodingContext::empty(); + let mut reader = LazyRawBinaryReader_1_0::new(context.get_ref(), data); let _ivm = reader.next()?.expect_ivm()?; let value = reader.next()?.expect_value()?; let lazy_struct = value.read()?.expect_struct()?; @@ -272,7 +283,8 @@ mod tests { [1, true, foo] "#, )?; - let mut reader = LazyRawBinaryReader_1_0::new(data); + let context = EncodingContext::empty(); + let mut reader = LazyRawBinaryReader_1_0::new(context.get_ref(), data); let _ivm = reader.next()?.expect_ivm()?; let _symbol_table = reader.next()?.expect_value()?; let lazy_list = reader.next()?.expect_value()?.read()?.expect_list()?; @@ -316,7 +328,8 @@ mod tests { {name:"hi", name: "hello"} "#, )?; - let mut reader = LazyRawBinaryReader_1_0::new(data); + let context = EncodingContext::empty(); + let mut reader = LazyRawBinaryReader_1_0::new(context.get_ref(), data); loop { use RawStreamItem::*; match reader.next()? 
{ @@ -339,7 +352,8 @@ mod tests { foo::bar::baz::7 "#, )?; - let mut reader = LazyRawBinaryReader_1_0::new(data); + let context = EncodingContext::empty(); + let mut reader = LazyRawBinaryReader_1_0::new(context.get_ref(), data); let _ivm = reader.next()?.expect_ivm()?; // Read annotations from $ion_symbol_table::{...} @@ -374,7 +388,8 @@ mod tests { 0x0f, // null ]; - let mut reader = LazyRawBinaryReader_1_0::new(&data); + let context = EncodingContext::empty(); + let mut reader = LazyRawBinaryReader_1_0::new(context.get_ref(), &data); let _ivm = reader.next()?.expect_ivm()?; assert_eq!( @@ -396,7 +411,8 @@ mod tests { 0x0f, // null ]; - let mut reader = LazyRawBinaryReader_1_0::new(&data); + let context = EncodingContext::empty(); + let mut reader = LazyRawBinaryReader_1_0::new(context.get_ref(), &data); let _ivm = reader.next()?.expect_ivm()?; let _ivm = reader.next()?.expect_ivm()?; diff --git a/src/lazy/binary/raw/struct.rs b/src/lazy/binary/raw/struct.rs index da81d6b8..703fa314 100644 --- a/src/lazy/binary/raw/struct.rs +++ b/src/lazy/binary/raw/struct.rs @@ -146,7 +146,7 @@ mod tests { use std::ops::Range; use crate::lazy::binary::raw::reader::LazyRawBinaryReader_1_0; - use crate::IonResult; + use crate::{EncodingContext, IonResult}; use super::*; @@ -164,8 +164,9 @@ mod tests { &[(RawSymbolRef::SymbolId(4), 1..2)], ), ]; + let context = EncodingContext::empty(); for (input, field_name_ranges) in tests { - let mut reader = LazyRawBinaryReader_1_0::new(input); + let mut reader = LazyRawBinaryReader_1_0::new(context.get_ref(), input); let struct_ = reader.next()?.expect_value()?.read()?.expect_struct()?; for (field_result, (expected_name, range)) in struct_.iter().zip(field_name_ranges.iter()) diff --git a/src/lazy/binary/raw/v1_1/reader.rs b/src/lazy/binary/raw/v1_1/reader.rs index a69ec37c..339fc11c 100644 --- a/src/lazy/binary/raw/v1_1/reader.rs +++ b/src/lazy/binary/raw/v1_1/reader.rs @@ -2,7 +2,6 @@ use crate::lazy::any_encoding::IonEncoding; use 
crate::lazy::binary::raw::v1_1::immutable_buffer::{BinaryBuffer, ParseResult}; -use crate::lazy::binary::raw::v1_1::ION_1_1_OPCODES; use crate::lazy::decoder::{LazyRawReader, RawValueExpr}; use crate::lazy::encoder::private::Sealed; use crate::lazy::encoding::BinaryEncoding_1_1; @@ -12,82 +11,68 @@ use crate::lazy::streaming_raw_reader::RawReaderState; use crate::{Encoding, IonResult}; pub struct LazyRawBinaryReader_1_1<'data> { - input: &'data [u8], - // The offset from the beginning of the overall stream at which the `input` slice begins - stream_offset: usize, - // The offset from the beginning of `input` at which the reader is positioned - local_offset: usize, + input: BinaryBuffer<'data>, } impl<'data> LazyRawBinaryReader_1_1<'data> { - pub fn new(input: &'data [u8]) -> Self { - Self::new_with_offset(input, 0) + pub fn new(context: EncodingContextRef<'data>, input: &'data [u8]) -> Self { + Self::new_with_offset(context, input, 0) } - fn new_with_offset(input: &'data [u8], stream_offset: usize) -> Self { - Self { - input, - stream_offset, - local_offset: 0, - } + fn new_with_offset( + context: EncodingContextRef<'data>, + input: &'data [u8], + stream_offset: usize, + ) -> Self { + let input = BinaryBuffer::new_with_offset(context, input, stream_offset); + Self { input } + } + + pub fn context(&self) -> EncodingContextRef<'data> { + self.input.context() } fn end_of_stream(&self, position: usize) -> LazyRawStreamItem<'data, BinaryEncoding_1_1> { RawStreamItem::EndOfStream(EndPosition::new(BinaryEncoding_1_1.encoding(), position)) } - fn read_ivm<'top>( - &mut self, - buffer: BinaryBuffer<'top>, - ) -> IonResult> + fn read_ivm<'top>(&mut self) -> IonResult> where 'data: 'top, { - let (marker, buffer_after_ivm) = buffer.read_ivm()?; - self.local_offset = buffer_after_ivm.offset() - self.stream_offset; + let (marker, buffer_after_ivm) = self.input.read_ivm()?; + self.input = buffer_after_ivm; Ok(LazyRawStreamItem::::VersionMarker( marker, )) } - fn 
read_value_expr<'top>( - &'top mut self, - buffer: BinaryBuffer<'top>, - ) -> ParseResult<'top, LazyRawStreamItem<'top, BinaryEncoding_1_1>> - where - 'data: 'top, - { - let (maybe_expr, remaining) = buffer.read_sequence_value_expr()?; + fn read_value_expr( + &mut self, + ) -> ParseResult<'data, LazyRawStreamItem<'data, BinaryEncoding_1_1>> { + let (maybe_expr, remaining) = self.input.read_sequence_value_expr()?; let item = match maybe_expr { Some(RawValueExpr::ValueLiteral(lazy_value)) => RawStreamItem::Value(lazy_value), Some(RawValueExpr::EExp(eexpr)) => RawStreamItem::EExp(eexpr), - None => self.end_of_stream(buffer.offset()), + None => self.end_of_stream(self.input.offset()), }; - self.local_offset = remaining.offset() - self.stream_offset; + self.input = remaining; Ok((item, remaining)) } + #[allow(clippy::should_implement_trait)] #[inline(always)] - pub fn next<'top>( - &'top mut self, - context: EncodingContextRef<'top>, - ) -> IonResult> - where - 'data: 'top, - { - let data = &self.input[self.local_offset..]; - let Some(&first_byte) = data.first() else { + pub fn next(&mut self) -> IonResult> { + let Some(mut opcode) = self.input.peek_opcode() else { return Ok(self.end_of_stream(self.position())); }; - let mut buffer = BinaryBuffer::new_with_offset(context, data, self.position()); - let mut opcode = ION_1_1_OPCODES[first_byte as usize]; - if opcode.is_nop() && !buffer.opcode_after_nop(&mut opcode)? { - return Ok(self.end_of_stream(buffer.offset())); + if opcode.is_nop() && !self.input.opcode_after_nop(&mut opcode)? 
{ + return Ok(self.end_of_stream(self.input.offset())); } if opcode.is_ivm_start() { - return self.read_ivm(buffer); + return self.read_ivm(); } - let (item, _remaining) = self.read_value_expr(buffer)?; + let (item, _remaining) = self.read_value_expr()?; Ok(item) } } @@ -95,39 +80,33 @@ impl<'data> LazyRawBinaryReader_1_1<'data> { impl Sealed for LazyRawBinaryReader_1_1<'_> {} impl<'data> LazyRawReader<'data, BinaryEncoding_1_1> for LazyRawBinaryReader_1_1<'data> { - fn new(data: &'data [u8]) -> Self { - Self::new(data) + fn new(context: EncodingContextRef<'data>, data: &'data [u8], is_final_data: bool) -> Self { + Self::resume( + context, + RawReaderState::new(data, 0, is_final_data, IonEncoding::Binary_1_1), + ) } - fn resume_at_offset( - data: &'data [u8], - offset: usize, - // This argument is ignored by all raw readers except LazyRawAnyReader - _encoding_hint: IonEncoding, - ) -> Self { - Self::new_with_offset(data, offset) + fn resume(context: EncodingContextRef<'data>, saved_state: RawReaderState<'data>) -> Self { + Self::new_with_offset(context, saved_state.data(), saved_state.offset()) } fn save_state(&self) -> RawReaderState<'data> { RawReaderState::new( - &self.input[self.local_offset..], + self.input.bytes(), self.position(), + // The binary reader doesn't care about `is_final`, so we just use `false`. 
+ false, self.encoding(), ) } - fn next<'top>( - &'top mut self, - context: EncodingContextRef<'top>, - ) -> IonResult> - where - 'data: 'top, - { - self.next(context) + fn next(&mut self) -> IonResult> { + self.next() } fn position(&self) -> usize { - self.stream_offset + self.local_offset + self.input.offset() } fn encoding(&self) -> IonEncoding { @@ -158,15 +137,11 @@ mod tests { let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(&data); - let _ivm = reader.next(context)?.expect_ivm()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, &data); + let _ivm = reader.next()?.expect_ivm()?; assert_eq!( - reader - .next(context)? - .expect_value()? - .read()? - .expect_null()?, + reader.next()?.expect_value()?.read()?.expect_null()?, IonType::Null ); @@ -182,22 +157,12 @@ mod tests { ]; let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(&data); - let _ivm = reader.next(context)?.expect_ivm()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, &data); + let _ivm = reader.next()?.expect_ivm()?; - assert!(reader - .next(context)? - .expect_value()? - .read()? - .expect_bool()?); + assert!(reader.next()?.expect_value()?.read()?.expect_bool()?); - assert!( - !(reader - .next(context)? - .expect_value()? - .read()? - .expect_bool()?) 
- ); + assert!(!(reader.next()?.expect_value()?.read()?.expect_bool()?)); Ok(()) } @@ -226,29 +191,29 @@ mod tests { ]; let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(&data); - let _ivm = reader.next(context)?.expect_ivm()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, &data); + let _ivm = reader.next()?.expect_ivm()?; assert_eq!( - reader.next(context)?.expect_value()?.read()?.expect_int()?, + reader.next()?.expect_value()?.read()?.expect_int()?, 0.into() ); assert_eq!( - reader.next(context)?.expect_value()?.read()?.expect_int()?, + reader.next()?.expect_value()?.read()?.expect_int()?, 17.into() ); assert_eq!( - reader.next(context)?.expect_value()?.read()?.expect_int()?, + reader.next()?.expect_value()?.read()?.expect_int()?, (-944).into() ); assert_eq!( - reader.next(context)?.expect_value()?.read()?.expect_int()?, + reader.next()?.expect_value()?.read()?.expect_int()?, 1.into() ); assert_eq!( - reader.next(context)?.expect_value()?.read()?.expect_int()?, + reader.next()?.expect_value()?.read()?.expect_int()?, 147573952589676412929i128.into() ); Ok(()) @@ -277,42 +242,23 @@ mod tests { ]; let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(&data); - let _ivm = reader.next(context)?.expect_ivm()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, &data); + let _ivm = reader.next()?.expect_ivm()?; - assert_eq!( - reader - .next(context)? - .expect_value()? - .read()? - .expect_string()?, - "" - ); + assert_eq!(reader.next()?.expect_value()?.read()?.expect_string()?, ""); assert_eq!( - reader - .next(context)? - .expect_value()? - .read()? - .expect_string()?, + reader.next()?.expect_value()?.read()?.expect_string()?, "hello" ); assert_eq!( - reader - .next(context)? - .expect_value()? - .read()? 
- .expect_string()?, + reader.next()?.expect_value()?.read()?.expect_string()?, "fourteen bytes" ); assert_eq!( - reader - .next(context)? - .expect_value()? - .read()? - .expect_string()?, + reader.next()?.expect_value()?.read()?.expect_string()?, "variable length encoding" ); @@ -354,8 +300,8 @@ mod tests { ]; let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(&data); - let _ivm = reader.next(context)?.expect_ivm()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, &data); + let _ivm = reader.next()?.expect_ivm()?; let expected_symbols: &[RawSymbolRef<'_>] = &[ RawSymbolRef::Text(""), @@ -372,11 +318,7 @@ mod tests { for expected_symbol in expected_symbols { assert_eq!( - reader - .next(context)? - .expect_value()? - .read()? - .expect_symbol()?, + reader.next()?.expect_value()?.read()?.expect_symbol()?, expected_symbol.clone() ); } @@ -405,36 +347,21 @@ mod tests { ]; let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(&data); - let _ivm = reader.next(context)?.expect_ivm()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, &data); + let _ivm = reader.next()?.expect_ivm()?; - assert_eq!( - reader - .next(context)? - .expect_value()? - .read()? - .expect_float()?, - 0.0 - ); + assert_eq!(reader.next()?.expect_value()?.read()?.expect_float()?, 0.0); // TODO: Implement Half-precision. - // assert_eq!(reader.next(context)?.expect_value()?.read()?.expect_float()?, 3.14); + // assert_eq!(reader.next()?.expect_value()?.read()?.expect_float()?, 3.14); assert_eq!( - reader - .next(context)? - .expect_value()? - .read()? - .expect_float()? as f32, + reader.next()?.expect_value()?.read()?.expect_float()? as f32, 3.1415927f32, ); assert_eq!( - reader - .next(context)? - .expect_value()? - .read()? 
- .expect_float()?, + reader.next()?.expect_value()?.read()?.expect_float()?, std::f64::consts::PI, ); @@ -498,17 +425,17 @@ mod tests { let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let mut reader_txt = LazyRawTextReader_1_1::new(expected_txt.as_bytes()); - let mut reader_bin = LazyRawBinaryReader_1_1::new(ion_data); + let mut reader_txt = LazyRawTextReader_1_1::new(context, expected_txt.as_bytes(), true); + let mut reader_bin = LazyRawBinaryReader_1_1::new(context, ion_data); assert_eq!( reader_bin - .next(context)? + .next()? .expect_value()? .read()? .expect_decimal()?, reader_txt - .next(context)? + .next()? .expect_value()? .read()? .expect_decimal()?, @@ -543,11 +470,11 @@ mod tests { let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let mut reader_txt = LazyRawTextReader_1_1::new(expected_txt.as_bytes()); - let mut reader_bin = LazyRawBinaryReader_1_1::new(ion_data); + let mut reader_txt = LazyRawTextReader_1_1::new(context, expected_txt.as_bytes(), true); + let mut reader_bin = LazyRawBinaryReader_1_1::new(context, ion_data); - let expected_value = reader_txt.next(context)?.expect_value()?.read()?; - let actual_value = reader_bin.next(context)?.expect_value()?.read()?; + let expected_value = reader_txt.next()?.expect_value()?.read()?; + let actual_value = reader_bin.next()?.expect_value()?.read()?; assert!(actual_value .expect_decimal()? @@ -577,17 +504,17 @@ mod tests { let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let mut reader_txt = LazyRawTextReader_1_1::new(expected_txt.as_bytes()); - let mut reader_bin = LazyRawBinaryReader_1_1::new(ion_data); + let mut reader_txt = LazyRawTextReader_1_1::new(context, expected_txt.as_bytes(), true); + let mut reader_bin = LazyRawBinaryReader_1_1::new(context, ion_data); assert_eq!( reader_bin - .next(context)? + .next()? .expect_value()? .read()? .expect_timestamp()?, reader_txt - .next(context)? 
+ .next()? .expect_value()? .read()? .expect_timestamp()?, @@ -610,17 +537,17 @@ mod tests { let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let mut reader_txt = LazyRawTextReader_1_1::new(expected_txt.as_bytes()); - let mut reader_bin = LazyRawBinaryReader_1_1::new(ion_data); + let mut reader_txt = LazyRawTextReader_1_1::new(context, expected_txt.as_bytes(), true); + let mut reader_bin = LazyRawBinaryReader_1_1::new(context, ion_data); assert_eq!( reader_bin - .next(context)? + .next()? .expect_value()? .read()? .expect_timestamp()?, reader_txt - .next(context)? + .next()? .expect_value()? .read()? .expect_timestamp()?, @@ -638,21 +565,14 @@ mod tests { let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(&data); - let _ivm = reader.next(context)?.expect_ivm()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, &data); + let _ivm = reader.next()?.expect_ivm()?; let bytes: &[u8] = &[ 0x49, 0x20, 0x61, 0x70, 0x70, 0x6c, 0x61, 0x75, 0x64, 0x20, 0x79, 0x6f, 0x75, 0x72, 0x20, 0x63, 0x75, 0x72, 0x69, 0x6f, 0x73, 0x69, 0x74, 0x79, ]; - assert_eq!( - reader - .next(context)? - .expect_value()? - .read()? - .expect_blob()?, - bytes - ); + assert_eq!(reader.next()?.expect_value()?.read()?.expect_blob()?, bytes); Ok(()) } @@ -667,22 +587,15 @@ mod tests { let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(&data); - let _ivm = reader.next(context)?.expect_ivm()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, &data); + let _ivm = reader.next()?.expect_ivm()?; let bytes: &[u8] = &[ 0x49, 0x20, 0x61, 0x70, 0x70, 0x6c, 0x61, 0x75, 0x64, 0x20, 0x79, 0x6f, 0x75, 0x72, 0x20, 0x63, 0x75, 0x72, 0x69, 0x6f, 0x73, 0x69, 0x74, 0x79, ]; - assert_eq!( - reader - .next(context)? - .expect_value()? - .read()? 
- .expect_clob()?, - bytes - ); + assert_eq!(reader.next()?.expect_value()?.read()?.expect_clob()?, bytes); Ok(()) } @@ -701,12 +614,8 @@ mod tests { let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(ion_data); - let container = reader - .next(context)? - .expect_value()? - .read()? - .expect_list()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, ion_data); + let container = reader.next()?.expect_value()?.read()?.expect_list()?; let mut top_iter = container.iter(); let actual_value = top_iter @@ -812,12 +721,8 @@ mod tests { for (ion_data, expected_types) in tests { let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(ion_data); - let container = reader - .next(context)? - .expect_value()? - .read()? - .expect_list()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, ion_data); + let container = reader.next()?.expect_value()?.read()?.expect_list()?; let mut count = 0; for (actual_lazy_value, expected_type) in container.iter().zip(expected_types.iter()) { let value = actual_lazy_value?.expect_value()?; @@ -881,12 +786,8 @@ mod tests { for (ion_data, expected_types) in tests { let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(ion_data); - let container = reader - .next(context)? - .expect_value()? - .read()? 
- .expect_sexp()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, ion_data); + let container = reader.next()?.expect_value()?.read()?.expect_sexp()?; let mut count = 0; for (actual_lazy_value, expected_type) in container.iter().zip(expected_types.iter()) { let value = actual_lazy_value?.expect_value()?; @@ -920,12 +821,8 @@ mod tests { for (data, expected_type) in data { let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(&data); - let actual_type = reader - .next(context)? - .expect_value()? - .read()? - .expect_null()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, &data); + let actual_type = reader.next()?.expect_value()?.read()?.expect_null()?; assert_eq!(actual_type, expected_type); } Ok(()) @@ -947,12 +844,8 @@ mod tests { let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(ion_data); - let container = reader - .next(context)? - .expect_value()? - .read()? - .expect_struct()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, ion_data); + let container = reader.next()?.expect_value()?.read()?.expect_struct()?; let mut top_iter = container.iter(); @@ -1099,12 +992,8 @@ mod tests { for (ion_data, field_pairs) in tests { let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); - let mut reader = LazyRawBinaryReader_1_1::new(ion_data); - let actual_data = reader - .next(context)? - .expect_value()? - .read()? 
- .expect_struct()?; + let mut reader = LazyRawBinaryReader_1_1::new(context, ion_data); + let actual_data = reader.next()?.expect_value()?.read()?.expect_struct()?; for (actual_field, expected_field) in actual_data.iter().zip(field_pairs.iter()) { let (expected_name, expected_value_type) = expected_field; diff --git a/src/lazy/binary/raw/value.rs b/src/lazy/binary/raw/value.rs index ea8ca02f..062087c2 100644 --- a/src/lazy/binary/raw/value.rs +++ b/src/lazy/binary/raw/value.rs @@ -792,7 +792,7 @@ impl<'top> LazyRawBinaryValue_1_0<'top> { mod tests { use crate::lazy::binary::raw::reader::LazyRawBinaryReader_1_0; use crate::lazy::binary::test_utilities::to_binary_ion; - use crate::IonResult; + use crate::{EncodingContext, IonResult}; #[test] fn annotations_sequence() -> IonResult<()> { @@ -802,7 +802,8 @@ mod tests { foo // binary writer will omit the symtab if we don't use a symbol "#, )?; - let mut reader = LazyRawBinaryReader_1_0::new(data); + let context = EncodingContext::empty(); + let mut reader = LazyRawBinaryReader_1_0::new(context.get_ref(), data); let _ivm = reader.next()?.expect_ivm()?; let value = reader.next()?.expect_value()?; let annotations_sequence = value.annotations_sequence(); diff --git a/src/lazy/decoder.rs b/src/lazy/decoder.rs index acb225d2..74552342 100644 --- a/src/lazy/decoder.rs +++ b/src/lazy/decoder.rs @@ -10,7 +10,7 @@ use crate::lazy::encoding::{ BinaryEncoding, BinaryEncoding_1_0, RawValueLiteral, TextEncoding_1_0, }; use crate::lazy::expanded::macro_evaluator::RawEExpression; -use crate::lazy::expanded::{EncodingContext, EncodingContextRef}; +use crate::lazy::expanded::EncodingContextRef; use crate::lazy::raw_stream_item::LazyRawStreamItem; use crate::lazy::raw_value_ref::RawValueRef; use crate::lazy::span::Span; @@ -410,22 +410,22 @@ pub(crate) mod private { fn from_value(value: D::Value<'top>) -> Self; } - pub trait LazyRawStructPrivate<'top, D: Decoder> { - /// Creates an iterator that converts each raw struct field into an 
`FieldExpr`, a - /// common representation for both raw fields and template fields that is used in the - /// expansion process. - fn field_exprs( - &self, - context: EncodingContextRef<'top>, - ) -> RawStructFieldExprIterator<'top, D>; - } - pub struct RawStructFieldExprIterator<'top, D: Decoder> { context: EncodingContextRef<'top>, raw_fields: as LazyRawStruct<'top, D>>::Iterator, } impl<'top, D: Decoder> RawStructFieldExprIterator<'top, D> { + pub fn new( + context: EncodingContextRef<'top>, + raw_fields: as LazyRawStruct<'top, D>>::Iterator, + ) -> Self { + Self { + context, + raw_fields, + } + } + pub fn context(&self) -> EncodingContextRef<'top> { self.context } @@ -454,31 +454,13 @@ pub(crate) mod private { Some(Ok(unexpanded_field)) } } - - impl<'top, D: Decoder = S>, S> LazyRawStructPrivate<'top, D> for S - where - S: LazyRawStruct<'top, D>, - { - fn field_exprs( - &self, - context: EncodingContextRef<'top>, - ) -> RawStructFieldExprIterator<'top, D> { - let raw_fields = >::iter(self); - RawStructFieldExprIterator { - context, - raw_fields, - } - } - } } pub trait LazyRawReader<'data, D: Decoder>: Sized { /// Constructs a new raw reader using decoder `D` that will read from `data`. /// `data` must be the beginning of the stream. To continue reading from the middle of a - /// stream, see [`resume_at_offset`](Self::resume_at_offset). - fn new(data: &'data [u8]) -> Self { - Self::resume_at_offset(data, 0, IonEncoding::default()) - } + /// stream, see [`resume`](Self::resume). + fn new(context: EncodingContextRef<'data>, data: &'data [u8], is_final_data: bool) -> Self; /// Constructs a new raw reader using decoder `D` that will read from `data`. /// @@ -486,17 +468,12 @@ pub trait LazyRawReader<'data, D: Decoder>: Sized { /// If offset is not zero, the caller must supply an `encoding_hint` indicating the expected /// encoding.
Encoding-specific raw readers will ignore this hint--the stream's encoding must be /// the one that they support--but the `LazyRawAnyReader` will use it. - fn resume_at_offset(data: &'data [u8], offset: usize, encoding_hint: IonEncoding) -> Self; + fn resume(context: EncodingContextRef<'data>, saved_state: RawReaderState<'data>) -> Self; /// Deconstructs this reader, returning a tuple of `(remaining_data, stream_offset, encoding)`. fn save_state(&self) -> RawReaderState<'data>; - fn next<'top>( - &'top mut self, - context: EncodingContextRef<'top>, - ) -> IonResult> - where - 'data: 'top; + fn next(&mut self) -> IonResult>; /// The stream byte offset at which the reader will begin parsing the next item to return. /// This position is not necessarily the first byte of the next value; it may be (e.g.) a NOP, @@ -572,11 +549,9 @@ fn transcribe_raw_binary_to_text< writer: &mut Writer, ) -> IonResult<()> { const FLUSH_EVERY_N: usize = 100; - let encoding_context = EncodingContext::for_ion_version(IonVersion::v1_1); - let context_ref = encoding_context.get_ref(); let mut item_number: usize = 0; loop { - let item = reader.next(context_ref)?; + let item = reader.next()?; use crate::RawStreamItem::*; match item { VersionMarker(_m) if item_number == 0 => { @@ -675,12 +650,7 @@ where } pub trait LazyRawStruct<'top, D: Decoder>: - LazyRawContainer<'top, D> - + private::LazyContainerPrivate<'top, D> - + private::LazyRawStructPrivate<'top, D> - + Debug - + Copy - + Clone + LazyRawContainer<'top, D> + private::LazyContainerPrivate<'top, D> + Debug + Copy + Clone { type Iterator: RawStructIterator<'top, D>; diff --git a/src/lazy/encoder/text/v1_1/writer.rs b/src/lazy/encoder/text/v1_1/writer.rs index fa434aeb..a341cd94 100644 --- a/src/lazy/encoder/text/v1_1/writer.rs +++ b/src/lazy/encoder/text/v1_1/writer.rs @@ -112,7 +112,7 @@ impl LazyRawWriter for LazyRawTextWriter_1_1 { #[cfg(test)] mod tests { use crate::lazy::any_encoding::IonVersion; - use 
crate::lazy::decoder::{LazyRawReader, LazyRawSequence, LazyRawValue}; + use crate::lazy::decoder::{LazyRawReader, LazyRawValue}; use crate::lazy::encoder::text::v1_1::writer::LazyRawTextWriter_1_1; use crate::lazy::encoder::value_writer::{SequenceWriter, StructWriter, ValueWriter}; use crate::lazy::encoder::write_as_ion::WriteAsSExp; @@ -282,14 +282,14 @@ mod tests { let encoded_text = String::from_utf8(encoded_bytes).unwrap(); println!("{encoded_text}"); - let mut reader = LazyRawTextReader_1_1::new(encoded_text.as_bytes()); let mut context = EncodingContext::for_ion_version(IonVersion::v1_1); let macro_foo = TemplateCompiler::compile_from_source(context.get_ref(), "(macro foo (x*) null)")?; context.macro_table.add_template_macro(macro_foo)?; - let context = context.get_ref(); - let _marker = reader.next(context)?.expect_ivm()?; - let eexp = reader.next(context)?.expect_eexp()?; + let mut reader = + LazyRawTextReader_1_1::new(context.get_ref(), encoded_text.as_bytes(), true); + let _marker = reader.next()?.expect_ivm()?; + let eexp = reader.next()?.expect_eexp()?; assert_eq!(MacroIdRef::LocalName("foo"), eexp.id()); let mut args = eexp.raw_arguments(); let x = args.next().unwrap()?.expr().expect_arg_group()?; diff --git a/src/lazy/encoder/write_as_ion.rs b/src/lazy/encoder/write_as_ion.rs index 84bb823b..287f4e3a 100644 --- a/src/lazy/encoder/write_as_ion.rs +++ b/src/lazy/encoder/write_as_ion.rs @@ -302,8 +302,8 @@ impl WriteAsIon for RawValueRef<'_, D> { Timestamp(t) => value_writer.write_timestamp(t), Symbol(s) => value_writer.write_symbol(s), String(s) => value_writer.write_string(s.text()), - Clob(c) => value_writer.write_clob(c.as_ref()), - Blob(b) => value_writer.write_blob(b.as_ref()), + Clob(c) => value_writer.write_clob(c.data()), + Blob(b) => value_writer.write_blob(b.data()), List(l) => { let mut list_writer = value_writer.list_writer()?; for value_result in l.iter() { @@ -489,8 +489,8 @@ impl WriteAsIon for ValueRef<'_, D> { Timestamp(t) => 
value_writer.write_timestamp(t), Symbol(s) => value_writer.write_symbol(s), String(s) => value_writer.write_string(s.text()), - Clob(c) => value_writer.write_clob(c.as_ref()), - Blob(b) => value_writer.write_blob(b.as_ref()), + Clob(c) => value_writer.write_clob(c.data()), + Blob(b) => value_writer.write_blob(b.data()), List(l) => value_writer.write(l), SExp(s) => value_writer.write(s), Struct(s) => value_writer.write(s), diff --git a/src/lazy/encoding.rs b/src/lazy/encoding.rs index e61a5160..786c8fa1 100644 --- a/src/lazy/encoding.rs +++ b/src/lazy/encoding.rs @@ -1,13 +1,11 @@ #![allow(non_camel_case_types)] -use std::fmt::Debug; -use std::io; - use crate::lazy::any_encoding::{IonEncoding, IonVersion, LazyRawAnyValue}; use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; use crate::lazy::binary::raw::r#struct::{LazyRawBinaryFieldName_1_0, LazyRawBinaryStruct_1_0}; use crate::lazy::binary::raw::reader::LazyRawBinaryReader_1_0; use crate::lazy::binary::raw::sequence::{LazyRawBinaryList_1_0, LazyRawBinarySExp_1_0}; +use crate::lazy::binary::raw::v1_1::e_expression::BinaryEExpression_1_1; use crate::lazy::binary::raw::v1_1::r#struct::LazyRawBinaryFieldName_1_1; use crate::lazy::binary::raw::v1_1::reader::LazyRawBinaryReader_1_1; use crate::lazy::binary::raw::v1_1::value::LazyRawBinaryVersionMarker_1_1; @@ -18,27 +16,34 @@ use crate::lazy::binary::raw::v1_1::{ RawBinaryAnnotationsIterator_1_1, }; use crate::lazy::binary::raw::value::{LazyRawBinaryValue_1_0, LazyRawBinaryVersionMarker_1_0}; -use crate::lazy::decoder::Decoder; +use crate::lazy::decoder::{Decoder, LazyRawValueExpr, RawValueExpr}; use crate::lazy::encoder::write_as_ion::WriteAsIon; use crate::lazy::encoder::Encoder; use crate::lazy::never::Never; -use crate::lazy::text::raw::r#struct::{LazyRawTextFieldName_1_0, LazyRawTextStruct_1_0}; +use crate::lazy::text::buffer::{whitespace_and_then, IonParser, TextBuffer}; +use crate::lazy::text::encoded_value::EncodedTextValue; +use 
crate::lazy::text::matched::MatchedValue; +use crate::lazy::text::parse_result::fatal_parse_error; +use crate::lazy::text::raw::r#struct::{LazyRawTextFieldName, RawTextStructIterator}; use crate::lazy::text::raw::reader::LazyRawTextReader_1_0; -use crate::lazy::text::raw::sequence::{LazyRawTextList_1_0, LazyRawTextSExp_1_0}; +use crate::lazy::text::raw::sequence::{ + RawTextList, RawTextListIterator, RawTextSExp, RawTextSExpIterator, +}; use crate::lazy::text::raw::v1_1::reader::{ - LazyRawTextFieldName_1_1, LazyRawTextList_1_1, LazyRawTextReader_1_1, LazyRawTextSExp_1_1, - LazyRawTextStruct_1_1, TextEExpression_1_1, + LazyRawTextReader_1_1, LazyRawTextStruct, TextEExpression_1_1, }; use crate::lazy::text::value::{ LazyRawTextValue, LazyRawTextValue_1_0, LazyRawTextValue_1_1, LazyRawTextVersionMarker_1_0, LazyRawTextVersionMarker_1_1, RawTextAnnotationsIterator, }; - -use crate::lazy::binary::raw::v1_1::e_expression::BinaryEExpression_1_1; use crate::{ - AnnotationsEncoding, ContainerEncoding, FieldNameEncoding, IonResult, SymbolValueEncoding, - TextFormat, ValueWriterConfig, WriteConfig, + AnnotationsEncoding, ContainerEncoding, FieldNameEncoding, HasRange, IonError, IonResult, + LazyRawFieldExpr, SymbolValueEncoding, TextFormat, ValueWriterConfig, WriteConfig, }; +use std::fmt::Debug; +use std::io; +use winnow::combinator::{alt, opt, separated_pair}; +use winnow::Parser; /// Marker trait for types that represent an Ion encoding. pub trait Encoding: Encoder + Decoder { @@ -247,10 +252,164 @@ pub trait TextEncoding<'top>: Value<'top> = LazyRawTextValue<'top, Self>, > { - // No methods, just a marker + fn new_value( + input: TextBuffer<'top>, + encoded_text_value: EncodedTextValue<'top, Self>, + ) -> Self::Value<'top>; + + /// Matches an expression that appears in value position. + fn value_expr_matcher() -> impl IonParser<'top, LazyRawValueExpr<'top, Self>>; + + /// Matches an expression that appears in struct field position. Does NOT match trailing commas. 
+ fn field_expr_matcher() -> impl IonParser<'top, LazyRawFieldExpr<'top, Self>>; + + fn list_matcher() -> impl IonParser<'top, EncodedTextValue<'top, Self>> { + let make_iter = |buffer: TextBuffer<'top>| RawTextListIterator::::new(buffer); + let end_matcher = (whitespace_and_then(opt(",")), whitespace_and_then("]")).take(); + Self::container_matcher("a list", "[", make_iter, end_matcher) + .map(|nested_expr_cache| EncodedTextValue::new(MatchedValue::List(nested_expr_cache))) + } + + fn sexp_matcher() -> impl IonParser<'top, EncodedTextValue<'top, Self>> { + let make_iter = |buffer: TextBuffer<'top>| RawTextSExpIterator::::new(buffer); + let end_matcher = whitespace_and_then(")"); + Self::container_matcher("an s-expression", "(", make_iter, end_matcher) + .map(|nested_expr_cache| EncodedTextValue::new(MatchedValue::SExp(nested_expr_cache))) + } + + fn struct_matcher() -> impl IonParser<'top, EncodedTextValue<'top, Self>> { + let make_iter = |buffer: TextBuffer<'top>| RawTextStructIterator::new(buffer); + let end_matcher = (whitespace_and_then(opt(",")), whitespace_and_then("}")).take(); + Self::container_matcher("a struct", "{", make_iter, end_matcher) + .map(|nested_expr_cache| EncodedTextValue::new(MatchedValue::Struct(nested_expr_cache))) + } + + /// Constructs an `IonParser` implementation using parsing logic common to all container types. + /// Caches all subexpressions in the bump allocator for future reference. + fn container_matcher( + // Text describing what is being parsed. For example: "a list". + // This message will be added to any error messages for context. + label: &'static str, + // The literal that begins the container. ("[", "(", etc.) + mut opening_token: &str, + // A closure or function that will construct an appropriate iterator to parse any child + // expressions. + mut make_iterator: MakeIterator, + // A parser that will match the expected end of the container. 
+ mut end_matcher: impl IonParser<'top, TextBuffer<'top>>, + ) -> impl IonParser<'top, &'top [Expr]> + where + Expr: HasRange + 'top, + Iter: Iterator>, + MakeIterator: FnMut(TextBuffer<'top>) -> Iter, + { + use bumpalo::collections::Vec as BumpVec; + move |input: &mut TextBuffer<'top>| { + // Make a copy of the input buffer view so the iterator has one it can consume. + let mut iterator_input = *input; + // Confirm that the input begins with the expected opening token, consuming it in the process. + let _head = opening_token.parse_next(&mut iterator_input)?; + let iterator = make_iterator(iterator_input); + // Bump-allocate a space to store any child expressions we encounter as we traverse this + // container. + let mut child_expr_cache = BumpVec::new_in(input.context().allocator()); + // Visit each child expression yielded by the parser, reporting any errors. + for expr_result in iterator { + let expr = match expr_result { + Ok(expr) => expr, + Err(IonError::Incomplete(..)) => { + return input.incomplete(label); + } + Err(e) => { + return fatal_parse_error(*input, format!("failed to parse {label}: {e:?}")) + } + }; + // If there are no errors, add the new child expr to the cache. + child_expr_cache.push(expr); + } + + // Take note of where we finished. + let last_expr_end = child_expr_cache + .last() + // If we found child expressions, we'll resume immediately after the last child expression. + .map(|expr| expr.range().end - input.offset()) + // If we didn't find child expressions, we'll resume immediately after the opening token. + .unwrap_or(opening_token.len()); + // Advance `input` to the remaining data. + *input = input.slice_to_end(last_expr_end); + // Confirm that the last expression is followed by input that `end_matcher` approves of. 
+ let _matched_end = end_matcher.parse_next(input)?; + Ok(child_expr_cache.into_bump_slice()) + } + } +} + +impl<'top> TextEncoding<'top> for TextEncoding_1_0 { + fn new_value( + input: TextBuffer<'top>, + encoded_text_value: EncodedTextValue<'top, Self>, + ) -> Self::Value<'top> { + LazyRawTextValue_1_0::new(input, encoded_text_value) + } + + fn value_expr_matcher() -> impl IonParser<'top, LazyRawValueExpr<'top, Self>> { + TextBuffer::match_annotated_value::.map(RawValueExpr::ValueLiteral) + } + + fn field_expr_matcher() -> impl IonParser<'top, LazyRawFieldExpr<'top, Self>> { + // A (name, eexp) pair + separated_pair( + whitespace_and_then(TextBuffer::match_struct_field_name), + whitespace_and_then(":"), + whitespace_and_then(TextBuffer::match_annotated_value::), + ) + .map(|(field_name, invocation)| { + LazyRawFieldExpr::NameValue(LazyRawTextFieldName::new(field_name), invocation) + }) + } +} +impl<'top> TextEncoding<'top> for TextEncoding_1_1 { + fn new_value( + input: TextBuffer<'top>, + encoded_text_value: EncodedTextValue<'top, Self>, + ) -> Self::Value<'top> { + LazyRawTextValue_1_1::new(input, encoded_text_value) + } + + fn value_expr_matcher() -> impl IonParser<'top, LazyRawValueExpr<'top, Self>> { + alt(( + TextBuffer::match_e_expression.map(RawValueExpr::EExp), + TextBuffer::match_annotated_value::.map(RawValueExpr::ValueLiteral), + )) + } + + fn field_expr_matcher() -> impl IonParser<'top, LazyRawFieldExpr<'top, Self>> { + alt(( + // A (name, eexp) pair. Check for this first to prevent `(:` from being considered + // the beginning of an s-expression. 
+ separated_pair( + whitespace_and_then(TextBuffer::match_struct_field_name), + whitespace_and_then(":"), + whitespace_and_then(TextBuffer::match_e_expression), + ) + .map(|(field_name, invocation)| { + LazyRawFieldExpr::NameEExp(LazyRawTextFieldName::new(field_name), invocation) + }), + // A (name, value) pair + separated_pair( + whitespace_and_then(TextBuffer::match_struct_field_name), + whitespace_and_then(":"), + whitespace_and_then(TextBuffer::match_annotated_value::), + ) + .map(move |(field_name, value)| { + let field_name = LazyRawTextFieldName::new(field_name); + LazyRawFieldExpr::NameValue(field_name, value) + }), + // An e-expression + TextBuffer::match_e_expression.map(LazyRawFieldExpr::EExp), + )) + } } -impl TextEncoding<'_> for TextEncoding_1_0 {} -impl TextEncoding<'_> for TextEncoding_1_1 {} /// Marker trait for encodings that support macros. pub trait EncodingWithMacroSupport {} @@ -274,10 +433,10 @@ impl Decoder for TextEncoding_1_0 { const INITIAL_ENCODING_EXPECTED: IonEncoding = IonEncoding::Text_1_0; type Reader<'data> = LazyRawTextReader_1_0<'data>; type Value<'top> = LazyRawTextValue_1_0<'top>; - type SExp<'top> = LazyRawTextSExp_1_0<'top>; - type List<'top> = LazyRawTextList_1_0<'top>; - type Struct<'top> = LazyRawTextStruct_1_0<'top>; - type FieldName<'top> = LazyRawTextFieldName_1_0<'top>; + type SExp<'top> = RawTextSExp<'top, Self>; + type List<'top> = RawTextList<'top, Self>; + type Struct<'top> = LazyRawTextStruct<'top, Self>; + type FieldName<'top> = LazyRawTextFieldName<'top, Self>; type AnnotationsIterator<'top> = RawTextAnnotationsIterator<'top>; // Macros are not supported in Ion 1.0 type EExp<'top> = Never; @@ -288,10 +447,10 @@ impl Decoder for TextEncoding_1_1 { const INITIAL_ENCODING_EXPECTED: IonEncoding = IonEncoding::Text_1_1; type Reader<'data> = LazyRawTextReader_1_1<'data>; type Value<'top> = LazyRawTextValue_1_1<'top>; - type SExp<'top> = LazyRawTextSExp_1_1<'top>; - type List<'top> = LazyRawTextList_1_1<'top>; - type 
Struct<'top> = LazyRawTextStruct_1_1<'top>; - type FieldName<'top> = LazyRawTextFieldName_1_1<'top>; + type SExp<'top> = RawTextSExp<'top, Self>; + type List<'top> = RawTextList<'top, Self>; + type Struct<'top> = LazyRawTextStruct<'top, Self>; + type FieldName<'top> = LazyRawTextFieldName<'top, Self>; type AnnotationsIterator<'top> = RawTextAnnotationsIterator<'top>; type EExp<'top> = TextEExpression_1_1<'top>; type VersionMarker<'top> = LazyRawTextVersionMarker_1_1<'top>; diff --git a/src/lazy/expanded/struct.rs b/src/lazy/expanded/struct.rs index e20320ca..9ae10aca 100644 --- a/src/lazy/expanded/struct.rs +++ b/src/lazy/expanded/struct.rs @@ -1,5 +1,5 @@ use crate::element::iterators::SymbolsIterator; -use crate::lazy::decoder::private::{LazyRawStructPrivate, RawStructFieldExprIterator}; +use crate::lazy::decoder::private::RawStructFieldExprIterator; use crate::lazy::decoder::{Decoder, LazyRawFieldName, LazyRawStruct}; use crate::lazy::expanded::macro_evaluator::{ MacroEvaluator, MacroExpr, MacroExprArgsIterator, ValueExpr, @@ -269,10 +269,10 @@ impl<'top, D: Decoder> LazyExpandedStruct<'top, D> { .alloc_with(|| MacroEvaluator::new()); use ExpandedStructSource::*; let source = match &self.source { - ValueLiteral(raw_struct) => ExpandedStructIteratorSource::ValueLiteral( - evaluator, - raw_struct.field_exprs(self.context), - ), + ValueLiteral(raw_struct) => { + let field_exprs = RawStructFieldExprIterator::new(self.context, raw_struct.iter()); + ExpandedStructIteratorSource::ValueLiteral(evaluator, field_exprs) + } Template(environment, element, _index) => { evaluator.set_root_environment(*environment); let template = element.template(); diff --git a/src/lazy/expanded/template.rs b/src/lazy/expanded/template.rs index fd7dc4f2..a3585347 100644 --- a/src/lazy/expanded/template.rs +++ b/src/lazy/expanded/template.rs @@ -1129,7 +1129,7 @@ impl Debug for TemplateExprGroup<'_, D> { } } -/// A resolved version of [`TemplateBodyMacroInvocation`]; instead of holding 
addresses, this type +/// A resolved version of `TemplateBodyMacroInvocation`; instead of holding addresses, this type /// holds references to the invoked macro and its argument expressions. #[derive(Copy, Clone)] pub struct TemplateMacroInvocation<'top, D: Decoder> { diff --git a/src/lazy/raw_value_ref.rs b/src/lazy/raw_value_ref.rs index b2c27f1f..4f4b9cfb 100644 --- a/src/lazy/raw_value_ref.rs +++ b/src/lazy/raw_value_ref.rs @@ -217,7 +217,7 @@ impl<'top, D: Decoder> RawValueRef<'top, D> { mod tests { use crate::lazy::binary::raw::reader::LazyRawBinaryReader_1_0 as LazyRawBinaryReader; use crate::lazy::binary::test_utilities::to_binary_ion; - use crate::{Decimal, IonResult, IonType, RawSymbolRef, Timestamp}; + use crate::{Decimal, EncodingContext, IonResult, IonType, RawSymbolRef, Timestamp}; #[test] fn expect_type() -> IonResult<()> { @@ -238,7 +238,8 @@ mod tests { {this: is, a: struct} "#, )?; - let mut reader = LazyRawBinaryReader::new(&ion_data); + let context = EncodingContext::empty(); + let mut reader = LazyRawBinaryReader::new(context.get_ref(), &ion_data); // IVM reader.next()?.expect_ivm()?; // Symbol table @@ -305,7 +306,8 @@ mod tests { null.bool "#, )?; - let mut reader = LazyRawBinaryReader::new(&ion_data); + let context = EncodingContext::empty(); + let mut reader = LazyRawBinaryReader::new(context.get_ref(), &ion_data); // IVM reader.next()?.expect_ivm()?; diff --git a/src/lazy/streaming_raw_reader.rs b/src/lazy/streaming_raw_reader.rs index 4939e42a..9233d8d7 100644 --- a/src/lazy/streaming_raw_reader.rs +++ b/src/lazy/streaming_raw_reader.rs @@ -48,14 +48,16 @@ const DEFAULT_IO_BUFFER_SIZE: usize = 4 * 1024; pub struct RawReaderState<'a> { data: &'a [u8], offset: usize, + is_final_data: bool, encoding: IonEncoding, } impl<'a> RawReaderState<'a> { - pub fn new(data: &'a [u8], offset: usize, encoding: IonEncoding) -> Self { + pub fn new(data: &'a [u8], offset: usize, is_final_data: bool, encoding: IonEncoding) -> Self { Self { data, offset, + 
is_final_data, encoding, } } @@ -64,6 +66,10 @@ impl<'a> RawReaderState<'a> { self.data } + pub fn is_final_data(&self) -> bool { + self.is_final_data + } + pub fn offset(&self) -> usize { self.offset } @@ -71,6 +77,10 @@ impl<'a> RawReaderState<'a> { pub fn encoding(&self) -> IonEncoding { self.encoding } + + pub(crate) fn set_encoding(&mut self, encoding: IonEncoding) { + self.encoding = encoding; + } } impl StreamingRawReader { @@ -87,21 +97,13 @@ impl StreamingRawReader { /// Gets a reference to the data source and tries to fill its buffer. #[inline] fn pull_more_data_from_source(&mut self) -> IonResult { - // SAFETY: `self.input` is an `UnsafeCell`, which prevents the borrow - // checker from governing its contents. Because this method has a mutable reference - // to `self`, it is safe to modify `self`'s contents. - let input = unsafe { &mut *self.input.get() }; - input.fill_buffer() + self.input.get_mut().fill_buffer() } /// Returns true if the input buffer is empty. #[inline] - fn buffer_is_empty(&self) -> bool { - // SAFETY: `self.input` is an `UnsafeCell`, which prevents the borrow - // checker from governing its contents. Because this method has an immutable reference - // to `self`, it is safe to read `self`'s contents. - let input = unsafe { &*self.input.get() }; - input.buffer().is_empty() + fn buffer_is_empty(&mut self) -> bool { + self.input.get_mut().buffer().is_empty() } pub fn next<'top>( @@ -123,7 +125,9 @@ impl StreamingRawReader { context: EncodingContextRef<'top>, is_peek: bool, ) -> IonResult> { - let mut input_source_exhausted = false; + // If the input is a stream, we assume there may be more data available. + // If it's a fixed slice, we know it's already complete. + let mut input_source_exhausted = !Input::DataSource::IS_STREAMING; loop { // If the input buffer is empty, try to pull more data from the source before proceeding. 
// It's important that we do this _before_ reading from the buffer; any item returned @@ -133,42 +137,50 @@ impl StreamingRawReader { self.pull_more_data_from_source()?; } + // We're going to try to read a lazy value from the available input. If we + // succeed, we'll return it. If the data is incomplete, we'll return to the top + // of the loop. Conditionally returning a value in a loop is the borrow checker's + // Achilles' heel (see comment on the `StreamingRawReader` type), so we use an + // unsafe access to get a reference to the available bytes. + // + // SAFETY: If `self.input` needs to be refilled later on, `available_bytes` MUST NOT be + // read from in the same loop iteration afterward, since it may refer to a buffer + // that has been dropped. let available_bytes = unsafe { &*self.input.get() }.buffer(); - let unsafe_cell_reader = UnsafeCell::new( as LazyRawReader< - 'top, - Encoding, - >>::resume_at_offset( + let state = RawReaderState::new( available_bytes, self.stream_position, + input_source_exhausted, self.encoding(), - )); - let slice_reader = unsafe { &mut *unsafe_cell_reader.get() }; + ); + + // Construct a new raw reader picking up from where the StreamingRawReader left off. + let mut slice_reader = + as LazyRawReader<'top, Encoding>>::resume(context, state); let starting_position = slice_reader.position(); let old_encoding = slice_reader.encoding(); - let result = slice_reader.next(context); - // We're done modifying `slice_reader`, but we need to read some of its fields. These - // fields are _not_ the data to which `result` holds a reference. We have to circumvent - // the borrow checker's limitation (described in a comment on the StreamingRawReader type) - // by getting a second (read-only) reference to the reader. 
- let slice_reader_ref = unsafe { &*unsafe_cell_reader.get() }; - let new_encoding = slice_reader_ref.encoding(); - let end_position = slice_reader_ref.position(); + + let result = slice_reader.next(); + + let new_encoding = slice_reader.encoding(); + let end_position = slice_reader.position(); let bytes_read = end_position - starting_position; - let input = unsafe { &mut *self.input.get() }; + // If we ran out of data before we could get a result... if matches!( result, Err(IonError::Incomplete(_)) | Ok(LazyRawStreamItem::::EndOfStream(_)) ) { - // ...try to pull more data from the data source. It's ok to modify the buffer in - // this case because `result` (which holds a reference to the buffer) will be - // discarded. - if input.fill_buffer()? > 0 { - // If we get more data, try again. + if input_source_exhausted { + // There's no more data, so the result is final. + } else { + // ...more data may be available, so try to pull from the data source. + if self.pull_more_data_from_source()? == 0 { + input_source_exhausted = true; + } continue; } - // If there's nothing available, return the result we got. } else if let Ok(ref item) = result { // We have successfully read something from the buffer. // @@ -208,11 +220,8 @@ impl StreamingRawReader { // stream are all cases where the reader looking at a fixed slice of the // buffer may reach the wrong conclusion. _ => { - // Try to pull more data from the input source. This invalidates the `result` - // variable because `fill_buffer()` may cause the buffer to be reallocated, - // so we start this iteration over. This results in the last value being parsed - // a second time from the (potentially updated) buffer. - if input.fill_buffer()? == 0 { + // Try to pull more data from the input source. + if self.pull_more_data_from_source()? == 0 { input_source_exhausted = true; } continue; @@ -223,7 +232,7 @@ impl StreamingRawReader { // If this isn't just a peek, update our state to remember what we've already read. 
if !is_peek { // Mark those input bytes as having been consumed so they are not read again. - input.consume(bytes_read); + self.input.get_mut().consume(bytes_read); // Update the streaming reader's position to reflect the number of bytes we // just read. self.stream_position = end_position; @@ -244,6 +253,9 @@ impl StreamingRawReader { /// An input source--typically an implementation of either `AsRef<[u8]>` or `io::Read`--from which /// Ion can be read, paying the cost of buffering and I/O copies only when necessary. pub trait IonDataSource { + /// If `true`, the current contents of the buffer may not be the complete stream. + const IS_STREAMING: bool; + /// Returns a slice of all unread bytes that are currently available in the buffer. fn buffer(&self) -> &[u8]; @@ -287,6 +299,8 @@ impl> IonSlice { } impl> IonDataSource for IonSlice { + const IS_STREAMING: bool = false; + #[inline] fn buffer(&self) -> &[u8] { // Return the input slice containing all of the as-of-yet unread bytes. @@ -356,6 +370,8 @@ impl IonStream { } impl IonDataSource for IonStream { + const IS_STREAMING: bool = true; + fn buffer(&self) -> &[u8] { &self.buffer[self.position..self.limit] } diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index ee02ca46..2ba7a3e7 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -1,27 +1,20 @@ use std::fmt::{Debug, Formatter}; -use std::iter::{Copied, Enumerate}; -use std::ops::{Range, RangeFrom, RangeTo}; -use std::slice::Iter; +use std::ops::Range; use std::str::FromStr; -use nom::branch::alt; -use nom::bytes::complete::{ - is_a as complete_is_a, is_not as complete_is_not, tag as complete_tag, - take_while as complete_take_while, +use winnow::ascii::alphanumeric1; +use winnow::combinator::{ + alt, delimited, empty, eof, not, opt, peek, preceded, repeat, separated_pair, terminated, }; -use nom::bytes::streaming::{is_a, tag, take_until, take_while_m_n}; -use nom::character::complete::{ - char as complete_char, digit0 as 
complete_digit0, digit1 as complete_digit1, - one_of as complete_one_of, +use winnow::error::{ErrMode, Needed}; +use winnow::stream::{ + Accumulate, CompareResult, ContainsToken, FindSlice, Location, SliceLen, Stream, + StreamIsPartial, }; -use nom::character::streaming::{alphanumeric1, char, digit1, one_of, satisfy}; -use nom::combinator::{consumed, eof, map, not, opt, peek, recognize, success, value}; -use nom::error::{ErrorKind, ParseError}; -use nom::multi::{fold_many1, fold_many_m_n, many0_count, many1_count}; -use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}; -use nom::{CompareResult, IResult, InputLength, InputTake, Needed, Parser}; - -use crate::lazy::decoder::{LazyRawFieldExpr, LazyRawValueExpr, RawValueExpr}; +use winnow::token::{one_of, take_till, take_until, take_while}; +use winnow::{dispatch, Parser}; + +use crate::lazy::decoder::{LazyRawValueExpr, RawValueExpr}; use crate::lazy::encoding::{TextEncoding, TextEncoding_1_0, TextEncoding_1_1}; use crate::lazy::expanded::EncodingContextRef; use crate::lazy::raw_stream_item::{EndPosition, LazyRawStreamItem, RawStreamItem}; @@ -33,26 +26,35 @@ use crate::lazy::text::matched::{ }; use crate::lazy::text::parse_result::{fatal_parse_error, InvalidInputError, IonParseError}; use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; -use crate::lazy::text::raw::r#struct::{LazyRawTextFieldName_1_0, RawTextStructIterator_1_0}; -use crate::lazy::text::raw::sequence::{RawTextListIterator_1_0, RawTextSExpIterator_1_0}; use crate::lazy::text::raw::v1_1::arg_group::{EExpArg, EExpArgExpr, TextEExpArgGroup}; use crate::lazy::text::raw::v1_1::reader::{ - LazyRawTextFieldName_1_1, MacroIdRef, RawTextListIterator_1_1, RawTextSExpIterator_1_1, - RawTextStructIterator_1_1, SystemMacroAddress, TextEExpression_1_1, TextListSpanFinder_1_1, - TextSExpSpanFinder_1_1, TextStructSpanFinder_1_1, + MacroIdRef, + SystemMacroAddress, TextEExpression_1_1 }; use crate::lazy::text::value::{ 
LazyRawTextValue, LazyRawTextValue_1_0, LazyRawTextValue_1_1, LazyRawTextVersionMarker, }; use crate::result::DecodingError; use crate::{ - v1_1, Encoding, HasRange, IonError, IonResult, IonType, RawSymbolRef, TimestampPrecision, + Encoding, HasRange, IonError, IonResult, IonType, RawSymbolRef, TimestampPrecision, }; use crate::lazy::expanded::macro_table::{Macro, ION_1_1_SYSTEM_MACROS}; use crate::lazy::expanded::template::{Parameter, RestSyntaxPolicy}; use crate::lazy::text::as_utf8::AsUtf8; use bumpalo::collections::Vec as BumpVec; +use winnow::ascii::{digit0, digit1}; +use crate::lazy::text::raw::sequence::RawTextSExpIterator; + +/// Generates parser functions that map from an Ion type representation (`Decimal`, `Int`, etc) +/// to an `EncodedTextValue`. +macro_rules! scalar_value_matchers { + ($($parser:expr => $variant:ident => $new_parser:ident),*$(,)?) => { + $(fn $new_parser>(&mut self) -> IonParseResult<'top, EncodedTextValue<'top, E>> { + $parser.map(|matched| EncodedTextValue::new(MatchedValue::$variant(matched))).parse_next(self) + })* + }; +} impl Debug for TextBuffer<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { @@ -60,7 +62,7 @@ impl Debug for TextBuffer<'_> { write!(f, "TextBuffer {{")?; // Try to read the next several bytes from the buffer as UTF-8... let text_result = std::str::from_utf8(self.data); - // ...if it works, print the first 32 unicode scalars... + // ...if it works, print the first 64 Unicode scalars... if let Ok(text) = text_result { write!( f, @@ -82,18 +84,15 @@ impl Debug for TextBuffer<'_> { } /// The Ion specification's enumeration of whitespace characters. 
-const WHITESPACE_CHARACTERS: &[char] = &[ - ' ', // Space - '\t', // Tab - '\r', // Carriage return - '\n', // Newline - '\x09', // Horizontal tab - '\x0B', // Vertical tab - '\x0C', // Form feed -]; - -/// Same as [WHITESPACE_CHARACTERS], but formatted as a string for use in some `nom` APIs -pub(crate) const WHITESPACE_CHARACTERS_AS_STR: &str = " \t\r\n\x09\x0B\x0C"; +/// +/// ' ', Space +/// '\t', Tab +/// '\r', Carriage return +/// '\n', Newline +/// '\x09', Horizontal tab +/// '\x0B', Vertical tab +/// '\x0C', Form feed +pub(crate) const WHITESPACE_BYTES: &[u8] = b" \t\r\n\x09\x0B\x0C"; /// A slice of unsigned bytes that can be cheaply copied and which defines methods for parsing /// the various encoding elements of a text Ion stream. @@ -114,6 +113,7 @@ pub struct TextBuffer<'top> { data: &'top [u8], offset: usize, pub(crate) context: EncodingContextRef<'top>, + is_final_data: bool, } impl PartialEq for TextBuffer<'_> { @@ -125,8 +125,12 @@ impl PartialEq for TextBuffer<'_> { impl<'top> TextBuffer<'top> { /// Constructs a new `TextBuffer` that wraps `data`, setting the view's `offset` to zero. #[inline] - pub fn new(context: EncodingContextRef<'top>, data: &'top [u8]) -> TextBuffer<'top> { - Self::new_with_offset(context, data, 0) + pub fn new( + context: EncodingContextRef<'top>, + data: &'top [u8], + is_final_data: bool, + ) -> TextBuffer<'top> { + Self::new_with_offset(context, data, 0, is_final_data) } /// Constructs a new `TextBuffer` that wraps `data`, setting the view's `offset` to the @@ -137,27 +141,43 @@ impl<'top> TextBuffer<'top> { context: EncodingContextRef<'top>, data: &'top [u8], offset: usize, + is_final_data: bool, ) -> TextBuffer<'top> { TextBuffer { context, data, offset, + is_final_data, } } + /// Modifies the `TextBuffer` in place, discarding `num_bytes` bytes. 
+ pub fn consume(&mut self, num_bytes: usize) { + debug_assert!( + self.data.len() >= num_bytes, + "tried to consume {num_bytes} bytes, but only {} were available", + self.data.len() + ); + self.offset += num_bytes; + self.data = &self.data[num_bytes..]; + } + pub fn context(&self) -> EncodingContextRef<'top> { self.context } - pub fn local_lifespan<'a>(self) -> TextBuffer<'a> - where - 'top: 'a, - { - self.slice_to_end(0) + #[inline(never)] + pub(crate) fn incomplete<T>(&self, label: &'static str) -> IonParseResult<'top, T> { + if self.is_final_data() { + fatal_parse_error(*self, format!("ran out of data while parsing {label}")) + } else { + Err(ErrMode::Incomplete(Needed::Unknown)) + } } /// Returns a subslice of the [`TextBuffer`] that starts at `offset` and continues for - /// `length` bytes. + /// `length` bytes. The subslice is considered to be 'final' data (i.e. not a potentially + /// incomplete buffer). /// /// Note that `offset` is relative to the beginning of the buffer, not the beginning of the /// larger stream of which the buffer is a piece. @@ -165,7 +185,8 @@ impl<'top> TextBuffer<'top> { TextBuffer { data: &self.data[offset..offset + length], offset: self.offset + offset, - context: self.context, + is_final_data: true, + ..*self } } @@ -178,7 +199,7 @@ impl<'top> TextBuffer<'top> { TextBuffer { data: &self.data[offset..], offset: self.offset + offset, - context: self.context, + ..*self } } @@ -187,6 +208,13 @@ impl<'top> TextBuffer<'top> { self.data } + pub fn peek_byte(&self) -> IonParseResult<'top, u8> { + let Some(byte) = self.bytes().first().copied() else { + return self.incomplete("a value"); + }; + Ok(byte) + } + /// Returns the number of bytes between the start of the original input byte array and the /// subslice of that byte array that this `TextBuffer` represents.
pub fn offset(&self) -> usize { @@ -221,32 +249,44 @@ impl<'top> TextBuffer<'top> { }) } - pub fn match_whitespace(self) -> IonMatchResult<'top> { - complete_is_a(WHITESPACE_CHARACTERS_AS_STR)(self) - } - /// Always succeeds and consumes none of the input. Returns an empty slice of the buffer. // This method is useful for parsers that need to match an optional construct but don't want // to return an Option<_>. For an example, see its use in `match_optional_whitespace`. - fn match_nothing(self) -> IonMatchResult<'top> { - // Use nom's `success` parser to return an empty slice from the head position - success(self.slice(0, 0))(self) + #[inline] + fn match_nothing(&mut self) -> IonMatchResult<'top> { + // use winnow's `empty` parser to return an empty slice from the head position + empty.take().parse_next(self) + } + + /// Matches one or more whitespace characters. + pub fn match_whitespace1(&mut self) -> IonMatchResult<'top> { + take_while(1.., WHITESPACE_BYTES).parse_next(self) } /// Matches zero or more whitespace characters. - pub fn match_optional_whitespace(self) -> IonMatchResult<'top> { - // Either match whitespace and return what follows or just return the input as-is. - // This will always return `Ok`, but it is packaged as an IonMatchResult for compatability - // with other parsers. - alt((Self::match_whitespace, Self::match_nothing))(self) + pub fn match_whitespace0(&mut self) -> IonMatchResult<'top> { + take_while(0.., WHITESPACE_BYTES).parse_next(self) } /// Matches any amount of contiguous comments and whitespace, including none. 
- pub fn match_optional_comments_and_whitespace(self) -> IonMatchResult<'top> { - recognize(many0_count(alt(( - Self::match_whitespace, - Self::match_comment, - ))))(self) + #[inline] + pub fn match_optional_comments_and_whitespace(&mut self) -> IonMatchResult<'top> { + pub fn full_match_optional_comments_and_whitespace<'t>( + input: &mut TextBuffer<'t>, + ) -> IonMatchResult<'t> { + zero_or_more(alt(( + TextBuffer::match_whitespace1, + TextBuffer::match_comment, + ))) + .parse_next(input) + } + + if let Some(&byte) = self.bytes().first() { + if WHITESPACE_BYTES.contains_token(byte) || byte == b'/' { + return full_match_optional_comments_and_whitespace(self); + } + } + self.match_nothing() } /// Matches a single @@ -255,159 +295,136 @@ impl<'top> TextBuffer<'top> { /// /* multi /// line */ /// comment - pub fn match_comment(self) -> IonMatchResult<'top> { + pub fn match_comment(&mut self) -> IonMatchResult<'top> { alt(( Self::match_rest_of_line_comment, Self::match_multiline_comment, - ))(self) + )) + .parse_next(self) } /// Matches a single rest-of-the-line comment. - fn match_rest_of_line_comment(self) -> IonMatchResult<'top> { - preceded( - // Matches a leading "//". - // If there isn't a first '/', the input will be rejected. - // If the buffer is empty after the first '/', the input will be considered incomplete. - // If the next character in input isn't a second '/', the input will be rejected. - recognize(pair(complete_tag("/"), tag("/"))), - // ...followed by either... - alt(( - // '//' can appear at the end of the stream - peek(recognize(eof)), - // ...one or more non-EOL characters... - complete_is_not("\r\n"), - // ...or any EOL character. - peek(recognize(complete_one_of("\r\n"))), - // In either case, the line ending will not be consumed. - )), - )(self) + fn match_rest_of_line_comment(&mut self) -> IonMatchResult<'top> { + ("//", take_till(.., b"\r\n")).take().parse_next(self) } /// Matches a single multiline comment. 
- fn match_multiline_comment(self) -> IonMatchResult<'top> { - recognize(delimited( + fn match_multiline_comment(&mut self) -> IonMatchResult<'top> { + ( // Matches a leading "/*"... - complete_tag("/*"), + "/*", // ...any number of non-"*/" characters... - take_until("*/"), + take_until(.., "*/"), // ...and then a closing "*/" - complete_tag("*/"), - ))(self) + "*/", + ) + .take() + .parse_next(self) } /// Matches an Ion version marker (e.g. `$ion_1_0` or `$ion_1_1`.) pub fn match_ivm>( - self, + &mut self, ) -> IonParseResult<'top, LazyRawTextVersionMarker<'top, E>> { - let (remaining, (matched_marker, (matched_major, matched_minor))) = consumed(terminated( - preceded( - complete_tag("$ion_"), - separated_pair(complete_digit1, complete_tag("_"), complete_digit1), - ), + let ((matched_major, matched_minor), matched_marker) = terminated( + preceded("$ion_", separated_pair(digit1, "_", digit1)), // Look ahead to make sure the IVM isn't followed by a '::'. If it is, then it's not // an IVM, it's an annotation. - peek(whitespace_and_then(not(complete_tag(":")))), - ))(self)?; + peek(whitespace_and_then(not(":"))), + ) + .with_taken() + .parse_next(self)?; // `major` and `minor` are base 10 digits. Turning them into `&str`s is guaranteed to succeed. 
let major_version = u8::from_str(matched_major.as_text().unwrap()).map_err(|_| { let error = InvalidInputError::new(matched_major) .with_label("parsing an IVM major version") .with_description("value did not fit in an unsigned byte"); - nom::Err::Failure(IonParseError::Invalid(error)) + ErrMode::Cut(IonParseError::Invalid(error)) })?; let minor_version = u8::from_str(matched_minor.as_text().unwrap()).map_err(|_| { let error = InvalidInputError::new(matched_minor) .with_label("parsing an IVM minor version") .with_description("value did not fit in an unsigned byte"); - nom::Err::Failure(IonParseError::Invalid(error)) + ErrMode::Cut(IonParseError::Invalid(error)) })?; let marker = LazyRawTextVersionMarker::::new(matched_marker, major_version, minor_version); - Ok((remaining, marker)) + Ok(marker) } /// Matches one or more annotations. - pub fn match_annotations(self) -> IonMatchResult<'top> { - let (remaining, matched) = recognize(many1_count(Self::match_annotation))(self)?; - if matched.len() > u16::MAX as usize { - let error = InvalidInputError::new(matched) - .with_description("the maximum supported annotations sequence length is 65KB") - .with_label("parsing annotations"); - Err(nom::Err::Error(IonParseError::Invalid(error))) - } else { - Ok((remaining, matched)) + #[inline] + pub fn match_annotations(&mut self) -> IonMatchResult<'top> { + #[inline(never)] + fn full_match_annotations<'t>(input: &mut TextBuffer<'t>) -> IonMatchResult<'t> { + let matched = one_or_more(TextBuffer::match_annotation).parse_next(input)?; + if matched.len() > u16::MAX as usize { + let error = InvalidInputError::new(matched) + .with_description("the maximum supported annotations sequence length is 65KB") + .with_label("parsing annotations"); + Err(ErrMode::Cut(IonParseError::Invalid(error))) + } else { + Ok(matched) + } } + + if let Some(&byte) = self.bytes().first() { + if [b'\'', b'$', b'_'].contains(&byte) || byte.is_ascii_alphabetic() { + return full_match_annotations(self); + } + }; 
+ self.match_nothing() } /// Matches an annotation (symbol token) and a terminating '::'. - pub fn match_annotation(self) -> IonParseResult<'top, (MatchedSymbol, Range)> { + pub fn match_annotation(&mut self) -> IonParseResult<'top, (MatchedSymbol, TextBuffer<'top>)> { terminated( - whitespace_and_then(match_and_span(Self::match_symbol)), - whitespace_and_then(terminated( - // The `complete_tag`/`tag` pair below allows the parser to recognize that: - // - // foo::bar::baz: - // - // is incomplete while: - // - // foo::bar::baz - // - // is a symbol with two annotations. - pair(complete_tag(":"), tag(":")), - Self::match_optional_comments_and_whitespace, - )), - )(self) - } - - /// Matches an optional annotations sequence and a value, including operators. - pub fn match_sexp_value(self) -> IonParseResult<'top, Option>> { - whitespace_and_then(alt(( - value(None, tag(")")), - pair( - opt(Self::match_annotations), - // We need the s-expression parser to recognize the input `--3` as the operator `--` and the - // int `3` while recognizing the input `-3` as the int `-3`. If `match_operator` runs before - // `match_value`, it will consume the sign (`-`) of negative number values, treating - // `-3` as an operator (`-`) and an int (`3`). Thus, we run `match_value` first. 
- whitespace_and_then(alt((Self::match_value, Self::match_operator))), - ) - .map(|(maybe_annotations, value)| self.apply_annotations(maybe_annotations, value)) - .map(Some), - ))) - .parse(self) + whitespace_and_then(Self::match_symbol.with_taken()), + whitespace_and_then(("::", Self::match_optional_comments_and_whitespace)), + ) + .parse_next(self) } /// Matches either: /// * A macro invocation /// * An optional annotations sequence and a value - pub fn match_sexp_value_1_1( - self, + pub fn match_sexp_item_1_1( + &mut self, ) -> IonParseResult<'top, Option>> { - whitespace_and_then(alt(( + let input = *self; + let result = whitespace_and_then(alt(( Self::match_e_expression.map(|matched| Some(RawValueExpr::EExp(matched))), - value(None, peek(tag(")"))), - pair( + peek(")").value(None), + ( opt(Self::match_annotations), // We need the s-expression parser to recognize the input `--3` as the operator `--` and the // int `3` while recognizing the input `-3` as the int `-3`. If `match_operator` runs before // `match_value`, it will consume the sign (`-`) of negative number values, treating // `-3` as an operator (`-`) and an int (`3`). Thus, we run `match_value` first. 
- whitespace_and_then(alt((Self::match_value_1_1, Self::match_operator))), + whitespace_and_then(alt((Self::match_value::, Self::match_operator))), ) - .map(|(maybe_annotations, value)| self.apply_annotations(maybe_annotations, value)) - .map(RawValueExpr::ValueLiteral) - .map(Some), + .map(|(maybe_annotations, value)| input.apply_annotations(maybe_annotations, value)) + .map(RawValueExpr::ValueLiteral) + .map(Some), ))) - .parse(self) + .parse_next(self); + result } - fn apply_annotations>( - self, + #[inline] + pub(crate) fn apply_annotations>( + &self, maybe_annotations: Option>, mut value: LazyRawTextValue<'top, E>, ) -> LazyRawTextValue<'top, E> { - if let Some(annotations) = maybe_annotations { + // This is a separately defined function so the common case (no annotations) is more readily + // inlined. + fn full_apply_annotations<'t, T: TextEncoding<'t>>( + input: &TextBuffer<'t>, + annotations: &TextBuffer<'t>, + value: &mut LazyRawTextValue<'t, T>, + ) { let annotations_length = u16::try_from(annotations.len()).expect("already length checked"); // Update the encoded value's record of how many bytes of annotations precede the data. @@ -416,198 +433,27 @@ impl<'top> TextBuffer<'top> { .with_annotations_sequence(annotations_length); let unannotated_value_length = value.input.len(); // Rewind the value's input to include the annotations sequence. - value.input = self.slice( - annotations.offset() - self.offset(), + value.input = input.slice( + annotations.offset() - input.offset(), annotations_length as usize + unannotated_value_length, ); } - value - } - - /// Matches a struct field name/value pair. - /// - /// If a pair is found, returns `Some(field)` and consumes the following comma if present. - /// If no pair is found (that is: the end of the struct is next), returns `None`. 
- pub fn match_struct_field( - self, - ) -> IonParseResult<'top, Option>> { - // A struct field can have leading whitespace, but we want the buffer slice that we match - // to begin with the field name. Here we skip any whitespace so we have another named - // slice (`input_including_field_name`) with that property. - let (input_including_field_name, _ws) = self.match_optional_comments_and_whitespace()?; - alt(( - // If the next thing in the input is a `}`, return `None`. - value(None, Self::match_struct_end), - // Otherwise, match a name/value pair and turn it into a `LazyRawTextField`. - Self::match_struct_field_name_and_value.map(move |(matched_field_name, value)| { - let field_name = LazyRawTextFieldName_1_0::new(matched_field_name); - Some(LazyRawFieldExpr::<'top, TextEncoding_1_0>::NameValue( - field_name, value, - )) - }), - ))(input_including_field_name) - } - - /// Matches any amount of whitespace followed by a closing `}`. - fn match_struct_end(self) -> IonMatchResult<'top> { - whitespace_and_then(peek(tag("}"))).parse(self) - } - - /// Matches a field name/value pair. Returns the syntax used for the field name, the range of - /// input bytes where the field name is found, and the value. - pub fn match_struct_field_name_and_value( - self, - ) -> IonParseResult<'top, (MatchedFieldName<'top>, LazyRawTextValue_1_0<'top>)> { - terminated( - separated_pair( - whitespace_and_then(Self::match_struct_field_name), - whitespace_and_then(tag(":")), - whitespace_and_then(Self::match_annotated_value), - ), - whitespace_and_then(alt((tag(","), peek(tag("}"))))), - )(self) - } - - /// Matches a struct field (name, value expression) pair. - /// - /// If a pair is found, returns `Some(field)` and consumes the following comma if present. - /// If no pair is found (that is: the end of the struct is next), returns `None`. 
- pub fn match_struct_field_1_1( - self, - ) -> IonParseResult<'top, Option>> { - // A struct field can have leading whitespace, but we want the buffer slice that we match - // to begin with the field name. Here we skip any whitespace so we have another named - // slice (`input_including_field_name`) with that property. - let (input_including_field_name, _ws) = self.match_optional_comments_and_whitespace()?; - let (input_after_field, field_expr_result) = alt(( - // If the next thing in the input is a `}`, return `None`. - Self::match_struct_end.map(|_| Ok(None)), - terminated( - Self::match_e_expression.map(|eexp| Ok(Some(LazyRawFieldExpr::EExp(eexp)))), - whitespace_and_then(alt((tag(","), peek(tag("}"))))), - ), - Self::match_struct_field_name_and_e_expression_1_1.map(|(field_name, invocation)| { - Ok(Some(LazyRawFieldExpr::NameEExp( - LazyRawTextFieldName_1_1::new(field_name), - invocation, - ))) - }), - // Otherwise, match a name/value pair and turn it into a `LazyRawTextField`. - Self::match_struct_field_name_and_value_1_1.map(move |(field_name, value)| { - let field_name = LazyRawTextFieldName_1_1::new(field_name); - Ok(Some(LazyRawFieldExpr::NameValue(field_name, value))) - }), - ))(input_including_field_name)?; - Ok((input_after_field, field_expr_result?)) - } - /// Matches a field (name, value expression) pair, where the value expression may be either - /// an annotated value or an e-expression. Returns the syntax used for the field name, the - /// range of input bytes where the field name is found, and the value. 
- pub fn match_struct_field_name_and_e_expression_1_1( - self, - ) -> IonParseResult<'top, (MatchedFieldName<'top>, TextEExpression_1_1<'top>)> { - terminated( - separated_pair( - whitespace_and_then(Self::match_struct_field_name), - whitespace_and_then(tag(":")), - whitespace_and_then(Self::match_e_expression), - ), - whitespace_and_then(alt((tag(","), peek(tag("}"))))), - )(self) - } - - /// Matches a field (name, value expression) pair, where the value expression may be either - /// an annotated value or an e-expression. Returns the syntax used for the field name, the - /// range of input bytes where the field name is found, and the value. - pub fn match_struct_field_name_and_value_1_1( - self, - ) -> IonParseResult<'top, (MatchedFieldName<'top>, LazyRawTextValue_1_1<'top>)> { - terminated( - separated_pair( - whitespace_and_then(Self::match_struct_field_name), - whitespace_and_then(tag(":")), - whitespace_and_then(alt(( - Self::match_annotated_long_string_in_struct, - Self::match_annotated_value_1_1, - ))), - ), - whitespace_and_then(alt((tag(","), peek(tag("}"))))), - )(self) + if let Some(annotations) = maybe_annotations { + full_apply_annotations(self, &annotations, &mut value); + } + value } /// Matches an optional annotation sequence and a trailing value. - pub fn match_annotated_value(self) -> IonParseResult<'top, LazyRawTextValue_1_0<'top>> { - pair( - opt(Self::match_annotations), - whitespace_and_then(Self::match_value), - ) - .map(|(maybe_annotations, value)| self.apply_annotations(maybe_annotations, value)) - .parse(self) - } - - /// Matches an optional annotation sequence and a trailing v1.1 value. 
- pub fn match_annotated_value_1_1(self) -> IonParseResult<'top, LazyRawTextValue_1_1<'top>> { - pair( + pub fn match_annotated_value>(&mut self) -> IonParseResult<'top, E::Value<'top>> { + let input = *self; + ( opt(Self::match_annotations), - whitespace_and_then(Self::match_value_1_1), + whitespace_and_then(Self::match_value::), ) - .map(|(maybe_annotations, value)| self.apply_annotations(maybe_annotations, value)) - .parse(self) - } - - /// Constructs a parser that reads an optional annotations sequence and a value read using the provided - /// `value_parser`. The constructed parser returns a `LazyRawTextValue_1_1`. - fn match_annotated_value_parser( - value_parser: impl Parser, IonParseError<'top>>, - ) -> impl Parser, IonParseError<'top>> { - consumed(pair( - opt(Self::match_annotations), - whitespace_and_then(value_parser), - )) - .map(|(matched_input, (maybe_annotations, encoded_value))| { - let value = LazyRawTextValue_1_1 { - encoded_value, - input: matched_input, - }; - matched_input.apply_annotations(maybe_annotations, value) - }) - } - - /// In the context of a list, long-form strings need to be parsed differently to properly detect incomplete - /// input. For example, at the top level... - /// ```ion - /// // string empty symbol - /// '''foo''' '' - /// ``` - /// - /// But in the context of a list... - /// - /// ```ion - /// [ // v--- Incomplete - /// '''foo''' '' - /// ``` - /// - /// the same partial value is an `Incomplete` because it must be followed by a `,` or `]` to be - /// complete. - pub fn match_annotated_long_string_in_list( - self, - ) -> IonParseResult<'top, LazyRawTextValue_1_1<'top>> { - Self::match_annotated_value_parser( - Self::match_long_string_in_list.map(|s| EncodedTextValue::new(MatchedValue::String(s))), - ) - .parse(self) - } - - /// Like `match_annotated_long_string_in_list` above, but for structs. 
- pub fn match_annotated_long_string_in_struct( - self, - ) -> IonParseResult<'top, LazyRawTextValue_1_1<'top>> { - Self::match_annotated_value_parser( - Self::match_long_string_in_struct - .map(|s| EncodedTextValue::new(MatchedValue::String(s))), - ) - .parse(self) + .map(|(maybe_annotations, value)| input.apply_annotations(maybe_annotations, value)) + .parse_next(self) } /// Matches a struct field name. That is: @@ -615,551 +461,158 @@ impl<'top> TextBuffer<'top> { /// * An identifier /// * A symbol ID /// * A short-form string - pub fn match_struct_field_name(self) -> IonParseResult<'top, MatchedFieldName<'top>> { - // When truncated, field names can end up looking like keywords. If the buffer contains - // a keyword and then ends, that's incomplete input. We do this check ahead of regular - // parsing because `match_symbol` will reject keywords as invalid (not incomplete). - if terminated(Self::match_keyword, eof)(self).is_ok() { - return Err(nom::Err::Incomplete(Needed::Unknown)); - } - consumed(alt(( + pub fn match_struct_field_name(&mut self) -> IonParseResult<'top, MatchedFieldName<'top>> { + alt(( Self::match_string.map(MatchedFieldNameSyntax::String), Self::match_symbol.map(MatchedFieldNameSyntax::Symbol), - ))) - .map(|(matched_input, syntax)| MatchedFieldName::new(matched_input, syntax)) - .parse(self) + )) + .with_taken() + .map( + #[inline] + |(syntax, matched_input)| MatchedFieldName::new(matched_input, syntax), + ) + .parse_next(self) } /// Matches a single top-level value, an IVM, or the end of the stream. pub fn match_top_level_item_1_0( - self, + &mut self, ) -> IonParseResult<'top, LazyRawStreamItem<'top, TextEncoding_1_0>> { // If only whitespace/comments remain, we're at the end of the stream. 
- let (input_after_ws, _ws) = self.match_optional_comments_and_whitespace()?; - if input_after_ws.is_empty() { - return Ok(( - input_after_ws, - RawStreamItem::EndOfStream(EndPosition::new( - TextEncoding_1_0.encoding(), - input_after_ws.offset(), - )), - )); + let _discarded_ws = self.match_optional_comments_and_whitespace()?; + if self.is_empty() { + return Ok(RawStreamItem::EndOfStream(EndPosition::new( + TextEncoding_1_0.encoding(), + self.offset(), + ))); } // Otherwise, the next item must be an IVM or a value. // We check for IVMs first because the rules for a symbol identifier will match them. alt(( Self::match_ivm::.map(RawStreamItem::VersionMarker), - Self::match_annotated_value + Self::match_annotated_value:: .map(LazyRawTextValue_1_0::from) .map(RawStreamItem::Value), - ))(input_after_ws) + )) + .parse_next(self) } /// Matches a single top-level value, e-expression (macro invocation), IVM, or the end of /// the stream. pub fn match_top_level_item_1_1( - self, + &mut self, ) -> IonParseResult<'top, LazyRawStreamItem<'top, TextEncoding_1_1>> { // If only whitespace/comments remain, we're at the end of the stream. - let (input_after_ws, _ws) = self.match_optional_comments_and_whitespace()?; - if input_after_ws.is_empty() { - return Ok(( - input_after_ws, - RawStreamItem::EndOfStream(EndPosition::new( - TextEncoding_1_1.encoding(), - input_after_ws.offset(), - )), - )); + let _discarded_whitespace = self.match_optional_comments_and_whitespace()?; + if self.is_empty() { + return Ok(RawStreamItem::EndOfStream(EndPosition::new( + TextEncoding_1_1.encoding(), + self.offset(), + ))); } // Otherwise, the next item must be an IVM or a value. // We check for IVMs first because the rules for a symbol identifier will match them. 
alt(( Self::match_ivm::.map(RawStreamItem::VersionMarker), Self::match_e_expression.map(RawStreamItem::EExp), - Self::match_annotated_value_1_1 + Self::match_annotated_value:: .map(LazyRawTextValue_1_1::from) .map(RawStreamItem::Value), - ))(input_after_ws) - } - - /// Matches a single scalar value or the beginning of a container. - pub fn match_value(self) -> IonParseResult<'top, LazyRawTextValue_1_0<'top>> { - consumed(alt(( - // For `null` and `bool`, we use `read_` instead of `match_` because there's no additional - // parsing to be done. - map(Self::match_null, |ion_type| { - EncodedTextValue::new(MatchedValue::Null(ion_type)) - }), - map(Self::match_bool, |value| { - EncodedTextValue::new(MatchedValue::Bool(value)) - }), - // For `int` and the other types, we use `match` and store the partially-processed input in the - // `matched_value` field of the `EncodedTextValue` we return. - map(Self::match_int, |matched_int| { - EncodedTextValue::new(MatchedValue::Int(matched_int)) - }), - map(Self::match_float, |matched_float| { - EncodedTextValue::new(MatchedValue::Float(matched_float)) - }), - map(Self::match_decimal, |matched_decimal| { - EncodedTextValue::new(MatchedValue::Decimal(matched_decimal)) - }), - map(Self::match_timestamp, |matched_timestamp| { - EncodedTextValue::new(MatchedValue::Timestamp(matched_timestamp)) - }), - map(Self::match_string, |matched_string| { - EncodedTextValue::new(MatchedValue::String(matched_string)) - }), - map(Self::match_symbol, |matched_symbol| { - EncodedTextValue::new(MatchedValue::Symbol(matched_symbol)) - }), - map(Self::match_blob, |matched_blob| { - EncodedTextValue::new(MatchedValue::Blob(matched_blob)) - }), - map(Self::match_clob, |matched_clob| { - EncodedTextValue::new(MatchedValue::Clob(matched_clob)) - }), - map(Self::match_list, |_matched_list| { - // TODO: Cache child expressions found in 1.0 list - let not_yet_used_in_1_0 = - bumpalo::collections::Vec::new_in(self.context.allocator()).into_bump_slice(); - 
EncodedTextValue::new(MatchedValue::List(not_yet_used_in_1_0)) - }), - map(Self::match_sexp, |_matched_sexp| { - // TODO: Cache child expressions found in 1.0 sexp - let not_yet_used_in_1_0 = - bumpalo::collections::Vec::new_in(self.context.allocator()).into_bump_slice(); - EncodedTextValue::new(MatchedValue::SExp(not_yet_used_in_1_0)) - }), - map(Self::match_struct, |_matched_struct| { - // TODO: Cache child expressions found in 1.0 struct - let not_yet_used_in_1_0 = - bumpalo::collections::Vec::new_in(self.context.allocator()).into_bump_slice(); - EncodedTextValue::new(MatchedValue::Struct(not_yet_used_in_1_0)) - }), - ))) - .map(|(input, encoded_value)| LazyRawTextValue_1_0 { - encoded_value, - input, - }) - .parse(self) - } - - pub fn match_value_1_1(self) -> IonParseResult<'top, LazyRawTextValue_1_1<'top>> { - consumed(alt(( - // For `null` and `bool`, we use `read_` instead of `match_` because there's no additional - // parsing to be done. - map(Self::match_null, |ion_type| { - EncodedTextValue::new(MatchedValue::Null(ion_type)) - }), - map(Self::match_bool, |value| { - EncodedTextValue::new(MatchedValue::Bool(value)) - }), - // For `int` and the other types, we use `match` and store the partially-processed input in the - // `matched_value` field of the `EncodedTextValue` we return. 
- map(Self::match_int, |matched_int| { - EncodedTextValue::new(MatchedValue::Int(matched_int)) - }), - map(Self::match_float, |matched_float| { - EncodedTextValue::new(MatchedValue::Float(matched_float)) - }), - map(Self::match_decimal, |matched_decimal| { - EncodedTextValue::new(MatchedValue::Decimal(matched_decimal)) - }), - map(Self::match_timestamp, |matched_timestamp| { - EncodedTextValue::new(MatchedValue::Timestamp(matched_timestamp)) - }), - map(Self::match_string, |matched_string| { - EncodedTextValue::new(MatchedValue::String(matched_string)) - }), - map(Self::match_symbol, |matched_symbol| { - EncodedTextValue::new(MatchedValue::Symbol(matched_symbol)) - }), - map(Self::match_blob, |matched_blob| { - EncodedTextValue::new(MatchedValue::Blob(matched_blob)) - }), - map(Self::match_clob, |matched_clob| { - EncodedTextValue::new(MatchedValue::Clob(matched_clob)) - }), - map(Self::match_list_1_1, |(_matched_list, child_expr_cache)| { - EncodedTextValue::new(MatchedValue::List(child_expr_cache)) - }), - map(Self::match_sexp_1_1, |(_matched_sexp, child_expr_cache)| { - EncodedTextValue::new(MatchedValue::SExp(child_expr_cache)) - }), - map( - Self::match_struct_1_1, - |(_matched_struct, field_expr_cache)| { - EncodedTextValue::new(MatchedValue::Struct(field_expr_cache)) - }, - ), - ))) - .map(|(input, encoded_value)| LazyRawTextValue_1_1 { - encoded_value, - input, - }) - .parse(self) - } - - /// Matches a list. - /// - /// If the input does not contain the entire list, returns `IonError::Incomplete(_)`. - pub fn match_list(self) -> IonMatchResult<'top> { - // If it doesn't start with [, it isn't a list. - if self.bytes().first() != Some(&b'[') { - let error = InvalidInputError::new(self); - return Err(nom::Err::Error(IonParseError::Invalid(error))); - } - // Scan ahead to find the end of this list. 
- let list_body = self.slice_to_end(1); - let sequence_iter = RawTextListIterator_1_0::new(list_body); - let span = match sequence_iter.find_span() { - Ok(span) => span, - // If the complete container isn't available, return an incomplete. - Err(IonError::Incomplete(_)) => return Err(nom::Err::Incomplete(Needed::Unknown)), - // If invalid syntax was encountered, return a failure to prevent nom from trying - // other parser kinds. - Err(e) => { - return { - let error = InvalidInputError::new(self) - .with_label("matching a list") - .with_description(format!("{}", e)); - Err(nom::Err::Failure(IonParseError::Invalid(error))) - } - } - }; - - // For the matched span, we use `self` again to include the opening `[` - let matched = self.slice(0, span.len()); - let remaining = self.slice_to_end(span.len()); - Ok((remaining, matched)) - } - - /// Matches an Ion v1.1 list, which allows e-expressions (macro invocations) to appear in value - /// position. - /// - /// If the input does not contain the entire list, returns `IonError::Incomplete(_)`. - // TODO: DRY with `match_list` - pub fn match_list_1_1( - self, - ) -> IonParseResult< - 'top, - ( - TextBuffer<'top>, - &'top [LazyRawValueExpr<'top, TextEncoding_1_1>], - ), - > { - // If it doesn't start with [, it isn't a list. - if self.bytes().first() != Some(&b'[') { - let error = InvalidInputError::new(self); - return Err(nom::Err::Error(IonParseError::Invalid(error))); - } - // Scan ahead to find the end of this list. - let list_body = self.slice_to_end(1); - let sequence_iter = RawTextListIterator_1_1::new(list_body); - let (span, child_exprs) = match TextListSpanFinder_1_1::new( - self.context.allocator(), - sequence_iter, - ) - .find_span() - { - Ok((span, child_exprs)) => (span, child_exprs), - // If the complete container isn't available, return an incomplete. 
- Err(IonError::Incomplete(_)) => return Err(nom::Err::Incomplete(Needed::Unknown)), - // If invalid syntax was encountered, return a failure to prevent nom from trying - // other parser kinds. - Err(e) => { - return { - let error = InvalidInputError::new(self) - .with_label("matching a v1.1 list") - .with_description(format!("couldn't match span: {}", e)); - Err(nom::Err::Failure(IonParseError::Invalid(error))) - } - } - }; - - // For the matched span, we use `self` again to include the opening `[` - let matched = self.slice(0, span.len()); - let remaining = self.slice_to_end(span.len()); - Ok((remaining, (matched, child_exprs))) - } - - // TODO: DRY with `match_sexp` - pub fn match_sexp_1_1( - self, - ) -> IonParseResult< - 'top, - ( - TextBuffer<'top>, - &'top [LazyRawValueExpr<'top, TextEncoding_1_1>], - ), - > { - if self.bytes().first() != Some(&b'(') { - let error = InvalidInputError::new(self); - return Err(nom::Err::Error(IonParseError::Invalid(error))); - } - // Scan ahead to find the end of this sexp - let sexp_body = self.slice_to_end(1); - let sexp_iter = RawTextSExpIterator_1_1::new(sexp_body); - let (span, child_expr_cache) = - match TextSExpSpanFinder_1_1::new(self.context.allocator(), sexp_iter).find_span(1) { - Ok((span, child_expr_cache)) => (span, child_expr_cache), - // If the complete container isn't available, return an incomplete. - Err(IonError::Incomplete(_)) => return Err(nom::Err::Incomplete(Needed::Unknown)), - // If invalid syntax was encountered, return a failure to prevent nom from trying - // other parser kinds. 
- Err(e) => { - return { - let error = InvalidInputError::new(self) - .with_label("matching a 1.1 sexp") - .with_description(format!("{}", e)); - Err(nom::Err::Failure(IonParseError::Invalid(error))) - } - } - }; - // For the matched span, we use `self` again to include the opening `(` - let matched = self.slice(0, span.len()); - let remaining = self.slice_to_end(span.len()); - Ok((remaining, (matched, child_expr_cache))) - } - - /// Matches a single value in a list OR the end of the list, allowing for leading whitespace - /// and comments in either case. - /// - /// If a value is found, returns `Ok(Some(value))`. If the end of the list is found, returns - /// `Ok(None)`. - pub fn match_list_value(self) -> IonParseResult<'top, Option>> { - preceded( - // Some amount of whitespace/comments... - Self::match_optional_comments_and_whitespace, - // ...followed by either the end of the list... - alt(( - value(None, tag("]")), - // ...or a value... - terminated( - Self::match_annotated_value.map(Some), - // ...followed by a comma or end-of-list - Self::match_delimiter_after_list_value, - ), - )), - )(self) - } - - /// Matches either: - /// * An e-expression (i.e. 
macro invocation) - /// * An optional annotations sequence and a value - pub fn match_list_value_1_1( - self, - ) -> IonParseResult<'top, Option>> { - whitespace_and_then(alt(( - terminated( - Self::match_e_expression, - Self::match_delimiter_after_list_value, - ) - .map(|matched| Some(RawValueExpr::EExp(matched))), - value(None, tag("]")), - terminated( - Self::match_annotated_long_string_in_list.map(Some), - Self::match_delimiter_after_list_value, - ) - .map(|maybe_matched| maybe_matched.map(RawValueExpr::ValueLiteral)), - terminated( - Self::match_annotated_value_1_1.map(Some), - // ...followed by a comma or end-of-list - Self::match_delimiter_after_list_value, - ) - .map(|maybe_matched| maybe_matched.map(RawValueExpr::ValueLiteral)), - ))) - .parse(self) - } - - /// Matches syntax that is expected to follow a value in a list: any amount of whitespace and/or - /// comments followed by either a comma (consumed) or an end-of-list `]` (not consumed). - fn match_delimiter_after_list_value(self) -> IonMatchResult<'top> { - preceded( - Self::match_optional_comments_and_whitespace, - alt((tag(","), peek(tag("]")))), - )(self) - } - - /// Matches an s-expression (sexp). - /// - /// If the input does not contain the entire s-expression, returns `IonError::Incomplete(_)`. - pub fn match_sexp(self) -> IonMatchResult<'top> { - if self.bytes().first() != Some(&b'(') { - let error = InvalidInputError::new(self); - return Err(nom::Err::Error(IonParseError::Invalid(error))); - } - // Scan ahead to find the end of this sexp - let sexp_body = self.slice_to_end(1); - let sexp_iter = RawTextSExpIterator_1_0::new(sexp_body); - let span = match sexp_iter.find_span(1) { - Ok(span) => span, - // If the complete container isn't available, return an incomplete. - Err(IonError::Incomplete(_)) => return Err(nom::Err::Incomplete(Needed::Unknown)), - // If invalid syntax was encountered, return a failure to prevent nom from trying - // other parser kinds. 
- Err(e) => { - return { - let error = InvalidInputError::new(self) - .with_label("matching a sexp") - .with_description(format!("{}", e)); - Err(nom::Err::Failure(IonParseError::Invalid(error))) - } - } - }; - // For the matched span, we use `self` again to include the opening `(` - let matched = self.slice(0, span.len()); - let remaining = self.slice_to_end(span.len()); - Ok((remaining, matched)) + )) + .parse_next(self) } - /// Matches a struct. - /// - /// If the input does not contain the entire struct, returns `IonError::Incomplete(_)`. - pub fn match_struct(self) -> IonMatchResult<'top> { - // If it doesn't start with {, it isn't a struct. - if self.bytes().first() != Some(&b'{') { - let error = InvalidInputError::new(self); - return Err(nom::Err::Error(IonParseError::Invalid(error))); - } - // Scan ahead to find the end of this struct. - let struct_body = self.slice_to_end(1); - let struct_iter = RawTextStructIterator_1_0::new(struct_body); - let span = match struct_iter.find_span() { - Ok(span) => span, - // If the complete container isn't available, return an incomplete. - Err(IonError::Incomplete(_)) => return Err(nom::Err::Incomplete(Needed::Unknown)), - // If invalid syntax was encountered, return a failure to prevent nom from trying - // other parser kinds. - Err(e) => { - return { - let error = InvalidInputError::new(self) - .with_label("matching a struct") - .with_description(format!("{}", e)); - Err(nom::Err::Failure(IonParseError::Invalid(error))) + /// Matches a single Ion 1.0 value. + pub fn match_value>(&mut self) -> IonParseResult<'top, E::Value<'top>> { + dispatch! 
{ + |input: &mut TextBuffer<'top>| input.peek_byte(); + byte if byte.is_ascii_digit() || byte == b'-' => { + alt(( + Self::match_int_value, + Self::match_float_value, + Self::match_decimal_value, + Self::match_timestamp_value, + )) + }, + byte if byte.is_ascii_alphabetic() => { + alt(( + Self::match_null_value, + Self::match_bool_value, + Self::match_identifier_value, + Self::match_float_special_value, // nan + )) + }, + b'$' | b'_' => { + Self::match_symbol_value // identifiers and symbol IDs + }, + b'"' | b'\'' => { + alt(( + Self::match_string_value, + Self::match_symbol_value, + )) + }, + b'[' => E::list_matcher(), + b'(' => E::sexp_matcher(), + b'{' => { + alt(( + Self::match_blob_value, + Self::match_clob_value, + E::struct_matcher(), + )) + }, + b'+' => Self::match_float_special_value, // +inf + _other => { + // `other` is not a legal start-of-value byte. + |input: &mut TextBuffer<'top>| { + let error = InvalidInputError::new(*input); + Err(ErrMode::Backtrack(IonParseError::Invalid(error))) } - } - }; - - // For the matched span, we use `self` again to include the opening `{` - let matched = self.slice(0, span.len()); - let remaining = self.slice_to_end(span.len()); - Ok((remaining, matched)) - } - - pub fn match_struct_1_1( - self, - ) -> IonParseResult< - 'top, - ( - TextBuffer<'top>, - &'top [LazyRawFieldExpr<'top, TextEncoding_1_1>], - ), - > { - // If it doesn't start with {, it isn't a struct. - if self.bytes().first() != Some(&b'{') { - let error = InvalidInputError::new(self); - return Err(nom::Err::Error(IonParseError::Invalid(error))); + }, } - // Scan ahead to find the end of this struct. - let struct_body = self.slice_to_end(1); - let struct_iter = RawTextStructIterator_1_1::new(struct_body); - let (span, fields) = match TextStructSpanFinder_1_1::new( - self.context.allocator(), - struct_iter, - ) - .find_span() - { - Ok((span, fields)) => (span, fields), - // If the complete container isn't available, return an incomplete. 
- Err(IonError::Incomplete(_)) => return Err(nom::Err::Incomplete(Needed::Unknown)), - // If invalid syntax was encountered, return a failure to prevent nom from trying - // other parser kinds. - Err(e) => { - return { - let error = InvalidInputError::new(self) - .with_label("matching a v1.1 struct") - .with_description(format!("{}", e)); - Err(nom::Err::Failure(IonParseError::Invalid(error))) - } - } - }; - - // For the matched span, we use `self` again to include the opening `{` - let matched = self.slice(0, span.len()); - let remaining = self.slice_to_end(span.len()); - Ok((remaining, (matched, fields))) + .with_taken() + .map(|(encoded_value, input)| E::new_value(input, encoded_value)) + .parse_next(self) } pub fn match_e_expression_arg_group( - self, + &mut self, parameter: &'top Parameter, ) -> IonParseResult<'top, TextEExpArgGroup<'top>> { alt(( Self::parser_with_arg(Self::match_explicit_arg_group, parameter), Self::parser_with_arg(Self::match_rest, parameter), - ))(self) + )) + .parse_next(self) } /// Higher-order helper that takes a closure and an argument to pass and constructs a new /// parser that calls the closure with the provided argument. 
pub fn parser_with_arg( - mut parser: impl FnMut(Self, &'top A) -> IonParseResult<'top, O>, + mut parser: impl FnMut(&mut Self, &'top A) -> IonParseResult<'top, O>, arg_to_pass: &'top A, - ) -> impl Parser> { - move |input: TextBuffer<'top>| parser(input, arg_to_pass) + ) -> impl IonParser<'top, O> { + move |input: &mut TextBuffer<'top>| parser(input, arg_to_pass) } pub fn match_explicit_arg_group( - self, + &mut self, parameter: &'top Parameter, ) -> IonParseResult<'top, TextEExpArgGroup<'top>> { - let (group_body, group_head) = alt(( - // A trivially empty arg group: `(:)` - terminated(tag("(::"), peek(tag(")"))), - // An arg group that is not trivially empty, though it may only contain whitespace: - // (:: ) - // (:: 1 2 3) - recognize(pair(tag("(::"), Self::match_optional_whitespace)), - ))(self)?; - - // The rest of the group uses s-expression syntax. Scan ahead to find the end of this - // group. - let sexp_iter = RawTextSExpIterator_1_1::new(group_body); - // The sexp iterator holds the body of the expression. When finding the input span it occupies, - // we tell the iterator how many bytes comprised the head of the expression: `(:` followed - // by whitespace. - let initial_bytes_skipped = group_head.len(); - let (span, child_expr_cache) = - match TextSExpSpanFinder_1_1::new(self.context.allocator(), sexp_iter) - .find_span(initial_bytes_skipped) - { - Ok((span, child_expr_cache)) => (span, child_expr_cache), - // If the complete group isn't available, return an incomplete. - Err(IonError::Incomplete(_)) => return Err(nom::Err::Incomplete(Needed::Unknown)), - // If invalid syntax was encountered, return a failure to prevent nom from trying - // other parser kinds. 
- Err(e) => { - return { - let error = InvalidInputError::new(self) - .with_label("matching an e-expression argument group") - .with_description(format!("{}", e)); - Err(nom::Err::Failure(IonParseError::Invalid(error))) - } - } - }; - // For the matched span, we use `self` again to include the opening `(:` and whitespace. - let matched = self.slice(0, span.len()); - let remaining = self.slice_to_end(span.len()); - let arg_group = TextEExpArgGroup::new(parameter, matched, child_expr_cache); - Ok((remaining, arg_group)) + TextEncoding_1_1::container_matcher( + "an explicit argument group", + "(::", + RawTextSExpIterator::::new, + whitespace_and_then(")") + ) + .with_taken() + .map(|(expr_cache, input)| TextEExpArgGroup::new(parameter, input, expr_cache)) + .parse_next(self) } - pub fn match_e_expression_name(self) -> IonParseResult<'top, MacroIdRef<'top>> { - let (exp_body_after_id, (macro_id_bytes, matched_symbol)) = - consumed(Self::match_identifier)(self)?; + pub fn match_e_expression_name(&mut self) -> IonParseResult<'top, MacroIdRef<'top>> { + let (matched_symbol, macro_id_bytes) = + Self::match_identifier.with_taken().parse_next(self)?; let name = match matched_symbol .read(self.context.allocator(), macro_id_bytes) .expect("matched identifier but failed to read its bytes") @@ -1168,33 +621,32 @@ impl<'top> TextBuffer<'top> { RawSymbolRef::Text(text) => text, RawSymbolRef::SystemSymbol_1_1(system_symbol) => system_symbol.text(), }; - Ok((exp_body_after_id, MacroIdRef::LocalName(name))) + Ok(MacroIdRef::LocalName(name)) } - pub fn match_e_expression_address(self) -> IonParseResult<'top, MacroIdRef<'top>> { - let (exp_body_after_id, address) = Self::match_address(self)?; + pub fn match_e_expression_address(&mut self) -> IonParseResult<'top, MacroIdRef<'top>> { + let address = Self::match_address(self)?; let id = MacroIdRef::LocalAddress(address); - Ok((exp_body_after_id, id)) + Ok(id) } - pub fn match_system_eexp_id(self) -> IonParseResult<'top, MacroIdRef<'top>> 
{ - let (after_system_annotation, _matched_system_annotation) = recognize(tuple(( - tag("$ion"), - whitespace_and_then(tag("::")), - Self::match_optional_whitespace, - ))) - .parse(self)?; + pub fn match_system_eexp_id(&mut self) -> IonParseResult<'top, MacroIdRef<'top>> { + let _matched_system_annotation = + ("$ion", whitespace_and_then("::"), Self::match_whitespace0) + .take() + .parse_next(self)?; - let (remaining, id) = alt(( + let id = alt(( Self::match_e_expression_address, Self::match_e_expression_name, )) - .parse(after_system_annotation)?; + .parse_next(self)?; + let system_id = match id { MacroIdRef::LocalName(name) => { let Some(macro_address) = ION_1_1_SYSTEM_MACROS.address_for_name(name) else { return fatal_parse_error( - after_system_annotation, + *self, format!("Found unrecognized system macro name: '{}'", name), ); }; @@ -1204,7 +656,7 @@ impl<'top> TextBuffer<'top> { MacroIdRef::LocalAddress(address) => { let Some(system_address) = SystemMacroAddress::new(address) else { return fatal_parse_error( - after_system_annotation, + *self, format!("Found out-of-bounds system macro address {}", address), ); }; @@ -1214,80 +666,77 @@ impl<'top> TextBuffer<'top> { unreachable!("`match_e_expression_address` always returns a LocalAddress") } }; - Ok((remaining, system_id)) + Ok(system_id) } - pub fn match_e_expression_id(self) -> IonParseResult<'top, MacroIdRef<'top>> { - let (input_after_id, id) = alt(( + pub fn match_e_expression_id(&mut self) -> IonParseResult<'top, MacroIdRef<'top>> { + let id = alt(( Self::match_system_eexp_id, Self::match_e_expression_name, Self::match_e_expression_address, - ))(self)?; + )) + .parse_next(self)?; - if input_after_id.is_empty() { - // Unlike a symbol value with identifier syntax, an e-expression identifier cannot be - // the last thing in the stream. - return Err(nom::Err::Incomplete(Needed::Unknown)); - }; - Ok((input_after_id, id)) + Ok(id) } /// Matches an e-expression invoking a macro. 
/// /// If the input does not contain the entire e-expression, returns `IonError::Incomplete(_)`. - pub fn match_e_expression(self) -> IonParseResult<'top, TextEExpression_1_1<'top>> { - let (eexp_body, _opening_tag) = tag("(:")(self)?; - let (mut remaining, id) = Self::match_e_expression_id(eexp_body)?; - let mut arg_expr_cache = BumpVec::new_in(self.context.allocator()); - - let macro_ref: &'top Macro = self - .context() - .macro_table() - .macro_with_id(id) - .ok_or_else(|| { - nom::Err::Failure(IonParseError::Invalid( - InvalidInputError::new(self) - .with_description(format!("could not find macro with id {:?}", id)), - )) - })? - .reference(); - let signature_params: &'top [Parameter] = macro_ref.signature().parameters(); - for (index, param) in signature_params.iter().enumerate() { - let (input_after_match, maybe_arg) = remaining.match_argument_for(param)?; - remaining = input_after_match; - match maybe_arg { - Some(arg) => arg_expr_cache.push(arg), - None => { - for param in &signature_params[index..] { - if !param.can_be_omitted() { - return fatal_parse_error( - self, - format!( - "e-expression did not include an argument for param '{}'", - param.name() - ), - ); + pub fn match_e_expression(&mut self) -> IonParseResult<'top, TextEExpression_1_1<'top>> { + let parser = |input: &mut TextBuffer<'top>| { + let _opening_tag = "(:".parse_next(input)?; + let id = Self::match_e_expression_id(input)?; + let mut arg_expr_cache = BumpVec::new_in(input.context.allocator()); + + let macro_ref: &'top Macro = input + .context() + .macro_table() + .macro_with_id(id) + .ok_or_else(|| { + ErrMode::Cut(IonParseError::Invalid( + InvalidInputError::new(*input) + .with_description(format!("could not find macro with id {:?}", id)), + )) + })? 
+ .reference(); + let signature_params: &'top [Parameter] = macro_ref.signature().parameters(); + for (index, param) in signature_params.iter().enumerate() { + let maybe_arg = input.match_argument_for(param)?; + match maybe_arg { + Some(arg) => arg_expr_cache.push(arg), + None => { + for param in &signature_params[index..] { + if !param.can_be_omitted() { + return fatal_parse_error( + *input, + format!( + "e-expression did not include an argument for param '{}'", + param.name() + ), + ); + } } + break; } - break; } } - } - let (remaining, _end_of_eexp) = match whitespace_and_then(tag(")")).parse(remaining) { - Ok(result) => result, - Err(nom::Err::Incomplete(needed)) => return Err(nom::Err::Incomplete(needed)), - Err(_e) => { - return fatal_parse_error( - remaining, - format!( - "macro {id} signature has {} parameter(s), e-expression had an extra argument", - signature_params.len() - ), - ); + match whitespace_and_then(")").parse_next(input) { + Ok(_closing_delimiter) => Ok((id, macro_ref, arg_expr_cache)), + Err(ErrMode::Incomplete(_)) => input.incomplete("an e-expression"), + Err(_e) => { + fatal_parse_error( + *input, + format!( + "macro {id} signature has {} parameter(s), e-expression had an extra argument", + signature_params.len() + ), + ) + } } }; - - let matched_input = self.slice(0, remaining.offset() - self.offset()); + let ((macro_id, macro_ref, mut arg_expr_cache), matched_input) = + parser.with_taken().parse_next(self)?; let parameters = macro_ref.signature().parameters(); if arg_expr_cache.len() < parameters.len() { @@ -1300,12 +749,13 @@ impl<'top> TextBuffer<'top> { let last_explicit_arg_end = arg_expr_cache .last() .map(|arg| arg.expr().range().end) - .unwrap_or(remaining.offset); + .unwrap_or(self.offset); for parameter in ¶meters[arg_expr_cache.len()..] 
{ let buffer = TextBuffer::new_with_offset( self.context, EMPTY_ARG_TEXT.as_bytes(), last_explicit_arg_end, + self.is_final_data(), ); arg_expr_cache.push(EExpArg::new( parameter, @@ -1318,21 +768,22 @@ impl<'top> TextBuffer<'top> { "every parameter must have an argument, explicit or implicit" ); - Ok(( - remaining, - TextEExpression_1_1::new(id, matched_input, arg_expr_cache.into_bump_slice()), + Ok(TextEExpression_1_1::new( + macro_id, + matched_input, + arg_expr_cache.into_bump_slice(), )) } pub fn match_argument_for( - self, + &mut self, parameter: &'top Parameter, ) -> IonParseResult<'top, Option>> { use crate::lazy::expanded::template::ParameterCardinality::*; match parameter.cardinality() { ExactlyOne => { - let (remaining, arg) = self.match_exactly_one(parameter)?; - Ok((remaining, Some(arg))) + let arg = self.match_exactly_one(parameter)?; + Ok(Some(arg)) } ZeroOrOne => self.match_zero_or_one(parameter), ZeroOrMore => self.match_zero_or_more(parameter), @@ -1341,25 +792,25 @@ impl<'top> TextBuffer<'top> { } pub fn match_exactly_one( - self, + &mut self, parameter: &'top Parameter, ) -> IonParseResult<'top, EExpArg<'top, TextEncoding_1_1>> { - let (after_ws, _ws) = self.match_optional_comments_and_whitespace()?; + let _whitespace = self.match_optional_comments_and_whitespace()?; // This check exists to offer a more human-friendly error message; without it, // the user simply sees a parsing failure. 
- if after_ws.bytes().starts_with(b"(::") { + if self.bytes().starts_with(b"(::") { return fatal_parse_error( - self, - format!("parameter '{}' has cardinality `ExactlyOne`; it cannot accept an expression group", parameter.name()) + *self, + format!("parameter '{}' has cardinality `ExactlyOne`; it cannot accept an expression group", parameter.name()), ); } - let (remaining, maybe_expr) = Self::match_sexp_value_1_1 + let maybe_expr = Self::match_sexp_item_1_1 .map(|expr| expr.map(EExpArgExpr::::from)) - .parse(after_ws)?; + .parse_next(self)?; match maybe_expr { - Some(expr) => Ok((remaining, EExpArg::new(parameter, expr))), + Some(expr) => Ok(EExpArg::new(parameter, expr)), None => fatal_parse_error( - after_ws, + *self, format!( "expected argument for required parameter '{}'", parameter.name() @@ -1368,60 +819,64 @@ impl<'top> TextBuffer<'top> { } } + + pub fn match_empty_arg_group( - self, + &mut self, parameter: &'top Parameter, ) -> IonParseResult<'top, EExpArg<'top, TextEncoding_1_1>> { - recognize(pair(tag("(::"), whitespace_and_then(tag(")")))) + ("(::", whitespace_and_then(")")) + .take() .map(|matched_expr| { let arg_group = TextEExpArgGroup::new(parameter, matched_expr, &[]); EExpArg::new(parameter, EExpArgExpr::ArgGroup(arg_group)) }) - .parse(self) + .parse_next(self) } pub fn match_zero_or_one( - self, + &mut self, parameter: &'top Parameter, ) -> IonParseResult<'top, Option>> { whitespace_and_then(alt(( Self::parser_with_arg(Self::match_empty_arg_group, parameter).map(Some), // TODO: Match a non-empty arg group and turn it into a failure with a helpful error message - Self::match_sexp_value_1_1.map(|maybe_expr| { + Self::match_sexp_item_1_1.map(|maybe_expr| { maybe_expr.map(|expr| { EExpArg::new(parameter, EExpArgExpr::::from(expr)) }) }), ))) - .parse(self) + .parse_next(self) } pub fn match_zero_or_more( - self, + &mut self, parameter: &'top Parameter, ) -> IonParseResult<'top, Option>> { - let (remaining, maybe_expr) = preceded( + let maybe_expr = 
preceded( Self::match_optional_comments_and_whitespace, alt(( Self::parser_with_arg(Self::match_e_expression_arg_group, parameter) .map(|group| Some(EExpArg::new(parameter, EExpArgExpr::ArgGroup(group)))), - Self::match_sexp_value_1_1.map(|expr| { + Self::match_sexp_item_1_1.map(|expr| { expr.map(EExpArgExpr::from) .map(|expr| EExpArg::new(parameter, expr)) }), - value(None, peek(tag(")"))), + peek(")").value(None), )), - )(self)?; - Ok((remaining, maybe_expr)) + ) + .parse_next(self)?; + Ok(maybe_expr) } pub fn match_one_or_more( - self, + &mut self, parameter: &'top Parameter, ) -> IonParseResult<'top, Option>> { if self.match_empty_arg_group(parameter).is_ok() { - return Err(nom::Err::Failure(IonParseError::Invalid( - InvalidInputError::new(self).with_description(format!( + return Err(ErrMode::Cut(IonParseError::Invalid( + InvalidInputError::new(*self).with_description(format!( "parameter '{}' is one-or-more (`+`) and cannot accept an empty stream", parameter.name() )), @@ -1432,90 +887,94 @@ impl<'top> TextBuffer<'top> { } pub fn match_rest( - self, + &mut self, parameter: &'top Parameter, ) -> IonParseResult<'top, TextEExpArgGroup<'top>> { if parameter.rest_syntax_policy() == RestSyntaxPolicy::NotAllowed { - return Err(nom::Err::Error(IonParseError::Invalid( - InvalidInputError::new(self) + return Err(ErrMode::Backtrack(IonParseError::Invalid( + InvalidInputError::new(*self) .with_description("parameter does not support rest syntax"), ))); } - let mut remaining = self; let mut cache = BumpVec::new_in(self.context().allocator()); - loop { - let (remaining_after_expr, maybe_expr) = alt(( - value(None, whitespace_and_then(peek(tag(")")))), - Self::match_sexp_value_1_1, + let parser = |input: &mut TextBuffer<'top>| { + while let Some(expr) = alt(( + whitespace_and_then(peek(")")).value(None), + Self::match_sexp_item_1_1, )) - .parse(remaining)?; - if let Some(expr) = maybe_expr { - remaining = remaining_after_expr; + .parse_next(input)? 
+ { cache.push(expr); - } else { - return Ok(( - remaining, - TextEExpArgGroup::new(parameter, self, cache.into_bump_slice()), - )); } - } + Ok(()) + }; + let (_, matched_input) = parser.with_taken().parse_next(self)?; + + Ok(TextEExpArgGroup::new( + parameter, + matched_input, + cache.into_bump_slice(), + )) } /// Matches and returns a boolean value. - pub fn match_bool(self) -> IonParseResult<'top, bool> { + pub fn match_bool(&mut self) -> IonParseResult<'top, bool> { terminated( - alt(( - value(true, complete_tag("true")), - value(false, complete_tag("false")), - )), + alt(("true".value(true), "false".value(false))), Self::peek_stop_character, - )(self) + ) + .parse_next(self) } /// Matches and returns any type of null. (`null`, `null.null`, `null.int`, etc) - pub fn match_null(self) -> IonParseResult<'top, IonType> { + pub fn match_null(&mut self) -> IonParseResult<'top, IonType> { terminated( alt(( - pair(complete_tag("null."), Self::match_ion_type).map(|(_, ion_type)| ion_type), - complete_tag("null").map(|_| IonType::Null), + ("null.", Self::match_ion_type).map(|(_, ion_type)| ion_type), + "null".value(IonType::Null), )), Self::peek_stop_character, ) - .parse(self) + .parse_next(self) } /// Matches and returns an Ion type. 
- fn match_ion_type(self) -> IonParseResult<'top, IonType> { + fn match_ion_type(&mut self) -> IonParseResult<'top, IonType> { alt(( - value(IonType::Null, tag("null")), - value(IonType::Bool, tag("bool")), - value(IonType::Int, tag("int")), - value(IonType::Float, tag("float")), - value(IonType::Decimal, tag("decimal")), - value(IonType::Timestamp, tag("timestamp")), - value(IonType::Symbol, tag("symbol")), - value(IonType::String, tag("string")), - value(IonType::Clob, tag("clob")), - value(IonType::Blob, tag("blob")), - value(IonType::List, tag("list")), - value(IonType::SExp, tag("sexp")), - value(IonType::Struct, tag("struct")), - ))(self) + "null".value(IonType::Null), + "bool".value(IonType::Bool), + "int".value(IonType::Int), + "float".value(IonType::Float), + "decimal".value(IonType::Decimal), + "timestamp".value(IonType::Timestamp), + "symbol".value(IonType::Symbol), + "string".value(IonType::String), + "clob".value(IonType::Clob), + "blob".value(IonType::Blob), + "list".value(IonType::List), + "sexp".value(IonType::SExp), + "struct".value(IonType::Struct), + )) + .parse_next(self) } /// Matches any one of Ion's stop characters. - fn match_stop_character(self) -> IonMatchResult<'top> { - alt((eof, recognize(one_of("{}[](),\"' \t\n\r\u{0b}\u{0c}"))))(self) + fn match_stop_character(&mut self) -> IonMatchResult<'top> { + alt(( + eof, + one_of("{}[](),\"' \t\n\r\u{0b}\u{0c}".as_bytes()).take(), + )) + .parse_next(self) } /// Matches--but does not consume--any one of Ion's stop characters. - fn peek_stop_character(self) -> IonMatchResult<'top> { - peek(Self::match_stop_character).parse(self) + fn peek_stop_character(&mut self) -> IonMatchResult<'top> { + peek(Self::match_stop_character).parse_next(self) } /// Matches the three parts of an int--its base, its sign, and its digits--without actually /// constructing an Int from them. 
- pub fn match_int(self) -> IonParseResult<'top, MatchedInt> { + pub fn match_int(&mut self) -> IonParseResult<'top, MatchedInt> { terminated( // We test for base 16 and base 2 so the '0x' or '0b' isn't confused for a leading zero // in a base 10 number, which would be illegal. @@ -1525,165 +984,177 @@ impl<'top> TextBuffer<'top> { Self::match_base_10_int, )), Self::peek_stop_character, - )(self) - } + ) + .parse_next(self) + } + + scalar_value_matchers!( + Self::match_null => Null => match_null_value, + Self::match_bool => Bool => match_bool_value, + Self::match_int => Int => match_int_value, + Self::match_float => Float => match_float_value, + Self::match_float_special => Float => match_float_special_value, + Self::match_decimal => Decimal => match_decimal_value, + Self::match_timestamp => Timestamp => match_timestamp_value, + Self::match_string => String => match_string_value, + Self::match_symbol => Symbol => match_symbol_value, + Self::match_identifier => Symbol => match_identifier_value, + Self::match_blob => Blob => match_blob_value, + Self::match_clob => Clob => match_clob_value, + ); /// Matches a base-2 notation integer (e.g. `0b0`, `0B1010`, or `-0b0111`) and returns the /// partially parsed value as a [`MatchedInt`]. - fn match_base_2_int(self) -> IonParseResult<'top, MatchedInt> { - separated_pair( - opt(char('-')), - alt((complete_tag("0b"), complete_tag("0B"))), - Self::match_base_2_int_digits, - ) - .map(|(maybe_sign, digits)| { - MatchedInt::new(2, maybe_sign.is_some(), digits.offset() - self.offset()) - }) - .parse(self) + fn match_base_2_int(&mut self) -> IonParseResult<'top, MatchedInt> { + let initial_offset = self.offset(); + separated_pair(opt("-"), alt(("0b", "0B")), Self::match_base_2_int_digits) + .map(|(maybe_sign, digits)| { + MatchedInt::new(2, maybe_sign.is_some(), digits.offset() - initial_offset) + }) + .parse_next(self) } /// Matches the digits of a base-2 integer. 
- fn match_base_2_int_digits(self) -> IonMatchResult<'top> { - recognize(terminated( + fn match_base_2_int_digits(&mut self) -> IonMatchResult<'top> { + terminated( // Zero or more digits-followed-by-underscores - many0_count(pair(complete_is_a("01"), complete_tag("_"))), + zero_or_more((take_while(1.., b"01"), "_")), // One or more digits - pair( - one_of("01"), - many0_count(nom::character::complete::one_of("01")), - ), - ))(self) + one_or_more(one_of(b"01")), + ) + .take() + .parse_next(self) } /// Matches a base-10 notation integer (e.g. `0`, `255`, or `-1_024`) and returns the partially /// parsed value as a [`MatchedInt`]. - fn match_base_10_int(self) -> IonParseResult<'top, MatchedInt> { - pair(opt(char('-')), Self::match_base_10_int_digits) + fn match_base_10_int(&mut self) -> IonParseResult<'top, MatchedInt> { + let initial_offset = self.offset(); + (opt("-"), Self::match_base_10_int_digits) .map(|(maybe_sign, digits)| { - MatchedInt::new(10, maybe_sign.is_some(), digits.offset() - self.offset()) + MatchedInt::new(10, maybe_sign.is_some(), digits.offset() - initial_offset) }) - .parse(self) + .parse_next(self) } /// Matches the digits of a base-10 integer. (i.e. An integer without a sign.) - fn match_base_10_int_digits(self) -> IonMatchResult<'top> { + fn match_base_10_int_digits(&mut self) -> IonMatchResult<'top> { Self::match_base_10_digits_before_dot(self) } /// Matches either: /// * a zero /// * a non-zero followed by some number of digits with optional underscores - fn match_base_10_digits_before_dot(self) -> IonMatchResult<'top> { + fn match_base_10_digits_before_dot(&mut self) -> IonMatchResult<'top> { alt(( // The number is either a zero... - complete_tag("0"), + "0", // Or it's a non-zero followed by some number of '_'-separated digits - recognize(pair( + ( Self::match_base_10_leading_digit, Self::match_base_10_trailing_digits, - )), - ))(self) + ) + .take(), + )) + .parse_next(self) } /// Matches the first digit of a multi-digit base-10 integer. 
(i.e. Any digit but zero.) - fn match_base_10_leading_digit(self) -> IonMatchResult<'top> { - recognize(one_of("123456789"))(self) + fn match_base_10_leading_digit(&mut self) -> IonMatchResult<'top> { + one_of(b"123456789").take().parse_next(self) } /// Matches any number of digits with underscores optionally appearing in the middle. /// This parser accepts leading zeros, which is why it cannot be used for the beginning /// of a number. - fn match_base_10_trailing_digits(self) -> IonMatchResult<'top> { + fn match_base_10_trailing_digits(&mut self) -> IonMatchResult<'top> { // A sequence of zero or more... - recognize(many0_count(alt(( + zero_or_more(alt(( //...underscore-followed-by-a-digit... - recognize(preceded(complete_tag("_"), satisfy(|c| c.is_ascii_digit()))), - //...or a digit. - complete_digit1, - ))))(self) + ("_", one_of(|b: u8| b.is_ascii_digit())).take(), + //...or one or more digits. + digit1, + ))) + .parse_next(self) } /// Matches a base-10 notation integer (e.g. `0x0`, `0X20`, or `-0xCAFE`) and returns the /// partially parsed value as a [`MatchedInt`]. 
- fn match_base_16_int(self) -> IonParseResult<'top, MatchedInt> { + fn match_base_16_int(&mut self) -> IonParseResult<'top, MatchedInt> { + let initial_offset = self.offset(); separated_pair( - opt(char('-')), - alt((complete_tag("0x"), complete_tag("0X"))), + opt("-"), + alt(("0x", "0X")), Self::match_base_16_int_trailing_digits, ) .map(|(maybe_sign, digits)| { - MatchedInt::new(16, maybe_sign.is_some(), digits.offset() - self.offset()) + MatchedInt::new(16, maybe_sign.is_some(), digits.offset() - initial_offset) }) - .parse(self) + .parse_next(self) } /// Matches the digits that follow the '0x' or '0X' in a base-16 integer - fn match_base_16_int_trailing_digits(self) -> IonMatchResult<'top> { - recognize(terminated( + fn match_base_16_int_trailing_digits(&mut self) -> IonMatchResult<'top> { + terminated( // Zero or more digits-followed-by-underscores - many0_count(pair(Self::take_base_16_digits1, complete_tag("_"))), + zero_or_more((Self::take_base_16_digits1, "_")), // One or more digits Self::take_base_16_digits1, - ))(self) + ) + .take() + .parse_next(self) } /// Recognizes 1 or more consecutive base-16 digits. // This function's "1" suffix is a style borrowed from `nom`. - fn take_base_16_digits1(self) -> IonMatchResult<'top> { - recognize(pair( - // We need at least one digit; if input's empty, this is Incomplete. - satisfy(|c: char| c.is_ascii_hexdigit()), + fn take_base_16_digits1(&mut self) -> IonMatchResult<'top> { + ( + one_of(|b: u8| b.is_ascii_hexdigit()), // After we have our digit, take digits until we find a non-digit (including EOF). - complete_take_while(|b: u8| b.is_ascii_hexdigit()), - ))(self) + take_while(.., |b: u8| b.is_ascii_hexdigit()), + ) + .take() + .parse_next(self) } /// Matches `n` consecutive hex digits. 
pub(crate) fn match_n_hex_digits( count: usize, ) -> impl Parser, TextBuffer<'top>, IonParseError<'top>> { - // `fold_many_m_n` allows us to repeat the same parser between 'm' and 'n' times, - // specifying an operation to perform on each match. In our case, we just need the parser - // to run 'n' times exactly so `recognize` can return the accepted slice; our operation - // is a no-op. - recognize(fold_many_m_n( - count, - count, - satisfy(|c| c.is_ascii_hexdigit()), - || 0, - // no-op - |accum, _item| accum, - )) + n_times(count, one_of(|b: u8| b.is_ascii_hexdigit())).take() } /// Matches an Ion float of any syntax - fn match_float(self) -> IonParseResult<'top, MatchedFloat> { + fn match_float(&mut self) -> IonParseResult<'top, MatchedFloat> { terminated( alt(( - Self::match_float_special_value, + Self::match_float_special, Self::match_float_numeric_value, )), Self::peek_stop_character, - )(self) + ) + .parse_next(self) } /// Matches special IEEE-754 values, including +/- infinity and NaN. - fn match_float_special_value(self) -> IonParseResult<'top, MatchedFloat> { + fn match_float_special(&mut self) -> IonParseResult<'top, MatchedFloat> { alt(( - value(MatchedFloat::NotANumber, complete_tag("nan")), - value(MatchedFloat::PositiveInfinity, tag("+inf")), - value(MatchedFloat::NegativeInfinity, tag("-inf")), - ))(self) + "nan".value(MatchedFloat::NotANumber), + "+inf".value(MatchedFloat::PositiveInfinity), + "-inf".value(MatchedFloat::NegativeInfinity), + )) + .parse_next(self) } /// Matches numeric IEEE-754 floating point values. 
- fn match_float_numeric_value(self) -> IonParseResult<'top, MatchedFloat> { - recognize(pair( + fn match_float_numeric_value(&mut self) -> IonParseResult<'top, MatchedFloat> { + ( Self::match_number_with_optional_dot_and_digits, Self::match_float_exponent_marker_and_digits, - )) - .map(|_matched| MatchedFloat::Numeric) - .parse(self) + ) + .take() + .value(MatchedFloat::Numeric) + .parse_next(self) } /// Matches a number that may or may not have a decimal place and trailing fractional digits. @@ -1692,88 +1163,85 @@ impl<'top> TextBuffer<'top> { /// 1000 /// 1000.559 /// -25.2 - fn match_number_with_optional_dot_and_digits(self) -> IonMatchResult<'top> { - recognize(tuple(( - opt(complete_tag("-")), + fn match_number_with_optional_dot_and_digits(&mut self) -> IonMatchResult<'top> { + ( + opt("-"), Self::match_base_10_digits_before_dot, opt(Self::match_dot_followed_by_base_10_digits), - )))(self) + ) + .take() + .parse_next(self) } /// In a float or decimal, matches the digits that are permitted before the decimal point. /// This includes either a single zero, or a non-zero followed by any sequence of digits. - fn match_digits_before_dot(self) -> IonMatchResult<'top> { + fn match_digits_before_dot(&mut self) -> IonMatchResult<'top> { alt(( - complete_tag("0"), - recognize(pair(Self::match_leading_digit, Self::match_trailing_digits)), - ))(self) + "0", + (Self::match_leading_digit, Self::match_trailing_digits).take(), + )) + .parse_next(self) } /// Matches a single non-zero base 10 digit. - fn match_leading_digit(self) -> IonMatchResult<'top> { - recognize(one_of("123456789"))(self) + fn match_leading_digit(&mut self) -> IonMatchResult<'top> { + one_of(b"123456789").take().parse_next(self) } /// Matches any number of base 10 digits, allowing underscores at any position except the end. 
- fn match_trailing_digits(self) -> IonMatchResult<'top> { - recognize(many0_count(preceded( - opt(complete_char('_')), - complete_digit1, - )))(self) + fn match_trailing_digits(&mut self) -> IonMatchResult<'top> { + zero_or_more(preceded(opt("_"), digit1)).parse_next(self) } /// Recognizes a decimal point followed by any number of base-10 digits. - fn match_dot_followed_by_base_10_digits(self) -> IonMatchResult<'top> { - recognize(preceded( - complete_tag("."), - opt(Self::match_zero_or_more_digits_after_dot), - ))(self) + fn match_dot_followed_by_base_10_digits(&mut self) -> IonMatchResult<'top> { + (".", opt(Self::match_zero_or_more_digits_after_dot)) + .take() + .parse_next(self) } /// Like `match_digits_before_dot`, but allows leading zeros. - fn match_one_or_more_digits_after_dot(self) -> IonMatchResult<'top> { - recognize(terminated( + fn match_one_or_more_digits_after_dot(&mut self) -> IonMatchResult<'top> { + ( // Any number of digit-sequence-with-trailing-underscores... - many0_count(pair(complete_digit1, complete_char('_'))), + zero_or_more((digit1, "_")), // ...and at least one trailing digit. Inputs that don't have any underscores // will be handled by this parser branch. - pair(satisfy(|c| c.is_ascii_digit()), complete_digit0), - // Note: ^-- We use this `pair(satisfy(...), complete_digit0)` to guarantee a subtle - // behavior. At the end of the buffer, an empty input to this parser must be - // considered 'incomplete' instead of 'invalid'. In contrast, an input of a single - // digit would be considered complete even though the buffer could get more data later. - // (If the buffer gets more data, it's the StreamingRawReader's responsibility to - // discard the `1.1` and try again.) - ))(self) + (one_of(|b: u8| b.is_ascii_digit()), digit0), + ) + .take() + .parse_next(self) } /// Like `match_digits_before_dot`, but allows leading zeros. 
- fn match_zero_or_more_digits_after_dot(self) -> IonMatchResult<'top> { - recognize(terminated( + fn match_zero_or_more_digits_after_dot(&mut self) -> IonMatchResult<'top> { + terminated( // Zero or more digits-followed-by-underscores. - many0_count(pair( - complete_digit1, + zero_or_more(( + digit1, terminated( // The digit sequence can be followed by an underscore... - complete_char('_'), + "_", // ...as long as the character after the underscore is another digit. - peek(satisfy(|c| c.is_ascii_digit())), + peek(one_of(|b: u8| b.is_ascii_digit())), ), )), - // ...and zero or more trailing digits. This parser branch handles: - // * inputs that don't have any underscores - // * empty inputs - complete_digit0, - ))(self) + // ...and one or more trailing digits. This parser branch handles + // inputs that don't have any underscores. + digit1, + ) + .take() + .parse_next(self) } /// Matches an `e` or `E` followed by an optional sign (`+` or `-`) followed by one or more /// base 10 digits. - fn match_float_exponent_marker_and_digits(self) -> IonMatchResult<'top> { + fn match_float_exponent_marker_and_digits(&mut self) -> IonMatchResult<'top> { preceded( - complete_one_of("eE"), - recognize(Self::match_exponent_sign_and_one_or_more_digits), - )(self) + one_of(b"eE"), + Self::match_exponent_sign_and_one_or_more_digits.take(), + ) + .parse_next(self) } /// Matches the exponent portion of a decimal (everything after the 'd') or float @@ -1785,68 +1253,75 @@ impl<'top> TextBuffer<'top> { /// /// Returns a boolean indicating whether the sign was negative (vs absent or positive) /// and the buffer slice containing the digits. - fn match_exponent_sign_and_one_or_more_digits(self) -> IonParseResult<'top, (bool, Self)> { - pair( + fn match_exponent_sign_and_one_or_more_digits(&mut self) -> IonParseResult<'top, (bool, Self)> { + ( // Optional leading sign; if there's no sign, it's not negative. 
- opt(Self::match_any_sign).map(|s| s == Some('-')), + opt(Self::match_any_sign).map(|s| s == Some(b'-')), Self::match_one_or_more_digits_after_dot, - )(self) + ) + .parse_next(self) } /// Matches `-` OR `+`. /// /// This is used for matching exponent signs; most places in Ion do not allow `+`. - pub fn match_any_sign(self) -> IonParseResult<'top, std::primitive::char> { - complete_one_of("-+")(self) + pub fn match_any_sign(&mut self) -> IonParseResult<'top, std::primitive::u8> { + one_of(b"-+").parse_next(self) } - pub fn match_decimal_exponent(self) -> IonParseResult<'top, (bool, TextBuffer<'top>)> { + pub fn match_decimal_exponent(&mut self) -> IonParseResult<'top, (bool, TextBuffer<'top>)> { preceded( - complete_one_of("dD"), + one_of(b"dD"), Self::match_exponent_sign_and_one_or_more_digits, - )(self) + ) + .parse_next(self) } /// Match an optional sign (if present), digits before the decimal point, then digits after the /// decimal point (if present). - pub fn match_decimal(self) -> IonParseResult<'top, MatchedDecimal> { + pub fn match_decimal(&mut self) -> IonParseResult<'top, MatchedDecimal> { + let initial_offset = self.offset(); terminated( - tuple(( - opt(complete_tag("-")), + ( + opt("-"), Self::match_digits_before_dot, alt(( - tuple(( - complete_tag("."), + ( + ".", opt(Self::match_zero_or_more_digits_after_dot), opt(Self::match_decimal_exponent), - )) - .map(|(dot, maybe_digits_after_dot, maybe_exponent)| { - let digits_after_dot = match maybe_digits_after_dot { - Some(digits) => digits, - None => dot.slice(1, 0), - }; - let (exp_is_negative, exp_digits) = match maybe_exponent { - Some(exponent) => exponent, - None => (false, digits_after_dot.slice(digits_after_dot.len(), 0)), - }; - (digits_after_dot, exp_is_negative, exp_digits) - }), + ) + .map( + |(dot, maybe_digits_after_dot, maybe_exponent)| { + let digits_after_dot = match maybe_digits_after_dot { + Some(digits) => digits, + None => dot.slice(1, 0), + }; + let (exp_is_negative, exp_digits) = 
match maybe_exponent { + Some(exponent) => exponent, + None => { + (false, digits_after_dot.slice(digits_after_dot.len(), 0)) + } + }; + (digits_after_dot, exp_is_negative, exp_digits) + }, + ), // or just a d/D and exponent - consumed(Self::match_decimal_exponent).map( - |(matched, (exp_is_negative, exp_digits))| { + Self::match_decimal_exponent.with_taken().map( + |((exp_is_negative, exp_digits), matched)| { // Make an empty slice to represent the (absent) digits after dot let digits_after_dot = matched.slice(0, 0); (digits_after_dot, exp_is_negative, exp_digits) }, ), )), - )), + ), Self::peek_stop_character, ) .map( |(maybe_sign, leading_digits, (digits_after_dot, exponent_is_negative, exp_digits))| { let is_negative = maybe_sign.is_some(); - let digits_offset = (leading_digits.offset() - self.offset()) as u16; + let digits_offset = (leading_digits.offset() - initial_offset) as u16; let digits_length = match digits_after_dot.len() { 0 => leading_digits.len() as u16, trailing_digits_length => { @@ -1859,7 +1334,7 @@ impl<'top> TextBuffer<'top> { .iter() .filter(|b| b.is_ascii_digit()) .count() as u16; - let exponent_digits_offset = (exp_digits.offset() - self.offset()) as u16; + let exponent_digits_offset = (exp_digits.offset() - initial_offset) as u16; let exponent_digits_length = exp_digits.len() as u16; MatchedDecimal::new( is_negative, @@ -1872,17 +1347,17 @@ impl<'top> TextBuffer<'top> { ) }, ) - .parse(self) + .parse_next(self) } /// Matches short- or long-form string. - pub fn match_string(self) -> IonParseResult<'top, MatchedString> { - alt((Self::match_short_string, Self::match_long_string))(self) + pub fn match_string(&mut self) -> IonParseResult<'top, MatchedString> { + alt((Self::match_short_string, Self::match_long_string)).parse_next(self) } /// Matches a short string. 
For example: `"foo"` - pub(crate) fn match_short_string(self) -> IonParseResult<'top, MatchedString> { - delimited(char('"'), Self::match_short_string_body, char('"')) + pub(crate) fn match_short_string(&mut self) -> IonParseResult<'top, MatchedString> { + delimited("\"", Self::match_short_string_body, "\"") .map(|(_matched, contains_escaped_chars)| { if contains_escaped_chars { MatchedString::ShortWithEscapes @@ -1890,183 +1365,87 @@ impl<'top> TextBuffer<'top> { MatchedString::ShortWithoutEscapes } }) - .parse(self) + .parse_next(self) } /// Returns a matched buffer and a boolean indicating whether any escaped characters were /// found in the short string. - pub(crate) fn match_short_string_body(self) -> IonParseResult<'top, (Self, bool)> { + pub(crate) fn match_short_string_body(&mut self) -> IonParseResult<'top, (Self, bool)> { Self::match_text_until_unescaped(self, b'\"', false) } - pub fn match_long_string(self) -> IonParseResult<'top, MatchedString> { - // This method is used at the top level and inside s-expressions. - // Specific contexts that need to specify a delimiter will call - // `match_long_string_with_terminating_delimiter` themselves. - // This includes lists, structs, and clobs. - Self::match_only_complete_if_terminated( - "reading a long-form string", - Self::match_long_string_segments, - // Don't specify a terminating delimiter -- always succeed. 
- Self::match_nothing, - Self::match_partial_long_string_delimiter, - )(self) - } - - pub fn match_long_string_in_struct(self) -> IonParseResult<'top, MatchedString> { - Self::match_only_complete_if_terminated( - "reading a long-form string in a struct", - Self::match_long_string_segments, - alt((tag(","), tag("}"))), - Self::match_partial_long_string_delimiter, - )(self) + pub fn match_long_string(&mut self) -> IonParseResult<'top, MatchedString> { + Self::match_long_string_segments.parse_next(self) } - pub fn match_long_string_in_list(self) -> IonParseResult<'top, MatchedString> { - Self::match_only_complete_if_terminated( - "reading a long-form string in a list", - Self::match_long_string_segments, - alt((tag(","), tag("]"))), - Self::match_partial_long_string_delimiter, - )(self) - } + /// Matches a long string comprised of any number of `'''`-enclosed segments interleaved + /// with optional comments and whitespace. + pub(crate) fn match_long_string_segments(&mut self) -> IonParseResult<'top, MatchedString> { + struct Stats(usize, bool); - /// Matches a parser that must be followed by input that matches `terminator`. - /// - /// This is used in contexts where the expression being parsed must be followed by one of a - /// set of known delimiters (ignoring whitespace and comments). For example: - /// * in a list, a long string must be followed by `,` or `]` - /// * in a struct, a long string must be followed by `,` or `}` - /// * in a clob, a long string must be followed by `}}`. - /// - /// Without this, it would be impossible to determine whether `''' ''` is legal or incomplete - /// in a given context. - /// - /// If the input is NOT terminated properly, the parser will check to see if `partial` matches. - /// If so, it will return an `Incomplete`. - /// If not, it will return an `Err` that includes the provided `label`. 
- pub fn match_only_complete_if_terminated( - label: &'static str, - mut parser: impl FnMut(Self) -> IonParseResult<'top, Output> + 'top, - mut terminator: impl FnMut(Self) -> IonParseResult<'top, Output3>, - mut partial: impl FnMut(Self) -> IonParseResult<'top, Output2>, - ) -> impl FnMut(Self) -> IonParseResult<'top, Output> { - move |input: Self| { - // If the parser raises an error, bubble it up. - let (remaining, matched) = parser(input)?; - // If the next thing in input is the terminator, report success. - match peek(&mut terminator)(remaining) { - Ok(_) => return Ok((remaining, matched)), - Err(nom::Err::Incomplete(_)) => return Err(nom::Err::Incomplete(Needed::Unknown)), - _ => { - // no match - } - }; - // Otherwise, see if the next thing in input is an indication that the input was - // incomplete. - if peek(&mut partial)(remaining).is_ok() { - return incomplete(); + impl Accumulate for Stats { + fn initial(_capacity: Option) -> Self { + Stats(0, false) } - Err(nom::Err::Error(IonParseError::Invalid( - InvalidInputError::new(remaining).with_label(label), - ))) + fn accumulate(&mut self, acc: bool) { + self.0 += 1; + self.1 |= acc; + } } - } - /// Matches a long string comprised of any number of `'''`-enclosed segments interleaved - /// with optional comments and whitespace. 
- pub(crate) fn match_long_string_segments(self) -> IonParseResult<'top, MatchedString> { - fold_many1( - // Parser to keep applying repeatedly - whitespace_and_then(Self::match_long_string_segment), - // Initial accumulator value: segment count and whether the string contains escaped characters - || (0usize, false), - // Function to merge the current match's information with the accumulator - |(segment_count, string_contains_escapes), - (_matched_segment, segment_contains_escapes)| { - ( - segment_count + 1, - string_contains_escapes || segment_contains_escapes, - ) - }, - ) - .map( - |(segment_count, contains_escapes)| match (segment_count, contains_escapes) { - (1, false) => MatchedString::LongSingleSegmentWithoutEscapes, - (1, true) => MatchedString::LongSingleSegmentWithEscapes, - _ => MatchedString::Long, - }, - ) - .parse(self) - } - - /// In the context of a list or s-expression, a truncated long-form string makes it impossible - /// to tell whether the input is malformed or just incomplete. For example, at the top level, - /// this is incomplete: - /// '''foo''' ' - /// while this: - /// '''foo''' '' - /// is valid--it's a string followed by an empty symbol. Inside a list, however, the same partial - /// long string has to be read differently. If the reader sees this: - /// ['''foo''' '' - /// It needs to consider it incomplete, not valid; for the last token to be an empty symbol, - /// there would need to be a delimiting comma (`,`) between the two values. Structs also require - /// a delimiting comma between a value and the next field. - /// - /// If an error is encountered while traversing a list or struct, this method can be used to - /// see if the problematic data was the beginning of another string segment. 
- pub fn match_partial_long_string_delimiter(self) -> IonMatchResult<'top> { - whitespace_and_then(terminated(tag("''"), eof)).parse(self) + repeat(1.., |input: &mut TextBuffer<'top>| { + let (_segment, found_escape) = + whitespace_and_then(Self::match_long_string_segment).parse_next(input)?; + Ok(found_escape) + }) + .map(move |stats: Stats| match stats { + Stats(1, false) => MatchedString::LongSingleSegmentWithoutEscapes, + Stats(1, true) => MatchedString::LongSingleSegmentWithEscapes, + _ => MatchedString::Long, + }) + .parse_next(self) } /// Matches a single long string segment enclosed by `'''` delimiters. /// Returns the match and a boolean indicating whether the body contained escape sequences. - pub fn match_long_string_segment(self) -> IonParseResult<'top, (Self, bool)> { - // If the buffer is a single quote and then EOF, it's not known whether this was a - // partial long string segment or a partial quoted symbol. - if self.bytes() == b"'" { - return Err(nom::Err::Incomplete(Needed::Unknown)); - } - delimited( - complete_tag("'''"), - Self::match_long_string_segment_body, - tag("'''"), - )(self) + pub fn match_long_string_segment(&mut self) -> IonParseResult<'top, (Self, bool)> { + delimited("'''", Self::match_long_string_segment_body, "'''").parse_next(self) } /// Matches all input up to (but not including) the first unescaped instance of `'''`. /// Returns the match and a boolean indicating whether the body contained escape sequences. 
- fn match_long_string_segment_body(self) -> IonParseResult<'top, (Self, bool)> { + fn match_long_string_segment_body(&mut self) -> IonParseResult<'top, (Self, bool)> { Self::match_text_until_unescaped_str(self, "'''") } /// Matches an operator symbol, which can only legally appear within an s-expression - fn match_operator>( - self, + pub(crate) fn match_operator>( + &mut self, ) -> IonParseResult<'top, LazyRawTextValue<'top, E>> { - is_a("!#%&*+-./;<=>?@^`|~") + one_or_more(one_of(b"!#%&*+-./;<=>?@^`|~")) .map(|text: TextBuffer<'_>| LazyRawTextValue { input: text, encoded_value: EncodedTextValue::new(MatchedValue::Symbol(MatchedSymbol::Operator)), }) - .parse(self) + .parse_next(self) } /// Matches a symbol ID (`$28`), an identifier (`foo`), or a quoted symbol (`'foo'`). - fn match_symbol(self) -> IonParseResult<'top, MatchedSymbol> { + fn match_symbol(&mut self) -> IonParseResult<'top, MatchedSymbol> { alt(( Self::match_symbol_id, Self::match_identifier, Self::match_quoted_symbol, - ))(self) + )) + .parse_next(self) } /// Matches a symbol ID (`$28`). - fn match_symbol_id(self) -> IonParseResult<'top, MatchedSymbol> { - recognize(preceded(tag("$"), Self::match_address)) - .map(|_matched| MatchedSymbol::SymbolId) - .parse(self) + fn match_symbol_id(&mut self) -> IonParseResult<'top, MatchedSymbol> { + ("$", Self::match_address) + .value(MatchedSymbol::SymbolId) + .parse_next(self) } /// Matches the integer portion of a symbol ID or a macro address. @@ -2075,78 +1454,73 @@ impl<'top> TextBuffer<'top> { /// identifiers, not symbol IDs. /// * CAN have leading zeros. For example, `$0003` is the same as `$3`. // There's precedent for allowing leading zeros in ion-java, so we support it here for consistency. - fn match_address(self) -> IonParseResult<'top, usize> { + fn match_address(&mut self) -> IonParseResult<'top, usize> { // Any number of base-10 digits followed by something that is NOT an underscore. 
// We do this to make sure that input like `$1_02` gets parsed like an identifier; // If we didn't check for a trailing underscore, it would be a SID (`$1`) and an // identifier (`_02`). - terminated(complete_digit1, peek(not(complete_tag("_")))) + let initial_offset = self.offset(); + terminated(digit1, not("_")) .map(|buffer: TextBuffer<'_>| { // The matched buffer is ascii base 10 digits, parsing must succeed - usize::from_str(buffer.as_utf8(self.offset()).unwrap()).unwrap() + usize::from_str(buffer.as_utf8(initial_offset).unwrap()).unwrap() }) - .parse(self) + .parse_next(self) } /// Matches items that match the syntactic definition of an identifier but which have special /// meaning. (`true`, `false`, `nan`, `null`) - pub(crate) fn match_keyword(self) -> IonMatchResult<'top> { + pub(crate) fn match_keyword(&mut self) -> IonMatchResult<'top> { terminated( - alt(( - complete_tag("true"), - complete_tag("false"), - complete_tag("null"), - complete_tag("nan"), - )), + alt(("true", "false", "null", "nan")), Self::identifier_terminator, - )(self) + ) + .parse_next(self) } /// Matches an identifier (`foo`). 
- pub(crate) fn match_identifier(self) -> IonParseResult<'top, MatchedSymbol> { - let (remaining, identifier_text) = recognize(terminated( - pair( - Self::identifier_initial_character, - Self::identifier_trailing_characters, - ), + pub(crate) fn match_identifier(&mut self) -> IonParseResult<'top, MatchedSymbol> { + ( + not(Self::match_keyword), + Self::identifier_initial_character, + Self::identifier_trailing_characters, Self::identifier_terminator, - ))(self)?; - if identifier_text.match_keyword().is_ok() { - return Err(nom::Err::Error(IonParseError::Invalid( - InvalidInputError::new(self), - ))); - } - Ok((remaining, MatchedSymbol::Identifier)) + ) + .value(MatchedSymbol::Identifier) + .parse_next(self) } - fn identifier_terminator(self) -> IonMatchResult<'top> { - alt(( - eof, - recognize(peek(not(Self::identifier_trailing_character))), - ))(self) + fn identifier_terminator(&mut self) -> IonMatchResult<'top> { + not(Self::identifier_trailing_character) + .take() + .parse_next(self) } /// Matches any character that can appear at the start of an identifier. - fn identifier_initial_character(self) -> IonParseResult<'top, Self> { - recognize(alt((one_of("$_"), satisfy(|c| c.is_ascii_alphabetic()))))(self) + fn identifier_initial_character(&mut self) -> IonParseResult<'top, Self> { + alt((one_of(b"$_"), one_of(|b: u8| b.is_ascii_alphabetic()))) + .take() + .parse_next(self) } /// Matches any character that is legal in an identifier, though not necessarily at the beginning. - fn identifier_trailing_character(self) -> IonParseResult<'top, Self> { - recognize(alt(( - complete_one_of("$_"), - nom::character::complete::satisfy(|c| c.is_ascii_alphanumeric()), - )))(self) + fn identifier_trailing_character(&mut self) -> IonParseResult<'top, Self> { + alt((one_of(b"$_"), one_of(|c: u8| c.is_ascii_alphanumeric()))) + .take() + .parse_next(self) } /// Matches characters that are legal in an identifier, though not necessarily at the beginning. 
- fn identifier_trailing_characters(self) -> IonParseResult<'top, Self> { - complete_take_while(|c: u8| c.is_ascii_alphanumeric() || b"$_".contains(&c))(self) + fn identifier_trailing_characters(&mut self) -> IonParseResult<'top, Self> { + zero_or_more(one_of(|b: u8| { + b.is_ascii_alphanumeric() || b"$_".contains(&b) + })) + .parse_next(self) } /// Matches a quoted symbol (`'foo'`). - fn match_quoted_symbol(self) -> IonParseResult<'top, MatchedSymbol> { - delimited(complete_tag("'"), Self::match_quoted_symbol_body, tag("'")) + fn match_quoted_symbol(&mut self) -> IonParseResult<'top, MatchedSymbol> { + delimited("'", Self::match_quoted_symbol_body, "'") .map(|(_matched, contains_escaped_chars)| { if contains_escaped_chars { MatchedSymbol::QuotedWithEscapes @@ -2154,19 +1528,19 @@ impl<'top> TextBuffer<'top> { MatchedSymbol::QuotedWithoutEscapes } }) - .parse(self) + .parse_next(self) } /// Returns a matched buffer and a boolean indicating whether any escaped characters were /// found in the short string. - fn match_quoted_symbol_body(self) -> IonParseResult<'top, (Self, bool)> { + fn match_quoted_symbol_body(&mut self) -> IonParseResult<'top, (Self, bool)> { Self::match_text_until_unescaped(self, b'\'', false) } /// A helper method for matching bytes until the specified delimiter. Ignores any byte /// (including the delimiter) that is prefaced by the escape character `\`. fn match_text_until_unescaped( - self, + &mut self, delimiter: u8, allow_unescaped_newlines: bool, ) -> IonParseResult<'top, (Self, bool)> { @@ -2193,8 +1567,8 @@ impl<'top> TextBuffer<'top> { } if byte == delimiter { let matched = self.slice(0, index); - let remaining = self.slice_to_end(index); - return Ok((remaining, (matched, contains_escaped_chars))); + self.consume(index); + return Ok((matched, contains_escaped_chars)); } // If this is a control character, make sure it's a legal one. 
if byte < 0x20 { @@ -2207,12 +1581,12 @@ impl<'top> TextBuffer<'top> { } } } - Err(nom::Err::Incomplete(Needed::Unknown)) + self.incomplete("a text value without closing delimiter") } #[cold] fn validate_string_control_character( - self, + &mut self, byte: u8, index: usize, allow_unescaped_newlines: bool, @@ -2220,142 +1594,159 @@ impl<'top> TextBuffer<'top> { if byte == b'\n' && !allow_unescaped_newlines { let error = InvalidInputError::new(self.slice_to_end(index)) .with_description("unescaped newlines are not allowed in short string literals"); - return Err(nom::Err::Failure(IonParseError::Invalid(error))); + return Err(ErrMode::Cut(IonParseError::Invalid(error))); } - if !WHITESPACE_CHARACTERS_AS_STR.as_bytes().contains(&byte) { + if !WHITESPACE_BYTES.contains(&byte) { let error = InvalidInputError::new(self.slice_to_end(index)) .with_description("unescaped control characters are not allowed in text literals"); - return Err(nom::Err::Failure(IonParseError::Invalid(error))); + return Err(ErrMode::Cut(IonParseError::Invalid(error))); } - Ok((self.slice_to_end(1), ())) + Ok(()) } /// A helper method for matching bytes until the specified delimiter. Ignores any byte /// that is prefaced by the escape character `\`. /// /// The specified delimiter cannot be empty. - fn match_text_until_unescaped_str(self, delimiter: &str) -> IonParseResult<'top, (Self, bool)> { + fn match_text_until_unescaped_str( + &mut self, + delimiter: &str, + ) -> IonParseResult<'top, (Self, bool)> { // The first byte in the delimiter let delimiter_head = delimiter.as_bytes()[0]; // Whether we've encountered any escapes while looking for the delimiter let mut contained_escapes = false; // The input left to search - let mut remaining = self; + let mut remaining = *self; loop { // Look for the first unescaped instance of the delimiter's head. // If the input doesn't contain one, this will return an `Incomplete`. 
// `match_text_until_escaped` does NOT include the delimiter byte in the match, // so `remaining_after_match` starts at the delimiter byte. - let (remaining_after_match, (_, segment_contained_escapes)) = + let (_matched_input, segment_contained_escapes) = remaining.match_text_until_unescaped(delimiter_head, true)?; contained_escapes |= segment_contained_escapes; - remaining = remaining_after_match; // If the remaining input starts with the complete delimiter, it's a match. if remaining.bytes().starts_with(delimiter.as_bytes()) { let relative_match_end = remaining.offset() - self.offset(); let matched_input = self.slice(0, relative_match_end); - let remaining_input = self.slice_to_end(relative_match_end); - return Ok((remaining_input, (matched_input, contained_escapes))); + self.consume(relative_match_end); + return Ok((matched_input, contained_escapes)); } else { // Otherwise, advance by one and try again. - remaining = remaining.slice_to_end(1); + remaining.consume(1); } } } /// Matches a single base-10 digit, 0-9. - fn match_any_digit(self) -> IonParseResult<'top, std::primitive::char> { - satisfy(|c| c.is_ascii_digit())(self) + fn match_any_digit(&mut self) -> IonParseResult<'top, std::primitive::u8> { + one_of(|b: u8| b.is_ascii_digit()).parse_next(self) } /// Matches a timestamp of any precision. - pub fn match_timestamp(self) -> IonParseResult<'top, MatchedTimestamp> { - alt(( - Self::match_timestamp_y, - Self::match_timestamp_ym, - Self::match_timestamp_ymd, - Self::match_timestamp_ymd_hm, - Self::match_timestamp_ymd_hms, - Self::match_timestamp_ymd_hms_fractional, - ))(self) + #[inline] + pub fn match_timestamp(&mut self) -> IonParseResult<'top, MatchedTimestamp> { + #[inline(never)] + pub fn full_match_timestamp<'t>( + input: &mut TextBuffer<'t>, + ) -> IonParseResult<'t, MatchedTimestamp> { + // TODO: As-is, matching common timestamps (those with greater than second precision) + // is slow because the parser tries each shorter arrangement in turn. 
We should + // rewrite this to use a single path that can accept any precision. + alt(( + TextBuffer::match_timestamp_y, + TextBuffer::match_timestamp_ym, + TextBuffer::match_timestamp_ymd, + TextBuffer::match_timestamp_ymd_hm, + TextBuffer::match_timestamp_ymd_hms, + TextBuffer::match_timestamp_ymd_hms_fractional, + )) + .parse_next(input) + } + + match self.bytes().first() { + Some(byte) if byte.is_ascii_digit() => full_match_timestamp(self), + Some(_) => Err(ErrMode::Backtrack(IonParseError::Invalid( + InvalidInputError::new(*self), + ))), + None => self.incomplete("a timestamp"), + } } /// Matches a timestamp with year precision. - fn match_timestamp_y(self) -> IonParseResult<'top, MatchedTimestamp> { - terminated( - Self::match_timestamp_year, - pair(tag("T"), Self::peek_stop_character), - ) - .map(|_year| MatchedTimestamp::new(TimestampPrecision::Year)) - .parse(self) + fn match_timestamp_y(&mut self) -> IonParseResult<'top, MatchedTimestamp> { + terminated(Self::match_timestamp_year, ("T", Self::peek_stop_character)) + .map(|_year| MatchedTimestamp::new(TimestampPrecision::Year)) + .parse_next(self) } /// Matches a timestamp with month precision. - fn match_timestamp_ym(self) -> IonParseResult<'top, MatchedTimestamp> { + fn match_timestamp_ym(&mut self) -> IonParseResult<'top, MatchedTimestamp> { terminated( - pair(Self::match_timestamp_year, Self::match_timestamp_month), - pair(tag("T"), Self::peek_stop_character), + (Self::match_timestamp_year, Self::match_timestamp_month), + ("T", Self::peek_stop_character), ) .map(|(_year, _month)| MatchedTimestamp::new(TimestampPrecision::Month)) - .parse(self) + .parse_next(self) } /// Matches a timestamp with day precision. 
- fn match_timestamp_ymd(self) -> IonParseResult<'top, MatchedTimestamp> { + fn match_timestamp_ymd(&mut self) -> IonParseResult<'top, MatchedTimestamp> { terminated( - tuple(( + ( Self::match_timestamp_year, Self::match_timestamp_month, Self::match_timestamp_day, - )), - pair(opt(complete_tag("T")), Self::peek_stop_character), + ), + (opt("T"), Self::peek_stop_character), ) .map(|_| MatchedTimestamp::new(TimestampPrecision::Day)) - .parse(self) + .parse_next(self) } /// Matches a timestamp with hour-and-minute precision. - fn match_timestamp_ymd_hm(self) -> IonParseResult<'top, MatchedTimestamp> { + fn match_timestamp_ymd_hm(&mut self) -> IonParseResult<'top, MatchedTimestamp> { terminated( - tuple(( + ( Self::match_timestamp_year, Self::match_timestamp_month, Self::match_timestamp_day, Self::match_timestamp_hour_and_minute, Self::match_timestamp_offset, - )), + ), Self::peek_stop_character, ) .map(|(_y, _m, _d, _hm, offset)| { MatchedTimestamp::new(TimestampPrecision::HourAndMinute).with_offset(offset) }) - .parse(self) + .parse_next(self) } /// Matches a timestamp with second precision. - fn match_timestamp_ymd_hms(self) -> IonParseResult<'top, MatchedTimestamp> { + fn match_timestamp_ymd_hms(&mut self) -> IonParseResult<'top, MatchedTimestamp> { terminated( - tuple(( + ( Self::match_timestamp_year, Self::match_timestamp_month, Self::match_timestamp_day, Self::match_timestamp_hour_and_minute, Self::match_timestamp_seconds, Self::match_timestamp_offset, - )), + ), Self::peek_stop_character, ) .map(|(_y, _m, _d, _hm, _s, offset)| { MatchedTimestamp::new(TimestampPrecision::Second).with_offset(offset) }) - .parse(self) + .parse_next(self) } /// Matches a timestamp with second precision, including a fractional seconds component. 
- fn match_timestamp_ymd_hms_fractional(self) -> IonParseResult<'top, MatchedTimestamp> { + fn match_timestamp_ymd_hms_fractional(&mut self) -> IonParseResult<'top, MatchedTimestamp> { terminated( - tuple(( + ( Self::match_timestamp_year, Self::match_timestamp_month, Self::match_timestamp_day, @@ -2363,186 +1754,184 @@ impl<'top> TextBuffer<'top> { Self::match_timestamp_seconds, Self::match_timestamp_fractional_seconds, Self::match_timestamp_offset, - )), + ), Self::peek_stop_character, ) .map(|(_y, _m, _d, _hm, _s, _f, offset)| { MatchedTimestamp::new(TimestampPrecision::Second).with_offset(offset) }) - .parse(self) + .parse_next(self) } /// Matches the year component of a timestamp. - fn match_timestamp_year(self) -> IonMatchResult<'top> { - recognize(take_while_m_n(4, 4, |c: u8| c.is_ascii_digit()))(self) + fn match_timestamp_year(&mut self) -> IonMatchResult<'top> { + n_times(4, one_of(|c: u8| c.is_ascii_digit())).parse_next(self) } /// Matches the month component of a timestamp, including a leading `-`. - fn match_timestamp_month(self) -> IonMatchResult<'top> { + fn match_timestamp_month(&mut self) -> IonMatchResult<'top> { preceded( - complete_tag("-"), - recognize(alt(( - pair(char('0'), one_of("123456789")), - pair(char('1'), one_of("012")), - ))), - )(self) + "-", + alt((("0", one_of(b"123456789")), ("1", one_of(b"012")))).take(), + ) + .parse_next(self) } /// Matches the day component of a timestamp, including a leading `-`. 
- fn match_timestamp_day(self) -> IonMatchResult<'top> { + fn match_timestamp_day(&mut self) -> IonMatchResult<'top> { preceded( - tag("-"), - recognize(alt(( - pair(char('0'), one_of("123456789")), - pair(one_of("12"), Self::match_any_digit), - pair(char('3'), one_of("01")), - ))), - )(self) + "-", + alt(( + (b"0", one_of(b"123456789")), + // pair(one_of([b'1' as u8, b'2' as u8]), Self::match_any_digit), + (one_of(b"12".as_slice()).take(), Self::match_any_digit), + (b"3", one_of(b"01")), + )) + .take(), + ) + .parse_next(self) } /// Matches a leading `T`, a two-digit hour component of a timestamp, a delimiting ':', and a /// two-digit minute component. fn match_timestamp_hour_and_minute( - self, + &mut self, ) -> IonParseResult<'top, (TextBuffer<'top>, TextBuffer<'top>)> { preceded( - tag("T"), + "T", separated_pair( // Hour - recognize(alt(( - pair(one_of("01"), Self::match_any_digit), - pair(char('2'), one_of("0123")), - ))), + alt(( + (one_of(b"01").take(), Self::match_any_digit), + ("2", one_of(b"0123")), + )) + .take(), // Delimiter - tag(":"), + ":", // Minutes - recognize(pair(one_of("012345"), Self::match_any_digit)), + (one_of(b"012345"), Self::match_any_digit).take(), ), - )(self) + ) + .parse_next(self) } /// Matches a leading `:`, and any two-digit second component from `00` to `59` inclusive. - fn match_timestamp_seconds(self) -> IonMatchResult<'top> { - preceded( - tag(":"), - recognize(pair(one_of("012345"), Self::match_any_digit)), - )(self) + fn match_timestamp_seconds(&mut self) -> IonMatchResult<'top> { + preceded(":", (one_of(b"012345"), Self::match_any_digit).take()).parse_next(self) } /// Matches the fractional seconds component of a timestamp, including a leading `.`. - fn match_timestamp_fractional_seconds(self) -> IonMatchResult<'top> { - preceded(tag("."), digit1)(self) + fn match_timestamp_fractional_seconds(&mut self) -> IonMatchResult<'top> { + preceded(".", digit1).parse_next(self) } /// Matches a timestamp offset of any format. 
- fn match_timestamp_offset(self) -> IonParseResult<'top, MatchedTimestampOffset> { + fn match_timestamp_offset(&mut self) -> IonParseResult<'top, MatchedTimestampOffset> { alt(( - value(MatchedTimestampOffset::Zulu, tag("Z")), - value(MatchedTimestampOffset::Zulu, tag("+00:00")), - value(MatchedTimestampOffset::Unknown, tag("-00:00")), - map( - pair(one_of("-+"), Self::match_timestamp_offset_hours_and_minutes), - |(sign, (_hours, _minutes))| { - if sign == '-' { + "Z".value(MatchedTimestampOffset::Zulu), + "+00:00".value(MatchedTimestampOffset::Zulu), + "-00:00".value(MatchedTimestampOffset::Unknown), + ( + one_of(b"-+"), + Self::match_timestamp_offset_hours_and_minutes, + ) + .map(|(sign, (_hours, _minutes))| { + if sign == b'-' { MatchedTimestampOffset::NegativeHoursAndMinutes } else { MatchedTimestampOffset::PositiveHoursAndMinutes } - }, - ), - ))(self) + }), + )) + .parse_next(self) } /// Matches a timestamp offset encoded as a two-digit hour, a delimiting `:`, and a two-digit /// minute. - fn match_timestamp_offset_hours_and_minutes(self) -> IonParseResult<'top, (Self, Self)> { + fn match_timestamp_offset_hours_and_minutes(&mut self) -> IonParseResult<'top, (Self, Self)> { separated_pair( // Hour - recognize(alt(( - pair(one_of("01"), Self::match_any_digit), - pair(char('2'), one_of("0123")), - ))), + alt(( + (one_of(b"01").take(), Self::match_any_digit), + ("2", one_of(b"0123")), + )) + .take(), // Delimiter - tag(":"), + ":", // Minutes - recognize(pair(one_of("012345"), Self::match_any_digit)), - )(self) + (one_of(b"012345"), Self::match_any_digit).take(), + ) + .parse_next(self) } /// Matches a complete blob, including the opening `{{` and closing `}}`. 
- pub fn match_blob(self) -> IonParseResult<'top, MatchedBlob> { + pub fn match_blob(&mut self) -> IonParseResult<'top, MatchedBlob> { + let initial_offset = self.offset(); delimited( - complete_tag("{{"), + "{{", // Only whitespace (not comments) can appear within the blob - recognize(Self::match_base64_content), - preceded(Self::match_optional_whitespace, tag("}}")), + Self::match_base64_content, + (Self::match_whitespace0, "}}"), ) .map(|base64_data| { - MatchedBlob::new(base64_data.offset() - self.offset(), base64_data.len()) + MatchedBlob::new(base64_data.offset() - initial_offset, base64_data.len()) }) - .parse(self) + .parse_next(self) } /// Matches a clob of either short- or long-form syntax. - pub fn match_clob(self) -> IonParseResult<'top, MatchedClob> { + pub fn match_clob(&mut self) -> IonParseResult<'top, MatchedClob> { delimited( - complete_tag("{{"), + "{{", preceded( - Self::match_optional_whitespace, + Self::match_whitespace0, alt(( - value(MatchedClob::Short, Self::match_short_clob_body), - value( - MatchedClob::Long, - // Look ahead to make sure there's a complete triple quote following the `{{`. - // If there isn't, the input is incomplete. - preceded(peek(tag("'''")), Self::match_long_clob_body), - ), + Self::match_short_clob_body.value(MatchedClob::Short), + Self::match_long_clob_body.value(MatchedClob::Long), )), ), - preceded(Self::match_optional_whitespace, tag("}}")), - )(self) + preceded(Self::match_whitespace0, "}}"), + ) + .parse_next(self) } /// Matches the body (inside the `{{` and `}}`) of a short-form clob. - fn match_short_clob_body(self) -> IonMatchResult<'top> { - let (remaining, (body, _matched_string)) = consumed(Self::match_short_string)(self)?; + fn match_short_clob_body(&mut self) -> IonMatchResult<'top> { + let (_matched_string, body) = Self::match_short_string.with_taken().parse_next(self)?; body.validate_clob_text()?; - Ok((remaining, body)) + Ok(body) } /// Matches the body (inside the `{{` and `}}`) of a long-form clob. 
- fn match_long_clob_body(self) -> IonMatchResult<'top> { - let (remaining, body) = Self::match_only_complete_if_terminated( - "reading a long-form clob", - recognize(many1_count(preceded( - Self::match_optional_whitespace, - Self::match_long_clob_body_segment, - ))), - preceded(Self::match_optional_whitespace, tag(r#"}}"#)), - preceded(Self::match_optional_whitespace, tag("''")), - )(self)?; - - Ok((remaining, body)) + fn match_long_clob_body(&mut self) -> IonMatchResult<'top> { + one_or_more(preceded( + Self::match_whitespace0, + Self::match_long_clob_body_segment, + )) + .take() + .parse_next(self) } /// Matches a single segment of a long-form clob's content. - fn match_long_clob_body_segment(self) -> IonMatchResult<'top> { - let (remaining, (body, _matched_string)) = consumed(Self::match_long_string_segment)(self)?; + fn match_long_clob_body_segment(&mut self) -> IonMatchResult<'top> { + let (_matched_string, body) = Self::match_long_string_segment + .with_taken() + .parse_next(self)?; body.validate_clob_text()?; - Ok((remaining, body)) + Ok(body) } /// Returns an error if the buffer contains any byte that is not legal inside a clob. - fn validate_clob_text(self) -> IonMatchResult<'top> { + fn validate_clob_text(&self) -> IonParseResult<'top, ()> { for byte in self.bytes().iter().copied() { if !Self::byte_is_legal_clob_ascii(byte) { let message = format!("found an illegal byte '{:0x}' in clob", byte); - let error = InvalidInputError::new(self).with_description(message); - return Err(nom::Err::Failure(IonParseError::Invalid(error))); + let error = InvalidInputError::new(*self).with_description(message); + return Err(ErrMode::Cut(IonParseError::Invalid(error))); } } // Return success without consuming - Ok((self, self.slice(0, 0))) + Ok(()) } /// Returns `false` if the specified byte cannot appear unescaped in a clob. @@ -2552,184 +1941,145 @@ impl<'top> TextBuffer<'top> { // "characters >= 0x20", but that excludes lots of whitespace characters that are < 0x20. 
// Some say "displayable ASCII", but DEL (0x7F) is shown to be legal in one of the ion-tests. // The definition used here has largely been inferred from the contents of `ion-tests`. - b.is_ascii() - && (u32::from(b) >= 0x20 || WHITESPACE_CHARACTERS_AS_STR.as_bytes().contains(&b)) + b.is_ascii() && (u32::from(b) >= 0x20 || WHITESPACE_BYTES.contains(&b)) } /// Matches the base64 content within a blob. Ion allows the base64 content to be broken up with /// whitespace, so the matched input region may need to be stripped of whitespace before /// the data can be decoded. - fn match_base64_content(self) -> IonMatchResult<'top> { - recognize(terminated( - many0_count(preceded( - Self::match_optional_whitespace, - alt((alphanumeric1, is_a("+/"))), - )), - opt(preceded( - Self::match_optional_whitespace, - alt((tag("=="), tag("="))), + fn match_base64_content(&mut self) -> IonMatchResult<'top> { + ( + zero_or_more(( + Self::match_whitespace0, + alt((alphanumeric1, one_of(b"+/").take())), )), - ))(self) + opt(preceded(Self::match_whitespace0, alt(("==", "=")))), + ) + .take() + .parse_next(self) } -} -// === nom trait implementations === -// The trait implementations that follow are necessary for `TextBuffer` to be used as an input -// type in `nom` parsers. (`nom` only supports `&str` and `&[u8]` out of the box.) Defining our own -// input type makes it possible for us to carry around additional context during the parsing process, -// which is important for providing helpful error messages. For example: we can include the absolute -// offset of the input slice currently being read in our error messages. -// -// As `TextBuffer` is just a wrapper around a `&[u8]`, these implementations mostly delegate -// to the existing trait impls for `&[u8]`. 
- -impl nom::InputTake for TextBuffer<'_> { - fn take(&self, count: usize) -> Self { - self.slice(0, count) - } - - fn take_split(&self, count: usize) -> (Self, Self) { - let (before, after) = self.data.split_at(count); - let buffer_before = TextBuffer::new_with_offset(self.context, before, self.offset()); - let buffer_after = TextBuffer::new_with_offset(self.context, after, self.offset() + count); - // Nom's convention is to place the remaining portion of the buffer first, which leads to - // a potentially surprising reversed tuple order. - (buffer_after, buffer_before) + pub fn is_final_data(&self) -> bool { + self.is_final_data } } -impl nom::InputLength for TextBuffer<'_> { - fn input_len(&self) -> usize { +pub trait IonParser<'top, O>: Parser, O, IonParseError<'top>> { + // No additional functionality, this is just a trait alias +} + +impl<'data, O, P> IonParser<'data, O> for P where + P: Parser, O, IonParseError<'data>> +{ +} + +impl SliceLen for TextBuffer<'_> { + fn slice_len(&self) -> usize { self.len() } } -impl<'data> nom::InputIter for TextBuffer<'data> { - type Item = u8; - type Iter = Enumerate; - type IterElem = Copied>; +impl<'data> Stream for TextBuffer<'data> { + type Token = u8; + type Slice = Self; + type IterOffsets = <&'data [u8] as Stream>::IterOffsets; + type Checkpoint = Self; - fn iter_indices(&self) -> Self::Iter { - self.iter_elements().enumerate() + fn iter_offsets(&self) -> Self::IterOffsets { + self.data.iter_offsets() } - fn iter_elements(&self) -> Self::IterElem { - self.data.iter().copied() + fn eof_offset(&self) -> usize { + self.data.eof_offset() } - fn position

(&self, predicate: P) -> Option + fn next_token(&mut self) -> Option { + let byte = *self.data.first()?; + self.consume(1); + Some(byte) + } + + fn offset_for

(&self, predicate: P) -> Option where - P: Fn(Self::Item) -> bool, + P: Fn(Self::Token) -> bool, { - self.data.iter().position(|b| predicate(*b)) + self.data.offset_for(predicate) } - fn slice_index(&self, count: usize) -> Result { - self.data.slice_index(count) + fn offset_at(&self, tokens: usize) -> Result { + self.data.offset_at(tokens) } -} -impl<'a> nom::Compare<&'a str> for TextBuffer<'_> { - fn compare(&self, t: &'a str) -> CompareResult { - self.data.compare(t.as_bytes()) + fn next_slice(&mut self, offset: usize) -> Self::Slice { + let head = self.slice(0, offset); + self.consume(offset); + head } - fn compare_no_case(&self, t: &'a str) -> CompareResult { - self.data.compare_no_case(t.as_bytes()) + fn checkpoint(&self) -> Self::Checkpoint { + *self } -} -impl nom::Offset for TextBuffer<'_> { - fn offset(&self, second: &Self) -> usize { - self.data.offset(second.data) + fn reset(&mut self, checkpoint: &Self::Checkpoint) { + *self = *checkpoint; } -} -impl nom::Slice> for TextBuffer<'_> { - fn slice(&self, range: RangeFrom) -> Self { - self.slice_to_end(range.start) + fn raw(&self) -> &dyn Debug { + &self.data } } -impl nom::Slice> for TextBuffer<'_> { - fn slice(&self, range: RangeTo) -> Self { - self.slice(0, range.end) +impl StreamIsPartial for TextBuffer<'_> { + type PartialState = (); + + fn complete(&mut self) -> Self::PartialState {} + + fn restore_partial(&mut self, _state: Self::PartialState) { + // No-op. 
} -} -impl nom::FindSubstring<&str> for TextBuffer<'_> { - fn find_substring(&self, substr: &str) -> Option { - self.data.find_substring(substr) + fn is_partial_supported() -> bool { + true } -} -impl nom::InputTakeAtPosition for TextBuffer<'_> { - type Item = u8; + fn is_partial(&self) -> bool { + !self.is_final_data + } +} - fn split_at_position>(&self, predicate: P) -> IResult - where - P: Fn(Self::Item) -> bool, - { - match self.data.iter().position(|c| predicate(*c)) { - Some(i) => Ok(self.take_split(i)), - None => Err(nom::Err::Incomplete(Needed::new(1))), - } +impl<'a> winnow::stream::Compare<&'a str> for TextBuffer<'_> { + fn compare(&self, t: &'a str) -> CompareResult { + self.data.compare(t.as_bytes()) } +} - fn split_at_position1>( - &self, - predicate: P, - e: ErrorKind, - ) -> IResult - where - P: Fn(Self::Item) -> bool, - { - match self.data.iter().position(|c| predicate(*c)) { - Some(0) => Err(nom::Err::Error(E::from_error_kind(*self, e))), - Some(i) => Ok(self.take_split(i)), - None => Err(nom::Err::Incomplete(Needed::new(1))), - } +impl<'a> winnow::stream::Compare<&'a [u8]> for TextBuffer<'_> { + fn compare(&self, t: &'a [u8]) -> CompareResult { + self.data.compare(t) } +} - fn split_at_position_complete>( - &self, - predicate: P, - ) -> IResult - where - P: Fn(Self::Item) -> bool, - { - match self.data.iter().position(|c| predicate(*c)) { - Some(i) => Ok(self.take_split(i)), - None => Ok(self.take_split(self.input_len())), - } +impl<'a, const N: usize> winnow::stream::Compare<&'a [u8; N]> for TextBuffer<'_> { + fn compare(&self, t: &'a [u8; N]) -> CompareResult { + self.data.compare(t.as_slice()) } +} - fn split_at_position1_complete>( - &self, - predicate: P, - e: ErrorKind, - ) -> IResult - where - P: Fn(Self::Item) -> bool, - { - match self.data.iter().position(|c| predicate(*c)) { - Some(0) => Err(nom::Err::Error(E::from_error_kind(*self, e))), - Some(i) => Ok(self.take_split(i)), - None => { - if self.is_empty() { - 
Err(nom::Err::Error(E::from_error_kind(*self, e))) - } else { - Ok(self.take_split(self.input_len())) - } - } - } +impl winnow::stream::Offset for TextBuffer<'_> { + fn offset_from(&self, start: &Self) -> usize { + self.offset - start.offset } } -// === end of `nom` trait implementations +impl FindSlice<&str> for TextBuffer<'_> { + fn find_slice(&self, substr: &str) -> Option> { + self.data.find_slice(substr) + } +} -/// Convenience function to construct a nom `Incomplete` and wrap it in an `IonParseResult` -fn incomplete<'a, T>() -> IonParseResult<'a, T> { - Err(nom::Err::Incomplete(Needed::Unknown)) +impl Location for TextBuffer<'_> { + fn location(&self) -> usize { + self.offset() + } } /// Takes a given parser and returns a new one that accepts any amount of leading whitespace before @@ -2743,55 +2093,32 @@ where preceded(TextBuffer::match_optional_comments_and_whitespace, parser) } -/// Augments a given parser such that it returns the matched value and the number of input bytes -/// that it matched. -fn match_and_length<'data, P, O>( - mut parser: P, -) -> impl Parser, (O, usize), IonParseError<'data>> +pub fn zero_or_more<'data, P, O>( + parser: P, +) -> impl Parser, TextBuffer<'data>, IonParseError<'data>> where P: Parser, O, IonParseError<'data>>, { - move |input: TextBuffer<'data>| { - let offset_before = input.offset(); - let (remaining, matched) = match parser.parse(input) { - Ok((remaining, matched)) => (remaining, matched), - Err(e) => return Err(e), - }; - let offset_after = remaining.offset(); - let match_length = offset_after - offset_before; - Ok((remaining, (matched, match_length))) - } + repeat::<_, _, (), _, _>(.., parser).take() } -/// Augments a given parser such that it returns the matched value and the range of input bytes -/// that it matched. 
-pub(crate) fn match_and_span<'data, P, O>( - mut parser: P, -) -> impl Parser, (O, Range), IonParseError<'data>> +pub fn one_or_more<'data, P, O>( + parser: P, +) -> impl Parser, TextBuffer<'data>, IonParseError<'data>> where P: Parser, O, IonParseError<'data>>, { - move |input: TextBuffer<'data>| { - let offset_before = input.offset(); - let (remaining, matched) = match parser.parse(input) { - Ok((remaining, matched)) => (remaining, matched), - Err(e) => return Err(e), - }; - let offset_after = remaining.offset(); - let span = offset_before..offset_after; - Ok((remaining, (matched, span))) - } + repeat::<_, _, (), _, _>(1.., parser).take() } -/// Returns the number of bytes that the provided parser matched. -fn match_length<'data, P, O>( +pub fn n_times<'data, P, O>( + n: usize, parser: P, -) -> impl Parser, usize, IonParseError<'data>> +) -> impl Parser, TextBuffer<'data>, IonParseError<'data>> where P: Parser, O, IonParseError<'data>>, { - // Call `match_and_length` and discard the output - match_and_length(parser).map(|(_output, match_length)| match_length) + repeat::<_, _, (), _, _>(n, parser).take() } #[cfg(test)] @@ -2801,8 +2128,21 @@ mod tests { use crate::lazy::expanded::compiler::TemplateCompiler; use crate::lazy::expanded::template::{ParameterCardinality, ParameterEncoding}; use crate::lazy::expanded::EncodingContext; + use crate::{AnyEncoding, Reader}; use rstest::rstest; + /// Returns a parser that discards the output and instead reports the number of bytes that matched. + fn match_length<'data, P, O>( + parser: P, + ) -> impl Parser, usize, IonParseError<'data>> + where + P: Parser, O, IonParseError<'data>>, + { + parser + .with_span() + .map(|(_output, match_range)| match_range.len()) + } + /// Stores an input string that can be tested against a given parser. 
struct MatchTest { input: String, @@ -2819,6 +2159,13 @@ mod tests { } } + fn new_1_0(input: &str) -> Self { + MatchTest { + input: input.to_string(), + context: EncodingContext::for_ion_version(IonVersion::v1_0), + } + } + fn register_macro(&mut self, text: &str) -> &mut Self { let new_macro = TemplateCompiler::compile_from_source(self.context.get_ref(), text).unwrap(); @@ -2833,8 +2180,8 @@ mod tests { where P: Parser, O, IonParseError<'data>>, { - let buffer = TextBuffer::new(self.context.get_ref(), self.input.as_bytes()); - match_length(parser).parse(buffer) + let mut buffer = TextBuffer::new(self.context.get_ref(), self.input.as_bytes(), true); + match_length(parser).parse_next(&mut buffer) } fn expect_match<'data, P, O>(&'data self, parser: P) @@ -2842,7 +2189,7 @@ mod tests { P: Parser, O, IonParseError<'data>>, { let result = self.try_match(parser); - let (_remaining, match_length) = result.unwrap_or_else(|e| { + let match_length = result.unwrap_or_else(|e| { panic!("Unexpected parse fail for input <{}>\n{e}", self.input) }); // Inputs have a trailing newline and `0` that should _not_ be part of the match @@ -2864,7 +2211,7 @@ mod tests { // input will be rejected outright. match result { - Ok((_remaining, match_length)) => { + Ok(match_length) => { assert_ne!( match_length, self.input.len(), @@ -2882,59 +2229,36 @@ mod tests { Err(_) => {} } } - - fn expect_incomplete<'data, P, O>(&'data self, parser: P) - where - P: Parser, O, IonParseError<'data>>, - { - let result = self.try_match(parser); - - match result { - Ok((_remaining, match_length)) => { - assert_ne!( - match_length, - self.input.len(), - "parser unexpectedly matched the complete input: {:?}\nResult: {:?}", - self.input, - result - ); - } - Err(e) if e.is_incomplete() => {} - err => { - panic!( - "Parser reported an unexpected error for input: {}\nResult: {:?}", - self.input, err - ); - } - } - } } - /// A macro to concisely define basic test cases for matchers. 
Suitable when there are no type - /// annotations needed for the match function, and the input strings can be trimmed. + /// A macro to concisely define basic test cases for matchers. + /// Suitable when the input strings can be trimmed. macro_rules! matcher_tests { ($parser:ident $($expect:ident: [$($input:literal),+$(,)?]),+$(,)?) => { - mod $parser { + matcher_tests!($parser => TextBuffer::$parser, $($expect: [$($input),+]),+,); + }; + ($mod_name:ident => $parser:expr, $($expect:ident: [$($input:literal),+$(,)?]),+$(,)?) => { + mod $mod_name { use super::*; $( #[test] fn $expect() { - $(MatchTest::new($input.trim()).$expect(match_length(TextBuffer::$parser));) + $(MatchTest::new_1_0($input.trim()).$expect(match_length($parser) );) + } )+ } - }; + } } macro_rules! matcher_tests_with_macro { - ($mod_name:ident $parser:ident $macro_src:literal $($expect:ident: [$($input:literal),+$(,)?]),+$(,)?) => { + ($mod_name:ident => $parser:expr, $macro_src:literal $($expect:ident: [$($input:literal),+$(,)?]),+$(,)?) => { mod $mod_name { use super::*; $( #[test] fn $expect() { - $(MatchTest::new($input.trim()).register_macro($macro_src).$expect(match_length(TextBuffer::$parser));) + $(MatchTest::new($input.trim()).register_macro($macro_src).$expect(match_length($parser));) + } )+ @@ -3014,12 +2338,10 @@ mod tests { "nullify", "null..int", "string.null", - ], - expect_incomplete: [ "null.timestam", "null.strin", "null.nul" - ] + ], } matcher_tests! { @@ -3060,11 +2382,9 @@ mod tests { "_123", // Leading underscore "0x0x5", // Multiple 0x prefixes "0xx5", // Multiple Xs after 0 - ], - expect_incomplete: [ "0x", // Base 16 prefix w/no number "0b", // Base 2 prefix w/no number - ] + ], } matcher_tests! 
{ @@ -3081,8 +2401,6 @@ mod tests { "0305e1", // Leading zero "+305e1", // Leading plus sign "--305e1", // Multiple negative signs - ], - expect_incomplete: [ "305e", // Has exponent delimiter but no exponent ] } @@ -3111,8 +2429,6 @@ mod tests { "2023-08-18T14:35:52.Z", // Dot but no fractional "2023-08-18T14:35:52.000+24:30", // Out of bounds offset hour "2023-08-18T14:35:52.000+00:60", // Out of bounds offset minute - ], - expect_incomplete: [ "2023", // No 'T'; it's an int "2023-08", // No 'T'; it's incomplete "2023-08-18T14:00", // No offset @@ -3152,8 +2468,6 @@ mod tests { r#" hello" "#, - ], - expect_incomplete: [ // Missing a closing quote r#" "hello @@ -3179,17 +2493,15 @@ mod tests { "$bar", "_baz_quux", ], - expect_incomplete: [ - "'hello", // No closing quote - "'hello\\'", // Closing quote is escaped - ], expect_mismatch: [ "$-8", // Negative SID + "'hello", // No closing quote + "'hello\\'", // Closing quote is escaped ], } matcher_tests! { - match_annotated_value + match_annotated_value => TextBuffer::match_annotated_value::, expect_match: [ "foo::5", "foo::bar::5", @@ -3199,8 +2511,7 @@ mod tests { "foo::bar::baz::quux::quuz::5", "foo::'bar'::baz::$10::5", ], - expect_incomplete: ["foo::"], - expect_mismatch: ["foo:bar", "foo:::bar"], + expect_mismatch: ["foo::", "foo:bar", "foo:::bar"], } matcher_tests! { @@ -3211,8 +2522,6 @@ mod tests { ], expect_mismatch: [ "123._456", "5", "05d", "-5.0+0", - ], - expect_incomplete: [ "5d", "-5d", "5.d", @@ -3225,7 +2534,7 @@ mod tests { } matcher_tests! { - match_sexp + match_sexp_1_0 => TextEncoding_1_0::sexp_matcher(), expect_match: [ "()", "(1)", @@ -3239,13 +2548,11 @@ mod tests { "((()))", "(1 (2 (3 4) 5) 6)", ], - expect_mismatch: ["foo", "1"], - expect_incomplete: ["(", "(1 2 (3 4 5)"] + expect_mismatch: ["foo", "1", "(", "(1 2 (3 4 5)"] } matcher_tests_with_macro! 
{ - parsing_sexps - match_sexp_1_1 + parsing_sexps => TextEncoding_1_1::sexp_matcher(), "(macro foo (x*) null)" expect_match: [ "()", @@ -3261,41 +2568,101 @@ mod tests { "(1 (2 (3 4) 5) 6)", "(1 (:foo 2 3))", ], - expect_mismatch: ["foo", "1"], - expect_incomplete: ["(", "(1 2 (3 4 5)"] + expect_mismatch: ["foo", "1", "(", "(1 2 (3 4 5)"] } matcher_tests! { - match_list + match_list_1_0 => TextEncoding_1_0::list_matcher(), expect_match: [ "[]", "[1]", "[1, 2]", "[[]]", "[([])]", ], expect_mismatch: [ - "foo", "1", - ], - expect_incomplete: [ - "[", "[1, 2, [3, 4]", + "foo", + "1", + "[", + "[1, 2, [3, 4]", ] } matcher_tests_with_macro! { - parsing_lists - match_list_1_1 + match_list_1_1 => TextEncoding_1_1::list_matcher(), "(macro foo (x*) null)" expect_match: [ "[]", "[1]", "[1, 2]", "[[]]", "[([])]", "[1, (:foo 2 3)]" ], expect_mismatch: [ - "foo", "1" - ], - expect_incomplete: [ + "foo", "1", "[", "[1, 2, [3, 4]" ] } + matcher_tests! { + match_struct_1_0 => TextEncoding_1_0::struct_matcher(), + expect_match: [ + "{}", + "{$0:$0}", + "{'':''}", + r#"{"":""}"#, + "{foo:bar}", + "{foo: bar, baz: quux}", + "{'foo': bar, 'baz': quux}", + r#"{foo: bar, "baz": quux}"#, + r#"{'foo': bar, "baz": quux}"#, + "{_:_}", + "{foo: [1, 2, 3]}", + "{foo: foo, foo: foo}", + "{foo: foo::foo::foo, foo: foo::foo}" + ], + expect_mismatch: [ + "{", "{foo: bar", + "{1: bar}", + "{foo: bar baz: quux}", + "{foo: bar,, baz: quux}", + "{foo:: bar, baz: quux}", + "{, foo: bar, baz: quux}", + "{,}" + ] + } + + matcher_tests_with_macro! 
{ + match_struct_1_1 => TextEncoding_1_1::struct_matcher(), + "(macro foo (x*) {quux: quuz})" + expect_match: [ + "{}", "{$0:$0}", "{'':''}", r#"{"":""}"#, "{foo:bar}", + "{foo: bar, baz: quux}", "{'foo': bar, 'baz': quux}", + r#"{foo: bar, "baz": quux}"#, r#"{'foo': bar, "baz": quux}"#, + "{_:_}", "{foo: [1, 2, 3]}", "{foo: foo, foo: foo}", + "{a: (:foo 1 2 3)}", + // With e-expressions + "{(:foo)}", + "{ (:foo)}", + "{(:foo) }", + "{(:foo), (:foo)}", + "{ (:foo) , (:foo) }", + "{ a : (:foo 1 2 3) , b : (:foo 4 5 6) }", + "{a:(:foo 1 2 3),b:(:foo 4 5 6)}", "{(:foo), (:foo)}", + "{a: (:foo 1 2 3), b: (:foo 4 5 6)}" + ], + expect_mismatch: [ + "{", "{foo: bar", + "{1: bar}", + "{foo: bar baz: quux}", + "{foo: bar,, baz: quux}", + "{foo:: bar, baz: quux}", + "{, foo: bar, baz: quux}", + "{,}", + "{(:foo}", + "{(:foo]}", + "{[:foo}", + "{(foo)}", + "{(:foo): bar}", + "{bar: (:foo}", + "{bar: (:foo) baz: quux}", + ] + } + matcher_tests_with_macro! { - parsing_eexps - match_e_expression + parsing_eexps => TextBuffer::match_e_expression, "(macro foo (x*) null)" expect_match: [ "(:foo)", @@ -3319,16 +2686,13 @@ mod tests { "(5)", // No `:` after opening paren "(:0x5)", // Hexadecimal not allowed "(:5_000)", // Underscores not allowed - ], - expect_incomplete: [ - "(:foo", - "(:5" + "(:foo", // Incomplete + "(:5" // Incomplete ] } matcher_tests_with_macro! { - allow_omitting_trailing_optionals - match_e_expression + allow_omitting_trailing_optionals => TextBuffer::match_e_expression, "(macro foo (a b+ c? 
d*) null)" expect_match: [ "(:foo 1 2)", @@ -3428,8 +2792,6 @@ mod tests { "{{=aGVsbG8}}", // too much padding "{{aGVsbG8===}}", - ], - expect_incomplete: [ "{{aGVsbG8h", "{{aGVsbG8h}" ] @@ -3458,21 +2820,43 @@ mod tests { r#"{{'''foo''' /*hi!*/ '''bar'''}}"#, // Interleaved comments r#"{{'''foo''' "bar"}}"#, // Mixed quote style r#"{{"😎🙂🙃"}}"#, // Contains unescaped non-ascii characters - ], - expect_incomplete: [ r#"{{"foo}}"#, // Missing closing quote r#"{{"foo"}"#, // Missing closing brace r#"{{'''foo'''}"#, // Missing closing brace ], } + #[test] fn test_match_text_until_unescaped_str() { let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let input = TextBuffer::new(context, r" foo bar \''' baz''' quux ".as_bytes()); - let (_remaining, (matched, contains_escapes)) = - input.match_text_until_unescaped_str(r#"'''"#).unwrap(); + let mut input = TextBuffer::new(context, r" foo bar \''' baz''' quux ".as_bytes(), true); + let (matched, contains_escapes) = input.match_text_until_unescaped_str(r#"'''"#).unwrap(); assert_eq!(matched.as_text().unwrap(), " foo bar \\''' baz"); assert!(contains_escapes); } + + #[test] + fn expect_foo() { + MatchTest::new_1_0("\"hello\"").expect_match(match_length(TextBuffer::match_string)); + } + + #[test] + fn expect_long_foo() { + MatchTest::new_1_0("'''long hello'''").expect_match(match_length(TextBuffer::match_string)); + } + + #[test] + fn expect_bootstrap() -> IonResult<()> { + // MatchTest::new("\"foo\"").expect_match(match_length(TextBuffer::match_string)); + let mut reader = Reader::new(AnyEncoding, "()")?; + let value = reader.expect_next()?; + let _ = value.read()?.expect_sexp().unwrap(); + Ok(()) + } + + #[test] + fn expect_clob() { + MatchTest::new_1_0(r#"{{''''''}}"#).expect_match(match_length(TextBuffer::match_clob)); + } } diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs index 060388fe..0b8b34ee 100644 --- a/src/lazy/text/encoded_value.rs +++ 
b/src/lazy/text/encoded_value.rs @@ -11,7 +11,7 @@ use crate::IonType; /// allowing a user to re-read (that is: parse) the body of the value as many times as necessary /// without re-parsing its header information each time. #[derive(Copy, Clone, Debug, PartialEq)] -pub(crate) struct EncodedTextValue<'top, E: TextEncoding<'top>> { +pub struct EncodedTextValue<'top, E: TextEncoding<'top>> { // Each encoded text value has up to three components, appearing in the following order: // // [annotations? | data ] diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index 9c822bda..045a7d7b 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -23,17 +23,6 @@ use std::num::IntErrorKind; use std::ops::{Neg, Range}; use std::str::FromStr; -use bumpalo::collections::Vec as BumpVec; -use bumpalo::Bump as BumpAllocator; -use ice_code::ice as cold_path; -use nom::branch::alt; -use nom::bytes::streaming::tag; -use nom::character::is_hex_digit; -use nom::sequence::preceded; -use nom::{AsBytes, AsChar, Parser}; -use num_traits::Zero; -use smallvec::SmallVec; - use crate::decimal::coefficient::Coefficient; use crate::lazy::bytes_ref::BytesRef; use crate::lazy::decoder::{Decoder, LazyRawFieldExpr, LazyRawValueExpr}; @@ -46,6 +35,15 @@ use crate::result::{DecodingError, IonFailure}; use crate::{ Decimal, Int, IonError, IonResult, IonType, RawSymbolRef, Timestamp, TimestampPrecision, }; +use bumpalo::collections::Vec as BumpVec; +use bumpalo::Bump as BumpAllocator; +use ice_code::ice as cold_path; +use num_traits::Zero; +use smallvec::SmallVec; +use winnow::combinator::alt; +use winnow::combinator::preceded; +use winnow::stream::{AsChar, Stream}; +use winnow::Parser; /// A partially parsed Ion value. #[derive(Clone, Copy, Debug)] @@ -456,12 +454,12 @@ impl MatchedString { // Iterate over the string segments using the match_long_string_segment parser. 
// This is the same parser that matched the input initially, which means that the only // reason it wouldn't succeed here is if the input is empty, meaning we're done reading. - while let Ok((remaining_after_match, (segment_body, _has_escapes))) = preceded( + while let Ok((segment_body, _has_escapes)) = preceded( TextBuffer::match_optional_comments_and_whitespace, TextBuffer::match_long_string_segment, - )(remaining) + ) + .parse_next(&mut remaining) { - remaining = remaining_after_match; replace_escapes_with_byte_values( segment_body, &mut sanitized, @@ -692,7 +690,7 @@ fn decode_hex_digits_escape<'data>( .iter() .take(num_digits) .copied() - .all(is_hex_digit); + .all(AsChar::is_hex_digit); if !all_are_hex_digits { return Err(IonError::Decoding( DecodingError::new(format!( @@ -750,15 +748,15 @@ fn complete_surrogate_pair<'data>( input: TextBuffer<'data>, ) -> IonResult> { let mut match_next_codepoint = preceded( - tag("\\"), + "\\", alt(( - preceded(tag("x"), TextBuffer::match_n_hex_digits(2)), - preceded(tag("u"), TextBuffer::match_n_hex_digits(4)), - preceded(tag("U"), TextBuffer::match_n_hex_digits(8)), + preceded("x", TextBuffer::match_n_hex_digits(2)), + preceded("u", TextBuffer::match_n_hex_digits(4)), + preceded("U", TextBuffer::match_n_hex_digits(8)), )), ); - let (remaining, hex_digits) = match match_next_codepoint.parse(input) { - Ok((remaining, hex_digits)) => (remaining, hex_digits), + let (remaining, hex_digits) = match match_next_codepoint.parse_peek(input) { + Ok(hex_digits) => hex_digits, Err(_) => { return { let error = @@ -1102,7 +1100,7 @@ impl MatchedBlob { .filter(|b| !b.is_ascii_whitespace()); sanitized_base64_text.extend(non_whitespaces_bytes); base64::decode_config_slice( - sanitized_base64_text.as_bytes(), + sanitized_base64_text.as_slice(), base64::STANDARD, decoding_buffer.as_mut_slice(), ) @@ -1173,7 +1171,7 @@ impl MatchedClob { // Use the existing short string body parser to identify all of the bytes up to the // unescaped closing 
`"`. This parser succeeded once during matching, so we know it will // succeed again here; it's safe to unwrap(). - let (_, (body, _has_escapes)) = remaining.match_short_string_body().unwrap(); + let (body, _has_escapes) = remaining.checkpoint().match_short_string_body().unwrap(); // There are escaped characters. We need to build a new version of our string // that replaces the escaped characters with their corresponding bytes. let mut sanitized = BumpVec::with_capacity_in(body.len(), allocator); @@ -1203,9 +1201,10 @@ impl MatchedClob { // This is the same parser that matched the input initially, which means that the only // reason it wouldn't succeed here is if the input is empty, meaning we're done reading. while let Ok((remaining_after_match, (segment_body, _has_escapes))) = preceded( - TextBuffer::match_optional_whitespace, + TextBuffer::match_whitespace0, TextBuffer::match_long_string_segment, - )(remaining) + ) + .parse_peek(remaining) { remaining = remaining_after_match; replace_escapes_with_byte_values( @@ -1223,11 +1222,12 @@ impl MatchedClob { #[cfg(test)] mod tests { - use crate::lazy::bytes_ref::BytesRef; use crate::lazy::expanded::{EncodingContext, EncodingContextRef}; use crate::lazy::text::buffer::TextBuffer; use crate::{Decimal, Int, IonResult, Timestamp}; + use winnow::combinator::peek; + use winnow::Parser; #[test] fn read_ints() -> IonResult<()> { @@ -1235,8 +1235,8 @@ mod tests { let expected: Int = expected.into(); let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); - let buffer = TextBuffer::new(context, data.as_bytes()); - let (_remaining, matched) = buffer.match_int().unwrap(); + let mut buffer = TextBuffer::new(context, data.as_bytes(), true); + let matched = peek(TextBuffer::match_int).parse_next(&mut buffer).unwrap(); let actual = matched.read(buffer).unwrap(); assert_eq!( actual, expected, @@ -1267,11 +1267,12 @@ mod tests { #[test] fn read_timestamps() -> IonResult<()> { fn expect_timestamp(data: 
&str, expected: Timestamp) { - let data = format!("{data} "); // Append a space let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); - let buffer = TextBuffer::new(context, data.as_bytes()); - let (_remaining, matched) = buffer.match_timestamp().unwrap(); + let mut buffer = TextBuffer::new(context, data.as_bytes(), true); + let matched = peek(TextBuffer::match_timestamp) + .parse_next(&mut buffer) + .unwrap(); let actual = matched.read(buffer).unwrap(); assert_eq!( actual, expected, @@ -1373,15 +1374,14 @@ mod tests { fn expect_decimal(data: &str, expected: Decimal) { let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); - let buffer = TextBuffer::new(context, data.as_bytes()); - let result = buffer.match_decimal(); + let mut buffer = TextBuffer::new(context, data.as_bytes(), true); + let result = peek(TextBuffer::match_decimal).parse_next(&mut buffer); assert!( result.is_ok(), "Unexpected match error for input: '{data}': {:?}", result ); - let (_remaining, matched) = buffer.match_decimal().expect("match decimal"); - let result = matched.read(buffer); + let result = result.unwrap().read(buffer); assert!( result.is_ok(), "Unexpected read error for input '{data}': {:?}", @@ -1453,11 +1453,12 @@ mod tests { #[test] fn read_blobs() -> IonResult<()> { fn expect_blob(data: &str, expected: &str) { - let data = format!("{data} "); // Append a space let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); - let buffer = TextBuffer::new(context, data.as_bytes()); - let (_remaining, matched) = buffer.match_blob().unwrap(); + let mut buffer = TextBuffer::new(context, data.as_bytes(), true); + let matched = peek(TextBuffer::match_blob) + .parse_next(&mut buffer) + .unwrap(); let actual = matched.read(context.allocator(), buffer).unwrap(); assert_eq!( actual, @@ -1492,13 +1493,13 @@ mod tests { // For the sake of these tests, we're going to append one more value (`0`) to 
the input // stream so the parser knows that the long-form strings are complete. We then trim // our fabricated value off of the input before reading. - let data = format!("{data}\n0"); let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); - let buffer = TextBuffer::new(context, data.as_bytes()); - let (_remaining, matched) = buffer.match_string().unwrap(); - let matched_input = buffer.slice(0, buffer.len() - 2); - let actual = matched.read(context.allocator(), matched_input).unwrap(); + let mut buffer = TextBuffer::new(context, data.as_bytes(), true); + let matched = peek(TextBuffer::match_string) + .parse_next(&mut buffer) + .unwrap(); + let actual = matched.read(context.allocator(), buffer).unwrap(); assert_eq!( actual, expected, "Actual didn't match expected for input '{}'.\n{:?}\n!=\n{:?}", @@ -1534,10 +1535,12 @@ mod tests { context: EncodingContextRef<'a>, data: &'a str, ) -> IonResult> { - let buffer = TextBuffer::new(context, data.as_bytes()); + let mut buffer = TextBuffer::new(context, data.as_bytes(), true); // All `read_clob` usages should be accepted by the matcher, so we can `unwrap()` the // call to `match_clob()`. - let (_remaining, matched) = buffer.match_clob().unwrap(); + let matched = peek(TextBuffer::match_clob) + .parse_next(&mut buffer) + .unwrap(); // The resulting buffer slice may be rejected during reading. matched.read(context.allocator(), buffer) } diff --git a/src/lazy/text/parse_result.rs b/src/lazy/text/parse_result.rs index 1a0ac151..b2a155c8 100644 --- a/src/lazy/text/parse_result.rs +++ b/src/lazy/text/parse_result.rs @@ -1,48 +1,39 @@ -//! The [`nom` parser combinator crate](https://docs.rs/nom/latest/nom/) intentionally provides -//! bare-bones error reporting by default. Each error contains only a `&str` representing the input -//! that could not be matched and an [`ErrorKind`] enum variant indicating which `nom` parser produced -//! the error. 
This stack-allocated type is very cheap to create, which is important because a
-//! typical parse will require creating large numbers of short-lived error values.
-//!
-//! This module defines `IonParseError`, a custom error type that can capture more information than is
-//! supported by [`nom::error::Error`]. It also defines `IonParseResult`, a type alias for an
-//! [`IResult`] that parses `TextBuffer`s and produces `IonParseError`s if something goes wrong.
+//! This module defines `IonParseError`, a custom error type, and `IonParseResult`, a type alias for a
+//! [`PResult`] that parses `TextBuffer`s and produces `IonParseError`s if something goes wrong.
 
 use crate::lazy::text::buffer::TextBuffer;
 use crate::position::Position;
 use crate::result::{DecodingError, IonFailure};
 use crate::{IonError, IonResult};
-use nom::error::{Error as NomError, ErrorKind, ParseError};
-use nom::{Err, IResult};
 use std::borrow::Cow;
 use std::fmt::{Debug, Display};
+use winnow::error::{ErrMode, ErrorKind, ParseError, ParserError};
+use winnow::stream::Stream;
+use winnow::PResult;
 
-/// A type alias for a [`IResult`] whose input is a `TextBuffer` and whose error type is an
-/// [`InvalidInputError`]. All of the Ion parsers in the `text::parsers` module return an
-/// [`IonParseResult`].
+/// A type alias for a [`PResult`] whose input is a [`TextBuffer`] and whose error type is an
+/// `IonParseError`. All of the Ion parsers in the `TextBuffer` type return `IonParseResult`.
 ///
 /// If the parser is successful, it will return `Ok(output_value)`. If it encounters a problem,
-/// it will return a `nom::Err`. [nom::Err] is a generic enum with three possible
-/// variants:
+/// it will return a `winnow::error::ErrMode`.
+///
+/// [`ErrMode`] is a generic enum with three possible variants:
 /// 1. `Incomplete(_)` indicates that there wasn't enough input data to determine whether the
 /// parser should match or not.
-/// 2. 
`Error(ion_parse_error)` indicates that the parser did not match the input text.
-/// 3. `Failure(ion_parse_error)` indicates that the parser matched the text but encountered
+/// 2. `Backtrack(ion_parse_error)` indicates that the parser did not match the input text; the reader should try another.
+/// 3. `Cut(ion_parse_error)` indicates that the parser matched the text but encountered
 /// a problem when trying to materialize it into the `output_value`. In such cases, returning a
-/// `Failure` signals that this was the correct parser to handle the input but it could not
+/// `Cut` signals that this was the correct parser to handle the input but it could not
 /// be processed successfully for some reason. For example, a parser trying to match a number of
 /// hours and minutes might match the text `11:71`, but fail when it tries to turn `71` into a
 /// number of minutes because it's `>=60`. We know this was the right parser, but it wasn't
 /// able to process it. (This is slightly contrived; it would be possible to write a parser
 /// that rejected `71` as a number of minutes based on syntax alone.)
-pub(crate) type IonParseResult<'a, O> = IResult<TextBuffer<'a>, O, IonParseError<'a>>;
-// Functions that return IonParseResult parse TextBuffer-^    ^                  ^
-// ...return a value of type `O` -----+                  |
-// ...or a nom::Err if something goes wrong ----+
+pub(crate) type IonParseResult<'a, O> = PResult<O, IonParseError<'a>>;
 
 /// As above, but for parsers that simply identify (i.e. 'match') a slice of the input as a
 /// particular item.
-pub(crate) type IonMatchResult<'a> = IResult<TextBuffer<'a>, TextBuffer<'a>, IonParseError<'a>>;
+pub(crate) type IonMatchResult<'a> = IonParseResult<'a, TextBuffer<'a>>;
 
 #[derive(Debug, PartialEq)]
 pub enum IonParseError<'data> {
@@ -55,8 +46,8 @@ pub enum IonParseError<'data> {
 /// Describes a problem that occurred while trying to parse a given input `TextBuffer`. 
///
 /// When returned as part of an `IonParseResult`, an `IonParseError` is always wrapped in
-/// a [nom::Err] (see `IonParseResult`'s documentation for details). If the `nom::Err` is
-/// a non-fatal `Error`, the `IonParseError`'s `description` will be `None`. If the `nom::Err` is
+/// an [`ErrMode`] (see `IonParseResult`'s documentation for details). If the `ErrMode` is
+/// a non-fatal `Error`, the `IonParseError`'s `description` will be `None`. If the `winnow::ErrMode` is
 /// a fatal `Failure`, the `description` will be `Some(String)`. In this way, using an
 /// `IonParseError` only incurs heap allocation costs when parsing is coming to an end.
 #[derive(Debug, PartialEq)]
@@ -196,12 +187,13 @@ impl From<InvalidInputError<'_>> for IonError {
     }
 }
 
-impl<'data> From<Err<IonParseError<'data>>> for IonParseError<'data> {
-    fn from(value: Err<IonParseError<'data>>) -> Self {
+impl<'data> From<ErrMode<IonParseError<'data>>> for IonParseError<'data> {
+    fn from(value: ErrMode<IonParseError<'data>>) -> Self {
+        use winnow::error::ErrMode::*;
         match value {
-            Err::Incomplete(_) => IonParseError::Incomplete,
-            Err::Error(e) => e,
-            Err::Failure(e) => e,
+            Incomplete(_) => IonParseError::Incomplete,
+            Backtrack(e) => e,
+            Cut(e) => e,
         }
     }
 }
@@ -216,28 +208,31 @@ impl<'data> From<(TextBuffer<'data>, ErrorKind)> for IonParseError<'data> {
     }
 }
 
-/// Allows a [nom::error::Error] to be converted into an [IonParseError] by calling `.into()`.
-impl<'data> From<NomError<TextBuffer<'data>>> for IonParseError<'data> {
-    fn from(nom_error: NomError<TextBuffer<'data>>) -> Self {
-        InvalidInputError::new(nom_error.input)
-            .with_nom_error_kind(nom_error.code)
-            .into()
+/// Allows an [`ErrMode`] to be converted into an [IonParseError] by calling `.into()`.
+impl<'data> From<ParseError<TextBuffer<'data>, IonParseError<'data>>> for IonParseError<'data> {
+    fn from(parse_error: ParseError<TextBuffer<'data>, IonParseError<'data>>) -> Self {
+        parse_error.into_inner()
     }
 }
 
 /// Allows `IonParseError` to be used as the error type in various `nom` functions.
-impl<'data> ParseError<TextBuffer<'data>> for IonParseError<'data> {
-    fn from_error_kind(input: TextBuffer<'data>, error_kind: ErrorKind) -> Self {
-        InvalidInputError::new(input)
+impl<'data> ParserError<TextBuffer<'data>> for IonParseError<'data> {
+    fn from_error_kind(input: &TextBuffer<'data>, error_kind: ErrorKind) -> Self {
+        InvalidInputError::new(*input)
             .with_nom_error_kind(error_kind)
             .into()
     }
 
-    fn append(_input: TextBuffer<'data>, _kind: ErrorKind, other: Self) -> Self {
+    fn append(
+        self,
+        input: &TextBuffer<'data>,
+        _checkpoint: &<TextBuffer<'data> as Stream>::Checkpoint,
+        _kind: ErrorKind,
+    ) -> Self {
         // When an error stack is being built, this method is called to give the error
         // type an opportunity to aggregate the errors into a collection or a more descriptive
         // message. For now, we simply allow the most recent error to take precedence.
-        other
+        IonParseError::Invalid(InvalidInputError::new(*input))
     }
 }
 
@@ -259,24 +254,24 @@ impl<'data, T> ToIteratorOutput<'data, T> for IonResult<(TextBuffer<'data>, Opti
     }
 }
 
 /// Converts the output of a text Ion parser (any of `IonParseResult`, `IonParseError`,
-/// or `nom::Err`) into a general-purpose `IonResult`. If the implementing type
+/// or `winnow::Err`) into a general-purpose `IonResult`. If the implementing type
 /// does not have its own `label` and `input`, the specified values will be used.
pub(crate) trait AddContext<'data, T> {
     fn with_context<'a>(
         self,
         label: impl Into<Cow<'static, str>>,
         input: TextBuffer<'data>,
-    ) -> IonResult<(TextBuffer<'a>, T)>
+    ) -> IonResult<T>
     where
         'data: 'a;
 }
 
-impl<'data, T> AddContext<'data, T> for nom::Err<IonParseError<'data>> {
+impl<'data, T> AddContext<'data, T> for ErrMode<IonParseError<'data>> {
     fn with_context<'a>(
         self,
         label: impl Into<Cow<'static, str>>,
         input: TextBuffer<'data>,
-    ) -> IonResult<(TextBuffer<'a>, T)>
+    ) -> IonResult<T>
     where
         'data: 'a,
     {
@@ -291,7 +286,7 @@ impl<'data, T> AddContext<'data, T> for IonParseError<'data> {
         self,
         label: impl Into<Cow<'static, str>>,
         input: TextBuffer<'data>,
-    ) -> IonResult<(TextBuffer<'a>, T)>
+    ) -> IonResult<T>
     where
         'data: 'a,
     {
@@ -314,7 +309,7 @@ impl<'data, T> AddContext<'data, T> for IonParseResult<'data, T> {
         self,
         label: impl Into<Cow<'static, str>>,
         input: TextBuffer<'data>,
-    ) -> IonResult<(TextBuffer<'a>, T)>
+    ) -> IonResult<T>
     where
         'data: 'a,
     {
@@ -326,20 +321,20 @@ impl<'data, T> AddContext<'data, T> for IonParseResult<'data, T> {
     }
 }
 
-/// Constructs a `nom::Err::Failure` that contains an `IonParseError` describing the problem
+/// Constructs a `winnow::error::ErrMode::Cut` that contains an `IonParseError` describing the problem
 /// that was encountered.
 pub(crate) fn fatal_parse_error<D: Into<Cow<'static, str>>, O>(
     input: TextBuffer<'_>,
     description: D,
 ) -> IonParseResult<'_, O> {
-    Err(nom::Err::Failure(
+    Err(ErrMode::Cut(
         InvalidInputError::new(input)
             .with_description(description)
             .into(),
     ))
 }
 
-/// An extension trait that allows a [std::result::Result] of any kind to be mapped to an
+/// An extension trait that allows a [Result] of any kind to be mapped to an
 /// `IonParseResult` concisely.
pub(crate) trait OrFatalParseError { fn or_fatal_parse_error( @@ -360,7 +355,7 @@ where label: L, ) -> IonParseResult<'_, T> { match self { - Ok(value) => Ok((input, value)), + Ok(value) => Ok(value), Err(error) => fatal_parse_error(input, format!("{label}: {error:?}")), } } diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index f1e18214..2cbb59e5 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -13,106 +13,95 @@ use crate::{Encoding, IonResult}; /// A text Ion 1.0 reader that yields [`LazyRawStreamItem`]s representing the top level values found /// in the provided input stream. pub struct LazyRawTextReader_1_0<'data> { - input: &'data [u8], - // The offset from the beginning of the overall stream at which the `input` slice begins - stream_offset: usize, - // The offset from the beginning of `input` at which the reader is positioned - local_offset: usize, + input: TextBuffer<'data>, } impl<'data> LazyRawTextReader_1_0<'data> { /// Constructs a `LazyRawTextReader` positioned at the beginning of the provided input stream. - pub fn new(data: &'data [u8]) -> LazyRawTextReader_1_0<'data> { - Self::new_with_offset(data, 0) + pub fn new( + context: EncodingContextRef<'data>, + data: &'data [u8], + is_final_data: bool, + ) -> LazyRawTextReader_1_0<'data> { + Self::new_with_offset(context, data, 0, is_final_data) } /// Constructs a `LazyRawTextReader` positioned at the beginning of the provided input stream. /// The provided input stream is itself a slice starting `offset` bytes from the beginning /// of a larger data stream. This offset is used for reporting the absolute (stream-level) /// position of values encountered in `data`. - fn new_with_offset(data: &'data [u8], offset: usize) -> LazyRawTextReader_1_0<'data> { - LazyRawTextReader_1_0 { - input: data, - // `data` begins at position `offset` within some larger stream. If `data` contains - // the entire stream, this will be zero. 
- stream_offset: offset, - // Start reading from the beginning of the slice `data` - local_offset: 0, - } + fn new_with_offset( + context: EncodingContextRef<'data>, + data: &'data [u8], + offset: usize, + is_final_data: bool, + ) -> LazyRawTextReader_1_0<'data> { + let input = TextBuffer::new_with_offset(context, data, offset, is_final_data); + LazyRawTextReader_1_0 { input } } - pub fn next<'top>( - &'top mut self, - context: EncodingContextRef<'top>, - ) -> IonResult> - where - 'data: 'top, - { - let input = TextBuffer::new_with_offset( - context, - &self.input[self.local_offset..], - self.stream_offset + self.local_offset, - ); - let (buffer_after_whitespace, _whitespace) = input + pub fn next(&mut self) -> IonResult> { + let _whitespace = self + .input .match_optional_comments_and_whitespace() - .with_context("reading whitespace/comments at the top level", input)?; - if buffer_after_whitespace.is_empty() { + .with_context("reading whitespace/comments at the top level", self.input)?; + if self.input.is_empty() { return Ok(RawStreamItem::EndOfStream(EndPosition::new( TextEncoding_1_0.encoding(), - buffer_after_whitespace.offset(), + self.input.offset(), ))); } // Consume any trailing whitespace that followed this item. Doing this allows us to check // whether this was the last item in the buffer by testing `buffer.is_empty()` afterward. 
- let buffer_after_whitespace = buffer_after_whitespace.local_lifespan(); - let (buffer_after_item, matched_item) = buffer_after_whitespace + let matched_item = self + .input .match_top_level_item_1_0() - .with_context("reading a top-level value", buffer_after_whitespace)?; + .with_context("reading a top-level value", self.input)?; - let (buffer_after_trailing_ws, _trailing_ws) = buffer_after_item + let _trailing_ws = self + .input .match_optional_comments_and_whitespace() - .with_context( - "reading trailing top-level whitespace/comments", - buffer_after_item, - )?; - - // Since we successfully matched the next value, we'll update the buffer - // so a future call to `next()` will resume parsing the remaining input. - self.local_offset = buffer_after_trailing_ws.offset() - self.stream_offset; + .with_context("reading trailing top-level whitespace/comments", self.input)?; + Ok(matched_item) } + + pub fn context(&self) -> EncodingContextRef<'data> { + self.input.context() + } } impl<'data> LazyRawReader<'data, TextEncoding_1_0> for LazyRawTextReader_1_0<'data> { - fn resume_at_offset( - data: &'data [u8], - offset: usize, - // This argument is ignored by all raw readers except LazyRawAnyReader - _encoding_hint: IonEncoding, - ) -> Self { - LazyRawTextReader_1_0::new_with_offset(data, offset) + fn new(context: EncodingContextRef<'data>, data: &'data [u8], is_final_data: bool) -> Self { + LazyRawTextReader_1_0::new(context, data, is_final_data) + } + + fn resume(context: EncodingContextRef<'data>, saved_state: RawReaderState<'data>) -> Self { + LazyRawTextReader_1_0 { + input: TextBuffer::new_with_offset( + context, + saved_state.data(), + saved_state.offset(), + saved_state.is_final_data(), + ), + } } fn save_state(&self) -> RawReaderState<'data> { RawReaderState::new( - &self.input[self.local_offset..], + self.input.bytes(), self.position(), + self.input.is_final_data(), self.encoding(), ) } - fn next<'top>( - &'top mut self, - context: EncodingContextRef<'top>, - ) 
-> IonResult> - where - 'data: 'top, - { - self.next(context) + fn next(&mut self) -> IonResult> { + self.next() } fn position(&self) -> usize { - self.stream_offset + self.local_offset + self.input.offset() } fn encoding(&self) -> IonEncoding { @@ -136,16 +125,13 @@ mod tests { } impl<'data> TestReader<'data> { - fn next(&mut self) -> IonResult> { - self.reader.next(self.context) + fn next(&mut self) -> IonResult> { + self.reader.next() } - fn expect_next<'a>(&'a mut self, expected: RawValueRef<'a, TextEncoding_1_0>) - where - 'data: 'a, - { - let TestReader { context, reader } = self; + fn expect_next(&mut self, expected: RawValueRef<'data, TextEncoding_1_0>) { + let TestReader { reader, .. } = self; let lazy_value = reader - .next(*context) + .next() .expect("advancing the reader failed") .expect_value() .expect("expected a value"); @@ -297,7 +283,7 @@ mod tests { let encoding_context = EncodingContext::empty(); let reader = &mut TestReader { - reader: LazyRawTextReader_1_0::new(data.as_bytes()), + reader: LazyRawTextReader_1_0::new(encoding_context.get_ref(), data.as_bytes(), true), context: encoding_context.get_ref(), }; @@ -475,21 +461,21 @@ mod tests { let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); let data = b"foo 2024T bar::38 [1, 2, 3]"; - let mut reader = LazyRawTextReader_1_0::new(data); + let mut reader = LazyRawTextReader_1_0::new(context, data, true); - let foo = reader.next(context)?.expect_value()?; + let foo = reader.next()?.expect_value()?; assert_eq!(foo.span(), b"foo"); assert_eq!(foo.range(), 0..3); - let timestamp = reader.next(context)?.expect_value()?; + let timestamp = reader.next()?.expect_value()?; assert_eq!(timestamp.span(), b"2024T"); assert_eq!(timestamp.range(), 4..9); - let annotated_int = reader.next(context)?.expect_value()?; + let annotated_int = reader.next()?.expect_value()?; assert_eq!(annotated_int.span(), b"bar::38"); assert_eq!(annotated_int.range(), 10..17); - let list_value = 
reader.next(context)?.expect_value()?; + let list_value = reader.next()?.expect_value()?; assert_eq!(list_value.span(), b"[1, 2, 3]"); assert_eq!(list_value.range(), 18..27); diff --git a/src/lazy/text/raw/sequence.rs b/src/lazy/text/raw/sequence.rs index 0051d27b..3b6323a7 100644 --- a/src/lazy/text/raw/sequence.rs +++ b/src/lazy/text/raw/sequence.rs @@ -1,56 +1,57 @@ #![allow(non_camel_case_types)] +#![deny(dead_code)] use std::fmt; use std::fmt::{Debug, Formatter}; -use std::ops::Range; - -use nom::character::streaming::satisfy; +use std::marker::PhantomData; +use winnow::combinator::{alt, opt, peek, terminated}; +use winnow::Parser; use crate::lazy::decoder::private::LazyContainerPrivate; use crate::lazy::decoder::{ Decoder, LazyRawContainer, LazyRawSequence, LazyRawValue, LazyRawValueExpr, RawValueExpr, }; -use crate::lazy::encoding::TextEncoding_1_0; -use crate::lazy::text::buffer::TextBuffer; +use crate::lazy::encoding::TextEncoding; +use crate::lazy::text::buffer::{whitespace_and_then, TextBuffer}; +use crate::lazy::text::matched::MatchedValue; use crate::lazy::text::parse_result::AddContext; -use crate::lazy::text::parse_result::ToIteratorOutput; -use crate::lazy::text::value::{LazyRawTextValue_1_0, RawTextAnnotationsIterator}; +use crate::lazy::text::raw::v1_1::reader::RawTextSequenceCacheIterator; +use crate::lazy::text::value::{LazyRawTextValue, RawTextAnnotationsIterator}; use crate::{IonResult, IonType}; - // ===== Lists ===== #[derive(Copy, Clone)] -pub struct LazyRawTextList_1_0<'data> { - pub(crate) value: LazyRawTextValue_1_0<'data>, +pub struct RawTextList<'data, E: TextEncoding<'data>> { + pub(crate) value: LazyRawTextValue<'data, E>, } -impl<'data> LazyRawTextList_1_0<'data> { +impl<'data, E: TextEncoding<'data>> RawTextList<'data, E> { pub fn ion_type(&self) -> IonType { IonType::List } - pub fn iter(&self) -> RawTextListIterator_1_0<'data> { - // Skip past any annotations and the opening '[' - let list_contents_start = 
self.value.encoded_value.data_offset() + 1; - // Make an iterator over the input bytes that follow the initial `[` - RawTextListIterator_1_0::new(self.value.input.slice_to_end(list_contents_start)) + pub fn iter(&self) -> RawTextSequenceCacheIterator<'data, E> { + let MatchedValue::List(child_exprs) = self.value.encoded_value.matched() else { + unreachable!("list contained a matched value of the wrong type") + }; + RawTextSequenceCacheIterator::new(child_exprs) } } -impl<'data> LazyContainerPrivate<'data, TextEncoding_1_0> for LazyRawTextList_1_0<'data> { - fn from_value(value: LazyRawTextValue_1_0<'data>) -> Self { - LazyRawTextList_1_0 { value } +impl<'data, E: TextEncoding<'data>> LazyContainerPrivate<'data, E> for RawTextList<'data, E> { + fn from_value(value: LazyRawTextValue<'data, E>) -> Self { + RawTextList { value } } } -impl<'data> LazyRawContainer<'data, TextEncoding_1_0> for LazyRawTextList_1_0<'data> { - fn as_value(&self) -> ::Value<'data> { +impl<'data, E: TextEncoding<'data>> LazyRawContainer<'data, E> for RawTextList<'data, E> { + fn as_value(&self) -> ::Value<'data> { self.value } } -impl<'data> LazyRawSequence<'data, TextEncoding_1_0> for LazyRawTextList_1_0<'data> { - type Iterator = RawTextListIterator_1_0<'data>; +impl<'data, E: TextEncoding<'data>> LazyRawSequence<'data, E> for RawTextList<'data, E> { + type Iterator = RawTextSequenceCacheIterator<'data, E>; fn annotations(&self) -> RawTextAnnotationsIterator<'data> { self.value.annotations() @@ -61,20 +62,20 @@ impl<'data> LazyRawSequence<'data, TextEncoding_1_0> for LazyRawTextList_1_0<'da } fn iter(&self) -> Self::Iterator { - LazyRawTextList_1_0::iter(self) + Self::iter(self) } } -impl<'data> IntoIterator for &LazyRawTextList_1_0<'data> { - type Item = IonResult>; - type IntoIter = RawTextListIterator_1_0<'data>; +impl<'data, E: TextEncoding<'data>> IntoIterator for &RawTextList<'data, E> { + type Item = IonResult>; + type IntoIter = RawTextSequenceCacheIterator<'data, E>; fn 
into_iter(self) -> Self::IntoIter { self.iter() } } -impl Debug for LazyRawTextList_1_0<'_> { +impl<'data, E: TextEncoding<'data>> Debug for RawTextList<'data, E> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "[")?; for value in self { @@ -87,67 +88,43 @@ impl Debug for LazyRawTextList_1_0<'_> { } #[derive(Copy, Clone, Debug)] -pub struct RawTextListIterator_1_0<'data> { +pub struct RawTextListIterator<'data, E: TextEncoding<'data>> { input: TextBuffer<'data>, - // If this iterator has returned an error, it should return `None` forever afterwards has_returned_error: bool, + spooky: PhantomData, } -impl<'data> RawTextListIterator_1_0<'data> { - pub(crate) fn new(input: TextBuffer<'data>) -> RawTextListIterator_1_0<'data> { - RawTextListIterator_1_0 { +impl<'data, E: TextEncoding<'data>> RawTextListIterator<'data, E> { + pub(crate) fn new(input: TextBuffer<'data>) -> RawTextListIterator<'data, E> { + RawTextListIterator { input, has_returned_error: false, + spooky: PhantomData, } } } -impl RawTextListIterator_1_0<'_> { - pub(crate) fn find_span(&self) -> IonResult> { - // The input has already skipped past the opening delimiter. - let start = self.input.offset() - 1; - // We need to find the input slice containing the closing delimiter. It's either... - let input_after_last = if let Some(value_result) = self.last() { - let value = value_result?.expect_value()?; - // ...the input slice that follows the last sequence value... - self.input - .slice_to_end(value.input.offset() + value.total_length() - self.input.offset()) - } else { - // ...or there aren't values, so it's just the input after the opening delimiter. 
- self.input - }; - let (mut input_after_ws, _ws) = - input_after_last - .match_optional_comments_and_whitespace() - .with_context("seeking the end of a list", input_after_last)?; - // Skip an optional comma and more whitespace - if input_after_ws.bytes().first() == Some(&b',') { - (input_after_ws, _) = input_after_ws - .slice_to_end(1) - .match_optional_comments_and_whitespace() - .with_context("skipping a list's trailing comma", input_after_ws)?; - } - let (input_after_end, _end_delimiter) = satisfy(|c| c == ']')(input_after_ws) - .with_context("seeking the closing delimiter of a list", input_after_ws)?; - let end = input_after_end.offset(); - Ok(start..end) - } -} - -impl<'data> Iterator for RawTextListIterator_1_0<'data> { - type Item = IonResult>; +impl<'data, E: TextEncoding<'data>> Iterator for RawTextListIterator<'data, E> { + type Item = IonResult>; fn next(&mut self) -> Option { if self.has_returned_error { return None; } - match self.input.match_list_value() { - Ok((remaining, Some(value))) => { - self.input = remaining; - let value = RawValueExpr::ValueLiteral(LazyRawTextValue_1_0::from(value)); - Some(Ok(value)) - } - Ok((_remaining, None)) => { + let result = whitespace_and_then(alt(( + // We only peek at the end so future calls to `next()` will continue to yield `None`. 
+ peek("]").value(None), + terminated( + E::value_expr_matcher(), + whitespace_and_then(alt((",", peek("]")))), + ) + .map(Some), + ))) + .parse_next(&mut self.input); + + match result { + Ok(Some(value_expr)) => Some(Ok(value_expr)), + Ok(None) => { // Don't update `remaining` so subsequent calls will continue to return None None } @@ -163,106 +140,92 @@ impl<'data> Iterator for RawTextListIterator_1_0<'data> { // ===== S-Expressions ===== #[derive(Copy, Clone)] -pub struct LazyRawTextSExp_1_0<'top> { - pub(crate) value: LazyRawTextValue_1_0<'top>, +pub struct RawTextSExp<'top, E: TextEncoding<'top>> { + pub(crate) value: LazyRawTextValue<'top, E>, } -impl<'data> LazyRawTextSExp_1_0<'data> { +impl<'data, E: TextEncoding<'data>> RawTextSExp<'data, E> { pub fn ion_type(&self) -> IonType { IonType::SExp } - pub fn iter(&self) -> RawTextSExpIterator_1_0<'data> { - // Make an iterator over the input bytes that follow the initial `(`; account for - // a leading annotations sequence. - let sexp_contents_start = self.value.encoded_value.data_offset() + 1; - RawTextSExpIterator_1_0::new(self.value.input.slice_to_end(sexp_contents_start)) + pub fn iter(&self) -> RawTextSequenceCacheIterator<'data, E> { + let MatchedValue::SExp(child_exprs) = self.value.encoded_value.matched() else { + unreachable!("sexp contained a matched value of the wrong type") + }; + RawTextSequenceCacheIterator::new(child_exprs) } } #[derive(Copy, Clone, Debug)] -pub struct RawTextSExpIterator_1_0<'top> { +pub struct RawTextSExpIterator<'top, E: TextEncoding<'top>> { input: TextBuffer<'top>, // If this iterator has returned an error, it should return `None` forever afterwards has_returned_error: bool, + spooky: PhantomData, } -impl<'top> RawTextSExpIterator_1_0<'top> { - pub(crate) fn new(input: TextBuffer<'top>) -> RawTextSExpIterator_1_0<'top> { - RawTextSExpIterator_1_0 { +impl<'top, E: TextEncoding<'top>> RawTextSExpIterator<'top, E> { + pub(crate) fn new(input: TextBuffer<'top>) -> 
RawTextSExpIterator<'top, E> { + RawTextSExpIterator { input, has_returned_error: false, + spooky: PhantomData, } } - - /// Scans ahead to find the end of this s-expression and reports the input span that it occupies. - /// - /// The `initial_bytes_skipped` parameter indicates how many bytes of input that represented the - /// beginning of the expression are not in the buffer. For plain s-expressions, this will always - /// be `1` as they begin with a single open parenthesis `(`. For e-expressions (which are used - /// to invoke macros from the data stream), it will always be a minimum of `3`: two bytes for - /// the opening `(:` and at least one for the macro identifier. (For example: `(:foo`.) - pub(crate) fn find_span(&self, initial_bytes_skipped: usize) -> IonResult> { - // The input has already skipped past the opening delimiter. - let start = self.input.offset() - initial_bytes_skipped; - // We need to find the input slice containing the closing delimiter. It's either... - let input_after_last = if let Some(value_result) = self.last() { - let value = value_result?.expect_value()?; - // ...the input slice that follows the last sequence value... - self.input - .slice_to_end(value.input.offset() + value.total_length() - self.input.offset()) - } else { - // ...or there aren't values, so it's just the input after the opening delimiter. 
- self.input - }; - let (input_after_ws, _ws) = input_after_last - .match_optional_comments_and_whitespace() - .with_context("seeking the end of a list", input_after_last)?; - let (input_after_end, _end_delimiter) = satisfy(|c| c == ')')(input_after_ws) - .with_context("seeking the closing delimiter of a sexp", input_after_ws)?; - let end = input_after_end.offset(); - Ok(start..end) - } } -impl<'data> Iterator for RawTextSExpIterator_1_0<'data> { - type Item = IonResult>; +impl<'data, E: TextEncoding<'data>> Iterator for RawTextSExpIterator<'data, E> { + type Item = IonResult>; fn next(&mut self) -> Option { if self.has_returned_error { return None; } - match self.input.match_sexp_value() { - Ok((remaining, Some(value))) => { - self.input = remaining; - Some(Ok(RawValueExpr::ValueLiteral(LazyRawTextValue_1_0::from( - value, - )))) - } - Ok((_remaining, None)) => None, + // Copy the original input so we can include any matched annotations. + let input = self.input; + let result = whitespace_and_then(alt(( + // We only peek at the end so future calls to `next()` will continue to yield `None`. 
+ peek(")").value(None), + // An annotated value or (in Ion 1.1) an e-expression + E::value_expr_matcher().map(Some), + // A potentially annotated operator literal + ( + opt(TextBuffer::match_annotations), + whitespace_and_then(TextBuffer::match_operator), + ) + .map(|(maybe_annotations, value)| input.apply_annotations(maybe_annotations, value)) + .map(RawValueExpr::ValueLiteral) + .map(Some), + ))) + .parse_next(&mut self.input); + + match result { + Ok(Some(value_expr)) => Some(Ok(value_expr)), + Ok(None) => None, Err(e) => { self.has_returned_error = true; - e.with_context("reading the next sexp value", self.input) + e.with_context("reading the next s-expression value", self.input) .transpose() } } } } -impl<'data> LazyContainerPrivate<'data, TextEncoding_1_0> for LazyRawTextSExp_1_0<'data> { - fn from_value(value: LazyRawTextValue_1_0<'data>) -> Self { - LazyRawTextSExp_1_0 { value } +impl<'data, E: TextEncoding<'data>> LazyContainerPrivate<'data, E> for RawTextSExp<'data, E> { + fn from_value(value: LazyRawTextValue<'data, E>) -> Self { + RawTextSExp { value } } } -impl<'data> LazyRawContainer<'data, TextEncoding_1_0> for LazyRawTextSExp_1_0<'data> { - fn as_value(&self) -> ::Value<'data> { +impl<'data, E: TextEncoding<'data>> LazyRawContainer<'data, E> for RawTextSExp<'data, E> { + fn as_value(&self) -> ::Value<'data> { self.value } } -impl<'data> LazyRawSequence<'data, TextEncoding_1_0> for LazyRawTextSExp_1_0<'data> { - type Iterator = RawTextSExpIterator_1_0<'data>; +impl<'data, E: TextEncoding<'data>> LazyRawSequence<'data, E> for RawTextSExp<'data, E> { + type Iterator = RawTextSequenceCacheIterator<'data, E>; fn annotations(&self) -> RawTextAnnotationsIterator<'data> { self.value.annotations() @@ -273,20 +236,20 @@ impl<'data> LazyRawSequence<'data, TextEncoding_1_0> for LazyRawTextSExp_1_0<'da } fn iter(&self) -> Self::Iterator { - LazyRawTextSExp_1_0::iter(self) + RawTextSExp::<'data, E>::iter(self) } } -impl<'data> IntoIterator for 
&LazyRawTextSExp_1_0<'data> { - type Item = IonResult>; - type IntoIter = RawTextSExpIterator_1_0<'data>; +impl<'data, E: TextEncoding<'data>> IntoIterator for &RawTextSExp<'data, E> { + type Item = IonResult>; + type IntoIter = RawTextSequenceCacheIterator<'data, E>; fn into_iter(self) -> Self::IntoIter { self.iter() } } -impl Debug for LazyRawTextSExp_1_0<'_> { +impl<'top, E: TextEncoding<'top>> Debug for RawTextSExp<'top, E> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "(")?; for value in self { @@ -309,8 +272,8 @@ mod tests { fn expect_sequence_range(ion_data: &str, expected: Range) -> IonResult<()> { let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let reader = &mut LazyRawTextReader_1_0::new(ion_data.as_bytes()); - let value = reader.next(context)?.expect_value()?; + let reader = &mut LazyRawTextReader_1_0::new(context, ion_data.as_bytes(), true); + let value = reader.next()?.expect_value()?; let actual_range = value.data_range(); assert_eq!( actual_range, expected, diff --git a/src/lazy/text/raw/struct.rs b/src/lazy/text/raw/struct.rs index 7d2a821a..d6120e76 100644 --- a/src/lazy/text/raw/struct.rs +++ b/src/lazy/text/raw/struct.rs @@ -1,79 +1,57 @@ #![allow(non_camel_case_types)] -use std::ops::Range; - -use nom::character::streaming::satisfy; - -use crate::lazy::decoder::private::LazyContainerPrivate; -use crate::lazy::decoder::{ - Decoder, HasRange, HasSpan, LazyRawContainer, LazyRawFieldExpr, LazyRawFieldName, - LazyRawStruct, LazyRawValue, -}; -use crate::lazy::encoding::TextEncoding_1_0; +use crate::lazy::decoder::{HasRange, HasSpan, LazyRawFieldExpr, LazyRawFieldName}; +use crate::lazy::encoding::{TextEncoding, TextEncoding_1_0, TextEncoding_1_1}; use crate::lazy::span::Span; -use crate::lazy::text::buffer::TextBuffer; +use crate::lazy::text::buffer::{whitespace_and_then, TextBuffer}; use crate::lazy::text::matched::MatchedFieldName; -use crate::lazy::text::parse_result::{AddContext, 
ToIteratorOutput}; -use crate::lazy::text::value::{LazyRawTextValue_1_0, RawTextAnnotationsIterator}; +use crate::lazy::text::parse_result::AddContext; use crate::{IonResult, RawSymbolRef}; +use std::marker::PhantomData; +use std::ops::Range; +use winnow::combinator::{alt, peek, terminated}; +use winnow::Parser; #[derive(Clone, Copy, Debug)] -pub struct RawTextStructIterator_1_0<'top> { +pub struct RawTextStructIterator<'top, E: TextEncoding<'top>> { input: TextBuffer<'top>, has_returned_error: bool, + spooky: PhantomData, } -impl<'top> RawTextStructIterator_1_0<'top> { - pub(crate) fn new(input: TextBuffer<'top>) -> Self { - RawTextStructIterator_1_0 { +impl<'top, E: TextEncoding<'top>> RawTextStructIterator<'top, E> { + pub fn new(input: TextBuffer<'top>) -> Self { + Self { input, has_returned_error: false, + spooky: PhantomData, } } - - pub(crate) fn find_span(&self) -> IonResult> { - // The input has already skipped past the opening delimiter. - let start = self.input.offset() - 1; - // We need to find the input slice containing the closing delimiter. It's either... - let input_after_last = if let Some(field_result) = self.last() { - let field = field_result?; - self.input - .slice_to_end(field.range().end - self.input.offset()) - } else { - // ...or there aren't fields, so it's just the input after the opening delimiter. 
- self.input - }; - let (mut input_after_ws, _ws) = - input_after_last - .match_optional_comments_and_whitespace() - .with_context("seeking the end of a struct", input_after_last)?; - // Skip an optional comma and more whitespace - if input_after_ws.bytes().first() == Some(&b',') { - (input_after_ws, _) = input_after_ws - .slice_to_end(1) - .match_optional_comments_and_whitespace() - .with_context("skipping a list's trailing comma", input_after_ws)?; - } - let (input_after_end, _end_delimiter) = satisfy(|c| c == '}')(input_after_ws) - .with_context("seeking the closing delimiter of a struct", input_after_ws)?; - let end = input_after_end.offset(); - Ok(start..end) - } } -impl<'top> Iterator for RawTextStructIterator_1_0<'top> { - type Item = IonResult>; +impl<'top, E: TextEncoding<'top>> Iterator for RawTextStructIterator<'top, E> { + type Item = IonResult>; fn next(&mut self) -> Option { if self.has_returned_error { return None; } - match self.input.match_struct_field() { - Ok((remaining_input, Some(field))) => { - self.input = remaining_input; - Some(Ok(field)) - } - Ok((_, None)) => None, + + let result = whitespace_and_then(alt(( + // If it's the end of the struct, don't consume it so future calls will also yield `None` + peek("}").value(None), + terminated( + E::field_expr_matcher().map(Some), + whitespace_and_then( + // Either a comma (consumed) or an upcoming end-of-struct (not consumed) + alt((",", peek("}"))), + ), + ), + ))) + .parse_next(&mut self.input); + match result { + Ok(Some(field)) => Some(Ok(field)), + Ok(None) => None, Err(e) => { self.has_returned_error = true; e.with_context("reading the next struct field", self.input) @@ -84,72 +62,49 @@ impl<'top> Iterator for RawTextStructIterator_1_0<'top> { } #[derive(Debug, Copy, Clone)] -pub struct LazyRawTextFieldName_1_0<'top> { +pub struct LazyRawTextFieldName<'top, E: TextEncoding<'top>> { matched: MatchedFieldName<'top>, + // XXX: Ion 1.0 and 1.1 use the same syntax for field names. 
+ // This type is generic over the encoding because if it is not, the user must manually + // specify 1.0 or 1.1 in a variety of places. When it is generic, the compiler can infer + // the Ion version from context. + spooky: PhantomData, } -impl<'top> LazyRawTextFieldName_1_0<'top> { +impl<'top, E: TextEncoding<'top>> LazyRawTextFieldName<'top, E> { pub(crate) fn new(matched: MatchedFieldName<'top>) -> Self { - Self { matched } + Self { + matched, + spooky: PhantomData, + } } } -impl<'top> HasSpan<'top> for LazyRawTextFieldName_1_0<'top> { +impl<'top, E: TextEncoding<'top>> HasSpan<'top> for LazyRawTextFieldName<'top, E> { fn span(&self) -> Span<'top> { self.matched.span() } } -impl HasRange for LazyRawTextFieldName_1_0<'_> { +impl<'top, E: TextEncoding<'top>> HasRange for LazyRawTextFieldName<'top, E> { fn range(&self) -> Range { self.matched.range() } } -impl<'top> LazyRawFieldName<'top, TextEncoding_1_0> for LazyRawTextFieldName_1_0<'top> { +impl<'top> LazyRawFieldName<'top, TextEncoding_1_0> + for LazyRawTextFieldName<'top, TextEncoding_1_0> +{ fn read(&self) -> IonResult> { self.matched.read() } } -#[derive(Clone, Copy, Debug)] -pub struct LazyRawTextStruct_1_0<'top> { - pub(crate) value: LazyRawTextValue_1_0<'top>, -} - -impl<'top> LazyContainerPrivate<'top, TextEncoding_1_0> for LazyRawTextStruct_1_0<'top> { - fn from_value(value: LazyRawTextValue_1_0<'top>) -> Self { - LazyRawTextStruct_1_0 { value } - } -} - -impl<'top> LazyRawContainer<'top, TextEncoding_1_0> for LazyRawTextStruct_1_0<'top> { - fn as_value(&self) -> ::Value<'top> { - self.value - } -} - -impl<'top> LazyRawStruct<'top, TextEncoding_1_0> for LazyRawTextStruct_1_0<'top> { - type Iterator = RawTextStructIterator_1_0<'top>; - - fn annotations(&self) -> RawTextAnnotationsIterator<'top> { - self.value.annotations() - } - - fn iter(&self) -> Self::Iterator { - // Make an iterator over the input bytes that follow the initial `{`; account for - // a leading annotations sequence. 
- let struct_contents_start = self.value.encoded_value.data_offset() + 1; - RawTextStructIterator_1_0::new(self.value.input.slice_to_end(struct_contents_start)) - } -} - -impl<'top> IntoIterator for LazyRawTextStruct_1_0<'top> { - type Item = IonResult>; - type IntoIter = RawTextStructIterator_1_0<'top>; - - fn into_iter(self) -> Self::IntoIter { - self.iter() +impl<'top> LazyRawFieldName<'top, TextEncoding_1_1> + for LazyRawTextFieldName<'top, TextEncoding_1_1> +{ + fn read(&self) -> IonResult> { + self.matched.read() } } @@ -165,8 +120,8 @@ mod tests { fn expect_struct_range(ion_data: &str, expected: Range) -> IonResult<()> { let empty_context = EncodingContext::empty(); let context = empty_context.get_ref(); - let reader = &mut LazyRawTextReader_1_0::new(ion_data.as_bytes()); - let value = reader.next(context)?.expect_value()?; + let reader = &mut LazyRawTextReader_1_0::new(context, ion_data.as_bytes(), true); + let value = reader.next()?.expect_value()?; let actual_range = value.data_range(); assert_eq!( actual_range, expected, @@ -233,12 +188,8 @@ mod tests { for (input, field_name_ranges) in tests { let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); - let mut reader = LazyRawTextReader_1_0::new(input.as_bytes()); - let struct_ = reader - .next(context)? - .expect_value()? - .read()? 
- .expect_struct()?; + let mut reader = LazyRawTextReader_1_0::new(context, input.as_bytes(), true); + let struct_ = reader.next()?.expect_value()?.read()?.expect_struct()?; for (field_result, (expected_name, expected_range)) in struct_.iter().zip(field_name_ranges.iter()) { diff --git a/src/lazy/text/raw/v1_1/reader.rs b/src/lazy/text/raw/v1_1/reader.rs index 7ca801a5..6c070981 100644 --- a/src/lazy/text/raw/v1_1/reader.rs +++ b/src/lazy/text/raw/v1_1/reader.rs @@ -7,10 +7,10 @@ use std::ops::Range; use crate::lazy::any_encoding::IonEncoding; use crate::lazy::decoder::private::LazyContainerPrivate; use crate::lazy::decoder::{ - Decoder, HasRange, HasSpan, LazyRawContainer, LazyRawFieldExpr, LazyRawFieldName, - LazyRawReader, LazyRawSequence, LazyRawStruct, LazyRawValue, LazyRawValueExpr, + Decoder, HasRange, HasSpan, LazyRawContainer, LazyRawFieldExpr, LazyRawReader, LazyRawStruct, + LazyRawValue, LazyRawValueExpr, }; -use crate::lazy::encoding::TextEncoding_1_1; +use crate::lazy::encoding::{TextEncoding, TextEncoding_1_1}; use crate::lazy::expanded::macro_evaluator::RawEExpression; use crate::lazy::expanded::macro_table::ION_1_1_SYSTEM_MACROS; use crate::lazy::expanded::EncodingContextRef; @@ -18,90 +18,84 @@ use crate::lazy::raw_stream_item::{EndPosition, LazyRawStreamItem, RawStreamItem use crate::lazy::span::Span; use crate::lazy::streaming_raw_reader::RawReaderState; use crate::lazy::text::buffer::TextBuffer; -use crate::lazy::text::matched::{MatchedFieldName, MatchedValue}; -use crate::lazy::text::parse_result::{AddContext, ToIteratorOutput}; +use crate::lazy::text::matched::MatchedValue; +use crate::lazy::text::parse_result::AddContext; use crate::lazy::text::raw::v1_1::arg_group::{EExpArg, TextEExpArgGroup}; -use crate::lazy::text::value::{LazyRawTextValue_1_1, RawTextAnnotationsIterator}; -use crate::{v1_1, Encoding, IonResult, IonType, RawSymbolRef}; -use bumpalo::collections::Vec as BumpVec; -use nom::character::streaming::satisfy; +use 
crate::lazy::text::value::{LazyRawTextValue, RawTextAnnotationsIterator}; +use crate::{v1_1, Encoding, IonResult}; pub struct LazyRawTextReader_1_1<'data> { - input: &'data [u8], - // The offset from the beginning of the overall stream at which the `input` slice begins - stream_offset: usize, - // The offset from the beginning of `input` at which the reader is positioned - local_offset: usize, + input: TextBuffer<'data>, +} + +impl<'data> LazyRawTextReader_1_1<'data> { + pub fn context(&self) -> EncodingContextRef<'data> { + self.input.context + } } impl<'data> LazyRawReader<'data, TextEncoding_1_1> for LazyRawTextReader_1_1<'data> { - fn resume_at_offset( - data: &'data [u8], - offset: usize, - // This argument is ignored by all raw readers except LazyRawAnyReader - _encoding_hint: IonEncoding, - ) -> Self { + fn new(context: EncodingContextRef<'data>, data: &'data [u8], is_final_data: bool) -> Self { + Self::resume( + context, + RawReaderState::new(data, 0, is_final_data, IonEncoding::Text_1_1), + ) + } + + fn resume(context: EncodingContextRef<'data>, saved_state: RawReaderState<'data>) -> Self { LazyRawTextReader_1_1 { - input: data, - // `data` begins at position `offset` within some larger stream. If `data` contains - // the entire stream, this will be zero. 
- stream_offset: offset, - // Start reading from the beginning of the slice `data` - local_offset: 0, + input: TextBuffer::new_with_offset( + context, + saved_state.data(), + saved_state.offset(), + saved_state.is_final_data(), + ), } } fn save_state(&self) -> RawReaderState<'data> { RawReaderState::new( - &self.input[self.local_offset..], + self.input.bytes(), self.position(), + self.input.is_final_data(), self.encoding(), ) } - fn next<'top>( - &'top mut self, - context: EncodingContextRef<'top>, - ) -> IonResult> - where - 'data: 'top, - { - let input = TextBuffer::new_with_offset( - context, - &self.input[self.local_offset..], - self.stream_offset + self.local_offset, - ); - let (buffer_after_whitespace, _whitespace) = input + fn next(&mut self) -> IonResult> { + let _whitespace = self + .input .match_optional_comments_and_whitespace() - .with_context("reading v1.1 whitespace/comments at the top level", input)?; - if buffer_after_whitespace.is_empty() { + .with_context( + "reading v1.1 whitespace/comments at the top level", + self.input, + )?; + if self.input.is_empty() { return Ok(RawStreamItem::EndOfStream(EndPosition::new( TextEncoding_1_1.encoding(), - buffer_after_whitespace.offset(), + self.input.offset(), ))); } // Consume any trailing whitespace that followed this item. Doing this allows us to check // whether this was the last item in the buffer by testing `buffer.is_empty()` afterward. 
- let (buffer_after_item, matched_item) = buffer_after_whitespace + let matched_item = self + .input .match_top_level_item_1_1() - .with_context("reading a v1.1 top-level value", buffer_after_whitespace)?; + .with_context("reading a v1.1 top-level value", self.input)?; - let (buffer_after_trailing_ws, _trailing_ws) = buffer_after_item + let _trailing_ws = self + .input .match_optional_comments_and_whitespace() .with_context( "reading trailing top-level whitespace/comments in v1.1", - buffer_after_item, + self.input, )?; - - // Since we successfully matched the next value, we'll update the buffer - // so a future call to `next()` will resume parsing the remaining input. - self.local_offset = buffer_after_trailing_ws.offset() - self.stream_offset; Ok(matched_item) } fn position(&self) -> usize { - self.stream_offset + self.local_offset + self.input.offset() } fn encoding(&self) -> IonEncoding { @@ -274,137 +268,14 @@ impl EncodedTextMacroInvocation { } } -#[derive(Copy, Clone)] -pub struct LazyRawTextList_1_1<'top> { - pub(crate) value: LazyRawTextValue_1_1<'top>, -} - -impl Debug for LazyRawTextList_1_1<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "[")?; - for value in self.iter() { - write!(f, "{:?}, ", value?.expect_value()?.read()?)?; - } - write!(f, "]").unwrap(); - - Ok(()) - } -} - #[derive(Debug, Copy, Clone)] -pub struct RawTextListIterator_1_1<'top> { - input: TextBuffer<'top>, - // If this iterator has returned an error, it should return `None` forever afterward - has_returned_error: bool, -} - -impl<'top> RawTextListIterator_1_1<'top> { - pub(crate) fn new(input: TextBuffer<'top>) -> Self { - Self { - input, - has_returned_error: false, - } - } -} - -/// Wraps a [`RawTextListIterator_1_1`] (which parses the body of a list) and caches the child -/// expressions the iterator yields along the way. Finally, returns a `Range` representing -/// the span of input bytes that the list occupies. 
-pub(crate) struct TextListSpanFinder_1_1<'top> { - pub(crate) allocator: &'top bumpalo::Bump, - pub(crate) iterator: RawTextListIterator_1_1<'top>, -} - -impl<'top> TextListSpanFinder_1_1<'top> { - pub(crate) fn find_span( - &self, - ) -> IonResult<( - Range, - &'top [LazyRawValueExpr<'top, TextEncoding_1_1>], - )> { - // The input has already skipped past the opening delimiter. - let start = self.iterator.input.offset() - 1; - let mut child_expr_cache = BumpVec::new_in(self.allocator); - for expr_result in self.iterator { - let expr = expr_result?; - child_expr_cache.push(expr); - } - - let end = child_expr_cache - .last() - .map(|e| e.range().end) - .unwrap_or(self.iterator.input.offset()); - let input_after_last_expr = self - .iterator - .input - .slice_to_end(end - self.iterator.input.offset()); - - let (mut input_after_ws, _ws) = input_after_last_expr - .match_optional_comments_and_whitespace() - .with_context("seeking the end of a list", input_after_last_expr)?; - // Skip an optional comma and more whitespace - if input_after_ws.bytes().first() == Some(&b',') { - (input_after_ws, _) = input_after_ws - .slice_to_end(1) - .match_optional_comments_and_whitespace() - .with_context("skipping a v1.1 list's trailing comma", input_after_ws)?; - } - let (input_after_end, _end_delimiter) = satisfy(|c| c == ']')(input_after_ws) - .with_context("seeking the closing delimiter of a list", input_after_ws)?; - let end = input_after_end.offset(); - - let span = start..end; - Ok((span, child_expr_cache.into_bump_slice())) - } - pub fn new(allocator: &'top bumpalo::Bump, iterator: RawTextListIterator_1_1<'top>) -> Self { - Self { - allocator, - iterator, - } - } -} - -#[derive(Copy, Clone)] -pub struct LazyRawTextSExp_1_1<'top> { - pub(crate) value: LazyRawTextValue_1_1<'top>, -} - -impl Debug for LazyRawTextSExp_1_1<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "(")?; - for value in self.iter() { - write!(f, "{:?} ", 
value?.expect_value()?.read()?)?; - } - write!(f, ")").unwrap(); - - Ok(()) - } -} - -#[derive(Debug, Copy, Clone)] -pub struct RawTextSExpIterator_1_1<'top> { - input: TextBuffer<'top>, - // If this iterator has returned an error, it should return `None` forever afterwards - has_returned_error: bool, -} - -impl<'top> RawTextSExpIterator_1_1<'top> { - pub(crate) fn new(input: TextBuffer<'top>) -> Self { - Self { - input, - has_returned_error: false, - } - } -} - -#[derive(Debug, Copy, Clone)] -pub struct RawTextSequenceCacheIterator_1_1<'top> { - child_exprs: &'top [LazyRawValueExpr<'top, TextEncoding_1_1>], +pub struct RawTextSequenceCacheIterator<'top, E: TextEncoding<'top>> { + child_exprs: &'top [LazyRawValueExpr<'top, E>], index: usize, } -impl<'top> RawTextSequenceCacheIterator_1_1<'top> { - pub fn new(child_exprs: &'top [LazyRawValueExpr<'top, TextEncoding_1_1>]) -> Self { +impl<'top, E: TextEncoding<'top>> RawTextSequenceCacheIterator<'top, E> { + pub fn new(child_exprs: &'top [LazyRawValueExpr<'top, E>]) -> Self { Self { child_exprs, index: 0, @@ -412,8 +283,8 @@ impl<'top> RawTextSequenceCacheIterator_1_1<'top> { } } -impl<'top> Iterator for RawTextSequenceCacheIterator_1_1<'top> { - type Item = IonResult>; +impl<'top, E: TextEncoding<'top>> Iterator for RawTextSequenceCacheIterator<'top, E> { + type Item = IonResult>; fn next(&mut self) -> Option { let next_expr = self.child_exprs.get(self.index)?; @@ -453,155 +324,12 @@ impl<'top> Iterator for TextEExpArgsIterator_1_1<'top> { } } -/// Wraps a [`RawTextSExpIterator_1_1`] (which parses the body of a sexp) and caches the child -/// expressions the iterator yields along the way. Finally, returns a `Range` representing -/// the span of input bytes that the sexp occupies. 
-pub(crate) struct TextSExpSpanFinder_1_1<'top> { - pub(crate) allocator: &'top bumpalo::Bump, - pub(crate) iterator: RawTextSExpIterator_1_1<'top>, -} - -impl<'top> TextSExpSpanFinder_1_1<'top> { - pub fn new(allocator: &'top bumpalo::Bump, iterator: RawTextSExpIterator_1_1<'top>) -> Self { - Self { - allocator, - iterator, - } - } - - /// Scans ahead to find the end of this s-expression and reports the input span that it occupies. - /// As it scans, it records lazy references to the S-expression's child expressions. - /// - /// The `initial_bytes_skipped` parameter indicates how many bytes of input that represented the - /// beginning of the expression are not in the buffer. For plain s-expressions, this will always - /// be `1` as they begin with a single open parenthesis `(`. For e-expressions (which are used - /// to invoke macros from the data stream), it will always be a minimum of `3`: two bytes for - /// the opening `(:` and at least one for the macro identifier. (For example: `(:foo`.) - pub(crate) fn find_span( - &self, - initial_bytes_skipped: usize, - ) -> IonResult<( - Range, - &'top [LazyRawValueExpr<'top, TextEncoding_1_1>], - )> { - // The input has already skipped past the opening delimiter. 
- let start = self.iterator.input.offset() - initial_bytes_skipped; - let mut child_expr_cache = BumpVec::new_in(self.allocator); - - for expr_result in self.iterator { - let expr = expr_result?; - child_expr_cache.push(expr); - } - - let end = child_expr_cache - .last() - .map(|e| e.range().end) - .unwrap_or(self.iterator.input.offset()); - let input_after_last_expr = self - .iterator - .input - .slice_to_end(end - self.iterator.input.offset()); - - let (input_after_ws, _ws) = input_after_last_expr - .match_optional_comments_and_whitespace() - .with_context("seeking the end of a sexp", input_after_last_expr)?; - let (input_after_end, _end_delimiter) = satisfy(|c| c == ')')(input_after_ws) - .with_context("seeking the closing delimiter of a sexp", input_after_ws)?; - let end = input_after_end.offset(); - - let range = start..end; - Ok((range, child_expr_cache.into_bump_slice())) - } -} - -impl<'top> LazyContainerPrivate<'top, TextEncoding_1_1> for LazyRawTextSExp_1_1<'top> { - fn from_value(value: LazyRawTextValue_1_1<'top>) -> Self { - LazyRawTextSExp_1_1 { value } - } -} - -impl<'top> LazyRawContainer<'top, TextEncoding_1_1> for LazyRawTextSExp_1_1<'top> { - fn as_value(&self) -> ::Value<'top> { - self.value - } -} - -impl<'top> LazyRawSequence<'top, TextEncoding_1_1> for LazyRawTextSExp_1_1<'top> { - type Iterator = RawTextSequenceCacheIterator_1_1<'top>; - - fn annotations(&self) -> RawTextAnnotationsIterator<'top> { - self.value.annotations() - } - - fn ion_type(&self) -> IonType { - self.value.ion_type() - } - - fn iter(&self) -> Self::Iterator { - let MatchedValue::SExp(child_exprs) = self.value.encoded_value.matched() else { - unreachable!("s-expression contained a matched value of the wrong type") - }; - RawTextSequenceCacheIterator_1_1::new(child_exprs) - } -} - -impl<'top> Iterator for RawTextSExpIterator_1_1<'top> { - type Item = IonResult>; - - fn next(&mut self) -> Option { - if self.has_returned_error { - return None; - } - match 
self.input.match_sexp_value_1_1() { - Ok((remaining, Some(value))) => { - self.input = remaining; - Some(Ok(value)) - } - Ok((_remaining, None)) => None, - Err(e) => { - self.has_returned_error = true; - e.with_context("reading the next sexp value", self.input) - .transpose() - } - } - } -} - -#[derive(Debug, Copy, Clone)] -pub struct LazyRawTextFieldName_1_1<'top> { - matched: MatchedFieldName<'top>, -} - -impl<'top> LazyRawTextFieldName_1_1<'top> { - pub(crate) fn new(matched: MatchedFieldName<'top>) -> Self { - Self { matched } - } -} - -impl<'top> HasSpan<'top> for LazyRawTextFieldName_1_1<'top> { - fn span(&self) -> Span<'top> { - self.matched.span() - } -} - -impl HasRange for LazyRawTextFieldName_1_1<'_> { - fn range(&self) -> Range { - self.matched.range() - } -} - -impl<'top> LazyRawFieldName<'top, TextEncoding_1_1> for LazyRawTextFieldName_1_1<'top> { - fn read(&self) -> IonResult> { - self.matched.read() - } -} - #[derive(Copy, Clone)] -pub struct LazyRawTextStruct_1_1<'top> { - pub(crate) value: LazyRawTextValue_1_1<'top>, +pub struct LazyRawTextStruct<'top, E: TextEncoding<'top>> { + pub(crate) value: LazyRawTextValue<'top, E>, } -impl Debug for LazyRawTextStruct_1_1<'_> { +impl<'top, E: TextEncoding<'top>> Debug for LazyRawTextStruct<'top, E> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "{{")?; for field_result in self.iter() { @@ -626,28 +354,13 @@ impl Debug for LazyRawTextStruct_1_1<'_> { } #[derive(Debug, Copy, Clone)] -pub struct RawTextStructIterator_1_1<'top> { - input: TextBuffer<'top>, - has_returned_error: bool, -} - -impl<'top> RawTextStructIterator_1_1<'top> { - pub(crate) fn new(input: TextBuffer<'top>) -> Self { - Self { - input, - has_returned_error: false, - } - } -} - -#[derive(Debug, Copy, Clone)] -pub struct RawTextStructCacheIterator_1_1<'top> { - field_exprs: &'top [LazyRawFieldExpr<'top, TextEncoding_1_1>], +pub struct RawTextStructCacheIterator<'top, E: TextEncoding<'top>> { + field_exprs: &'top 
[LazyRawFieldExpr<'top, E>], index: usize, } -impl<'top> RawTextStructCacheIterator_1_1<'top> { - pub fn new(field_exprs: &'top [LazyRawFieldExpr<'top, TextEncoding_1_1>]) -> Self { +impl<'top, E: TextEncoding<'top>> RawTextStructCacheIterator<'top, E> { + pub fn new(field_exprs: &'top [LazyRawFieldExpr<'top, E>]) -> Self { Self { field_exprs, index: 0, @@ -655,8 +368,8 @@ impl<'top> RawTextStructCacheIterator_1_1<'top> { } } -impl<'top> Iterator for RawTextStructCacheIterator_1_1<'top> { - type Item = IonResult>; +impl<'top, E: TextEncoding<'top>> Iterator for RawTextStructCacheIterator<'top, E> { + type Item = IonResult>; fn next(&mut self) -> Option { let next_expr = self.field_exprs.get(self.index)?; @@ -666,78 +379,20 @@ impl<'top> Iterator for RawTextStructCacheIterator_1_1<'top> { } } -// ===== Trait implementations ===== - -impl<'top> LazyContainerPrivate<'top, TextEncoding_1_1> for LazyRawTextList_1_1<'top> { - fn from_value(value: LazyRawTextValue_1_1<'top>) -> Self { - LazyRawTextList_1_1 { value } +impl<'top, E: TextEncoding<'top>> LazyContainerPrivate<'top, E> for LazyRawTextStruct<'top, E> { + fn from_value(value: LazyRawTextValue<'top, E>) -> Self { + LazyRawTextStruct { value } } } -impl<'top> LazyRawContainer<'top, TextEncoding_1_1> for LazyRawTextList_1_1<'top> { - fn as_value(&self) -> LazyRawTextValue_1_1<'top> { +impl<'top, E: TextEncoding<'top>> LazyRawContainer<'top, E> for LazyRawTextStruct<'top, E> { + fn as_value(&self) -> ::Value<'top> { self.value } } -impl<'top> LazyRawSequence<'top, TextEncoding_1_1> for LazyRawTextList_1_1<'top> { - type Iterator = RawTextSequenceCacheIterator_1_1<'top>; - - fn annotations(&self) -> RawTextAnnotationsIterator<'top> { - self.value.annotations() - } - - fn ion_type(&self) -> IonType { - self.value.ion_type() - } - - fn iter(&self) -> Self::Iterator { - let MatchedValue::List(child_exprs) = self.value.encoded_value.matched() else { - unreachable!("list contained a matched value of the wrong type") - }; - 
RawTextSequenceCacheIterator_1_1::new(child_exprs) - } -} - -impl<'top> Iterator for RawTextListIterator_1_1<'top> { - type Item = IonResult>; - - fn next(&mut self) -> Option { - if self.has_returned_error { - return None; - } - match self.input.match_list_value_1_1() { - Ok((remaining, Some(value_expr))) => { - self.input = remaining; - Some(Ok(value_expr)) - } - Ok((_remaining, None)) => { - // Don't update `remaining` so subsequent calls will continue to return None - None - } - Err(e) => { - self.has_returned_error = true; - e.with_context("reading the next list value", self.input) - .transpose() - } - } - } -} - -impl<'top> LazyContainerPrivate<'top, TextEncoding_1_1> for LazyRawTextStruct_1_1<'top> { - fn from_value(value: LazyRawTextValue_1_1<'top>) -> Self { - LazyRawTextStruct_1_1 { value } - } -} - -impl<'top> LazyRawContainer<'top, TextEncoding_1_1> for LazyRawTextStruct_1_1<'top> { - fn as_value(&self) -> ::Value<'top> { - self.value - } -} - -impl<'top> LazyRawStruct<'top, TextEncoding_1_1> for LazyRawTextStruct_1_1<'top> { - type Iterator = RawTextStructCacheIterator_1_1<'top>; +impl<'top, E: TextEncoding<'top>> LazyRawStruct<'top, E> for LazyRawTextStruct<'top, E> { + type Iterator = RawTextStructCacheIterator<'top, E>; fn annotations(&self) -> RawTextAnnotationsIterator<'top> { self.value.annotations() @@ -747,87 +402,7 @@ impl<'top> LazyRawStruct<'top, TextEncoding_1_1> for LazyRawTextStruct_1_1<'top> let MatchedValue::Struct(field_exprs) = self.value.encoded_value.matched() else { unreachable!("struct contained a matched value of the wrong type") }; - RawTextStructCacheIterator_1_1::new(field_exprs) - } -} - -impl<'top> Iterator for RawTextStructIterator_1_1<'top> { - type Item = IonResult>; - - fn next(&mut self) -> Option { - if self.has_returned_error { - return None; - } - match self.input.match_struct_field_1_1() { - Ok((remaining_input, Some(field))) => { - self.input = remaining_input; - Some(Ok(field)) - } - Ok((_, None)) => None, - 
Err(e) => { - self.has_returned_error = true; - e.with_context("reading the next struct field", self.input) - .transpose() - } - } - } -} - -/// Wraps a [`RawTextStructIterator_1_1`] (which parses the body of a struct) and caches the field -/// expressions the iterator yields along the way. Finally, returns a `Range` representing -/// the span of input bytes that the struct occupies. -pub(crate) struct TextStructSpanFinder_1_1<'top> { - pub(crate) allocator: &'top bumpalo::Bump, - pub(crate) iterator: RawTextStructIterator_1_1<'top>, -} - -impl<'top> TextStructSpanFinder_1_1<'top> { - pub fn new(allocator: &'top bumpalo::Bump, iterator: RawTextStructIterator_1_1<'top>) -> Self { - Self { - allocator, - iterator, - } - } - - /// Scans ahead to find the end of this struct and reports the input span that it occupies. - /// As it scans, it records lazy references to the struct's field expressions. - pub(crate) fn find_span( - &self, - ) -> IonResult<( - Range, - &'top [LazyRawFieldExpr<'top, TextEncoding_1_1>], - )> { - // The input has already skipped past the opening delimiter. 
- let start = self.iterator.input.offset() - 1; - let mut child_expr_cache = BumpVec::new_in(self.allocator); - for expr_result in self.iterator { - let expr = expr_result?; - child_expr_cache.push(expr); - } - - let end = child_expr_cache - .last() - .map(|e| e.range().end) - .unwrap_or(start + 1); - let input_after_last_field_expr = self - .iterator - .input - .slice_to_end(end - self.iterator.input.offset()); - - let (mut input_after_ws, _ws) = input_after_last_field_expr - .match_optional_comments_and_whitespace() - .with_context("seeking the end of a struct", input_after_last_field_expr)?; - // Skip an optional comma and more whitespace - if input_after_ws.bytes().first() == Some(&b',') { - (input_after_ws, _) = input_after_ws - .slice_to_end(1) - .match_optional_comments_and_whitespace() - .with_context("skipping a struct's trailing comma", input_after_ws)?; - } - let (input_after_end, _end_delimiter) = satisfy(|c| c == b'}' as char)(input_after_ws) - .with_context("seeking the closing delimiter of a struct", input_after_ws)?; - let end = input_after_end.offset(); - Ok((start..end, child_expr_cache.into_bump_slice())) + RawTextStructCacheIterator::new(field_exprs) } } @@ -837,17 +412,16 @@ mod tests { use crate::lazy::expanded::compiler::TemplateCompiler; use crate::lazy::expanded::EncodingContext; use crate::lazy::raw_value_ref::RawValueRef; - use crate::RawVersionMarker; + use crate::{IonType, RawVersionMarker}; use super::*; - fn expect_next<'top, 'data: 'top>( - context: EncodingContextRef<'top>, - reader: &'top mut LazyRawTextReader_1_1<'data>, - expected: RawValueRef<'top, TextEncoding_1_1>, + fn expect_next<'data>( + reader: &mut LazyRawTextReader_1_1<'data>, + expected: RawValueRef<'data, TextEncoding_1_1>, ) { let lazy_value = reader - .next(context) + .next() .expect("advancing the reader failed") .expect_value() .expect("expected a value"); @@ -875,21 +449,16 @@ mod tests { let macro_quux = TemplateCompiler::compile_from_source(context.get_ref(), 
"(macro quux (x) null)")?; context.macro_table.add_template_macro(macro_quux)?; - let reader = &mut LazyRawTextReader_1_1::new(data.as_bytes()); - let context = context.get_ref(); + let reader = &mut LazyRawTextReader_1_1::new(context.get_ref(), data.as_bytes(), true); // $ion_1_1 - assert_eq!(reader.next(context)?.expect_ivm()?.major_minor(), (1, 1)); + assert_eq!(reader.next()?.expect_ivm()?.major_minor(), (1, 1)); // "foo" - expect_next(context, reader, RawValueRef::String("foo".into())); + expect_next(reader, RawValueRef::String("foo".into())); // bar - expect_next(context, reader, RawValueRef::Symbol("bar".into())); + expect_next(reader, RawValueRef::Symbol("bar".into())); // (baz null.string) - let sexp = reader - .next(context)? - .expect_value()? - .read()? - .expect_sexp()?; + let sexp = reader.next()?.expect_value()?.read()?.expect_sexp()?; let mut children = sexp.iter(); assert_eq!( children.next().unwrap()?.expect_value()?.read()?, @@ -901,10 +470,10 @@ mod tests { ); assert!(children.next().is_none()); // (:quux quuz) - let macro_invocation = reader.next(context)?.expect_eexp()?; + let macro_invocation = reader.next()?.expect_eexp()?; assert_eq!(macro_invocation.id, MacroIdRef::LocalName("quux")); - expect_next(context, reader, RawValueRef::Int(77.into())); - expect_next(context, reader, RawValueRef::Bool(false)); + expect_next(reader, RawValueRef::Int(77.into())); + expect_next(reader, RawValueRef::Bool(false)); Ok(()) } } diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs index 3a1ead49..c9c8d62b 100644 --- a/src/lazy/text/value.rs +++ b/src/lazy/text/value.rs @@ -1,10 +1,5 @@ #![allow(non_camel_case_types)] -use std::fmt; -use std::fmt::{Debug, Formatter}; -use std::marker::PhantomData; -use std::ops::Range; - use crate::lazy::decoder::private::LazyContainerPrivate; use crate::lazy::decoder::{Decoder, HasRange, HasSpan, LazyRawValue, RawVersionMarker}; use crate::lazy::encoding::{TextEncoding, TextEncoding_1_0, TextEncoding_1_1}; @@ -13,6 
+8,10 @@ use crate::lazy::span::Span; use crate::lazy::text::buffer::TextBuffer; use crate::lazy::text::encoded_value::EncodedTextValue; use crate::{IonEncoding, IonResult, IonType, RawSymbolRef}; +use std::fmt; +use std::fmt::{Debug, Formatter}; +use std::marker::PhantomData; +use std::ops::Range; /// A value that has been identified in the text input stream but whose data has not yet been read. /// @@ -235,6 +234,7 @@ pub struct RawTextAnnotationsIterator<'data> { impl<'top> RawTextAnnotationsIterator<'top> { pub(crate) fn new(input: TextBuffer<'top>) -> Self { + debug_assert!(input.is_final_data()); RawTextAnnotationsIterator { input, has_returned_error: false, @@ -252,13 +252,10 @@ impl<'top> Iterator for RawTextAnnotationsIterator<'top> { // Match the first annotation in the input. In order for this iterator to be created, // the parser already successfully matched this input once before, so we know it will succeed. - use nom::Parser; - let (remaining, (symbol, span)) = TextBuffer::match_annotation - .parse(self.input) + use winnow::Parser; + let (symbol, matched_input) = TextBuffer::match_annotation + .parse_next(&mut self.input) .expect("annotations were already matched successfully by this parser"); - let matched_input = self - .input - .slice(span.start - self.input.offset(), span.len()); let text = match symbol.read(self.input.context.allocator(), matched_input) { Ok(text) => text, Err(e) => { @@ -266,7 +263,6 @@ impl<'top> Iterator for RawTextAnnotationsIterator<'top> { return Some(Err(e)); } }; - self.input = remaining; Some(Ok(text)) } } @@ -283,7 +279,7 @@ mod tests { fn test(input: &str) -> IonResult<()> { let encoding_context = EncodingContext::empty(); let context = encoding_context.get_ref(); - let input = TextBuffer::new(context, input.as_bytes()); + let input = TextBuffer::new(context, input.as_bytes(), true); let mut iter = RawTextAnnotationsIterator::new(input); assert_eq!(iter.next().unwrap()?, RawSymbolRef::Text("foo")); 
assert_eq!(iter.next().unwrap()?, RawSymbolRef::Text("bar")); diff --git a/src/lazy/value_ref.rs b/src/lazy/value_ref.rs index f4c96e05..b49cf6ee 100644 --- a/src/lazy/value_ref.rs +++ b/src/lazy/value_ref.rs @@ -234,7 +234,7 @@ impl<'top, D: Decoder> ValueRef<'top, D> { if let ValueRef::SExp(s) = self { Ok(s) } else { - IonResult::decoding_error("expected a sexp") + IonResult::decoding_error(format!("expected an s-expression but found a(n) {self:?}")) } } diff --git a/src/text/text_formatter.rs b/src/text/text_formatter.rs index ebef4d8c..38a10338 100644 --- a/src/text/text_formatter.rs +++ b/src/text/text_formatter.rs @@ -239,7 +239,7 @@ impl fmt::Write for IoValueFormatter { } } -impl FmtValueFormatter<'_, W> { +impl FmtValueFormatter<'_, W> { /// Returns `true` if the provided `token`'s text is an 'identifier'. That is, the text starts /// with a `$`, `_` or ASCII letter and is followed by a sequence of `$`, `_`, or ASCII letters /// and numbers. Examples: diff --git a/tests/detect_incomplete_text.rs b/tests/detect_incomplete_text.rs index 56be95e5..32c92c29 100644 --- a/tests/detect_incomplete_text.rs +++ b/tests/detect_incomplete_text.rs @@ -1,6 +1,6 @@ #![cfg(feature = "experimental-reader-writer")] -use crate::ion_tests::{DataStraw, SkipList, ELEMENT_GLOBAL_SKIP_LIST}; +use crate::ion_tests::{DataStraw, ELEMENT_GLOBAL_SKIP_LIST}; use ion_rs::{ AnyEncoding, Element, ElementReader, IonData, IonError, IonResult, IonStream, Reader, }; @@ -13,29 +13,12 @@ use test_generator::test_resources; mod ion_tests; -// These tests are all failing because multipart long strings are not handled correctly when the -// "part" boundary happens to also fall on a point where the reader needs to refill the input buffer. 
-const INCOMPLETE_LONG_STRING_SKIP_LIST: SkipList = &[ - "ion-tests/iontestdata/good/equivs/localSymbolTableAppend.ion", - "ion-tests/iontestdata/good/equivs/localSymbolTableNullSlots.ion", - "ion-tests/iontestdata/good/equivs/longStringsWithComments.ion", - "ion-tests/iontestdata/good/equivs/strings.ion", - "ion-tests/iontestdata/good/lists.ion", - "ion-tests/iontestdata/good/strings.ion", - "ion-tests/iontestdata/good/stringsWithWhitespace.ion", - "ion-tests/iontestdata/good/strings_cr_nl.ion", - "ion-tests/iontestdata/good/strings2.ion", - "ion-tests/iontestdata/good/structs.ion", - "ion-tests/iontestdata/good/strings_nl.ion", -]; - // A copy of the `ELEMENT_GLOBAL_SKIP_LIST` in which each file name has been canonicalized for the // current host machine. This makes it possible to compare names in the list with names of files // on the host without worrying about differences in (for example) path separators. static CANONICAL_FILE_NAMES: LazyLock> = LazyLock::new(|| { ELEMENT_GLOBAL_SKIP_LIST .iter() - .chain(INCOMPLETE_LONG_STRING_SKIP_LIST.iter()) .filter_map(|filename| { // Canonicalize the skip list file names so they're in the host OS' preferred format. // This involves looking up the actual file; if canonicalization fails, the file could diff --git a/tests/ion_hash_tests.rs b/tests/ion_hash_tests.rs index b2da287e..bb088ad8 100644 --- a/tests/ion_hash_tests.rs +++ b/tests/ion_hash_tests.rs @@ -75,11 +75,7 @@ fn without_trailing_zeros(data: &[u8]) -> &[u8] { return data; } - let index = data - .as_ref() - .iter() - .rposition(|byte| *byte != 0x00) - .unwrap(); + let index = data.iter().rposition(|byte| *byte != 0x00).unwrap(); &data[0..=index] }