From 6e96e11cbf080393f20aee92a26984321fd382b3 Mon Sep 17 00:00:00 2001 From: Martin Haug Date: Fri, 12 Jul 2024 17:15:36 +0200 Subject: [PATCH] Add methods to check for PDF/A compliance --- src/color.rs | 15 ++++++++ src/content.rs | 20 ++++++++++- src/font.rs | 31 ++++++++++++++++ src/lib.rs | 23 ++++++++++-- src/object.rs | 49 +++++++++++++++++++++++++- src/structure.rs | 92 +++++++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 225 insertions(+), 5 deletions(-) diff --git a/src/color.rs b/src/color.rs index c5e8090..13a465d 100644 --- a/src/color.rs +++ b/src/color.rs @@ -632,6 +632,21 @@ impl<'a> DeviceN<'a> { DeviceNAttrs::start(self.array.push()) } + + /// Finish writing the `DeviceN` color space array while checking some + /// provisions of PDF/A-2 clause 6.2.4.4 and 6.1.13. + pub fn finish_pdfa(self) -> PdfaResult<()> { + if self.array.len() > 8 { + return Err(PdfaError::TooManyColorants(self.array.len() as usize)); + } + + if !self.has_alternate || !self.has_tint { + return Err(PdfaError::MalformedDeviceNArray); + } + + self.finish(); + Ok(()) + } } /// Writer for a _DeviceN attributes dictionary_. PDF 1.6+. diff --git a/src/content.rs b/src/content.rs index 354ff32..2b605da 100644 --- a/src/content.rs +++ b/src/content.rs @@ -3,6 +3,7 @@ use super::*; /// A builder for a content stream. pub struct Content { buf: Vec<u8>, + q_nesting: usize, } /// Core methods. @@ -16,7 +17,7 @@ impl Content { /// Create a new content stream with the specified initial buffer capacity. pub fn with_capacity(capacity: usize) -> Self { - Self { buf: Vec::with_capacity(capacity) } + Self { buf: Vec::with_capacity(capacity), q_nesting: 0 } } /// Start writing an arbitrary operation. @@ -243,13 +244,30 @@ impl Content { #[inline] pub fn save_state(&mut self) -> &mut Self { self.op("q"); + + // Saturating is okay here since we would have returned an error way + // before if the nesting was checked. 
+ self.q_nesting = self.q_nesting.saturating_add(1); self } + /// `q`: Save the graphics state on the stack while checking that the + /// nesting limit in PDF/A-2 clause 6.1.13 is respected. + #[inline] + pub fn save_state_checked(&mut self) -> PdfaResult<&mut Self> { + if self.q_nesting >= 28 { + return Err(PdfaError::OverlyNestedGraphicsState); + } + + Ok(self.save_state()) + } + /// `Q`: Restore the graphics state from the stack. #[inline] pub fn restore_state(&mut self) -> &mut Self { self.op("Q"); + + self.q_nesting = self.q_nesting.saturating_sub(1); self } diff --git a/src/font.rs b/src/font.rs index 398aa1d..6c5acde 100644 --- a/src/font.rs +++ b/src/font.rs @@ -939,17 +939,46 @@ where self.pair_with_multiple(glyph, [codepoint]); } + /// Add a mapping from a glyph ID to a codepoint, checking for codepoints + /// that are invalid in some PDF/A profiles. + pub fn pair_pdfa(&mut self, glyph: G, codepoint: char) -> PdfaResult<()> { + self.pair_with_multiple_pdfa(glyph, [codepoint]) + } + /// Add a mapping from a glyph ID to multiple codepoints. pub fn pair_with_multiple( &mut self, glyph: G, codepoints: impl IntoIterator<Item = char>, ) { + self.pair_with_multiple_impl(glyph, codepoints, false).unwrap(); + } + + /// Add a mapping from a glyph ID to multiple codepoints, checking for + /// codepoints that are invalid in some PDF/A profiles. 
+ pub fn pair_with_multiple_pdfa( + &mut self, + glyph: G, + codepoints: impl IntoIterator<Item = char>, + ) -> PdfaResult<()> { + self.pair_with_multiple_impl(glyph, codepoints, true) + } + + fn pair_with_multiple_impl( + &mut self, + glyph: G, + codepoints: impl IntoIterator<Item = char>, + check_pdfa: bool, + ) -> PdfaResult<()> { self.mappings.push(b'<'); glyph.push(&mut self.mappings); self.mappings.extend(b"> <"); for c in codepoints { + if check_pdfa && (c == '\u{0}' || c == '\u{feff}' || c == '\u{fffe}') { + return Err(PdfaError::InvalidCMapCodepoint); + } + for &mut part in c.encode_utf16(&mut [0; 2]) { self.mappings.push_hex_u16(part); } @@ -962,6 +991,8 @@ where if self.count >= 100 { self.flush_range(); } + + Ok(()) } /// Finish building the character map. diff --git a/src/lib.rs b/src/lib.rs index 4bfb456..4ab22a7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -177,13 +177,15 @@ pub mod types { pub use object::Predictor; pub use renditions::{MediaClipType, RenditionType, TempFileType}; pub use structure::{ - Direction, NumberingStyle, OutlineItemFlags, PageLayout, PageMode, StructRole, - TabOrder, TrappingStatus, + Direction, NumberingStyle, OutlineItemFlags, PageLayout, PageMode, PdfaError, + PdfaResult, StructRole, TabOrder, TrappingStatus, }; pub use transitions::{TransitionAngle, TransitionStyle}; pub use xobject::SMaskInData; } +use structure::{PdfaError, PdfaResult}; + pub use self::chunk::Chunk; pub use self::content::Content; pub use self::object::{ @@ -276,6 +278,23 @@ impl Pdf { self.indirect(id).start() } + /// Write the cross-reference table and file trailer and return the + /// underlying buffer while checking the number of indirect objects and + /// whether a file ID was written for compliance with PDF/A. + /// + /// Panics if any indirect reference id was used twice. 
+ pub fn finish_pdfa(self) -> PdfaResult<Vec<u8>> { + if self.chunk.offsets.len() > 8388607 { + return Err(PdfaError::TooManyIndirectObjects(self.chunk.offsets.len())); + } + + if self.file_id.is_none() { + return Err(PdfaError::MissingFileID); + } + + Ok(self.finish()) + } + /// Write the cross-reference table and file trailer and return the /// underlying buffer. /// diff --git a/src/object.rs b/src/object.rs index ea34a1f..e85dc4f 100644 --- a/src/object.rs +++ b/src/object.rs @@ -3,6 +3,8 @@ use std::marker::PhantomData; use std::mem::ManuallyDrop; use std::num::NonZeroI32; +use structure::{PdfaError, PdfaResult}; + use super::*; /// A primitive PDF object. @@ -52,7 +54,18 @@ impl Primitive for f32 { #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] pub struct Str<'a>(pub &'a [u8]); -impl Str<'_> { +impl<'a> Str<'a> { + /// Construct a new string and check that it is no longer than 32767 bytes. + /// + /// This helps to ensure compliance with Section 6.1.8 in the PDF/A-2 spec. + pub fn pdfa(bytes: &'a [u8]) -> PdfaResult<Self> { + if bytes.len() > 32767 { + return Err(PdfaError::OverlongString(bytes.len())); + } + + Ok(Self(bytes)) + } + /// Whether the parentheses in the byte string are balanced. fn is_balanced(self) -> bool { let mut depth = 0; @@ -149,6 +162,22 @@ impl Primitive for TextStr<'_> { #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] pub struct Name<'a>(pub &'a [u8]); +impl<'a> Name<'a> { + /// Create a new name from a byte string and check that it is valid UTF-8 + /// and no longer than 127 bytes. + /// + /// This helps to ensure compliance with Section 6.1.8 in the PDF/A + /// specifications PDF/A-2, PDF/A-3, and PDF/A-4. 
+ pub fn pdfa(bytes: &'a [u8]) -> PdfaResult<Self> { + if bytes.len() > 127 { + return Err(PdfaError::OverlongName(bytes.len())); + } + + std::str::from_utf8(bytes).map_err(PdfaError::NameNotUtf8)?; + Ok(Self(bytes)) + } +} + impl Primitive for Name<'_> { fn write(self, buf: &mut Vec<u8>) { buf.reserve(1 + self.0.len()); @@ -271,6 +300,24 @@ impl Rect { Self { x1, y1, x2, y2 } } + /// Create a new rectangle that complies with the implementation limits for + /// page sizes. + #[inline] + pub fn page(x1: f32, y1: f32, x2: f32, y2: f32) -> PdfaResult<Self> { + let width = (x2 - x1).abs(); + let height = (y2 - y1).abs(); + + if !(3.0..=14400.0).contains(&width) { + return Err(PdfaError::PageWidthOutOfRange(width)); + } + + if !(3.0..=14400.0).contains(&height) { + return Err(PdfaError::PageHeightOutOfRange(height)); + } + + Ok(Self { x1, y1, x2, y2 }) + } + /// Convert this rectangle into 8 floats describing the four corners of the /// rectangle in counterclockwise order. #[inline] diff --git a/src/structure.rs b/src/structure.rs index 453d38c..5e02a4d 100644 --- a/src/structure.rs +++ b/src/structure.rs @@ -1,3 +1,5 @@ +use std::str::Utf8Error; + use crate::color::SeparationInfo; use super::*; @@ -141,9 +143,9 @@ impl<'a> Catalog<'a> { /// /// Each entry in the array is an [output intent /// dictionary.](writers::OutputIntent) - pub fn output_intents(&mut self) -> TypedArray<'_, Dict> { /// /// Must be present in PDF/X documents, encouraged in PDF/A documents. + pub fn output_intents(&mut self) -> TypedArray<'_, OutputIntent> { self.insert(Name(b"OutputIntents")).array().typed() } } @@ -1570,3 +1572,91 @@ impl<'a> Metadata<'a> { } deref!('a, Metadata<'a> => Stream<'a>, stream); + +/// A result type for operations that check for PDF/A compliance. +pub type PdfaResult<T> = Result<T, PdfaError>; + +/// Errors that pdf-writer can automatically detect when writing PDF/A files. 
+/// +/// Please note that these errors only enforce provisions of clauses 6.1.8, +/// 6.1.13, 6.1.3, and 6.2.11.7.2 of the PDF/A-2 spec. They do not enforce the +/// entire spec, so additional attention needs to be paid to write compliant +/// files. +/// +/// Integer and float implementation limits are not checked since they are +/// already enforced by the `i32` and `f32` types, respectively. +#[derive(Debug, Clone, PartialEq)] +pub enum PdfaError { + /// A string contained more than 32767 bytes. + OverlongString(usize), + /// A name object contained more than 127 bytes. + OverlongName(usize), + /// A name object was not UTF-8 decodable. + NameNotUtf8(Utf8Error), + /// The file has more than 8388607 indirect objects. + TooManyIndirectObjects(usize), + /// The graphics state was nested more than 28 levels deep. + OverlyNestedGraphicsState, + /// A DeviceN color space had more than 8 colorants. + TooManyColorants(usize), + /// The DeviceN array does not comply with clause 8.6.6.5 of ISO + /// 32000-1:2008. + MalformedDeviceNArray, + /// The file trailer is missing a file ID. + /// + /// Call [`crate::Pdf::set_file_id`] before finishing the file. + MissingFileID, + /// The CMap maps to a codepoint 0, U+FFFE, or U+FEFF. + /// + /// Only applicable to PDF/A-2u, PDF/A-2a, and similar profiles in other + /// parts. + InvalidCMapCodepoint, + /// The page width is out of range. + PageWidthOutOfRange(f32), + /// The page height is out of range. 
+ PageHeightOutOfRange(f32), +} + +impl std::fmt::Display for PdfaError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Self::OverlongString(len) => { + write!(f, "string contained {} bytes but must not exceed 32767", len) + } + Self::OverlongName(len) => { + write!(f, "name contained {} bytes but must not exceed 127", len) + } + Self::NameNotUtf8(e) => write!(f, "name was not UTF-8 decodable ({})", e), + Self::TooManyIndirectObjects(count) => write!( + f, + "file has {} indirect objects but must not exceed 8388607", + count + ), + Self::OverlyNestedGraphicsState => { + f.write_str("graphics state (q) was nested more than 28 levels deep") + } + Self::TooManyColorants(count) => write!( + f, + "DeviceN color space had {} colorants but must not exceed 8", + count + ), + Self::MalformedDeviceNArray => f.write_str("DeviceN array is malformed"), + Self::MissingFileID => f.write_str("file trailer is missing a file ID"), + Self::InvalidCMapCodepoint => { + f.write_str("CMap maps to a forbidden codepoint") + } + Self::PageWidthOutOfRange(w) if *w < 3.0 => { + write!(f, "page width {} is too small (must be at least 3)", w) + } + Self::PageWidthOutOfRange(w) => { + write!(f, "page width {} is too large (must be at most 14400)", w) + } + Self::PageHeightOutOfRange(h) if *h < 3.0 => { + write!(f, "page height {} is too small (must be at least 3)", h) + } + Self::PageHeightOutOfRange(h) => { + write!(f, "page height {} is too large (must be at most 14400)", h) + } + } + } +}