diff --git a/.dockerignore b/.dockerignore index cf10e8046c0..c46bd93573c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -95,6 +95,9 @@ blocks/**/.env .env.local .env.*.local +# Generated credentials from google-github-actions/auth +gha-creds-*.json + # macOS directory file **/.DS_Store diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c2e6b59b506..3241ab9c321 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -108,6 +108,11 @@ jobs: fail-fast: false if: needs.setup.outputs.unit-tests != '{"package":[],"include":[]}' runs-on: ubuntu-24.04 + permissions: + # Required to fetch an OIDC token, used to auth with Google Cloud Platform for @rust/chonky tests + id-token: "write" + # Maintain permission to read repo contents + contents: "read" steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -154,6 +159,15 @@ jobs: rm -rf $temp_dir echo "PDFIUM_DYNAMIC_LIB_PATH=$(pwd)/${{ matrix.directory }}/libs/" >> $GITHUB_ENV + # Sets GOOGLE_APPLICATION_CREDENTIALS in the environment, to be consumed by gcloud or client libraries + - name: Generate Google Cloud credential configuration + if: matrix.package == '@rust/chonky' + uses: google-github-actions/auth@6fc4af4b145ae7821d527454aa9bd537d1f2dc5f # v2.1.7 + with: + project_id: ${{ secrets.GOOGLE_CLOUD_HASH_PROJECT_ID }} + service_account: ${{ secrets.GOOGLE_CLOUD_VERTEX_SERVICE_ACCOUNT }} + workload_identity_provider: ${{ secrets.GOOGLE_CLOUD_IDENTITY_PROVIDER }} + - name: Install Rust toolchain if: always() && steps.tests.outputs.has-rust == 'true' uses: ./.github/actions/install-rust-toolchain @@ -184,6 +198,10 @@ jobs: continue-on-error: ${{ steps.tests.outputs.allow-failure == 'true' }} env: TEST_COVERAGE: ${{ github.event_name != 'merge_group' }} + # Variables needed for chonky tests + GOOGLE_PROJECT_ID: ${{ secrets.GOOGLE_CLOUD_HASH_PROJECT_ID }} + GOOGLE_DEFAULT_CREDENTIALS: ${{ env.GOOGLE_DEFAULT_CREDENTIALS }} # set by 
google-github-actions/auth + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} run: | turbo run test:unit --env-mode=loose --filter "${{ matrix.package }}" echo "TRIMMED_PACKAGE_NAME=$(echo "${{ matrix.package }}" | sed 's|@||g' | sed 's|/|.|g')" >> $GITHUB_ENV @@ -232,6 +250,11 @@ jobs: fail-fast: false if: needs.setup.outputs.integration-tests != '{"package":[],"include":[]}' runs-on: ubuntu-24.04 + permissions: + # Required to fetch an OIDC token, used to auth with Google Cloud Platform for @rust/chonky tests + id-token: "write" + # Maintain permission to read repo contents + contents: "read" steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -303,6 +326,15 @@ jobs: rm -rf $temp_dir echo "PDFIUM_DYNAMIC_LIB_PATH=$(pwd)/${{ matrix.directory }}/libs/" >> $GITHUB_ENV + # Sets GOOGLE_APPLICATION_CREDENTIALS in the environment, to be consumed by gcloud or client libraries + - name: Generate Google Cloud credential configuration + if: matrix.package == '@rust/chonky' + uses: google-github-actions/auth@6fc4af4b145ae7821d527454aa9bd537d1f2dc5f # v2.1.7 + with: + project_id: ${{ secrets.GOOGLE_CLOUD_HASH_PROJECT_ID }} + service_account: ${{ secrets.GOOGLE_CLOUD_VERTEX_SERVICE_ACCOUNT }} + workload_identity_provider: ${{ secrets.GOOGLE_CLOUD_IDENTITY_PROVIDER }} + - name: Install playwright if: matrix.package == '@tests/hash-playwright' uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 @@ -360,6 +392,11 @@ jobs: - name: Run tests continue-on-error: ${{ steps.tests.outputs.allow-failure == 'true' }} + env: + # Variables needed for chonky tests + GOOGLE_PROJECT_ID: ${{ secrets.GOOGLE_CLOUD_HASH_PROJECT_ID }} + GOOGLE_DEFAULT_CREDENTIALS: ${{ env.GOOGLE_DEFAULT_CREDENTIALS }} # set by google-github-actions/auth + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} run: | turbo run test:integration --env-mode=loose --filter "${{ matrix.package }}" echo "TRIMMED_PACKAGE_NAME=$(echo "${{ 
matrix.package }}" | sed 's|@||g' | sed 's|/|.|g')" >> $GITHUB_ENV diff --git a/.gitignore b/.gitignore index 8c0a51f5ad3..95f4538b03c 100644 --- a/.gitignore +++ b/.gitignore @@ -78,6 +78,9 @@ blocks/**/.env .env.local .env.*.local +# Generated credentials from google-github-actions/auth +gha-creds-*.json + # macOS directory file **/.DS_Store diff --git a/Cargo.lock b/Cargo.lock index 6b24efe1d1e..dc54a253e2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1185,11 +1185,19 @@ dependencies = [ name = "chonky" version = "0.0.0" dependencies = [ + "base64 0.22.1", + "clap", "error-stack", + "futures", "image", "insta", + "num-traits", "pdfium-render", + "reqwest", + "serde", + "serde_json", "thiserror 2.0.11", + "tokio", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index aa18b58164c..6d12d505f57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -93,6 +93,7 @@ ariadne = { version = "=0.5.0", default-features = false } aws-types = { version = "=1.3.3", default-features = false } axum = { version = "=0.7.5" } axum-core = { version = "=0.5.0" } +base64 = { version = "=0.22.1" } bumpalo = { version = "=3.16.0", default-features = false } bytes = { version = "=1.9.0" } clap_builder = { version = "=4.5.26", default-features = false, features = ["std"] } diff --git a/libs/chonky/Cargo.toml b/libs/chonky/Cargo.toml index 44a88789d0c..7c9ec09addc 100644 --- a/libs/chonky/Cargo.toml +++ b/libs/chonky/Cargo.toml @@ -15,8 +15,16 @@ edition.workspace = true error-stack = { workspace = true, public = true } # Public third-party dependencies +base64 = { workspace = true } +clap = { workspace = true, features = ["derive"] } +futures = { workspace = true, features = ["alloc"] } image = { workspace = true, public = true, features = ["png", "bmp"] } -pdfium-render = { workspace = true, public = true } +num-traits = { workspace = true } +pdfium-render = { workspace = true, features = ["thread_safe"], public = true } +reqwest = { workspace = true, features = ["json"] } +serde = { workspace = 
true, features = ["derive"] } +serde_json = { workspace = true } +tokio = { workspace = true, features = ["macros", "rt-multi-thread", "fs"] } # Private workspace dependencies diff --git a/libs/chonky/src/embedding/hugging_face_api.rs b/libs/chonky/src/embedding/hugging_face_api.rs new file mode 100644 index 00000000000..c18aecfe172 --- /dev/null +++ b/libs/chonky/src/embedding/hugging_face_api.rs @@ -0,0 +1,177 @@ +use std::path::Path; + +use error_stack::{Report, ResultExt as _}; +use reqwest::{ + Client, + header::{AUTHORIZATION, CONTENT_TYPE, HeaderMap, HeaderValue}, +}; +use serde::Deserialize; +use tokio::fs; + +use crate::ChonkyError; + +#[derive(Deserialize, Debug)] +pub struct BoundingBox { + pub xmin: f32, + pub ymin: f32, + pub xmax: f32, + pub ymax: f32, +} + +#[derive(Deserialize, Debug)] +pub struct TablePrediction { + pub score: f32, + #[serde(rename = "box")] + pub bounding_box: BoundingBox, +} + +//for now just have it as environment variable +fn get_hugging_face_token() -> Result> { + std::env::var("HUGGING_FACE_TOKEN").change_context(ChonkyError::HuggingFaceAPI) +} + +async fn get_binary_image_data( + image_path: impl AsRef + Send + Sync, +) -> Result, Report> { + fs::read(image_path) + .await + .change_context(ChonkyError::ImageError) +} + +fn extract_bounding_boxes(json_payload: &str) -> Result, Report> { + serde_json::from_str(json_payload).change_context(ChonkyError::HuggingFaceAPI) +} + +/// A function that calls `HuggingFace` Serverless Inference API to perform +/// table recognition on a given image and returns the vectors of bounding boxes of these tables +/// +/// # Errors +/// +/// [`ChonkyError::HuggingFaceAPI`] when there are HTTP request errors +pub async fn make_table_recognition_request( + image_path: impl AsRef + Send + Sync, + retry: bool, +) -> Result, Report> { + let url = "https://api-inference.huggingface.co/models/microsoft/table-transformer-detection"; + + let access_token = get_hugging_face_token()?; + let payload = 
get_binary_image_data(&image_path).await?; + + // Create a new reqwest async client + let client = Client::new(); + + // Prepare the headers + let mut headers = HeaderMap::new(); + headers.insert( + AUTHORIZATION, + HeaderValue::from_str(&format!("Bearer {access_token}")) + .change_context(ChonkyError::HuggingFaceAPI)?, + ); + headers.insert( + CONTENT_TYPE, + HeaderValue::from_static("application/octet-stream"), + ); + + headers.insert( + "x-wait-for-model", + HeaderValue::from_str(&format!("{retry}")).change_context(ChonkyError::HuggingFaceAPI)?, + ); + + // Send the POST request with payload and headers + let response = client + .post(url) + .headers(headers) + .body(payload) + .send() + .await + .change_context(ChonkyError::HuggingFaceAPI)?; + + // Check if the response status is success + + let cold_model_status = 503; + if response.status() == cold_model_status { + // call the model again allowing extra time to wait + // this should not recurse forever since 503 error + // only occurs for cold models which the new header will wait for + return Box::pin(make_table_recognition_request(image_path, true)).await; + } else if !response.status().is_success() { + return Err(Report::new(ChonkyError::HuggingFaceAPI)); + } + + // Read the response body as text + let response_text = response + .text() + .await + .change_context(ChonkyError::HuggingFaceAPI)?; + + // Process the response + extract_bounding_boxes(&response_text) + + // // this is where we would wish to provide add the retry mechanism + + // // error code when model is warm is a 503 error, we can then add x-wait-for-model:true for + // // it to work + // let url = "https://api-inference.huggingface.co/models/microsoft/table-transformer-detection"; + + // let access_token = get_hugging_face_token()?; + // let payload = get_binary_image_data(image_path)?; + + // let mut easy = Easy::new(); + // easy.url(url).change_context(ChonkyError::HuggingFaceAPI)?; + // easy.post(true) + // 
.change_context(ChonkyError::HuggingFaceAPI)?; + + // let mut headers = List::new(); + // headers + // .append(&format!("Authorization: Bearer {access_token}")) + // .change_context(ChonkyError::HuggingFaceAPI)?; + + // // we add wait for model to be true if receiving api error prev + // headers + // .append(&format!("x-wait-for-model:{retry}")) + // .change_context(ChonkyError::HuggingFaceAPI)?; + + // easy.http_headers(headers) + // .change_context(ChonkyError::HuggingFaceAPI)?; + + // easy.post_fields_copy(&payload) + // .change_context(ChonkyError::HuggingFaceAPI)?; + + // let mut response = Vec::new(); + // { + // let mut transfer = easy.transfer(); + // transfer + // .write_function(|data| { + // response.extend_from_slice(data); + // Ok(data.len()) + // }) + // .change_context(ChonkyError::HuggingFaceAPI)?; + // transfer + // .perform() + // .change_context(ChonkyError::HuggingFaceAPI)?; + // } + + // extract_bounding_boxes( + // &String::from_utf8(response).change_context(ChonkyError::HuggingFaceAPI)?, + // ) +} + +#[cfg(test)] +mod tests { + use insta::assert_snapshot; + + use super::*; + + #[tokio::test] + async fn table_recognition() -> Result<(), Report> { + let file_path = "tests/docs/table-testing.png"; + + let table_predictions = make_table_recognition_request(file_path, true).await?; + + assert_snapshot!( + "table_bounding_boxes.txt", + format!("{:?}", table_predictions) + ); + Ok(()) + } +} diff --git a/libs/chonky/src/embedding/mod.rs b/libs/chonky/src/embedding/mod.rs new file mode 100644 index 00000000000..19d02c06231 --- /dev/null +++ b/libs/chonky/src/embedding/mod.rs @@ -0,0 +1,2 @@ +pub mod hugging_face_api; +pub mod multi_modal_embedding; diff --git a/libs/chonky/src/embedding/multi_modal_embedding.rs b/libs/chonky/src/embedding/multi_modal_embedding.rs new file mode 100644 index 00000000000..f8ac1a12fe5 --- /dev/null +++ b/libs/chonky/src/embedding/multi_modal_embedding.rs @@ -0,0 +1,427 @@ +use std::path::PathBuf; + +use base64::{Engine 
as _, engine::general_purpose}; +use error_stack::{Report, ResultExt as _}; +use image::DynamicImage; +use reqwest::{ + Client, + header::{AUTHORIZATION, CONTENT_TYPE, HeaderMap, HeaderValue}, +}; +use serde_json::{Value as JsonValue, json}; + +use crate::{ + ChonkyError, DocumentEmbeddings, Embedding, ImageEmbedding, PageImageObjects, + PageImageObjectsEmbeddings, PageTableObjects, PageTableObjectsEmbeddings, StructuralEmbedding, + StructuralMetadata, TableEmbedding, TextEmbedding, +}; + +fn get_vertex_access_token() -> Result> { + Ok(String::from_utf8( + std::process::Command::new("gcloud") + .args(["auth", "print-access-token"]) + .output() + .change_context(ChonkyError::VertexAPI) + .attach_printable("Issues getting the Google Cloud Auth Token")? + .stdout, + ) + .change_context(ChonkyError::VertexAPI)? + .trim() + .to_owned()) +} + +fn base64_json(image_data: impl AsRef<[u8]>) -> JsonValue { + let base64_encoded_img = general_purpose::STANDARD.encode(image_data); + + json!({ + "instances": [ + { + "image": { + "bytesBase64Encoded": base64_encoded_img + } + } + ] + }) +} + +/// Googles Multimodal embedding text can only take 1024 characters so for now we will only truncate +/// the first 1000 characters, this function would be responsible for chunking the text appropriatly +fn text_json(text: &[String]) -> JsonValue { + //merge all text into one without seperator for now + let mut text = text.concat(); + text.truncate(1000); + json!({ + "instances": [ + { + "text": text + } + ] + }) +} + +/// Given the extracted images from the pdf, embeds them +/// +/// # Errors +/// +/// [`ChonkyError::VertexAPI`] when there are HTTP request errors +pub async fn embed_pdf_object_images( + pdf_image_extract: Vec, + project_id: &str, +) -> Result, Report> { + let mut embeddings = Vec::new(); + for page_images in pdf_image_extract { + let page_image_iter = page_images.clone().owned_iter(); + let image_embeddings = embed_screenshots(page_images.owned_iter(), project_id) + .await? 
+ .into_iter(); + + embeddings.push(PageImageObjectsEmbeddings { + _embeddings: image_embeddings + .zip(page_image_iter) + .map(|(embedding, image)| ImageEmbedding { + embedding, + _image: image, + }) + .collect(), + }); + // embeddings.push(PageImageObjectsEmbeddings { + // _embeddings: ImageEmbedding { + // embedding: embed_screenshots(page_images.iter(), project_id).await?, + // _image: page.next(), + // }, + // }); + } + Ok(embeddings.into_boxed_slice()) +} + +/// Given the screenshot of each page in pdf return its embeddings +/// +/// # Errors +/// +/// [`ChonkyError::VertexAPI`] when there are HTTP request errors +pub async fn embed_screenshots( + pdf_image_extract: impl IntoIterator + Send, + project_id: &str, +) -> Result, Report> { + let mut embeddings = Vec::new(); + for image in pdf_image_extract { + let mut buffer = Vec::new(); + let encoder = image::codecs::png::PngEncoder::new(&mut buffer); + + image + .write_with_encoder(encoder) + .change_context(ChonkyError::ImageError)?; + + // at this point we are transfering ownership of images, cannot use reference without + // cloning? 
+ embeddings.push(Embedding { + _model_used: "Google Multimodal Embeddings".into(), + embedding_vector: make_multimodal_api_request( + project_id, + Some(image.into_bytes()), + None, + ) + .await?, + }); + } + Ok(embeddings.into_boxed_slice()) +} + +/// Given the tables on each page of the pdf, embeds each pages tables seperately into a vector +/// for each page +/// +/// # Errors +/// +/// [`ChonkyError::VertexAPI`] when there are HTTP request errors +pub async fn embed_tables( + pdf_table_bounds: Vec, + project_id: &str, +) -> Result, Report> { + let mut embeddings = Vec::new(); + for page_tables in pdf_table_bounds { + let mut page_embeddings = Vec::new(); + for table in page_tables.page_table_objects { + let mut buffer = Vec::new(); + let encoder = image::codecs::png::PngEncoder::new(&mut buffer); + + table + .image + .write_with_encoder(encoder) + .change_context(ChonkyError::ImageError)?; + + page_embeddings.push(TableEmbedding { + embedding: Embedding { + _model_used: "Google Multimodal Embeddings".into(), + embedding_vector: make_multimodal_api_request(project_id, Some(buffer), None) + .await?, + }, + _table: table, + }); + + // page_embeddings + // .push(make_multimodal_api_request(project_id, Some(buffer), None).await?); + } + embeddings.push(PageTableObjectsEmbeddings { + _embeddings: page_embeddings, + }); + } + Ok(embeddings.into_boxed_slice()) +} + +/// Given the text on each page of the pdf, embeds each pages text seperately into a vector +/// +/// # Errors +/// +/// [`ChonkyError::VertexAPI`] when there are HTTP request errors +pub async fn embed_text( + pdf_text_extract: &[&[String]], + project_id: &str, +) -> Result, Report> { + let mut embeddings = Vec::new(); + for page_text in pdf_text_extract { + embeddings.push(TextEmbedding { + _embedding: Embedding { + _model_used: "Google Multimodal Embeddings".into(), + embedding_vector: make_multimodal_api_request(project_id, None, Some(page_text)) + .await?, + }, + _text: page_text.concat(), + }); + } + 
Ok(embeddings) +} + +/// A function that performs authentication with Google Vertex API and performs +/// a request to obtain multimodal embeddings given an image path +/// +/// # Errors +/// +/// [`ChonkyError::VertexAPI`] when there are HTTP request errors +/// [`ChonkyError::ImageError`] when there are errors converting to base64 encoding +pub async fn make_multimodal_api_request( + project_id: &str, + image_data: Option>, + text_data: Option<&[String]>, +) -> Result, Report> { + let url = format!( + "https://us-central1-aiplatform.googleapis.com/v1/projects/{project_id}/locations/us-central1/publishers/google/models/multimodalembedding@001:predict" + ); + + let access_token = get_vertex_access_token()?; // assuming this function is synchronous + + // Create the reqwest async client + let client = Client::new(); + + // Prepare headers + let mut headers = HeaderMap::new(); + headers.insert( + AUTHORIZATION, + HeaderValue::from_str(&format!("Bearer {access_token}")) + .change_context(ChonkyError::VertexAPI)?, + ); + headers.insert( + CONTENT_TYPE, + HeaderValue::from_static("application/json; charset=utf-8"), + ); + + // Prepare payload make sure its initialized + let mut payload = json!(null); + if let Some(image_payload) = image_data { + payload = base64_json(&image_payload); + } else if let Some(text_payload) = text_data { + payload = text_json(text_payload); + } + + // Make the POST request + let response = client + .post(&url) + .headers(headers) + .body(payload.to_string()) + .send() + .await + .change_context(ChonkyError::VertexAPI) + .attach_printable("Failed to build post request for Vertex API")?; + + // Check the response status + if !response.status().is_success() { + return Err( + Report::new(ChonkyError::VertexAPI).attach_printable(format!( + "Received the error code {} in the response status with error text {:?}", + response.status(), + response + .error_for_status() + .change_context(ChonkyError::VertexAPI)? 
+ .text() + .await, + )), + ); + } + + // Read and process the response + let response_text = response + .json() + .await + .change_context(ChonkyError::VertexAPI)?; + + extract_embedding(&response_text) +} + +// Parses the response to extract the image or text embedding vector +fn extract_embedding(response: &JsonValue) -> Result, Report> { + let prediction = response + .as_object() + .and_then(|obj| obj.get("predictions")) + .and_then(JsonValue::as_array) + .and_then(|arr| arr.first()) + .ok_or_else(|| Report::new(ChonkyError::VertexAPI)) + .attach_printable("Unexpected response format")?; + + let embedding = match ( + prediction.get("imageEmbedding"), + prediction.get("textEmbedding"), + ) { + (Some(embedding), None) | (None, Some(embedding)) => embedding + .as_array() + .ok_or(ChonkyError::VertexAPI) + .attach_printable("Unexpected response format")?, + (None, None) => { + return Err(ChonkyError::VertexAPI).attach_printable("No embedding found in response"); + } + (Some(_), Some(_)) => { + return Err(ChonkyError::VertexAPI) + .attach_printable("Embedding found in both image and text fields"); + } + }; + embedding + .iter() + .map(|x| { + x.as_f64() + .ok_or_else(|| Report::new(ChonkyError::VertexAPI)) + }) + .collect() +} + +pub fn add_structural_embedding( + document_embeddings: &mut DocumentEmbeddings, + page_number: usize, + file_path: PathBuf, + embedding_vector: Vec, +) { + let structural_metadata = StructuralMetadata { + _page_number: page_number, + _image_path: file_path, + }; + + let embedding = Embedding { + _model_used: "VertexAPIMultiModalEmbeddings".into(), + embedding_vector, + }; + + let structural_embedding = StructuralEmbedding { + _metadata: structural_metadata, + _embedding: embedding, + }; + + document_embeddings + .structural_embeddings + .push(structural_embedding); +} + +#[cfg(test)] +mod tests { + use insta::{assert_binary_snapshot, assert_snapshot}; + use serde_json::to_string_pretty; + use tokio::fs; + + use super::*; + use 
crate::create_document_embedding; + + #[tokio::test] + async fn base64_conversion() -> Result<(), Report> { + let test_path = PathBuf::from("./tests/docs/page_1.png"); + let image_data: Vec = fs::read(test_path) + .await + .change_context(ChonkyError::ImageError)?; + // source of truth found by decoding base64 encoding to get same image + // must use string_pretty since there is autoformating done by compiler with addition of + // newline + assert_binary_snapshot!( + "page_1.json", + format!( + "{}\n", + to_string_pretty(&base64_json(image_data)) + .change_context(ChonkyError::ImageError)? + ) + .into() + ); + Ok(()) + } + + #[tokio::test] + async fn image_embedding() -> Result<(), Report> { + //since embeddings are nondeterminatic they vary slightly + //thus a good way to test is to check if cosine similarity close to 1 + + let test_image_path = PathBuf::from("./tests/docs/page_1.png"); + + let test_json_path = PathBuf::from("./src/snapshots/google_test_embedding_page_1.json"); + + let source_embedding = extract_embedding( + &serde_json::from_slice( + &fs::read(test_json_path) + .await + .change_context(ChonkyError::ImageError)?, + ) + .change_context(ChonkyError::ImageError)?, + )?; + + let image_data: Vec = fs::read(test_image_path) + .await + .change_context(ChonkyError::ImageError)?; + + //project id + + let project_id = + std::env::var("GOOGLE_PROJECT_ID").change_context(ChonkyError::VertexAPI)?; + + let test_embedding = + make_multimodal_api_request(&project_id, Some(image_data), None).await?; + + //find cosine similarity of vectors + + let mut dot_prod: f64 = 0.0; + let mut source_mag: f64 = 0.0; + let mut test_mag: f64 = 0.0; + + for index in 0..test_embedding.len() { + dot_prod += source_embedding[index] * test_embedding[index]; + source_mag += source_embedding[index] * source_embedding[index]; + test_mag += test_embedding[index] * test_embedding[index]; + } + + let similarity = dot_prod / (test_mag.sqrt() * source_mag.sqrt()); + + let 
expected_similarity_threshold = 0.999; + + if similarity >= expected_similarity_threshold { + Ok(()) + } else { + Err(Report::new(ChonkyError::Pdfium).attach_printable(format!( + "Cosine similarity is lower than expected: got {similarity}, expected at least \ + {expected_similarity_threshold}" + ))) + } + } + + #[test] + fn create_embedding_data() { + let mut document_embeddings = create_document_embedding(); + + add_structural_embedding( + &mut document_embeddings, + 1, + PathBuf::from("test/path"), + vec![0.1, 0.2, 0.3], + ); + assert_snapshot!(format!("{:?}", document_embeddings)); + } +} diff --git a/libs/chonky/src/embedding/snapshots/chonky__embedding__hugging_face_api__tests__table_bounding_boxes.txt.snap b/libs/chonky/src/embedding/snapshots/chonky__embedding__hugging_face_api__tests__table_bounding_boxes.txt.snap new file mode 100644 index 00000000000..f39203450bd --- /dev/null +++ b/libs/chonky/src/embedding/snapshots/chonky__embedding__hugging_face_api__tests__table_bounding_boxes.txt.snap @@ -0,0 +1,6 @@ +--- +source: libs/chonky/src/embedding/hugging_face_api.rs +expression: "format!(\"{:?}\", table_predictions)" +snapshot_kind: text +--- +[TablePrediction { score: 0.9997482, bounding_box: BoundingBox { xmin: 187.0, ymin: 138.0, xmax: 808.0, ymax: 315.0 } }] diff --git a/libs/chonky/src/embedding/snapshots/chonky__embedding__multi_modal_embedding__tests__create_embedding_data.snap b/libs/chonky/src/embedding/snapshots/chonky__embedding__multi_modal_embedding__tests__create_embedding_data.snap new file mode 100644 index 00000000000..65c9817861f --- /dev/null +++ b/libs/chonky/src/embedding/snapshots/chonky__embedding__multi_modal_embedding__tests__create_embedding_data.snap @@ -0,0 +1,6 @@ +--- +source: libs/chonky/src/embedding/multi_modal_embedding.rs +expression: "format!(\"{:?}\", document_embeddings)" +snapshot_kind: text +--- +DocumentEmbeddings { structural_embeddings: [StructuralEmbedding { _metadata: StructuralMetadata { _page_number: 1, 
_image_path: "test/path" }, _embedding: Embedding { _model_used: "VertexAPIMultiModalEmbeddings", embedding_vector: [0.1, 0.2, 0.3] } }], content_embeddings: [] } diff --git a/libs/chonky/src/embedding/snapshots/chonky__embedding__multi_modal_embedding__tests__page_1.snap b/libs/chonky/src/embedding/snapshots/chonky__embedding__multi_modal_embedding__tests__page_1.snap new file mode 100644 index 00000000000..af3d1af2ad4 --- /dev/null +++ b/libs/chonky/src/embedding/snapshots/chonky__embedding__multi_modal_embedding__tests__page_1.snap @@ -0,0 +1,6 @@ +--- +source: libs/chonky/src/embedding/multi_modal_embedding.rs +expression: base64_json(image_data).to_string().into() +extension: json +snapshot_kind: binary +--- diff --git a/libs/chonky/src/embedding/snapshots/chonky__embedding__multi_modal_embedding__tests__page_1.snap.json b/libs/chonky/src/embedding/snapshots/chonky__embedding__multi_modal_embedding__tests__page_1.snap.json new file mode 100644 index 00000000000..2647bb9e012 --- /dev/null +++ b/libs/chonky/src/embedding/snapshots/chonky__embedding__multi_modal_embedding__tests__page_1.snap.json @@ -0,0 +1,9 @@ +{ + "instances": [ + { + "image": { + "bytesBase64Encoded": "" + } + } + ] +} diff --git a/libs/chonky/src/lib.rs b/libs/chonky/src/lib.rs index 68c7d624a0f..721eae17920 100644 --- a/libs/chonky/src/lib.rs +++ b/libs/chonky/src/lib.rs @@ -2,12 +2,14 @@ extern crate alloc; -#[cfg(not(feature = "static"))] use alloc::borrow::Cow; +use std::path::PathBuf; #[cfg(not(feature = "static"))] use std::{env, path::Path}; use error_stack::{Report, ResultExt as _}; +use image::DynamicImage; +use pdf_segmentation::ExtractedTable; use pdfium_render::prelude::Pdfium; use thiserror::Error; @@ -18,11 +20,21 @@ pub enum ChonkyError { #[error("pdfium error")] Pdfium, #[error("write error to system")] - Write, + ImageError, #[error("Issues with CLI input")] Arguments, + #[error("Issues with Google's Vertex API call")] + VertexAPI, + #[error("Issues with HuggingFace 
Inference Serverless API")] + HuggingFaceAPI, + #[error("Problem Storing Embedding Information")] + Embedding, } +mod embedding; + +pub use embedding::{hugging_face_api, multi_modal_embedding}; + /// Attempts to link to the `PDFium` library. /// /// ## Loading strategy @@ -58,14 +70,123 @@ pub fn link_pdfium() -> Result> { } } +#[derive(Debug, Clone)] +pub struct DocumentEmbeddings { + // Embeddings for structural chunks (page screenshots) + pub structural_embeddings: Vec, + pub content_embeddings: Vec, +} + +#[derive(Debug, Clone)] +pub struct PageContentEmbedding { + _image: PageImageObjectsEmbeddings, + _table: PageTableObjectsEmbeddings, + _text: TextEmbedding, +} + +#[derive(Debug, Clone)] +pub struct ImageEmbedding { + pub embedding: Embedding, + _image: DynamicImage, +} + +#[derive(Debug, Clone)] +pub struct TableEmbedding { + pub embedding: Embedding, + _table: ExtractedTable, +} +#[derive(Debug, Clone)] +pub struct TextEmbedding { + _embedding: Embedding, + _text: String, +} + +#[derive(Debug, Clone)] +pub struct Embedding { + _model_used: Cow<'static, str>, //model name reveals image or text embedding model + pub embedding_vector: Vec, //the actual embedding vector +} + +#[derive(Debug, Clone)] +pub struct StructuralMetadata { + _page_number: usize, //discuss additional metadata useful here + _image_path: PathBuf, //location of pdf image for embedding +} + +#[derive(Debug, Clone)] +pub struct StructuralEmbedding { + _metadata: StructuralMetadata, + _embedding: Embedding, +} + +#[derive(Debug, Clone)] +pub struct PageImageObjects { + pub page_image_objects: Vec, +} + +impl PageImageObjects { + pub fn iter(&self) -> impl Iterator + Send { + self.page_image_objects.iter() + } + + pub fn owned_iter(self) -> impl Iterator + Send { + self.page_image_objects.into_iter() + } +} + +#[derive(Debug, Clone)] +pub struct PageImageObjectsEmbeddings { + _embeddings: Box<[ImageEmbedding]>, +} + +#[derive(Debug, Clone)] +pub struct PageTableObjects { + pub 
page_table_objects: Vec, +} + +#[derive(Debug, Clone)] +pub struct PageTableObjectsEmbeddings { + _embeddings: Vec, +} + +#[derive(Debug, Clone)] +pub struct PageScreenshot { + _page_image_objects: Vec, +} + +#[must_use] +pub const fn create_document_embedding() -> DocumentEmbeddings { + DocumentEmbeddings { + structural_embeddings: Vec::new(), + content_embeddings: Vec::new(), + } +} + pub mod pdf_segmentation { + + use std::path::PathBuf; + use error_stack::{Report, ResultExt as _}; + use futures::future::try_join_all; use image::{DynamicImage, GrayImage, RgbaImage}; use pdfium_render::prelude::{ - PdfBitmap, PdfBitmapFormat, PdfDocument, PdfPoints, PdfRenderConfig, Pdfium, + PdfBitmap, PdfBitmapFormat, PdfDocument, PdfPageObjectCommon as _, + PdfPageObjectsCommon as _, PdfPoints, PdfRect, PdfRenderConfig, Pdfium, + }; + + use crate::{ + ChonkyError, DocumentEmbeddings, PageContentEmbedding, PageImageObjects, PageTableObjects, + embedding::{ + hugging_face_api::make_table_recognition_request, + multi_modal_embedding::{embed_pdf_object_images, embed_tables, embed_text}, + }, }; - use crate::ChonkyError; + #[derive(Debug, Clone)] + pub struct ExtractedTable { + bounding_box: PdfRect, //model name reveals image or text embedding model + pub image: DynamicImage, //the actual embedding vector + } /// Function to read the pdf /// @@ -75,21 +196,251 @@ pub mod pdf_segmentation { /// permission to read it. 
pub fn load_pdf<'a>( pdfium: &'a Pdfium, - file_path: &str, + file_path: &PathBuf, ) -> Result, Report> { pdfium - .load_pdf_from_file(file_path, None) + .load_pdf_from_file(&file_path, None) .map_err(|err| Report::new(err).change_context(ChonkyError::Pdfium)) } - // /// TODO: This function returns the extracted text that is segmented in proper reading order - // and /// grouped by boundaries such as newline spacing and other layout information, - // segments can /// contain texts with different formatting (such as a sentence with a - // **bold** inside) /// - // /// #Errors - // /// - // /// TBD - //pub fn extract_text(pdf: &PdfDocument) -> () {} + #[expect( + clippy::future_not_send, + reason = "Will Implement Safe Data Sending of Pdfium Documents in future" + )] + async fn extract_tables( + pdf: &PdfDocument<'_>, + images: &[PathBuf], + config: &PdfRenderConfig, + ) -> Result, Report> { + let table_predictions_list = try_join_all( + images + .iter() + .map(|image_path| make_table_recognition_request(image_path, false)), + ) + .await?; + + let mut pdf_table_bounds = Vec::new(); + + //task::spawn_blocking(move || { + for (index, page) in pdf.pages().iter().enumerate() { + let table_predictions = &table_predictions_list[index]; + + let mut page_table_bounds: Vec = Vec::new(); + //convert the pixels back to pdf points + for table in table_predictions { + if table.score < 0.95 { + continue; + } + let bbox = &table.bounding_box; + + // Convert to i32 safely discarding decimals and rounding down + // normally bbox should already be an integer that needs to be casted + let xmin: i32 = num_traits::cast(bbox.xmin).ok_or(ChonkyError::Pdfium)?; + let ymin: i32 = num_traits::cast(bbox.ymin).ok_or(ChonkyError::Pdfium)?; + let xmax: i32 = num_traits::cast(bbox.xmax).ok_or(ChonkyError::Pdfium)?; + let ymax: i32 = num_traits::cast(bbox.ymax).ok_or(ChonkyError::Pdfium)?; + + // Calculate bottom-left and top-right + let bottom_left = page + .pixels_to_points(xmin, ymax, config) + 
.change_context(ChonkyError::Pdfium)?; + let top_right = page + .pixels_to_points(xmax, ymin, config) + .change_context(ChonkyError::Pdfium)?; + + // Render PDF with cropped info and save as Dynamic Image + let image_bitmap = page + .render_with_config(&create_config().clip(xmin, ymin, xmax, ymax)) + .change_context(ChonkyError::Pdfium)?; + + let width = u32::try_from(xmax - xmin).change_context(ChonkyError::Pdfium)?; + let height = u32::try_from(ymax - ymin).change_context(ChonkyError::Pdfium)?; + let xmin = u32::try_from(xmin).change_context(ChonkyError::Pdfium)?; + let ymin = u32::try_from(ymin).change_context(ChonkyError::Pdfium)?; + + // Crop image using safe dimensions + let image = image_bitmap.as_image().crop(xmin, ymin, width, height); + + //add the proper table bound for checking extracted text + let extracted_table = ExtractedTable { + bounding_box: PdfRect::new( + bottom_left.1, + bottom_left.0, + top_right.1, + top_right.0, + ), + image, + }; + page_table_bounds.push(extracted_table); + //later step to extract table textual information + } + pdf_table_bounds.push(PageTableObjects { + page_table_objects: page_table_bounds, + }); + } + Ok(pdf_table_bounds) + } + + // TODO: This function will return the extracted text that is segmented in proper reading order + // and grouped by boundaries such as newline spacing and other layout information, + // segments can contain texts with different formatting (such as a sentence with a + // **bold** inside) + /// + /// For now, this function extracts, page by page, the text of every text object that does + /// not overlap any of that page's table bounding boxes + /// + /// # Panics + /// + /// Panics if `pdf_table_bounds` has fewer entries than `pdf` has pages + #[must_use] + pub fn extract_text( + pdf: &PdfDocument, + pdf_table_bounds: &[PageTableObjects], + ) -> Vec> { + let mut pages_text_extract: Vec> = Vec::new(); + //process page by page + for (index, page) in pdf.pages().iter().enumerate() { + //table bounds are looked up by page index, so they must be in pdf page order + +
//check if text bounding boxes overlap with the pdf table bounds + let page_table_bounds = &pdf_table_bounds[index].page_table_objects; + + let page_text: Vec = page + .objects() + .iter() + .filter_map(|object| { + object.as_text_object().and_then(|text_object| { + page_table_bounds + .iter() + .all(|table_box| { + //on a bounds error, fall back to a zero rect + !table_box + .bounding_box + .does_overlap(&text_object.bounds().unwrap_or(PdfRect::zero())) + }) + .then(|| text_object.text()) + }) + }) + .collect::>(); + + //let mut page_text_object = Vec::new(); + + // // Explicitly iterate over pdf_objects without using an iterator chain + // for object in page.objects().iter() { + // if let Some(text_object) = object.as_text_object() { + // if pdf_table_bounds.iter().all(|table_box| { + // !table_box.does_overlap(&text_object.bounds().unwrap_or(PdfRect::zero())) + // }) { + // // Move the text_object directly into the vector + // page_text_object.push(text_object); + // } + // } + // } + + //let page_text = group_similar_segments(page_text_object)?; + + pages_text_extract.push(page_text); + } + pages_text_extract + } + + fn extract_images(pdf: &PdfDocument) -> Vec { + let mut pdf_image_extract = Vec::new(); + + for page in pdf.pages().iter() { + let mut page_image_extract = Vec::new(); + + page.objects().iter().for_each(|object| { + if let Some(image) = object.as_image_object() { + if let Ok(image) = image.get_raw_image() { + page_image_extract.push(image); + } + } + }); + // println!( + // "There are {} images on page {}", + // page_image_extract.len(), + // _index + // ); + pdf_image_extract.push(PageImageObjects { + page_image_objects: page_image_extract, + }); + } + pdf_image_extract + } + + /// This function takes in the pdf and the paths of the pdf pages as images and modifies the + /// document embeddings to include the embeddings of tables, images, and text inside the pdf + /// + /// # Errors + /// + /// [`ChonkyError::Pdfium`] if pdf rendering of tables fails
+ /// [`ChonkyError::VertexAPI`] if `GOOGLE_PROJECT_ID` is unset or the embedding model fails + /// [`ChonkyError::HuggingFaceAPI`] if there are issues parsing the table + #[expect( + clippy::future_not_send, + reason = "Will Implement Safe Data Sending of Pdfium Documents in future" + )] + pub async fn embed_pdf<'a>( + pdf: &PdfDocument<'_>, + images: &[PathBuf], + document_embeddings: &'a mut DocumentEmbeddings, + ) -> Result<&'a mut DocumentEmbeddings, Report> { + let project_id = + std::env::var("GOOGLE_PROJECT_ID").change_context(ChonkyError::VertexAPI)?; + + let pdf_table_bounds = extract_tables(pdf, images, &create_config()).await?; + + let pdf_text_extract = extract_text(pdf, &pdf_table_bounds); + + let pdf_image_extract = extract_images(pdf); + + let image_embeddings = embed_pdf_object_images(pdf_image_extract, &project_id).await?; + + let table_embeddings = embed_tables(pdf_table_bounds, &project_id).await?; + + let pdf_text_embeddings = embed_text( + &pdf_text_extract + .iter() + .map(|text| &**text) + .collect::>(), + &project_id, + ) + .await?; + + //TODO: implement in a way to prevent so much unnecessary cloning + + //turn image embedding vector into iterator + let mut image_embeddings = image_embeddings.into_iter(); + + let mut table_embeddings = table_embeddings.into_iter(); + + let mut text_embeddings = pdf_text_embeddings.into_iter(); + + for _ in 0..pdf.pages().len() { + //create Page content embedding now + let page_content_embedding = PageContentEmbedding { + _image: image_embeddings + .next() + .ok_or(ChonkyError::VertexAPI) + .attach_printable("Missing Page Image Object Embeddings")?, + _table: table_embeddings + .next() + .ok_or(ChonkyError::VertexAPI) + .attach_printable("Missing Page Table Object Embeddings")?, + _text: text_embeddings + .next() + .ok_or(ChonkyError::VertexAPI) + .attach_printable("Missing Text Embeddings")?, + }; + + document_embeddings + .content_embeddings + .push(page_content_embedding); + } + + Ok(document_embeddings) + } // /// TODO:
Given a list of segments of a PDF this function reads the segments via the bounding // /// box order, with the naive approach of top→bottom (and if same top then left→right) and @@ -220,10 +571,143 @@ pub mod pdf_segmentation { _ => return Err(Report::new(ChonkyError::Pdfium)), }) } + + fn create_config() -> PdfRenderConfig { + //target width used when rendering; may adjust resolution depending on need + let resolution_width = 1000; + + PdfRenderConfig::new().set_target_width(resolution_width) + } + + #[cfg(test)] + mod tests { + + use insta::{assert_binary_snapshot, assert_snapshot}; + + use super::*; + use crate::{create_document_embedding, link_pdfium}; + + #[tokio::test] + async fn pdf_table_extraction() -> Result<(), Report> { + let pdfium = link_pdfium()?; + let file_path = PathBuf::from("./tests/docs/table-testing.pdf"); + + let pdf = load_pdf(&pdfium, &file_path).change_context(ChonkyError::Pdfium)?; + + let images = vec![PathBuf::from("./tests/docs/table-testing.png")]; + + let table_info = extract_tables(&pdf, &images, &create_config()).await?; + //just take the first table of the first page + + let table = table_info[0].page_table_objects[0].clone(); + + assert_snapshot!("extracted_table.txt", format!("{:#?}", table.bounding_box)); + + let mut buffer = Vec::new(); + let encoder = image::codecs::bmp::BmpEncoder::new(&mut buffer); + + table + .image + .write_with_encoder(encoder) + .expect("image should be able to be encoded into a bitmap"); + assert_binary_snapshot!("extracted_table.bmp", buffer); + + Ok(()) + } + + #[tokio::test] + async fn pdf_text_extraction() -> Result<(), Report> { + let pdfium = link_pdfium()?; + let file_path = PathBuf::from("./tests/docs/table-testing.pdf"); + + let pdf = load_pdf(&pdfium, &file_path).change_context(ChonkyError::Pdfium)?; + + let images = vec![PathBuf::from("./tests/docs/table-testing.png")]; + + let table_info = extract_tables(&pdf, &images, &create_config()).await?; + //table bounds are passed on so that text overlapping a table is skipped + + let text_info = extract_text(&pdf, &table_info); + //just take first
vector + + let text = text_info[0].join(""); + + assert_snapshot!("extracted_text.txt", text); + + Ok(()) + } + + #[test] + fn pdf_image_extract() -> Result<(), Report> { + let pdfium = link_pdfium()?; + + let file_path = PathBuf::from("./tests/docs/test-doc.pdf"); + + let pdf = load_pdf(&pdfium, &file_path).change_context(ChonkyError::Pdfium)?; + + let images = extract_images(&pdf); + + let mut buffer = Vec::new(); + let encoder = image::codecs::bmp::BmpEncoder::new(&mut buffer); + + //the third page has an image to verify + images[2].page_image_objects[0] + .write_with_encoder(encoder) + .expect("image should be able to be encoded into a bitmap"); + assert_binary_snapshot!("extracted_image.bmp", buffer); + + Ok(()) + } + + #[tokio::test] + async fn content_embeddings() -> Result<(), Report> { + let pdfium = link_pdfium()?; + + let file_path = PathBuf::from("./tests/docs/table-testing.pdf"); + + let pdf = load_pdf(&pdfium, &file_path).change_context(ChonkyError::Pdfium)?; + + let images = vec![PathBuf::from("./tests/docs/table-testing.png")]; + + let mut document_embeddings = create_document_embedding(); + let document_embeddings = embed_pdf(&pdf, &images, &mut document_embeddings).await?; + + //cannot use binary snapshot since embeddings vary + //check vector length since we individually check embeddings in other tests + + if document_embeddings.content_embeddings.len() != 1 { + return Err(Report::new(ChonkyError::Pdfium).attach_printable(format!( + "Expected there to be {} pages of content embeddings but found {}", + 1, + document_embeddings.content_embeddings.len() + ))); + } + + // if !document_embeddings.content_embeddings[0]._image.is_empty() { + // return Err(Report::new(ChonkyError::Pdfium).attach_printable(format!( + // "Expected there to be {} images but found {}", + // 0, + // document_embeddings.content_embeddings.len() + // ))); + // } + + // if document_embeddings.content_embeddings[0]._table.len() != 1 { + // return 
Err(Report::new(ChonkyError::Pdfium).attach_printable(format!( + // "Expected there to be {} tables but found {}", + // 1, + // document_embeddings.content_embeddings.len() + // ))); + // } + + Ok(()) + } + } } #[cfg(test)] mod tests { + use std::path::PathBuf; + use error_stack::{Report, ResultExt as _}; use insta::assert_binary_snapshot; @@ -233,9 +717,9 @@ mod tests { fn pdf_load_success() -> Result<(), Report> { let pdfium = link_pdfium()?; - let test_pdf_string = "tests/docs/test-doc.pdf"; + let test_pdf_string = PathBuf::from("tests/docs/test-doc.pdf"); - let _pdf = pdf_segmentation::load_pdf(&pdfium, test_pdf_string) + let _pdf = pdf_segmentation::load_pdf(&pdfium, &test_pdf_string) .change_context(ChonkyError::Pdfium)?; Ok(()) @@ -245,10 +729,10 @@ mod tests { fn pdf_load_failure() -> Result<(), Report> { let pdfium = link_pdfium()?; - let test_pdf_string = "tests/docs/invalid.pdf"; + let test_pdf_string = PathBuf::from("tests/docs/invalid.pdf"); // Should return an error when loading an invalid PDF - let result = pdf_segmentation::load_pdf(&pdfium, test_pdf_string) + let result = pdf_segmentation::load_pdf(&pdfium, &test_pdf_string) .change_context(ChonkyError::Pdfium); if result.is_err() { @@ -264,9 +748,9 @@ mod tests { fn pdf_image_conversion() -> Result<(), Report> { let pdfium = link_pdfium()?; - let test_pdf_string = "tests/docs/test-doc.pdf"; + let test_pdf_string = PathBuf::from("tests/docs/test-doc.pdf"); - let pdf = pdf_segmentation::load_pdf(&pdfium, test_pdf_string) + let pdf = pdf_segmentation::load_pdf(&pdfium, &test_pdf_string) .change_context(ChonkyError::Pdfium)?; //number of pages of pdf diff --git a/libs/chonky/src/main.rs b/libs/chonky/src/main.rs index 36a4fc90e35..34a87cb0791 100644 --- a/libs/chonky/src/main.rs +++ b/libs/chonky/src/main.rs @@ -1,33 +1,74 @@ -use std::env; +use std::path::PathBuf; -use chonky::{ChonkyError, pdf_segmentation}; -use error_stack::{Report, ResultExt as _, ensure}; +use chonky::{ + ChonkyError, 
PageImageObjects, create_document_embedding, + multi_modal_embedding::{add_structural_embedding, embed_screenshots}, + pdf_segmentation::{self, embed_pdf}, +}; +use clap::Parser; +use error_stack::{Report, ResultExt as _}; -fn main() -> Result<(), Report> { - // read file path arguments - // TODO: implement with clap - let args: Vec = env::args().collect(); +#[derive(Parser)] +struct CliArgs { + /// Path to the PDF file + pdf_path: PathBuf, +} - ensure!(args.len() > 1, ChonkyError::Arguments); +#[tokio::main] +async fn main() -> Result<(), Report> { + // read file path arguments + let args = CliArgs::parse(); let pdfium = chonky::link_pdfium()?; - let pdf = pdf_segmentation::load_pdf(&pdfium, &args[1]).change_context(ChonkyError::Pdfium)?; + let pdf = + pdf_segmentation::load_pdf(&pdfium, &args.pdf_path).change_context(ChonkyError::Pdfium)?; let preprocessed_pdf = pdf_segmentation::pdf_to_images(&pdf).change_context(ChonkyError::Pdfium)?; - //for now we will print all these images to a folder - // this will be a seperate function in the future once knowledge about error-stack increases let output_folder = "./out"; + let mut document_embeddings = create_document_embedding(); + + let mut images = Vec::new(); + + let project_id = std::env::var("GOOGLE_PROJECT_ID").change_context(ChonkyError::VertexAPI)?; + for (index, image) in preprocessed_pdf.iter().enumerate() { // Generate a unique filename for each page image let file_path = format!("{}/page_{}.png", output_folder, index + 1); // Save the image as a PNG file - image.save(&file_path).change_context(ChonkyError::Write)?; + image + .save(&file_path) + .change_context(ChonkyError::ImageError)?; + + images.push(PathBuf::from(file_path)); } + let doc_screenshots = embed_screenshots( + PageImageObjects { + page_image_objects: preprocessed_pdf, + } + .owned_iter(), + &project_id, + ) + .await?; + + for (index, screenshot) in doc_screenshots.into_iter().enumerate() { + add_structural_embedding( + &mut document_embeddings, 
+ index + 1, + PathBuf::from(format!("{}/page_{}.png", output_folder, index + 1)), + screenshot.embedding_vector, + ); + } + + embed_pdf(&pdf, &images, &mut document_embeddings) + .await + .change_context(ChonkyError::Pdfium)?; + + //dbg!("{:?}", document_embeddings); Ok(()) } diff --git a/libs/chonky/src/snapshots/chonky__embedding__multi_modal_embedding__tests__create_embedding_data.snap b/libs/chonky/src/snapshots/chonky__embedding__multi_modal_embedding__tests__create_embedding_data.snap new file mode 100644 index 00000000000..6fea1424e93 --- /dev/null +++ b/libs/chonky/src/snapshots/chonky__embedding__multi_modal_embedding__tests__create_embedding_data.snap @@ -0,0 +1,6 @@ +--- +source: libs/chonky/src/embedding.rs +expression: "format!(\"{:?}\", document_embeddings)" +snapshot_kind: text +--- +DocumentEmbeddings { structural_embeddings: [StructuralEmbedding { _metadata: StructuralMetadata { _page_number: 1, _image_path: "test/path" }, _embedding: Embedding { _model_used: "VertexAPIMultiModalEmbeddings", _embedding_vector: [0.1, 0.2, 0.3] } }], content_embeddings: [] } diff --git a/libs/chonky/src/snapshots/chonky__embedding__multi_modal_embedding__tests__page_1.snap b/libs/chonky/src/snapshots/chonky__embedding__multi_modal_embedding__tests__page_1.snap new file mode 100644 index 00000000000..d630166b429 --- /dev/null +++ b/libs/chonky/src/snapshots/chonky__embedding__multi_modal_embedding__tests__page_1.snap @@ -0,0 +1,6 @@ +--- +source: libs/chonky/src/embedding.rs +expression: "base64_json(\"./tests/docs/page_1.png\")?.into()" +extension: json +snapshot_kind: binary +--- diff --git a/libs/chonky/src/snapshots/chonky__embedding__multi_modal_embedding__tests__page_1.snap.json b/libs/chonky/src/snapshots/chonky__embedding__multi_modal_embedding__tests__page_1.snap.json new file mode 100644 index 00000000000..2647bb9e012 --- /dev/null +++ b/libs/chonky/src/snapshots/chonky__embedding__multi_modal_embedding__tests__page_1.snap.json @@ -0,0 +1,9 @@ +{ + 
"instances": [ + { + "image": { + "bytesBase64Encoded": "" + } + } + ] +} diff --git a/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_image.snap b/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_image.snap new file mode 100644 index 00000000000..71131a21ad5 --- /dev/null +++ b/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_image.snap @@ -0,0 +1,6 @@ +--- +source: libs/chonky/src/lib.rs +expression: buffer +extension: bmp +snapshot_kind: binary +--- diff --git a/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_image.snap.bmp b/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_image.snap.bmp new file mode 100644 index 00000000000..d335c2d27af Binary files /dev/null and b/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_image.snap.bmp differ diff --git a/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_table.snap b/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_table.snap new file mode 100644 index 00000000000..71131a21ad5 --- /dev/null +++ b/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_table.snap @@ -0,0 +1,6 @@ +--- +source: libs/chonky/src/lib.rs +expression: buffer +extension: bmp +snapshot_kind: binary +--- diff --git a/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_table.snap.bmp b/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_table.snap.bmp new file mode 100644 index 00000000000..f065681d327 Binary files /dev/null and b/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_table.snap.bmp differ diff --git a/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_table.txt.snap b/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_table.txt.snap new file mode 100644 index 00000000000..da80acd6df6 --- /dev/null +++ 
b/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_table.txt.snap @@ -0,0 +1,19 @@ +--- +source: libs/chonky/src/lib.rs +expression: "format!(\"{:#?}\", table.bounding_box)" +snapshot_kind: text +--- +PdfRect { + bottom: PdfPoints { + value: 599.20245, + }, + left: PdfPoints { + value: 114.444, + }, + top: PdfPoints { + value: 707.5363, + }, + right: PdfPoints { + value: 494.496, + }, +} diff --git a/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_text.txt.snap b/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_text.txt.snap new file mode 100644 index 00000000000..899d112e4df --- /dev/null +++ b/libs/chonky/src/snapshots/chonky__pdf_segmentation__tests__extracted_text.txt.snap @@ -0,0 +1,6 @@ +--- +source: libs/chonky/src/lib.rs +expression: text +snapshot_kind: text +--- +Published as a conference paper at ICLR 2021Table 2: Comparison with state of the art on popular image classification benchmarks. We report mean and standard deviation of the accuracies, averaged over three fine-tuning runs. VisionTransformer models pre-trained on the JFT-300M dataset outperform ResNet-based baselines on alldatasets, while taking substantially less computational resources to pre-train. ViT pre-trained on thesmaller public ImageNet-21k dataset performs well too. ∗Slightly improved 88.5% result reportedin Touvron et al. (2020).Figure 2: Breakdown of VTAB performance in Natural, Specialized, and Structured task groups.model still took substantially less compute to pre-train than prior state of the art. However, we notethat pre-training efficiency may be affected not only by the architecture choice, but also other parameters, such as training schedule, optimizer, weight decay, etc. We provide a controlled study ofperformance vs. compute for different architectures in Section 4.4. 
Finally, the ViT-L/16 modelpre-trained on the public ImageNet-21k dataset performs well on most datasets too, while takingfewer resources to pre-train: it could be trained using a standard cloud TPUv3 with 8 cores in approximately 30 days.Figure 2 decomposes the VTAB tasks into their respective groups, and compares to previous SOTAmethods on this benchmark: BiT, VIVI – a ResNet co-trained on ImageNet and Youtube (Tschannenet al., 2020), and S4L – supervised plus semi-supervised learning on ImageNet (Zhai et al., 2019a).ViT-H/14 outperforms BiT-R152x4, and other methods, on the Natural and Structured tasks. On theSpecialized the performance of the top two models is similar.4.3 PRE-TRAINING DATA REQUIREMENTSThe Vision Transformer performs well when pre-trained on a large JFT-300M dataset. With fewerinductive biases for vision than ResNets, how crucial is the dataset size? We perform two series ofexperiments.First, we pre-train ViT models on datasets of increasing size: ImageNet, ImageNet-21k, and JFT300M. To boost the performance on the smaller datasets, we optimize three basic regularizationparameters – weight decay, dropout, and label smoothing. Figure 3 shows the results after finetuning to ImageNet (results on other datasets are shown in Table 5)2. When pre-trained on thesmallest dataset, ImageNet, ViT-Large models underperform compared to ViT-Base models, despite(moderate) regularization. With ImageNet-21k pre-training, their performances are similar. Onlywith JFT-300M, do we see the full benefit of larger models. Figure 3 also shows the performance2Note that the ImageNet pre-trained models are also fine-tuned, but again on ImageNet. 
This is because theresolution increase during fine-tuning improves the performance.6 diff --git a/libs/chonky/src/snapshots/google_test_embedding_page_1.json b/libs/chonky/src/snapshots/google_test_embedding_page_1.json new file mode 100644 index 00000000000..69390f0e48f --- /dev/null +++ b/libs/chonky/src/snapshots/google_test_embedding_page_1.json @@ -0,0 +1,320 @@ +{ + "predictions": [ + { + "imageEmbedding": [ + 0.0200794507, 0.0727531835, -0.0118073784, 0.0387928113, -0.0394414, + -0.0300136041, -0.0219337083, -0.00991160143, -0.030519845, + 0.0123786097, -0.021295242, 0.0221384745, -0.0270472094, 0.0903312936, + 0.0154524082, -0.000384369749, 0.0252925958, 0.00456746621, + 0.00611424912, -0.0012179903, 0.0187779311, -0.00174762285, + -0.0181801617, 0.0262305401, -0.0571274757, 0.00924724154, 0.0137181552, + -0.0103711933, 0.00572055066, 0.0288961586, -0.0215082187, + 0.00884132553, -0.0453789383, 0.037997555, -0.0138459, 0.0132075641, + 0.0201186035, -0.0527875498, -0.00995267276, 0.0147078112, + -0.0221115947, 0.00832059421, 0.0275015477, 0.00621457072, + 0.00385735929, -0.00514552, 0.0146847777, -0.00225471938, 0.0159577653, + 0.020497337, 0.0210882984, -0.0559281409, -0.0474126, 0.0186752919, + 0.0413711779, 0.0125551531, 0.0163566768, 0.0218617842, -0.0186067801, + -0.00399777154, -0.0237672515, 0.0586896129, -0.0505793281, + 0.0269862246, -0.0394900553, -0.00704604341, -0.00641412614, + 0.0412042737, -0.0367216431, -0.0108876172, 0.0568020195, 0.0162842385, + 0.0428199247, -0.0643725172, -0.0145920916, -0.00681277411, + -0.0721000805, -0.00576667255, -0.00254351785, -0.00220164354, + -0.0315022543, 0.0184254907, -0.0248583704, -0.00186481106, + -0.0116791297, -0.0521355048, -0.0317877643, 0.0152493091, + -0.0225893296, 0.000711053, 0.052535478, -0.020968914, 0.0273086503, + -0.130931154, -0.0384063162, -0.0459572747, -0.0340217538, + -0.00853932183, 0.00115204754, -0.0370576233, -0.0040066191, + 0.0737694502, 0.00811708346, 0.0034538866, 0.0154484957, 
-0.0415875204, + -0.00176832976, -0.0338066854, 0.0320476592, 0.00990479, -0.0259598549, + -0.0479121432, 0.0498655736, 0.00224597054, -0.00801020581, + 0.0158196837, 0.0291785989, -0.00789069943, 0.0456606485, -0.0178818349, + -0.0255469233, -0.00855124835, -0.0190197397, -0.0160852969, + -0.0499708839, -0.0492461361, -0.033667028, 0.0127423918, 0.00614815066, + 0.0168152228, -0.00293308, 0.0116070369, -0.00815388, 0.00494600693, + -0.0230694097, 0.0106020151, 0.0130336611, -0.0237730704, 0.0265423208, + -0.0238192249, -0.012212473, -0.0344414562, -0.0206737071, 0.0158406664, + -0.0182467457, 0.0183475055, -0.0197015591, -0.0113007026, + -0.0251154322, 0.0173691679, 0.0640703663, 0.00715500675, 0.0205982272, + -0.0298886988, 0.00395428529, -0.00575352274, -0.013640644, + 0.0248482786, 0.0123412963, -0.005353841, 0.00364228967, -0.00875655189, + -0.0264218189, 0.0099193966, 0.00941459462, 0.00874940399, + -0.00510607287, -0.0110344728, 0.0333686545, 0.00498574227, + -0.025924135, -0.0164656304, -0.0309006684, 0.00477741798, 0.0232512821, + 0.0107584037, 0.00695351278, 0.00140760886, 0.00817680638, 0.0151648317, + -0.0205331054, 0.0044620824, -0.04447091, -0.0157380067, 0.0138357468, + -0.0327164344, -0.0245906841, -0.0126390522, -0.0224467516, + -0.0171758533, -0.0331096612, -0.0166997, -0.0407635756, 0.00811867882, + 0.0158348344, 0.00912262592, -0.0217924118, 0.0136932535, 0.00707948394, + 0.00422922336, 0.0257224627, -0.0258570835, -0.0245650876, 0.0451356508, + -0.0355140679, -0.043719206, -0.0363498405, 0.0185538642, 0.017455535, + 0.0299994443, 0.00553353643, -0.00326871336, 0.0315691829, 0.0273369644, + 0.00565885, -0.0324013978, -0.0209106989, -0.0310577918, -0.0171664581, + 0.0424342752, 0.0449889414, 0.00161450764, -0.0198931713, 0.00832869671, + 0.00532779377, 0.0322870873, -0.017043151, -0.0106690628, -0.0269485451, + -0.0175905079, 0.0195470955, -0.00310604, -0.0222307369, -0.0174242128, + -0.0307845175, 0.00168827921, -0.031500306, -0.0239113923, 
-0.0192108, + -0.0325956792, -0.0150319831, -0.0257696304, 0.00113791344, + -0.030418165, -0.0136852181, -0.015717864, 0.0193069149, -0.00971315708, + -0.0160518792, -0.0146469176, -0.0108018406, -0.0470841639, + -0.0177005846, 0.022074908, 0.0122866761, -0.0400707684, -0.0384144, + -0.0137234787, 0.00811227877, 0.00116491213, 0.0410275944, 0.0204653349, + 0.0113176471, 0.0141015202, -0.00605957303, 0.0236300658, -0.0310151596, + -0.0106318956, 0.00480514765, 0.0108092623, -0.0156595148, + -0.0247851368, 0.0105164591, -0.0349469073, 0.0190469306, 0.0353395343, + -0.0111926803, 0.0101663414, 0.0360605, 0.0972545594, 0.014762097, + 0.0114156995, -0.0358252, 0.0119302543, 0.00434173597, -0.00686672842, + -0.111650869, 0.0297358781, 0.0124527253, 0.0315340236, -0.0108470339, + -0.00753354095, 0.0262790527, 0.0107064806, 0.00686907861, + -0.0162234474, 0.00110694522, 0.00518654753, -0.0000906666828, + 0.00508946041, -0.0325438641, 0.00725959148, 0.0231076684, + -0.0252057761, -0.0318221152, 0.0461781472, -0.0342227146, + -0.0414473414, 0.0081457831, 0.0179869831, -0.0214805938, 0.00344512123, + -0.0046609235, -0.0058999653, 0.000664868334, -0.00133454986, + 0.0100209964, -0.00316612446, -0.0109791113, -0.00798935257, + 0.00603624061, -0.0176395047, -0.0480125211, -0.0101084644, 0.016962612, + -0.0506634861, 0.0319967754, -0.0224775262, 0.0568186156, -0.0311297588, + -0.0243818425, 0.00136393309, -0.0268775877, 0.0575457886, + -0.00334202754, 0.0347409807, 0.00663562864, -0.0132650752, + -0.0116647966, 0.00973663386, -0.0439043194, -0.000579640153, + -0.0283873137, -0.00699932454, -0.0269404538, 0.00126372522, + -0.00400872855, 0.00907869358, 0.0342610478, -0.0102650719, + 0.0820957348, -0.0303597, -0.0134116411, -0.034482237, -0.0149571048, + -0.00262419856, -0.0316910781, -0.029764723, 0.0118451687, 0.0190796014, + -0.00654050056, -0.0478619747, 0.016611293, -0.000785895914, + -0.00468459539, -0.0176734738, 0.042204231, 0.018262146, 0.00199765339, + 0.0167115424, 
0.0115687139, -0.015229675, -0.0408321097, -0.0339653, + -0.0607094727, 0.00284047332, -0.00996038225, -0.0215976387, + -0.0418740734, -0.00398819242, 0.0308392812, 0.0199317653, 0.0292516984, + 0.0254485793, 0.0333487, -0.0625300631, -0.00138035021, 0.0130197247, + -0.0544026, 0.0235591736, 0.0124013275, -0.0512735136, 0.00969952531, + -0.0279471241, -0.0138655808, -0.0280096922, 0.0151817836, 0.015103111, + 0.0392551832, -0.0148400217, -0.0133888265, 0.00654882146, 0.0198969394, + -0.0122558288, 0.00300772558, -0.0121245747, -0.00260192, + -0.00621000398, -0.0162321422, -0.0331034549, 0.0115734022, + -0.011995743, 0.00505140657, 0.0281787869, -0.017650757, 0.0130830426, + -0.00644282857, 0.0207544174, 0.0203262512, -0.045325622, 0.0548431873, + 0.0383436903, -0.0601317696, -0.028722316, -0.0121619264, 0.00825800747, + -0.00212718709, 0.0229251813, -0.00699642068, -0.0110462159, + 0.012756004, 0.00503579946, -0.00153946411, 0.0363759026, 0.0106749842, + -0.0316457562, 0.0205926765, 0.0120015126, -0.0405500457, -0.0203145351, + 0.0113656595, 0.0277445298, 0.00869626459, 0.026911417, -0.0445918702, + -0.024621373, -0.0224110913, -0.00418887241, 0.0231625065, 0.0493302494, + -0.00569535699, 0.00979854, -0.014650939, -0.0275991037, 0.0200079642, + 0.0108461156, -0.043581713, -0.0291460082, 0.0358892046, 0.00485635828, + 0.0274123345, 0.0160650071, -0.0122359172, -0.0149006974, 0.00616822951, + -0.0417919904, 0.0121834539, 0.0175651293, 0.00778748933, 0.0336856097, + -0.00485151773, -0.00742530404, 0.0125049446, -0.00175703352, + -0.00328684459, -0.0472250693, 0.00665207626, -0.0289331973, + -0.0427108333, -0.0111280698, 0.00240747025, -0.0133775361, + -0.023427546, -0.0310953893, 0.00755348196, 0.0310655553, 0.028316088, + 0.0293623228, 0.0182108, 0.0180183928, -0.0339946859, -0.0267358422, + 0.0269593932, 0.0195297301, -0.00878691394, 0.0133624589, 0.0109142987, + 0.0166089647, -0.0119280117, 0.0583108626, -0.0103669874, -0.0104782879, + 0.0115101105, -0.00680659385, 
0.0136693474, 0.00926319323, + 0.00801794603, 0.0260645393, 0.017171571, -0.0375067368, -0.0228026938, + -0.0293940138, -0.0236931127, -0.0238084495, 0.00200319546, + 0.0442974381, -0.013169311, 0.0128285708, -0.0129514886, -0.00792446826, + 0.0147309015, 0.0315737203, 0.0363607481, -0.00588304782, -0.0545949191, + -0.00206409884, 0.059211351, -0.0166553874, 0.0195842087, 0.00987605, + -0.0249910448, 0.00760935899, 0.000271902, 0.0246488731, 0.0166088976, + 0.0132115707, -0.0477463976, -0.0162763596, -0.0434952, 0.0251127947, + -0.0507098176, -0.0219555218, 0.00259535387, -0.0284369159, + -0.0450235046, -0.0710127726, 0.0145530608, 0.0227186382, 0.020867113, + 0.00730509125, -0.00655677402, 0.0128724491, 0.00504125655, + 0.0114655122, -0.0284118485, -0.016804561, -0.0256101936, 0.00504152, + -0.00687041692, -0.014085494, -0.0336099, 0.0147280674, 0.0157194976, + -0.0260903761, 0.0226975, -0.0251513254, 0.044349011, 0.0381043963, + -0.0144195817, 0.0192006826, -0.0340053774, -0.0214964915, 0.0159361586, + 0.0299740247, -0.0189940054, -0.0340108499, 0.0350475572, + -0.00914715789, -0.0176734459, -0.00708493963, 0.0122759026, + -0.0151949506, -0.00382534857, -0.00618836563, 0.0401804894, + -0.0134629216, -0.0147635052, -0.00738917757, 0.0271829963, + -0.0363052413, 0.0131927039, 0.0092771221, 0.00118602021, -0.015329076, + 0.0201174095, 0.00888214819, 0.00441296306, -0.0573665276, + 0.00310249696, -0.0218316335, -0.0347739309, 0.00115993957, + -0.0170522518, -0.00374893472, 0.0160980113, -0.0261501353, + -0.0206314344, -0.0068035177, -0.0270256139, -0.0745542943, + -0.016034374, -0.0155530497, -0.00825508125, -0.000142721925, + 0.00259413477, 0.00416106824, 0.000704567239, -0.000376670097, + -0.0309293047, -0.0575292259, -0.0354600437, -0.00709351525, + 0.0211882256, 0.00653045857, -0.00768915378, -0.0391729102, + -0.0168421064, 0.0123850126, -0.0341681391, -0.0116141131, + 0.00543836318, -0.0145025346, -0.00871191267, 0.0118249943, + 0.000419740187, -0.00384515431, 
-0.0205040649, 0.0224966928, + -0.0314637125, 0.00652385131, 0.0222869739, -0.00254431623, + -0.0393526591, 0.00730004907, -0.0084729027, 0.0194466114, 0.0179618187, + 0.0272312239, -0.0564603172, 0.0137194097, 0.00705754384, 0.0207322687, + 0.0164968222, -0.0154356137, 0.0152093843, 0.0108388243, -0.0133247636, + 0.0394562036, -0.021821104, -0.0151041215, -0.171952352, -0.00614952482, + -0.0340941846, 0.0536923483, 0.00969231129, 0.0108262282, + -0.00443865452, 0.0191259626, 0.00279664621, -0.00820668787, + 0.0170391258, 0.00745780533, -0.149354264, 0.00530553143, + -0.00120837928, -0.00200300175, 0.0286234878, 0.00840980373, + -0.0430695191, -0.00352477911, -0.0244644079, 0.00768952584, + -0.0326302312, -0.00710009923, 0.00182367628, 0.00812096428, + -0.0195258241, 0.0157954227, -0.00859312806, -0.0000694446207, + -0.033467494, 0.0504676551, 0.0199554935, 0.0153865237, -0.00534388237, + -0.0436336473, -0.0243477467, -0.0195666458, 0.0267537795, + -0.0155006759, 0.00818326417, -0.0246115271, -0.0671336725, + 0.0239930861, -0.0121858018, 0.0067680846, 0.00530457, 0.0149790281, + 0.0516754, -0.103635773, 0.000773494132, 0.00328713842, 0.00529097626, + -0.0426898487, 0.0130740916, -0.0503690317, -0.00970230624, + 0.000398179283, -0.0115862768, 0.00424743351, -0.0187369734, + 0.0207669809, -0.031930998, -0.00531767914, 0.00774476631, -0.0056497, + 0.00588185433, 0.00909983, -0.0780757, 0.0459470414, -0.00945746899, + -0.00806615315, -0.0252163708, -0.00608803751, -0.000875713304, + -0.0673136711, 0.0727208629, -0.000904412766, -0.0157243572, + -0.0476896651, -0.0217101611, -0.00720090186, 0.0177019984, + -0.0122179324, 0.0206996687, -0.0392113663, -0.0267971419, 0.0385536477, + -0.182963267, -0.0175348464, -0.0213053562, 0.00980788935, + -0.0177251883, 0.0190708712, 0.0246611517, -0.000561563356, + -0.00319550582, -0.00296727498, -0.0288800802, 0.00161457015, + -0.0145582119, 0.00574282929, 0.0574918352, -0.0132808425, + 0.00414043479, -0.000178870047, 0.000366607128, 
-0.0190121308, + 0.0364717953, 0.00417773239, 0.00618093414, 0.0137917623, 0.00261084293, + 0.0125148734, -0.0394730233, -0.0305693354, 0.0359926634, -0.0430680625, + -0.0522889867, 0.00756339869, -0.0102640381, -0.00577089889, + 0.0142202778, -0.0332152843, -0.0113711087, -0.000816995685, + 0.0197970551, 0.0124188038, -0.00106603687, 0.00703300629, 0.0389869772, + -0.00973942317, 0.0229291413, 0.00711151, -0.0147652971, -0.00428463379, + -0.0139042092, 0.00265471195, -0.00401064288, -0.00653148582, + 0.0019402022, -0.016071599, 0.0084224306, -0.0238962732, 0.0304724257, + 0.00559564, -0.0161983036, -0.0218582377, 0.0251766723, 0.0357669294, + -0.00312981335, 0.0377037525, -0.0205542725, -0.0037168772, + -0.0232195519, -0.0436959155, 0.0128921568, -0.0151355444, 0.0177663844, + 0.00788191427, 0.0476557501, -0.0267430786, 0.0890309215, 0.0121643217, + 0.0307274442, -0.0260096, -0.00612257328, -0.0317230448, -0.02372564, + -0.0210982207, -0.0061263, -0.0137099056, -0.0112341642, -0.0497641899, + -0.0200990848, -0.00596580934, 0.0423967764, 0.0267690271, + -0.0347951949, 0.0193116665, 0.0200607292, 0.0130544435, 0.010885613, + 0.0218913183, -0.00309879403, 0.0400102027, -0.0249893349, + -0.0165777802, -0.0180280041, 0.00276423269, -0.0211881232, + -0.0340437219, -0.0212098081, 0.00616344856, -0.00228470308, + 0.0106081581, 0.0107657546, 0.0292795654, -0.00721715298, + -0.00503393961, 0.0393781252, -0.0214204136, -0.00669409055, + 0.00130266522, 0.0127016548, 0.0249566492, -0.0289214849, 0.00130508328, + -0.0302580539, 0.0458944663, -0.000148194813, -0.0039588809, + -0.0184638631, 0.0277188309, -0.0030655968, 0.0245242305, -0.0242547449, + -0.0232463181, 0.0349476524, 0.00204681908, -0.00861816481, + 0.0124527477, 0.0212625768, -0.0246081632, 0.00972012337, + -0.00277936156, -0.0216701664, -0.026145665, -0.024488952, + -0.000709131302, -0.022229325, 0.0142183881, -0.00989455357, + 0.018014865, -0.0198584, 0.0164782647, 0.0173409097, 0.00976632535, + 0.0124288192, 
-0.0408389755, -0.0120075392, 0.00142397662, 0.0478935689, + -0.0444919802, 0.015540082, 0.0394340605, 0.0202748701, -0.000304086076, + 0.00662816036, -0.0190697648, 0.0140714515, -0.0264617298, + -0.000244835537, 0.0202282351, 0.0165772866, 0.0434172861, + -0.00351414969, -0.00927356724, 0.0465599447, 0.00514820963, + 0.0310756397, 0.0204647481, -0.00956788845, -0.00505019492, + -0.0100129386, 0.0193647351, 0.0034627507, 0.00467265304, -0.0153254075, + 0.0211381968, -0.0220802985, -0.00323239085, -0.0127215981, + 0.0465795472, 0.0166741088, -0.000689326727, 0.00955610909, + -0.0143910646, -0.00198538718, 0.0407436788, 0.0253659915, + -0.00343311741, -0.0310920887, 0.0265372545, 0.00654901937, + 0.0188235529, 0.00199738727, 0.0112797469, -0.0014535326, -0.0397842303, + 0.00509123923, -0.0105526308, 0.019441938, 0.00334139331, 0.0365989096, + 0.00230900804, -0.0531785153, 0.010751849, 0.00935982913, 0.0384312607, + 0.0096363565, -0.0220984519, 0.00152336573, -0.0321582444, 0.0292694196, + -0.0260823835, 0.0337933078, 0.00405835779, -0.0189365819, 0.0156697482, + 0.0422811, 0.00566361845, 0.0205916148, -0.0202185232, 0.0202315152, + -0.00685426686, 0.015565997, 0.00501579419, -0.0229780059, 0.0151474588, + 0.00053506461, 0.00191962544, 0.0133708566, 0.00212826976, + -0.0481293574, -0.00169397751, 0.010574148, -0.0209433287, 0.0110796, + 0.0065798522, -0.00667468179, -0.0308473092, 0.00740753859, + -0.0221598223, 0.00512656942, 0.0195889119, 0.0556428172, + -0.00257600704, 0.00370742916, 0.00411983952, 0.0658061802, + -0.00873719063, -0.043811474, -0.00329023506, 0.00454754382, + -0.000812734361, -0.0109089259, 0.000750568288, 0.0247720573, + -0.00580075663, 0.0015899404, 0.0297173653, 0.0270499606, 0.0305265728, + 0.00772903, 0.0280447453, 0.0173935164, -0.0167136174, 0.0506906323, + -0.0101519544, 0.0353015289, -0.0114279818, 0.0264521521, -0.0293146223, + -0.00393620925, -0.0591186136, 0.00581971416, 0.00844934676, + 0.023740761, -0.0238940492, -0.0214789622, 
-0.0180806, 0.00577302137, + 0.0272331201, -0.061273057, -0.0228527281, 0.0134820975, 0.0267619714, + 0.0205253121, -0.00703013409, -0.0101485271, 0.00390325161, + -0.0137371421, -0.0489562973, -0.0127959596, 0.0242894366, -0.008546344, + -0.0133359162, -0.0476422794, -0.0142559884, -0.02404215, 0.00224654726, + -0.0387420654, -0.0043096086, 0.0413263887, 0.00274349866, 0.016133111, + 0.00154414598, -0.00368582713, -0.0215509403, 0.00142613705, + 0.0106891431, 0.0150609631, -0.00133193098, -0.0330797434, + -0.00433516176, -0.00501346029, 0.0261584409, 0.0321044028, + -0.0240512174, -0.0229956023, 0.013655521, -0.0122943223, -0.0218713544, + -0.0333506167, -0.00955368672, -0.00437915325, -0.00088354724, + 0.0101670045, 0.000485445024, 0.00869695563, -0.0196477175, + -0.0551166572, 0.0161440037, -0.0234405119, 0.0176831577, 0.00752860308, + -0.00384146953, -0.000213336752, 0.0370079428, -0.0245727263, + -0.0504004359, -0.020394396, 0.000584760157, 0.00978445262, + -0.0102744028, -0.0442414545, -0.00635753712, 0.00860350206, + 0.00640854379, -0.00895388, -0.0232050586, 0.0102997683, -0.0327524543, + -0.0326812938, -0.00686040334, -0.0435493663, 0.00337665598, + 0.0166181475, 0.0215597413, -0.0049460181, -0.0086199, -0.00389243802, + -0.0136604272, 0.00993004814, 0.00282117934, 0.00515508605, + -0.00519424165, -0.00999070145, -0.00147068209, 0.00369803212, + 0.03990601, 0.0120543828, -0.00327076251, -0.0168122407, -0.0221371967, + 0.0255554263, 0.001270968, 0.00407504896, -0.0312685817, -0.023212146, + -0.057882797, -0.0369412, -0.00547598302, -0.0362123623, -0.0358127877, + -0.000455933099, -0.00527426368, 0.010957012, 0.0205564238, + -0.00896162074, -0.00111860048, 0.00703405542, -0.0711964145, + -0.022021845, 0.0307536609, -0.0400716141, -0.00308295665, 0.0351293832, + 0.00499952724, 0.0195838138, 0.023860069, 0.0113117332, 0.0515782908, + 0.00432363432, -0.0231705476, 0.0174186751, -0.0195914954, 0.0101942793, + 0.0167301353, -0.000199523696, -0.0101376269, 
0.0182858892, + 0.0869283155, 0.0243913196, 0.0669196248, 0.0185533017, 0.0612070113, + 0.0160951559, -0.0314462, -0.0206926726, -0.00790190697, 0.0424785279, + 0.0262046363, 0.00937181525, 0.0342449658, 0.0064783697, 0.0249037649, + -0.0137123372, 0.0210513435, 0.00541570131, 0.0000486832141, + -0.0259706378, -0.0300789289, 0.00782210939, -0.0173217971, + -0.014066509, -0.0234325249, 0.0139920497, -0.00396463089, + -0.0519054495, 0.0144946137, -0.00234021782, -0.00807238277, + 0.000436707051, -0.0390327275, 0.0459594689, 0.038718313, 0.0280687064, + 0.0078713242, 0.00640660943, -0.000269388198, 0.0280300248, + -0.00306039862, -0.0437871, -0.0240810756, -0.0502388291, -0.024014743, + -0.00745526515, -0.00378584093, -0.00694045611, -0.0111657418, + 0.0311096702, -0.0250221714, -0.0223973524, -0.00616492284, + 0.00813332666, 0.0237598233, 0.0202788152, -0.0151113095, + -0.00223956537, -0.0344038941, -0.0297126081, 0.0501965, -0.00374444248, + -0.00474519609, 0.0274300929, 0.00198469171, -0.00560412, -0.0156734232, + 0.0489428192, 0.00170261681, -0.0429477394, -0.0138243567, 0.0127956429, + -0.00822142605, -0.0181886517, -0.00203846791, -0.0795444697, + -0.00548215583, 0.0227267649, -0.00599628594, 0.0141998325, 0.00994171, + 0.0386609919, -0.0162332971, 0.00768113928, 0.00947586168, 0.0384875946, + 0.0311741475, -0.0237468574, -0.026139833, -0.0375245363, 0.00990575366, + -0.013840178, 0.0436373353, -0.0241348632, -0.0223932434, 0.011154647, + 0.00320306304, 0.0073458706, -0.00155779871, -0.00879557058, + -0.00106137153, 0.0178080741, 0.01561544, 0.0171815734, 0.027200561, + -0.00627345312, 0.0630974099, -0.00833350141, 0.0210855063, + 0.0129463533, -0.00873967446, -0.0290490817, -0.00855392776, + 0.0234951116, -0.0178814046, 0.00453126756, -0.00369682023, + 0.0111865364, -0.0160761904, 0.0195884425, 0.0463038795, -0.0171864741, + -0.0374323912, 0.020723097, 0.00428105518, 0.0363555327, -0.00964644, + -0.016611604, -0.0145120844, -0.00521347532, -0.0269056894, + 
-0.020038154, -0.0224927366, -0.0139585258, -0.0562770776, -0.031290371, + 0.0204596929, 0.0259389598, 0.0441130511, -0.0189397074, -0.0140432119, + 0.00946220942, -0.0449446961, 0.016192643, -0.0074085691, 0.0101288641, + -0.000958178134, 0.0276184548, -0.0366303846, -0.0138140963, + -0.04580836, 0.0111556947, 0.0207933057, 0.0135189211, 0.0149673419, + -0.00344952615, -0.0230011381, -0.0390263125, -0.0166007336, + 0.0126238707, 0.00453956239, -0.0125422413, -0.098842442, + 0.000431181514, -0.00155359611, -0.00795070454, -0.00552688399, + -0.0132899862, -0.0123376995, 0.0058303047, 0.0157832783, -0.0131928995, + -0.028023487, 0.00560258701, -0.0321100354, 0.0231736638, -0.011532641, + -0.0356754363, 0.036068771, 0.00983694755, 0.00925140548, + -0.00488466304, 0.0307478346, -0.0323061, 0.00300523499, 0.00322845415, + 0.0155146504, 0.00867948495, 0.0256663766, 0.0324363075, -0.0159262363, + -0.0257178117, -0.00510563236, 0.0183420014, 0.015630031, -0.0045425212, + 0.0115154097, -0.00850601494, -0.026883712, 0.00459420681, 0.0260308515, + -0.00445165345, -0.0289291851, 0.0258517023, 0.0207562819, 0.0240250453, + 0.0386538692, -0.0128346132, 0.0108000804, -0.019390611, -0.0190595314, + -0.0410954468, -0.0061017354, 0.0271149799, -0.0362317339, + 0.000981613761, 0.0258738622, 0.0173024498, 0.00453861756, 0.0087875193, + 0.0478484035, 0.0569758713, 0.0169108119, -0.0114248702, -0.0229106583, + 0.00611786405, -0.0169372503, -0.0064141443, 0.00871222094, + -0.00948232599, -0.0101533411, 0.0245107636, 0.0117161162, + 0.00554489577, -0.0239735134, 0.000193733649, 0.00507702306, + 0.00331792422, -0.0412947647, -0.053955432, -0.0455085486, + -0.0119136237, -0.0157709774, -0.00261563505, 0.005211832 + ] + } + ] +} diff --git a/libs/chonky/tests/docs/page_1.png b/libs/chonky/tests/docs/page_1.png new file mode 100644 index 00000000000..8abaeb2d08f Binary files /dev/null and b/libs/chonky/tests/docs/page_1.png differ diff --git a/libs/chonky/tests/docs/table-testing.pdf 
b/libs/chonky/tests/docs/table-testing.pdf new file mode 100644 index 00000000000..f0612740195 Binary files /dev/null and b/libs/chonky/tests/docs/table-testing.pdf differ diff --git a/libs/chonky/tests/docs/table-testing.png b/libs/chonky/tests/docs/table-testing.png new file mode 100644 index 00000000000..e7134202cce Binary files /dev/null and b/libs/chonky/tests/docs/table-testing.png differ