From 438079f64758a2cb00b9389b4ae1e5f7bbd85e6b Mon Sep 17 00:00:00 2001
From: akhil-testsigma <appini.akhil@testsigma.com>
Date: Sat, 11 Jan 2025 05:39:32 +0530
Subject: [PATCH] extract pdf data using ocr

---
 extract_pdf_data_using_ocr/pom.xml            | 107 +++++++++++
 .../addons/web/OCRextractFromPDF.java         | 181 ++++++++++++++++++
 .../main/resources/testsigma-sdk.properties   |   1 +
 3 files changed, 289 insertions(+)
 create mode 100644 extract_pdf_data_using_ocr/pom.xml
 create mode 100644 extract_pdf_data_using_ocr/src/main/java/com/testsigma/addons/web/OCRextractFromPDF.java
 create mode 100644 extract_pdf_data_using_ocr/src/main/resources/testsigma-sdk.properties
diff --git a/extract_pdf_data_using_ocr/pom.xml b/extract_pdf_data_using_ocr/pom.xml
new file mode 100644
index 00000000..e08cdf10
--- /dev/null
+++ b/extract_pdf_data_using_ocr/pom.xml
@@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project
+    xmlns="http://maven.apache.org/POM/4.0.0"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>com.testsigma.addons</groupId>
+    <artifactId>extract_pdf_data_using_ocr</artifactId>
+    <version>1.0.0</version>
+    <packaging>jar</packaging>
+
+    <properties>
+        <!-- Source encoding, Java level and version properties for this build. -->
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <maven.compiler.source>11</maven.compiler.source>
+        <maven.compiler.target>11</maven.compiler.target>
+        <testsigma.sdk.version>1.2.18_cloud</testsigma.sdk.version>
+        <junit.jupiter.version>5.8.0-M1</junit.jupiter.version>
+        <testsigma.addon.maven.plugin>1.0.0</testsigma.addon.maven.plugin>
+        <maven.source.plugin.version>3.2.1</maven.source.plugin.version>
+        <lombok.version>1.18.20</lombok.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>com.testsigma</groupId>
+            <artifactId>testsigma-java-sdk</artifactId>
+            <version>${testsigma.sdk.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.projectlombok</groupId>
+            <artifactId>lombok</artifactId>
+            <version>${lombok.version}</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter-api</artifactId>
+            <version>${junit.jupiter.version}</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.testng</groupId>
+            <artifactId>testng</artifactId>
+            <version>6.14.3</version>
+        </dependency>
+        <!-- https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-java -->
+        <dependency>
+            <groupId>org.seleniumhq.selenium</groupId>
+            <artifactId>selenium-java</artifactId>
+            <version>4.14.1</version>
+        </dependency>
+        <!-- https://mvnrepository.com/artifact/io.appium/java-client -->
+        <dependency>
+            <groupId>io.appium</groupId>
+            <artifactId>java-client</artifactId>
+            <version>9.0.0</version>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-annotations</artifactId>
+            <version>2.13.0</version>
+        </dependency>
+        <!-- Used by OCRextractFromPDF: ExceptionUtils (commons-lang3) and PDF parsing (pdfbox). -->
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>3.14.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.pdfbox</groupId>
+            <artifactId>pdfbox</artifactId>
+            <version>3.0.1</version>
+        </dependency>
+    </dependencies>
+    <build>
+        <finalName>extract_pdf_data_using_ocr</finalName>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>3.2.4</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-source-plugin</artifactId>
+                <version>${maven.source.plugin.version}</version>
+                <executions>
+                    <execution>
+                        <id>attach-sources</id>
+                        <goals>
+                            <goal>jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+</project>
diff --git a/extract_pdf_data_using_ocr/src/main/java/com/testsigma/addons/web/OCRextractFromPDF.java b/extract_pdf_data_using_ocr/src/main/java/com/testsigma/addons/web/OCRextractFromPDF.java
new file mode 100644
index 00000000..d0903bc8
--- /dev/null
+++ b/extract_pdf_data_using_ocr/src/main/java/com/testsigma/addons/web/OCRextractFromPDF.java
@@ -0,0 +1,181 @@
+package com.testsigma.addons.web;
+
+import com.testsigma.sdk.*;
+import com.testsigma.sdk.annotation.Action;
+import com.testsigma.sdk.annotation.OCR;
+import com.testsigma.sdk.annotation.RunTimeData;
+import com.testsigma.sdk.annotation.TestData;
+import lombok.Data;
+import org.apache.commons.lang3.exception.ExceptionUtils;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDResources;
+import org.apache.pdfbox.pdmodel.graphics.PDXObject;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+import javax.imageio.ImageIO;
+import java.awt.image.BufferedImage;
+import java.io.*;
+import java.net.URL;
+import java.nio.file.Paths;
+import java.util.List;
+
+/**
+ * Testsigma web action that reads a PDF from a local path or an http/https URL and stores its
+ * text layer plus the OCR text of its page images in a runtime variable.
+ */
+@Data
+@Action(actionText = "Extract data from PDF pdf_path using ocr and store it in runtime variable variable-name",
+        description = "Extracts text from a PDF file and performs OCR on images embedded in the PDF.",
+        applicationType = ApplicationType.WEB)
+public class OCRextractFromPDF extends WebAction {
+
+    @TestData(reference = "pdf_path")
+    private com.testsigma.sdk.TestData pdfPath;
+
+    @TestData(reference = "variable-name", isRuntimeVariable = true)
+    private com.testsigma.sdk.TestData testdata;
+
+    @RunTimeData
+    private com.testsigma.sdk.RunTimeData runTimeData;
+
+    @OCR
+    private com.testsigma.sdk.OCR ocr;
+
+    /**
+     * Stores the PDF text layer and the OCR text, joined by one space, in the runtime variable.
+     *
+     * @return {@link Result#SUCCESS} when the text was stored, {@link Result#FAILED} on any error
+     */
+    @Override
+    protected Result execute() {
+        Result result = Result.SUCCESS;
+        File downloadedFile = null;
+        try {
+            String filePath = pdfPath.getValue().toString();
+            String variableName = testdata.getValue().toString();
+            // Remote PDFs are copied to a temporary file first; local paths are used as they are.
+            if (filePath.startsWith("http://") || filePath.startsWith("https://")) {
+                downloadedFile = downloadFile(filePath);
+            }
+            File pdfFile = (downloadedFile != null) ? downloadedFile : new File(filePath);
+            // Parse the document once and close it even when one of the extraction steps throws.
+            String combinedText;
+            try (PDDocument document = Loader.loadPDF(pdfFile)) {
+                String textFromPdf = extractTextFromPdf(document);
+                logger.info("Text from PDF: " + textFromPdf);
+                String ocrText = extractTextFromImages(document);
+                logger.info("OCR Text: " + ocrText);
+                combinedText = textFromPdf + " " + ocrText;
+            }
+            logger.info("The text extracted from the PDF: " + combinedText);
+
+            runTimeData.setKey(variableName);
+            runTimeData.setValue(combinedText);
+            setSuccessMessage("The text was successfully extracted from the PDF and images, and stored in variable: " + variableName + ". Value: " + combinedText);
+            logger.info("The text was successfully extracted from the PDF and images, and stored in variable: " + variableName);
+        } catch (Exception e) {
+            setErrorMessage("Error during PDF extraction process: " + ExceptionUtils.getStackTrace(e));
+            result = Result.FAILED;
+        } finally {
+            // Remove the downloaded copy on every path, not only after a successful run.
+            deleteTempFile(downloadedFile);
+        }
+        return result;
+    }
+
+    /** Returns the text layer of {@code document}; the caller owns and closes the document. */
+    private String extractTextFromPdf(PDDocument document) throws IOException {
+        String text = new PDFTextStripper().getText(document);
+        logger.info("Text extracted from the PDF: " + text);
+        return text;
+    }
+
+    /**
+     * Runs OCR on every image XObject listed in the page resources of {@code document} and returns
+     * the recognised fragments, each followed by a space. The caller closes the document.
+     */
+    private String extractTextFromImages(PDDocument document) throws IOException {
+        StringBuilder ocrText = new StringBuilder();
+        for (PDPage page : document.getPages()) {
+            PDResources resources = page.getResources();
+            if (resources == null) {
+                // Guard against a page without a resource dictionary; it references no images.
+                continue;
+            }
+            for (COSName name : resources.getXObjectNames()) {
+                PDXObject xObject = resources.getXObject(name);
+                if (!(xObject instanceof PDImageXObject)) {
+                    continue;
+                }
+                BufferedImage bufferedImage = ((PDImageXObject) xObject).getImage();
+                if (bufferedImage == null) {
+                    logger.info("Skipping null image");
+                    continue;
+                }
+                appendOcrText(bufferedImage, ocrText);
+            }
+        }
+        return ocrText.toString();
+    }
+
+    /**
+     * Writes the image to a temporary PNG, passes it to the OCR service and appends the recognised
+     * text. OCR failures are logged and skipped so that one unreadable image does not fail the run.
+     */
+    private void appendOcrText(BufferedImage image, StringBuilder ocrText) throws IOException {
+        File tempFile = saveBufferedImageToTempFile(image);
+        try {
+            OCRImage imageObj = new OCRImage();
+            imageObj.setOcrImageFile(tempFile);
+            List<OCRTextPoint> textPoints = ocr.extractTextFromImage(imageObj);
+            if (textPoints != null) {
+                for (OCRTextPoint textPoint : textPoints) {
+                    ocrText.append(textPoint.getText()).append(" ");
+                }
+            }
+        } catch (Exception ex) {
+            logger.warn("Error during OCR extraction: " + ExceptionUtils.getStackTrace(ex));
+        } finally {
+            deleteTempFile(tempFile);
+        }
+    }
+
+    /** Stores the image as a PNG in a new temporary file that the caller must delete. */
+    private File saveBufferedImageToTempFile(BufferedImage bufferedImage) throws IOException {
+        File tempFile = File.createTempFile("ocr_image", ".png");
+        try {
+            ImageIO.write(bufferedImage, "PNG", tempFile);
+        } catch (IOException | RuntimeException e) {
+            deleteTempFile(tempFile);
+            throw e;
+        }
+        return tempFile;
+    }
+
+    /** Downloads {@code fileUrl} into a new temporary file that the caller must delete. */
+    private File downloadFile(String fileUrl) throws IOException {
+        URL url = new URL(fileUrl);
+        // A URL path such as "/" has no last segment (getFileName() is null); fall back to ".pdf".
+        java.nio.file.Path lastSegment = Paths.get(url.getPath()).getFileName();
+        String suffix = (lastSegment == null) ? ".pdf" : lastSegment.toString();
+        File tempFile = File.createTempFile("downloaded-", suffix);
+        try (InputStream in = url.openStream(); OutputStream out = new FileOutputStream(tempFile)) {
+            in.transferTo(out);
+        } catch (IOException | RuntimeException e) {
+            deleteTempFile(tempFile);
+            throw e;
+        }
+        return tempFile;
+    }
+
+    /** Deletes a temporary file if present and logs, rather than ignores, a failed deletion. */
+    private void deleteTempFile(File file) {
+        if (file != null && file.exists() && !file.delete()) {
+            logger.warn("Could not delete temporary file: " + file.getAbsolutePath());
+        }
+    }
+}
diff --git a/extract_pdf_data_using_ocr/src/main/resources/testsigma-sdk.properties b/extract_pdf_data_using_ocr/src/main/resources/testsigma-sdk.properties
new file mode 100644
index 00000000..27b1c2eb
--- /dev/null
+++ b/extract_pdf_data_using_ocr/src/main/resources/testsigma-sdk.properties
@@ -0,0 +1 @@
+testsigma-sdk.api.key=REPLACE_WITH_YOUR_TESTSIGMA_API_KEY
\ No newline at end of file