Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat/CR-1842-extract pdf data using ocr #84

Merged
merged 1 commit into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions extract_pdf_data_using_ocr/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Maven build descriptor for the "extract_pdf_data_using_ocr" Testsigma add-on.
Produces a jar (shade goal bound to the package phase) plus an attached sources jar.
-->
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Artifact coordinates of the add-on. -->
<groupId>com.testsigma.addons</groupId>
<artifactId>extract_pdf_data_using_ocr</artifactId>
<version>1.0.0</version>
<packaging>jar</packaging>

<!-- Compiler level (Java 11), source encoding, and versions shared by the sections below. -->
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<testsigma.sdk.version>1.2.18_cloud</testsigma.sdk.version>
<!-- NOTE(review): 5.8.0-M1 is a milestone build; consider moving to a GA release. -->
<junit.jupiter.version>5.8.0-M1</junit.jupiter.version>
<!-- NOTE(review): testsigma.addon.maven.plugin is not referenced anywhere in this POM; confirm it is still needed. -->
<testsigma.addon.maven.plugin>1.0.0</testsigma.addon.maven.plugin>
<maven.source.plugin.version>3.2.1</maven.source.plugin.version>
<lombok.version>1.18.20</lombok.version>

</properties>

<dependencies>
<!-- Testsigma SDK: provides WebAction, Result, TestData, RunTimeData and the OCR types used by the action class. -->
<dependency>
<groupId>com.testsigma</groupId>
<artifactId>testsigma-java-sdk</artifactId>
<version>${testsigma.sdk.version}</version>
</dependency>
<!-- Lombok (@Data on the action class); marked optional so it is not passed on transitively. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>${lombok.version}</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<version>${junit.jupiter.version}</version>
<scope>test</scope>
</dependency>
<!--
NOTE(review): testng, selenium-java and java-client below declare no <scope>, so they default to
compile scope. Confirm they are meant to be compile-time dependencies of the add-on jar.
-->
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.14.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-java -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>4.14.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/io.appium/java-client -->
<dependency>
<groupId>io.appium</groupId>
<artifactId>java-client</artifactId>
<version>9.0.0</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.13.0</version>
</dependency>
<!-- commons-lang3: ExceptionUtils is used by the action class to render stack traces. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.14.0</version>
</dependency>
<!-- PDFBox 3.x: Loader, PDDocument and PDFTextStripper are used to read the PDF and its embedded images. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.1</version>
</dependency>

</dependencies>
<build>
<finalName>extract_pdf_data_using_ocr</finalName>
<plugins>
<!-- Runs the shade goal during the package phase. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.2.4</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Attaches a sources jar to the build (execution id: attach-sources). -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>${maven.source.plugin.version}</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
package com.testsigma.addons.web;

import com.testsigma.sdk.*;
import com.testsigma.sdk.annotation.Action;
import com.testsigma.sdk.annotation.OCR;
import com.testsigma.sdk.annotation.RunTimeData;
import com.testsigma.sdk.annotation.TestData;
import lombok.Data;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.net.URL;
import java.nio.file.Paths;
import java.util.List;

@Data
@Action(actionText = "Extract data from PDF pdf_path using ocr and store it in runtime variable variable-name",
description = "Extracts text from a PDF file and performs OCR on images embedded in the PDF.",
applicationType = ApplicationType.WEB)
public class OCRextractFromPDF extends WebAction {

@TestData(reference = "pdf_path")
private com.testsigma.sdk.TestData pdfPath;

@TestData(reference = "variable-name", isRuntimeVariable = true)
private com.testsigma.sdk.TestData testdata;

@RunTimeData
private com.testsigma.sdk.RunTimeData runTimeData;

@OCR
private com.testsigma.sdk.OCR ocr;

@Override
protected Result execute() {
Result result = Result.SUCCESS;
try {
String filePath = pdfPath.getValue().toString();
File pdfFile;

// Check if the path is a URL
if (filePath.startsWith("http://") || filePath.startsWith("https://")) {
pdfFile = downloadFile(filePath);
} else {
pdfFile = new File(filePath);
}


//Extract text from PDF document
String textFromPdf = extractTextFromPdf(pdfFile.getAbsolutePath());
logger.info("Text from PDF: " + textFromPdf);

//Extract images and perform OCR to extract text from images
String ocrText = extractTextFromImages(pdfFile.getAbsolutePath());
logger.info("OCR Text: " + ocrText);

// Combine the text from the PDF and OCR text
String combinedText = textFromPdf + " " + ocrText;
logger.info("The text extracted from the PDF: " + combinedText);

runTimeData.setKey(testdata.getValue().toString());
runTimeData.setValue(combinedText);

setSuccessMessage("The text was successfully extracted from the PDF and images, and stored in variable: " + testdata.getValue().toString() + ". Value: " + combinedText);
logger.info("The text was successfully extracted from the PDF and images, and stored in variable: " + testdata.getValue().toString());

if (filePath.startsWith("http://") || filePath.startsWith("https://")) {
pdfFile.delete();
}

} catch (Exception e) {
setErrorMessage("Error during PDF extraction process: " + ExceptionUtils.getStackTrace(e));
result = Result.FAILED;
}
return result;
}

private String extractTextFromPdf(String pdfFilePath) throws IOException {
// Extract text from PDF
PDDocument document = Loader.loadPDF(new File(pdfFilePath));
PDFTextStripper stripper = new PDFTextStripper();
String text = stripper.getText(document);
document.close();
logger.info("Text extracted from the PDF: " + text);
return text;
}

private String extractTextFromImages(String pdfFilePath) throws IOException {
// Extract text from images using OCR
PDDocument document = Loader.loadPDF(new File(pdfFilePath));
StringBuilder ocrText = new StringBuilder();

// Iterate through pages and extract images
for (PDPage page : document.getPages()) {
// Get the resources on the page, which include images
PDResources resources = page.getResources();

// Iterate through the resources and extract images
for (COSName name : resources.getXObjectNames()) {
PDXObject xObject = resources.getXObject(name);

if (xObject instanceof PDImageXObject) {
// Handle images here
PDImageXObject imageXObject = (PDImageXObject) xObject;

// Convert the PDImageXObject to a BufferedImage
BufferedImage bufferedImage = imageXObject.getImage();
if(bufferedImage == null) {
logger.info("Skipping null image");
continue;
}

// Save the BufferedImage to a temporary file
File tempFile = saveBufferedImageToTempFile(bufferedImage);

// Pass the temp file to OCR
OCRImage imageObj = new OCRImage();
imageObj.setOcrImageFile(tempFile); // Set the temporary image file

// Perform OCR on the image
try {
List<OCRTextPoint> textPoints = ocr.extractTextFromImage(imageObj);
if(textPoints != null){
for (OCRTextPoint textPoint : textPoints) {
ocrText.append(textPoint.getText()).append(" ");
}
}
} catch (Exception ex) {
logger.warn("Error during OCR extraction: " + ExceptionUtils.getStackTrace(ex));
}

// Delete the temporary file after processing
tempFile.delete();
}
}
}
document.close();
return ocrText.toString();
}


private File saveBufferedImageToTempFile(BufferedImage bufferedImage) throws IOException {
// Create a temporary file
File tempFile = File.createTempFile("ocr_image", ".png");

// Write the BufferedImage to the temp file as PNG
ImageIO.write(bufferedImage, "PNG", tempFile);

return tempFile;
}

private File downloadFile(String fileUrl) throws IOException {
URL url = new URL(fileUrl);
String fileName = Paths.get(url.getPath()).getFileName().toString();
File tempFile = File.createTempFile("downloaded-", fileName);
try (InputStream in = url.openStream();
OutputStream out = new FileOutputStream(tempFile)) {
byte[] buffer = new byte[1024];
int bytesRead;
while ((bytesRead = in.read(buffer)) != -1) {
out.write(buffer, 0, bytesRead);
}
}
return tempFile;
}
}





Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# NOTE(review): this value looks like a bearer token (JWT-shaped) committed to version control.
# Confirm whether it is a live credential; if so, rotate it and supply the key from a local,
# git-ignored file or a CI secret instead of checking it in.
testsigma-sdk.api.key=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIyMjMyMmM2Ni04NWYzLWIyN2UtN2FiOS0zM2U2M2Q4OWM1MGIiLCJ1bmlxdWVJZCI6IjQxNDMiLCJpZGVudGl0eUFjY291bnRVVUlkIjoiMzUifQ.diLO-MgxgOtMxFRIbmhU2tLsdHYavCjpdg4ma_sXJVqUot0oundpdCCIc71GuEsWqPfCTS0SpH7-6QKVtfmDWA
Loading