Skip to content

Commit

Permalink
Merge pull request #111 from Myyyvothrr/main
Browse files Browse the repository at this point in the history
PDF/bytes reader
  • Loading branch information
abrami authored Feb 6, 2025
2 parents 74c930d + b6e39bc commit f353478
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -1683,7 +1683,7 @@ public TypeSystemDescription instantiate_pipeline() throws Exception {
private JCas run_pipeline(String name, JCas jc, long documentWaitTime, Vector<PipelinePart> pipeline) throws Exception {
progress.set(0);

DUUIDocument document = new DUUIDocument("Text", "Text", jc.getDocumentText().getBytes(StandardCharsets.UTF_8));
DUUIDocument document = new DUUIDocument("Text", "Text", jc);
if (JCasUtil.select(jc, DocumentMetaData.class).isEmpty()) {
DocumentMetaData dmd = DocumentMetaData.create(jc);
dmd.setDocumentId(document.getName());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,24 @@ public DUUIDocument(String name, String path, byte[] bytes) {
this.size = bytes.length;
}

public DUUIDocument(String name, String path, JCas jCas) {
if (jCas.getDocumentText() != null) {
this.bytes = jCas.getDocumentText().getBytes(StandardCharsets.UTF_8);
}
else if (jCas.getSofaDataStream() != null) {
try {
this.bytes = jCas.getSofaDataStream().readAllBytes();
}
catch (Exception e) {
e.printStackTrace();
}
}

this.name = name;
this.path = path;
this.size = bytes.length;
}

@Override
public boolean equals(Object o) {
if (o == this) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package org.texttechnologylab.DockerUnifiedUIMAInterface.io.reader.bytes;

import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.ByteArray;
import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import org.dkpro.core.api.resources.CompressionUtils;

import java.io.InputStream;

/**
 * Collection reader that places the raw bytes of each resource into the CAS
 * as a sofa data array instead of setting document text.
 */
public class DUUIBytesReader extends JCasResourceCollectionReader_ImplBase {
    /** Name of the optional configuration parameter holding the MIME type of the sofa data. */
    public static final String PARAM_MIME_TYPE = "mimeType";

    /** MIME type passed along with the sofa data array; may be {@code null} when not configured. */
    @ConfigurationParameter(name = PARAM_MIME_TYPE, mandatory = false)
    protected String mimeType;

    /**
     * Takes the next resource, initializes the CAS for it and stores the
     * complete content of the resource as the CAS's sofa data array.
     *
     * @param jCas CAS to fill
     * @throws CollectionException wrapping any failure raised while reading or storing the content
     */
    @Override
    public void getNext(JCas jCas) throws CollectionException {
        final Resource resource = nextFile();
        initCas(jCas, resource);

        // The stream is obtained via CompressionUtils (presumably unwrapping
        // compressed resources based on their location - confirm).
        try (InputStream stream =
                     CompressionUtils.getInputStream(resource.getLocation(), resource.getInputStream())) {
            final byte[] payload = stream.readAllBytes();
            final int length = payload.length;

            final ByteArray sofaData = new ByteArray(jCas, length);
            sofaData.copyFromArray(payload, 0, 0, length);

            jCas.setSofaDataArray(sofaData, mimeType);
        } catch (Exception failure) {
            throw new CollectionException(failure);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package org.texttechnologylab.DockerUnifiedUIMAInterface.io.reader.pdf;

import org.apache.uima.UimaContext;
import org.apache.uima.resource.ResourceInitializationException;
import org.texttechnologylab.DockerUnifiedUIMAInterface.io.reader.bytes.DUUIBytesReader;

/**
 * Bytes reader that falls back to the PDF MIME type when none was configured.
 */
public class DUUIPDFReader extends DUUIBytesReader {
    /** MIME type used when {@code mimeType} is still {@code null} after the parent initialization. */
    private static final String DEFAULT_MIME_TYPE = "application/pdf";

    /**
     * Runs the parent initialization and then applies the PDF default to an
     * unset MIME type; an already set MIME type is left unchanged.
     *
     * @param aContext UIMA context handed on to the parent initialization
     * @throws ResourceInitializationException if the parent initialization fails
     */
    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);

        mimeType = (mimeType != null) ? mimeType : DEFAULT_MIME_TYPE;
    }
}

0 comments on commit f353478

Please sign in to comment.