diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java
index 52b66aaf..4cb73601 100644
--- a/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java
+++ b/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java
@@ -2,6 +2,9 @@
 
 import com.github.dbmdz.solrocr.model.SourcePointer;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 
@@ -127,7 +130,7 @@ public String readAsciiString(int start, int len) throws IOException {
     if (start + len > this.length()) {
       len = this.length() - start;
     }
-    StringBuilder sb = new StringBuilder();
+    StringBuilder sb = new StringBuilder(len);
     int numRead = 0;
     while (numRead < len) {
       Section section = getAsciiSection(start + numRead);
@@ -242,4 +245,77 @@ public Section getAsciiSection(int offset) throws IOException {
 
     return section;
   }
+
+  /**
+   * Get a {@link java.io.Reader} instance for this SourceReader.
+   *
+   * <p>This is a generic implementation that should be overridden with a more efficient
+   * source-specific implementation, if available. Bytes are decoded as UTF-8, and closing the
+   * returned reader also closes this SourceReader.
+   */
+  public Reader getReader() {
+    return new InputStreamReader(
+        new InputStream() {
+          // Offset in the source of the next byte to be returned
+          int position = 0;
+          // Read-ahead buffer, used by the single-byte read() only
+          final byte[] buf = new byte[sectionSize];
+          // buf holds the source bytes in [currentSectionStart, currentSectionEnd)
+          int currentSectionStart = -1;
+          int currentSectionEnd = -1;
+
+          @Override
+          public int read() throws IOException {
+            if (position >= length()) {
+              return -1;
+            }
+            // The end is exclusive, so a position equal to it needs a refill as well
+            if (position >= currentSectionEnd) {
+              int numRead = BaseSourceReader.this.readBytes(buf, 0, position, sectionSize);
+              if (numRead <= 0) {
+                return -1;
+              }
+              currentSectionStart = position;
+              currentSectionEnd = position + numRead;
+            }
+            int out = buf[position - currentSectionStart] & 0xFF;
+            position++;
+            return out;
+          }
+
+          @Override
+          public int read(byte[] b, int off, int len) throws IOException {
+            if (len == 0) {
+              return 0;
+            }
+            if (position >= length()) {
+              return -1;
+            }
+            // Fill the caller's array directly, the read-ahead buffer is not involved
+            int toRead = Math.min(len, length() - position);
+            int numRead = BaseSourceReader.this.readBytes(b, off, position, toRead);
+            if (numRead > 0) {
+              this.position += numRead;
+            }
+            return numRead;
+          }
+
+          @Override
+          public long skip(long n) throws IOException {
+            if (n <= 0) {
+              return 0;
+            }
+            // Clamp before narrowing, casting n to int first overflows for large values
+            int toSkip = (int) Math.min((long) length() - position, n);
+            this.position += toSkip;
+            return toSkip;
+          }
+
+          @Override
+          public void close() throws IOException {
+            BaseSourceReader.this.close();
+          }
+        },
+        StandardCharsets.UTF_8);
+  }
 }
diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/ExitingSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/ExitingSourceReader.java
index b9df381b..d81b77e1 100644
--- a/src/main/java/com/github/dbmdz/solrocr/reader/ExitingSourceReader.java
+++ b/src/main/java/com/github/dbmdz/solrocr/reader/ExitingSourceReader.java
@@ -2,6 +2,7 @@
 
 import com.github.dbmdz.solrocr.model.SourcePointer;
 import java.io.IOException;
+import java.io.Reader;
 import java.util.Locale;
 import org.apache.lucene.index.QueryTimeout;
 
@@ -74,4 +75,10 @@ public Section getAsciiSection(int offset) throws IOException {
     checkAndThrow();
     return input.getAsciiSection(offset);
   }
+
+  @Override
+  public Reader getReader() throws IOException {
+    // NOTE: Only delegates, reads through the returned Reader are not checked for timeouts
+    return input.getReader();
+  }
 }
diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/FileSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/FileSourceReader.java
index bce8898a..7b74fd57 100644
--- a/src/main/java/com/github/dbmdz/solrocr/reader/FileSourceReader.java
+++ b/src/main/java/com/github/dbmdz/solrocr/reader/FileSourceReader.java
@@ -2,8 +2,11 @@
 
 import com.github.dbmdz.solrocr.model.SourcePointer;
 import java.io.IOException;
+import java.io.Reader;
 import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
 import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
@@ -43,4 +46,10 @@ public void close() throws IOException {
   public String getIdentifier() {
     return this.path.toString();
   }
+
+  @Override
+  public Reader getReader() {
+    // Decodes from the current position of the channel; closing the reader closes the channel
+    return Channels.newReader(this.chan, StandardCharsets.UTF_8.newDecoder(), -1);
+  }
 }
diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileReader.java
deleted file mode 100644
index 1193b0f0..00000000
--- a/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileReader.java
+++ /dev/null
@@ -1,74 +0,0 @@
-package com.github.dbmdz.solrocr.reader;
-
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardOpenOption;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Locale;
-import java.util.Queue;
-
-public class MultiFileReader extends Reader {
-  private final Queue<Path> remainingSources;
-  private Reader currentReader;
-
-  public MultiFileReader(List<Path> sourcePaths) throws FileNotFoundException {
-    for (Path path : sourcePaths) {
-      if (!path.toFile().exists()) {
-        throw new FileNotFoundException(
-            String.format(Locale.US, "File at %s could not be found", path));
-      } else if (path.toFile().isDirectory()) {
-        throw new FileNotFoundException(
-            String.format(Locale.US, "File at %s is a directory", path));
-      }
-    }
-    this.remainingSources = new LinkedList<>(sourcePaths);
-    this.currentReader =
-        new InputStreamReader(
-            new FileInputStream(remainingSources.remove().toFile()), StandardCharsets.UTF_8);
-  }
-
-  @Override
-  public int read(char[] cbuf, int off, int len) throws IOException {
-    if (this.currentReader == null) {
-      // No readers available, nothing to read
-      return -1;
-    }
-    int numRead = 0;
-    while (numRead < len && currentReader != null) {
-      int read = this.currentReader.read(cbuf, off, len);
-      if (read < len) {
-        this.currentReader.close();
-        if (this.remainingSources.isEmpty()) {
-          // No more readers, return what was read so far
-          this.currentReader = null;
-        } else {
-          this.currentReader =
-              new InputStreamReader(
-                  Files.newInputStream(remainingSources.remove(), StandardOpenOption.READ),
-                  StandardCharsets.UTF_8);
-        }
-      }
-      if (read < 0) {
-        continue;
-      }
-      numRead += read;
-      off += read;
-      len -= read;
-    }
-    return numRead > 0 ? numRead : -1;
-  }
-
-  @Override
-  public void close() throws IOException {
-    if (this.currentReader != null) {
-      this.currentReader.close();
-    }
-  }
-}
diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReader.java
index deb0c681..7a106689 100644
--- a/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReader.java
+++ b/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReader.java
@@ -3,9 +3,14 @@
 import com.github.dbmdz.solrocr.model.SourcePointer;
 import com.github.dbmdz.solrocr.util.ArrayUtils;
 import java.io.IOException;
+import java.io.Reader;
 import java.lang.invoke.MethodHandles;
 import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
 import java.nio.channels.FileChannel;
+import java.nio.channels.ReadableByteChannel;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
@@ -124,4 +129,56 @@ public String getIdentifier() {
             .map(p -> p.toAbsolutePath().toString())
             .collect(Collectors.joining(", ")));
   }
+
+  @Override
+  public Reader getReader() {
+    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
+    ReadableByteChannel multiFileChannel =
+        new ReadableByteChannel() {
+          private boolean closed = false;
+          // Offset in the concatenated sources of the next byte to be read
+          private int position = 0;
+
+          @Override
+          public boolean isOpen() {
+            return !closed;
+          }
+
+          @Override
+          public void close() throws IOException {
+            MultiFileSourceReader.this.close();
+            this.closed = true;
+          }
+
+          @Override
+          public int read(ByteBuffer byteBuffer) throws IOException {
+            if (!byteBuffer.hasArray()) {
+              throw new UnsupportedOperationException(
+                  "Currently only ByteBuffers backed by an array are supported.");
+            }
+            if (position >= MultiFileSourceReader.this.length()) {
+              // The channel contract requires -1 at the end of the stream
+              return -1;
+            }
+            if (!byteBuffer.hasRemaining()) {
+              return 0;
+            }
+            // The buffer can hold pending bytes in front of its position (e.g. the start of a
+            // multi-byte sequence), so the write has to start at the position, not at index 0.
+            int numRead =
+                MultiFileSourceReader.this.readBytes(
+                    byteBuffer.array(),
+                    byteBuffer.arrayOffset() + byteBuffer.position(),
+                    position,
+                    byteBuffer.remaining());
+            if (numRead > 0) {
+              byteBuffer.position(byteBuffer.position() + numRead);
+              this.position += numRead;
+            }
+            return numRead;
+          }
+        };
+
+    return Channels.newReader(multiFileChannel, decoder, -1);
+  }
 }
diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java
index 895d3485..e3fba2ba 100644
--- a/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java
+++ b/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java
@@ -2,6 +2,7 @@
 
 import com.github.dbmdz.solrocr.model.SourcePointer;
 import java.io.IOException;
+import java.io.Reader;
 
 /** API for reading data from a source. */
 public interface SourceReader {
@@ -36,6 +37,13 @@ public interface SourceReader {
    */
   Section getAsciiSection(int offset) throws IOException;
 
+  /**
+   * Get a {@link Reader} for the character content of this source.
+   *
+   * @throws IOException if the reader could not be created
+   */
+  Reader getReader() throws IOException;
+
   class Section {
     public final int start;
     public final int end;
diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/StringSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/StringSourceReader.java
index 7263a3b1..b77362ce 100644
--- a/src/main/java/com/github/dbmdz/solrocr/reader/StringSourceReader.java
+++ b/src/main/java/com/github/dbmdz/solrocr/reader/StringSourceReader.java
@@ -1,6 +1,9 @@
 package com.github.dbmdz.solrocr.reader;
 
 import com.github.dbmdz.solrocr.model.SourcePointer;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
 
 /** SourceReader that reads from a String. */
 public class StringSourceReader implements SourceReader {
@@ -28,6 +31,11 @@ public Section getAsciiSection(int offset) {
     return new Section(0, str.length(), str);
   }
 
+  @Override
+  public Reader getReader() throws IOException {
+    return new StringReader(str);
+  }
+
   @Override
   public int length() {
     return this.str.length();
diff --git a/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java b/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java
index 019887b7..147724ca 100644
--- a/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java
+++ b/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java
@@ -2,19 +2,15 @@
 
 import com.github.dbmdz.solrocr.lucene.filters.ExternalUtf8ContentFilter;
 import com.github.dbmdz.solrocr.model.SourcePointer;
-import com.github.dbmdz.solrocr.reader.MultiFileReader;
 import com.github.dbmdz.solrocr.util.Utf8;
 import com.google.common.collect.ImmutableList;
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
 import java.nio.file.StandardOpenOption;
 import java.util.List;
 import java.util.Locale;
@@ -67,24 +63,16 @@ public Reader create(Reader input) {
       // This is very expensive, but we need this since all IO from here on out is character-based.
       toCharOffsets(pointer);
 
-      Reader r;
       if (pointer.sources.isEmpty()) {
         throw new RuntimeException(
             "No source files could be determined from pointer. "
                 + "Is it pointing to files that exist and are readable? "
                 + "Pointer was: "
                 + ptrStr);
-      } else if (pointer.sources.size() > 1) {
-        r =
-            new MultiFileReader(
-                pointer.sources.stream().map(s -> s.path).collect(Collectors.toList()));
-      } else {
-        r =
-            new InputStreamReader(
-                Files.newInputStream(pointer.sources.get(0).path, StandardOpenOption.READ),
-                StandardCharsets.UTF_8);
       }
-
+      // NOTE: Section size doesn't matter much, since we only use the APIs for unaligned
+      //       reads through the Reader implementations.
+      Reader r = pointer.getReader(512 * 1024, 0).getReader();
       List<SourcePointer.Region> charRegions =
           pointer.sources.stream().flatMap(s -> s.regions.stream()).collect(Collectors.toList());
       return new ExternalUtf8ContentFilter(new BufferedReader(r), charRegions, ptrStr);
diff --git a/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java b/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java
index 2c005c91..ce8b4d50 100644
--- a/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java
+++ b/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java
@@ -4,7 +4,6 @@
 
 import com.github.dbmdz.solrocr.lucene.filters.ExternalUtf8ContentFilter;
 import com.github.dbmdz.solrocr.model.SourcePointer.Region;
-import com.github.dbmdz.solrocr.reader.MultiFileReader;
 import com.github.dbmdz.solrocr.util.Utf8;
 import com.google.common.collect.ImmutableList;
 import java.io.BufferedReader;
@@ -149,17 +148,4 @@ public void multipleLongerFiles() throws IOException {
       assertThat(filtered).isEqualTo(fullText);
     }
   }
-
-  @Test
-  public void testMultiFileReader() throws IOException {
-    Path aPath = Paths.get("src/test/resources/data/alto_multi/1865-05-24_01-00001.xml");
-    Path bPath = Paths.get("src/test/resources/data/alto_multi/1865-05-24_01-00002.xml");
-    try (MultiFileReader r = new MultiFileReader(ImmutableList.of(aPath, bPath))) {
-      String fromReader = IOUtils.toString(r);
-      String aText = new String(Files.readAllBytes(aPath), StandardCharsets.UTF_8);
-      String bText = new String(Files.readAllBytes(bPath), StandardCharsets.UTF_8);
-      String fromFiles = aText + bText;
-      assertThat(fromReader).isEqualTo(fromFiles);
-    }
-  }
 }
diff --git a/src/test/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReaderTest.java b/src/test/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReaderTest.java
index 84872bab..988284ec 100644
--- a/src/test/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReaderTest.java
+++ b/src/test/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReaderTest.java
@@ -7,6 +7,7 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.channels.SeekableByteChannel;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.DirectoryStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -14,6 +15,8 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.stream.Collectors;
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.ValueSource;
 
@@ -116,4 +119,23 @@ public void shouldReadCorrectlyAlignedSections(int sectionSize) throws IOExcepti
     assertThat(section.end).isEqualTo(sectionStart + sectionSize);
     assertThat(section.text).isEqualTo(expectedStr);
   }
+
+  @Test
+  public void shouldReturnValidReader() throws IOException {
+    SourceReader reader =
+        new MultiFileSourceReader(filePaths, pointer, 512 * 1024, maxCacheEntries);
+    String fromReader = IOUtils.toString(reader.getReader());
+    String fromFiles =
+        filePaths.stream()
+            .map(
+                fp -> {
+                  try {
+                    return new String(Files.readAllBytes(fp), StandardCharsets.UTF_8);
+                  } catch (IOException e) {
+                    throw new RuntimeException(e);
+                  }
+                })
+            .collect(Collectors.joining(""));
+    assertThat(fromReader).isEqualTo(fromFiles);
+  }
 }