diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java
index 52b66aaf..4cb73601 100644
--- a/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java
+++ b/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java
@@ -2,6 +2,9 @@
import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
@@ -127,7 +130,7 @@ public String readAsciiString(int start, int len) throws IOException {
if (start + len > this.length()) {
len = this.length() - start;
}
- StringBuilder sb = new StringBuilder();
+ StringBuilder sb = new StringBuilder(len);
int numRead = 0;
while (numRead < len) {
Section section = getAsciiSection(start + numRead);
@@ -242,4 +245,54 @@ public Section getAsciiSection(int offset) throws IOException {
return section;
}
+
+ /**
+  * Get a {@link java.io.Reader} instance for this SourceReader that decodes the source's bytes
+  * as UTF-8.
+  *
+  * <p>This is a generic implementation that should be overridden with a more efficient
+  * source-specific implementation, if available.
+  *
+  * <p>Closing the returned reader also closes this SourceReader.
+  *
+  * @return a reader over the content of the source, starting at the first byte
+  */
+ public Reader getReader() {
+   return new InputStreamReader(
+       new InputStream() {
+         /** Offset of the next byte to hand out, relative to the start of the source. */
+         private int position = 0;
+
+         /** Buffer that is only used to serve single-byte reads. */
+         private final byte[] buf = new byte[sectionSize];
+
+         /** Source offset of {@code buf[0]}, -1 as long as nothing has been buffered. */
+         private int currentSectionStart = -1;
+
+         /** Source offset just past the last valid byte in {@code buf} (exclusive). */
+         private int currentSectionEnd = -1;
+
+         @Override
+         public int read() throws IOException {
+           int remaining = length() - position;
+           if (remaining <= 0) {
+             return -1;
+           }
+           // The end offset is exclusive, so the buffer must also be refilled when the position
+           // is exactly at the end of the buffered range. This also covers positions that were
+           // moved past the buffered range by a bulk read or a skip.
+           if (position >= currentSectionEnd) {
+             int numRead =
+                 BaseSourceReader.this.readBytes(
+                     buf, 0, position, Math.min(buf.length, remaining));
+             if (numRead <= 0) {
+               return -1;
+             }
+             currentSectionStart = position;
+             currentSectionEnd = position + numRead;
+           }
+           int out = buf[position - currentSectionStart] & 0xFF;
+           position++;
+           return out;
+         }
+
+         @Override
+         public int read(byte[] b, int off, int len) throws IOException {
+           if (off < 0 || len < 0 || len > b.length - off) {
+             throw new IndexOutOfBoundsException();
+           }
+           if (len == 0) {
+             return 0;
+           }
+           int remaining = length() - position;
+           if (remaining <= 0) {
+             return -1;
+           }
+           // Read directly into the caller's array, the internal buffer is not involved here.
+           int numRead =
+               BaseSourceReader.this.readBytes(b, off, position, Math.min(len, remaining));
+           if (numRead <= 0) {
+             // A stream must never report zero bytes for a non-empty request, so an empty
+             // read from the source is reported as the end of the stream.
+             return -1;
+           }
+           this.position += numRead;
+           return numRead;
+         }
+
+         @Override
+         public long skip(long n) throws IOException {
+           if (n <= 0) {
+             return 0;
+           }
+           // Compare as long values, casting n down to int first could overflow.
+           long toSkip = Math.min((long) length() - position, n);
+           if (toSkip <= 0) {
+             return 0;
+           }
+           this.position += (int) toSkip;
+           return toSkip;
+         }
+
+         @Override
+         public void close() throws IOException {
+           BaseSourceReader.this.close();
+         }
+       },
+       StandardCharsets.UTF_8);
+ }
}
diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/ExitingSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/ExitingSourceReader.java
index b9df381b..d81b77e1 100644
--- a/src/main/java/com/github/dbmdz/solrocr/reader/ExitingSourceReader.java
+++ b/src/main/java/com/github/dbmdz/solrocr/reader/ExitingSourceReader.java
@@ -2,6 +2,7 @@
import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
+import java.io.Reader;
import java.util.Locale;
import org.apache.lucene.index.QueryTimeout;
@@ -74,4 +75,9 @@ public Section getAsciiSection(int offset) throws IOException {
checkAndThrow();
return input.getAsciiSection(offset);
}
+
+ /**
+  * Get a {@link Reader} from the wrapped source reader.
+  *
+  * <p>Runs the same {@code checkAndThrow()} guard that {@link #getAsciiSection(int)} runs before
+  * delegating. The guard only runs once when the reader is requested, reads that are made
+  * through the returned reader are not guarded.
+  */
+ @Override
+ public Reader getReader() throws IOException {
+   checkAndThrow();
+   return input.getReader();
+ }
}
diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/FileSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/FileSourceReader.java
index bce8898a..7b74fd57 100644
--- a/src/main/java/com/github/dbmdz/solrocr/reader/FileSourceReader.java
+++ b/src/main/java/com/github/dbmdz/solrocr/reader/FileSourceReader.java
@@ -2,8 +2,11 @@
import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
+import java.io.Reader;
import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
@@ -43,4 +46,9 @@ public void close() throws IOException {
public String getIdentifier() {
return this.path.toString();
}
+
+ /**
+  * Get a {@link Reader} that decodes the bytes of the underlying file channel as UTF-8.
+  *
+  * <p>The reader pulls its bytes straight from the channel of this source reader.
+  */
+ @Override
+ public Reader getReader() {
+   // Naming the charset makes Channels create a fresh decoder for it with the default buffer
+   // size, which is exactly what passing a new decoder and -1 does.
+   final String charsetName = StandardCharsets.UTF_8.name();
+   return Channels.newReader(this.chan, charsetName);
+ }
}
diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileReader.java
deleted file mode 100644
index 1193b0f0..00000000
--- a/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileReader.java
+++ /dev/null
@@ -1,74 +0,0 @@
-package com.github.dbmdz.solrocr.reader;
-
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardOpenOption;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Locale;
-import java.util.Queue;
-
-public class MultiFileReader extends Reader {
- private final Queue remainingSources;
- private Reader currentReader;
-
- public MultiFileReader(List sourcePaths) throws FileNotFoundException {
- for (Path path : sourcePaths) {
- if (!path.toFile().exists()) {
- throw new FileNotFoundException(
- String.format(Locale.US, "File at %s could not be found", path));
- } else if (path.toFile().isDirectory()) {
- throw new FileNotFoundException(
- String.format(Locale.US, "File at %s is a directory", path));
- }
- }
- this.remainingSources = new LinkedList<>(sourcePaths);
- this.currentReader =
- new InputStreamReader(
- new FileInputStream(remainingSources.remove().toFile()), StandardCharsets.UTF_8);
- }
-
- @Override
- public int read(char[] cbuf, int off, int len) throws IOException {
- if (this.currentReader == null) {
- // No readers available, nothing to read
- return -1;
- }
- int numRead = 0;
- while (numRead < len && currentReader != null) {
- int read = this.currentReader.read(cbuf, off, len);
- if (read < len) {
- this.currentReader.close();
- if (this.remainingSources.isEmpty()) {
- // No more readers, return what was read so far
- this.currentReader = null;
- } else {
- this.currentReader =
- new InputStreamReader(
- Files.newInputStream(remainingSources.remove(), StandardOpenOption.READ),
- StandardCharsets.UTF_8);
- }
- }
- if (read < 0) {
- continue;
- }
- numRead += read;
- off += read;
- len -= read;
- }
- return numRead > 0 ? numRead : -1;
- }
-
- @Override
- public void close() throws IOException {
- if (this.currentReader != null) {
- this.currentReader.close();
- }
- }
-}
diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReader.java
index deb0c681..7a106689 100644
--- a/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReader.java
+++ b/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReader.java
@@ -3,9 +3,14 @@
import com.github.dbmdz.solrocr.model.SourcePointer;
import com.github.dbmdz.solrocr.util.ArrayUtils;
import java.io.IOException;
+import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
+import java.nio.channels.ReadableByteChannel;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
@@ -124,4 +129,43 @@ public String getIdentifier() {
.map(p -> p.toAbsolutePath().toString())
.collect(Collectors.joining(", ")));
}
+
+ /**
+  * Get a {@link Reader} that decodes the bytes of this source as UTF-8, starting at offset 0.
+  *
+  * <p>Closing the returned reader also closes this source reader.
+  */
+ @Override
+ public Reader getReader() {
+   CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
+   ReadableByteChannel multiFileChannel =
+       new ReadableByteChannel() {
+         private boolean closed = false;
+
+         /** Offset of the next byte to read, relative to the start of the source. */
+         private int position = 0;
+
+         @Override
+         public boolean isOpen() {
+           return !closed;
+         }
+
+         @Override
+         public void close() throws IOException {
+           MultiFileSourceReader.this.close();
+           this.closed = true;
+         }
+
+         @Override
+         public int read(ByteBuffer byteBuffer) throws IOException {
+           int wanted = byteBuffer.remaining();
+           if (wanted == 0) {
+             return 0;
+           }
+           int available = MultiFileSourceReader.this.length() - position;
+           if (available <= 0) {
+             return -1;
+           }
+           int toRead = Math.min(wanted, available);
+           int numRead;
+           if (byteBuffer.hasArray()) {
+             // New bytes have to go behind the bytes that are still pending in the buffer
+             // (e.g. the start of a multi-byte sequence), so the write offset must include
+             // the current buffer position and not only the array offset.
+             numRead =
+                 MultiFileSourceReader.this.readBytes(
+                     byteBuffer.array(),
+                     byteBuffer.arrayOffset() + byteBuffer.position(),
+                     position,
+                     toRead);
+             if (numRead > 0) {
+               byteBuffer.position(byteBuffer.position() + numRead);
+             }
+           } else {
+             // No accessible backing array (e.g. a direct buffer), copy via a temporary array
+             byte[] tmp = new byte[toRead];
+             numRead = MultiFileSourceReader.this.readBytes(tmp, 0, position, toRead);
+             if (numRead > 0) {
+               byteBuffer.put(tmp, 0, numRead);
+             }
+           }
+           if (numRead <= 0) {
+             // Nothing could be read although bytes were requested, report this as the end of
+             // the stream so that the decoder does not poll the channel forever.
+             return -1;
+           }
+           this.position += numRead;
+           return numRead;
+         }
+       };
+
+   return Channels.newReader(multiFileChannel, decoder, -1);
+ }
}
diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java
index 895d3485..e3fba2ba 100644
--- a/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java
+++ b/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java
@@ -2,6 +2,7 @@
import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
+import java.io.Reader;
/** API for reading data from a source. */
public interface SourceReader {
@@ -36,6 +37,8 @@ public interface SourceReader {
*/
Section getAsciiSection(int offset) throws IOException;
+ /**
+  * Get a {@link Reader} for reading the content of this source as characters.
+  *
+  * <p>NOTE(review): some implementations close the source reader when the returned reader is
+  * closed, confirm whether this is meant to be part of the contract.
+  *
+  * @throws IOException declared so that implementations can report I/O errors
+  */
+ Reader getReader() throws IOException;
+
class Section {
public final int start;
public final int end;
diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/StringSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/StringSourceReader.java
index 7263a3b1..b77362ce 100644
--- a/src/main/java/com/github/dbmdz/solrocr/reader/StringSourceReader.java
+++ b/src/main/java/com/github/dbmdz/solrocr/reader/StringSourceReader.java
@@ -1,6 +1,9 @@
package com.github.dbmdz.solrocr.reader;
import com.github.dbmdz.solrocr.model.SourcePointer;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
/** SourceReader that reads from a String. */
public class StringSourceReader implements SourceReader {
@@ -28,6 +31,11 @@ public Section getAsciiSection(int offset) {
return new Section(0, str.length(), str);
}
+ /** Get a {@link Reader} that reads the characters of the wrapped string. */
+ @Override
+ public Reader getReader() throws IOException {
+   final String content = this.str;
+   return new StringReader(content);
+ }
+
@Override
public int length() {
return this.str.length();
diff --git a/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java b/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java
index 019887b7..147724ca 100644
--- a/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java
+++ b/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java
@@ -2,19 +2,15 @@
import com.github.dbmdz.solrocr.lucene.filters.ExternalUtf8ContentFilter;
import com.github.dbmdz.solrocr.model.SourcePointer;
-import com.github.dbmdz.solrocr.reader.MultiFileReader;
import com.github.dbmdz.solrocr.util.Utf8;
import com.google.common.collect.ImmutableList;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
-import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
import java.nio.file.StandardOpenOption;
import java.util.List;
import java.util.Locale;
@@ -67,24 +63,16 @@ public Reader create(Reader input) {
// This is very expensive, but we need this since all IO from here on out is character-based.
toCharOffsets(pointer);
- Reader r;
if (pointer.sources.isEmpty()) {
throw new RuntimeException(
"No source files could be determined from pointer. "
+ "Is it pointing to files that exist and are readable? "
+ "Pointer was: "
+ ptrStr);
- } else if (pointer.sources.size() > 1) {
- r =
- new MultiFileReader(
- pointer.sources.stream().map(s -> s.path).collect(Collectors.toList()));
- } else {
- r =
- new InputStreamReader(
- Files.newInputStream(pointer.sources.get(0).path, StandardOpenOption.READ),
- StandardCharsets.UTF_8);
}
-
+ // NOTE: Section size doesn't matter much, since we only use the APIs for unaligned
+ // reads through the Reader implementations.
+ Reader r = pointer.getReader(512 * 1024, 0).getReader();
List charRegions =
pointer.sources.stream().flatMap(s -> s.regions.stream()).collect(Collectors.toList());
return new ExternalUtf8ContentFilter(new BufferedReader(r), charRegions, ptrStr);
diff --git a/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java b/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java
index 2c005c91..ce8b4d50 100644
--- a/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java
+++ b/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java
@@ -4,7 +4,6 @@
import com.github.dbmdz.solrocr.lucene.filters.ExternalUtf8ContentFilter;
import com.github.dbmdz.solrocr.model.SourcePointer.Region;
-import com.github.dbmdz.solrocr.reader.MultiFileReader;
import com.github.dbmdz.solrocr.util.Utf8;
import com.google.common.collect.ImmutableList;
import java.io.BufferedReader;
@@ -149,17 +148,4 @@ public void multipleLongerFiles() throws IOException {
assertThat(filtered).isEqualTo(fullText);
}
}
-
- @Test
- public void testMultiFileReader() throws IOException {
- Path aPath = Paths.get("src/test/resources/data/alto_multi/1865-05-24_01-00001.xml");
- Path bPath = Paths.get("src/test/resources/data/alto_multi/1865-05-24_01-00002.xml");
- try (MultiFileReader r = new MultiFileReader(ImmutableList.of(aPath, bPath))) {
- String fromReader = IOUtils.toString(r);
- String aText = new String(Files.readAllBytes(aPath), StandardCharsets.UTF_8);
- String bText = new String(Files.readAllBytes(bPath), StandardCharsets.UTF_8);
- String fromFiles = aText + bText;
- assertThat(fromReader).isEqualTo(fromFiles);
- }
- }
}
diff --git a/src/test/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReaderTest.java b/src/test/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReaderTest.java
index 84872bab..988284ec 100644
--- a/src/test/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReaderTest.java
+++ b/src/test/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReaderTest.java
@@ -7,6 +7,7 @@
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.SeekableByteChannel;
+import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -14,6 +15,8 @@
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
@@ -116,4 +119,23 @@ public void shouldReadCorrectlyAlignedSections(int sectionSize) throws IOExcepti
assertThat(section.end).isEqualTo(sectionStart + sectionSize);
assertThat(section.text).isEqualTo(expectedStr);
}
+
+ /** The reader must yield the UTF-8 decoded content of all files, joined in list order. */
+ @Test
+ public void shouldReturnValidReader() throws IOException {
+   SourceReader reader =
+       new MultiFileSourceReader(filePaths, pointer, 512 * 1024, maxCacheEntries);
+   String fromFiles =
+       filePaths.stream()
+           .map(
+               fp -> {
+                 try {
+                   return new String(Files.readAllBytes(fp), StandardCharsets.UTF_8);
+                 } catch (IOException e) {
+                   throw new RuntimeException(e);
+                 }
+               })
+           .collect(Collectors.joining(""));
+   // Make sure the reader gets closed, even if reading or the assertion fails
+   try (java.io.Reader contentReader = reader.getReader()) {
+     String fromReader = IOUtils.toString(contentReader);
+     assertThat(fromReader).isEqualTo(fromFiles);
+   }
+ }
}