diff --git a/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java b/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java index fb983ea9..fd916695 100644 --- a/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java +++ b/src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java @@ -7,6 +7,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.lang.invoke.MethodHandles; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; @@ -22,31 +23,58 @@ public class SourcePointer { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - public static class FileSource { + public enum SourceType { + FILESYSTEM, + }; - public final Path path; + public static class Source { + + public final SourceType type; + public final String target; public List regions; public boolean isAscii; - public FileSource(Path path, List regions, boolean isAscii) throws IOException { - this.path = path; - if (!path.toFile().exists()) { - throw new FileNotFoundException( - String.format(Locale.US, "File at %s does not exist.", path)); - } - if (path.toFile().length() == 0) { - throw new IOException(String.format(Locale.US, "File at %s is empty.", path)); - } + public Source(String target, List regions, boolean isAscii) throws IOException { + this.type = determineType(target); + Source.validateTarget(target, this.type); + this.target = target; this.regions = regions; this.isAscii = isAscii; } - static FileSource parse(String pointer) { + static SourceType determineType(String target) throws IOException { + if (target.startsWith("/")) { + return SourceType.FILESYSTEM; + } else if (Files.exists(Paths.get(target))) { + return SourceType.FILESYSTEM; + } else { + throw new IOException( + String.format(Locale.US, "Target %s is currently not supported.", target)); + } + } + + static void validateTarget(String target, SourceType type) throws IOException 
{ + if (type == SourceType.FILESYSTEM) { + Path path = Paths.get(target); + if (!Files.exists(path)) { + throw new FileNotFoundException( + String.format(Locale.US, "File at %s does not exist.", target)); + } + if (Files.size(path) == 0) { + throw new IOException(String.format(Locale.US, "File at %s is empty.", target)); + } + } else { + throw new IOException( + String.format(Locale.US, "Target %s is currently not supported.", target)); + } + } + + static Source parse(String pointer) { Matcher m = POINTER_PAT.matcher(pointer); if (!m.find()) { throw new RuntimeException("Could not parse source pointer from '" + pointer + "."); } - Path sourcePath = Paths.get(m.group("path")); + String target = m.group("target"); List regions = ImmutableList.of(); if (m.group("regions") != null) { regions = @@ -56,16 +84,25 @@ static FileSource parse(String pointer) { .collect(Collectors.toList()); } try { - return new FileSource(sourcePath, regions, m.group("isAscii") != null); + return new Source(target, regions, m.group("isAscii") != null); } catch (FileNotFoundException e) { - throw new RuntimeException("Could not locate file at '" + sourcePath + "."); + throw new RuntimeException("Could not locate file at '" + target + "."); } catch (IOException e) { - throw new RuntimeException("Could not read file at '" + sourcePath + "."); + throw new RuntimeException("Could not read target at '" + target + "."); + } + } + + public SourceReader getReader(int sectionSize, int maxCacheEntries) throws IOException { + if (this.type == SourceType.FILESYSTEM) { + return new FileSourceReader( + Paths.get(this.target), SourcePointer.parse(this.target), sectionSize, maxCacheEntries); + } else { + throw new UnsupportedOperationException("Unsupported source type '" + this.type + "'."); } } public String toString() { - StringBuilder sb = new StringBuilder(path.toString()); + StringBuilder sb = new StringBuilder(target); if (isAscii) { sb.append("{ascii}"); } @@ -117,9 +154,9 @@ public String toString() 
{ } static final Pattern POINTER_PAT = - Pattern.compile("^(?.+?)(?\\{ascii})?(?:\\[(?[0-9:,]+)])?$"); + Pattern.compile("^(?.+?)(?\\{ascii})?(?:\\[(?[0-9:,]+)])?$"); - public final List sources; + public final List sources; public static boolean isPointer(String pointer) { if (pointer.startsWith("<")) { @@ -134,34 +171,43 @@ public static SourcePointer parse(String pointer) { throw new RuntimeException("Could not parse pointer: " + pointer); } String[] sourceTokens = pointer.split("\\+"); - List fileSources = - Arrays.stream(sourceTokens).map(FileSource::parse).collect(Collectors.toList()); - if (fileSources.isEmpty()) { + List sources = + Arrays.stream(sourceTokens).map(Source::parse).collect(Collectors.toList()); + if (sources.isEmpty()) { return null; } else { - return new SourcePointer(fileSources); + return new SourcePointer(sources); } } - public SourcePointer(List sources) { + public SourcePointer(List sources) { this.sources = sources; } @Override public String toString() { - return sources.stream().map(FileSource::toString).collect(Collectors.joining("+")); + return sources.stream().map(Source::toString).collect(Collectors.joining("+")); } /** Create a reader for the data pointed at by this source pointer. 
*/ public SourceReader getReader(int sectionSize, int maxCacheEntries) throws IOException { - if (this.sources.size() == 1) { - return new FileSourceReader(this.sources.get(0).path, this, sectionSize, maxCacheEntries); + if (this.sources.stream().allMatch(s -> s.type == SourceType.FILESYSTEM)) { + if (this.sources.size() == 1) { + return new FileSourceReader( + Paths.get(this.sources.get(0).target), this, sectionSize, maxCacheEntries); + } else { + return new MultiFileSourceReader( + this.sources.stream().map(s -> Paths.get(s.target)).collect(Collectors.toList()), + this, + sectionSize, + maxCacheEntries); + } } else { - return new MultiFileSourceReader( - this.sources.stream().map(s -> s.path).collect(Collectors.toList()), - this, - sectionSize, - maxCacheEntries); + throw new IOException( + String.format( + Locale.US, + "Pointer %s contains unsupported target types or a mix of target types.", + this)); } } } diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java index 52b66aaf..23419ded 100644 --- a/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java +++ b/src/main/java/com/github/dbmdz/solrocr/reader/BaseSourceReader.java @@ -54,13 +54,6 @@ public BaseSourceReader(SourcePointer pointer, int sectionSize, int maxCacheEntr this.maxCacheEntries = maxCacheEntries; } - /** - * Read {@param len} bytes starting at {@param start} from the source into the buffer {@param dst} - * starting at offset {@param dstOffset}, returning the number of bytes read. 
- */ - protected abstract int readBytes(byte[] dst, int dstOffset, int start, int len) - throws IOException; - @Override public abstract int length() throws IOException; @@ -127,7 +120,7 @@ public String readAsciiString(int start, int len) throws IOException { if (start + len > this.length()) { len = this.length() - start; } - StringBuilder sb = new StringBuilder(); + StringBuilder sb = new StringBuilder(len); int numRead = 0; while (numRead < len) { Section section = getAsciiSection(start + numRead); diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/ExitingSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/ExitingSourceReader.java index b9df381b..3e3501ed 100644 --- a/src/main/java/com/github/dbmdz/solrocr/reader/ExitingSourceReader.java +++ b/src/main/java/com/github/dbmdz/solrocr/reader/ExitingSourceReader.java @@ -2,6 +2,8 @@ import com.github.dbmdz.solrocr.model.SourcePointer; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.SeekableByteChannel; import java.util.Locale; import org.apache.lucene.index.QueryTimeout; @@ -74,4 +76,17 @@ public Section getAsciiSection(int offset) throws IOException { checkAndThrow(); return input.getAsciiSection(offset); } + + @Override + public int readBytes(ByteBuffer dst, int start) throws IOException { + checkAndThrow(); + return input.readBytes(dst, start); + } + + @Override + public SeekableByteChannel getByteChannel() throws IOException { + // Just provided for completeness, this type is not used during indexing where this method + // matters. 
+ return input.getByteChannel(); + } } diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/FileSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/FileSourceReader.java index bce8898a..11f8b8e7 100644 --- a/src/main/java/com/github/dbmdz/solrocr/reader/FileSourceReader.java +++ b/src/main/java/com/github/dbmdz/solrocr/reader/FileSourceReader.java @@ -4,6 +4,7 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import java.nio.channels.SeekableByteChannel; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; @@ -22,8 +23,8 @@ public FileSourceReader(Path path, SourcePointer ptr, int sectionSize, int maxCa } @Override - protected int readBytes(byte[] dst, int dstOffset, int start, int len) throws IOException { - return this.chan.read(ByteBuffer.wrap(dst, dstOffset, len), start); + public int readBytes(ByteBuffer dst, int start) throws IOException { + return this.chan.read(dst, start); } @Override @@ -43,4 +44,9 @@ public void close() throws IOException { public String getIdentifier() { return this.path.toString(); } + + @Override + public SeekableByteChannel getByteChannel() throws IOException { + return this.chan; + } } diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileReader.java deleted file mode 100644 index 1193b0f0..00000000 --- a/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileReader.java +++ /dev/null @@ -1,74 +0,0 @@ -package com.github.dbmdz.solrocr.reader; - -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.Reader; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; -import java.util.LinkedList; -import java.util.List; -import java.util.Locale; -import 
java.util.Queue; - -public class MultiFileReader extends Reader { - private final Queue remainingSources; - private Reader currentReader; - - public MultiFileReader(List sourcePaths) throws FileNotFoundException { - for (Path path : sourcePaths) { - if (!path.toFile().exists()) { - throw new FileNotFoundException( - String.format(Locale.US, "File at %s could not be found", path)); - } else if (path.toFile().isDirectory()) { - throw new FileNotFoundException( - String.format(Locale.US, "File at %s is a directory", path)); - } - } - this.remainingSources = new LinkedList<>(sourcePaths); - this.currentReader = - new InputStreamReader( - new FileInputStream(remainingSources.remove().toFile()), StandardCharsets.UTF_8); - } - - @Override - public int read(char[] cbuf, int off, int len) throws IOException { - if (this.currentReader == null) { - // No readers available, nothing to read - return -1; - } - int numRead = 0; - while (numRead < len && currentReader != null) { - int read = this.currentReader.read(cbuf, off, len); - if (read < len) { - this.currentReader.close(); - if (this.remainingSources.isEmpty()) { - // No more readers, return what was read so far - this.currentReader = null; - } else { - this.currentReader = - new InputStreamReader( - Files.newInputStream(remainingSources.remove(), StandardOpenOption.READ), - StandardCharsets.UTF_8); - } - } - if (read < 0) { - continue; - } - numRead += read; - off += read; - len -= read; - } - return numRead > 0 ? 
numRead : -1; - } - - @Override - public void close() throws IOException { - if (this.currentReader != null) { - this.currentReader.close(); - } - } -} diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReader.java index deb0c681..ef381d46 100644 --- a/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReader.java +++ b/src/main/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReader.java @@ -34,8 +34,8 @@ private OpenFile(Path p, int startOffset) throws IOException { this.startOffset = startOffset; } - public int read(byte[] dst, int dstOffset, int start, int len) throws IOException { - return this.channel.read(ByteBuffer.wrap(dst, dstOffset, len), start); + public int read(ByteBuffer dst, int start) throws IOException { + return this.channel.read(dst, start); } public void close() throws IOException { @@ -68,7 +68,7 @@ public MultiFileSourceReader( } @Override - protected int readBytes(byte[] dst, int dstOffset, int start, int len) throws IOException { + public int readBytes(ByteBuffer dst, int start) throws IOException { int fileIdx = ArrayUtils.binaryFloorIdxSearch(startOffsets, start); if (fileIdx < 0) { throw new RuntimeException(String.format("Offset %d is out of bounds", start)); @@ -79,9 +79,10 @@ protected int readBytes(byte[] dst, int dstOffset, int start, int len) throws IO } OpenFile file = openFiles[fileIdx]; + int len = dst.remaining(); int numRead = 0; while (numRead < len) { - numRead += file.read(dst, dstOffset + numRead, (start + numRead) - fileOffset, len - numRead); + numRead += file.read(dst, (start + numRead) - fileOffset); if (numRead < len) { fileIdx++; if (fileIdx >= paths.length) { diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java index 895d3485..0a996396 100644 --- a/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java +++ 
b/src/main/java/com/github/dbmdz/solrocr/reader/SourceReader.java @@ -2,6 +2,8 @@ import com.github.dbmdz.solrocr.model.SourcePointer; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.SeekableByteChannel; /** API for reading data from a source. */ public interface SourceReader { @@ -36,6 +38,73 @@ public interface SourceReader { */ Section getAsciiSection(int offset) throws IOException; + /** + * Read into {@param dst} starting at {@param start} from the source, returning the number of + * bytes read. + */ + int readBytes(ByteBuffer dst, int start) throws IOException; + + default int readBytes(byte[] dst, int dstOffset, int start, int len) throws IOException { + return readBytes(ByteBuffer.wrap(dst, dstOffset, len), start); + } + + /** + * Get a {@link java.nio.channels.SeekableByteChannel} for this SourceReader. + * + *

This is a generic implementation that should be overridden with a more efficient + * source-specific implementation, if available. + */ + default SeekableByteChannel getByteChannel() throws IOException { + return new SeekableByteChannel() { + int position = 0; + boolean closed = false; + + @Override + public int read(ByteBuffer byteBuffer) throws IOException { + int numRead = SourceReader.this.readBytes(byteBuffer, position); + this.position += numRead; + return numRead; + } + + @Override + public int write(ByteBuffer byteBuffer) throws IOException { + throw new UnsupportedOperationException("Channel is read-only"); + } + + @Override + public long position() throws IOException { + return position; + } + + @Override + public SeekableByteChannel position(long newPosition) throws IOException { + this.position = (int) newPosition; + return this; + } + + @Override + public long size() throws IOException { + return SourceReader.this.length(); + } + + @Override + public SeekableByteChannel truncate(long l) throws IOException { + throw new UnsupportedOperationException("Channel is read-only"); + } + + @Override + public boolean isOpen() { + return !this.closed; + } + + @Override + public void close() throws IOException { + SourceReader.this.close(); + this.closed = true; + } + }; + } + class Section { public final int start; public final int end; diff --git a/src/main/java/com/github/dbmdz/solrocr/reader/StringSourceReader.java b/src/main/java/com/github/dbmdz/solrocr/reader/StringSourceReader.java index 7263a3b1..63f68eec 100644 --- a/src/main/java/com/github/dbmdz/solrocr/reader/StringSourceReader.java +++ b/src/main/java/com/github/dbmdz/solrocr/reader/StringSourceReader.java @@ -1,6 +1,8 @@ package com.github.dbmdz.solrocr.reader; import com.github.dbmdz.solrocr.model.SourcePointer; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; /** SourceReader that reads from a String.
*/ public class StringSourceReader implements SourceReader { @@ -28,6 +30,14 @@ public Section getAsciiSection(int offset) { return new Section(0, str.length(), str); } + @Override + public int readBytes(ByteBuffer dst, int start) { + byte[] bytes = str.getBytes(StandardCharsets.UTF_8); + int limit = Math.min(dst.remaining(), bytes.length); + dst.put(bytes, 0, limit); + return limit; + } + @Override public int length() { return this.str.length(); diff --git a/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java b/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java index 019887b7..385eb319 100644 --- a/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java +++ b/src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java @@ -2,20 +2,21 @@ import com.github.dbmdz.solrocr.lucene.filters.ExternalUtf8ContentFilter; import com.github.dbmdz.solrocr.model.SourcePointer; -import com.github.dbmdz.solrocr.reader.MultiFileReader; +import com.github.dbmdz.solrocr.model.SourcePointer.Source; +import com.github.dbmdz.solrocr.model.SourcePointer.SourceType; +import com.github.dbmdz.solrocr.reader.SourceReader; import com.github.dbmdz.solrocr.util.Utf8; import com.google.common.collect.ImmutableList; import java.io.BufferedReader; import java.io.File; import java.io.IOException; -import java.io.InputStreamReader; import java.io.Reader; import java.io.StringReader; import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; +import java.nio.channels.Channels; +import java.nio.channels.SeekableByteChannel; import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.StandardOpenOption; +import java.nio.file.Paths; import java.util.List; import java.util.Locale; import java.util.Map; @@ -67,24 +68,16 @@ public Reader create(Reader input) { // This is very expensive, but we need this since all IO from here on out is character-based. 
toCharOffsets(pointer); - Reader r; if (pointer.sources.isEmpty()) { throw new RuntimeException( "No source files could be determined from pointer. " + "Is it pointing to files that exist and are readable? " + "Pointer was: " + ptrStr); - } else if (pointer.sources.size() > 1) { - r = - new MultiFileReader( - pointer.sources.stream().map(s -> s.path).collect(Collectors.toList())); - } else { - r = - new InputStreamReader( - Files.newInputStream(pointer.sources.get(0).path, StandardOpenOption.READ), - StandardCharsets.UTF_8); } - + Reader r = + Channels.newReader( + pointer.getReader(512 * 1024, 0).getByteChannel(), StandardCharsets.UTF_8.name()); List charRegions = pointer.sources.stream().flatMap(s -> s.regions.stream()).collect(Collectors.toList()); return new ExternalUtf8ContentFilter(new BufferedReader(r), charRegions, ptrStr); @@ -96,19 +89,25 @@ public Reader create(Reader input) { } } - private void validateSource(SourcePointer.FileSource src) { + private void validateSource(Source src) { // TODO: Check if sourcePath is located under one of the allowed base directories, else abort // TODO: Check if sourcePath's filename matches one of the allowed filename patterns, else abort - File f = src.path.toFile(); - if (!f.exists() || !f.canRead()) { + if (src.type == SourceType.FILESYSTEM) { + File f = Paths.get(src.target).toFile(); + if (!f.exists() || !f.canRead()) { + throw new SolrException( + ErrorCode.BAD_REQUEST, + String.format( + Locale.US, "File at %s either does not exist or cannot be read.", src.target)); + } + } else { throw new SolrException( ErrorCode.BAD_REQUEST, - String.format( - Locale.US, "File at %s either does not exist or cannot be read.", src.path)); + String.format(Locale.US, "Pointer has target with unsupported type: %s", src.target)); } } - private static long getUtf8DecodedLength(FileChannel fChan, ByteBuffer buf, long numBytes) + private static long getUtf8DecodedLength(SeekableByteChannel chan, ByteBuffer buf, long numBytes) throws 
IOException { long numRead = 0; long decodedLength = 0; @@ -116,7 +115,7 @@ private static long getUtf8DecodedLength(FileChannel fChan, ByteBuffer buf, long if (buf.remaining() > (numBytes - numRead)) { buf.limit((int) (numBytes - numRead)); } - int read = fChan.read(buf); + int read = chan.read(buf); if (read < 0) { break; } @@ -144,32 +143,35 @@ private void toCharOffsets(SourcePointer ptr) throws IOException { // file every time // TODO: Think about building the UTF8 -> UTF16 offset map right here if the mapping part should // become a bottle neck - for (SourcePointer.FileSource src : ptr.sources) { - try (FileChannel fChan = FileChannel.open(src.path, StandardOpenOption.READ)) { - final int fSize = (int) fChan.size(); + for (SourcePointer.Source src : ptr.sources) { + SourceReader reader = src.getReader(512 * 1024, 0); + try { + SeekableByteChannel chan = reader.getByteChannel(); + final int size = (int) chan.size(); int bomOffset = 0; if (!src.isAscii) { - // Check for BOM, we need to skip it as to not break mult-file parsing + // Check for BOM without modifying channel position, we need to skip it as to not break + // multi-file parsing ByteBuffer bomBuf = ByteBuffer.allocate(3); - fChan.read(bomBuf, 0); + chan.read(bomBuf); + chan.position(0); bomBuf.flip(); if (bomBuf.equals(ByteBuffer.wrap(new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}))) { bomOffset = 3; } - fChan.position(0); } // Byte offset of the current file from the beginning of the first file final int baseOffset = byteOffset; if (src.regions.isEmpty()) { - src.regions = ImmutableList.of(new SourcePointer.Region(0, fSize)); + src.regions = ImmutableList.of(new SourcePointer.Region(0, size)); } for (SourcePointer.Region region : src.regions) { if (src.isAscii) { // Optimization for pure-ASCII sources, where we don't need to do any mapping region.start += baseOffset; - region.end = Math.min(region.end + baseOffset, fSize + baseOffset); + region.end = Math.min(region.end + baseOffset, size + 
baseOffset); continue; } if (region.start == 0) { @@ -179,14 +181,14 @@ private void toCharOffsets(SourcePointer ptr) throws IOException { // Make region offsets relative to the beginning of the first file region.start += baseOffset; if (region.end < 0) { - region.end = fSize; + region.end = size; } - region.end = Math.min(region.end + baseOffset, fSize + baseOffset); + region.end = Math.min(region.end + baseOffset, size + baseOffset); // Read until the start of the region if (byteOffset != region.start) { // Read the data between the current offset and the start of the region int len = region.start - byteOffset; - charOffset += getUtf8DecodedLength(fChan, buf, len); + charOffset += (int) getUtf8DecodedLength(chan, buf, len); byteOffset += len; } @@ -194,18 +196,20 @@ private void toCharOffsets(SourcePointer ptr) throws IOException { region.start = charOffset; region.startOffset = byteOffset; // Read region, determine character offset of region end - charOffset += getUtf8DecodedLength(fChan, buf, regionSize); + charOffset += (int) getUtf8DecodedLength(chan, buf, regionSize); byteOffset += regionSize; region.end = charOffset; } // Determine character offset of the end of the file if (src.isAscii) { - byteOffset += fSize; - } else if (byteOffset != baseOffset + fSize) { - int len = (baseOffset + fSize) - byteOffset; - charOffset += getUtf8DecodedLength(fChan, buf, len); + byteOffset += size; + } else if (byteOffset != baseOffset + size) { + int len = (baseOffset + size) - byteOffset; + charOffset += (int) getUtf8DecodedLength(chan, buf, len); byteOffset += len; } + } finally { + reader.close(); } } } diff --git a/src/test/java/com/github/dbmdz/solrocr/formats/alto/AltoParserTest.java b/src/test/java/com/github/dbmdz/solrocr/formats/alto/AltoParserTest.java index e4679370..ec19ec9b 100644 --- a/src/test/java/com/github/dbmdz/solrocr/formats/alto/AltoParserTest.java +++ b/src/test/java/com/github/dbmdz/solrocr/formats/alto/AltoParserTest.java @@ -128,6 +128,7 @@ public 
void testMultiFileParse() throws XMLStreamException, IOException { .filter(p -> p.getFileName().toString().startsWith("1860-")) .map(Path::toAbsolutePath) .map(Path::toString) + .sorted() .collect(Collectors.joining("+")); List boxes = new AltoParser(filterFac.create(new StringReader(ptr))) diff --git a/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java b/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java index 2c005c91..ce8b4d50 100644 --- a/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java +++ b/src/test/java/com/github/dbmdz/solrocr/lucene/ExternalUtf8ContentFilterTest.java @@ -4,7 +4,6 @@ import com.github.dbmdz.solrocr.lucene.filters.ExternalUtf8ContentFilter; import com.github.dbmdz.solrocr.model.SourcePointer.Region; -import com.github.dbmdz.solrocr.reader.MultiFileReader; import com.github.dbmdz.solrocr.util.Utf8; import com.google.common.collect.ImmutableList; import java.io.BufferedReader; @@ -149,17 +148,4 @@ public void multipleLongerFiles() throws IOException { assertThat(filtered).isEqualTo(fullText); } } - - @Test - public void testMultiFileReader() throws IOException { - Path aPath = Paths.get("src/test/resources/data/alto_multi/1865-05-24_01-00001.xml"); - Path bPath = Paths.get("src/test/resources/data/alto_multi/1865-05-24_01-00002.xml"); - try (MultiFileReader r = new MultiFileReader(ImmutableList.of(aPath, bPath))) { - String fromReader = IOUtils.toString(r); - String aText = new String(Files.readAllBytes(aPath), StandardCharsets.UTF_8); - String bText = new String(Files.readAllBytes(bPath), StandardCharsets.UTF_8); - String fromFiles = aText + bText; - assertThat(fromReader).isEqualTo(fromFiles); - } - } } diff --git a/src/test/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReaderTest.java b/src/test/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReaderTest.java index 84872bab..7c60e56d 100644 --- 
a/src/test/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReaderTest.java +++ b/src/test/java/com/github/dbmdz/solrocr/reader/MultiFileSourceReaderTest.java @@ -6,14 +6,19 @@ import com.github.dbmdz.solrocr.reader.SourceReader.Section; import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.channels.Channels; import java.nio.channels.SeekableByteChannel; +import java.nio.charset.StandardCharsets; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.Comparator; import java.util.List; import java.util.stream.Collectors; +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; @@ -28,6 +33,7 @@ class MultiFileSourceReaderTest { try (DirectoryStream stream = Files.newDirectoryStream(root, "1860-11-30*.xml")) { stream.forEach(filePaths::add); } + filePaths.sort(Comparator.comparing(Path::toString)); pointer = SourcePointer.parse( filePaths.stream() @@ -116,4 +122,25 @@ public void shouldReadCorrectlyAlignedSections(int sectionSize) throws IOExcepti assertThat(section.end).isEqualTo(sectionStart + sectionSize); assertThat(section.text).isEqualTo(expectedStr); } + + @Test + public void shouldReturnValidReader() throws IOException { + SourceReader reader = + new MultiFileSourceReader(filePaths, pointer, 512 * 1024, maxCacheEntries); + String fromReader = + IOUtils.toString( + Channels.newReader(reader.getByteChannel(), StandardCharsets.UTF_8.name())); + String fromFiles = + filePaths.stream() + .map( + fp -> { + try { + return new String(Files.readAllBytes(fp), StandardCharsets.UTF_8); + } catch (IOException e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.joining("")); + assertThat(fromReader).isEqualTo(fromFiles); + } }