Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make I/O during indexing generic #438

Merged
merged 3 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

Expand Down Expand Up @@ -127,7 +130,7 @@ public String readAsciiString(int start, int len) throws IOException {
if (start + len > this.length()) {
len = this.length() - start;
}
StringBuilder sb = new StringBuilder();
StringBuilder sb = new StringBuilder(len);
int numRead = 0;
while (numRead < len) {
Section section = getAsciiSection(start + numRead);
Expand Down Expand Up @@ -242,4 +245,54 @@ public Section getAsciiSection(int offset) throws IOException {

return section;
}

/**
 * Get a {@link java.io.Reader} instance for this SourceReader.
 *
 * <p>This is a generic implementation that should be overridden with a more efficient
 * source-specific implementation, if available. It exposes the source as an {@link InputStream}
 * on top of {@code readBytes} and decodes the bytes as UTF-8.
 *
 * <p>Closing the returned reader also closes this SourceReader.
 *
 * @return a reader that decodes the bytes of this source, starting at offset 0
 */
public Reader getReader() {
  return new InputStreamReader(
      new InputStream() {
        /** Absolute byte offset in the source of the next byte to hand out. */
        int position = 0;

        /** Cache used by the single-byte {@link #read()} so it does not hit the source per byte. */
        final byte[] buf = new byte[sectionSize];

        /** Source offset of {@code buf[0]}, or -1 if nothing has been cached yet. */
        int currentSectionStart = -1;

        /** Exclusive source offset of the end of the valid data in {@code buf}. */
        int currentSectionEnd = -1;

        @Override
        public int read() throws IOException {
          if (position >= length()) {
            return -1;
          }
          // The end offset is exclusive, so the cache must be refilled as soon as the position
          // reaches it. The position can also have jumped past the cached window via skip() or
          // the bulk read.
          if (position < currentSectionStart || position >= currentSectionEnd) {
            int numRead = BaseSourceReader.this.readBytes(buf, 0, position, sectionSize);
            if (numRead <= 0) {
              return -1;
            }
            currentSectionStart = position;
            currentSectionEnd = position + numRead;
          }
          int out = buf[position - currentSectionStart] & 0xFF;
          position++;
          return out;
        }

        @Override
        public int read(byte[] b, int off, int len) throws IOException {
          if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
          }
          if (len == 0) {
            return 0;
          }
          int remaining = length() - position;
          if (remaining <= 0) {
            return -1;
          }
          // Read straight into the caller's array; the internal cache is only for read().
          int numRead =
              BaseSourceReader.this.readBytes(b, off, position, Math.min(len, remaining));
          if (numRead <= 0) {
            // InputStream must never return 0 for a non-empty request; treat it as end of data.
            return -1;
          }
          this.position += numRead;
          return numRead;
        }

        @Override
        public long skip(long n) throws IOException {
          if (n <= 0) {
            return 0;
          }
          long remaining = Math.max(0L, (long) length() - position);
          // The result is bounded by an int-sized length, so the narrowing cast cannot overflow.
          int toSkip = (int) Math.min(remaining, n);
          this.position += toSkip;
          return toSkip;
        }

        @Override
        public void close() throws IOException {
          BaseSourceReader.this.close();
        }
      },
      StandardCharsets.UTF_8);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
import java.io.Reader;
import java.util.Locale;
import org.apache.lucene.index.QueryTimeout;

Expand Down Expand Up @@ -74,4 +75,9 @@ public Section getAsciiSection(int offset) throws IOException {
checkAndThrow();
return input.getAsciiSection(offset);
}

/**
 * Returns the {@link Reader} obtained from {@code input}.
 *
 * <p>Unlike {@code getAsciiSection}, this does not call {@code checkAndThrow()} first.
 */
@Override
public Reader getReader() throws IOException {
  final Reader delegate = input.getReader();
  return delegate;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
Expand Down Expand Up @@ -43,4 +46,9 @@ public void close() throws IOException {
public String getIdentifier() {
  // The string form of the path serves as this reader's identifier.
  final String identifier = this.path.toString();
  return identifier;
}

/** Returns a {@link Reader} that decodes the bytes of {@code chan} as UTF-8. */
@Override
public Reader getReader() {
  // -1 asks Channels.newReader to pick its implementation-dependent default buffer capacity.
  final int minBufferCap = -1;
  return Channels.newReader(this.chan, StandardCharsets.UTF_8.newDecoder(), minBufferCap);
}
}
74 changes: 0 additions & 74 deletions src/main/java/com/github/dbmdz/solrocr/reader/MultiFileReader.java

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,14 @@
import com.github.dbmdz.solrocr.model.SourcePointer;
import com.github.dbmdz.solrocr.util.ArrayUtils;
import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
Expand Down Expand Up @@ -124,4 +129,43 @@ public String getIdentifier() {
.map(p -> p.toAbsolutePath().toString())
.collect(Collectors.joining(", ")));
}

/**
 * Returns a {@link Reader} that decodes the bytes of this source as UTF-8.
 *
 * <p>The reader is backed by a {@link ReadableByteChannel} that pulls its bytes through
 * {@code readBytes}, keeping its own read position. Closing the reader closes this
 * MultiFileSourceReader.
 */
@Override
public Reader getReader() {
  CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
  ReadableByteChannel multiFileChannel =
      new ReadableByteChannel() {
        private boolean closed = false;

        /** Absolute byte offset in the source of the next byte to hand out. */
        private int position = 0;

        @Override
        public boolean isOpen() {
          return !closed;
        }

        @Override
        public void close() throws IOException {
          try {
            MultiFileSourceReader.this.close();
          } finally {
            // Report the channel as closed even if closing the underlying reader failed.
            this.closed = true;
          }
        }

        @Override
        public int read(ByteBuffer byteBuffer) throws IOException {
          if (!byteBuffer.hasArray()) {
            throw new UnsupportedOperationException(
                "Currently only ByteBuffers backed by an array are supported.");
          }
          int capacity = byteBuffer.remaining();
          if (capacity == 0) {
            return 0;
          }
          if (position >= MultiFileSourceReader.this.length()) {
            return -1;
          }
          // Write at the buffer's current position, not at the start of its backing array. The
          // decoder compacts the undecoded bytes of a multi-byte sequence that was split across
          // two reads to the front of the buffer, and these must not be overwritten.
          int numRead =
              MultiFileSourceReader.this.readBytes(
                  byteBuffer.array(),
                  byteBuffer.arrayOffset() + byteBuffer.position(),
                  position,
                  capacity);
          if (numRead > 0) {
            byteBuffer.position(byteBuffer.position() + numRead);
            this.position += numRead;
          }
          return numRead;
        }
      };

  return Channels.newReader(multiFileChannel, decoder, -1);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
import java.io.Reader;

/** API for reading data from a source. */
public interface SourceReader {
Expand Down Expand Up @@ -36,6 +37,8 @@ public interface SourceReader {
*/
Section getAsciiSection(int offset) throws IOException;

/**
 * Get a {@link Reader} for the content of this source.
 *
 * @return a reader over this source's content
 * @throws IOException if the reader could not be created
 */
Reader getReader() throws IOException;

class Section {
public final int start;
public final int end;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package com.github.dbmdz.solrocr.reader;

import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

/** SourceReader that reads from a String. */
public class StringSourceReader implements SourceReader {
Expand Down Expand Up @@ -28,6 +31,11 @@ public Section getAsciiSection(int offset) {
return new Section(0, str.length(), str);
}

/** Returns a {@link StringReader} over the wrapped string. */
@Override
public Reader getReader() throws IOException {
  final Reader stringBacked = new StringReader(this.str);
  return stringBacked;
}

@Override
public int length() {
return this.str.length();
Expand Down
18 changes: 3 additions & 15 deletions src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,15 @@

import com.github.dbmdz.solrocr.lucene.filters.ExternalUtf8ContentFilter;
import com.github.dbmdz.solrocr.model.SourcePointer;
import com.github.dbmdz.solrocr.reader.MultiFileReader;
import com.github.dbmdz.solrocr.util.Utf8;
import com.google.common.collect.ImmutableList;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.StandardOpenOption;
import java.util.List;
import java.util.Locale;
Expand Down Expand Up @@ -67,24 +63,16 @@ public Reader create(Reader input) {
// This is very expensive, but we need this since all IO from here on out is character-based.
toCharOffsets(pointer);

Reader r;
if (pointer.sources.isEmpty()) {
throw new RuntimeException(
"No source files could be determined from pointer. "
+ "Is it pointing to files that exist and are readable? "
+ "Pointer was: "
+ ptrStr);
} else if (pointer.sources.size() > 1) {
r =
new MultiFileReader(
pointer.sources.stream().map(s -> s.path).collect(Collectors.toList()));
} else {
r =
new InputStreamReader(
Files.newInputStream(pointer.sources.get(0).path, StandardOpenOption.READ),
StandardCharsets.UTF_8);
}

// NOTE: Section size doesn't matter much, since we only use the APIs for unaligned
// reads through the Reader implementations.
Reader r = pointer.getReader(512 * 1024, 0).getReader();
List<SourcePointer.Region> charRegions =
pointer.sources.stream().flatMap(s -> s.regions.stream()).collect(Collectors.toList());
return new ExternalUtf8ContentFilter(new BufferedReader(r), charRegions, ptrStr);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ public void testMultiFileParse() throws XMLStreamException, IOException {
.filter(p -> p.getFileName().toString().startsWith("1860-"))
.map(Path::toAbsolutePath)
.map(Path::toString)
.sorted()
.collect(Collectors.joining("+"));
List<OcrBox> boxes =
new AltoParser(filterFac.create(new StringReader(ptr)))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import com.github.dbmdz.solrocr.lucene.filters.ExternalUtf8ContentFilter;
import com.github.dbmdz.solrocr.model.SourcePointer.Region;
import com.github.dbmdz.solrocr.reader.MultiFileReader;
import com.github.dbmdz.solrocr.util.Utf8;
import com.google.common.collect.ImmutableList;
import java.io.BufferedReader;
Expand Down Expand Up @@ -149,17 +148,4 @@ public void multipleLongerFiles() throws IOException {
assertThat(filtered).isEqualTo(fullText);
}
}

/**
 * Checks that reading two files from the alto_multi test data through a single MultiFileReader
 * yields the same text as decoding each file as UTF-8 and concatenating the results in order.
 */
@Test
public void testMultiFileReader() throws IOException {
Path aPath = Paths.get("src/test/resources/data/alto_multi/1865-05-24_01-00001.xml");
Path bPath = Paths.get("src/test/resources/data/alto_multi/1865-05-24_01-00002.xml");
try (MultiFileReader r = new MultiFileReader(ImmutableList.of(aPath, bPath))) {
// Actual: everything the reader produces across both files.
String fromReader = IOUtils.toString(r);
// Expected: each file decoded on its own, then joined in the same order.
String aText = new String(Files.readAllBytes(aPath), StandardCharsets.UTF_8);
String bText = new String(Files.readAllBytes(bPath), StandardCharsets.UTF_8);
String fromFiles = aText + bText;
assertThat(fromReader).isEqualTo(fromFiles);
}
}
}
Loading
Loading