Skip to content

Commit

Permalink
Use SourceReader to obtain a java.io.Reader for reading content durin…
Browse files Browse the repository at this point in the history
…g indexing
  • Loading branch information
jbaiter committed Jun 12, 2024
1 parent c701e35 commit d683e1e
Show file tree
Hide file tree
Showing 10 changed files with 148 additions and 104 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

Expand Down Expand Up @@ -127,7 +130,7 @@ public String readAsciiString(int start, int len) throws IOException {
if (start + len > this.length()) {
len = this.length() - start;
}
StringBuilder sb = new StringBuilder();
StringBuilder sb = new StringBuilder(len);
int numRead = 0;
while (numRead < len) {
Section section = getAsciiSection(start + numRead);
Expand Down Expand Up @@ -242,4 +245,54 @@ public Section getAsciiSection(int offset) throws IOException {

return section;
}

/**
 * Get a {@link java.io.Reader} instance for this SourceReader.
 *
 * <p>This is a generic implementation that should be overridden with a more efficient
 * source-specific implementation, if available. Bytes are pulled from the source via {@code
 * readBytes} and decoded as UTF-8.
 *
 * <p>Closing the returned reader also closes this SourceReader.
 *
 * @return a reader that decodes the bytes of this source as UTF-8
 */
public Reader getReader() {
  return new InputStreamReader(
      new InputStream() {
        // Absolute byte offset of the next byte to hand out.
        int position = 0;
        // Cache used by the single-byte read() so it does not hit the source for every byte.
        final byte[] buf = new byte[sectionSize];
        // Byte range currently held in buf: [currentSectionStart, currentSectionEnd).
        int currentSectionStart = -1;
        int currentSectionEnd = -1;

        @Override
        public int read() throws IOException {
          int total = BaseSourceReader.this.length();
          if (position >= total) {
            return -1;
          }
          // The end offset is exclusive, so a position equal to it is already outside the
          // cached range and requires a refill.
          if (position < currentSectionStart || position >= currentSectionEnd) {
            int wanted = Math.min(sectionSize, total - position);
            int numRead = BaseSourceReader.this.readBytes(buf, 0, position, wanted);
            if (numRead <= 0) {
              return -1;
            }
            currentSectionStart = position;
            currentSectionEnd = position + numRead;
          }
          int out = buf[position - currentSectionStart] & 0xFF;
          position++;
          return out;
        }

        @Override
        public int read(byte[] b, int off, int len) throws IOException {
          if (len == 0) {
            return 0;
          }
          int available = BaseSourceReader.this.length() - position;
          if (available <= 0) {
            // InputStream contract: signal end of stream with -1, never with 0.
            return -1;
          }
          // Read straight into the caller's array, not into the internal cache.
          int numRead =
              BaseSourceReader.this.readBytes(b, off, position, Math.min(len, available));
          if (numRead <= 0) {
            return -1;
          }
          this.position += numRead;
          return numRead;
        }

        @Override
        public long skip(long n) throws IOException {
          if (n <= 0) {
            return 0;
          }
          // Clamp while still a long so that large values of n cannot overflow the int cast.
          long remaining = Math.max(0, BaseSourceReader.this.length() - position);
          int toSkip = (int) Math.min(remaining, n);
          this.position += toSkip;
          return toSkip;
        }

        @Override
        public void close() throws IOException {
          BaseSourceReader.this.close();
        }
      },
      StandardCharsets.UTF_8);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
import java.io.Reader;
import java.util.Locale;
import org.apache.lucene.index.QueryTimeout;

Expand Down Expand Up @@ -74,4 +75,9 @@ public Section getAsciiSection(int offset) throws IOException {
checkAndThrow();
return input.getAsciiSection(offset);
}

/**
 * Hands out the {@link Reader} of the wrapped {@code input} unchanged.
 *
 * <p>NOTE(review): unlike {@code getAsciiSection}, this does not call {@code checkAndThrow()}
 * before delegating; confirm that this is intended.
 */
@Override
public Reader getReader() throws IOException {
  final Reader delegate = this.input.getReader();
  return delegate;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
Expand Down Expand Up @@ -43,4 +46,9 @@ public void close() throws IOException {
public String getIdentifier() {
  // The string form of the backing path serves as this reader's identifier.
  final String identifier = path.toString();
  return identifier;
}

/** Creates a {@link Reader} that decodes the bytes of this reader's channel as UTF-8. */
@Override
public Reader getReader() {
  // -1 asks the JDK to use its implementation-dependent default buffer capacity.
  final int minBufferCap = -1;
  return Channels.newReader(chan, StandardCharsets.UTF_8.newDecoder(), minBufferCap);
}
}
74 changes: 0 additions & 74 deletions src/main/java/com/github/dbmdz/solrocr/reader/MultiFileReader.java

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,14 @@
import com.github.dbmdz.solrocr.model.SourcePointer;
import com.github.dbmdz.solrocr.util.ArrayUtils;
import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
Expand Down Expand Up @@ -124,4 +129,43 @@ public String getIdentifier() {
.map(p -> p.toAbsolutePath().toString())
.collect(Collectors.joining(", ")));
}

/**
 * Creates a {@link Reader} that decodes the concatenated bytes of all files as UTF-8.
 *
 * <p>The reader is backed by a {@link ReadableByteChannel} adapter that pulls bytes through
 * {@code readBytes}, tracking its own read position. Closing the reader closes this
 * MultiFileSourceReader.
 *
 * @return a UTF-8 decoding reader over the content of this source
 */
@Override
public Reader getReader() {
  CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
  ReadableByteChannel multiFileChannel =
      new ReadableByteChannel() {
        private boolean closed = false;
        // Absolute byte offset of the next byte to read from the source.
        private int position = 0;

        @Override
        public boolean isOpen() {
          return !closed;
        }

        @Override
        public void close() throws IOException {
          MultiFileSourceReader.this.close();
          this.closed = true;
        }

        @Override
        public int read(ByteBuffer byteBuffer) throws IOException {
          if (!byteBuffer.hasArray()) {
            throw new UnsupportedOperationException(
                "Currently only ByteBuffers backed by an array are supported.");
          }
          int remaining = byteBuffer.remaining();
          if (remaining == 0) {
            return 0;
          }
          int available = MultiFileSourceReader.this.length() - position;
          if (available <= 0) {
            // Channel contract: -1 signals end-of-stream; returning 0 forever would stall
            // the decoder.
            return -1;
          }
          // The write index in the backing array is arrayOffset() + position(). The decoder
          // may leave undecoded bytes (e.g. a split multi-byte sequence) at the start of the
          // buffer, so writing at arrayOffset() alone would overwrite them.
          int dstOffset = byteBuffer.arrayOffset() + byteBuffer.position();
          int numRead =
              MultiFileSourceReader.this.readBytes(
                  byteBuffer.array(), dstOffset, position, Math.min(remaining, available));
          if (numRead <= 0) {
            return -1;
          }
          byteBuffer.position(byteBuffer.position() + numRead);
          this.position += numRead;
          return numRead;
        }
      };

  return Channels.newReader(multiFileChannel, decoder, -1);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
import java.io.Reader;

/** API for reading data from a source. */
public interface SourceReader {
Expand Down Expand Up @@ -36,6 +37,8 @@ public interface SourceReader {
*/
Section getAsciiSection(int offset) throws IOException;

/**
 * Get a {@link Reader} instance for reading the content of this source.
 *
 * <p>NOTE(review): the file-backed implementations decode the content as UTF-8; confirm
 * whether that is part of the contract for every implementation.
 *
 * @return a reader over the content of this source
 * @throws IOException if an implementation fails while setting up the reader
 */
Reader getReader() throws IOException;

class Section {
public final int start;
public final int end;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package com.github.dbmdz.solrocr.reader;

import com.github.dbmdz.solrocr.model.SourcePointer;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

/** SourceReader that reads from a String. */
public class StringSourceReader implements SourceReader {
Expand Down Expand Up @@ -28,6 +31,11 @@ public Section getAsciiSection(int offset) {
return new Section(0, str.length(), str);
}

/** Wraps the backing string in a {@link StringReader}. */
@Override
public Reader getReader() throws IOException {
  final Reader stringReader = new StringReader(this.str);
  return stringReader;
}

@Override
public int length() {
return this.str.length();
Expand Down
18 changes: 3 additions & 15 deletions src/main/java/solrocr/ExternalUtf8ContentFilterFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,15 @@

import com.github.dbmdz.solrocr.lucene.filters.ExternalUtf8ContentFilter;
import com.github.dbmdz.solrocr.model.SourcePointer;
import com.github.dbmdz.solrocr.reader.MultiFileReader;
import com.github.dbmdz.solrocr.util.Utf8;
import com.google.common.collect.ImmutableList;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.StandardOpenOption;
import java.util.List;
import java.util.Locale;
Expand Down Expand Up @@ -67,24 +63,16 @@ public Reader create(Reader input) {
// This is very expensive, but we need this since all IO from here on out is character-based.
toCharOffsets(pointer);

Reader r;
if (pointer.sources.isEmpty()) {
throw new RuntimeException(
"No source files could be determined from pointer. "
+ "Is it pointing to files that exist and are readable? "
+ "Pointer was: "
+ ptrStr);
} else if (pointer.sources.size() > 1) {
r =
new MultiFileReader(
pointer.sources.stream().map(s -> s.path).collect(Collectors.toList()));
} else {
r =
new InputStreamReader(
Files.newInputStream(pointer.sources.get(0).path, StandardOpenOption.READ),
StandardCharsets.UTF_8);
}

// NOTE: Section size doesn't matter much, since we only use the APIs for unaligned
// reads through the Reader implementations.
Reader r = pointer.getReader(512 * 1024, 0).getReader();
List<SourcePointer.Region> charRegions =
pointer.sources.stream().flatMap(s -> s.regions.stream()).collect(Collectors.toList());
return new ExternalUtf8ContentFilter(new BufferedReader(r), charRegions, ptrStr);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import com.github.dbmdz.solrocr.lucene.filters.ExternalUtf8ContentFilter;
import com.github.dbmdz.solrocr.model.SourcePointer.Region;
import com.github.dbmdz.solrocr.reader.MultiFileReader;
import com.github.dbmdz.solrocr.util.Utf8;
import com.google.common.collect.ImmutableList;
import java.io.BufferedReader;
Expand Down Expand Up @@ -149,17 +148,4 @@ public void multipleLongerFiles() throws IOException {
assertThat(filtered).isEqualTo(fullText);
}
}

/** Reading two files through one MultiFileReader must yield their concatenated UTF-8 text. */
@Test
public void testMultiFileReader() throws IOException {
  final Path firstFile = Paths.get("src/test/resources/data/alto_multi/1865-05-24_01-00001.xml");
  final Path secondFile = Paths.get("src/test/resources/data/alto_multi/1865-05-24_01-00002.xml");
  try (MultiFileReader reader = new MultiFileReader(ImmutableList.of(firstFile, secondFile))) {
    final String actual = IOUtils.toString(reader);
    // Expected value: both files decoded on their own and joined in order.
    final String expected =
        new String(Files.readAllBytes(firstFile), StandardCharsets.UTF_8)
            + new String(Files.readAllBytes(secondFile), StandardCharsets.UTF_8);
    assertThat(actual).isEqualTo(expected);
  }
}
}
Loading

0 comments on commit d683e1e

Please sign in to comment.