Skip to content

Commit

Permalink
Remove text extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
maeb committed Apr 27, 2020
1 parent 6640320 commit e82645c
Show file tree
Hide file tree
Showing 7 changed files with 7 additions and 301 deletions.
17 changes: 0 additions & 17 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
<veidemann.api.version>1.0.0-beta14</veidemann.api.version>
<veidemann.commons.version>0.4.1</veidemann.commons.version>
<veidemann.rethinkdbadapter.version>0.4.5</veidemann.rethinkdbadapter.version>
<org.apache.tika.version>1.22</org.apache.tika.version>
<org.jwat.version>1.1.1</org.jwat.version>
<log4j.version>2.7</log4j.version>
<rethinkdb.version>2.4.0</rethinkdb.version>
Expand Down Expand Up @@ -88,22 +87,6 @@
<version>${org.jwat.version}</version>
</dependency>

<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${org.apache.tika.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>${org.apache.tika.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-langdetect</artifactId>
<version>${org.apache.tika.version}</version>
</dependency>

<!-- Configuration framework -->
<dependency>
<groupId>com.typesafe</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import io.grpc.ServerBuilder;
import io.opentracing.contrib.ServerTracingInterceptor;
import io.opentracing.util.GlobalTracer;
import no.nb.nna.veidemann.contentwriter.text.TextExtractor;
import no.nb.nna.veidemann.contentwriter.warc.WarcCollectionRegistry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -44,12 +43,12 @@ public class ApiServer implements AutoCloseable {
/**
* Construct a new gRPC API server.
*/
public ApiServer(int port, int shutdownTimeoutSeconds, WarcCollectionRegistry warcCollectionRegistry, TextExtractor textExtractor) {
this(ServerBuilder.forPort(port), warcCollectionRegistry, textExtractor);
public ApiServer(int port, int shutdownTimeoutSeconds, WarcCollectionRegistry warcCollectionRegistry) {
this(ServerBuilder.forPort(port), warcCollectionRegistry);
this.shutdownTimeoutSeconds = shutdownTimeoutSeconds;
}

public ApiServer(ServerBuilder<?> serverBuilder, WarcCollectionRegistry warcCollectionRegistry, TextExtractor textExtractor) {
public ApiServer(ServerBuilder<?> serverBuilder, WarcCollectionRegistry warcCollectionRegistry) {

ServerTracingInterceptor tracingInterceptor = new ServerTracingInterceptor.Builder(GlobalTracer.get())
.withTracedAttributes(ServerTracingInterceptor.ServerRequestAttribute.CALL_ATTRIBUTES,
Expand All @@ -61,7 +60,7 @@ public ApiServer(ServerBuilder<?> serverBuilder, WarcCollectionRegistry warcColl
threadPool = Executors.newCachedThreadPool();
serverBuilder.executor(threadPool);

server = serverBuilder.addService(new ContentWriterService(warcCollectionRegistry, textExtractor)).build();
server = serverBuilder.addService(new ContentWriterService(warcCollectionRegistry)).build();
}

public ApiServer start() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import no.nb.nna.veidemann.commons.db.DbService;
import no.nb.nna.veidemann.commons.opentracing.TracerFactory;
import no.nb.nna.veidemann.contentwriter.settings.Settings;
import no.nb.nna.veidemann.contentwriter.text.TextExtractor;
import no.nb.nna.veidemann.contentwriter.warc.WarcCollectionRegistry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -60,8 +59,7 @@ public ContentWriter() {
public ContentWriter start() {
try (DbService db = DbService.configure(SETTINGS);
WarcCollectionRegistry warcCollectionRegistry = new WarcCollectionRegistry();
TextExtractor textExtractor = new TextExtractor();
ApiServer apiServer = new ApiServer(SETTINGS.getApiPort(), SETTINGS.getTerminationGracePeriodSeconds(), warcCollectionRegistry, textExtractor);) {
ApiServer apiServer = new ApiServer(SETTINGS.getApiPort(), SETTINGS.getTerminationGracePeriodSeconds(), warcCollectionRegistry)) {

registerShutdownHook();

Expand Down Expand Up @@ -89,15 +87,7 @@ private void registerShutdownHook() {
Runtime.getRuntime().addShutdownHook(new Thread(() -> {
// Use stderr here since the logger may have been reset by its JVM shutdown hook.
System.err.println("*** shutting down since JVM is shutting down");

mainThread.interrupt();
try {
mainThread.join();
} catch (InterruptedException e) {
//
}
System.err.println("*** gracefully shut down");

}));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import no.nb.nna.veidemann.api.contentwriter.v1.WriteRequest;
import no.nb.nna.veidemann.api.contentwriter.v1.WriteResponseMeta;
import no.nb.nna.veidemann.contentwriter.WriteSessionContext.RecordData;
import no.nb.nna.veidemann.contentwriter.text.TextExtractor;
import no.nb.nna.veidemann.contentwriter.warc.SingleWarcWriter;
import no.nb.nna.veidemann.contentwriter.warc.WarcCollection;
import no.nb.nna.veidemann.contentwriter.warc.WarcCollection.Instance;
Expand All @@ -43,11 +42,8 @@ public class ContentWriterService extends ContentWriterGrpc.ContentWriterImplBas

private final WarcCollectionRegistry warcCollectionRegistry;

private final TextExtractor textExtractor;

public ContentWriterService(WarcCollectionRegistry warcCollectionRegistry, TextExtractor textExtractor) {
public ContentWriterService(WarcCollectionRegistry warcCollectionRegistry) {
this.warcCollectionRegistry = warcCollectionRegistry;
this.textExtractor = textExtractor;
}

@Override
Expand Down Expand Up @@ -140,10 +136,6 @@ public void onCompleted() {

URI ref = warcWriters.getWarcWriter(recordData.getSubCollectionType()).writeRecord(recordData);

if (context.getCollectionConfig().getCrawlConfig().getExtra().getExtractText()) {
textExtractor.extractText(recordData);
}

WriteResponseMeta.RecordMeta.Builder responseMeta = WriteResponseMeta.RecordMeta.newBuilder()
.setRecordNum(recordNum)
.setType(recordData.getRecordType())
Expand Down

This file was deleted.

This file was deleted.

Loading

0 comments on commit e82645c

Please sign in to comment.