Skip to content

Commit

Permalink
Tweak testing for #299
Browse files (browse the repository at this point in the history)
  • Loading branch information
anjackson committed Sep 30, 2022
1 parent baa131a commit 5043147
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,13 @@
*/

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
Expand Down Expand Up @@ -95,24 +99,23 @@ public void testFullIndexerJob() throws Exception {
output, new OutputLogFilter()));
Assert.assertEquals(reducers, outputFiles.length);

// Check contents of the output:
// Get the output:
List<String> resultFiles = new ArrayList<String>();
for( Path output : outputFiles ) {
log.info(" --- output : "+output);
if( getFileSystem().isFile(output) ) {
InputStream is = getFileSystem().open(output);
BufferedReader reader = new BufferedReader(new InputStreamReader(is));
String line = null;
int counter = 0;
while( ( line = reader.readLine()) != null ) {
log.info(line);
//System.out.println(line);
counter++;
if( counter > 5) {
log.info("Stopping printing output after 5 lines...");
break;
}
String resultFile = "target/indexer-" + output.getName();
resultFiles.add(resultFile);
FileOutputStream out = new FileOutputStream(resultFile);
log.info(" --- output : " + output + " is being written to " + resultFile);
if (getFileSystem().isFile(output)) {
InputStream is = getFileSystem().open(output);
IOUtils.copy(is, out);
} else {
log.info(" --- ...skipping directory...");
}
reader.close();
out.flush();
out.close();
} else {
log.info(" --- ...skipping directory...");
}
Expand Down
2 changes: 1 addition & 1 deletion warc-indexer/src/main/resources/dataset-generation.conf
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
# Extract the first bytes of the file (for shingling):
"first_bytes" : {
# Enabled?
"enabled" : false,
"enabled" : true,
# Number of bytes to extract (>=4 to allow content_ffb to work):
"num_bytes" : 32
}
Expand Down

0 comments on commit 5043147

Please sign in to comment.