Skip to content

Commit

Permalink
Adding in more data and supporting JSONL output.
Browse files Browse the repository at this point in the history
  • Loading branch information
anjackson committed Sep 29, 2022
1 parent 5e2f930 commit baa131a
Show file tree
Hide file tree
Showing 4 changed files with 173 additions and 77 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,12 @@ public void testFullIndexerJob() throws Exception {
// Set up arguments for the job:
// FIXME The input file could be written by this test.
int reducers = 1;
String[] args = { "--dummy-run", "-w", "-S", "http://none", "-R",
String[] args = {
//"--jsonl",
"--no-solr",
"-w",
"-S", "http://none",
"-R",
""
+ reducers,
"-i",
Expand Down Expand Up @@ -97,9 +102,15 @@ public void testFullIndexerJob() throws Exception {
InputStream is = getFileSystem().open(output);
BufferedReader reader = new BufferedReader(new InputStreamReader(is));
String line = null;
int counter = 0;
while( ( line = reader.readLine()) != null ) {
log.info(line);
System.out.println(line);
//System.out.println(line);
counter++;
if( counter > 5) {
log.info("Stopping printing output after 5 lines...");
break;
}
}
reader.close();
} else {
Expand Down
91 changes: 39 additions & 52 deletions warc-indexer/src/main/java/uk/bl/wa/Memento.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,18 +67,18 @@ public class Memento {
//private byte[] content_ffb; // First four bytes, output as lower-case hex string. Derive from first bytes.

@JsonProperty("content_first_bytes")
private byte[] contentFirstBytes; // First 32 bytes, output as space-separated hex.
private String contentFirstBytes; // First 32 bytes, output as space-separated hex.

@JsonProperty("content_language")
private String contentLanguage;

@JsonProperty("content_length")
private long contentLength;
private Long contentLength;

// String content_metadata, // Not sure how to store that

@JsonProperty("content_text_length")
private long contentTextLength;
private Long contentTextLength;

@JsonProperty("content_type_droid")
private String contentTypeDroid;
Expand Down Expand Up @@ -111,10 +111,10 @@ public class Memento {

//List<String> hashes; // Not in use

// long id_long; // Not in use
// Long id_long; // Not in use

@JsonProperty("wayback_date")
private long waybackDate;
private Long waybackDate;

// List<Date> crawl_dates; // Not in use

Expand All @@ -123,12 +123,12 @@ public class Memento {

// List<int> crawl_years; // Not in use

// int crawl_year; // To be generated from crawl_date;
// Integer crawl_year; // To be generated from crawl_date;

@JsonProperty("last_modified")
private String lastModified;

// int last_modified_year; // To be generated from last_modified
// Integer last_modified_year; // To be generated from last_modified

@JsonProperty("url_norm")
private String urlNorm;
Expand All @@ -148,36 +148,33 @@ public class Memento {
private String host;

@JsonProperty("host_surt")
private List<String> hostSurt;
private String hostSurt;

@JsonProperty("public_suffix")
private String publicSuffix;

private String resourcename; // Needed?

@JsonProperty("resourcenameFacet")
private String resourcename_facet; // Needed? Copied from resourcename?

@JsonProperty("image_colours")
private List<String> imageColours;

@JsonProperty("image_dominant_colour")
private String imageDominantColour;

@JsonProperty("image_faces_count")
private int imageFacesCount;
private Integer imageFacesCount;

@JsonProperty("image_faces")
private List<String> imageFaces;

@JsonProperty("image_height")
private long imageHeight;
private Long imageHeight;

@JsonProperty("image_width")
private long imageWidth;
private Long imageWidth;

@JsonProperty("image_size")
private long imageSize;
private Long imageSize;

@JsonProperty("links_images")
private List<String> linksImages;
Expand Down Expand Up @@ -216,7 +213,7 @@ public class Memento {
private String publicationDate;

@JsonProperty("publication_year")
private int publicationYear;
private Integer publicationYear;

@JsonProperty("record_type")
private String recordType;
Expand All @@ -228,7 +225,7 @@ public class Memento {
private List<String> server;

@JsonProperty("status_code")
private int statusCode;
private Integer statusCode;

private List<String> generator;

Expand All @@ -241,7 +238,7 @@ public class Memento {
private String sourceFilePath;

@JsonProperty("source_file_offset")
private long sourceFileOffset;
private Long sourceFileOffset;

@JsonProperty("source_file")
private String sourceFile;
Expand Down Expand Up @@ -371,12 +368,12 @@ public void setContentTextOriginalEncoding(String contentTextOriginalEncoding) {
}


public byte[] getContentFirstBytes() {
public String getContentFirstBytes() {
return contentFirstBytes;
}


public void setContentFirstBytes(byte[] contentFirstBytes) {
public void setContentFirstBytes(String contentFirstBytes) {
this.contentFirstBytes = contentFirstBytes;
}

Expand All @@ -391,22 +388,22 @@ public void setContentLanguage(String contentLanguage) {
}


public long getContentLength() {
public Long getContentLength() {
return contentLength;
}


public void setContentLength(long contentLength) {
public void setContentLength(Long contentLength) {
this.contentLength = contentLength;
}


public long getContentTextLength() {
public Long getContentTextLength() {
return contentTextLength;
}


public void setContentTextLength(long contentTextLength) {
public void setContentTextLength(Long contentTextLength) {
this.contentTextLength = contentTextLength;
}

Expand Down Expand Up @@ -511,12 +508,12 @@ public void setHash(String hash) {
}


public long getWaybackDate() {
public Long getWaybackDate() {
return waybackDate;
}


public void setWaybackDate(long waybackDate) {
public void setWaybackDate(Long waybackDate) {
this.waybackDate = waybackDate;
}

Expand Down Expand Up @@ -601,12 +598,12 @@ public void setHost(String host) {
}


public List<String> getHostSurt() {
public String getHostSurt() {
return hostSurt;
}


public void setHostSurt(List<String> hostSurt) {
public void setHostSurt(String hostSurt) {
this.hostSurt = hostSurt;
}

Expand All @@ -631,16 +628,6 @@ public void setResourcename(String resourcename) {
}


public String getResourcename_facet() {
return resourcename_facet;
}


public void setResourcename_facet(String resourcename_facet) {
this.resourcename_facet = resourcename_facet;
}


public List<String> getImageColours() {
return imageColours;
}
Expand All @@ -661,12 +648,12 @@ public void setImageDominantColour(String imageDominantColour) {
}


public int getImageFacesCount() {
public Integer getImageFacesCount() {
return imageFacesCount;
}


public void setImageFacesCount(int imageFacesCount) {
public void setImageFacesCount(Integer imageFacesCount) {
this.imageFacesCount = imageFacesCount;
}

Expand All @@ -681,32 +668,32 @@ public void setImageFaces(List<String> imageFaces) {
}


public long getImageHeight() {
public Long getImageHeight() {
return imageHeight;
}


public void setImageHeight(long imageHeight) {
public void setImageHeight(Long imageHeight) {
this.imageHeight = imageHeight;
}


public long getImageWidth() {
public Long getImageWidth() {
return imageWidth;
}


public void setImageWidth(long imageWidth) {
public void setImageWidth(Long imageWidth) {
this.imageWidth = imageWidth;
}


public long getImageSize() {
public Long getImageSize() {
return imageSize;
}


public void setImageSize(long imageSize) {
public void setImageSize(Long imageSize) {
this.imageSize = imageSize;
}

Expand Down Expand Up @@ -841,12 +828,12 @@ public void setPublicationDate(String publicationDate) {
}


public int getPublicationYear() {
public Integer getPublicationYear() {
return publicationYear;
}


public void setPublicationYear(int publicationYear) {
public void setPublicationYear(Integer publicationYear) {
this.publicationYear = publicationYear;
}

Expand All @@ -871,12 +858,12 @@ public void setServer(List<String> server) {
}


public int getStatusCode() {
public Integer getStatusCode() {
return statusCode;
}


public void setStatusCode(int statusCode) {
public void setStatusCode(Integer statusCode) {
this.statusCode = statusCode;
}

Expand Down Expand Up @@ -911,12 +898,12 @@ public void setSourceFilePath(String sourceFilePath) {
}


public long getSourceFileOffset() {
public Long getSourceFileOffset() {
return sourceFileOffset;
}


public void setSourceFileOffset(long sourceFileOffset) {
public void setSourceFileOffset(Long sourceFileOffset) {
this.sourceFileOffset = sourceFileOffset;
}

Expand Down
Loading

0 comments on commit baa131a

Please sign in to comment.