Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Weekday & time of day #177

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.TimeZone;

Expand Down Expand Up @@ -487,8 +488,7 @@ public SolrRecord extract( String archiveName, ArchiveRecord record, boolean isT
if (results.getResults().size() > 0) {
SolrDocument fr = results.getResults().get(0);
if (fr.containsKey(SolrFields.CRAWL_DATES)) {
for (Object cds : fr
.getFieldValues(SolrFields.CRAWL_DATES)) {
for (Object cds : fr.getFieldValues(SolrFields.CRAWL_DATES)) {
currentCrawlDates.add((Date) cds);
}
}
Expand Down Expand Up @@ -525,8 +525,9 @@ public SolrRecord extract( String archiveName, ArchiveRecord record, boolean isT
dateList.add(crawlDate);
Collections.sort(dateList);
Date firstDate = dateList.get(0);
solr.setField(SolrFields.CRAWL_DATE,
formatter.format(firstDate));
solr.setField(SolrFields.CRAWL_DATE, formatter.format(firstDate));
solr.setField(SolrFields.CRAWL_WEEKDAY, formatter_weekday.format(firstDate));
solr.setField(SolrFields.CRAWL_TIME_OF_DAY, formatter_time_of_day.format(firstDate));
solr.setField( SolrFields.CRAWL_YEAR, getYearFromDate(firstDate) );

// Use the current value as the waybackDate:
Expand Down Expand Up @@ -755,12 +756,15 @@ protected static String parseExtension(String path) {

/**
* Timestamp parsing, for the Crawl Date.
* Note: DateFormatters are not Thread safe
*/

public static SimpleDateFormat formatter = new SimpleDateFormat(
"yyyy-MM-dd'T'HH:mm:ss'Z'");
public static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
public static SimpleDateFormat formatter_weekday = new SimpleDateFormat("EEEE", Locale.ENGLISH);
public static SimpleDateFormat formatter_time_of_day = new SimpleDateFormat("'0001-01-01T'HH:mm:ss'Z'");
static {
    // Every formatter here emits a value that is indexed as UTC: the date
    // patterns end in a literal 'Z'. Pin each one to GMT so the output does
    // not shift with the JVM default time zone.
    formatter.setTimeZone( TimeZone.getTimeZone( "GMT" ) );
    formatter_weekday.setTimeZone( TimeZone.getTimeZone( "GMT" ) );
    // The time-of-day value must use the same zone as the crawl date and weekday,
    // otherwise weekday + time-of-day filters would mix GMT and local time.
    formatter_time_of_day.setTimeZone( TimeZone.getTimeZone( "GMT" ) );
}

/**
Expand Down
4 changes: 4 additions & 0 deletions warc-indexer/src/main/java/uk/bl/wa/solr/SolrFields.java
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,15 @@ public interface SolrFields {
public static final String WAYBACK_DATE = "wayback_date";
public static final String CRAWL_DATE = "crawl_date";
public static final String CRAWL_DATES = "crawl_dates";
public static final String CRAWL_WEEKDAY = "crawl_weekday";
public static final String CRAWL_TIME_OF_DAY = "crawl_time_of_day";
public static final String CRAWL_YEAR = "crawl_year";
public static final String CRAWL_YEARS = "crawl_years";
public static final String PUBLICATION_DATE = "publication_date"; // Does not seem to be used
public static final String PUBLICATION_YEAR = "publication_year"; // Does not seem to be used
public static final String LAST_MODIFIED = "last_modified";
public static final String LAST_MODIFIED_WEEKDAY = "last_modified_weekday";
public static final String LAST_MODIFIED_TIME_OF_DAY = "last_modified_time_of_day";
public static final String LAST_MODIFIED_YEAR = "last_modified_year";

//Image Exif metadata
Expand Down
4 changes: 4 additions & 0 deletions warc-indexer/src/main/java/uk/bl/wa/solr/TikaExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import java.io.Writer;
import java.util.Date;
import java.util.List;
import java.util.Locale;

import org.apache.commons.io.input.BoundedInputStream;
import org.apache.commons.lang.StringUtils;
Expand Down Expand Up @@ -348,6 +349,9 @@ public SolrRecord extract( SolrRecord solr, InputStream is, String url ) throws
// solr.getSolrDocument().setField(SolrFields.LAST_MODIFIED,
// edate);
solr.setField(SolrFields.LAST_MODIFIED, iso_df.print(edate));
solr.setField(SolrFields.LAST_MODIFIED_TIME_OF_DAY,
"0001-01-01T" + ISODateTimeFormat.hourMinuteSecond().withZoneUTC().print(edate));
solr.setField(SolrFields.LAST_MODIFIED_WEEKDAY, edate.dayOfWeek().getAsText(Locale.ENGLISH));
}
}

Expand Down
4 changes: 4 additions & 0 deletions warc-indexer/src/main/solr/solr/discovery/conf/schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@
<field name="content_type_version" type="string" indexed="true" docValues="true" multiValued="false"/>
<field name="crawl_dates" type="tdate" indexed="true" stored="true" multiValued="true"/>
<field name="crawl_date" type="tdate" indexed="true" stored="false" multiValued="false" docValues="true"/>
<field name="crawl_weekday" type="string" indexed="true" stored="false" multiValued="false" docValues="true"/>
<field name="crawl_time_of_day" type="tdate" indexed="true" stored="false" multiValued="false" docValues="true"/>
<field name="crawl_year_month_day" type="int" indexed="true" docValues="true" multiValued="false"/>
<field name="crawl_year_month" type="int" indexed="true" docValues="true" multiValued="false"/>
<field name="crawl_years" type="int" indexed="true" docValues="true" multiValued="true"/>
Expand All @@ -87,6 +89,8 @@
<field name="image_width" type="tlong" indexed="true" stored="false" docValues="true" multiValued="false"/>
<field name="keywords" type="text_general" indexed="true" stored="true"/>
<field name="last_modified" type="tdate" indexed="true" stored="false" docValues="true"/>
<field name="last_modified_weekday" type="string" indexed="true" stored="false" multiValued="false" docValues="true"/>
<field name="last_modified_time_of_day" type="date" indexed="true" stored="false" multiValued="false" docValues="true"/>
<field name="last_modified_year" type="string" indexed="true" docValues="true"/>
<field name="license_url" type="string" indexed="true" docValues="true" multiValued="true"/>
<field name="links_images" type="string" indexed="true" docValues="true" multiValued="true"/>
Expand Down
28 changes: 27 additions & 1 deletion warc-indexer/src/main/solr/solr7/discovery/conf/schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,20 @@ This schema is for Solr 7+ and will not work under Solr 6.
<!-- month_day & day not used for anything -->
<!-- <field name="crawl_year_month_day" type="int" />
<field name="crawl_year_month" type="int" />-->
<!-- The weekday extracted from crawl_date (Monday, Tuesday, Wednesday...).
Sample use: Limiting searches to crawls done between Friday evening and Saturday morning:
fq=(crawl_weekday:Friday AND crawl_time_of_day:[0001-01-01T19:00:00Z TO *]) OR (crawl_weekday:Saturday AND crawl_time_of_day:[* TO 0001-01-01T06:00:00Z])
Note: This makes little sense for standard web resources as the crawl-time does not reflect when the
resource was created.
-->
<field name="crawl_weekday" type="string" />
<!-- The time of the day extracted from crawl_date, represented by fixing the date to 0001-01-01.
Sample use: Limiting searches to data harvested around noon:
fq=crawl_time_of_day:[0001-01-01T11:00:00Z TO 0001-01-01T14:00:00Z]
Note: This makes little sense for standard web resources as the crawl-time does not reflect when the
resource was created.
-->
<field name="crawl_time_of_day" type="date" />
<!-- If webarchive-discovery runs in update-mode, multiple harvests of the same URL will be collapsed to
a single document and the years from the dates from the different harvests will be added to this field -->
<field name="crawl_years" type="int" multiValued="true" />
Expand All @@ -151,10 +165,22 @@ This schema is for Solr 7+ and will not work under Solr 6.
Note: This is not a very reliable timestamp for most formats. JPEGs tend to work quite well.
Sample use: Sorting by age as stated in the format sort=last_modified asc -->
<field name="last_modified" type="date" />
<!-- The weekday extracted from last_modified (Monday, Tuesday, Wednesday...).
Sample use: Limiting searches to resources from Friday evening and Saturday morning:
fq=(last_modified_weekday:Friday AND last_modified_time_of_day:[0001-01-01T19:00:00Z TO *]) OR (last_modified_weekday:Saturday AND last_modified_time_of_day:[* TO 0001-01-01T06:00:00Z])
Note: This works very well for resources where the timestamp is valid, such as Twitter & Jodel posts.
-->
<field name="last_modified_weekday" type="string" />
<!-- The time of the day extracted from last_modified, represented by fixing the date to 0001-01-01.
Sample use: Limiting searches to resources created/updated around noon:
fq=last_modified_time_of_day:[0001-01-01T11:00:00Z TO 0001-01-01T14:00:00Z]
Note: This works very well for resources where the timestamp is valid, such as Twitter & Jodel posts.
-->
<field name="last_modified_time_of_day" type="date" />
<!-- The year from last_modified -->
<field name="last_modified_year" type="string" /> <!-- Why is this a string? -->

<!-- Heavily normalised URL: http/https is collepsed to http, everything is lowercased, trailing / are removed
<!-- Heavily normalised URL: http/https is collapsed to http, everything is lowercased, trailing / are removed
for all URLs, except those pointing to root, e.g. "http://example.com/". There is more processing than
that. If the field is to be queried with a user-provided URL, it is highly recommended to use the method
Normalisation.canonicaliseURL() from webarchive-discovery to ensure match.
Expand Down