From 093ef16f4d895ed407bb3f413ec43862364cdb8c Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Tue, 29 May 2018 15:35:39 +0200 Subject: [PATCH 1/2] First attempt at issue #161 (indexing weekday) --- .../java/uk/bl/wa/indexer/WARCIndexer.java | 17 +++++++---- .../main/java/uk/bl/wa/solr/SolrFields.java | 4 +++ .../java/uk/bl/wa/solr/TikaExtractor.java | 4 +++ .../main/solr/solr/discovery/conf/schema.xml | 4 +++ .../main/solr/solr7/discovery/conf/schema.xml | 28 ++++++++++++++++++- 5 files changed, 50 insertions(+), 7 deletions(-) diff --git a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java index 8e5b97ce..e48008f2 100644 --- a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java +++ b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java @@ -43,6 +43,7 @@ import java.util.Date; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Properties; import java.util.TimeZone; @@ -487,8 +488,7 @@ public SolrRecord extract( String archiveName, ArchiveRecord record, boolean isT if (results.getResults().size() > 0) { SolrDocument fr = results.getResults().get(0); if (fr.containsKey(SolrFields.CRAWL_DATES)) { - for (Object cds : fr - .getFieldValues(SolrFields.CRAWL_DATES)) { + for (Object cds : fr.getFieldValues(SolrFields.CRAWL_DATES)) { currentCrawlDates.add((Date) cds); } } @@ -525,8 +525,9 @@ public SolrRecord extract( String archiveName, ArchiveRecord record, boolean isT dateList.add(crawlDate); Collections.sort(dateList); Date firstDate = dateList.get(0); - solr.setField(SolrFields.CRAWL_DATE, - formatter.format(firstDate)); + solr.setField(SolrFields.CRAWL_DATE, formatter.format(firstDate)); + solr.setField(SolrFields.CRAWL_WEEKDAY, formatter_weekday.format(firstDate)); + solr.setField(SolrFields.CRAWL_TIME_OF_DAY, formatter_time_of_day.format(firstDate)); solr.setField( SolrFields.CRAWL_YEAR, getYearFromDate(firstDate) ); // Use the 
current value as the waybackDate: @@ -755,12 +756,16 @@ protected static String parseExtension(String path) { /** * Timestamp parsing, for the Crawl Date. + * Note: DateFormatters are not Thread safe. All three formatters are set to GMT so that weekday and time of day agree with the crawl date. */ - public static SimpleDateFormat formatter = new SimpleDateFormat( - "yyyy-MM-dd'T'HH:mm:ss'Z'"); + public static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); + public static SimpleDateFormat formatter_weekday = new SimpleDateFormat("EEEE", Locale.ENGLISH); + public static SimpleDateFormat formatter_time_of_day = new SimpleDateFormat("'0001-01-01T'HH:mm:ss'Z'"); static { formatter.setTimeZone( TimeZone.getTimeZone( "GMT" ) ); + formatter_weekday.setTimeZone( TimeZone.getTimeZone( "GMT" ) ); + formatter_time_of_day.setTimeZone( TimeZone.getTimeZone( "GMT" ) ); } /** diff --git a/warc-indexer/src/main/java/uk/bl/wa/solr/SolrFields.java b/warc-indexer/src/main/java/uk/bl/wa/solr/SolrFields.java index 1f6ee25c..ffbc5b64 100644 --- a/warc-indexer/src/main/java/uk/bl/wa/solr/SolrFields.java +++ b/warc-indexer/src/main/java/uk/bl/wa/solr/SolrFields.java @@ -112,11 +112,15 @@ public interface SolrFields { public static final String WAYBACK_DATE = "wayback_date"; public static final String CRAWL_DATE = "crawl_date"; public static final String CRAWL_DATES = "crawl_dates"; + public static final String CRAWL_WEEKDAY = "crawl_weekday"; + public static final String CRAWL_TIME_OF_DAY = "crawl_time_of_day"; public static final String CRAWL_YEAR = "crawl_year"; public static final String CRAWL_YEARS = "crawl_years"; public static final String PUBLICATION_DATE = "publication_date"; // Does not seem to be used public static final String PUBLICATION_YEAR = "publication_year"; // Does not seem to be used public static final String LAST_MODIFIED = "last_modified"; + public static final String LAST_MODIFIED_WEEKDAY = "last_modified_weekday"; + public static final String LAST_MODIFIED_TIME_OF_DAY = "last_modified_time_of_day"; public static final String LAST_MODIFIED_YEAR = "last_modified_year"; //Image Exif metadata diff 
--git a/warc-indexer/src/main/java/uk/bl/wa/solr/TikaExtractor.java b/warc-indexer/src/main/java/uk/bl/wa/solr/TikaExtractor.java index 2b3bb9c6..e1ea5420 100644 --- a/warc-indexer/src/main/java/uk/bl/wa/solr/TikaExtractor.java +++ b/warc-indexer/src/main/java/uk/bl/wa/solr/TikaExtractor.java @@ -32,6 +32,7 @@ import java.io.Writer; import java.util.Date; import java.util.List; +import java.util.Locale; import org.apache.commons.io.input.BoundedInputStream; import org.apache.commons.lang.StringUtils; @@ -348,6 +349,9 @@ public SolrRecord extract( SolrRecord solr, InputStream is, String url ) throws // solr.getSolrDocument().setField(SolrFields.LAST_MODIFIED, // edate); solr.setField(SolrFields.LAST_MODIFIED, iso_df.print(edate)); + solr.setField(SolrFields.LAST_MODIFIED_TIME_OF_DAY, + "0001-01-01T" + ISODateTimeFormat.hourMinuteSecond().withZoneUTC().print(edate) + "Z"); + solr.setField(SolrFields.LAST_MODIFIED_WEEKDAY, edate.dayOfWeek().getAsText(Locale.ENGLISH)); } } diff --git a/warc-indexer/src/main/solr/solr/discovery/conf/schema.xml b/warc-indexer/src/main/solr/solr/discovery/conf/schema.xml index 5a8fb60f..3bda78f8 100644 --- a/warc-indexer/src/main/solr/solr/discovery/conf/schema.xml +++ b/warc-indexer/src/main/solr/solr/discovery/conf/schema.xml @@ -64,6 +64,8 @@ + + @@ -87,6 +89,8 @@ + + diff --git a/warc-indexer/src/main/solr/solr7/discovery/conf/schema.xml b/warc-indexer/src/main/solr/solr7/discovery/conf/schema.xml index ce5bb97c..708ccf3a 100644 --- a/warc-indexer/src/main/solr/solr7/discovery/conf/schema.xml +++ b/warc-indexer/src/main/solr/solr7/discovery/conf/schema.xml @@ -140,6 +140,20 @@ This schema is for Solr 7+ and will not work under Solr 6. + + + + @@ -151,10 +165,22 @@ This schema is for Solr 7+ and will not work under Solr 6. Note: This is not a very reliable timestamp for most formats. JPEGs tend to work quite well. Sample use: Sorting by age as stated in the format sort=last_modified asc --> + + + + -