diff --git a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java
index 8e5b97ce..e48008f2 100644
--- a/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java
+++ b/warc-indexer/src/main/java/uk/bl/wa/indexer/WARCIndexer.java
@@ -43,6 +43,7 @@
import java.util.Date;
import java.util.HashSet;
import java.util.List;
+import java.util.Locale;
import java.util.Properties;
import java.util.TimeZone;
@@ -487,8 +488,7 @@ public SolrRecord extract( String archiveName, ArchiveRecord record, boolean isT
if (results.getResults().size() > 0) {
SolrDocument fr = results.getResults().get(0);
if (fr.containsKey(SolrFields.CRAWL_DATES)) {
- for (Object cds : fr
- .getFieldValues(SolrFields.CRAWL_DATES)) {
+ for (Object cds : fr.getFieldValues(SolrFields.CRAWL_DATES)) {
currentCrawlDates.add((Date) cds);
}
}
@@ -525,8 +525,9 @@ public SolrRecord extract( String archiveName, ArchiveRecord record, boolean isT
dateList.add(crawlDate);
Collections.sort(dateList);
Date firstDate = dateList.get(0);
- solr.setField(SolrFields.CRAWL_DATE,
- formatter.format(firstDate));
+ solr.setField(SolrFields.CRAWL_DATE, formatter.format(firstDate));
+ solr.setField(SolrFields.CRAWL_WEEKDAY, formatter_weekday.format(firstDate));
+ solr.setField(SolrFields.CRAWL_TIME_OF_DAY, formatter_time_of_day.format(firstDate));
solr.setField( SolrFields.CRAWL_YEAR, getYearFromDate(firstDate) );
// Use the current value as the waybackDate:
@@ -755,12 +756,17 @@ protected static String parseExtension(String path) {
/**
* Timestamp parsing, for the Crawl Date.
+ * Note: SimpleDateFormat is not thread-safe, so these shared instances must not be used concurrently without external synchronization.
+ * All three formatters render in GMT; the time-of-day pattern emits the fixed date 0001-01-01 followed by the time with a literal 'Z' suffix.
*/
- public static SimpleDateFormat formatter = new SimpleDateFormat(
- "yyyy-MM-dd'T'HH:mm:ss'Z'");
+ public static SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
+ public static SimpleDateFormat formatter_weekday = new SimpleDateFormat("EEEE", Locale.ENGLISH);
+ public static SimpleDateFormat formatter_time_of_day = new SimpleDateFormat("'0001-01-01T'HH:mm:ss'Z'");
static {
formatter.setTimeZone( TimeZone.getTimeZone( "GMT" ) );
+ formatter_weekday.setTimeZone( TimeZone.getTimeZone( "GMT" ) );
+ formatter_time_of_day.setTimeZone( TimeZone.getTimeZone( "GMT" ) );
}
/**
diff --git a/warc-indexer/src/main/java/uk/bl/wa/solr/SolrFields.java b/warc-indexer/src/main/java/uk/bl/wa/solr/SolrFields.java
index 1f6ee25c..ffbc5b64 100644
--- a/warc-indexer/src/main/java/uk/bl/wa/solr/SolrFields.java
+++ b/warc-indexer/src/main/java/uk/bl/wa/solr/SolrFields.java
@@ -112,11 +112,15 @@ public interface SolrFields {
public static final String WAYBACK_DATE = "wayback_date";
public static final String CRAWL_DATE = "crawl_date";
public static final String CRAWL_DATES = "crawl_dates";
+ public static final String CRAWL_WEEKDAY = "crawl_weekday";
+ public static final String CRAWL_TIME_OF_DAY = "crawl_time_of_day";
public static final String CRAWL_YEAR = "crawl_year";
public static final String CRAWL_YEARS = "crawl_years";
public static final String PUBLICATION_DATE = "publication_date"; // Does not seem to be used
public static final String PUBLICATION_YEAR = "publication_year"; // Does not seem to be used
public static final String LAST_MODIFIED = "last_modified";
+ public static final String LAST_MODIFIED_WEEKDAY = "last_modified_weekday";
+ public static final String LAST_MODIFIED_TIME_OF_DAY = "last_modified_time_of_day";
public static final String LAST_MODIFIED_YEAR = "last_modified_year";
//Image Exif metadata
diff --git a/warc-indexer/src/main/java/uk/bl/wa/solr/TikaExtractor.java b/warc-indexer/src/main/java/uk/bl/wa/solr/TikaExtractor.java
index 2b3bb9c6..5bdc7e9e 100644
--- a/warc-indexer/src/main/java/uk/bl/wa/solr/TikaExtractor.java
+++ b/warc-indexer/src/main/java/uk/bl/wa/solr/TikaExtractor.java
@@ -32,6 +32,7 @@
import java.io.Writer;
import java.util.Date;
import java.util.List;
+import java.util.Locale;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.commons.lang.StringUtils;
@@ -348,6 +349,9 @@ public SolrRecord extract( SolrRecord solr, InputStream is, String url ) throws
// solr.getSolrDocument().setField(SolrFields.LAST_MODIFIED,
// edate);
solr.setField(SolrFields.LAST_MODIFIED, iso_df.print(edate));
+ solr.setField(SolrFields.LAST_MODIFIED_TIME_OF_DAY,
+ "0001-01-01T" + ISODateTimeFormat.hourMinuteSecond().withZoneUTC().print(edate));
+ solr.setField(SolrFields.LAST_MODIFIED_WEEKDAY, edate.dayOfWeek().getAsText(Locale.ENGLISH));
}
}
diff --git a/warc-indexer/src/main/solr/solr/discovery/conf/schema.xml b/warc-indexer/src/main/solr/solr/discovery/conf/schema.xml
index 5a8fb60f..3bda78f8 100644
--- a/warc-indexer/src/main/solr/solr/discovery/conf/schema.xml
+++ b/warc-indexer/src/main/solr/solr/discovery/conf/schema.xml
@@ -64,6 +64,8 @@
+
+
@@ -87,6 +89,8 @@
+
+
diff --git a/warc-indexer/src/main/solr/solr7/discovery/conf/schema.xml b/warc-indexer/src/main/solr/solr7/discovery/conf/schema.xml
index ce5bb97c..708ccf3a 100644
--- a/warc-indexer/src/main/solr/solr7/discovery/conf/schema.xml
+++ b/warc-indexer/src/main/solr/solr7/discovery/conf/schema.xml
@@ -140,6 +140,20 @@ This schema is for Solr 7+ and will not work under Solr 6.
+
+
+
+
@@ -151,10 +165,22 @@ This schema is for Solr 7+ and will not work under Solr 6.
Note: This is not a very reliable timestamp for most formats. JPEGs tend to work quite well.
Sample use: Sorting by age as stated in the format sort=last_modified asc -->
+
+
+
+
-