diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/util/MetadataTransfer.java b/core/src/main/java/com/digitalpebble/stormcrawler/util/MetadataTransfer.java
index 3e9d5568e..6df5a940e 100644
--- a/core/src/main/java/com/digitalpebble/stormcrawler/util/MetadataTransfer.java
+++ b/core/src/main/java/com/digitalpebble/stormcrawler/util/MetadataTransfer.java
@@ -65,13 +65,13 @@ public class MetadataTransfer {
     /** Metadata key name for tracking a non-default max depth */
     public static final String maxDepthKeyName = "max.depth";
 
-    private final Set<String> mdToTransfer = new HashSet<>();
+    protected final Set<String> mdToTransfer = new HashSet<>();
 
-    private final Set<String> mdToPersistOnly = new HashSet<>();
+    protected final Set<String> mdToPersistOnly = new HashSet<>();
 
-    private boolean trackPath = true;
+    protected boolean trackPath = true;
 
-    private boolean trackDepth = true;
+    protected boolean trackDepth = true;
 
     public static MetadataTransfer getInstance(Map<String, Object> conf) {
         String className = ConfUtils.getString(conf, metadataTransferClassParamName);
@@ -156,12 +156,39 @@ public Metadata filter(Metadata metadata) {
         return filtered_md;
     }
 
+    /**
+     * Filter the metadata based on a set of keys. If a key ends with a * then all the keys starting
+     * with the prefix will be added.
+     *
+     * @param metadata metadata to read the values from
+     * @param filter key names to keep; a trailing * turns an entry into a prefix match
+     * @return a new Metadata instance holding the values of the selected keys
+     */
     private Metadata _filter(Metadata metadata, Set<String> filter) {
         Metadata filtered_md = new Metadata();
+
+        // resolve every entry to concrete key names first, so each value is copied only once
+        Set<String> filterKeys = new HashSet<>();
         for (String key : filter) {
-            String[] vals = metadata.getValues(key);
-            if (vals != null) filtered_md.setValues(key, vals);
+            if (key.endsWith("*")) {
+                String prefix = key.substring(0, key.length() - 1);
+                for (String mdKey : metadata.keySet()) {
+                    if (mdKey.startsWith(prefix)) {
+                        filterKeys.add(mdKey);
+                    }
+                }
+            } else {
+                filterKeys.add(key);
+            }
         }
+
+        for (String filterKey : filterKeys) {
+            String[] values = metadata.getValues(filterKey);
+            if (values != null) {
+                filtered_md.setValues(filterKey, values);
+            }
+        }
+
         return filtered_md;
     }
 }
diff --git a/core/src/test/java/com/digitalpebble/stormcrawler/util/MetadataTransferTest.java b/core/src/test/java/com/digitalpebble/stormcrawler/util/MetadataTransferTest.java
index 13b4b9034..9cc1b745f 100644
--- a/core/src/test/java/com/digitalpebble/stormcrawler/util/MetadataTransferTest.java
+++ b/core/src/test/java/com/digitalpebble/stormcrawler/util/MetadataTransferTest.java
@@ -17,6 +17,7 @@
 import com.digitalpebble.stormcrawler.Metadata;
 import java.net.MalformedURLException;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import org.junit.Assert;
 import org.junit.Test;
@@ -61,6 +62,46 @@ public void testCustomTransferClass() throws MalformedURLException {
         }
         Assert.assertEquals(false, hasThrownException);
     }
+
+    /** Exercises filter() with an empty list, prefix wildcards and a catch-all wildcard. */
+    @Test
+    public void testFilter() {
+        Metadata metadata = new Metadata();
+        metadata.addValue("fetch.statusCode", "500");
+        metadata.addValue("fetch.error.count", "2");
+        metadata.addValue("fetch.exception", "java.lang.Exception");
+        metadata.addValue("fetchInterval", "200");
+        metadata.addValue("isFeed", "true");
+        metadata.addValue("depth", "1");
+
+        // test for empty metadata.transfer list
+        Map<String, Object> conf = new HashMap<>();
+        conf.put(MetadataTransfer.metadataTransferParamName, List.of());
+        MetadataTransfer mdt = MetadataTransfer.getInstance(conf);
+        Metadata filteredMetadata = mdt.filter(metadata);
+        Assert.assertEquals(2, filteredMetadata.size());
+
+        // test for metadata.transfer list with asterisk entry
+        conf = new HashMap<>();
+        conf.put(MetadataTransfer.metadataTransferParamName, List.of("fetch*"));
+        mdt = MetadataTransfer.getInstance(conf);
+        filteredMetadata = mdt.filter(metadata);
+        Assert.assertEquals(5, filteredMetadata.size());
+
+        // test for metadata.transfer list with asterisk entry after a dot
+        conf = new HashMap<>();
+        conf.put(MetadataTransfer.metadataTransferParamName, List.of("fetch.*"));
+        mdt = MetadataTransfer.getInstance(conf);
+        filteredMetadata = mdt.filter(metadata);
+        Assert.assertEquals(4, filteredMetadata.size());
+
+        // test for transfer all metadata
+        conf = new HashMap<>();
+        conf.put(MetadataTransfer.metadataTransferParamName, List.of("*"));
+        mdt = MetadataTransfer.getInstance(conf);
+        filteredMetadata = mdt.filter(metadata);
+        Assert.assertEquals(6, filteredMetadata.size());
+    }
 }
 
 class myCustomTransferClass extends MetadataTransfer {}