Skip to content

Commit

Permalink
Issue #728: Adding asterisk for metadata transfer (#1117)
Browse files Browse the repository at this point in the history
* Issue #728: Adding asterisk for metadata transfer

Signed-off-by: Michael Dinzinger <[email protected]>

* Add method keySet(prefix) and metadata.copy

Signed-off-by: Michael Dinzinger <[email protected]>

* Add comments to crawler-default.yaml and add test cases for metadata.persist + transfer

Signed-off-by: Michael Dinzinger <[email protected]>

---------

Signed-off-by: Michael Dinzinger <[email protected]>
  • Loading branch information
michaeldinzinger authored Nov 13, 2023
1 parent 5e8802f commit 4d3340f
Show file tree
Hide file tree
Showing 11 changed files with 152 additions and 52 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,16 @@ config:
- com.digitalpebble.stormcrawler.Metadata
- com.digitalpebble.stormcrawler.persistence.Status

# metadata to transfer to the outlinks
# used by Fetcher for redirections, sitemapparser, etc...
# these are also persisted for the parent document (see below)
# Lists the metadata to transfer to outlinks
# Used by Fetcher and SiteMapParser for redirections,
# discovered links, passing cookies to child pages, etc.
# These are also persisted for the parent document (see below).
# Allows wildcards, e.g. "follow.*" transfers all metadata whose key starts with "follow.".
# metadata.transfer:
# - customMetadataName

# lists the metadata to persist to storage
# these are not transferred to the outlinks
# Lists the metadata to persist to storage
# These are not transferred to the outlinks. Also allows wildcards, e.g. "follow.*".
metadata.persist:
- _redirTo
- error.cause
Expand Down
18 changes: 18 additions & 0 deletions core/src/main/java/com/digitalpebble/stormcrawler/Metadata.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang.StringUtils;

/** Wrapper around Map &lt;String,String[]&gt; * */
Expand Down Expand Up @@ -208,6 +209,13 @@ public Set<String> keySet() {
return md.keySet();
}

/**
 * Returns the keys of this metadata that start with the given prefix.
 *
 * <p>The matching keys are collected into a new set on every call.
 *
 * @param prefix the leading string a key must have to be selected
 * @return the keys starting with {@code prefix}; empty if no key matches
 */
public Set<String> keySet(String prefix) {
    final Set<String> matchingKeys =
            md.keySet().stream()
                    .filter(candidate -> candidate.startsWith(prefix))
                    .collect(Collectors.toSet());
    return matchingKeys;
}

/** Returns the first non empty value found for the keys or null if none found. */
public static String getFirstValue(Metadata md, String... keys) {
for (String key : keys) {
Expand All @@ -218,6 +226,16 @@ public static String getFirstValue(Metadata md, String... keys) {
return null;
}

/**
 * Copies the values array stored under a given key to another metadata object.
 *
 * <p>If this metadata holds no values for the key, the target is left untouched. The array
 * obtained from {@code getValues} is handed to the target's {@code setValues} as is; no copy
 * of the array is made here.
 *
 * @param targetMetadata the metadata to copy to
 * @param key the key to copy
 */
public void copy(Metadata targetMetadata, String key) {
    String[] values = getValues(key);
    // Absent key: do not forward a null array to the target.
    if (values == null) {
        return;
    }
    targetMetadata.setValues(key, values);
}

/** Returns the underlying Map * */
public Map<String, String[]> asMap() {
return md;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -653,8 +653,7 @@ public void run() {

// get any metrics from the protocol metadata
// expect Longs
response.getMetadata().keySet().stream()
.filter(s -> s.startsWith("metrics."))
response.getMetadata().keySet("metrics.").stream()
.forEach(
s ->
averagedMetrics
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -425,8 +425,7 @@ public void execute(Tuple input) {
final int byteLength = response.getContent().length;

// get any metrics from the protocol metadata
response.getMetadata().keySet().stream()
.filter(s -> s.startsWith("metrics."))
response.getMetadata().keySet("metrics.").stream()
.forEach(
s ->
averagedMetrics
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,13 @@ public class MetadataTransfer {
/** Metadata key name for tracking a non-default max depth */
public static final String maxDepthKeyName = "max.depth";

private final Set<String> mdToTransfer = new HashSet<>();
protected final Set<String> mdToTransfer = new HashSet<>();

private final Set<String> mdToPersistOnly = new HashSet<>();
protected final Set<String> mdToPersistOnly = new HashSet<>();

private boolean trackPath = true;
protected boolean trackPath = true;

private boolean trackDepth = true;
protected boolean trackDepth = true;

public static MetadataTransfer getInstance(Map<String, Object> conf) {
String className = ConfUtils.getString(conf, metadataTransferClassParamName);
Expand Down Expand Up @@ -156,12 +156,24 @@ public Metadata filter(Metadata metadata) {
return filtered_md;
}

/**
 * Filters the metadata based on a set of keys. If a key ends with a * then all the keys
 * starting with the prefix (the key without the trailing *) are added; otherwise only the
 * exact key is added, provided the metadata has values for it.
 *
 * @param metadata the metadata to read from; it is not modified
 * @param filter the exact keys and/or wildcard entries to keep
 * @return a new Metadata holding only the selected entries
 */
private Metadata _filter(Metadata metadata, Set<String> filter) {
    Metadata filtered_md = new Metadata();

    for (String key : filter) {
        if (key.endsWith("*")) {
            // Wildcard entry: strip the trailing * and copy every key sharing the prefix.
            // A lone "*" yields an empty prefix, which matches every key.
            String prefix = key.substring(0, key.length() - 1);
            for (String k : metadata.keySet(prefix)) {
                metadata.copy(filtered_md, k);
            }
        } else {
            // Exact entry: a single lookup, skipped when the key is absent.
            String[] vals = metadata.getValues(key);
            if (vals != null) {
                filtered_md.setValues(key, vals);
            }
        }
    }

    return filtered_md;
}
}
13 changes: 7 additions & 6 deletions core/src/main/resources/crawler-default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,16 @@ config:

urlbuffer.class: "com.digitalpebble.stormcrawler.persistence.urlbuffer.SimpleURLBuffer"

# metadata to transfer to the outlinks
# used by Fetcher for redirections, sitemapparser,
# passing cookies to child pages, etc.
# These are also persisted for the parent document (see below)
# Lists the metadata to transfer to outlinks
# Used by Fetcher and SiteMapParser for redirections,
# discovered links, passing cookies to child pages, etc.
# These are also persisted for the parent document (see below).
# Allows wildcards, e.g. "follow.*" transfers all metadata whose key starts with "follow.".
# metadata.transfer:
# - customMetadataName

# lists the metadata to persist to storage
# these are not transferred to the outlinks
# Lists the metadata to persist to storage
# These are not transferred to the outlinks. Also allows wildcards, e.g. "follow.*".
metadata.persist:
- _redirTo
- error.cause
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package com.digitalpebble.stormcrawler;

import org.junit.Assert;
import org.junit.Test;

public class MetadataTest {

@Test
public void testCopyWithPrefix() {
Metadata metadata = new Metadata();
metadata.addValue("fetch.statusCode", "500");
metadata.addValue("fetch.error.count", "2");
metadata.addValue("fetch.exception", "java.lang.Exception");
metadata.addValue("fetchInterval", "200");
metadata.addValue("isFeed", "true");
metadata.addValue("depth", "1");

Metadata copy = new Metadata();
for (String key : metadata.keySet("fetch.")) {
metadata.copy(copy, key);
}

Assert.assertEquals(3, copy.size());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -239,10 +239,7 @@ public void testNonSitemapParsing() throws IOException {
}

private void assertNewsAttributes(Metadata metadata) {
long numAttributes =
metadata.keySet().stream()
.filter(key -> key.startsWith(Extension.NEWS.name() + "."))
.count();
long numAttributes = metadata.keySet(Extension.NEWS.name() + ".").size();
Assert.assertEquals(7, numAttributes);
Assert.assertEquals(
"The Example Times", metadata.getFirstValue(Extension.NEWS.name() + "." + "name"));
Expand All @@ -265,10 +262,7 @@ private void assertNewsAttributes(Metadata metadata) {
}

private void assertImageAttributes(Metadata metadata) {
long numAttributes =
metadata.keySet().stream()
.filter(key -> key.startsWith(Extension.IMAGE.name() + "."))
.count();
long numAttributes = metadata.keySet(Extension.IMAGE.name() + ".").size();
Assert.assertEquals(5, numAttributes);
Assert.assertEquals(
"This is the caption.",
Expand All @@ -288,10 +282,7 @@ private void assertImageAttributes(Metadata metadata) {
}

private void assertLinksAttributes(Metadata metadata) {
long numAttributes =
metadata.keySet().stream()
.filter(key -> key.startsWith(Extension.LINKS.name() + "."))
.count();
long numAttributes = metadata.keySet(Extension.LINKS.name() + ".").size();
Assert.assertEquals(3, numAttributes);
Assert.assertEquals(
"alternate", metadata.getFirstValue(Extension.LINKS.name() + "." + "params.rel"));
Expand All @@ -303,10 +294,7 @@ private void assertLinksAttributes(Metadata metadata) {
}

private void assertVideoAttributes(Metadata metadata) {
long numAttributes =
metadata.keySet().stream()
.filter(key -> key.startsWith(Extension.VIDEO.name() + "."))
.count();
long numAttributes = metadata.keySet(Extension.VIDEO.name() + ".").size();
Assert.assertEquals(20, numAttributes);
Assert.assertEquals(
"http://www.example.com/thumbs/123.jpg",
Expand Down Expand Up @@ -362,10 +350,7 @@ private void assertVideoAttributes(Metadata metadata) {
}

private void assertMobileAttributes(Metadata metadata) {
long numAttributes =
metadata.keySet().stream()
.filter(key -> key.startsWith(Extension.MOBILE.name() + "."))
.count();
long numAttributes = metadata.keySet(Extension.MOBILE.name() + ".").size();
Assert.assertEquals(0, numAttributes);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
import com.digitalpebble.stormcrawler.Metadata;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.junit.Assert;
import org.junit.Test;

Expand All @@ -26,13 +28,27 @@ public class MetadataTransferTest {
public void testTransfer() throws MalformedURLException {
    // parent metadata: two keys matched by the "cookie.*" wildcard and one that is not
    Metadata parentMD = new Metadata();
    parentMD.addValue("cookie.id", "42");
    parentMD.addValue("cookie.source", "example.com");
    parentMD.addValue("fetchInterval", "200");

    Map<String, Object> conf = new HashMap<>();
    conf.put(MetadataTransfer.trackDepthParamName, true);
    conf.put(MetadataTransfer.metadataTransferParamName, List.of("cookie.*"));
    MetadataTransfer mdt = MetadataTransfer.getInstance(conf);

    Metadata outlinkMD =
            mdt.getMetaForOutlink(
                    "http://www.example.com/outlink.html", "http://www.example.com", parentMD);

    // the outlink must carry the depth, the url path and the cookie.* entries only
    Assert.assertEquals("1", outlinkMD.getFirstValue(MetadataTransfer.depthKeyName));
    Assert.assertEquals(
            Set.of(
                    MetadataTransfer.urlPathKeyName,
                    MetadataTransfer.depthKeyName,
                    "cookie.id",
                    "cookie.source"),
            outlinkMD.keySet());
    Assert.assertEquals(1, outlinkMD.getValues(MetadataTransfer.urlPathKeyName).length);
}
Expand Down Expand Up @@ -61,6 +77,45 @@ public void testCustomTransferClass() throws MalformedURLException {
}
Assert.assertEquals(false, hasThrownException);
}

@Test
public void testFilterWithAsterisk() {
    Metadata source = new Metadata();
    source.addValue("fetch.statusCode", "500");
    source.addValue("fetch.error.count", "2");
    source.addValue("fetch.exception", "java.lang.Exception");
    source.addValue("fetchInterval", "200");
    source.addValue("isFeed", "true");
    source.addValue("depth", "1");

    // empty metadata.persist list: two entries are still expected
    // NOTE(review): presumably keys that MetadataTransfer always keeps - confirm
    Map<String, Object> emptyListConf = new HashMap<>();
    emptyListConf.put(MetadataTransfer.metadataPersistParamName, List.of());
    Metadata keptWithEmptyList = MetadataTransfer.getInstance(emptyListConf).filter(source);
    Assert.assertEquals(2, keptWithEmptyList.size());

    // "fetch*": asterisk directly after the letters
    Map<String, Object> looseWildcardConf = new HashMap<>();
    looseWildcardConf.put(MetadataTransfer.metadataPersistParamName, List.of("fetch*"));
    Metadata keptWithLooseWildcard =
            MetadataTransfer.getInstance(looseWildcardConf).filter(source);
    Assert.assertEquals(5, keptWithLooseWildcard.size());

    // "fetch.*": asterisk after a dot
    Map<String, Object> dottedWildcardConf = new HashMap<>();
    dottedWildcardConf.put(MetadataTransfer.metadataPersistParamName, List.of("fetch.*"));
    Metadata keptWithDottedWildcard =
            MetadataTransfer.getInstance(dottedWildcardConf).filter(source);
    Assert.assertEquals(4, keptWithDottedWildcard.size());

    // "*": persist all metadata
    Map<String, Object> persistAllConf = new HashMap<>();
    persistAllConf.put(MetadataTransfer.metadataPersistParamName, List.of("*"));
    Metadata keptWithPersistAll = MetadataTransfer.getInstance(persistAllConf).filter(source);
    Assert.assertEquals(6, keptWithPersistAll.size());
}
}

// Empty subclass of MetadataTransfer that adds no behavior of its own.
// NOTE(review): presumably the custom class loaded by testCustomTransferClass - confirm.
class myCustomTransferClass extends MetadataTransfer {}
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,16 @@ config:
- com.digitalpebble.stormcrawler.Metadata
- com.digitalpebble.stormcrawler.persistence.Status

# metadata to transfer to the outlinks
# used by Fetcher for redirections, sitemapparser, etc...
# these are also persisted for the parent document (see below)
# Lists the metadata to transfer to outlinks
# Used by Fetcher and SiteMapParser for redirections,
# discovered links, passing cookies to child pages, etc.
# These are also persisted for the parent document (see below).
# Allows wildcards, e.g. "follow.*" transfers all metadata whose key starts with "follow.".
# metadata.transfer:
# - customMetadataName

# lists the metadata to persist to storage
# these are not transferred to the outlinks
# Lists the metadata to persist to storage
# These are not transferred to the outlinks. Also allows wildcards, e.g. "follow.*".
metadata.persist:
- _redirTo
- error.cause
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,16 @@ config:
- com.digitalpebble.stormcrawler.Metadata
- com.digitalpebble.stormcrawler.persistence.Status

# metadata to transfer to the outlinks
# used by Fetcher for redirections, sitemapparser, etc...
# these are also persisted for the parent document (see below)
# Lists the metadata to transfer to outlinks
# Used by Fetcher and SiteMapParser for redirections,
# discovered links, passing cookies to child pages, etc.
# These are also persisted for the parent document (see below).
# Allows wildcards, e.g. "follow.*" transfers all metadata whose key starts with "follow.".
# metadata.transfer:
# - customMetadataName

# lists the metadata to persist to storage
# these are not transferred to the outlinks
# Lists the metadata to persist to storage
# These are not transferred to the outlinks. Also allows wildcards, e.g. "follow.*".
metadata.persist:
- _redirTo
- error.cause
Expand Down

0 comments on commit 4d3340f

Please sign in to comment.