Skip to content

Commit

Permalink
Issue #728: Adding asterisk for metadata transfer
Browse files Browse the repository at this point in the history
Signed-off-by: Michael Dinzinger <[email protected]>
  • Loading branch information
michaeldinzinger committed Nov 4, 2023
1 parent 91ae977 commit 374e0bb
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,13 @@ public class MetadataTransfer {
/** Metadata key name for tracking a non-default max depth */
public static final String maxDepthKeyName = "max.depth";

/**
 * Keys (or {@code prefix*} patterns) used as a filter set when selecting metadata.
 * Declared {@code protected} so that subclasses can access it.
 */
protected final Set<String> mdToTransfer = new HashSet<>();

/**
 * Second filter set of metadata keys, kept separately from {@link #mdToTransfer}.
 * NOTE(review): presumably the keys that are persisted but not passed on to outlinks —
 * confirm against the configuration code.
 */
protected final Set<String> mdToPersistOnly = new HashSet<>();

/** Path tracking flag; enabled by default. */
protected boolean trackPath = true;

/** Depth tracking flag; enabled by default. */
protected boolean trackDepth = true;

public static MetadataTransfer getInstance(Map<String, Object> conf) {
String className = ConfUtils.getString(conf, metadataTransferClassParamName);
Expand Down Expand Up @@ -156,12 +156,34 @@ public Metadata filter(Metadata metadata) {
return filtered_md;
}

/**
 * Filters the metadata based on a set of keys. A key ending with {@code *} is treated as a
 * prefix pattern: every metadata key starting with the text before the asterisk is selected.
 * A lone {@code *} therefore selects all keys. Any other key is matched literally.
 *
 * @param metadata the metadata to read keys and values from; it is not modified
 * @param filter the literal keys and/or {@code prefix*} patterns to keep
 * @return a new {@code Metadata} holding the values of every selected key that has values in
 *     {@code metadata}
 */
private Metadata _filter(Metadata metadata, Set<String> filter) {
    // First pass: expand every filter entry into the concrete keys to keep.
    Set<String> filterKeys = new HashSet<>();
    for (String key : filter) {
        if (key.endsWith("*")) {
            String prefix = key.substring(0, key.length() - 1);
            for (String mdKey : metadata.keySet()) {
                if (mdKey.startsWith(prefix)) {
                    filterKeys.add(mdKey);
                }
            }
        } else {
            filterKeys.add(key);
        }
    }

    // Second pass: copy the values once per resolved key. Doing this after the loop above
    // (rather than inside it) avoids re-copying the keys collected so far on every iteration.
    Metadata filtered_md = new Metadata();
    for (String filterKey : filterKeys) {
        String[] values = metadata.getValues(filterKey);
        // literal keys may be absent from the metadata
        if (values != null) {
            filtered_md.setValues(filterKey, values);
        }
    }

    return filtered_md;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import com.digitalpebble.stormcrawler.Metadata;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.junit.Assert;
import org.junit.Test;
Expand Down Expand Up @@ -61,6 +62,45 @@ public void testCustomTransferClass() throws MalformedURLException {
}
Assert.assertEquals(false, hasThrownException);
}

/**
 * Verifies the number of entries kept by {@code MetadataTransfer.filter} for an empty transfer
 * list, for prefix patterns ({@code fetch*}, {@code fetch.*}) and for the catch-all {@code *}.
 *
 * <p>NOTE(review): the expected counts are higher than the number of keys matched by the
 * transfer list alone, so some keys are presumably always kept by the default configuration
 * — confirm against the configuration code of {@code MetadataTransfer}.
 */
@Test
public void testFilter() {
    Metadata source = new Metadata();
    source.addValue("fetch.statusCode", "500");
    source.addValue("fetch.error.count", "2");
    source.addValue("fetch.exception", "java.lang.Exception");
    source.addValue("fetchInterval", "200");
    source.addValue("isFeed", "true");
    source.addValue("depth", "1");

    // empty metadata.transfer list
    Assert.assertEquals(2, filterWith(source, List.of()).size());

    // asterisk entry: matches "fetch.xxx" keys as well as "fetchInterval"
    Assert.assertEquals(5, filterWith(source, List.of("fetch*")).size());

    // asterisk entry after a dot: "fetchInterval" no longer matches
    Assert.assertEquals(4, filterWith(source, List.of("fetch.*")).size());

    // lone asterisk: transfer all metadata
    Assert.assertEquals(6, filterWith(source, List.of("*")).size());
}

/**
 * Builds a fresh configuration holding only the given transfer list, obtains a {@code
 * MetadataTransfer} from it and returns the filtered view of {@code source}.
 */
private static Metadata filterWith(Metadata source, List<String> transferKeys) {
    Map<String, Object> settings = new HashMap<>();
    settings.put(MetadataTransfer.metadataTransferParamName, transferKeys);
    MetadataTransfer transfer = MetadataTransfer.getInstance(settings);
    return transfer.filter(source);
}
}

/**
 * Empty subclass of {@link MetadataTransfer} that overrides nothing.
 * NOTE(review): presumably instantiated by class name in testCustomTransferClass, which would
 * make the (non-conventional) lowercase name part of the test fixture — confirm before renaming.
 */
class myCustomTransferClass extends MetadataTransfer {}

0 comments on commit 374e0bb

Please sign in to comment.