diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java index 5c828bc1a..015403d06 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java @@ -30,6 +30,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.stream.Stream; import org.apache.commons.lang.StringUtils; import org.apache.storm.metric.api.MultiCountMetric; import org.apache.storm.task.OutputCollector; @@ -269,7 +270,10 @@ public void execute(Tuple tuple) { final URL baseURL = new URL(url); for (Element link : links) { // nofollow - boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel")); + String[] relkeywords = link.attr("rel").split(" "); + boolean noFollow = + Stream.of(relkeywords).anyMatch(x -> x.equalsIgnoreCase("nofollow")); + // remove altogether if (noFollow && robots_noFollow_strict) { continue; diff --git a/core/src/test/resources/digitalpebble.com.html b/core/src/test/resources/digitalpebble.com.html index 8b1de6881..683321640 100644 --- a/core/src/test/resources/digitalpebble.com.html +++ b/core/src/test/resources/digitalpebble.com.html @@ -50,6 +50,7 @@ +