Skip to content

Commit

Permalink
Fixed issue with illegal URLs containing spaces being returned by SearXNG (#42)
Browse files: browse the repository at this point in the history
  • Loading branch information
bnayfeh authored Dec 28, 2024
1 parent 34577ce commit 3155af0
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ private static String toCSV(List<?> values) {
return String.join(",", values.toString());
}

/**
 * Builds a {@link URI} from a raw URL string, percent-encoding embedded whitespace.
 *
 * <p>Leading and trailing whitespace is removed first, so a padded URL (for example one
 * ending in a newline) does not gain a spurious {@code %20} prefix or suffix. Every
 * remaining run of whitespace is then replaced by a single {@code %20}.
 *
 * @param urlString the URL text; must not be {@code null} or blank
 * @return the URI parsed from the cleaned-up string
 * @throws IllegalArgumentException if {@code urlString} is {@code null} or blank, or if the
 *     cleaned-up string still violates URI syntax (raised by {@link URI#create(String)})
 */
static URI makeURI(String urlString) {
    if (urlString == null || urlString.isBlank()) {
        throw new IllegalArgumentException("urlString can not be null or blank");
    }
    // Strip before encoding: surrounding whitespace is padding, not part of the URL.
    final String trimmed = urlString.strip();
    return URI.create(trimmed.replaceAll("\\s+", "%20"));
}

private static Map<String, String> extractMetadata(SearXNGResult result) {
final Map<String, String> metadata = new HashMap<>();
metadata.put("engine", result.getEngine());
Expand All @@ -60,13 +67,8 @@ private static Map<String, String> extractMetadata(SearXNGResult result) {
}

// Builds a WebSearchOrganicResult from a SearXNGResult using its title, URL, content
// and the map returned by extractMetadata(result).
// NOTE(review): this span comes from a rendered diff without +/- markers. The FIXME
// block and the `URI.create(url)` argument line look like the removed version, and the
// `makeURI(result.getUrl())` argument line like its replacement — confirm against the
// repository before treating both as live code.
private static WebSearchOrganicResult toWebSearchOrganicResult(SearXNGResult result) {
// FIXME: temporary workaround for illegal URI characters; raise an issue to solve it properly.
String url = result.getUrl();
// Drops everything from the first '#' onwards, i.e. the fragment part of the URL.
int illegalChar = url.indexOf('#');
url = illegalChar == -1 ? url : url.substring(0, illegalChar);

return WebSearchOrganicResult.from(
result.getTitle(), URI.create(url), result.getContent(), null, extractMetadata(result));
result.getTitle(), makeURI(result.getUrl()), result.getContent(), null, extractMetadata(result));
}

private static boolean hasValue(String value) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package dev.langchain4j.community.web.search.searxng;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertThrows;

import java.net.URI;
import org.junit.jupiter.api.Test;

/**
 * Tests for {@code SearXNGWebSearchEngine.makeURI}, plus a baseline check of how
 * {@link URI#create(String)} itself reacts to URLs that contain whitespace.
 */
class SearXNGWebSearchEngineTest {

    /** A URL without fragment or whitespace; legal as-is. */
    private static final String PLAIN_URL =
            "https://www.linkedin.com/pulse/introduction-langchain4j-supercharging-java-llms-ibrahim-jimoh-kyj4f";

    /** Same URL with a text fragment that contains no whitespace; legal as-is. */
    private static final String FRAGMENT_URL = PLAIN_URL + "#:~:text=LangChain4j";

    /** Same URL with a text fragment containing raw spaces; rejected by {@code URI.create}. */
    private static final String SPACED_FRAGMENT_URL = PLAIN_URL
            + "#:~:text=LangChain4j is a groundbreaking Java,by Python and JavaScript libraries.";

    /**
     * Asserts that {@code makeURI} accepts the given string and yields the expected text:
     * unchanged when there is no whitespace, otherwise with each whitespace run as {@code %20}.
     */
    private static void testURI(String uriString) {
        // assertDoesNotThrow hands back the supplier's result, so makeURI runs only once.
        final URI uri = assertDoesNotThrow(() -> SearXNGWebSearchEngine.makeURI(uriString));
        assertNotNull(uri);

        final String actual = uri.toString();
        if (uriString.matches(".*\\s+.*")) {
            assertNotEquals(uriString, actual);
            assertEquals(uriString.replaceAll("\\s+", "%20"), actual);
        } else {
            assertEquals(uriString, actual);
        }
    }

    /** Documents the raw {@code URI.create} behaviour that makeURI works around. */
    @Test
    void test_malformed_urls() {
        assertThrows(IllegalArgumentException.class, () -> URI.create(SPACED_FRAGMENT_URL));
        assertDoesNotThrow(() -> URI.create(FRAGMENT_URL));
        assertDoesNotThrow(() -> URI.create(SPACED_FRAGMENT_URL.replaceAll(" ", "%20")));
        assertDoesNotThrow(() -> URI.create(SPACED_FRAGMENT_URL.replaceAll("\\s+", "%20")));
        assertDoesNotThrow(() -> URI.create(PLAIN_URL));
    }

    /** Covers null/empty/blank rejection and the three representative URL shapes. */
    @Test
    void test_make_uri() {
        assertThrows(IllegalArgumentException.class, () -> SearXNGWebSearchEngine.makeURI(null));
        assertThrows(IllegalArgumentException.class, () -> SearXNGWebSearchEngine.makeURI(""));
        // A real tab character: the previous literal " \\t" was space + backslash + 't', which
        // is not blank and only threw because the backslash is an illegal URI character.
        assertThrows(IllegalArgumentException.class, () -> SearXNGWebSearchEngine.makeURI(" \t"));
        testURI(FRAGMENT_URL);
        testURI(SPACED_FRAGMENT_URL);
        testURI(PLAIN_URL);
    }
}

0 comments on commit 3155af0

Please sign in to comment.