From 3155af0559c5240c30f6debb652a83d13bbfc8d3 Mon Sep 17 00:00:00 2001
From: bnayfeh <45442435+bnayfeh@users.noreply.github.com>
Date: Sat, 28 Dec 2024 10:13:34 +0300
Subject: [PATCH] Fixed issue with illegal URLs containing spaces being returned by SearXNG (#42)

---
Review notes (this section is not part of the commit message):
 * makeURI now strips leading/trailing whitespace before encoding; a leading
   "%20" was rejected by URI.create as an illegal scheme name.
 * The test uses JUnit Jupiter assertions only, and the whitespace-only case
   passes a real tab (" \t") instead of a backslash followed by 't'.
 * Hunk headers and the diffstat were recomputed for the added lines; the
   commit id and blob ids on the "index" lines still refer to the original
   revision of this patch.

 .../searxng/SearXNGWebSearchEngine.java       | 25 +++++---
 .../searxng/SearXNGWebSearchEngineTest.java   | 62 +++++++++++++++++++
 2 files changed, 81 insertions(+), 6 deletions(-)
 create mode 100644 web-search-engines/langchain4j-community-web-search-engine-searxng/src/test/java/dev/langchain4j/community/web/search/searxng/SearXNGWebSearchEngineTest.java

diff --git a/web-search-engines/langchain4j-community-web-search-engine-searxng/src/main/java/dev/langchain4j/community/web/search/searxng/SearXNGWebSearchEngine.java b/web-search-engines/langchain4j-community-web-search-engine-searxng/src/main/java/dev/langchain4j/community/web/search/searxng/SearXNGWebSearchEngine.java
index 402392d..fab5537 100644
--- a/web-search-engines/langchain4j-community-web-search-engine-searxng/src/main/java/dev/langchain4j/community/web/search/searxng/SearXNGWebSearchEngine.java
+++ b/web-search-engines/langchain4j-community-web-search-engine-searxng/src/main/java/dev/langchain4j/community/web/search/searxng/SearXNGWebSearchEngine.java
@@ -49,6 +49,24 @@ private static String toCSV(List values) {
         return String.join(",", values.toString());
     }
 
+    /**
+     * Converts a URL string returned by SearXNG into a {@link URI}, tolerating raw whitespace.
+     * Leading and trailing whitespace is stripped and each remaining run of whitespace is
+     * replaced with a single {@code %20}, because {@link URI#create(String)} rejects raw whitespace.
+     *
+     * @param urlString the URL to convert; must not be {@code null} or blank
+     * @return the parsed URI
+     * @throws IllegalArgumentException if {@code urlString} is {@code null} or blank, or if the
+     *     sanitized string is still not a valid URI
+     */
+    static URI makeURI(String urlString) {
+        if (urlString == null || urlString.isBlank()) {
+            throw new IllegalArgumentException("urlString can not be null or blank");
+        }
+        // Strip first: a leading "%20" would be parsed as an illegal scheme name and rejected.
+        return URI.create(urlString.strip().replaceAll("\\s+", "%20"));
+    }
+
     private static Map extractMetadata(SearXNGResult result) {
         final Map metadata = new HashMap<>();
         metadata.put("engine", result.getEngine());
@@ -60,13 +78,8 @@ private static Map extractMetadata(SearXNGResult result) {
     }
 
     private static WebSearchOrganicResult toWebSearchOrganicResult(SearXNGResult result) {
-        // FIXME: temporarily fix URI illegal character, raise a issue to solve it.
-        String url = result.getUrl();
-        int illegalChar = url.indexOf('#');
-        url = illegalChar == -1 ? url : url.substring(0, illegalChar);
-
         return WebSearchOrganicResult.from(
-                result.getTitle(), URI.create(url), result.getContent(), null, extractMetadata(result));
+                result.getTitle(), makeURI(result.getUrl()), result.getContent(), null, extractMetadata(result));
     }
 
     private static boolean hasValue(String value) {
diff --git a/web-search-engines/langchain4j-community-web-search-engine-searxng/src/test/java/dev/langchain4j/community/web/search/searxng/SearXNGWebSearchEngineTest.java b/web-search-engines/langchain4j-community-web-search-engine-searxng/src/test/java/dev/langchain4j/community/web/search/searxng/SearXNGWebSearchEngineTest.java
new file mode 100644
index 0000000..91f9538
--- /dev/null
+++ b/web-search-engines/langchain4j-community-web-search-engine-searxng/src/test/java/dev/langchain4j/community/web/search/searxng/SearXNGWebSearchEngineTest.java
@@ -0,0 +1,62 @@
+package dev.langchain4j.community.web.search.searxng;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import java.net.URI;
+import org.junit.jupiter.api.Test;
+
+/** Unit tests for {@link SearXNGWebSearchEngine#makeURI(String)}. */
+class SearXNGWebSearchEngineTest {
+
+    /** Asserts that makeURI accepts the input and only rewrites it when it contains whitespace. */
+    private static void testURI(String uriString) {
+        assertDoesNotThrow(() -> SearXNGWebSearchEngine.makeURI(uriString));
+        final URI uri = SearXNGWebSearchEngine.makeURI(uriString);
+        assertNotNull(uri);
+        if (uriString.matches(".*\\s+.*")) {
+            assertNotEquals(uriString, uri.toString());
+        } else {
+            assertEquals(uriString, uri.toString());
+        }
+    }
+
+    @Test
+    void test_malformed_urls() {
+        // Baseline: shows which inputs java.net.URI itself rejects or accepts.
+        assertThrows(
+                IllegalArgumentException.class,
+                () -> URI.create(
+                        "https://www.linkedin.com/pulse/introduction-langchain4j-supercharging-java-llms-ibrahim-jimoh-kyj4f#:~:text=LangChain4j is a groundbreaking Java,by Python and JavaScript libraries."));
+        assertDoesNotThrow(
+                () -> URI.create(
+                        "https://www.linkedin.com/pulse/introduction-langchain4j-supercharging-java-llms-ibrahim-jimoh-kyj4f#:~:text=LangChain4j"));
+        assertDoesNotThrow(() -> URI.create(
+                "https://www.linkedin.com/pulse/introduction-langchain4j-supercharging-java-llms-ibrahim-jimoh-kyj4f#:~:text=LangChain4j is a groundbreaking Java,by Python and JavaScript libraries."
+                        .replaceAll(" ", "%20")));
+        assertDoesNotThrow(() -> URI.create(
+                "https://www.linkedin.com/pulse/introduction-langchain4j-supercharging-java-llms-ibrahim-jimoh-kyj4f#:~:text=LangChain4j is a groundbreaking Java,by Python and JavaScript libraries."
+                        .replaceAll("\\s+", "%20")));
+        assertDoesNotThrow(() -> URI.create(
+                "https://www.linkedin.com/pulse/introduction-langchain4j-supercharging-java-llms-ibrahim-jimoh-kyj4f"));
+    }
+
+    @Test
+    void test_make_uri() {
+        assertThrows(IllegalArgumentException.class, () -> SearXNGWebSearchEngine.makeURI(null));
+        assertThrows(IllegalArgumentException.class, () -> SearXNGWebSearchEngine.makeURI(""));
+        assertThrows(IllegalArgumentException.class, () -> SearXNGWebSearchEngine.makeURI(" \t"));
+        // Surrounding whitespace is stripped; inner whitespace is percent-encoded.
+        assertEquals(
+                "https://example.com/a%20b",
+                SearXNGWebSearchEngine.makeURI("  https://example.com/a b\t").toString());
+        testURI(
+                "https://www.linkedin.com/pulse/introduction-langchain4j-supercharging-java-llms-ibrahim-jimoh-kyj4f#:~:text=LangChain4j");
+        testURI(
+                "https://www.linkedin.com/pulse/introduction-langchain4j-supercharging-java-llms-ibrahim-jimoh-kyj4f#:~:text=LangChain4j is a groundbreaking Java,by Python and JavaScript libraries.");
+        testURI("https://www.linkedin.com/pulse/introduction-langchain4j-supercharging-java-llms-ibrahim-jimoh-kyj4f");
+    }
+}