From 378c41ffedf237f6049602e415cc4e44215ffb58 Mon Sep 17 00:00:00 2001 From: Santhosh Gandhe <1909520+san81@users.noreply.github.com> Date: Mon, 3 Feb 2025 12:42:35 -0800 Subject: [PATCH] Saving page content as text Signed-off-by: Santhosh Gandhe <1909520+san81@users.noreply.github.com> --- .../confluence-source/build.gradle | 1 + .../source/confluence/ConfluenceClient.java | 7 +- .../source/confluence/ConfluenceService.java | 15 +- .../utils/HtmlToTextConversionUtil.java | 53 ++++ .../utils/HtmlToTextConversionUtilTest.java | 276 ++++++++++++++++++ 5 files changed, 342 insertions(+), 10 deletions(-) create mode 100644 data-prepper-plugins/saas-source-plugins/confluence-source/src/main/java/org/opensearch/dataprepper/plugins/source/confluence/utils/HtmlToTextConversionUtil.java create mode 100644 data-prepper-plugins/saas-source-plugins/confluence-source/src/test/java/org/opensearch/dataprepper/plugins/source/confluence/utils/HtmlToTextConversionUtilTest.java diff --git a/data-prepper-plugins/saas-source-plugins/confluence-source/build.gradle b/data-prepper-plugins/saas-source-plugins/confluence-source/build.gradle index 1cf3854ab4..b59ab8d251 100644 --- a/data-prepper-plugins/saas-source-plugins/confluence-source/build.gradle +++ b/data-prepper-plugins/saas-source-plugins/confluence-source/build.gradle @@ -15,6 +15,7 @@ dependencies { implementation 'com.fasterxml.jackson.core:jackson-core' implementation 'com.fasterxml.jackson.core:jackson-databind' implementation 'javax.inject:javax.inject:1' + implementation 'org.jsoup:jsoup:1.18.3' implementation("org.springframework:spring-web:${libs.versions.spring.get()}") implementation 'org.projectlombok:lombok:1.18.30' diff --git a/data-prepper-plugins/saas-source-plugins/confluence-source/src/main/java/org/opensearch/dataprepper/plugins/source/confluence/ConfluenceClient.java 
b/data-prepper-plugins/saas-source-plugins/confluence-source/src/main/java/org/opensearch/dataprepper/plugins/source/confluence/ConfluenceClient.java index f4dc7cfe4f..b5ee7a1446 100644 --- a/data-prepper-plugins/saas-source-plugins/confluence-source/src/main/java/org/opensearch/dataprepper/plugins/source/confluence/ConfluenceClient.java +++ b/data-prepper-plugins/saas-source-plugins/confluence-source/src/main/java/org/opensearch/dataprepper/plugins/source/confluence/ConfluenceClient.java @@ -13,6 +13,7 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.annotations.VisibleForTesting; import org.opensearch.dataprepper.model.acknowledgements.AcknowledgementSet; import org.opensearch.dataprepper.model.buffer.Buffer; @@ -20,6 +21,7 @@ import org.opensearch.dataprepper.model.event.EventType; import org.opensearch.dataprepper.model.event.JacksonEvent; import org.opensearch.dataprepper.model.record.Record; +import org.opensearch.dataprepper.plugins.source.confluence.utils.HtmlToTextConversionUtil; import org.opensearch.dataprepper.plugins.source.source_crawler.base.CrawlerClient; import org.opensearch.dataprepper.plugins.source.source_crawler.base.CrawlerSourceConfig; import org.opensearch.dataprepper.plugins.source.source_crawler.base.PluginExecutorServiceProvider; @@ -117,8 +119,9 @@ public void executePartition(SaasWorkerProgressState state, .map(CompletableFuture::join) .map(contentJson -> { try { - return objectMapper.readValue(contentJson, new TypeReference<>() { + ObjectNode contentJsonObj = objectMapper.readValue(contentJson, new TypeReference<>() { }); + return HtmlToTextConversionUtil.convertHtmlToText(contentJsonObj, "body/view/value"); } catch (JsonProcessingException e) { throw new RuntimeException(e); } @@ -127,7 +130,7 @@ public void 
executePartition(SaasWorkerProgressState state, .withEventType(eventType) .withData(t) .build()) - .map(event -> new Record<>(event)) + .map(Record::new) .collect(Collectors.toList()); try { diff --git a/data-prepper-plugins/saas-source-plugins/confluence-source/src/main/java/org/opensearch/dataprepper/plugins/source/confluence/ConfluenceService.java b/data-prepper-plugins/saas-source-plugins/confluence-source/src/main/java/org/opensearch/dataprepper/plugins/source/confluence/ConfluenceService.java index 457c98c40c..b71342089f 100644 --- a/data-prepper-plugins/saas-source-plugins/confluence-source/src/main/java/org/opensearch/dataprepper/plugins/source/confluence/ConfluenceService.java +++ b/data-prepper-plugins/saas-source-plugins/confluence-source/src/main/java/org/opensearch/dataprepper/plugins/source/confluence/ConfluenceService.java @@ -117,9 +117,8 @@ private void searchForNewContentAndAddToQueue(ConfluenceSourceConfig configurati * @param itemInfoQueue Item info queue. */ private void addItemsToQueue(List issueList, Queue itemInfoQueue) { - issueList.forEach(issue -> { - itemInfoQueue.add(ConfluenceItemInfo.builder().withEventTime(Instant.now()).withIssueBean(issue).build()); - }); + issueList.forEach(issue -> itemInfoQueue.add(ConfluenceItemInfo.builder() + .withEventTime(Instant.now()).withIssueBean(issue).build())); } @@ -173,7 +172,7 @@ private void validateSpaceFilters(ConfluenceSourceConfig configuration) { log.trace("Validating project filters"); List badFilters = new ArrayList<>(); Set includedProjects = new HashSet<>(); - List includedAndExcludedProjects = new ArrayList<>(); + List includedAndExcludedSpaces = new ArrayList<>(); Pattern regex = Pattern.compile("[^A-Z0-9]"); ConfluenceConfigHelper.getSpacesNameIncludeFilter(configuration).forEach(projectFilter -> { Matcher matcher = regex.matcher(projectFilter); @@ -185,7 +184,7 @@ private void validateSpaceFilters(ConfluenceSourceConfig configuration) { 
ConfluenceConfigHelper.getSpacesNameExcludeFilter(configuration).forEach(projectFilter -> { Matcher matcher = regex.matcher(projectFilter); if (includedProjects.contains(projectFilter)) { - includedAndExcludedProjects.add(projectFilter); + includedAndExcludedSpaces.add(projectFilter); } if (matcher.find() || projectFilter.length() <= 1 || projectFilter.length() > 10) { badFilters.add(projectFilter); @@ -198,9 +197,9 @@ private void validateSpaceFilters(ConfluenceSourceConfig configuration) { "Invalid Space key found in filter configuration for " + filters); } - if (!includedAndExcludedProjects.isEmpty()) { - String filters = String.join("\"" + includedAndExcludedProjects + "\"", ", "); - log.error("One or more Space keys found in both include and exclude: {}", includedAndExcludedProjects); + if (!includedAndExcludedSpaces.isEmpty()) { + String filters = String.join("\"" + includedAndExcludedSpaces + "\"", ", "); + log.error("One or more Space keys found in both include and exclude: {}", includedAndExcludedSpaces); throw new BadRequestException("Bad request exception occurred " + "Space filters is invalid because the following space are listed in both include and exclude" + filters);
diff --git a/data-prepper-plugins/saas-source-plugins/confluence-source/src/main/java/org/opensearch/dataprepper/plugins/source/confluence/utils/HtmlToTextConversionUtil.java b/data-prepper-plugins/saas-source-plugins/confluence-source/src/main/java/org/opensearch/dataprepper/plugins/source/confluence/utils/HtmlToTextConversionUtil.java
new file mode 100644
index 0000000000..3d34dbf8e3
--- /dev/null
+++ b/data-prepper-plugins/saas-source-plugins/confluence-source/src/main/java/org/opensearch/dataprepper/plugins/source/confluence/utils/HtmlToTextConversionUtil.java
@@ -0,0 +1,116 @@
+package org.opensearch.dataprepper.plugins.source.confluence.utils;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import java.util.Objects;
+import java.util.Optional;
+
+/**
+ * Replaces HTML stored inside a JSON document with its plain-text rendering.
+ *
+ * <p>Paths are {@code /}-separated field names, for example {@code body/view/value}.
+ * Every segment is looked up as an object field; array indices are not supported.
+ */
+public class HtmlToTextConversionUtil {
+
+    private static final String PATH_SEPARATOR = "/";
+
+    /**
+     * Converts the HTML string found at {@code path} to plain text, in place.
+     *
+     * <p>The document is left untouched when the path does not resolve, or when the value
+     * found there is not a JSON string, so numbers, booleans, nulls and containers are
+     * never overwritten with an empty string.
+     *
+     * @param jsonObject document to update; modified in place
+     * @param path       {@code /}-separated path of the field holding the HTML
+     * @return the same {@code jsonObject} instance that was passed in
+     */
+    public static JsonNode convertHtmlToText(ObjectNode jsonObject, String path) {
+        getValueAtGivenPath(jsonObject, path)
+                .filter(JsonNode::isTextual)
+                .map(node -> convertHtmlToText(node.textValue()))
+                .ifPresent(text -> setValueAtGivenPath(jsonObject, path, text));
+        return jsonObject;
+    }
+
+    /**
+     * Writes {@code value} at {@code path}, replacing whatever the final field holds.
+     *
+     * <p>Only the final field is created when absent; every intermediate segment must
+     * already exist and be a JSON object.
+     *
+     * @param jsonObject document to update; modified in place
+     * @param path       {@code /}-separated path of the field to write
+     * @param value      text to store
+     * @throws NullPointerException     if {@code jsonObject} or {@code path} is null
+     * @throws IllegalArgumentException if the path has no segments, or an intermediate
+     *                                  segment is missing or is not a JSON object
+     */
+    public static void setValueAtGivenPath(ObjectNode jsonObject, String path, String value) {
+        Objects.requireNonNull(jsonObject, "jsonObject must not be null");
+        Objects.requireNonNull(path, "path must not be null");
+        String[] keys = path.split(PATH_SEPARATOR);
+        if (keys.length == 0) {
+            throw new IllegalArgumentException("Path has no segments: '" + path + "'");
+        }
+        ObjectNode parent = jsonObject;
+        for (int i = 0; i < keys.length - 1; i++) {
+            JsonNode child = parent.get(keys[i]);
+            if (!(child instanceof ObjectNode)) {
+                throw new IllegalArgumentException(
+                        "Segment '" + keys[i] + "' of path '" + path + "' is not a JSON object");
+            }
+            parent = (ObjectNode) child;
+        }
+        parent.put(keys[keys.length - 1], value);
+    }
+
+    /**
+     * Looks up the node at {@code path} without throwing.
+     *
+     * @param jsonObject document to read
+     * @param path       {@code /}-separated path of the field to read
+     * @return the node at {@code path}, which may be of any JSON type; empty when either
+     *         argument is null, the path has no segments, a segment is missing, or a
+     *         segment is reached through something that is not a JSON object
+     */
+    public static Optional<JsonNode> getValueAtGivenPath(ObjectNode jsonObject, String path) {
+        if (jsonObject == null || path == null) {
+            return Optional.empty();
+        }
+        String[] keys = path.split(PATH_SEPARATOR);
+        if (keys.length == 0) {
+            return Optional.empty();
+        }
+        JsonNode current = jsonObject;
+        for (String key : keys) {
+            // Covers both a missing segment (null) and a scalar, null or array node.
+            if (!(current instanceof ObjectNode)) {
+                return Optional.empty();
+            }
+            current = current.get(key);
+        }
+        return Optional.ofNullable(current);
+    }
+
+    /**
+     * Renders an HTML fragment or document as plain text.
+     *
+     * @param html markup to convert; may be null
+     * @return the text content with {@code script} and {@code style} elements dropped,
+     *         or an empty string when {@code html} is null or empty
+     */
+    public static String convertHtmlToText(String html) {
+        if (html == null || html.isEmpty()) {
+            return "";
+        }
+        Document document = Jsoup.parse(html);
+        // Remove scripts and style elements
+        document.select("script, style").remove();
+        return document.text();
+    }
+}
diff --git a/data-prepper-plugins/saas-source-plugins/confluence-source/src/test/java/org/opensearch/dataprepper/plugins/source/confluence/utils/HtmlToTextConversionUtilTest.java b/data-prepper-plugins/saas-source-plugins/confluence-source/src/test/java/org/opensearch/dataprepper/plugins/source/confluence/utils/HtmlToTextConversionUtilTest.java
new file mode 100644
index 0000000000..23d6421aea
--- /dev/null
+++ 
b/data-prepper-plugins/saas-source-plugins/confluence-source/src/test/java/org/opensearch/dataprepper/plugins/source/confluence/utils/HtmlToTextConversionUtilTest.java
@@ -0,0 +1,276 @@
+package org.opensearch.dataprepper.plugins.source.confluence.utils;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import java.util.Optional;
+import java.util.stream.Stream;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+class HtmlToTextConversionUtilTest {
+
+    private ObjectMapper objectMapper;
+    private ObjectNode jsonObject;
+
+    private static Stream<Arguments> provideDifferentTypeValues() {
+        return Stream.of(
+                Arguments.of(42, "number"),
+                Arguments.of(true, "boolean"),
+                Arguments.of(3.14, "number"),
+                Arguments.of("string value", "string")
+        );
+    }
+
+    @BeforeEach
+    void setUp() {
+        objectMapper = new ObjectMapper();
+        jsonObject = objectMapper.createObjectNode();
+    }
+
+    @Test
+    void convertHtmlToText_WithValidHtmlAndPath_ShouldConvertSuccessfully() {
+        // Arrange
+        jsonObject.put("content", "<html><body><p>Hello World</p></body></html>");
+        String path = "content";
+
+        // Act
+        JsonNode result = HtmlToTextConversionUtil.convertHtmlToText(jsonObject, path);
+
+        // Assert
+        assertTrue(result.has("content"));
+        assertEquals("Hello World", result.get("content").textValue());
+    }
+
+    @Test
+    void convertHtmlToText_WithNestedPath_ShouldConvertSuccessfully() {
+        // Arrange
+        ObjectNode nestedNode = objectMapper.createObjectNode();
+        nestedNode.put("body", "<html><body><p>Hello World</p></body></html>");
+        jsonObject.set("content", nestedNode);
+        String path = "content/body";
+
+        // Act
+        JsonNode result = HtmlToTextConversionUtil.convertHtmlToText(jsonObject, path);
+
+        // Assert
+        assertTrue(result.has("content"));
+        assertTrue(result.get("content").has("body"));
+        assertEquals("Hello World", result.get("content").get("body").textValue());
+    }
+
+    @Test
+    void setValueAtGivenPath_WithSimplePath_ShouldSetValue() {
+        // Arrange
+        String path = "title";
+        String value = "Test Title";
+
+        // Act
+        HtmlToTextConversionUtil.setValueAtGivenPath(jsonObject, path, value);
+
+        // Assert
+        assertEquals(value, jsonObject.get("title").asText());
+    }
+
+    @Test
+    void setValueAtGivenPath_WithNestedPath_ShouldSetValue() {
+        // Arrange
+        jsonObject.putObject("content").putObject("body");
+        String path = "content/body/text";
+        String value = "Test Content";
+
+        // Act
+        HtmlToTextConversionUtil.setValueAtGivenPath(jsonObject, path, value);
+
+        // Assert
+        assertEquals(value, jsonObject.get("content").get("body").get("text").asText());
+    }
+
+    @Test
+    void convertHtmlToText_WithNullHtml_ShouldReturnEmptyString() {
+        // Act
+        String result = HtmlToTextConversionUtil.convertHtmlToText(null);
+
+        // Assert
+        assertEquals("", result);
+    }
+
+    @Test
+    void convertHtmlToText_WithEmptyHtml_ShouldReturnEmptyString() {
+        // Act
+        String result = HtmlToTextConversionUtil.convertHtmlToText("");
+
+        // Assert
+        assertEquals("", result);
+    }
+
+    @Test
+    void convertHtmlToText_WithComplexHtml_ShouldConvertToPlainText() {
+        // Arrange
+        String html = "<html><body><h1>Title</h1><p>This is a test paragraph</p></body></html>";
+
+        // Act
+        String result = HtmlToTextConversionUtil.convertHtmlToText(html);
+
+        // Assert
+        assertEquals("Title This is a test paragraph", result.trim());
+    }
+
+    @Test
+    void convertHtmlToText_WithInvalidPath_ShouldReturnOriginalJson() {
+        // Arrange
+        jsonObject.put("content", "<html><body><p>Hello World</p></body></html>");
+        String invalidPath = "invalid.path";
+
+        // Act
+        JsonNode result = HtmlToTextConversionUtil.convertHtmlToText(jsonObject, invalidPath);
+
+        // Assert
+        assertEquals(jsonObject, result);
+    }
+
+    @Test
+    void getValueAtGivenPath_WithSimplePath_ShouldReturnValue() {
+        // Arrange
+        String expectedValue = "test value";
+        jsonObject.put("key", expectedValue);
+
+        // Act
+        Optional<JsonNode> result = HtmlToTextConversionUtil.getValueAtGivenPath(jsonObject, "key");
+
+        // Assert
+        assertTrue(result.isPresent());
+        assertEquals(expectedValue, result.get().asText());
+    }
+
+    @Test
+    void getValueAtGivenPath_WithNestedPath_ShouldReturnValue() {
+        // Arrange
+        String expectedValue = "nested value";
+        ObjectNode nestedNode = jsonObject.putObject("parent");
+        nestedNode.put("child", expectedValue);
+
+        // Act
+        Optional<JsonNode> result = HtmlToTextConversionUtil.getValueAtGivenPath(jsonObject, "parent/child");
+
+        // Assert
+        assertTrue(result.isPresent());
+        assertEquals(expectedValue, result.get().asText());
+    }
+
+    @Test
+    void getValueAtGivenPath_WithDeeplyNestedPath_ShouldReturnValue() {
+        // Arrange
+        String expectedValue = "deeply nested value";
+        ObjectNode level1 = jsonObject.putObject("level1");
+        ObjectNode level2 = level1.putObject("level2");
+        level2.put("level3", expectedValue);
+
+        // Act
+        Optional<JsonNode> result = HtmlToTextConversionUtil.getValueAtGivenPath(
+                jsonObject, "level1/level2/level3");
+
+        // Assert
+        assertTrue(result.isPresent());
+        assertEquals(expectedValue, result.get().asText());
+    }
+
+    @Test
+    void getValueAtGivenPath_WithNonExistentPath_ShouldReturnEmpty() {
+        // Act
+        Optional<JsonNode> result = HtmlToTextConversionUtil.getValueAtGivenPath(
+                jsonObject, "nonexistent/path");
+
+        // Assert
+        assertFalse(result.isPresent());
+    }
+
+    @Test
+    void getValueAtGivenPath_WithNullPath_ShouldReturnEmpty() {
+        // Act
+        Optional<JsonNode> result = HtmlToTextConversionUtil.getValueAtGivenPath(jsonObject, null);
+
+        // Assert
+        assertFalse(result.isPresent());
+    }
+
+    @Test
+    void getValueAtGivenPath_WithEmptyPath_ShouldReturnEmpty() {
+        // Act
+        Optional<JsonNode> result = HtmlToTextConversionUtil.getValueAtGivenPath(jsonObject, "");
+
+        // Assert
+        assertFalse(result.isPresent());
+    }
+
+    @ParameterizedTest
+    @MethodSource("provideDifferentTypeValues")
+    void getValueAtGivenPath_WithDifferentTypes_ShouldReturnCorrectValue(Object value, String expectedType) {
+        // Arrange
+        if (value instanceof Integer) {
+            jsonObject.put("key", (Integer) value);
+        } else if (value instanceof Boolean) {
+            jsonObject.put("key", (Boolean) value);
+        } else if (value instanceof Double) {
+            jsonObject.put("key", (Double) value);
+        } else {
+            jsonObject.put("key", String.valueOf(value));
+        }
+
+        // Act
+        Optional<JsonNode> result = HtmlToTextConversionUtil.getValueAtGivenPath(jsonObject, "key");
+
+        // Assert
+        assertTrue(result.isPresent());
+        assertEquals(value.toString(), result.get().asText());
+    }
+
+    @Test
+    void getValueAtGivenPath_WithInvalidIntermediatePath_ShouldReturnEmpty() {
+        // Arrange
+        jsonObject.put("key", "value");
+
+        // Act
+        Optional<JsonNode> result = HtmlToTextConversionUtil.getValueAtGivenPath(
+                jsonObject, "nonexistent/key/child");
+
+        // Assert
+        assertFalse(result.isPresent());
+    }
+
+    @Test
+    void getValueAtGivenPath_WithNullIntermediateNode_ShouldReturnEmpty() {
+        // Arrange
+        ObjectNode parentNode = jsonObject.putObject("parent");
+        parentNode.putNull("child");
+
+        // Act
+        Optional<JsonNode> result = HtmlToTextConversionUtil.getValueAtGivenPath(
+                jsonObject, "parent/child/grandchild");
+
+        // Assert
+        assertFalse(result.isPresent());
+    }
+
+    @Test
+    void getValueAtGivenPath_WithArrayNode_ShouldReturnEmpty() {
+        // Arrange
+        jsonObject.putArray("array").add("value");
+
+        // Act
+        Optional<JsonNode> result = HtmlToTextConversionUtil.getValueAtGivenPath(
+                jsonObject, "array/0");
+
+        // Assert
+        assertFalse(result.isPresent());
+    }
+}
+