Skip to content

Commit

Permalink
Saving page content as text
Browse files Browse the repository at this point in the history
Signed-off-by: Santhosh Gandhe <[email protected]>
  • Loading branch information
san81 committed Feb 3, 2025
1 parent 6f40d65 commit 378c41f
Show file tree
Hide file tree
Showing 5 changed files with 342 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies {
implementation 'com.fasterxml.jackson.core:jackson-core'
implementation 'com.fasterxml.jackson.core:jackson-databind'
implementation 'javax.inject:javax.inject:1'
implementation 'org.jsoup:jsoup:1.18.3'
implementation("org.springframework:spring-web:${libs.versions.spring.get()}")

implementation 'org.projectlombok:lombok:1.18.30'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.annotations.VisibleForTesting;
import org.opensearch.dataprepper.model.acknowledgements.AcknowledgementSet;
import org.opensearch.dataprepper.model.buffer.Buffer;
import org.opensearch.dataprepper.model.event.Event;
import org.opensearch.dataprepper.model.event.EventType;
import org.opensearch.dataprepper.model.event.JacksonEvent;
import org.opensearch.dataprepper.model.record.Record;
import org.opensearch.dataprepper.plugins.source.confluence.utils.HtmlToTextConversionUtil;
import org.opensearch.dataprepper.plugins.source.source_crawler.base.CrawlerClient;
import org.opensearch.dataprepper.plugins.source.source_crawler.base.CrawlerSourceConfig;
import org.opensearch.dataprepper.plugins.source.source_crawler.base.PluginExecutorServiceProvider;
Expand Down Expand Up @@ -117,8 +119,9 @@ public void executePartition(SaasWorkerProgressState state,
.map(CompletableFuture::join)
.map(contentJson -> {
try {
return objectMapper.readValue(contentJson, new TypeReference<>() {
ObjectNode contentJsonObj = objectMapper.readValue(contentJson, new TypeReference<>() {
});
return HtmlToTextConversionUtil.convertHtmlToText(contentJsonObj, "body/view/value");
} catch (JsonProcessingException e) {
throw new RuntimeException(e);
}
Expand All @@ -127,7 +130,7 @@ public void executePartition(SaasWorkerProgressState state,
.withEventType(eventType)
.withData(t)
.build())
.map(event -> new Record<>(event))
.map(Record::new)
.collect(Collectors.toList());

try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,8 @@ private void searchForNewContentAndAddToQueue(ConfluenceSourceConfig configurati
* @param itemInfoQueue Item info queue.
*/
private void addItemsToQueue(List<ConfluenceItem> issueList, Queue<ItemInfo> itemInfoQueue) {
issueList.forEach(issue -> {
itemInfoQueue.add(ConfluenceItemInfo.builder().withEventTime(Instant.now()).withIssueBean(issue).build());
});
issueList.forEach(issue -> itemInfoQueue.add(ConfluenceItemInfo.builder()
.withEventTime(Instant.now()).withIssueBean(issue).build()));
}


Expand Down Expand Up @@ -173,7 +172,7 @@ private void validateSpaceFilters(ConfluenceSourceConfig configuration) {
log.trace("Validating project filters");
List<String> badFilters = new ArrayList<>();
Set<String> includedProjects = new HashSet<>();
List<String> includedAndExcludedProjects = new ArrayList<>();
List<String> includedAndExcludedSpaces = new ArrayList<>();
Pattern regex = Pattern.compile("[^A-Z0-9]");
ConfluenceConfigHelper.getSpacesNameIncludeFilter(configuration).forEach(projectFilter -> {
Matcher matcher = regex.matcher(projectFilter);
Expand All @@ -185,7 +184,7 @@ private void validateSpaceFilters(ConfluenceSourceConfig configuration) {
ConfluenceConfigHelper.getSpacesNameExcludeFilter(configuration).forEach(projectFilter -> {
Matcher matcher = regex.matcher(projectFilter);
if (includedProjects.contains(projectFilter)) {
includedAndExcludedProjects.add(projectFilter);
includedAndExcludedSpaces.add(projectFilter);
}
if (matcher.find() || projectFilter.length() <= 1 || projectFilter.length() > 10) {
badFilters.add(projectFilter);
Expand All @@ -198,9 +197,9 @@ private void validateSpaceFilters(ConfluenceSourceConfig configuration) {
"Invalid Space key found in filter configuration for "
+ filters);
}
if (!includedAndExcludedProjects.isEmpty()) {
String filters = String.join("\"" + includedAndExcludedProjects + "\"", ", ");
log.error("One or more Space keys found in both include and exclude: {}", includedAndExcludedProjects);
if (!includedAndExcludedSpaces.isEmpty()) {
String filters = String.join("\"" + includedAndExcludedSpaces + "\"", ", ");
log.error("One or more Space keys found in both include and exclude: {}", includedAndExcludedSpaces);
throw new BadRequestException("Bad request exception occurred " +
"Space filters is invalid because the following space are listed in both include and exclude"
+ filters);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package org.opensearch.dataprepper.plugins.source.confluence.utils;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.Optional;

/**
 * Helpers for replacing an HTML string stored inside a Jackson JSON tree with its
 * plain-text rendering.
 *
 * <p>Paths are slash-separated field names, for example {@code "body/view/value"}.
 * Every segment except the last must resolve to a JSON object; the last segment names
 * the field that is read or written on that object.
 */
public class HtmlToTextConversionUtil {

    private static final String PATH_SEPARATOR = "/";

    /**
     * Converts the HTML string found at {@code path} to plain text, in place.
     *
     * <p>The tree is left untouched when the path cannot be resolved or when the node at
     * the path is not a JSON string. Non-textual nodes (numbers, booleans, objects,
     * arrays, JSON null) are deliberately skipped: {@code textValue()} returns
     * {@code null} for them, and converting that would overwrite real data with an
     * empty string.
     *
     * @param jsonObject root of the JSON tree; mutated in place
     * @param path       slash-separated path to the HTML field
     * @return the same {@code jsonObject} instance that was passed in
     */
    public static JsonNode convertHtmlToText(ObjectNode jsonObject, String path) {
        getValueAtGivenPath(jsonObject, path)
                .filter(JsonNode::isTextual)
                .map(node -> convertHtmlToText(node.textValue()))
                .ifPresent(text -> setValueAtGivenPath(jsonObject, path, text));
        return jsonObject;
    }

    /**
     * Writes {@code value} as a string field at {@code path}, replacing any existing value.
     *
     * <p>Intermediate objects are not created; the parent of the target field must
     * already exist and be a JSON object.
     *
     * @param jsonObject root of the JSON tree; mutated in place
     * @param path       slash-separated path to the field to write
     * @param value      text to store at the path
     * @throws IllegalArgumentException if the root or path is {@code null}, the path has
     *                                  no segments, or an intermediate segment is missing
     *                                  or is not a JSON object
     */
    public static void setValueAtGivenPath(ObjectNode jsonObject, String path, String value) {
        String[] keys = splitPath(path);
        ObjectNode parent = resolveParent(jsonObject, keys)
                .orElseThrow(() -> new IllegalArgumentException(
                        "Cannot set value: path '" + path + "' does not resolve to a field of a JSON object"));
        parent.put(keys[keys.length - 1], value);
    }

    /**
     * Looks up the node stored at {@code path}.
     *
     * @param jsonObject root of the JSON tree
     * @param path       slash-separated path to the field to read
     * @return the node at the path (possibly a JSON null node), or an empty
     *         {@code Optional} when the root or path is {@code null}, the path has no
     *         segments, an intermediate segment is missing or is not a JSON object, or
     *         the final field does not exist
     */
    public static Optional<JsonNode> getValueAtGivenPath(ObjectNode jsonObject, String path) {
        String[] keys = splitPath(path);
        // Optional.map yields an empty Optional when the final field is absent (get returns null).
        return resolveParent(jsonObject, keys)
                .map(parent -> parent.get(keys[keys.length - 1]));
    }

    /**
     * Renders an HTML fragment as plain text.
     *
     * @param html HTML markup; may be {@code null}
     * @return the text content with {@code <script>} and {@code <style>} elements removed,
     *         or an empty string when {@code html} is {@code null} or empty
     */
    public static String convertHtmlToText(String html) {
        if (html == null || html.isEmpty()) {
            return "";
        }
        Document document = Jsoup.parse(html);
        // Remove scripts and style elements
        document.select("script, style").remove();
        return document.text();
    }

    /** Splits a slash-separated path into its segments; a {@code null} path yields no segments. */
    private static String[] splitPath(String path) {
        if (path == null) {
            return new String[0];
        }
        return path.split(PATH_SEPARATOR);
    }

    /**
     * Walks every segment except the last and returns the object that should hold the
     * final field, or an empty {@code Optional} when the walk cannot be completed.
     */
    private static Optional<ObjectNode> resolveParent(ObjectNode root, String[] keys) {
        if (root == null || keys.length == 0) {
            return Optional.empty();
        }
        ObjectNode current = root;
        for (int i = 0; i < keys.length - 1; i++) {
            JsonNode child = current.get(keys[i]);
            // A missing key (null) or a non-object node means the path cannot be followed further.
            if (!(child instanceof ObjectNode)) {
                return Optional.empty();
            }
            current = (ObjectNode) child;
        }
        return Optional.of(current);
    }
}
Loading

0 comments on commit 378c41f

Please sign in to comment.