Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SaaS Crawler Module #5095

Merged
merged 24 commits into from
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
bc24971
Introducing SaaS sources gradle module and SaaS crawler as a common m…
san81 Oct 21, 2024
e257f31
test classes
san81 Oct 21, 2024
b238ae9
Plain empty Jira Source plugin
san81 Oct 21, 2024
696c1a5
Parition Factory Tests
san81 Oct 21, 2024
439ef92
additional tests
san81 Oct 21, 2024
1a76aac
additional tests
san81 Oct 21, 2024
44828f8
full test coverage for base folder, spotless fixes
Galactus22625 Oct 21, 2024
0881b43
additional tests
san81 Oct 21, 2024
26137ea
Merge pull request #6 from Galactus22625/Saas-Crawler-Base-UnitTests
san81 Oct 21, 2024
f535b32
additional test coverage
san81 Oct 22, 2024
7096f1c
Merge branch 'opensearch-project:main' into saas-sources-module
san81 Oct 22, 2024
b90b929
merging main
san81 Oct 22, 2024
eccabb8
Merge remote-tracking branch 'origin/saas-sources-module' into saas-s…
san81 Oct 22, 2024
86338bf
addressing review comments and also package name refactoring based on…
san81 Oct 22, 2024
65bc47b
more review comments
san81 Oct 22, 2024
31ab7b9
adjusted the log level and removed unwanted log messages
san81 Oct 22, 2024
8af4240
small clean ups
san81 Oct 22, 2024
9a6fda7
test case assertion fix
san81 Oct 23, 2024
cb3d94f
better coverage
san81 Oct 23, 2024
c5af55a
step down the log level based on the review comments
san81 Oct 23, 2024
647ba10
Merge branch 'opensearch-project:main' into saas-sources-module
san81 Oct 23, 2024
5a4e966
taking the coverage to 100%
san81 Oct 23, 2024
cf35364
addressing review comments
san81 Oct 24, 2024
39d98e7
module name renamed to source-crawler
san81 Oct 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,6 @@ data-prepper-main/src/test/resources/logstash-conf/logstash-filter.yaml

# Python virtual environments
.venv

# output folder created when we run test cases
**/out/
2 changes: 1 addition & 1 deletion data-prepper-plugins/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ subprojects {
}

dependencies {
    // Use 'each' (not 'findAll') — findAll is a filtering operation that expects a
    // boolean predicate and builds a result list; here we only want to iterate the
    // subprojects and declare each one as an api dependency.
    subprojects.each { api project(it.path) }
}
13 changes: 13 additions & 0 deletions data-prepper-plugins/saas-source-plugins/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
plugins {
    id 'java-library'
}


subprojects {
    apply plugin: 'data-prepper.publish'
    group = 'org.opensearch.dataprepper.plugins.source'
}

dependencies {
    // The original path string ':data-prepper-plugins::saas-source-plugins:' contained a
    // doubled colon, producing an invalid project path. Using it.path avoids hand-building
    // the path entirely and matches the parent build script's convention.
    subprojects.each { api project(it.path) }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

# Metrics

### Counter
- `issuesRequested`: measures the total number of issue requests sent.

### Timer
- `requestProcessDuration`: measures latency of requests processed by the jira source plugin.
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
plugins {
    id 'java'
}

dependencies {

    implementation project(path: ':data-prepper-plugins:saas-source-plugins:saas-crawler')
    implementation project(path: ':data-prepper-api')
    implementation project(path: ':data-prepper-plugins:aws-plugin-api')
    implementation project(path: ':data-prepper-plugins:buffer-common')
    implementation project(path: ':data-prepper-plugins:common')

    implementation libs.commons.io
    implementation 'io.micrometer:micrometer-core'
    implementation 'com.fasterxml.jackson.core:jackson-core'
    implementation 'com.fasterxml.jackson.core:jackson-databind'
    implementation 'javax.inject:javax.inject:1'
    implementation 'org.springframework:spring-web:5.3.39'
    implementation 'org.springframework.retry:spring-retry:1.3.4'

    // Lombok is an annotation processor, needed only at compile time; compileOnly
    // keeps it off the runtime classpath instead of shipping it as a dependency.
    compileOnly 'org.projectlombok:lombok:1.18.30'
    annotationProcessor 'org.projectlombok:lombok:1.18.30'

    implementation(libs.spring.context) {
        exclude group: 'commons-logging', module: 'commons-logging'
    }
}

test {
    useJUnitPlatform()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package org.opensearch.dataprepper.plugins.source.saas.jira;

import org.opensearch.dataprepper.model.buffer.Buffer;
import org.opensearch.dataprepper.model.event.Event;
import org.opensearch.dataprepper.model.record.Record;
import org.opensearch.dataprepper.plugins.source.saas_crawler.base.SaasClient;
import org.opensearch.dataprepper.plugins.source.saas_crawler.base.SaasSourceConfig;
import org.opensearch.dataprepper.plugins.source.saas_crawler.coordination.state.SaasWorkerProgressState;
import org.opensearch.dataprepper.plugins.source.saas_crawler.model.ItemInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.inject.Named;
import java.util.Collections;
import java.util.Iterator;

/**
 * This class represents a Jira client. Currently a skeleton implementation:
 * listing and partition execution are not yet implemented.
 */
@Named
public class JiraClient implements SaasClient {

    private static final Logger log = LoggerFactory.getLogger(JiraClient.class);

    // Epoch millis of the last poll; recorded for use by the (future) listing logic.
    private long lastPollTime;

    public JiraClient() {
    }


    /**
     * Lists items changed since the last poll time.
     * Not yet implemented — returns an empty iterator rather than null so that
     * callers (e.g. Crawler.crawl) can iterate safely without a null check.
     */
    @Override
    public Iterator<ItemInfo> listItems() {
        return Collections.emptyIterator();
    }

    @Override
    public void setLastPollTime(long lastPollTime) {
        log.info("Setting the lastPollTime: {}", lastPollTime);
        this.lastPollTime = lastPollTime;
    }

    @Override
    public void executePartition(SaasWorkerProgressState state, Buffer<Record<Event>> buffer, SaasSourceConfig configuration) {
        // Placeholder: partition-processing logic is not implemented yet.
        log.info("Logic for executing the partitions");
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package org.opensearch.dataprepper.plugins.source.saas.jira;


import org.opensearch.dataprepper.metrics.PluginMetrics;
import org.opensearch.dataprepper.model.acknowledgements.AcknowledgementSetManager;
import org.opensearch.dataprepper.model.annotations.DataPrepperPlugin;
import org.opensearch.dataprepper.model.annotations.DataPrepperPluginConstructor;
import org.opensearch.dataprepper.model.buffer.Buffer;
import org.opensearch.dataprepper.model.codec.ByteDecoder;
import org.opensearch.dataprepper.model.event.Event;
import org.opensearch.dataprepper.model.plugin.PluginFactory;
import org.opensearch.dataprepper.model.record.Record;
import org.opensearch.dataprepper.model.source.Source;
import org.opensearch.dataprepper.plugins.source.saas_crawler.SaasCrawlerApplicationContextMarker;
import org.opensearch.dataprepper.plugins.source.saas_crawler.base.Crawler;
import org.opensearch.dataprepper.plugins.source.saas_crawler.base.SaasPluginExecutorServiceProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
* JiraConnector connector entry point.
*/

@DataPrepperPlugin(name = "jira",
pluginType = Source.class,
packagesToScan = {SaasCrawlerApplicationContextMarker.class, JiraSource.class}
)
public class JiraSource implements Source<Record<Event>> {
dlvenable marked this conversation as resolved.
Show resolved Hide resolved

private static final Logger log = LoggerFactory.getLogger(JiraSource.class);


@DataPrepperPluginConstructor
public JiraSource(final PluginMetrics pluginMetrics,
final PluginFactory pluginFactory,
final AcknowledgementSetManager acknowledgementSetManager,
Crawler crawler,
SaasPluginExecutorServiceProvider executorServiceProvider) {
log.info("Create Jira Source Connector");
}

public void start(Buffer<Record<Event>> buffer) {
log.info("Starting Jira Source Plugin... ");
}

@Override
public void stop() {

}

@Override
public ByteDecoder getDecoder() {
return Source.super.getDecoder();
}

}
30 changes: 30 additions & 0 deletions data-prepper-plugins/saas-source-plugins/saas-crawler/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
plugins {
    id 'java-library'
}

group = 'org.opensearch.dataprepper.plugins.source.saas_crawler'

tasks.withType(Javadoc).configureEach {
    enabled = false
}

dependencies {

    implementation project(path: ':data-prepper-api')

    implementation 'com.fasterxml.jackson.core:jackson-core'
    implementation 'com.fasterxml.jackson.core:jackson-databind'
    implementation 'javax.inject:javax.inject:1'
    implementation 'javax.annotation:javax.annotation-api:1.3.2'

    implementation(libs.spring.context) {
        exclude group: 'commons-logging', module: 'commons-logging'
    }

    // Lombok is an annotation processor, needed only at compile time; compileOnly
    // keeps it off the runtime classpath instead of shipping it as a dependency.
    compileOnly 'org.projectlombok:lombok:1.18.30'
    annotationProcessor 'org.projectlombok:lombok:1.18.30'
}

test {
    useJUnitPlatform()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package org.opensearch.dataprepper.plugins.source.saas_crawler;

/**
 * Marker interface to indicate the base package to scan for dependency injection.
 */
public interface SaasCrawlerApplicationContextMarker {
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package org.opensearch.dataprepper.plugins.source.saas_crawler.base;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you not see a need for common interface for all crawlers? I was thinking there would be an interface with some default implementation and so on.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Crawler relay on a source plugin specific iterator implementation and dispatch the work to source plugin specific client implementation. Crawler itself has generic logic for pagination.


import org.opensearch.dataprepper.model.buffer.Buffer;
import org.opensearch.dataprepper.model.event.Event;
import org.opensearch.dataprepper.model.record.Record;
import org.opensearch.dataprepper.model.source.coordinator.enhanced.EnhancedSourceCoordinator;
import org.opensearch.dataprepper.plugins.source.saas_crawler.coordination.partition.SaasSourcePartition;
import org.opensearch.dataprepper.plugins.source.saas_crawler.coordination.state.SaasWorkerProgressState;
import org.opensearch.dataprepper.plugins.source.saas_crawler.model.ItemInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.inject.Named;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

@Named
public class Crawler {
private static final Logger log = LoggerFactory.getLogger(Crawler.class);
private static final int maxItemsPerPage = 20;
public static final String CREATED = "created";
public static final String UPDATED = "updated";

private final SaasClient client;

public Crawler(SaasClient client) {
this.client = client;
}

public long crawl(long lastPollTime,
dlvenable marked this conversation as resolved.
Show resolved Hide resolved
EnhancedSourceCoordinator coordinator) {
long startTime = System.currentTimeMillis();
client.setLastPollTime(lastPollTime);
Iterator<ItemInfo> itemInfoIterator = client.listItems();
log.info("Starting to crawl the source with lastPollTime: {}", lastPollTime);
long updatedPollTime = 0;
do {
final List<ItemInfo> itemInfoList = new ArrayList<>();
for (int i = 0; i < maxItemsPerPage && itemInfoIterator.hasNext(); i++) {
ItemInfo nextItem = itemInfoIterator.next();
if(nextItem==null) {
//we don't expect null items, but just in case, we'll skip them
continue;
dlvenable marked this conversation as resolved.
Show resolved Hide resolved
}
itemInfoList.add(nextItem);
Map<String, Object> metadata = nextItem.getMetadata();
long niCreated = Long.parseLong(metadata.get(CREATED)!=null? (String)metadata.get(CREATED):"0");
dlvenable marked this conversation as resolved.
Show resolved Hide resolved
long niUpdated = Long.parseLong(metadata.get(UPDATED)!=null? (String)metadata.get(UPDATED):"0");
updatedPollTime = Math.max(updatedPollTime, niCreated);
updatedPollTime = Math.max(updatedPollTime, niUpdated);
}
createPartition(itemInfoList, coordinator);
}while (itemInfoIterator.hasNext());
log.debug("Crawling completed in {} ms", System.currentTimeMillis() - startTime);
dlvenable marked this conversation as resolved.
Show resolved Hide resolved
return updatedPollTime != 0 ? updatedPollTime : startTime;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, looks like it is falling back to current time here.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes

}

public void executePartition(SaasWorkerProgressState state, Buffer<Record<Event>> buffer, SaasSourceConfig sourceConfig) {
client.executePartition(state, buffer, sourceConfig);
}

private void createPartition(List<ItemInfo> itemInfoList, EnhancedSourceCoordinator coordinator) {
if(itemInfoList.isEmpty()) {
return;
}
ItemInfo itemInfo = itemInfoList.get(0);
String partitionKey = itemInfo.getPartitionKey();
List<String> itemIds = itemInfoList.stream().map(ItemInfo::getId).collect(Collectors.toList());
SaasWorkerProgressState state = new SaasWorkerProgressState();
state.setKeyAttributes(itemInfo.getKeyAttributes());
state.setItemIds(itemIds);
state.setExportStartTime(System.currentTimeMillis());
state.setLoadedItems(itemInfoList.size());
SaasSourcePartition sourcePartition = new SaasSourcePartition(state, partitionKey);
coordinator.createPartition(sourcePartition);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package org.opensearch.dataprepper.plugins.source.saas_crawler.base;

import org.opensearch.dataprepper.model.buffer.Buffer;
import org.opensearch.dataprepper.model.event.Event;
import org.opensearch.dataprepper.model.record.Record;
import org.opensearch.dataprepper.plugins.source.saas_crawler.coordination.state.SaasWorkerProgressState;
import org.opensearch.dataprepper.plugins.source.saas_crawler.model.ItemInfo;

import java.util.Iterator;

/**
* Interface for saas client. This interface can be implemented by different saas clients.
* For example, Jira, Salesforce, ServiceNow, etc.
*/
public interface SaasClient {
dlvenable marked this conversation as resolved.
Show resolved Hide resolved


/**
* This will be the main API called by crawler. This method assumes that {@link
* SaasSourceConfig} is available as a member to {@link SaasClient}, as a result of
* which, other scanning properties will also be available to this method
*
* @return returns an {@link Iterator} of {@link ItemInfo}
*/
Iterator<ItemInfo> listItems();


/**
* Method to set the last time we polled the service to check for any changes.
*
* @param lastPollTime time in milliseconds
*/
void setLastPollTime(long lastPollTime);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add javadoc comments for all API

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added 👍

dlvenable marked this conversation as resolved.
Show resolved Hide resolved

/**
* Method for executing a particular partition or a chunk of work
*
* @param state worker node state holds the details of this particular chunk of work
* @param buffer pipeline buffer to write the results into
* @param sourceConfig pipeline configuration from the yaml
*/
void executePartition(SaasWorkerProgressState state, Buffer<Record<Event>> buffer, SaasSourceConfig sourceConfig);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package org.opensearch.dataprepper.plugins.source.saas_crawler.base;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.PreDestroy;
import javax.inject.Named;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

@Named
public class SaasPluginExecutorServiceProvider {
private static final Logger log = LoggerFactory.getLogger(SaasPluginExecutorServiceProvider.class);
public static final int DEFAULT_THREAD_COUNT = 50;
dlvenable marked this conversation as resolved.
Show resolved Hide resolved
private final ExecutorService executorService;

public SaasPluginExecutorServiceProvider() {
executorService = Executors.newFixedThreadPool(DEFAULT_THREAD_COUNT);
}

/**
* Constructor for testing
*/
public SaasPluginExecutorServiceProvider(ExecutorService testExecutorService) {
executorService = testExecutorService;
}

public ExecutorService get() {
return executorService;
}

@PreDestroy
public void terminateExecutor() {
try {
log.debug("Shutting down ExecutorService " + executorService);
executorService.shutdown();
boolean isExecutorTerminated = executorService
.awaitTermination(30, TimeUnit.SECONDS);
log.debug("ExecutorService terminated : " + isExecutorTerminated);
} catch (InterruptedException e) {
log.error("Interrupted while terminating executor : " + e.getMessage());
Thread.currentThread().interrupt();
} finally {
executorService.shutdownNow();
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package org.opensearch.dataprepper.plugins.source.saas_crawler.base;

/**
 * Marker interface implemented by all SaaS connector configurations.
 */
public interface SaasSourceConfig {

    // Default number of worker threads a SaaS source uses when none is configured.
    int DEFAULT_NUMBER_OF_WORKERS = 1;
}
Loading
Loading