Skip to content

Commit

Permalink
Merge pull request #661 from NASA-IMPACT/658-automate-file-creation
Browse files Browse the repository at this point in the history
Automate file creation
  • Loading branch information
CarsonDavis authored Apr 17, 2024
2 parents f6b3c09 + a74fa5b commit 75474dc
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 145 deletions.
8 changes: 5 additions & 3 deletions .envs/.local/.django
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ DJANGO_AWS_STORAGE_BUCKET_NAME=''

# GitHub (please create a new file called .env and put these in there)
# ------------------------------------------------------------------------------
GITHUB_ACCESS_TOKEN=''
SINEQUA_CONFIGS_GITHUB_REPO=''
GITHUB_BRANCH_FOR_WEBAPP=''
GITHUB_ACCESS_TOKEN=
SINEQUA_CONFIGS_GITHUB_REPO='NASA-IMPACT/sde-backend'
SINEQUA_CONFIGS_REPO_MASTER_BRANCH='master'
SINEQUA_CONFIGS_REPO_DEV_BRANCH='dev'
SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH='dummy_branch'
19 changes: 6 additions & 13 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,7 @@
]
# https://docs.djangoproject.com/en/dev/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator"
},
{"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator"},
{"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator"},
{"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator"},
{"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator"},
Expand Down Expand Up @@ -254,12 +252,7 @@
LOGGING = {
"version": 1,
"disable_existing_loggers": False,
"formatters": {
"verbose": {
"format": "%(levelname)s %(asctime)s %(module)s "
"%(process)d %(thread)d %(message)s"
}
},
"formatters": {"verbose": {"format": "%(levelname)s %(asctime)s %(module)s " "%(process)d %(thread)d %(message)s"}},
"handlers": {
"console": {
"level": "DEBUG",
Expand Down Expand Up @@ -334,14 +327,14 @@
"rest_framework.renderers.BrowsableAPIRenderer",
"rest_framework_datatables.renderers.DatatablesRenderer",
),
"DEFAULT_FILTER_BACKENDS": (
"rest_framework_datatables.filters.DatatablesFilterBackend",
),
"DEFAULT_FILTER_BACKENDS": ("rest_framework_datatables.filters.DatatablesFilterBackend",),
"DEFAULT_PAGINATION_CLASS": "rest_framework_datatables.pagination.DatatablesPageNumberPagination",
"PAGE_SIZE": 50,
"EXCEPTION_HANDLER": "sde_indexing_helper.utils.exceptions.custom_exception_handler",
}

# GitHub integration settings used to create Sinequa config files automatically.
# Values come from the environment (see .envs/.local/.django).
GITHUB_ACCESS_TOKEN = env("GITHUB_ACCESS_TOKEN")
SINEQUA_CONFIGS_GITHUB_REPO = env("SINEQUA_CONFIGS_GITHUB_REPO")
SINEQUA_CONFIGS_REPO_MASTER_BRANCH = env("SINEQUA_CONFIGS_REPO_MASTER_BRANCH")
SINEQUA_CONFIGS_REPO_DEV_BRANCH = env("SINEQUA_CONFIGS_REPO_DEV_BRANCH")
SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH = env("SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH")
55 changes: 26 additions & 29 deletions config_generation/db_to_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,38 +98,45 @@ def convert_indexer_to_scraper(self) -> None:
some values must now be modified so it will be an effective scraper
"""
self.update_or_add_element_value("Indexers", "")
self.update_or_add_element_value(
"Plugin", "SMD_Plugins/Sinequa.Plugin.ListCandidateUrls"
)
self.update_or_add_element_value("Plugin", "SMD_Plugins/Sinequa.Plugin.ListCandidateUrls")
self.update_or_add_element_value("ShardIndexes", "")
self.update_or_add_element_value("ShardingStrategy", "")
self.update_or_add_element_value("WorkerCount", "8")
self.update_or_add_element_value("LogLevel", "0", parent_element_name="System")
self.update_or_add_element_value(
"Simulate", "true", parent_element_name="IndexerClient"
)
self.update_or_add_element_value("Simulate", "true", parent_element_name="IndexerClient")

def convert_scraper_to_indexer(self) -> None:
    """Reconfigure this XML config in place from a scraper into an indexer.

    Clears the scraper plugin, restores an indexer identity, raises the log
    level, and turns off simulation so documents are actually indexed.
    """
    # this is specialized for the production instance right now
    self.update_or_add_element_value("Indexers", "")
    self.update_or_add_element_value("Plugin", "")
    self.update_or_add_element_value("Identity", "NodeIndexer1/identity0")  # maybe make this blank?
    self.update_or_add_element_value("ShardIndexes", "")
    self.update_or_add_element_value("ShardingStrategy", "")
    self.update_or_add_element_value("WorkerCount", "8")
    self.update_or_add_element_value("LogLevel", "20", parent_element_name="System")
    self.update_or_add_element_value("Simulate", "false", parent_element_name="IndexerClient")

def convert_template_to_scraper(self, collection) -> str:
    """Populate the scraper template with values from *collection*.

    Assumes this editor was instantiated with the scraper template XML.
    Sets the base Url and TreeRoot from the collection, adds a document
    type mapping when the collection has one, and returns the serialized
    scraper config.
    """
    self.update_or_add_element_value("Url", collection.url)
    self.update_or_add_element_value("TreeRoot", collection.tree_root)
    if collection.document_type:
        self.add_document_type_mapping(document_type=collection.get_document_type_display(), criteria=None)

    scraper_config = self.update_config_xml()
    return scraper_config

def convert_template_to_indexer(self, collection) -> str:
    """Populate the indexer job template with the collection's folder.

    Assumes this editor was instantiated with the indexer template XML.
    Points the job at /SDE/<config_folder>/ and returns the serialized
    indexer config.
    """
    self.update_or_add_element_value("Collection", f"/SDE/{collection.config_folder}/")
    indexer_config = self.update_config_xml()

    return indexer_config

def _mapping_exists(self, new_mapping: ET.Element):
"""
Expand All @@ -138,14 +145,8 @@ def _mapping_exists(self, new_mapping: ET.Element):
xml_root = self.xml_tree.getroot()

for mapping in xml_root.findall("Mapping"):
existing_mapping = {
child.tag: (child.text if child.text is not None else "")
for child in mapping
}
new_mapping_dict = {
child.tag: (child.text if child.text is not None else "")
for child in new_mapping
}
existing_mapping = {child.tag: (child.text if child.text is not None else "") for child in mapping}
new_mapping_dict = {child.tag: (child.text if child.text is not None else "") for child in new_mapping}
if existing_mapping == new_mapping_dict:
return True

Expand All @@ -165,9 +166,7 @@ def _standardize_selection(selection):
# "*'</Selection>", "'</Selection>"
# )

return list(
set(selection, standardized_quotes) # , standardized_quotes_less_selective)
)
return list(set(selection, standardized_quotes)) # , standardized_quotes_less_selective)

def _generic_mapping(
self,
Expand Down Expand Up @@ -341,9 +340,7 @@ def fetch_url(self):
def fetch_document_type(self):
DOCUMENT_TYPE_COLUMN = "sourcestr56"
try:
document_type_text = self.xml_tree.find(
f"Mapping[Name='{DOCUMENT_TYPE_COLUMN}']/Value"
).text
document_type_text = self.xml_tree.find(f"Mapping[Name='{DOCUMENT_TYPE_COLUMN}']/Value").text
except AttributeError:
return None

Expand Down
133 changes: 65 additions & 68 deletions sde_collections/models/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,54 +26,28 @@ class Collection(models.Model):
"""Model definition for Collection."""

name = models.CharField("Name", max_length=1024)
config_folder = models.CharField(
"Config Folder", max_length=2048, unique=True, editable=False
)
config_folder = models.CharField("Config Folder", max_length=2048, unique=True, editable=False)
url = models.URLField("URL", max_length=2048, blank=True)
division = models.IntegerField(choices=Divisions.choices)
turned_on = models.BooleanField("Turned On", default=True)
connector = models.IntegerField(
choices=ConnectorChoices.choices, default=ConnectorChoices.CRAWLER2
)
connector = models.IntegerField(choices=ConnectorChoices.choices, default=ConnectorChoices.CRAWLER2)

source = models.IntegerField(
choices=SourceChoices.choices, default=SourceChoices.BOTH
)
update_frequency = models.IntegerField(
choices=UpdateFrequencies.choices, default=UpdateFrequencies.WEEKLY
)
document_type = models.IntegerField(
choices=DocumentTypes.choices, null=True, blank=True
)
tree_root_deprecated = models.CharField(
"Tree Root", max_length=1024, default="", blank=True
)
source = models.IntegerField(choices=SourceChoices.choices, default=SourceChoices.BOTH)
update_frequency = models.IntegerField(choices=UpdateFrequencies.choices, default=UpdateFrequencies.WEEKLY)
document_type = models.IntegerField(choices=DocumentTypes.choices, null=True, blank=True)
tree_root_deprecated = models.CharField("Tree Root", max_length=1024, default="", blank=True)
delete = models.BooleanField(default=False)

# audit columns for production
audit_hierarchy = models.CharField(
"Audit Hierarchy", max_length=2048, default="", blank=True
)
audit_hierarchy = models.CharField("Audit Hierarchy", max_length=2048, default="", blank=True)
audit_url = models.CharField("Audit URL", max_length=2048, default="", blank=True)
audit_mapping = models.CharField(
"Audit Mapping", max_length=2048, default="", blank=True
)
audit_label = models.CharField(
"Audit Label", max_length=2048, default="", blank=True
)
audit_query = models.CharField(
"Audit Query", max_length=2048, default="", blank=True
)
audit_duplicate_results = models.CharField(
"Audit Duplicate Results", max_length=2048, default="", blank=True
)
audit_metrics = models.CharField(
"Audit Metrics", max_length=2048, default="", blank=True
)
audit_mapping = models.CharField("Audit Mapping", max_length=2048, default="", blank=True)
audit_label = models.CharField("Audit Label", max_length=2048, default="", blank=True)
audit_query = models.CharField("Audit Query", max_length=2048, default="", blank=True)
audit_duplicate_results = models.CharField("Audit Duplicate Results", max_length=2048, default="", blank=True)
audit_metrics = models.CharField("Audit Metrics", max_length=2048, default="", blank=True)

cleaning_assigned_to = models.CharField(
"Cleaning Assigned To", max_length=128, default="", blank=True
)
cleaning_assigned_to = models.CharField("Cleaning Assigned To", max_length=128, default="", blank=True)

github_issue_number = models.IntegerField("Issue Number in Github", default=0)
notes = models.TextField("Notes", blank=True, default="")
Expand All @@ -89,9 +63,7 @@ class Collection(models.Model):
choices=WorkflowStatusChoices.choices,
default=WorkflowStatusChoices.RESEARCH_IN_PROGRESS,
)
curated_by = models.ForeignKey(
User, on_delete=models.DO_NOTHING, null=True, blank=True
)
curated_by = models.ForeignKey(User, on_delete=models.DO_NOTHING, null=True, blank=True)
curation_started = models.DateTimeField("Curation Started", null=True, blank=True)

class Meta:
Expand All @@ -100,6 +72,18 @@ class Meta:
verbose_name = "Collection"
verbose_name_plural = "Collections"

@property
def _scraper_config_path(self) -> str:
    """Repo-relative path of this collection's scraper config file."""
    return "sources/scrapers/{}/default.xml".format(self.config_folder)

@property
def _plugin_config_path(self) -> str:
    """Repo-relative path of this collection's plugin config file."""
    return "sources/SDE/{}/default.xml".format(self.config_folder)

@property
def _indexer_config_path(self) -> str:
    """Repo-relative path of this collection's indexer job file."""
    return "jobs/collection.indexer.{}.xml".format(self.config_folder)

@property
def tree_root(self) -> str:
return f"/{self.get_division_display()}/{self.name}/"
Expand Down Expand Up @@ -170,15 +154,11 @@ def workflow_status_button_color(self) -> str:

def _process_exclude_list(self):
    """Process the exclude list.

    Returns a list with one processed match pattern per exclude pattern
    attached to this collection.
    """
    return [pattern._process_match_pattern() for pattern in self.excludepattern.all()]

def _process_include_list(self):
    """Process the include list.

    Returns a list with one processed match pattern per include pattern
    attached to this collection.
    """
    return [pattern._process_match_pattern() for pattern in self.includepattern.all()]

def _process_title_list(self):
"""Process the title list"""
Expand All @@ -202,30 +182,44 @@ def _process_document_type_list(self):
document_type_rules.append(processed_pattern)
return document_type_rules

def create_config_xml(self):
def _write_to_github(self, path, content, overwrite):
    """Create *path* in the configs repo, or update it when *overwrite* is set."""
    handler = GitHubHandler()
    # pick the GitHub operation up front, then perform it
    write = handler.update_file if overwrite else handler.create_file
    write(path, content)

def create_scraper_config(self, overwrite: bool = False):
    """
    Reads from the model data and creates the initial scraper config xml file
    if overwrite is True, it will overwrite the existing file
    """
    # read the template fresh each run so template edits take effect; close the handle
    with open("config_generation/xmls/webcrawler_initial_crawl.xml") as template_file:
        scraper_template = template_file.read()
    editor = XmlEditor(scraper_template)
    scraper_config = editor.convert_template_to_scraper(self)
    self._write_to_github(self._scraper_config_path, scraper_config, overwrite)

# add the URL
editor.update_or_add_element_value("Url", self.url)
def create_plugin_config(self, overwrite: bool = False):
    """
    Reads from the model data and creates the plugin config xml file that calls the api
    if overwrite is True, it will overwrite the existing file
    """
    # the plugin template is pushed as-is; no per-collection substitution happens here
    with open("config_generation/xmls/plugin_indexing_template.xml") as template_file:
        plugin_config = template_file.read()
    self._write_to_github(self._plugin_config_path, plugin_config, overwrite)

updated_config_xml_string = editor.update_config_xml()
def create_indexer_config(self, overwrite: bool = False):
    """
    Reads from the model data and creates indexer job that calls the plugin config
    if overwrite is True, it will overwrite the existing file
    """
    # read the job template, point it at this collection's folder, then push it
    with open("config_generation/xmls/job_template.xml") as template_file:
        indexer_template = template_file.read()
    editor = XmlEditor(indexer_template)
    indexer_config = editor.convert_template_to_indexer(self)
    self._write_to_github(self._indexer_config_path, indexer_config, overwrite)

def update_config_xml(self, original_config_string):
"""
Expand Down Expand Up @@ -420,6 +414,11 @@ def save(self, *args, **kwargs):
if not self.config_folder:
self.config_folder = self._compute_config_folder_name()

# create all initial config files
self.create_scraper_config(overwrite=False)
self.create_plugin_config(overwrite=False)
self.create_indexer_config(overwrite=False)

# Call the parent class's save method
super().save(*args, **kwargs)

Expand All @@ -440,9 +439,7 @@ def __str__(self) -> str:


class Comments(models.Model):
collection = models.ForeignKey(
"Collection", related_name="comments", on_delete=models.CASCADE
)
collection = models.ForeignKey("Collection", related_name="comments", on_delete=models.CASCADE)
user = models.ForeignKey(get_user_model(), on_delete=models.CASCADE)
text = models.TextField()
created_at = models.DateTimeField(auto_now_add=True)
Expand Down
Loading

0 comments on commit 75474dc

Please sign in to comment.