Skip to content

Commit

Permalink
Merge pull request #661 from NASA-IMPACT/658-automate-file-creation
Browse files Browse the repository at this point in the history
Automate file creation
  • Loading branch information
CarsonDavis authored Apr 17, 2024
2 parents f6b3c09 + a74fa5b commit 75474dc
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 145 deletions.
8 changes: 5 additions & 3 deletions .envs/.local/.django
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ DJANGO_AWS_STORAGE_BUCKET_NAME=''

# GitHub (please create a new file called .env and put these in there)
# ------------------------------------------------------------------------------
GITHUB_ACCESS_TOKEN=''
SINEQUA_CONFIGS_GITHUB_REPO=''
GITHUB_BRANCH_FOR_WEBAPP=''
GITHUB_ACCESS_TOKEN=
SINEQUA_CONFIGS_GITHUB_REPO='NASA-IMPACT/sde-backend'
SINEQUA_CONFIGS_REPO_MASTER_BRANCH='master'
SINEQUA_CONFIGS_REPO_DEV_BRANCH='dev'
SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH='dummy_branch'
19 changes: 6 additions & 13 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,7 @@
]
# https://docs.djangoproject.com/en/dev/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator"
},
{"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator"},
{"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator"},
{"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator"},
{"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator"},
Expand Down Expand Up @@ -254,12 +252,7 @@
LOGGING = {
"version": 1,
"disable_existing_loggers": False,
"formatters": {
"verbose": {
"format": "%(levelname)s %(asctime)s %(module)s "
"%(process)d %(thread)d %(message)s"
}
},
"formatters": {"verbose": {"format": "%(levelname)s %(asctime)s %(module)s " "%(process)d %(thread)d %(message)s"}},
"handlers": {
"console": {
"level": "DEBUG",
Expand Down Expand Up @@ -334,14 +327,14 @@
"rest_framework.renderers.BrowsableAPIRenderer",
"rest_framework_datatables.renderers.DatatablesRenderer",
),
"DEFAULT_FILTER_BACKENDS": (
"rest_framework_datatables.filters.DatatablesFilterBackend",
),
"DEFAULT_FILTER_BACKENDS": ("rest_framework_datatables.filters.DatatablesFilterBackend",),
"DEFAULT_PAGINATION_CLASS": "rest_framework_datatables.pagination.DatatablesPageNumberPagination",
"PAGE_SIZE": 50,
"EXCEPTION_HANDLER": "sde_indexing_helper.utils.exceptions.custom_exception_handler",
}

# GitHub integration settings used to create Sinequa config files automatically.
# Values come from the environment (see .envs/.local/.django).
GITHUB_ACCESS_TOKEN = env("GITHUB_ACCESS_TOKEN")
SINEQUA_CONFIGS_GITHUB_REPO = env("SINEQUA_CONFIGS_GITHUB_REPO")
SINEQUA_CONFIGS_REPO_MASTER_BRANCH = env("SINEQUA_CONFIGS_REPO_MASTER_BRANCH")
SINEQUA_CONFIGS_REPO_DEV_BRANCH = env("SINEQUA_CONFIGS_REPO_DEV_BRANCH")
SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH = env("SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH")
55 changes: 26 additions & 29 deletions config_generation/db_to_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,38 +98,45 @@ def convert_indexer_to_scraper(self) -> None:
some values must now be modified so it will be an effective scraper
"""
self.update_or_add_element_value("Indexers", "")
self.update_or_add_element_value(
"Plugin", "SMD_Plugins/Sinequa.Plugin.ListCandidateUrls"
)
self.update_or_add_element_value("Plugin", "SMD_Plugins/Sinequa.Plugin.ListCandidateUrls")
self.update_or_add_element_value("ShardIndexes", "")
self.update_or_add_element_value("ShardingStrategy", "")
self.update_or_add_element_value("WorkerCount", "8")
self.update_or_add_element_value("LogLevel", "0", parent_element_name="System")
self.update_or_add_element_value(
"Simulate", "true", parent_element_name="IndexerClient"
)
self.update_or_add_element_value("Simulate", "true", parent_element_name="IndexerClient")

def convert_scraper_to_indexer(self) -> None:
    """Reconfigure this XML config in place from a scraper into an indexer.

    Clears the scraper plugin, restores an indexer identity, raises the log
    level, and turns off simulation so documents are actually indexed.
    """
    # this is specialized for the production instance right now
    self.update_or_add_element_value("Indexers", "")
    self.update_or_add_element_value("Plugin", "")
    self.update_or_add_element_value("Identity", "NodeIndexer1/identity0")  # maybe make this blank?
    self.update_or_add_element_value("ShardIndexes", "")
    self.update_or_add_element_value("ShardingStrategy", "")
    self.update_or_add_element_value("WorkerCount", "8")
    self.update_or_add_element_value("LogLevel", "20", parent_element_name="System")
    self.update_or_add_element_value("Simulate", "false", parent_element_name="IndexerClient")

def convert_template_to_scraper(self, collection) -> str:
    """Populate the scraper template with values from *collection*.

    Assumes this editor was instantiated with the scraper template XML.
    Sets the base Url and TreeRoot from the collection, adds a document
    type mapping when the collection has one, and returns the serialized
    scraper config.
    """
    self.update_or_add_element_value("Url", collection.url)
    self.update_or_add_element_value("TreeRoot", collection.tree_root)
    if collection.document_type:
        self.add_document_type_mapping(document_type=collection.get_document_type_display(), criteria=None)

    scraper_config = self.update_config_xml()
    return scraper_config

def convert_template_to_indexer(self, collection) -> str:
    """Populate the indexer job template with the collection's folder.

    Assumes this editor was instantiated with the indexer template XML.
    Points the job at /SDE/<config_folder>/ and returns the serialized
    indexer config.
    """
    self.update_or_add_element_value("Collection", f"/SDE/{collection.config_folder}/")
    indexer_config = self.update_config_xml()

    return indexer_config

def _mapping_exists(self, new_mapping: ET.Element):
"""
Expand All @@ -138,14 +145,8 @@ def _mapping_exists(self, new_mapping: ET.Element):
xml_root = self.xml_tree.getroot()

for mapping in xml_root.findall("Mapping"):
existing_mapping = {
child.tag: (child.text if child.text is not None else "")
for child in mapping
}
new_mapping_dict = {
child.tag: (child.text if child.text is not None else "")
for child in new_mapping
}
existing_mapping = {child.tag: (child.text if child.text is not None else "") for child in mapping}
new_mapping_dict = {child.tag: (child.text if child.text is not None else "") for child in new_mapping}
if existing_mapping == new_mapping_dict:
return True

Expand All @@ -165,9 +166,7 @@ def _standardize_selection(selection):
# "*'</Selection>", "'</Selection>"
# )

return list(
set(selection, standardized_quotes) # , standardized_quotes_less_selective)
)
return list(set(selection, standardized_quotes)) # , standardized_quotes_less_selective)

def _generic_mapping(
self,
Expand Down Expand Up @@ -341,9 +340,7 @@ def fetch_url(self):
def fetch_document_type(self):
DOCUMENT_TYPE_COLUMN = "sourcestr56"
try:
document_type_text = self.xml_tree.find(
f"Mapping[Name='{DOCUMENT_TYPE_COLUMN}']/Value"
).text
document_type_text = self.xml_tree.find(f"Mapping[Name='{DOCUMENT_TYPE_COLUMN}']/Value").text
except AttributeError:
return None

Expand Down
133 changes: 65 additions & 68 deletions sde_collections/models/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,54 +26,28 @@ class Collection(models.Model):
"""Model definition for Collection."""

name = models.CharField("Name", max_length=1024)
config_folder = models.CharField(
"Config Folder", max_length=2048, unique=True, editable=False
)
config_folder = models.CharField("Config Folder", max_length=2048, unique=True, editable=False)
url = models.URLField("URL", max_length=2048, blank=True)
division = models.IntegerField(choices=Divisions.choices)
turned_on = models.BooleanField("Turned On", default=True)
connector = models.IntegerField(
choices=ConnectorChoices.choices, default=ConnectorChoices.CRAWLER2
)
connector = models.IntegerField(choices=ConnectorChoices.choices, default=ConnectorChoices.CRAWLER2)

source = models.IntegerField(
choices=SourceChoices.choices, default=SourceChoices.BOTH
)
update_frequency = models.IntegerField(
choices=UpdateFrequencies.choices, default=UpdateFrequencies.WEEKLY
)
document_type = models.IntegerField(
choices=DocumentTypes.choices, null=True, blank=True
)
tree_root_deprecated = models.CharField(
"Tree Root", max_length=1024, default="", blank=True
)
source = models.IntegerField(choices=SourceChoices.choices, default=SourceChoices.BOTH)
update_frequency = models.IntegerField(choices=UpdateFrequencies.choices, default=UpdateFrequencies.WEEKLY)
document_type = models.IntegerField(choices=DocumentTypes.choices, null=True, blank=True)
tree_root_deprecated = models.CharField("Tree Root", max_length=1024, default="", blank=True)
delete = models.BooleanField(default=False)

# audit columns for production
audit_hierarchy = models.CharField(
"Audit Hierarchy", max_length=2048, default="", blank=True
)
audit_hierarchy = models.CharField("Audit Hierarchy", max_length=2048, default="", blank=True)
audit_url = models.CharField("Audit URL", max_length=2048, default="", blank=True)
audit_mapping = models.CharField(
"Audit Mapping", max_length=2048, default="", blank=True
)
audit_label = models.CharField(
"Audit Label", max_length=2048, default="", blank=True
)
audit_query = models.CharField(
"Audit Query", max_length=2048, default="", blank=True
)
audit_duplicate_results = models.CharField(
"Audit Duplicate Results", max_length=2048, default="", blank=True
)
audit_metrics = models.CharField(
"Audit Metrics", max_length=2048, default="", blank=True
)
audit_mapping = models.CharField("Audit Mapping", max_length=2048, default="", blank=True)
audit_label = models.CharField("Audit Label", max_length=2048, default="", blank=True)
audit_query = models.CharField("Audit Query", max_length=2048, default="", blank=True)
audit_duplicate_results = models.CharField("Audit Duplicate Results", max_length=2048, default="", blank=True)
audit_metrics = models.CharField("Audit Metrics", max_length=2048, default="", blank=True)

cleaning_assigned_to = models.CharField(
"Cleaning Assigned To", max_length=128, default="", blank=True
)
cleaning_assigned_to = models.CharField("Cleaning Assigned To", max_length=128, default="", blank=True)

github_issue_number = models.IntegerField("Issue Number in Github", default=0)
notes = models.TextField("Notes", blank=True, default="")
Expand All @@ -89,9 +63,7 @@ class Collection(models.Model):
choices=WorkflowStatusChoices.choices,
default=WorkflowStatusChoices.RESEARCH_IN_PROGRESS,
)
curated_by = models.ForeignKey(
User, on_delete=models.DO_NOTHING, null=True, blank=True
)
curated_by = models.ForeignKey(User, on_delete=models.DO_NOTHING, null=True, blank=True)
curation_started = models.DateTimeField("Curation Started", null=True, blank=True)

class Meta:
Expand All @@ -100,6 +72,18 @@ class Meta:
verbose_name = "Collection"
verbose_name_plural = "Collections"

@property
def _scraper_config_path(self) -> str:
    """Repo-relative path of this collection's scraper config file."""
    return "sources/scrapers/{}/default.xml".format(self.config_folder)

@property
def _plugin_config_path(self) -> str:
    """Repo-relative path of this collection's plugin config file."""
    return "sources/SDE/{}/default.xml".format(self.config_folder)

@property
def _indexer_config_path(self) -> str:
    """Repo-relative path of this collection's indexer job file."""
    return "jobs/collection.indexer.{}.xml".format(self.config_folder)

@property
def tree_root(self) -> str:
return f"/{self.get_division_display()}/{self.name}/"
Expand Down Expand Up @@ -170,15 +154,11 @@ def workflow_status_button_color(self) -> str:

def _process_exclude_list(self):
    """Process the exclude list.

    Returns a list with one processed match pattern per exclude pattern
    attached to this collection.
    """
    return [pattern._process_match_pattern() for pattern in self.excludepattern.all()]

def _process_include_list(self):
    """Process the include list.

    Returns a list with one processed match pattern per include pattern
    attached to this collection.
    """
    return [pattern._process_match_pattern() for pattern in self.includepattern.all()]

def _process_title_list(self):
"""Process the title list"""
Expand All @@ -202,30 +182,44 @@ def _process_document_type_list(self):
document_type_rules.append(processed_pattern)
return document_type_rules

def create_config_xml(self):
def _write_to_github(self, path, content, overwrite):
    """Create *path* in the configs repo, or update it when *overwrite* is set."""
    handler = GitHubHandler()
    # pick the GitHub operation up front, then perform it
    write = handler.update_file if overwrite else handler.create_file
    write(path, content)

def create_scraper_config(self, overwrite: bool = False):
    """
    Reads from the model data and creates the initial scraper config xml file
    if overwrite is True, it will overwrite the existing file
    """
    # read the template fresh each run so template edits take effect; close the handle
    with open("config_generation/xmls/webcrawler_initial_crawl.xml") as template_file:
        scraper_template = template_file.read()
    editor = XmlEditor(scraper_template)
    scraper_config = editor.convert_template_to_scraper(self)
    self._write_to_github(self._scraper_config_path, scraper_config, overwrite)

# add the URL
editor.update_or_add_element_value("Url", self.url)
def create_plugin_config(self, overwrite: bool = False):
    """
    Reads from the model data and creates the plugin config xml file that calls the api
    if overwrite is True, it will overwrite the existing file
    """
    # the plugin template is pushed as-is; no per-collection substitution happens here
    with open("config_generation/xmls/plugin_indexing_template.xml") as template_file:
        plugin_config = template_file.read()
    self._write_to_github(self._plugin_config_path, plugin_config, overwrite)

updated_config_xml_string = editor.update_config_xml()
def create_indexer_config(self, overwrite: bool = False):
    """
    Reads from the model data and creates indexer job that calls the plugin config
    if overwrite is True, it will overwrite the existing file
    """
    # read the job template, point it at this collection's folder, then push it
    with open("config_generation/xmls/job_template.xml") as template_file:
        indexer_template = template_file.read()
    editor = XmlEditor(indexer_template)
    indexer_config = editor.convert_template_to_indexer(self)
    self._write_to_github(self._indexer_config_path, indexer_config, overwrite)

def update_config_xml(self, original_config_string):
"""
Expand Down Expand Up @@ -420,6 +414,11 @@ def save(self, *args, **kwargs):
if not self.config_folder:
self.config_folder = self._compute_config_folder_name()

# create all initial config files
self.create_scraper_config(overwrite=False)
self.create_plugin_config(overwrite=False)
self.create_indexer_config(overwrite=False)

# Call the parent class's save method
super().save(*args, **kwargs)

Expand All @@ -440,9 +439,7 @@ def __str__(self) -> str:


class Comments(models.Model):
collection = models.ForeignKey(
"Collection", related_name="comments", on_delete=models.CASCADE
)
collection = models.ForeignKey("Collection", related_name="comments", on_delete=models.CASCADE)
user = models.ForeignKey(get_user_model(), on_delete=models.CASCADE)
text = models.TextField()
created_at = models.DateTimeField(auto_now_add=True)
Expand Down
Loading

0 comments on commit 75474dc

Please sign in to comment.