
Commit caceb37
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Aug 23, 2024
1 parent 9491a25 commit caceb37
Showing 63 changed files with 178 additions and 359 deletions.
README.md (2 changes: 1 addition & 1 deletion)

@@ -84,7 +84,7 @@ Navigate to the server running prod, then to the project folder. Run the following
```bash
docker-compose -f production.yml run --rm --user root django python manage.py dumpdata --natural-foreign --natural-primary --exclude=contenttypes --exclude=auth.Permission --indent 2 --output /app/backups/prod_backup-20240812.json
```
-This will have saved the backup in a folder outside of the docker container. Now you can copy it to your local machine. 
+This will have saved the backup in a folder outside of the docker container. Now you can copy it to your local machine.

```bash
mv ~/prod_backup-20240812.json <project_path>/prod_backup-20240812.json
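# Hypothetical follow-up, not part of this commit: once the dump is in the
# project folder, it can typically be restored into the local database with
# Django's loaddata command. The local.yml file and django service name are
# assumptions in the cookiecutter-django style, mirroring the production.yml
# invocation above.
docker-compose -f local.yml run --rm django python manage.py loaddata prod_backup-20240812.json
```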
config/settings/local.py (4 changes: 1 addition & 3 deletions)

@@ -26,9 +26,7 @@
# EMAIL
# ------------------------------------------------------------------------------
# https://docs.djangoproject.com/en/dev/ref/settings/#email-backend
-EMAIL_BACKEND = env(
-    "DJANGO_EMAIL_BACKEND", default="django.core.mail.backends.console.EmailBackend"
-)
+EMAIL_BACKEND = env("DJANGO_EMAIL_BACKEND", default="django.core.mail.backends.console.EmailBackend")

# django-debug-toolbar
# ------------------------------------------------------------------------------
config/settings/production.py (23 changes: 5 additions & 18 deletions)

@@ -13,9 +13,7 @@
# https://docs.djangoproject.com/en/dev/ref/settings/#secret-key
SECRET_KEY = env("DJANGO_SECRET_KEY")
# https://docs.djangoproject.com/en/dev/ref/settings/#allowed-hosts
-ALLOWED_HOSTS = env.list(
-    "DJANGO_ALLOWED_HOSTS", default=["sde-indexing-helper.nasa-impact.net"]
-)
+ALLOWED_HOSTS = env.list("DJANGO_ALLOWED_HOSTS", default=["sde-indexing-helper.nasa-impact.net"])

# DATABASES
# ------------------------------------------------------------------------------
@@ -36,15 +34,11 @@
# TODO: set this to 60 seconds first and then to 518400 once you prove the former works
SECURE_HSTS_SECONDS = 60
# https://docs.djangoproject.com/en/dev/ref/settings/#secure-hsts-include-subdomains
-SECURE_HSTS_INCLUDE_SUBDOMAINS = env.bool(
-    "DJANGO_SECURE_HSTS_INCLUDE_SUBDOMAINS", default=True
-)
+SECURE_HSTS_INCLUDE_SUBDOMAINS = env.bool("DJANGO_SECURE_HSTS_INCLUDE_SUBDOMAINS", default=True)
# https://docs.djangoproject.com/en/dev/ref/settings/#secure-hsts-preload
SECURE_HSTS_PRELOAD = env.bool("DJANGO_SECURE_HSTS_PRELOAD", default=True)
# https://docs.djangoproject.com/en/dev/ref/middleware/#x-content-type-options-nosniff
-SECURE_CONTENT_TYPE_NOSNIFF = env.bool(
-    "DJANGO_SECURE_CONTENT_TYPE_NOSNIFF", default=True
-)
+SECURE_CONTENT_TYPE_NOSNIFF = env.bool("DJANGO_SECURE_CONTENT_TYPE_NOSNIFF", default=True)

# STORAGES
# ------------------------------------------------------------------------------
@@ -61,9 +55,7 @@
# DO NOT change these unless you know what you're doing.
_AWS_EXPIRY = 60 * 60 * 24 * 7
# https://django-storages.readthedocs.io/en/latest/backends/amazon-S3.html#settings
-AWS_S3_OBJECT_PARAMETERS = {
-    "CacheControl": f"max-age={_AWS_EXPIRY}, s-maxage={_AWS_EXPIRY}, must-revalidate"
-}
+AWS_S3_OBJECT_PARAMETERS = {"CacheControl": f"max-age={_AWS_EXPIRY}, s-maxage={_AWS_EXPIRY}, must-revalidate"}
# https://django-storages.readthedocs.io/en/latest/backends/amazon-S3.html#settings
AWS_S3_MAX_MEMORY_SIZE = env.int(
"DJANGO_AWS_S3_MAX_MEMORY_SIZE",
Expand Down Expand Up @@ -128,12 +120,7 @@
LOGGING = {
"version": 1,
"disable_existing_loggers": True,
"formatters": {
"verbose": {
"format": "%(levelname)s %(asctime)s %(module)s "
"%(process)d %(thread)d %(message)s"
}
},
"formatters": {"verbose": {"format": "%(levelname)s %(asctime)s %(module)s " "%(process)d %(thread)d %(message)s"}},
"handlers": {
"console": {
"level": "DEBUG",
config/urls.py (4 changes: 1 addition & 3 deletions)

@@ -4,9 +4,7 @@
from django.urls import include, path
from django.views import defaults as default_views

-admin.site.site_header = (
-    "SDE Indexing Helper Administration" # default: "Django Administration"
-)
+admin.site.site_header = "SDE Indexing Helper Administration" # default: "Django Administration"
admin.site.index_title = "SDE Indexing Helper" # default: "Site administration"
admin.site.site_title = "SDE Indexing Helper" # default: "Django site admin"

config_generation/api.py (12 changes: 3 additions & 9 deletions)

@@ -20,9 +20,7 @@

class Api:
    def __init__(self, server_name: str) -> None:
-        self.headers: dict[str, str] = {
-            "Authorization": f"Bearer {tokens[server_name]}"
-        }
+        self.headers: dict[str, str] = {"Authorization": f"Bearer {tokens[server_name]}"}
        self.app_name: str = server_configs[server_name]["app_name"]
        self.query_name: str = server_configs[server_name]["query_name"]
        self.base_url: str = server_configs[server_name]["base_url"]
@@ -53,15 +51,11 @@ def query(self, term: str):

        return self.process_response(url, payload)

-    def sql(
-        self, source: str, collection: str = "", fetch_all: bool = False
-    ) -> dict[str, Any]:
+    def sql(self, source: str, collection: str = "", fetch_all: bool = False) -> dict[str, Any]:
        url = f"{self.base_url}/api/v1/engine.sql"

        collection_name = f"/{source}/{collection}/"
-        sql_command_all = (
-            "select url1,title,collection from @@ScienceMissionDirectorate"
-        )
+        sql_command_all = "select url1,title,collection from @@ScienceMissionDirectorate"
        if fetch_all:
            sql_command = sql_command_all
        else:
config_generation/config_example.py (4 changes: 1 addition & 3 deletions)
@@ -1,6 +1,4 @@
-from sources_to_scrape import (
-    sources_to_index_test_grid_20240809,
-)
+from sources_to_scrape import sources_to_index_test_grid_20240809

tokens: dict[str, str] = {
"test_server": "token here",
config_generation/delete_server_content.py (8 changes: 7 additions & 1 deletion)

@@ -2,7 +2,13 @@

from db_to_xml_file_based import XmlEditor

-from config import batch_delete_name, collection_list, indexes_to_delete_from, source, engines
+from config import (
+    batch_delete_name,
+    collection_list,
+    engines,
+    indexes_to_delete_from,
+    source,
+)

COMMAND_FILES_PATH = "../sinequa_configs/commands/"
DELETE_COMMAND_TEMPLATE_PATH = "xmls/delete_template.xml"
config_generation/export_collections.py (8 changes: 2 additions & 6 deletions)

@@ -44,17 +44,13 @@
# Create JSON file
print("Creating JSON dump...")
json_data = json.dumps(bulk_data)
-file_path = (
-    f"{TEMP_FOLDER_NAME}/{collection}/urls.json" # Provide the desired file path
-)
+file_path = f"{TEMP_FOLDER_NAME}/{collection}/urls.json" # Provide the desired file path
with open(file_path, "w") as file:
    file.write(json_data)

# Zip the JSON file
print("Creating zip file...")
-zip_file_path = (
-    f"{TEMP_FOLDER_NAME}/{collection}.zip" # Provide the desired zip file path
-)
+zip_file_path = f"{TEMP_FOLDER_NAME}/{collection}.zip" # Provide the desired zip file path
with zipfile.ZipFile(zip_file_path, "w") as zip_file:
    zip_file.write(file_path, os.path.basename(file_path))

config_generation/export_whole_index.py (4 changes: 1 addition & 3 deletions)

@@ -44,9 +44,7 @@

# Upload the zip file to S3
s3_bucket_name = env("DJANGO_AWS_STORAGE_BUCKET_NAME")
-s3_key = (
-    "scraped_urls_all/all_data.zip" # Provide the desired S3 key for the uploaded file
-)
+s3_key = "scraped_urls_all/all_data.zip" # Provide the desired S3 key for the uploaded file
s3_client = boto3.client(
    "s3",
    region_name="us-east-1",
config_generation/generate_collection_list.py (9 changes: 2 additions & 7 deletions)

@@ -22,9 +22,7 @@ def create_xml_path(collection_name):

def get_turned_on_sources():
    # remove sources that were just scraped
-    turned_on_remaining_sources = [
-        source for source in turned_on_sources if source not in already_scraped_sources
-    ]
+    turned_on_remaining_sources = [source for source in turned_on_sources if source not in already_scraped_sources]

    # filter all sources to only webcrawler sources
    turned_on_remaining_webcrawlers = []

@@ -68,8 +66,5 @@ def get_sources_20230605():
    folders = get_all_config_folder_names()
    folders = folders + interrupted_sources
    return [
-        folder
-        for folder in folders
-        if folder not in sources_with_documents_20230605
-        and is_collection_crawler(folder)
+        folder for folder in folders if folder not in sources_with_documents_20230605 and is_collection_crawler(folder)
    ]
config_generation/generate_emac_indexer.py (4 changes: 1 addition & 3 deletions)

@@ -71,9 +71,7 @@
editor.update_or_add_element_value("Description", f"Webcrawler for the {name}")
editor.update_or_add_element_value("Url", url)
editor.update_or_add_element_value("TreeRoot", tree_root)
-editor.update_or_add_element_value(
-    "ShardIndexes", "@SMD_ASTRO_Repository_1,@SMD_ASTRO_Repository_2"
-)
+editor.update_or_add_element_value("ShardIndexes", "@SMD_ASTRO_Repository_1,@SMD_ASTRO_Repository_2")
editor.update_or_add_element_value("ShardingStrategy", "Balanced")

# rule adding
config_generation/generate_scrapers.py (12 changes: 3 additions & 9 deletions)

@@ -17,9 +17,7 @@ def get_or_create_folder(scraper_folder_name="scraping_configs"):
    this gets that folder path
    """

-    scraper_folder_path = os.path.join(
-        os.path.dirname(os.getcwd()), scraper_folder_name
-    )
+    scraper_folder_path = os.path.join(os.path.dirname(os.getcwd()), scraper_folder_name)
    # creates the folder if it doesn't already exist
    create_folder(scraper_folder_path)

@@ -32,18 +30,14 @@ def get_scraper_folder(scraper_folder_name="scraping_configs"):
    this gets that folder path
    """

-    scraper_folder_path = os.path.join(
-        os.path.dirname(os.getcwd()), scraper_folder_name
-    )
+    scraper_folder_path = os.path.join(os.path.dirname(os.getcwd()), scraper_folder_name)
    # creates the folder if it doesn't already exist
    create_folder(scraper_folder_path)

    return scraper_folder_path


-def generate_brand_new_scraper(
-    source_name, url, scraper_template_path="xmls/scraper_template.xml"
-):
+def generate_brand_new_scraper(source_name, url, scraper_template_path="xmls/scraper_template.xml"):
    """
    Args:
config_generation/minimum_api.py (8 changes: 2 additions & 6 deletions)

@@ -53,18 +53,14 @@ def query(self, term: str, page: int, collection_config_folder=None):

        return self._process_response(response)

-    def sql(
-        self, source: str = "SMD", collection: str = "", fetch_all: bool = False
-    ) -> dict[str, Any]:
+    def sql(self, source: str = "SMD", collection: str = "", fetch_all: bool = False) -> dict[str, Any]:
        if not self.token:
            raise ValueError("you must have a token to use the SQL endpoint")

        url = f"{self.base_url}/api/v1/engine.sql"

        collection_name = f"/{source}/{collection}/"
-        sql_command_all = (
-            "select url1,title,collection from @@ScienceMissionDirectorate"
-        )
+        sql_command_all = "select url1,title,collection from @@ScienceMissionDirectorate"
        if fetch_all:
            sql_command = sql_command_all
        else:
config_generation/preprocess_sources.py (10 changes: 1 addition & 9 deletions)

@@ -47,12 +47,4 @@ def ensure_index_of_root(path):
print(len(turned_on_sources)) # 139
print(len(turned_on_remaining_webcrawlers)) # 114
print(len(remove_top_limitation_sources)) # 5
-print(
-    len(
-        [
-            s
-            for s in remove_top_limitation_sources
-            if s in turned_on_remaining_webcrawlers
-        ]
-    )
-) # 5
+print(len([s for s in remove_top_limitation_sources if s in turned_on_remaining_webcrawlers])) # 5
config_generation/xmls/delete_template.xml (2 changes: 1 addition & 1 deletion)

@@ -19,4 +19,4 @@
<OtherOptions></OtherOptions>
</Java>
<IsSqlPattern>false</IsSqlPattern>
-</Sinequa>
\ No newline at end of file
+</Sinequa>
document_classifier/encoder.py (14 changes: 2 additions & 12 deletions)

@@ -72,12 +72,7 @@ def extract_text(self, text):
"""

keywords = (
self.image_keyword
+ self.software_keyword
+ self.mission_keyword
+ self.training_keyword
)
keywords = self.image_keyword + self.software_keyword + self.mission_keyword + self.training_keyword
software_count, mission_count, image_count, training_count = 0, 0, 0, 0
word_positions = {}
start = -1
@@ -103,12 +98,7 @@ def extract_text(self, text):
                    training_count = training_count + 1
                word_positions[word].append((start, end))

-        if (
-            software_count == 0
-            and mission_count == 0
-            and image_count == 0
-            and training_count == 0
-        ):
+        if software_count == 0 and mission_count == 0 and image_count == 0 and training_count == 0:
            mid = int(len(text) / 2)
            start_pos = mid - 512
            end_pos = mid + 512 # in terms of characters
document_classifier/model.py (10 changes: 4 additions & 6 deletions)

@@ -58,16 +58,14 @@ def make_model(self):
        tokenizer_class = getattr(transformers, self.config["model"])
        # Load the tokenizer and model
        self.tokenizer = tokenizer_class.from_pretrained(self.config["model_type"])
-        self.model = model_class.from_pretrained(
-            self.config["model_type"], num_labels=self.config["num_labels"]
-        ).to(self.device)
+        self.model = model_class.from_pretrained(self.config["model_type"], num_labels=self.config["num_labels"]).to(
+            self.device
+        )
        return self.model, self.tokenizer

    def load_model(self):
        """This function loads the models and processes the data for evaluation"""
-        self.state_dict = torch.load(
-            self.config["saved_model_name"], map_location=self.device
-        )
+        self.state_dict = torch.load(self.config["saved_model_name"], map_location=self.device)
        model1, _ = self.make_model()
        model1.load_state_dict(self.state_dict)
        return model1
document_classifier/preprocessing.py (9 changes: 2 additions & 7 deletions)

@@ -4,7 +4,6 @@
import pandas as pd
import requests
from bs4 import BeautifulSoup
-
from Document_Classifier_inference.async_scraper import get_text_table, scraper


@@ -64,9 +63,7 @@ def remove_header_footer(self):
            text = get_text_table(soup)
            text = re.sub(r"\W+", " ", text)
            if text == "" or text is None:
-                soup, text = asyncio.get_event_loop().run_until_complete(
-                    scraper(each_url)
-                )
+                soup, text = asyncio.get_event_loop().run_until_complete(scraper(each_url))
            result = soup.find("header")
            if result:
                result.extract() # removing header element from the HTML code
@@ -102,7 +99,5 @@ def preprocessed_features(self):
        lists of urls with pdf reponse, and lists of urls with image response.
        """
        self.remove_header_footer()
-        self.data["soup"] = self.data["soup"].apply(
-            lambda x: re.sub(r"\W+", " ", get_text_table(x).strip())
-        )
+        self.data["soup"] = self.data["soup"].apply(lambda x: re.sub(r"\W+", " ", get_text_table(x).strip()))
        return self.data, self.pdf_lists, self.image_lists