
Commit caceb37
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Aug 23, 2024
1 parent 9491a25 commit caceb37
Showing 63 changed files with 178 additions and 359 deletions.
README.md (2 changes: 1 addition & 1 deletion)

@@ -84,7 +84,7 @@ Navigate to the server running prod, then to the project folder. Run the following
```bash
docker-compose -f production.yml run --rm --user root django python manage.py dumpdata --natural-foreign --natural-primary --exclude=contenttypes --exclude=auth.Permission --indent 2 --output /app/backups/prod_backup-20240812.json
```
-This will have saved the backup in a folder outside of the docker container. Now you can copy it to your local machine. 
+This will have saved the backup in a folder outside of the docker container. Now you can copy it to your local machine.

```bash
mv ~/prod_backup-20240812.json <project_path>/prod_backup-20240812.json
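# Hypothetical follow-up, not part of this commit: once the dump is in the
# project folder, it can typically be restored into the local database with
# Django's loaddata command. The local.yml file and django service name are
# assumptions in the cookiecutter-django style, mirroring the production.yml
# invocation above.
docker-compose -f local.yml run --rm django python manage.py loaddata prod_backup-20240812.json
```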
config/settings/local.py (4 changes: 1 addition & 3 deletions)

@@ -26,9 +26,7 @@
# EMAIL
# ------------------------------------------------------------------------------
# https://docs.djangoproject.com/en/dev/ref/settings/#email-backend
-EMAIL_BACKEND = env(
-    "DJANGO_EMAIL_BACKEND", default="django.core.mail.backends.console.EmailBackend"
-)
+EMAIL_BACKEND = env("DJANGO_EMAIL_BACKEND", default="django.core.mail.backends.console.EmailBackend")

# django-debug-toolbar
# ------------------------------------------------------------------------------
config/settings/production.py (23 changes: 5 additions & 18 deletions)

@@ -13,9 +13,7 @@
# https://docs.djangoproject.com/en/dev/ref/settings/#secret-key
SECRET_KEY = env("DJANGO_SECRET_KEY")
# https://docs.djangoproject.com/en/dev/ref/settings/#allowed-hosts
-ALLOWED_HOSTS = env.list(
-    "DJANGO_ALLOWED_HOSTS", default=["sde-indexing-helper.nasa-impact.net"]
-)
+ALLOWED_HOSTS = env.list("DJANGO_ALLOWED_HOSTS", default=["sde-indexing-helper.nasa-impact.net"])

# DATABASES
# ------------------------------------------------------------------------------
@@ -36,15 +34,11 @@
# TODO: set this to 60 seconds first and then to 518400 once you prove the former works
SECURE_HSTS_SECONDS = 60
# https://docs.djangoproject.com/en/dev/ref/settings/#secure-hsts-include-subdomains
-SECURE_HSTS_INCLUDE_SUBDOMAINS = env.bool(
-    "DJANGO_SECURE_HSTS_INCLUDE_SUBDOMAINS", default=True
-)
+SECURE_HSTS_INCLUDE_SUBDOMAINS = env.bool("DJANGO_SECURE_HSTS_INCLUDE_SUBDOMAINS", default=True)
# https://docs.djangoproject.com/en/dev/ref/settings/#secure-hsts-preload
SECURE_HSTS_PRELOAD = env.bool("DJANGO_SECURE_HSTS_PRELOAD", default=True)
# https://docs.djangoproject.com/en/dev/ref/middleware/#x-content-type-options-nosniff
-SECURE_CONTENT_TYPE_NOSNIFF = env.bool(
-    "DJANGO_SECURE_CONTENT_TYPE_NOSNIFF", default=True
-)
+SECURE_CONTENT_TYPE_NOSNIFF = env.bool("DJANGO_SECURE_CONTENT_TYPE_NOSNIFF", default=True)

# STORAGES
# ------------------------------------------------------------------------------
@@ -61,9 +55,7 @@
# DO NOT change these unless you know what you're doing.
_AWS_EXPIRY = 60 * 60 * 24 * 7
# https://django-storages.readthedocs.io/en/latest/backends/amazon-S3.html#settings
-AWS_S3_OBJECT_PARAMETERS = {
-    "CacheControl": f"max-age={_AWS_EXPIRY}, s-maxage={_AWS_EXPIRY}, must-revalidate"
-}
+AWS_S3_OBJECT_PARAMETERS = {"CacheControl": f"max-age={_AWS_EXPIRY}, s-maxage={_AWS_EXPIRY}, must-revalidate"}
# https://django-storages.readthedocs.io/en/latest/backends/amazon-S3.html#settings
AWS_S3_MAX_MEMORY_SIZE = env.int(
"DJANGO_AWS_S3_MAX_MEMORY_SIZE",
Expand Down Expand Up @@ -128,12 +120,7 @@
LOGGING = {
"version": 1,
"disable_existing_loggers": True,
"formatters": {
"verbose": {
"format": "%(levelname)s %(asctime)s %(module)s "
"%(process)d %(thread)d %(message)s"
}
},
"formatters": {"verbose": {"format": "%(levelname)s %(asctime)s %(module)s " "%(process)d %(thread)d %(message)s"}},
"handlers": {
"console": {
"level": "DEBUG",
config/urls.py (4 changes: 1 addition & 3 deletions)

@@ -4,9 +4,7 @@
from django.urls import include, path
from django.views import defaults as default_views

-admin.site.site_header = (
-    "SDE Indexing Helper Administration" # default: "Django Administration"
-)
+admin.site.site_header = "SDE Indexing Helper Administration" # default: "Django Administration"
admin.site.index_title = "SDE Indexing Helper" # default: "Site administration"
admin.site.site_title = "SDE Indexing Helper" # default: "Django site admin"

config_generation/api.py (12 changes: 3 additions & 9 deletions)

@@ -20,9 +20,7 @@

class Api:
    def __init__(self, server_name: str) -> None:
-        self.headers: dict[str, str] = {
-            "Authorization": f"Bearer {tokens[server_name]}"
-        }
+        self.headers: dict[str, str] = {"Authorization": f"Bearer {tokens[server_name]}"}
        self.app_name: str = server_configs[server_name]["app_name"]
        self.query_name: str = server_configs[server_name]["query_name"]
        self.base_url: str = server_configs[server_name]["base_url"]
@@ -53,15 +51,11 @@ def query(self, term: str):

        return self.process_response(url, payload)

-    def sql(
-        self, source: str, collection: str = "", fetch_all: bool = False
-    ) -> dict[str, Any]:
+    def sql(self, source: str, collection: str = "", fetch_all: bool = False) -> dict[str, Any]:
        url = f"{self.base_url}/api/v1/engine.sql"

        collection_name = f"/{source}/{collection}/"
-        sql_command_all = (
-            "select url1,title,collection from @@ScienceMissionDirectorate"
-        )
+        sql_command_all = "select url1,title,collection from @@ScienceMissionDirectorate"
        if fetch_all:
            sql_command = sql_command_all
        else:
config_generation/config_example.py (4 changes: 1 addition & 3 deletions)
@@ -1,6 +1,4 @@
-from sources_to_scrape import (
-    sources_to_index_test_grid_20240809,
-)
+from sources_to_scrape import sources_to_index_test_grid_20240809

tokens: dict[str, str] = {
"test_server": "token here",
config_generation/delete_server_content.py (8 changes: 7 additions & 1 deletion)

@@ -2,7 +2,13 @@

from db_to_xml_file_based import XmlEditor

-from config import batch_delete_name, collection_list, indexes_to_delete_from, source, engines
+from config import (
+    batch_delete_name,
+    collection_list,
+    engines,
+    indexes_to_delete_from,
+    source,
+)

COMMAND_FILES_PATH = "../sinequa_configs/commands/"
DELETE_COMMAND_TEMPLATE_PATH = "xmls/delete_template.xml"
config_generation/export_collections.py (8 changes: 2 additions & 6 deletions)

@@ -44,17 +44,13 @@
# Create JSON file
print("Creating JSON dump...")
json_data = json.dumps(bulk_data)
-file_path = (
-    f"{TEMP_FOLDER_NAME}/{collection}/urls.json" # Provide the desired file path
-)
+file_path = f"{TEMP_FOLDER_NAME}/{collection}/urls.json" # Provide the desired file path
with open(file_path, "w") as file:
    file.write(json_data)

# Zip the JSON file
print("Creating zip file...")
-zip_file_path = (
-    f"{TEMP_FOLDER_NAME}/{collection}.zip" # Provide the desired zip file path
-)
+zip_file_path = f"{TEMP_FOLDER_NAME}/{collection}.zip" # Provide the desired zip file path
with zipfile.ZipFile(zip_file_path, "w") as zip_file:
    zip_file.write(file_path, os.path.basename(file_path))

config_generation/export_whole_index.py (4 changes: 1 addition & 3 deletions)

@@ -44,9 +44,7 @@

# Upload the zip file to S3
s3_bucket_name = env("DJANGO_AWS_STORAGE_BUCKET_NAME")
-s3_key = (
-    "scraped_urls_all/all_data.zip" # Provide the desired S3 key for the uploaded file
-)
+s3_key = "scraped_urls_all/all_data.zip" # Provide the desired S3 key for the uploaded file
s3_client = boto3.client(
    "s3",
    region_name="us-east-1",
config_generation/generate_collection_list.py (9 changes: 2 additions & 7 deletions)

@@ -22,9 +22,7 @@ def create_xml_path(collection_name):

def get_turned_on_sources():
    # remove sources that were just scraped
-    turned_on_remaining_sources = [
-        source for source in turned_on_sources if source not in already_scraped_sources
-    ]
+    turned_on_remaining_sources = [source for source in turned_on_sources if source not in already_scraped_sources]

    # filter all sources to only webcrawler sources
    turned_on_remaining_webcrawlers = []

@@ -68,8 +66,5 @@ def get_sources_20230605():
    folders = get_all_config_folder_names()
    folders = folders + interrupted_sources
    return [
-        folder
-        for folder in folders
-        if folder not in sources_with_documents_20230605
-        and is_collection_crawler(folder)
+        folder for folder in folders if folder not in sources_with_documents_20230605 and is_collection_crawler(folder)
    ]
config_generation/generate_emac_indexer.py (4 changes: 1 addition & 3 deletions)

@@ -71,9 +71,7 @@
editor.update_or_add_element_value("Description", f"Webcrawler for the {name}")
editor.update_or_add_element_value("Url", url)
editor.update_or_add_element_value("TreeRoot", tree_root)
-editor.update_or_add_element_value(
-    "ShardIndexes", "@SMD_ASTRO_Repository_1,@SMD_ASTRO_Repository_2"
-)
+editor.update_or_add_element_value("ShardIndexes", "@SMD_ASTRO_Repository_1,@SMD_ASTRO_Repository_2")
editor.update_or_add_element_value("ShardingStrategy", "Balanced")

# rule adding
config_generation/generate_scrapers.py (12 changes: 3 additions & 9 deletions)

@@ -17,9 +17,7 @@ def get_or_create_folder(scraper_folder_name="scraping_configs"):
    this gets that folder path
    """

-    scraper_folder_path = os.path.join(
-        os.path.dirname(os.getcwd()), scraper_folder_name
-    )
+    scraper_folder_path = os.path.join(os.path.dirname(os.getcwd()), scraper_folder_name)
    # creates the folder if it doesn't already exist
    create_folder(scraper_folder_path)

@@ -32,18 +30,14 @@ def get_scraper_folder(scraper_folder_name="scraping_configs"):
    this gets that folder path
    """

-    scraper_folder_path = os.path.join(
-        os.path.dirname(os.getcwd()), scraper_folder_name
-    )
+    scraper_folder_path = os.path.join(os.path.dirname(os.getcwd()), scraper_folder_name)
    # creates the folder if it doesn't already exist
    create_folder(scraper_folder_path)

    return scraper_folder_path


-def generate_brand_new_scraper(
-    source_name, url, scraper_template_path="xmls/scraper_template.xml"
-):
+def generate_brand_new_scraper(source_name, url, scraper_template_path="xmls/scraper_template.xml"):
    """
    Args:
config_generation/minimum_api.py (8 changes: 2 additions & 6 deletions)

@@ -53,18 +53,14 @@ def query(self, term: str, page: int, collection_config_folder=None):

        return self._process_response(response)

-    def sql(
-        self, source: str = "SMD", collection: str = "", fetch_all: bool = False
-    ) -> dict[str, Any]:
+    def sql(self, source: str = "SMD", collection: str = "", fetch_all: bool = False) -> dict[str, Any]:
        if not self.token:
            raise ValueError("you must have a token to use the SQL endpoint")

        url = f"{self.base_url}/api/v1/engine.sql"

        collection_name = f"/{source}/{collection}/"
-        sql_command_all = (
-            "select url1,title,collection from @@ScienceMissionDirectorate"
-        )
+        sql_command_all = "select url1,title,collection from @@ScienceMissionDirectorate"
        if fetch_all:
            sql_command = sql_command_all
        else:
config_generation/preprocess_sources.py (10 changes: 1 addition & 9 deletions)

@@ -47,12 +47,4 @@ def ensure_index_of_root(path):
print(len(turned_on_sources)) # 139
print(len(turned_on_remaining_webcrawlers)) # 114
print(len(remove_top_limitation_sources)) # 5
-print(
-    len(
-        [
-            s
-            for s in remove_top_limitation_sources
-            if s in turned_on_remaining_webcrawlers
-        ]
-    )
-) # 5
+print(len([s for s in remove_top_limitation_sources if s in turned_on_remaining_webcrawlers])) # 5
config_generation/xmls/delete_template.xml (2 changes: 1 addition & 1 deletion)

@@ -19,4 +19,4 @@
<OtherOptions></OtherOptions>
</Java>
<IsSqlPattern>false</IsSqlPattern>
-</Sinequa>
\ No newline at end of file
+</Sinequa>
document_classifier/encoder.py (14 changes: 2 additions & 12 deletions)

@@ -72,12 +72,7 @@ def extract_text(self, text):
"""

keywords = (
self.image_keyword
+ self.software_keyword
+ self.mission_keyword
+ self.training_keyword
)
keywords = self.image_keyword + self.software_keyword + self.mission_keyword + self.training_keyword
software_count, mission_count, image_count, training_count = 0, 0, 0, 0
word_positions = {}
start = -1
@@ -103,12 +98,7 @@ def extract_text(self, text):
                    training_count = training_count + 1
                word_positions[word].append((start, end))

-        if (
-            software_count == 0
-            and mission_count == 0
-            and image_count == 0
-            and training_count == 0
-        ):
+        if software_count == 0 and mission_count == 0 and image_count == 0 and training_count == 0:
            mid = int(len(text) / 2)
            start_pos = mid - 512
            end_pos = mid + 512 # in terms of characters
document_classifier/model.py (10 changes: 4 additions & 6 deletions)

@@ -58,16 +58,14 @@ def make_model(self):
        tokenizer_class = getattr(transformers, self.config["model"])
        # Load the tokenizer and model
        self.tokenizer = tokenizer_class.from_pretrained(self.config["model_type"])
-        self.model = model_class.from_pretrained(
-            self.config["model_type"], num_labels=self.config["num_labels"]
-        ).to(self.device)
+        self.model = model_class.from_pretrained(self.config["model_type"], num_labels=self.config["num_labels"]).to(
+            self.device
+        )
        return self.model, self.tokenizer

    def load_model(self):
        """This function loads the models and processes the data for evaluation"""
-        self.state_dict = torch.load(
-            self.config["saved_model_name"], map_location=self.device
-        )
+        self.state_dict = torch.load(self.config["saved_model_name"], map_location=self.device)
        model1, _ = self.make_model()
        model1.load_state_dict(self.state_dict)
        return model1
document_classifier/preprocessing.py (9 changes: 2 additions & 7 deletions)

@@ -4,7 +4,6 @@
import pandas as pd
import requests
from bs4 import BeautifulSoup
-
from Document_Classifier_inference.async_scraper import get_text_table, scraper


@@ -64,9 +63,7 @@ def remove_header_footer(self):
            text = get_text_table(soup)
            text = re.sub(r"\W+", " ", text)
            if text == "" or text is None:
-                soup, text = asyncio.get_event_loop().run_until_complete(
-                    scraper(each_url)
-                )
+                soup, text = asyncio.get_event_loop().run_until_complete(scraper(each_url))
            result = soup.find("header")
            if result:
                result.extract() # removing header element from the HTML code
@@ -102,7 +99,5 @@ def preprocessed_features(self):
        lists of urls with pdf reponse, and lists of urls with image response.
        """
        self.remove_header_footer()
-        self.data["soup"] = self.data["soup"].apply(
-            lambda x: re.sub(r"\W+", " ", get_text_table(x).strip())
-        )
+        self.data["soup"] = self.data["soup"].apply(lambda x: re.sub(r"\W+", " ", get_text_table(x).strip()))
        return self.data, self.pdf_lists, self.image_lists