Skip to content

Commit

Permalink
Merge pull request #12 from internetstandards/domain_normalization
Browse files Browse the repository at this point in the history
fix a small data pollution issue from merklemap
  • Loading branch information
stitch authored Dec 10, 2024
2 parents 6a8c38c + 006cc6c commit 431a6de
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 2 deletions.
12 changes: 10 additions & 2 deletions src/ctlssa/suggestions/logic/domains.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,21 @@ def __init__(self):
self.new_data = []
self.auto_write_batch_size = settings.AUTO_WRITE_BATCH_SIZE

def clean_domain(self, domain: str) -> str:
    """Return *domain* lowercased, without surrounding whitespace or one trailing dot.

    Data sources may be polluted (merklemap is, and ct might have an off-day
    too), so every incoming name is normalized before further processing.
    """
    normalized = domain.lower().strip()

    # DNS entries are stored with a trailing dot and merklemap leaks that
    # notation into its dataset; drop exactly one such dot when present.
    if normalized.endswith("."):
        return normalized[:-1]
    return normalized

def add_domain(self, domain: str, processing_date: date):
# wildcards do not exist in the hostname field, so no need to filter.

# some normalization as a data source might be polluted, merklemap is and there might be an off-day in ct too
domain = domain.lower().strip()
domain = self.clean_domain(domain)

# we can use this shortcut as there are no two level top level domains in the dutch zones
# this approach will fall apart in case of .co.uk domains
# the partition method is the fastest:
rest, delimiter, suffix = domain.rpartition(".")
subdomain, delimiter, domain = rest.rpartition(".")
Expand Down
13 changes: 13 additions & 0 deletions tests/test_ingest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from ctlssa.suggestions.logic.domains import CaseOptimizedBulkInsert
from ctlssa.suggestions.logic.ingest import add_domains, certstream_callback
from ctlssa.suggestions.models import Domain

Expand Down Expand Up @@ -159,3 +160,15 @@ def test_add_domains(db, caplog): # sourcery skip: extract-duplicate-method

# test if logging works correctly, disabled due to flooding
# assert "ingesting" in caplog.text


def test_merklemap_clean_domain():
    """Check that clean_domain normalizes polluted merklemap hostnames."""
    inserter = CaseOptimizedBulkInsert()

    # (raw input, expected normalized output)
    expectations = (
        # the last dot in polluted data is removed
        ("test.nu.nl.", "test.nu.nl"),
        # an already clean name is returned unchanged
        ("test.nu.nl", "test.nu.nl"),
        # upper case and padding are removed
        (" TEST.nu.nl ", "test.nu.nl"),
    )

    for raw, expected in expectations:
        assert inserter.clean_domain(raw) == expected

0 comments on commit 431a6de

Please sign in to comment.