Skip to content

Commit

Permalink
✨ (documents) Iterate through changed keys
Browse files Browse the repository at this point in the history
  • Loading branch information
simonwoerpel committed Jan 6, 2025
1 parent bef43d4 commit ed642fd
Showing 1 changed file with 26 additions and 1 deletion.
27 changes: 26 additions & 1 deletion leakrfc/archive/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import pandas as pd
from anystore.io import DoesNotExist, logged_io_items
from anystore.types import StrGenerator
from ftmq.types import CEGenerator

from leakrfc.archive.cache import get_cache
Expand Down Expand Up @@ -157,7 +158,9 @@ def open(

def get_versions(self) -> list[str]:
keys: list[str] = []
glob = self.dataset._make_path(self.dataset.metadata_prefix, "documents.*.diff")
glob = self.dataset._make_path(
self.dataset.metadata_prefix, "documents.csv.*.diff"
)
for key in self.dataset._storage.iterate_keys(glob=glob):
ts = key[:-5].split("documents.csv.")[-1]
keys.append(ts)
Expand All @@ -168,3 +171,25 @@ def get_current_version(self) -> str:
if revs:
return revs[-1]
return ""

def get_keys_added(self, version: str) -> StrGenerator:
    """
    Yield the keys of the rows added in a given documents diff version.

    Streams ``documents.csv.{version}.diff`` from the dataset's metadata
    prefix and yields the first CSV column of every line that carries a
    leading "+" marker. Lines starting with "+++" are skipped (presumably
    the unified diff file header -- confirm against the diff writer).

    Args:
        version: Version identifier embedded in the diff file name.

    Yields:
        The first column of each added row, without the diff marker.
    """
    key = f"documents.csv.{version}.diff"
    path = self.dataset._make_path(self.dataset.metadata_prefix, key)
    for line in self.dataset._storage.stream(path, mode="r"):
        # NOTE(review): a key that itself begins with "++" is indistinguishable
        # from the header here and is skipped as well.
        if not line.startswith("+") or line.startswith("+++"):
            continue
        # Strip the one-character diff marker *before* CSV parsing: otherwise
        # the marker ends up inside the key and a quoted first field is no
        # longer recognised as quoted (the quote is not at the field start).
        row = next(csv.reader([line[1:]]), None)
        # An added blank line parses to an empty row and carries no key.
        if row:
            yield row[0]

def get_keys_deleted(self, version: str) -> StrGenerator:
    """
    Yield the keys of the rows deleted in a given documents diff version.

    Streams ``documents.csv.{version}.diff`` from the dataset's metadata
    prefix and yields the first CSV column of every line that carries a
    leading "-" marker. Lines starting with "---" are skipped (presumably
    the unified diff file header -- confirm against the diff writer).

    Args:
        version: Version identifier embedded in the diff file name.

    Yields:
        The first column of each deleted row, without the diff marker.
    """
    key = f"documents.csv.{version}.diff"
    path = self.dataset._make_path(self.dataset.metadata_prefix, key)
    for line in self.dataset._storage.stream(path, mode="r"):
        # NOTE(review): a key that itself begins with "--" is indistinguishable
        # from the header here and is skipped as well.
        if not line.startswith("-") or line.startswith("---"):
            continue
        # Strip the one-character diff marker *before* CSV parsing: otherwise
        # the marker ends up inside the key and a quoted first field is no
        # longer recognised as quoted (the quote is not at the field start).
        row = next(csv.reader([line[1:]]), None)
        # A deleted blank line parses to an empty row and carries no key.
        if row:
            yield row[0]

0 comments on commit ed642fd

Please sign in to comment.