Skip to content

Commit

Permalink
my.github.gdpr/my.zulip.organization: use kompress support for tar.gz…
Browse files Browse the repository at this point in the history
… if it's available

otherwise fall back onto unpacking into tmp dir via my.core.structure
  • Loading branch information
karlicoss committed Sep 18, 2024
1 parent 201ddd4 commit ec25c10
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 77 deletions.
6 changes: 2 additions & 4 deletions my/core/kompress.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .internal import assert_subpackage; assert_subpackage(__name__)

from . import warnings

# do this later -- for now need to transition modules to avoid using kompress directly (e.g. ZipPath)
Expand All @@ -8,10 +9,7 @@
from kompress import *
except ModuleNotFoundError as e:
if e.name == 'kompress':
warnings.high('Please install kompress (pip3 install kompress), it will be required in the future. Falling onto vendorized kompress for now.')
warnings.high('Please install kompress (pip3 install kompress). Falling onto vendorized kompress for now.')
from ._deprecated.kompress import * # type: ignore[assignment]
else:
raise e

# this is deprecated in kompress, keep here for backwards compatibility
open = kopen # noqa: F405
96 changes: 58 additions & 38 deletions my/github/gdpr.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,42 @@
"""
Github data (uses [[https://github.com/settings/admin][official GDPR export]])
"""
from dataclasses import dataclass

from __future__ import annotations

import json
from abc import abstractmethod
from pathlib import Path
import tarfile
from typing import Iterable, Any, Sequence, Dict, Optional
from typing import Any, Iterator, Sequence

from my.core import get_files, Res, PathIsh, stat, Stats, make_logger
from my.core.cfg import make_config
from my.core.error import notnone, echain
from my.core import Paths, Res, Stats, get_files, make_logger, stat, warnings
from my.core.error import echain

from .common import Event, parse_dt, EventIds
from .common import Event, EventIds, parse_dt

# TODO later, use a separate user config? (github_gdpr)
from my.config import github as user_config
logger = make_logger(__name__)


@dataclass
class github(user_config):
gdpr_dir: PathIsh # path to unpacked GDPR archive
class config:
@property
@abstractmethod
def gdpr_dir(self) -> Paths:
raise NotImplementedError


config = make_config(github)
def make_config() -> config:
# TODO later, use a separate user config? (github_gdpr)
from my.config import github as user_config

class combined_config(user_config, config):
pass

logger = make_logger(__name__)
return combined_config()


def inputs() -> Sequence[Path]:
gdir = config.gdpr_dir
res = get_files(gdir)
gdpr_dir = make_config().gdpr_dir
res = get_files(gdpr_dir)
schema_json = [f for f in res if f.name == 'schema.json']
was_unpacked = len(schema_json) > 0
if was_unpacked:
Expand All @@ -43,22 +49,37 @@ def inputs() -> Sequence[Path]:
return res


def events() -> Iterable[Res[Event]]:
def events() -> Iterator[Res[Event]]:
last = max(inputs())

logger.info(f'extracting data from {last}')

# a bit naughty and ad-hoc, but we will generify reading from tar.gz. once we have more examples
# another one is zulip archive
if last.is_dir():
files = sorted(last.glob('*.json')) # looks like all files are in the root
open_file = lambda f: f.open()
root: Path | None = None

if last.is_dir(): # if it's already CPath, this will match it
root = last
else:
# treat as .tar.gz
tfile = tarfile.open(last)
files = sorted(map(Path, tfile.getnames()))
files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./
try:
from kompress import CPath

root = CPath(last)
assert len(list(root.iterdir())) > 0 # trigger to check if we have the kompress version with targz support
except Exception as e:
logger.exception(e)
warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking to tmp dir.")

if root is None:
from my.core.structure import match_structure

with match_structure(last, expected=()) as res: # expected=() matches it regardless any patterns
[root] = res
yield from _process_one(root)
else:
yield from _process_one(root)


def _process_one(root: Path) -> Iterator[Res[Event]]:
files = sorted(root.glob('*.json')) # looks like all files are in the root

# fmt: off
handler_map = {
Expand Down Expand Up @@ -100,8 +121,7 @@ def events() -> Iterable[Res[Event]]:
# ignored
continue

with open_file(f) as fo:
j = json.load(fo)
j = json.loads(f.read_text())
for r in j:
try:
yield handler(r)
Expand All @@ -116,7 +136,7 @@ def stats() -> Stats:


# TODO typing.TypedDict could be handy here..
def _parse_common(d: Dict) -> Dict:
def _parse_common(d: dict) -> dict:
url = d['url']
body = d.get('body')
return {
Expand All @@ -126,7 +146,7 @@ def _parse_common(d: Dict) -> Dict:
}


def _parse_repository(d: Dict) -> Event:
def _parse_repository(d: dict) -> Event:
pref = 'https://github.com/'
url = d['url']
dts = d['created_at']
Expand All @@ -142,13 +162,13 @@ def _parse_repository(d: Dict) -> Event:


# user may be None if the user was deleted
def _is_bot(user: Optional[str]) -> bool:
def _is_bot(user: str | None) -> bool:
if user is None:
return False
return "[bot]" in user


def _parse_issue_comment(d: Dict) -> Event:
def _parse_issue_comment(d: dict) -> Event:
url = d['url']
return Event(
**_parse_common(d),
Expand All @@ -158,7 +178,7 @@ def _parse_issue_comment(d: Dict) -> Event:
)


def _parse_issue(d: Dict) -> Event:
def _parse_issue(d: dict) -> Event:
url = d['url']
title = d['title']
return Event(
Expand All @@ -169,7 +189,7 @@ def _parse_issue(d: Dict) -> Event:
)


def _parse_pull_request(d: Dict) -> Event:
def _parse_pull_request(d: dict) -> Event:
dts = d['created_at']
url = d['url']
title = d['title']
Expand All @@ -183,7 +203,7 @@ def _parse_pull_request(d: Dict) -> Event:
)


def _parse_project(d: Dict) -> Event:
def _parse_project(d: dict) -> Event:
url = d['url']
title = d['name']
is_bot = "[bot]" in d["creator"]
Expand All @@ -198,7 +218,7 @@ def _parse_project(d: Dict) -> Event:
)


def _parse_release(d: Dict) -> Event:
def _parse_release(d: dict) -> Event:
tag = d['tag_name']
return Event(
**_parse_common(d),
Expand All @@ -207,7 +227,7 @@ def _parse_release(d: Dict) -> Event:
)


def _parse_commit_comment(d: Dict) -> Event:
def _parse_commit_comment(d: dict) -> Event:
url = d['url']
return Event(
**_parse_common(d),
Expand Down
91 changes: 63 additions & 28 deletions my/zulip/organization.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,55 @@
"""
Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]]
"""

from __future__ import annotations

import json
from abc import abstractmethod
from dataclasses import dataclass
from datetime import datetime, timezone
from itertools import count
import json
from pathlib import Path
from typing import Sequence, Iterator, Dict, Union
from typing import Iterator, Sequence

from my.core import (
assert_never,
datetime_aware,
get_files,
stat,
Json,
Paths,
Res,
Stats,
assert_never,
datetime_aware,
get_files,
make_logger,
stat,
warnings,
)
from my.core.error import notnone
import my.config

logger = make_logger(__name__)

@dataclass
class organization(my.config.zulip.organization):
# paths[s]/glob to the exported JSON data
export_path: Paths

class config:
@property
@abstractmethod
def export_path(self) -> Paths:
"""paths[s]/glob to the exported JSON data"""
raise NotImplementedError


def make_config() -> config:
from my.config import zulip as user_config

class combined_config(user_config.zulip.organization, config):
pass

return combined_config()


def inputs() -> Sequence[Path]:
# TODO: seems like export ids are kinda random..
# not sure what's the best way to figure out the last without renaming?
# could use mtime perhaps?
return get_files(organization.export_path, sort=False)
return get_files(make_config().export_path, sort=False)


@dataclass(frozen=True)
Expand Down Expand Up @@ -85,19 +102,39 @@ def permalink(self) -> str:


# todo cache it
def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
def _entities() -> Iterator[Res[Server | Sender | _Message]]:
last = max(inputs())

# todo would be nice to switch it to unpacked dirs as well, similar to ZipPath
# I guess makes sense to have a special implementation for .tar.gz considering how common are they
import tarfile
logger.info(f'extracting data from {last}')

root: Path | None = None

if last.is_dir(): # if it's already CPath, this will match it
root = last
else:
try:
from kompress import CPath

root = CPath(last)
assert len(list(root.iterdir())) > 0 # trigger to check if we have the kompress version with targz support
except Exception as e:
logger.exception(e)
warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking to tmp dir.")

if root is None:
from my.core.structure import match_structure

with match_structure(last, expected=()) as res: # expected=() matches it regardless any patterns
[root] = res
yield from _process_one(root)
else:
yield from _process_one(root)

tfile = tarfile.open(last)

subdir = tfile.getnames()[0] # there is a directory inside tar file, first name should be that
def _process_one(root: Path) -> Iterator[Res[Server | Sender | _Message]]:
[subdir] = root.iterdir() # there is a directory inside tar file, first name should be that

with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo:
rj = json.load(fo)
rj = json.loads((subdir / 'realm.json').read_text())

[sj] = rj['zerver_realm']
server = Server(
Expand Down Expand Up @@ -136,12 +173,10 @@ def _parse_message(j: Json) -> _Message:

for idx in count(start=1, step=1):
fname = f'messages-{idx:06}.json'
fpath = f'{subdir}/{fname}'
if fpath not in tfile.getnames():
# tarfile doesn't have .exists?
fpath = subdir / fname
if not fpath.exists():
break
with notnone(tfile.extractfile(fpath)) as fo:
mj = json.load(fo)
mj = json.loads(fpath.read_text())
# TODO handle zerver_usermessage
for j in mj['zerver_message']:
try:
Expand All @@ -151,8 +186,8 @@ def _parse_message(j: Json) -> _Message:


def messages() -> Iterator[Res[Message]]:
id2sender: Dict[int, Sender] = {}
id2server: Dict[int, Server] = {}
id2sender: dict[int, Sender] = {}
id2server: dict[int, Server] = {}
for x in _entities():
if isinstance(x, Exception):
yield x
Expand Down
14 changes: 7 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
from setuptools import setup, find_namespace_packages # type: ignore

INSTALL_REQUIRES = [
'pytz', # even though it's not needed by the core, it's so common anyway...
'typing-extensions', # one of the most common pypi packages, ok to depend for core
'appdirs', # very common, and makes it portable
'more-itertools', # it's just too useful and very common anyway
'decorator' , # less pain in writing correct decorators. very mature and stable, so worth keeping in core
'click>=8.1' , # for the CLI, printing colors, decorator-based - may allow extensions to CLI
'kompress' , # for transparent access to compressed files via pathlib.Path
'pytz' , # even though it's not needed by the core, it's so common anyway...
'typing-extensions' , # one of the most common pypi packages, ok to depend for core
'appdirs' , # very common, and makes it portable
'more-itertools' , # it's just too useful and very common anyway
'decorator' , # less pain in writing correct decorators. very mature and stable, so worth keeping in core
'click>=8.1' , # for the CLI, printing colors, decorator-based - may allow extensions to CLI
'kompress>=0.2.20240918' , # for transparent access to compressed files via pathlib.Path
]


Expand Down

0 comments on commit ec25c10

Please sign in to comment.