Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

small zulip and dogsheep updates #331

Merged
merged 2 commits into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions my/hackernews/dogsheep.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,19 @@
from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator, Sequence, Optional

from my.core import get_files, Paths, Res
from my.core import get_files, Paths, Res, datetime_aware
from my.core.sqlite import sqlite_connection
import my.config

from my.config import hackernews as user_config
from .common import hackernews_link


@dataclass
class config(user_config.dogsheep):
class config(my.config.hackernews.dogsheep):
# path[s]/glob to the dogsheep database
export_path: Paths

Expand All @@ -26,24 +27,23 @@ def inputs() -> Sequence[Path]:
return get_files(config.export_path)


from .common import hackernews_link

# TODO not sure if worth splitting into Comment and Story?
@dataclass(unsafe_hash=True)
class Item:
id: str
type: str
# TODO is it utc??
created: datetime
created: datetime_aware # checked and it's utc
title: Optional[str] # only present for Story
text_html: Optional[str] # should be present for Comment and might for Story
url: Optional[str] # might be present for Story
text_html: Optional[str] # should be present for Comment and might for Story
url: Optional[str] # might be present for Story
# todo process 'deleted'? fields?
# todo process 'parent'?

@property
def permalink(self) -> str:
return hackernews_link(self.id)


# TODO hmm kinda annoying that permalink isn't getting serialized
# maybe won't be such a big problem if we used hpi query directly on objects, without jsons?
# so we could just take .permalink thing
Expand All @@ -56,7 +56,7 @@ def items() -> Iterator[Res[Item]]:
yield Item(
id=r['id'],
type=r['type'],
created=datetime.fromtimestamp(r['time']),
created=datetime.fromtimestamp(r['time'], tz=timezone.utc),
title=r['title'],
# todo hmm maybe a method to strip off html tags would be nice
text_html=r['text'],
Expand Down
65 changes: 33 additions & 32 deletions my/zulip/organization.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,37 @@
Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]]
"""
from dataclasses import dataclass
from typing import Sequence, Iterator, Dict
from datetime import datetime, timezone
from itertools import count
import json
from pathlib import Path
from typing import Sequence, Iterator, Dict, Union

from my.core import (
assert_never,
datetime_aware,
get_files,
stat,
Json,
Paths,
Res,
Stats,
)
from my.core.error import notnone
import my.config

from my.config import zulip as user_config

from ..core import Paths
@dataclass
class organization(user_config.organization):
class organization(my.config.zulip.organization):
# path[s]/glob to the exported JSON data
export_path: Paths


from pathlib import Path
from ..core import get_files, Json
def inputs() -> Sequence[Path]:
return get_files(organization.export_path)


from datetime import datetime
# TODO: seems like export ids are kinda random..
# not sure what's the best way to figure out the last without renaming?
# could use mtime perhaps?
return get_files(organization.export_path, sort=False)


@dataclass(frozen=True)
Expand All @@ -39,16 +52,11 @@ class Sender:

# from the data, seems that subjects are completely implicit and determined by name?
# streams have ids (can extract from realm/zerver_stream), but unclear how to correlate messages/topics to streams?

@dataclass(frozen=True)
class _Message:
# todo hmm not sure what would be a good field order..
id: int
sent: datetime
# TODO hmm kinda unclear whether it uses UTC or not??
# https://github.com/zulip/zulip/blob/0c2e4eec200d986a9a020f3e9a651d27216e0e85/zerver/models.py#L3071-L3076
# it keeps it tz aware.. but not sure what happens after?
# https://github.com/zulip/zulip/blob/1dfddffc8dac744fd6a6fbfd937018074c8bb166/zproject/computed_settings.py#L151
sent: datetime_aware # double checked and they are in utc
subject: str
sender_id: int
server_id: int
Expand All @@ -60,7 +68,7 @@ class _Message:
@dataclass(frozen=True)
class Message:
id: int
sent: datetime
sent: datetime_aware
subject: str
sender: Sender
server: Server
Expand All @@ -76,23 +84,18 @@ def permalink(self) -> str:
return f'https://{self.server.string_id}.zulipchat.com/#narrow/near/{self.id}'


from typing import Union
from itertools import count
import json
from ..core import Res, assert_never
# todo cache it
def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
# TODO hmm -- not sure if max lexicographically will actually be latest?
last = max(inputs())

subdir = last.with_suffix('').stem # there is a directory inside tar.gz

# todo would be nice to switch it to unpacked dirs as well, similar to ZipPath
# I guess it makes sense to have a special implementation for .tar.gz considering how common they are
import tarfile
from ..core.error import notnone

tfile = tarfile.open(last)

subdir = tfile.getnames()[0] # there is a directory inside tar file, first name should be that

with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo:
rj = json.load(fo)

Expand All @@ -114,20 +117,22 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
for j in rj['zerver_userprofile_crossrealm']: # e.g. zulip bot
yield Sender(
id=j['id'],
full_name=j['email'], # doesn't seem to have anything
full_name=j['email'], # doesn't seem to have anything
email=j['email'],
)

def _parse_message(j: Json) -> _Message:
ds = j['date_sent']
# fmt: off
return _Message(
id = j['id'],
sent = datetime.fromtimestamp(ds),
sent = datetime.fromtimestamp(ds, tz=timezone.utc),
subject = j['subject'],
sender_id = j['sender'],
server_id = server.id,
content = j['content'],
)
# fmt: on

for idx in count(start=1, step=1):
fname = f'messages-{idx:06}.json'
Expand Down Expand Up @@ -172,9 +177,5 @@ def messages() -> Iterator[Res[Message]]:
assert_never(x)


from my.core import Stats
def stats() -> Stats:
from my.core import stat
return {
**stat(messages)
}
return {**stat(messages)}