diff --git a/README.org b/README.org
index f6dc088..086b8dc 100644
--- a/README.org
+++ b/README.org
@@ -21,6 +21,10 @@ I write in detail about usecases and motivation for it [[https://beepb00p.xyz/or
- =pip3 install --user .=
- after that you can use =python3 -m orger.modules.modulename=, same way as the previous section, or run =modules/modulename.py= directly
+- [optional]: install [[https://pandoc.org/installing.html][pandoc]], it might give you better org-mode outputs for some modules
+
+ If you do have pandoc installed, but don't want the module to use it, pass the =--disable-pandoc= flag to it.
+
* Usage and examples
I usually run Orger modules overnight via cron.
@@ -89,9 +93,9 @@ print(orger.Queue.__doc__)
#+RESULTS:
:results:
- *Queue* (old name =InteractiveView=): works as a queue, *only previously unseen items* from the data source are appended to the output org-mode file.
+ *Queue* (old name =InteractiveView=): works as a queue, *only previously unseen items* from the data source are added to the output org-mode file.
- To keep track of old/new items, it's using a separate JSON =state= file.
+ To keep track of previously seen items, it's using a separate JSON =state= file.
A typical usecase is a todo list, or a content processing queue.
You can use such a module as you use any other org-mode file: schedule/refile/comment/set priorities, etc.
@@ -102,10 +106,13 @@ print(orger.Queue.__doc__)
You can run such a module as:
- : # initialize the state file first to avoid surprises (you only need to do it once)
- : ./orger_module.py --to /path/to/output.org --state /path/to/state.json --init
- : # after that you can just run it:
- : ./orger_module.py --to /path/to/output.org --state /path/to/state.json
+ : ./orger_module.py --to /path/to/output.org
+
+ This will keep the state file in your user config dir (e.g. =~/.config/orger/=).
+
+ Alternatively, you can pass the state file explicitly:
+
+ : ./orger_module.py --to /path/to/output.org --state /path/to/state.json
* FAQ
- Why are the files output by some modules read only?
diff --git a/modules/github.py b/modules/github.py
index 7528916..2833400 100755
--- a/modules/github.py
+++ b/modules/github.py
@@ -2,8 +2,11 @@
from orger import Mirror
from orger.inorganic import node, link
from orger.common import dt_heading, error
+from orger import pandoc
import my.coding.github as gh
+# todo use later: import my.github.ghexport as gh. also careful about using events() -- need to sort?
+# I guess makes sense to generally expose get_ methods?
class Github(Mirror):
@@ -13,14 +16,22 @@ def get_items(self) -> Mirror.Results:
yield error(e)
continue
# TODO filter only events that have body? e.g. not sure if much point emitting pull requests here
+ summary = e.summary
+ body = e.body
+ if body is None:
+ lines = summary.splitlines(keepends=True)
+ if len(lines) > 1:
+ summary = lines[0].strip()
+ body = ''.join(lines[1:]) # todo meh. hacky, better to extract bodies in the provider properly
+ if body.strip() == '':
+ body = None
+
yield node(
dt_heading(
e.dt,
- link(url=e.link, title=e.summary) if e.link is not None else e.summary
+ link(url=e.link, title=summary) if e.link is not None else summary
),
- # TODO would be nice to convert from markdown to org here
- # TODO use pandoc thingie? make it configurable too
- body=e.body,
+ body=None if body is None else pandoc.to_org(body, from_='gfm'), # github flavored markdown
)
diff --git a/modules/polar.py b/modules/polar.py
index 64638aa..086824b 100755
--- a/modules/polar.py
+++ b/modules/polar.py
@@ -15,13 +15,23 @@
"""
-from orger import StaticView
-from orger.inorganic import node, link
+from orger import Mirror
+from orger.inorganic import node, link, OrgNode
from orger.common import dt_heading
+from orger import pandoc
-class PolarView(StaticView):
+
+class PolarView(Mirror):
def get_items(self):
from my.reading import polar
+
+ def make_comment(c: polar.Comment) -> OrgNode:
+ text = pandoc.to_org(data=c.text, from_='html', logger=self.logger)
+ return node(
+ heading=dt_heading(c.created, text.splitlines()[0]),
+ body=text,
+ )
+
def make_item(res: polar.Result):
if isinstance(res, polar.Error):
# TODO could create error heading from exception automatically? take first line as heading and rest + traceback as the body
@@ -39,10 +49,7 @@ def make_item(res: polar.Result):
heading=dt_heading(hl.created, hl.selection),
tags=hl.tags,
properties=None if hl.color is None else {'POLAR_COLOR': hex2name(hl.color)},
- children=[node(
- heading=dt_heading(c.created, c.text.splitlines()[0]),
- body=html2org(c.text, logger=self.logger),
- ) for c in hl.comments]
+ children=[make_comment(c) for c in hl.comments],
) for hl in book.items]
)
for res in polar.get_entries():
@@ -61,43 +68,6 @@ def hex2name(hexc: str) -> str:
)
-# TODO move to base?
-def html2org(html: str, logger) -> str:
- # meh. for some reason they are converted to \\ otherwise
- html = html.replace('
', '')
-
-
- from subprocess import run, PIPE
- try:
- r = run(
- ['pandoc', '-f', 'html', '-t', 'org', '--wrap=none'],
- check=True,
- input=html.encode('utf8'),
- stdout=PIPE,
- )
- except FileNotFoundError as fe:
- import warnings
- warnings.warn("Please install 'pandoc' to convert HTML to org-mode. See https://pandoc.org/installing.html")
- except Exception as e:
- logger.exception(e)
- else:
- return r.stdout.decode('utf8')
- return html # fallback
-
-
-# TODO decode text incoming from polar?
-
-def test_html2org():
- import logging
- # html = "
and a comment too
multiline!
" - # TODO ok, it's annoying... not sure what to do with nonpritable crap - html = "and a comment too
multiline!
" - assert html2org(html, logger=logging) == r''' -and a /comment/ too - -*multiline*! -'''.lstrip() - if __name__ == '__main__': PolarView.main() diff --git a/modules/reddit.py b/modules/reddit2org.py similarity index 94% rename from modules/reddit.py rename to modules/reddit2org.py index 834b3bd..0ac6edb 100755 --- a/modules/reddit.py +++ b/modules/reddit2org.py @@ -2,13 +2,14 @@ """ Better interface for reading saved reddit posts/comments """ -from orger import InteractiveView +from orger import Mirror from orger.inorganic import node, link from orger.common import dt_heading from my.reddit import saved -class RedditView(InteractiveView): + +class RedditView(Mirror): def get_items(self): for s in saved(): yield s.sid, node( @@ -21,6 +22,7 @@ def get_items(self): body=s.text, ) + # todo this could be generic, i.e. checking all urls? def is_dead_url(self, url: str) -> bool: assert self.cmdline_args is not None if not self.cmdline_args.mark_dead: diff --git a/modules/roamresearch.py b/modules/roamresearch.py index bc2d7e0..fe49364 100755 --- a/modules/roamresearch.py +++ b/modules/roamresearch.py @@ -2,26 +2,14 @@ from itertools import chain from typing import Iterable -from orger import StaticView +from orger import Mirror from orger.inorganic import node, link, OrgNode from orger.common import dt_heading +from orger import pandoc import my.roamresearch as roamresearch -from subprocess import run, PIPE - -def md2org(text: str) -> str: - # TODO use batch?? or talk to a process - r = run( - ['pandoc', '-f', 'markdown', '-t', 'org', '--wrap=none'], - check=True, - input=text.encode('utf8'), - stdout=PIPE, - ) - return r.stdout.decode('utf8') - - # todo ^^ ^^ things are highlight? def roam_text_to_org(text: str) -> str: """ @@ -31,7 +19,7 @@ def roam_text_to_org(text: str) -> str: ('{{[[slider]]}}', ''), ]: text = text.replace(f, t) - org = md2org(text) + org = pandoc.to_org(text, from_='markdown') org = org.replace(r'\_', '_') # unescape, it's a bit aggressive.. 
return org @@ -87,7 +75,7 @@ def roam_note_to_org(node: roamresearch.Node, top=False) -> Iterable[OrgNode]: ) -class RoamView(StaticView): +class RoamView(Mirror): def get_items(self): rr = roamresearch.roam() from concurrent.futures import ThreadPoolExecutor diff --git a/modules/youtube.py b/modules/youtube.py index f8ae0d0..37c5f13 100755 --- a/modules/youtube.py +++ b/modules/youtube.py @@ -3,25 +3,26 @@ from orger.inorganic import node, link from orger.common import dt_heading -from my.media.youtube import get_watched +from my.media.youtube import watched from itertools import groupby class YoutubeView(Mirror): def get_items(self) -> Mirror.Results: - watched = get_watched() by_url = lambda w: w.url by_when = lambda w: w.when items = [ max(group, key=by_when) - for _, group in groupby(sorted(watched, key=by_url), key=by_url) + for _, group in groupby(sorted(watched(), key=by_url), key=by_url) ] items = sorted(items, key=by_when) # TODO for each url only take latest? for item in items: + deleted = item.url == item.title # todo move to HPI? + l = link(title=item.title + (' (DELETED)' if deleted else ''), url=item.url) yield (item.url, node( - heading=dt_heading(item.when, link(title=item.title, url=item.url)), + heading=dt_heading(item.when, l), )) diff --git a/scripts/ci/run b/scripts/ci/run index 82d691f..0d4e0e0 100755 --- a/scripts/ci/run +++ b/scripts/ci/run @@ -21,7 +21,8 @@ if ! 
[ -z "$CI" ]; then fi # vim is used in one of the tests -command -v vim || sudo apt install vim +command -v vim || sudo apt install vim +command -v pandoc || sudo apt install pandoc pip3 install --user tox tox diff --git a/setup.py b/setup.py index 5cfde5d..91d8e72 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,10 @@ def main(): author_email='karlicoss@gmail.com', description='Converts data into org-mode', - install_requires=['atomicwrites'], + install_requires=[ + 'appdirs' , # to keep state files + 'atomicwrites', # to safely append data to a file + ], extras_require={ 'testing': ['pytest'], 'linting': [ diff --git a/src/orger/common.py b/src/orger/common.py index 59004b5..e334c1b 100644 --- a/src/orger/common.py +++ b/src/orger/common.py @@ -1,11 +1,13 @@ from datetime import datetime from typing import Optional +from pathlib import Path from .inorganic import OrgNode, timestamp, timestamp_with_style, TimestampStyle class settings: DEFAULT_TIMESTAMP_STYLE = TimestampStyle.INACTIVE + USE_PANDOC: bool = True def dt_heading(dt: Optional[datetime], heading: str) -> str: @@ -44,3 +46,8 @@ def todo(dt: datetime, **kwargs): # todo use klogging2? from .klogging import LazyLogger, setup_logger + + +def orger_user_dir() -> Path: + import appdirs # type: ignore[import] + return Path(appdirs.user_config_dir('orger')) diff --git a/src/orger/org_view.py b/src/orger/org_view.py index e573cbd..74017e9 100644 --- a/src/orger/org_view.py +++ b/src/orger/org_view.py @@ -11,7 +11,7 @@ from .inorganic import OrgNode, TimestampStyle from .state import JsonState from .atomic_append import PathIsh, atomic_append_check, assert_not_edited -from .common import setup_logger +from .common import setup_logger, orger_user_dir # TODO tests for determinism? not sure where should they be... # think of some generic thing to test that? 
@@ -61,10 +61,20 @@ def main_common(self) -> None: settings.DEFAULT_TIMESTAMP_STYLE = _style_map[timestamp_style] setup_logger(self.logger, level=logging.DEBUG) + pandoc = self.args.pandoc + settings.USE_PANDOC = pandoc + @classmethod def parser(cls) -> ArgumentParser: - p = ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + F = lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, width=120) + p = argparse.ArgumentParser(formatter_class=F) # type: ignore + p.add_argument( + '--disable-pandoc', + action='store_false', + dest='pandoc', + help='Pass to disable pandoc conversions to org-mode (it might be slow in some cases)', + ) p.add_argument( '--timestamps', type=str, @@ -93,7 +103,7 @@ class Mirror(OrgView): @classmethod def main(cls, setup_parser=None) -> None: p = cls.parser() - p.add_argument('--to', type=Path, default=Path(cls.name() + '.org')) + p.add_argument('--to', type=Path, default=Path(cls.name() + '.org'), help='Filename to output') if setup_parser is not None: setup_parser(p) @@ -163,9 +173,9 @@ def test(): class Queue(OrgView): """ - *Queue* (old name =InteractiveView=): works as a queue, *only previously unseen items* from the data source are appended to the output org-mode file. + *Queue* (old name =InteractiveView=): works as a queue, *only previously unseen items* from the data source are added to the output org-mode file. - To keep track of old/new items, it's using a separate JSON =state= file. + To keep track of previously seen iteems, it's using a separate JSON =state= file. A typical usecase is a todo list, or a content processing queue. You can use such a module as you use any other org-mode file: schedule/refile/comment/set priorities, etc. @@ -183,8 +193,16 @@ def _run( dry_run: bool=False, ) -> None: if not to.exists() and not init: - raise RuntimeError(f"target {to} doesn't exist! Try running with --init") + err = RuntimeError(f"{to} doesn't exist! 
Try running with --init") + import sys + if sys.stdin.isatty(): + resp = input(f"{to} doesn't exist. Create empty file? y/n ").strip().lower() + if resp != 'y': + raise err + else: + raise err + state_path.parent.mkdir(parents=True, exist_ok=True) # not sure... state = JsonState( path=state_path, logger=self.logger, @@ -221,11 +239,12 @@ def get_items(self) -> Iterable[OrgWithKey]: @classmethod def main(cls, setup_parser=None) -> None: + default_state = orger_user_dir() / 'states' / (cls.name() + '.state.json') p = cls.parser() - p.add_argument('--to' , type=Path, default=Path(cls.name() + '.org') , help='file where new items are appended') - p.add_argument('--state', type=Path, default=Path(cls.name() + '.state.json'), help='state file for keeping track of handled items') - p.add_argument('--init', action='store_true') - p.add_argument('--dry-run', action='store_true') + p.add_argument('--to' , type=Path, default=Path(cls.name() + '.org') , help='file where new items are added') + p.add_argument('--state', type=Path, default=default_state, help='state file for keeping track of handled items') + p.add_argument('--init', action='store_true') # todo not sure if I really need it? + p.add_argument('--dry-run', action='store_true', help='Run without modifying the state file') if setup_parser is not None: setup_parser(p) diff --git a/src/orger/pandoc.py b/src/orger/pandoc.py new file mode 100644 index 0000000..0257e62 --- /dev/null +++ b/src/orger/pandoc.py @@ -0,0 +1,52 @@ +""" +Helper for converting stuff to pandoc +""" +import logging +import shutil +from subprocess import run, PIPE +from typing import Optional + + +from .common import settings + +if settings.USE_PANDOC: + has_pandoc = shutil.which('pandoc') is not None + + if not has_pandoc: + import warnings + warnings.warn("Please install 'pandoc' to convert HTML to org-mode. 
See https://pandoc.org/installing.html") + settings.USE_PANDOC = False + + +def to_org(data: str, *, from_: str, logger=logging) -> str: + if not settings.USE_PANDOC: + return data + # TODO batch?? + + # meh. for some reason they are converted to \\ otherwise + if from_ == 'html': + data = data.replace('and a comment too
multiline!
" + # TODO ok, it's annoying... not sure what to do with nonpritable crap + html = "and a comment too
multiline!
" + assert to_org(data=html, from_='html') == r''' +and a /comment/ too + +*multiline*! +'''.lstrip()