From b08db7c2d0f8977ef75d3a87452bf880f0f31552 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 27 Sep 2024 09:45:58 +0100 Subject: [PATCH] sources.takeout: use reconstruct_comment_content directly to workaround weird formatted json data in takeout see https://github.com/seanbreckenridge/google_takeout_parser/pull/79 --- src/promnesia/sources/filetypes.py | 1 + src/promnesia/sources/takeout.py | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/promnesia/sources/filetypes.py b/src/promnesia/sources/filetypes.py index 92a5d5f3..b0e86ff9 100644 --- a/src/promnesia/sources/filetypes.py +++ b/src/promnesia/sources/filetypes.py @@ -121,6 +121,7 @@ def ignore(*_args, **_kwargs): 'font/woff': ignore, 'text/x-Algol68': ignore, # ugh some license file had this?? maybe always index text/ as text? 'text/x-bytecode.python': ignore, # todo ignore all x-bytecode? + 'text/calendar': ignore, # TODO not sure what to do about these.. 'application/octet-stream': handle_later, diff --git a/src/promnesia/sources/takeout.py b/src/promnesia/sources/takeout.py index 8ebcee01..3a5ce947 100644 --- a/src/promnesia/sources/takeout.py +++ b/src/promnesia/sources/takeout.py @@ -4,7 +4,6 @@ from __future__ import annotations -import json import warnings from typing import Any, Iterable, NamedTuple @@ -142,7 +141,7 @@ def warn_once_if_not_seen(e: Any) -> Iterable[Exception]: url=url, dt=e.dt, context=e.content, locator=Loc(title=e.content, href=url) ) elif imported_yt_csv_models and isinstance(e, CSVYoutubeComment): - contentJSON = json.loads(e.contentJSON) + contentJSON = e.contentJSON content = reconstruct_comment_content(contentJSON, format='text') if isinstance(content, Exception): yield content @@ -160,7 +159,7 @@ def warn_once_if_not_seen(e: Any) -> Iterable[Exception]: url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url) ) elif imported_yt_csv_models and isinstance(e, CSVYoutubeLiveChat): - contentJSON = json.loads(e.contentJSON) + contentJSON = e.contentJSON content = reconstruct_comment_content(contentJSON, format='text') if isinstance(content, Exception): yield content