Skip to content

Commit

Permalink
my.youtube: use new my.google.takeout.parser module for its data
Browse files Browse the repository at this point in the history
- fallback on the old logic if google_takeout_parser isn't available
- move to my.youtube.takeout (possibly mixing in other sources later)
- keep my.media.youtube, but issue deprecation warning
  currently used in orger etc, so doesn't hurt to keep
- also fixes #113
  • Loading branch information
karlicoss committed Apr 20, 2022
1 parent 915cfe6 commit 78f6ae9
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 50 deletions.
7 changes: 7 additions & 0 deletions my/core/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,10 @@ def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwa

dest.cursor().executescript(tempfile.read())
dest.commit()


# can be removed once python3.9 is the minimum supported version (str.removeprefix is built in there)
def removeprefix(text: str, prefix: str) -> str:
    """Return ``text`` with a leading ``prefix`` stripped.

    If ``text`` does not start with ``prefix`` it is returned unchanged.
    Only a single leading occurrence is removed.
    """
    if text.startswith(prefix):
        return text[len(prefix):]
    return text
Empty file removed my/media/__init__.py
Empty file.
46 changes: 4 additions & 42 deletions my/media/youtube.py
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,43 +1,5 @@
#!/usr/bin/env python3
from datetime import datetime
from typing import NamedTuple, List, Iterable

from ..google.takeout.html import read_html
from ..google.takeout.paths import get_last_takeout


class Watched(NamedTuple):
    """A single watched-video record: where, what and when."""
    # link to the video
    url: str
    # title of the video
    title: str
    # timestamp of the watch event
    when: datetime

    @property
    def eid(self) -> str:
        """Identifier combining the url and the ISO-formatted timestamp."""
        return f'{self.url}-{self.when.isoformat()}'


def watched() -> Iterable[Watched]:
    """Return the watch history read from the last takeout, sorted by time.

    Reads 'Takeout/My Activity/YouTube/MyActivity.html' from whatever
    get_last_takeout returns for that path; if it returns None, an empty
    list is returned.
    """
    # TODO need to use a glob? to make up for old takeouts that didn't start with Takeout/
    # looks like this one doesn't have retention? so enough to use the last
    path = 'Takeout/My Activity/YouTube/MyActivity.html'
    # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
    last = get_last_takeout(path=path)
    if last is None:
        return []

    watches = [
        Watched(url=url, title=title, when=dt)
        for dt, url, title in read_html(last, path)
    ]

    # TODO hmm they already come sorted.. wonder if should just rely on it..
    # sorted() already returns a new list, no need to wrap it
    return sorted(watches, key=lambda e: e.when)


from ..core import stat, Stats
def stats() -> Stats:
    """Return the result of the core ``stat`` helper applied to ``watched``."""
    return stat(watched)


# todo deprecate
get_watched = watched
from ..core.warnings import high
high("DEPRECATED! Please use my.youtube.takeout instead.")
from ..core.util import __NOT_HPI_MODULE__

from ..youtube.takeout import *
120 changes: 120 additions & 0 deletions my/youtube/takeout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
from typing import NamedTuple, List, Iterable

from ..core import datetime_aware, Res, LazyLogger
from ..core.compat import removeprefix


logger = LazyLogger(__name__)


class Watched(NamedTuple):
    """A single watched-video record: where, what and when."""
    # link to the video
    url: str
    # title of the video (watched() strips the 'Watched ' prefix before constructing)
    title: str
    # timestamp of the watch event; annotated as timezone-aware
    when: datetime_aware

    @property
    def eid(self) -> str:
        """Identifier combining the url and the ISO-formatted timestamp."""
        return f'{self.url}-{self.when.isoformat()}'


# todo define error policy?
# although it has one from google takeout module.. so not sure

def watched() -> Iterable[Res[Watched]]:
    """Yield watched YouTube videos found among the Google takeout events.

    Exceptions coming out of the events stream are yielded as is (not raised).
    An activity that passes the header filters but whose url isn't a video
    link (and isn't one of the known harmless cases) is yielded as a
    RuntimeError.

    If my.google.takeout.parser or google_takeout_parser can't be imported,
    the error is logged, a warning is issued, and the results of
    _watched_legacy() are yielded instead.
    """
    try:
        from ..google.takeout.parser import events
        from google_takeout_parser.models import Activity
    except ModuleNotFoundError as ex:
        logger.exception(ex)
        from ..core.warnings import high
        high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
        yield from _watched_legacy()
        return

    YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v='

    # TODO would be nice to filter, e.g. it's kinda pointless to process Location events
    for e in events():
        if isinstance(e, Exception):
            yield e
            # an error carries nothing else to process; don't rely on the
            # Activity check below to skip it
            continue

        if not isinstance(e, Activity):
            continue

        url = e.titleUrl
        header = e.header
        title = e.title

        if url is None:
            continue

        if header in {'Image Search', 'Search', 'Chrome'}:
            # sometimes results in youtube links.. but definitely not watch history
            continue

        if header not in {'YouTube', 'youtube.com'}:
            # TODO hmm -- wonder if these would end up in dupes in takeout? would be nice to check
            # perhaps this would be easier once we have universal ids
            if YOUTUBE_VIDEO_LINK in url:
                # TODO maybe log in this case or something?
                pass
            continue

        if header == 'youtube.com' and title.startswith('Visited '):
            continue

        if title.startswith('Searched for') and url.startswith('https://www.youtube.com/results'):
            # search activity, don't need it here
            continue

        if title.startswith('Subscribed to') and url.startswith('https://www.youtube.com/channel/'):
            # todo might be interesting to process somewhere?
            continue

        # all titles contain it, so pointless to include 'Watched '
        # also compatible with legacy titles
        title = removeprefix(title, 'Watched ')

        if YOUTUBE_VIDEO_LINK not in url:
            if e.details == ['From Google Ads']:
                # weird, sometimes results in odd (non-video) urls
                continue
            if title == 'Used YouTube' and e.products == ['Android']:
                continue

            # anything else is unexpected -- report it, but keep going
            yield RuntimeError(f'Unexpected url: {e}')
            continue

        yield Watched(
            url=url,
            title=title,
            when=e.time,
        )


from ..core import stat, Stats
def stats() -> Stats:
    """Return the result of the core ``stat`` helper applied to ``watched``."""
    return stat(watched)


### deprecated stuff (keep in my.media.youtube)

get_watched = watched


def _watched_legacy() -> Iterable[Watched]:
    """Legacy implementation: read the watch history from the takeout HTML file.

    Reads 'Takeout/My Activity/YouTube/MyActivity.html' from whatever
    get_last_takeout returns for that path and returns the entries sorted by
    time; if get_last_takeout returns None, an empty list is returned.
    """
    from ..google.takeout.html import read_html
    from ..google.takeout.paths import get_last_takeout

    # todo looks like this one doesn't have retention? so enough to use the last
    path = 'Takeout/My Activity/YouTube/MyActivity.html'
    last = get_last_takeout(path=path)
    if last is None:
        return []

    watches = [
        Watched(url=url, title=title, when=dt)
        for dt, url, title in read_html(last, path)
    ]

    # todo hmm they already come sorted.. wonder if should just rely on it..
    # sorted() already returns a new list, no need to wrap it
    return sorted(watches, key=lambda e: e.when)
30 changes: 22 additions & 8 deletions tests/youtube.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,36 @@
# TODO move elsewhere?
# these tests would only make sense with some existing data? although some of them would work for everyone..
# not sure what's a good way of handling this..
from datetime import datetime
import pytz
from more_itertools import bucket


from .common import skip_if_not_karlicoss as pytestmark

# TODO ugh. if i uncomment this here (on top level), then this test vvv fails
# from my.media.youtube import get_watched, Watched
# HPI_TESTS_KARLICOSS=true pytest -raps tests/tz.py tests/youtube.py


def test() -> None:
    """Check that known videos show up in the watch history from my.youtube.takeout."""
    from my.youtube.takeout import watched, Watched
    # watched() may yield exceptions alongside results; only keep actual videos
    videos = [w for w in watched() if not isinstance(w, Exception)]
    assert len(videos) > 1000

    # results in nicer errors, otherwise annoying to check against thousands of videos
    grouped = bucket(videos, key=lambda w: (w.url, w.title))

    w1 = Watched(
        url='https://www.youtube.com/watch?v=hTGJfRPLe08',
        title='Jamie xx - Gosh',
        when=pytz.timezone('Europe/London').localize(datetime(year=2018, month=6, day=21, hour=6, minute=48, second=34)),
    )
    assert w1 in list(grouped[(w1.url, w1.title)])

    w2 = Watched(
        url='https://www.youtube.com/watch?v=IZ_8b_Ydsv0',
        title='Why LESS Sensitive Tests Might Be Better',
        when=pytz.utc.localize(datetime(year=2021, month=1, day=15, hour=17, minute=54, second=12)),
    )
    assert w2 in list(grouped[(w2.url, w2.title)])

0 comments on commit 78f6ae9

Please sign in to comment.