-
Notifications
You must be signed in to change notification settings - Fork 99
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(Subtitle): Absolutify timestamps on WebVTT downloads
This applies the X-TIMESTAMP-MAP data to timestamps as it reads through a concatenated (merged) WebVTT file to correct timestamps on segmented WebVTT streams. The timescale and segment duration information is saved in the Subtitle's data dictionary under the hls/dash key: timescale (dash-only) and segment_durations. Note that this information will only be available post-download. This is done regardless if you are converting to another subtitle or not, since the downloader automatically and forcefully concatenated the segmented subtitle data. We do not support the use of segmented Subtitles for downloading or otherwise, nor do we plan to.
- Loading branch information
1 parent
af95ba0
commit 229fa0b
Showing
4 changed files
with
257 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
import re | ||
import sys | ||
import typing | ||
from typing import Optional | ||
|
||
from pycaption import Caption, CaptionList, CaptionNode, CaptionReadError, WebVTTReader, WebVTTWriter | ||
|
||
|
||
class CaptionListExt(CaptionList): | ||
@typing.no_type_check | ||
def __init__(self, iterable=None, layout_info=None): | ||
self.first_segment_mpegts = 0 | ||
super().__init__(iterable, layout_info) | ||
|
||
|
||
class CaptionExt(Caption): | ||
@typing.no_type_check | ||
def __init__(self, start, end, nodes, style=None, layout_info=None, segment_index=0, mpegts=0, cue_time=0.0): | ||
style = style or {} | ||
self.segment_index: int = segment_index | ||
self.mpegts: float = mpegts | ||
self.cue_time: float = cue_time | ||
super().__init__(start, end, nodes, style, layout_info) | ||
|
||
|
||
class WebVTTReaderExt(WebVTTReader): | ||
# HLS extension support <https://datatracker.ietf.org/doc/html/rfc8216#section-3.5> | ||
RE_TIMESTAMP_MAP = re.compile(r"X-TIMESTAMP-MAP.*") | ||
RE_MPEGTS = re.compile(r"MPEGTS:(\d+)") | ||
RE_LOCAL = re.compile(r"LOCAL:((?:(\d{1,}):)?(\d{2}):(\d{2})\.(\d{3}))") | ||
|
||
def _parse(self, lines: list[str]) -> CaptionList: | ||
captions = CaptionListExt() | ||
start = None | ||
end = None | ||
nodes: list[CaptionNode] = [] | ||
layout_info = None | ||
found_timing = False | ||
segment_index = -1 | ||
mpegts = 0 | ||
cue_time = 0.0 | ||
|
||
# The first segment MPEGTS is needed to calculate the rest. It is possible that | ||
# the first segment contains no cue and is ignored by pycaption, this acts as a fallback. | ||
captions.first_segment_mpegts = 0 | ||
|
||
for i, line in enumerate(lines): | ||
if "-->" in line: | ||
found_timing = True | ||
timing_line = i | ||
last_start_time = captions[-1].start if captions else 0 | ||
try: | ||
start, end, layout_info = self._parse_timing_line(line, last_start_time) | ||
except CaptionReadError as e: | ||
new_msg = f"{e.args[0]} (line {timing_line})" | ||
tb = sys.exc_info()[2] | ||
raise type(e)(new_msg).with_traceback(tb) from None | ||
|
||
elif "" == line: | ||
if found_timing and nodes: | ||
found_timing = False | ||
caption = CaptionExt( | ||
start, | ||
end, | ||
nodes, | ||
layout_info=layout_info, | ||
segment_index=segment_index, | ||
mpegts=mpegts, | ||
cue_time=cue_time, | ||
) | ||
captions.append(caption) | ||
nodes = [] | ||
|
||
elif "WEBVTT" in line: | ||
# Merged segmented VTT doesn't have index information, track manually. | ||
segment_index += 1 | ||
mpegts = 0 | ||
cue_time = 0.0 | ||
elif m := self.RE_TIMESTAMP_MAP.match(line): | ||
if r := self.RE_MPEGTS.search(m.group()): | ||
mpegts = int(r.group(1)) | ||
|
||
cue_time = self._parse_local(m.group()) | ||
|
||
# Early assignment in case the first segment contains no cue. | ||
if segment_index == 0: | ||
captions.first_segment_mpegts = mpegts | ||
|
||
else: | ||
if found_timing: | ||
if nodes: | ||
nodes.append(CaptionNode.create_break()) | ||
nodes.append(CaptionNode.create_text(self._decode(line))) | ||
else: | ||
# it's a comment or some metadata; ignore it | ||
pass | ||
|
||
# Add a last caption if there are remaining nodes | ||
if nodes: | ||
caption = CaptionExt(start, end, nodes, layout_info=layout_info, segment_index=segment_index, mpegts=mpegts) | ||
captions.append(caption) | ||
|
||
return captions | ||
|
||
@staticmethod | ||
def _parse_local(string: str) -> float: | ||
""" | ||
Parse WebVTT LOCAL time and convert it to seconds. | ||
""" | ||
m = WebVTTReaderExt.RE_LOCAL.search(string) | ||
if not m: | ||
return 0 | ||
|
||
parsed = m.groups() | ||
if not parsed: | ||
return 0 | ||
hours = int(parsed[1]) | ||
minutes = int(parsed[2]) | ||
seconds = int(parsed[3]) | ||
milliseconds = int(parsed[4]) | ||
return (milliseconds / 1000) + seconds + (minutes * 60) + (hours * 3600) | ||
|
||
|
||
def merge_segmented_webvtt(vtt_raw: str, segment_durations: Optional[list[int]] = None, timescale: int = 1) -> str: | ||
""" | ||
Merge Segmented WebVTT data. | ||
Parameters: | ||
vtt_raw: The concatenated WebVTT files to merge. All WebVTT headers must be | ||
appropriately spaced apart, or it may produce unwanted effects like | ||
considering headers as captions, timestamp lines, etc. | ||
segment_durations: A list of each segment's duration. If not provided it will try | ||
to get it from the X-TIMESTAMP-MAP headers, specifically the MPEGTS number. | ||
timescale: The number of time units per second. | ||
This parses the X-TIMESTAMP-MAP data to compute new absolute timestamps, replacing | ||
the old start and end timestamp values. All X-TIMESTAMP-MAP header information will | ||
be removed from the output as they are no longer of concern. Consider this function | ||
the opposite of a WebVTT Segmenter, a WebVTT Joiner of sorts. | ||
Algorithm borrowed from N_m3u8DL-RE and shaka-player. | ||
""" | ||
MPEG_TIMESCALE = 90_000 | ||
|
||
vtt = WebVTTReaderExt().read(vtt_raw) | ||
for lang in vtt.get_languages(): | ||
prev_caption = None | ||
duplicate_index: list[int] = [] | ||
captions = vtt.get_captions(lang) | ||
|
||
if captions[0].segment_index == 0: | ||
first_segment_mpegts = captions[0].mpegts | ||
else: | ||
first_segment_mpegts = segment_durations[0] if segment_durations else captions.first_segment_mpegts | ||
|
||
caption: CaptionExt | ||
for i, caption in enumerate(captions): | ||
# DASH WebVTT doesn't have MPEGTS timestamp like HLS. Instead, | ||
# calculate the timestamp from SegmentTemplate/SegmentList duration. | ||
likely_dash = first_segment_mpegts == 0 and caption.mpegts == 0 | ||
if likely_dash and segment_durations: | ||
duration = segment_durations[caption.segment_index] | ||
caption.mpegts = MPEG_TIMESCALE * (duration / timescale) | ||
|
||
if caption.mpegts == 0: | ||
continue | ||
|
||
seconds = (caption.mpegts - first_segment_mpegts) / MPEG_TIMESCALE - caption.cue_time | ||
offset = seconds * 1_000_000 # pycaption use microseconds | ||
|
||
if caption.start < offset: | ||
caption.start += offset | ||
caption.end += offset | ||
|
||
# If the difference between current and previous captions is <=1ms | ||
# and the payload is equal then splice. | ||
if ( | ||
prev_caption | ||
and not caption.is_empty() | ||
and (caption.start - prev_caption.end) <= 1000 # 1ms in microseconds | ||
and caption.get_text() == prev_caption.get_text() | ||
): | ||
prev_caption.end = caption.end | ||
duplicate_index.append(i) | ||
|
||
prev_caption = caption | ||
|
||
# Remove duplicate | ||
captions[:] = [c for c_index, c in enumerate(captions) if c_index not in set(duplicate_index)] | ||
|
||
return WebVTTWriter().write(vtt) |