From a845312ab40e145f94d3d4939b8eff2e362da19c Mon Sep 17 00:00:00 2001 From: Shivelight Date: Fri, 8 Dec 2023 10:34:46 +0800 Subject: [PATCH] Make subtitle timestamp fix an opt-out with `--no-timestamp-fix` --- devine/commands/dl.py | 3 ++ devine/core/tracks/subtitle.py | 62 ++++++++++++++-------------------- devine/core/utils/webvtt.py | 6 ++-- 3 files changed, 31 insertions(+), 40 deletions(-) diff --git a/devine/commands/dl.py b/devine/commands/dl.py index 8134a8c9..21173348 100644 --- a/devine/commands/dl.py +++ b/devine/commands/dl.py @@ -105,6 +105,8 @@ class dl: @click.option("--sub-format", type=click.Choice(Subtitle.Codec, case_sensitive=False), default=None, help="Set Output Subtitle Format, only converting if necessary.") + @click.option("--no-timestamp-fix", "fix_sub_timestamp", is_flag=True, default=True, + help="Disable subtitle timestamp fix.") @click.option("-V", "--video-only", is_flag=True, default=False, help="Only download video tracks.") @click.option("-A", "--audio-only", is_flag=True, default=False, @@ -266,6 +268,7 @@ def result( v_lang: list[str], s_lang: list[str], sub_format: Optional[Subtitle.Codec], + fix_sub_timestamp: bool, video_only: bool, audio_only: bool, subs_only: bool, diff --git a/devine/core/tracks/subtitle.py b/devine/core/tracks/subtitle.py index 4982219c..3b0817c8 100644 --- a/devine/core/tracks/subtitle.py +++ b/devine/core/tracks/subtitle.py @@ -264,8 +264,22 @@ def convert(self, codec: Subtitle.Codec) -> Path: if writer is None: raise NotImplementedError(f"Cannot yet convert {self.codec.name} to {codec.name}.") - caption_set = self.parse(self.path.read_bytes(), self.codec) + if self.descriptor == Subtitle.Descriptor.DASH: + # TODO: Populated in DASH.download_track. Perhaps DASH/HLS class should + # use a dict instead of a tuple? + # TODO PR#67 rlaphoenix: This will be moved/done within self.parse instead + extra = { + "_timescale": self.data["dash"]["_timescale"], + "_segment_duration": self.data["dash"]["_segment_duration"], + } + else: + extra = None + + # TODO PR#67 rlaphoenix: the True is a bool to say if we should fix webvtt timestamps or not, a bool from dl CLI args + # The Subtitle.convert() method is not passed it but idc cause I will remove this anyway + caption_set = self.parse(self.path.read_bytes(), self.codec, True, extra) Subtitle.merge_same_cues(caption_set) + subtitle_text = writer().write(caption_set) output_path.write_text(subtitle_text, encoding="utf8") @@ -279,7 +293,9 @@ def convert(self, codec: Subtitle.Codec) -> Path: return output_path @staticmethod - def parse(data: bytes, codec: Subtitle.Codec) -> pycaption.CaptionSet: + def parse(data: bytes, codec: Subtitle.Codec, fix_sub_timestamp: bool = False, extra: Optional[dict] = None) -> pycaption.CaptionSet: + extra = extra or {} + # TODO: Use an "enum" for subtitle codecs if not isinstance(data, bytes): raise ValueError(f"Subtitle data must be parsed as bytes data, not {type(data).__name__}") @@ -310,7 +326,13 @@ def parse(data: bytes, codec: Subtitle.Codec) -> pycaption.CaptionSet: caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists) elif codec == Subtitle.Codec.WebVTT: text = Subtitle.space_webvtt_headers(data) - caption_set = pycaption.WebVTTReader().read(text) + caption_set: pycaption.CaptionSet + if fix_sub_timestamp: + duration = extra.get("_segment_duration") + timescale = extra.get("_timescale", 1) + caption_set = fix_webvtt_timestamp(text, segment_duration=duration, timescale=timescale) + else: + caption_set = pycaption.WebVTTReader().read(text) else: raise ValueError(f"Unknown Subtitle format \"{codec}\"...") except pycaption.exceptions.CaptionReadSyntaxError as e: @@ -575,39 +597,5 @@ def reverse_rtl(self) -> None: stdout=subprocess.DEVNULL ) - def fix_webvtt_timestamp(self) -> None: - # TODO PR#67 rlaphoenix: This func name clashes with the import from newly added utils.webvtt - """ - Convert segmented WebVTT timestamps where each cue starts at 0 (relative to the segment) - to absolute timestamps. - - This function is not called by default; instead, service code should explicitly call - this function when needed. Example using a callback:: - - if isinstance(track, Subtitle): - track.OnDownloaded = lambda track: track.fix_webvtt_timestamp() - - """ - if not self.path or not self.path.exists(): - raise ValueError("You must download the subtitle track first.") - - if self.codec is not Subtitle.Codec.WebVTT: - raise ValueError(f"Expected subtitle codec to be a {Subtitle.Codec.WebVTT}, not {self.codec}.") - - if self.descriptor is Subtitle.Descriptor.MPD: - segment_duration = self.data["dash"]["_segment_duration"] - timescale = self.data["dash"]["_timescale"] - elif self.descriptor is Subtitle.Descriptor.M3U: - segment_duration = None - timescale = 1 - else: - return - - text = Subtitle.space_webvtt_headers(self.path.read_text("utf8")) - fixed = fix_webvtt_timestamp( - text, segment_duration=segment_duration, timescale=timescale - ) - - self.path.write_text(fixed, "utf8") __all__ = ("Subtitle",) diff --git a/devine/core/utils/webvtt.py b/devine/core/utils/webvtt.py index 96682241..ad296fce 100644 --- a/devine/core/utils/webvtt.py +++ b/devine/core/utils/webvtt.py @@ -3,7 +3,7 @@ import typing from typing import Optional -from pycaption import Caption, CaptionList, CaptionNode, CaptionReadError, WebVTTReader, WebVTTWriter +from pycaption import Caption, CaptionList, CaptionNode, CaptionReadError, CaptionSet, WebVTTReader class CaptionListExt(CaptionList): @@ -121,7 +121,7 @@ def _parse_local(string: str) -> float: return (milliseconds / 1000) + seconds + (minutes * 60) + (hours * 3600) -def fix_webvtt_timestamp(vtt_raw: str, segment_duration: Optional[list[int]] = None, timescale: int = 1) -> str: +def fix_webvtt_timestamp(vtt_raw: str, segment_duration: Optional[list[int]] = None, timescale: int = 1) -> CaptionSet: """ Fix relative timestamp from segmented WebVTT to absolute timestamp. @@ -175,4 +175,4 @@ def fix_webvtt_timestamp(vtt_raw: str, segment_duration: Optional[list[int]] = N # Remove duplicate captions[:] = [c for c_index, c in enumerate(captions) if c_index not in set(duplicate_index)] - return WebVTTWriter().write(vtt) + return vtt