BREAKING: full refactor to pipelines and steps (#15)
* wip: audio pipeline impl

* update pipeline steps to lambdas

* add individual clip normalization

* add to readme

* add tests for all audio pipeline and steps

* delete old process-audio

* delete old files

* add video pipeline

* fix tests

* fix: wrong input case

* add extra comments

* feat: add step to delete intermediate, non-downloaded files

* add make clean-dirs to wipe cache and output dirs
paulhchoi authored Dec 31, 2024
1 parent 2a1c7da commit 0cfdf03
Showing 65 changed files with 2,421 additions and 1,039 deletions.
6 changes: 0 additions & 6 deletions .env.example

This file was deleted.

2 changes: 0 additions & 2 deletions .gitattributes

This file was deleted.

6 changes: 5 additions & 1 deletion .gitignore
@@ -3,5 +3,9 @@ __pycache__/
data/
tmp/

.env
.vscode/

config/pipeline_config.json
cache/
output/
tests/logs
24 changes: 23 additions & 1 deletion .vscode/settings.json
@@ -4,44 +4,66 @@
"acodec",
"acompressor",
"acrossfade",
"alimiter",
"anullsrc",
"anyio",
"asetrate",
"audacityteam",
"bestaudio",
"bestvideo",
"Bitwarden",
"certifi",
"colorama",
"compand",
"crossfade",
"crossfadein",
"crossfading",
"dataclass",
"dotenv",
"downloaders",
"escription",
"fadein",
"ffprobe",
"httpx",
"idna",
"imageio",
"jsonschema",
"lavfi",
"levelname",
"libx",
"loudnorm",
"lxml",
"mdfind",
"nokey",
"nopostoverwrites",
"noprint",
"numpy",
"outtmpl",
"perfcounter",
"pluggy",
"postprocessors",
"preferredcodec",
"preferredquality",
"proglog",
"pytest",
"PYTHONPATH",
"pytube",
"quantizer",
"rootdir",
"strerror",
"studylight",
"subclip",
"superfast",
"tqdm",
"udio",
"ultrafast",
"urllib",
"versetxt",
"veryfast",
"veryslow",
"videoclips",
"videofile",
"xfade"
"xfade",
"ytdl"
]
}
53 changes: 47 additions & 6 deletions Makefile
@@ -1,7 +1,9 @@
COMPOSE = docker-compose
COMPOSE = docker-compose -f docker/docker-compose.yml
SERVICE = sermon-processor
TEST_SERVICE = tests
LOG_DIR = tests/logs
CACHE_DIR = cache
OUTPUT_DIR = output
TIMESTAMP = $(shell date +"%Y%m%d_%H%M%S")
LOG_FILE = $(LOG_DIR)/test_output_$(TIMESTAMP).log

@@ -22,22 +24,61 @@ clean:
docker image prune -f || true
@echo "Cleanup complete."

clean-dirs:
@echo "Deleting cache and output directories..."
@if [ -d "$(CACHE_DIR)" ]; then \
rm -rf $(CACHE_DIR); \
echo "Deleted: $(CACHE_DIR)"; \
else \
echo "$(CACHE_DIR) not found. Skipping..."; \
fi
@if [ -d "$(OUTPUT_DIR)" ]; then \
rm -rf $(OUTPUT_DIR); \
echo "Deleted: $(OUTPUT_DIR)"; \
else \
echo "$(OUTPUT_DIR) not found. Skipping..."; \
fi
@echo "Cache and output directories cleaned."

clean-all: clean clean-dirs
@echo "Performed full cleanup of Docker and directories."

# Run the production service (depends on build and clean)
run: clean build
@echo "Running '$(SERVICE)' Docker service..."
$(COMPOSE) run --rm --name $(SERVICE) $(SERVICE)
@echo ""
@echo "---"
@echo "Production environment is running."

# Run the audio pipeline
run-audio: clean build
@echo "Running the audio pipeline..."
$(COMPOSE) run --rm $(SERVICE) python3 scripts/run_audio_pipeline.py

# Run the video pipeline
run-video: clean build
@echo "Running the video pipeline..."
$(COMPOSE) run --rm $(SERVICE) python3 scripts/run_video_pipeline.py

# Run both pipelines
run-both: clean build
@echo "Running the audio pipeline..."
$(COMPOSE) run --rm $(SERVICE) python3 scripts/run_audio_pipeline.py
@echo "Running the video pipeline..."
$(COMPOSE) run --rm $(SERVICE) python3 scripts/run_video_pipeline.py

# Run tests
test: create-log-dir
@echo "Building Docker images for testing..."
$(COMPOSE) build $(TEST_SERVICE)
@echo "Running tests..."
$(COMPOSE) run --rm $(TEST_SERVICE) | tee $(LOG_FILE)
@if [ -n "$(TEST_FILE)" ]; then \
$(COMPOSE) run --rm $(TEST_SERVICE) pytest $(TEST_FILE) | tee $(LOG_FILE); \
else \
$(COMPOSE) run --rm $(TEST_SERVICE) pytest tests/ | tee $(LOG_FILE); \
fi
@echo "Test logs saved to $(LOG_FILE)"
@echo ""
@echo "---"
@echo "Cleaning up dangling Docker images..."
docker image prune -f || true
docker image prune -f || true

# TODO: maybe add in a 'clean-logs' to wipe the test logs
124 changes: 78 additions & 46 deletions README.md
@@ -1,69 +1,101 @@
# 🎛️ Metro Sermons Processor

This is a small script to automate the downloading, processing, and scrubbing of
Metro's weekly sermons from YouTube, and the audio for Spotify. This guide will
walk you through setting up the `sermon-processor` project using Docker.
## tl;dr

## Prerequisites
1. set `config/pipeline_config.json`
2. run `make run`
3. check `output/` for processed files

Ensure you have `docker` and `docker-compose` installed on your system. You can
check if they're installed by running:
## Configuration

```bash
docker --version
docker-compose --version
```
`config/pipeline_config.json` is the main config file needed to run the pipelines.

## Set config parameters
- run the following to copy the example config to get the structure:
  - `cp config/pipeline_config.json.example config/pipeline_config.json`
- then update the file with the actual configuration values (ask Paul for now;
  these will be added to the Bitwarden vault when done)

The `.env` file allows for pre-filled parameters. If you don't have it, run the
following command to use the provided template. Please reference the Bitwarden
Vault for keys
## Running

```bash
cp .env.example .env
```
All run commands go through the `Makefile` (make sure you have Docker running
in the background).

Ensure that you have the `url`, `start`, and `end` params set to run the script.
- to run:
- `make run`
- to run just audio processing:
- `make run-audio`
- to run just video processing:
- `make run-video`
- to run both sequentially:
- `make run-both`

## Run the script
## Testing

1. Ensure that you have Docker Desktop running in the background.
All tests live in `tests/` and are run through `pytest`.

2. Run the start command from the Makefile:
- to run all tests:
- `make test`
- to run a single test:
- `make test TEST_FILE=test_filename.py`

```bash
make run
```
## 🌊 General flow

## Run tests
### Startup & Docker

```bash
make test
```
1. Running `make run` looks in the `Makefile` and runs the `run` command
2. `make run` kicks off `docker-compose`, which builds a container from the
   `Dockerfile`
3. The running container calls the startup script at `scripts/startup.py`

## Post-Processing
### Audio Processing

To ensure the correct media file is generated each week, it is essential to
delete the `tmp` directory after the current media file has been successfully
created. If the `tmp` directory is not deleted, next week’s media file will use
the previous week’s data instead of generating a new file.
Selecting audio processing kicks off `scripts/run_audio_pipeline.py`, which:

Delete the `tmp` directory by either:
1. Validates that the config file is correct
2. Creates the `PipelineData` dataclass, the main object that tracks the file
   paths produced by the pipeline's intermediate steps
3. Calls `app/pipelines/audio_pipeline.py`, the meat of the processing:
   `create_audio_pipeline()` composes and returns a pipeline made of discrete
   "steps" (found in `app/steps/`) that run one after another to process the
   content as we need (see the sketch below)
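
To make the composition concrete, here is a minimal, self-contained sketch of
the idea, assuming a trimmed-down `PipelineData` and made-up step names (the
real steps live in `app/steps/` and the real composition lives in
`create_audio_pipeline()`):

```python
# Minimal sketch of the "pipeline of steps" idea. The step names and bodies
# here are hypothetical; only the overall shape mirrors the description above.
from dataclasses import dataclass, field
from typing import Callable, List, Optional


@dataclass
class PipelineData:  # trimmed-down stand-in for app/data_models/pipeline_data.py
    main_file_path: Optional[str] = None
    active_file_path: Optional[str] = None
    intermediate_files: List[str] = field(default_factory=list)


Step = Callable[[PipelineData], PipelineData]


def download_step(data: PipelineData) -> PipelineData:
    data.main_file_path = "cache/main.m4a"  # placeholder path
    data.active_file_path = data.main_file_path
    return data


def normalize_step(data: PipelineData) -> PipelineData:
    normalized = data.active_file_path + ".normalized.m4a"
    data.intermediate_files.append(normalized)  # track for later cleanup
    data.active_file_path = normalized
    return data


def create_audio_pipeline() -> List[Step]:
    # The real pipeline composes many more discrete steps from app/steps/.
    return [download_step, normalize_step]


if __name__ == "__main__":
    data = PipelineData()
    for step in create_audio_pipeline():
        data = step(data)
    print(data.active_file_path)  # cache/main.m4a.normalized.m4a
```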

- Manually deleting the folder
- Running the command `rm -r tmp` in your terminal
### DownloaderProxy & Caching

Once the automated processing is complete, the final media file will be saved to
`data/` by default, or the specified data directory on your host machine.
Afterwards, follow the manual steps that print in the console.
One problem we ran into was re-running the script only to download the same
files all over again. To solve this, we created
`app/downloaders/downloader_proxy.py`: during the download step, before we
actually download a file, we predict what its output filepath will be, and if
that file already exists we reuse it and skip the download. This saves a lot of
time on script re-runs (a rough sketch of the idea is shown below).

## Deactivating Docker Container
`app/cache/` holds the downloaded and intermediate files.
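
A minimal sketch of that caching idea, assuming hypothetical class and method
names (the real `downloader_proxy.py` may differ):

```python
# Hypothetical sketch of the download-proxy caching idea; the class and method
# names here are assumptions, not the actual downloader_proxy.py API.
import os


class DownloaderProxy:
    def __init__(self, downloader, cache_dir: str = "app/cache"):
        self.downloader = downloader  # any object with a download(url, destination) method
        self.cache_dir = cache_dir

    def download(self, url: str, filename: str) -> str:
        # Predict the output path first; if it already exists, skip the download.
        destination = os.path.join(self.cache_dir, filename)
        if os.path.exists(destination):
            return destination  # cache hit: reuse the previously downloaded file
        os.makedirs(self.cache_dir, exist_ok=True)
        self.downloader.download(url, destination)
        return destination
```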

When you're done, the Docker container will automatically stop as we've used the
`--rm` flag, which removes the container after it exits. If you'd like to
manually stop the image, you can run the following:
### `ffmpeg` Flag Notes

```bash
docker-compose down
```
`ffmpeg` is the main file-processing engine. To run it in the different steps,
we construct a string array of commands and flags, then invoke it directly as a
system subprocess. Among the constructed flags, the following are useful to
know:

- `-crf` - Constant Rate Factor
- usually set to: 16
- "The range of the quantizer scale is 0-51: where 0 is lossless, 23 is
default, and 51 is worst possible. A lower value is a higher quality and a
subjectively sane range is 18-28. Consider 18 to be visually lossless or
nearly so: it should look the same or nearly the same as the input but it
isn't technically lossless."
- `-preset`
- usually set to: ultrafast
- "These presets affect the encoding speed. Using a slower preset gives you
better compression, or quality per filesize, whereas faster presets give you
worse compression. In general, you should just use the preset you can afford
to wait for. Presets can be ultrafast, superfast, veryfast, faster, fast,
medium (default), slow and veryslow."

We've set the defaults to `crf=16` and `preset=ultrafast` because we're OK with
the larger resulting filesize; we're generally just going to upload the file to
YouTube and then delete it.
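
For illustration, a step that shells out to `ffmpeg` with these flags might
look roughly like this (the input/output paths and the exact flag set are
placeholders, not the repo's actual commands):

```python
# Illustrative only: building the ffmpeg argument list and running it as a
# subprocess, as described above. Paths and most flags are placeholders.
import subprocess

input_path = "cache/stitched.mp4"
output_path = "output/final.mp4"

cmd = [
    "ffmpeg",
    "-y",                    # overwrite the output file without prompting
    "-i", input_path,
    "-c:v", "libx264",
    "-crf", "16",            # near-lossless quality, larger file
    "-preset", "ultrafast",  # fastest encode, worst compression per filesize
    "-c:a", "aac",
    output_path,
]
subprocess.run(cmd, check=True)  # raises CalledProcessError if ffmpeg fails
```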

### Output

After the pipeline runs, check the `output/{stream_id}/` directory for the
finished file (`stream_id` is whatever you set in the config file).
9 changes: 9 additions & 0 deletions app/constants.py
@@ -0,0 +1,9 @@
class PipelineKeys:
YOUTUBE_URL = "youtube_url"
MAIN_FILE_PATH = "main_file_path"
INTRO_FILE_PATH = "intro_file_path"
OUTRO_FILE_PATH = "outro_file_path"
ACTIVE_FILE_PATH = "active_file_path"
FINAL_OUTPUT_PATH = "final_output_path"
DOWNLOADED_FILES = "downloaded_files"
INTERMEDIATE_FILES = "intermediate_files"
19 changes: 19 additions & 0 deletions app/data_models/pipeline_data.py
@@ -0,0 +1,19 @@
# app/data_models/pipeline_data.py
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class PipelineData:
"""
Structured data model for pipeline operations.
"""

main_file_path: Optional[str] = None
intro_file_path: Optional[str] = None
outro_file_path: Optional[str] = None

active_file_path: Optional[str] = None
final_output_path: Optional[str] = None
downloaded_files: List[str] = field(default_factory=list)
intermediate_files: List[str] = field(default_factory=list)
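
One hedged note on the two list fields: based on the commit message about
deleting intermediate, non-downloaded files, a cleanup step could use them
roughly like this (this is not the actual step implementation):

```python
# Hypothetical cleanup usage of PipelineData's file lists; inferred from the
# "delete intermediate, non-downloaded files" commit message, not actual code.
import os

from app.data_models.pipeline_data import PipelineData

data = PipelineData(
    main_file_path="cache/main.m4a",
    downloaded_files=["cache/main.m4a", "cache/intro.m4a"],  # keep these cached
    intermediate_files=["cache/main_normalized.m4a"],        # safe to delete
)

for path in data.intermediate_files:
    if path not in data.downloaded_files and os.path.exists(path):
        os.remove(path)
```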
14 changes: 14 additions & 0 deletions app/downloaders/base_downloader.py
@@ -0,0 +1,14 @@
class Downloader:

def download(self, url, destination):
"""
Download a file from the given URL to the specified destination.
Args:
url (str): The file URL to download.
destination (str): The local file path to save the downloaded file.
Raises:
NotImplementedError: Must be implemented by subclasses.
"""
raise NotImplementedError("Subclasses must implement the `download` method.")
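
For context, a concrete subclass might look something like the sketch below
(a `yt-dlp`-based downloader is just an assumption here; it is not necessarily
how the repo's own downloaders are written):

```python
# Sketch of a possible Downloader subclass; not necessarily the repo's actual
# YouTube downloader implementation.
import yt_dlp

from app.downloaders.base_downloader import Downloader


class YouTubeAudioDownloader(Downloader):
    def download(self, url, destination):
        ydl_opts = {
            "format": "bestaudio/best",  # prefer the best audio-only stream
            "outtmpl": destination,      # output path template
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return destination
```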
