BREAKING: full refactor to pipelines and steps (#15)
* wip: audio pipeline impl

* update pipeline steps to lambdas

* add individual clip normalization

* add to readme

* add tests for all audio pipeline and steps

* delete old process-audio

* delete old files

* add video pipeline

* fix tests

* fix: wrong input case

* add extra comments

* feat: add step to delete intermediate, non-downloaded files

* add make clean-dirs to wipe cache and output dirs
paulhchoi authored Dec 31, 2024
1 parent 2a1c7da commit 0cfdf03
Showing 65 changed files with 2,421 additions and 1,039 deletions.
6 changes: 0 additions & 6 deletions .env.example

This file was deleted.

2 changes: 0 additions & 2 deletions .gitattributes

This file was deleted.

6 changes: 5 additions & 1 deletion .gitignore
@@ -3,5 +3,9 @@ __pycache__/
data/
tmp/

.env
.vscode/

config/pipeline_config.json
cache/
output/
tests/logs
24 changes: 23 additions & 1 deletion .vscode/settings.json
@@ -4,44 +4,66 @@
"acodec",
"acompressor",
"acrossfade",
"alimiter",
"anullsrc",
"anyio",
"asetrate",
"audacityteam",
"bestaudio",
"bestvideo",
"Bitwarden",
"certifi",
"colorama",
"compand",
"crossfade",
"crossfadein",
"crossfading",
"dataclass",
"dotenv",
"downloaders",
"escription",
"fadein",
"ffprobe",
"httpx",
"idna",
"imageio",
"jsonschema",
"lavfi",
"levelname",
"libx",
"loudnorm",
"lxml",
"mdfind",
"nokey",
"nopostoverwrites",
"noprint",
"numpy",
"outtmpl",
"perfcounter",
"pluggy",
"postprocessors",
"preferredcodec",
"preferredquality",
"proglog",
"pytest",
"PYTHONPATH",
"pytube",
"quantizer",
"rootdir",
"strerror",
"studylight",
"subclip",
"superfast",
"tqdm",
"udio",
"ultrafast",
"urllib",
"versetxt",
"veryfast",
"veryslow",
"videoclips",
"videofile",
"xfade"
"xfade",
"ytdl"
]
}
53 changes: 47 additions & 6 deletions Makefile
@@ -1,7 +1,9 @@
COMPOSE = docker-compose
COMPOSE = docker-compose -f docker/docker-compose.yml
SERVICE = sermon-processor
TEST_SERVICE = tests
LOG_DIR = tests/logs
CACHE_DIR = cache
OUTPUT_DIR = output
TIMESTAMP = $(shell date +"%Y%m%d_%H%M%S")
LOG_FILE = $(LOG_DIR)/test_output_$(TIMESTAMP).log

@@ -22,22 +24,61 @@ clean:
docker image prune -f || true
@echo "Cleanup complete."

clean-dirs:
@echo "Deleting cache and output directories..."
@if [ -d "$(CACHE_DIR)" ]; then \
rm -rf $(CACHE_DIR); \
echo "Deleted: $(CACHE_DIR)"; \
else \
echo "$(CACHE_DIR) not found. Skipping..."; \
fi
@if [ -d "$(OUTPUT_DIR)" ]; then \
rm -rf $(OUTPUT_DIR); \
echo "Deleted: $(OUTPUT_DIR)"; \
else \
echo "$(OUTPUT_DIR) not found. Skipping..."; \
fi
@echo "Cache and output directories cleaned."

clean-all: clean clean-dirs
@echo "Performed full cleanup of Docker and directories."

# Run the production service (depends on build and clean)
run: clean build
@echo "Running '$(SERVICE)' Docker service..."
$(COMPOSE) run --rm --name $(SERVICE) $(SERVICE)
@echo ""
@echo "---"
@echo "Production environment is running."

# Run the audio pipeline
run-audio: clean build
@echo "Running the audio pipeline..."
$(COMPOSE) run --rm $(SERVICE) python3 scripts/run_audio_pipeline.py

# Run the video pipeline
run-video: clean build
@echo "Running the video pipeline..."
$(COMPOSE) run --rm $(SERVICE) python3 scripts/run_video_pipeline.py

# Run both pipelines
run-both: clean build
@echo "Running the audio pipeline..."
$(COMPOSE) run --rm $(SERVICE) python3 scripts/run_audio_pipeline.py
@echo "Running the video pipeline..."
$(COMPOSE) run --rm $(SERVICE) python3 scripts/run_video_pipeline.py

# Run tests
test: create-log-dir
@echo "Building Docker images for testing..."
$(COMPOSE) build $(TEST_SERVICE)
@echo "Running tests..."
$(COMPOSE) run --rm $(TEST_SERVICE) | tee $(LOG_FILE)
@if [ -n "$(TEST_FILE)" ]; then \
$(COMPOSE) run --rm $(TEST_SERVICE) pytest $(TEST_FILE) | tee $(LOG_FILE); \
else \
$(COMPOSE) run --rm $(TEST_SERVICE) pytest tests/ | tee $(LOG_FILE); \
fi
@echo "Test logs saved to $(LOG_FILE)"
@echo ""
@echo "---"
@echo "Cleaning up dangling Docker images..."
docker image prune -f || true
docker image prune -f || true

# TODO: maybe add in a 'clean-logs' to wipe the test logs
124 changes: 78 additions & 46 deletions README.md
@@ -1,69 +1,101 @@
# 🎛️ Metro Sermons Processor

This is a small script to automate the downloading, processing, and scrubbing of
Metro's weekly sermons from YouTube, and the audio for Spotify. This guide will
walk you through setting up the `sermon-processor` project using Docker.
## tl;dr

## Prerequisites
1. set `config/pipeline_config.json`
2. run `make run`
3. check `output/` for processed files

Ensure you have `docker` and `docker-compose` installed on your system. You can
check if they're installed by running:
## Configuration

```bash
docker --version
docker-compose --version
```
`config/pipeline_config.json` is the main config file needed to run the pipelines.

## Set config parameters
- run the following to copy the example config to get the structure:
  - `cp config/pipeline_config.json.example config/pipeline_config.json`
- then update the file with the actual configuration values (ask Paul for now;
  these will be added to the Bitwarden vault when done)

The `.env` file allows for pre-filled parameters. If you don't have it, run the
following command to use the provided template. Please reference the Bitwarden
Vault for keys
## Running

```bash
cp .env.example .env
```
All run commands go through the `Makefile` (make sure you have Docker running
in the background).

Ensure that you have the `url`, `start`, and `end` params set to run the script.
- to run:
- `make run`
- to run just audio processing:
- `make run-audio`
- to run just video processing:
- `make run-video`
- to run both sequentially:
- `make run-both`

## Run the script
## Testing

1. Ensure that you have Docker Desktop running in the background.
All tests live in `tests/` and are run through `pytest`.

2. Run the start command from the Makefile:
- to run all tests:
- `make test`
- to run a single test:
- `make test TEST_FILE=test_filename.py`

```bash
make run
```
## 🌊 General flow

## Run tests
### Startup & Docker

```bash
make test
```
1. Running `make run` looks in the `Makefile` and runs the `run` command
2. `make run` kicks off `docker-compose`, which builds a container from the
   `Dockerfile`
3. The running container calls the startup script at `scripts/startup.py`

## Post-Processing
### Audio Processing

To ensure the correct media file is generated each week, it is essential to
delete the `tmp` directory after the current media file has been successfully
created. If the `tmp` directory is not deleted, next week’s media file will use
the previous week’s data instead of generating a new file.
Selecting audio processing kicks off `scripts/run_audio_pipeline.py`, which:

Delete the `tmp` directory by either:
1. Validates that the config file is correct
2. Creates the `PipelineData` dataclass, the main object that tracks the file
   paths produced by the pipeline's intermediate steps
3. Calls `app/pipelines/audio_pipeline.py`, the meat of the processing:
   `create_audio_pipeline()` composes and returns a pipeline made of discrete
   "steps" (found in `app/steps/`) that run one after another to process the
   content as we need (see the sketch below)
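
To make the composition concrete, here is a minimal, self-contained sketch of
the idea, assuming a trimmed-down `PipelineData` and made-up step names (the
real steps live in `app/steps/` and the real composition lives in
`create_audio_pipeline()`):

```python
# Minimal sketch of the "pipeline of steps" idea. The step names and bodies
# here are hypothetical; only the overall shape mirrors the description above.
from dataclasses import dataclass, field
from typing import Callable, List, Optional


@dataclass
class PipelineData:  # trimmed-down stand-in for app/data_models/pipeline_data.py
    main_file_path: Optional[str] = None
    active_file_path: Optional[str] = None
    intermediate_files: List[str] = field(default_factory=list)


Step = Callable[[PipelineData], PipelineData]


def download_step(data: PipelineData) -> PipelineData:
    data.main_file_path = "cache/main.m4a"  # placeholder path
    data.active_file_path = data.main_file_path
    return data


def normalize_step(data: PipelineData) -> PipelineData:
    normalized = data.active_file_path + ".normalized.m4a"
    data.intermediate_files.append(normalized)  # track for later cleanup
    data.active_file_path = normalized
    return data


def create_audio_pipeline() -> List[Step]:
    # The real pipeline composes many more discrete steps from app/steps/.
    return [download_step, normalize_step]


if __name__ == "__main__":
    data = PipelineData()
    for step in create_audio_pipeline():
        data = step(data)
    print(data.active_file_path)  # cache/main.m4a.normalized.m4a
```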

- Manually deleting the folder
- Running the command `rm -r tmp` in your terminal
### DownloaderProxy & Caching

Once the automated processing is complete, the final media file will be saved to
`data/` by default, or the specified data directory on your host machine.
Afterwards, follow the manual steps that print in the console.
One problem we ran into was re-running the script only to download the same
files all over again. To solve this, we created
`app/downloaders/downloader_proxy.py`: during the download step, before we
actually download a file, we predict what its output filepath will be, and if
that file already exists we reuse it and skip the download. This saves a lot of
time on script re-runs (a rough sketch of the idea is shown below).

## Deactivating Docker Container
`app/cache/` holds the downloaded and intermediate files.
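
A minimal sketch of that caching idea, assuming hypothetical class and method
names (the real `downloader_proxy.py` may differ):

```python
# Hypothetical sketch of the download-proxy caching idea; the class and method
# names here are assumptions, not the actual downloader_proxy.py API.
import os


class DownloaderProxy:
    def __init__(self, downloader, cache_dir: str = "app/cache"):
        self.downloader = downloader  # any object with a download(url, destination) method
        self.cache_dir = cache_dir

    def download(self, url: str, filename: str) -> str:
        # Predict the output path first; if it already exists, skip the download.
        destination = os.path.join(self.cache_dir, filename)
        if os.path.exists(destination):
            return destination  # cache hit: reuse the previously downloaded file
        os.makedirs(self.cache_dir, exist_ok=True)
        self.downloader.download(url, destination)
        return destination
```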

When you're done, the Docker container will automatically stop as we've used the
`--rm` flag, which removes the container after it exits. If you'd like to
manually stop the image, you can run the following:
### `ffmpeg` Flag Notes

```bash
docker-compose down
```
`ffmpeg` is the main file-processing engine. To run it in the different steps,
we construct a string array of commands and flags, then invoke it directly as a
system subprocess. Among the constructed flags, the following are useful to
know:

- `-crf` - Constant Rate Factor
- usually set to: 16
- "The range of the quantizer scale is 0-51: where 0 is lossless, 23 is
default, and 51 is worst possible. A lower value is a higher quality and a
subjectively sane range is 18-28. Consider 18 to be visually lossless or
nearly so: it should look the same or nearly the same as the input but it
isn't technically lossless."
- `-preset`
- usually set to: ultrafast
- "These presets affect the encoding speed. Using a slower preset gives you
better compression, or quality per filesize, whereas faster presets give you
worse compression. In general, you should just use the preset you can afford
to wait for. Presets can be ultrafast, superfast, veryfast, faster, fast,
medium (default), slow and veryslow."

We've set the defaults to `crf=16` and `preset=ultrafast` because we're OK with
the larger resulting filesize; we're generally just going to upload the file to
YouTube and then delete it.
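
For illustration, a step that shells out to `ffmpeg` with these flags might
look roughly like this (the input/output paths and the exact flag set are
placeholders, not the repo's actual commands):

```python
# Illustrative only: building the ffmpeg argument list and running it as a
# subprocess, as described above. Paths and most flags are placeholders.
import subprocess

input_path = "cache/stitched.mp4"
output_path = "output/final.mp4"

cmd = [
    "ffmpeg",
    "-y",                    # overwrite the output file without prompting
    "-i", input_path,
    "-c:v", "libx264",
    "-crf", "16",            # near-lossless quality, larger file
    "-preset", "ultrafast",  # fastest encode, worst compression per filesize
    "-c:a", "aac",
    output_path,
]
subprocess.run(cmd, check=True)  # raises CalledProcessError if ffmpeg fails
```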

### Output

After the pipeline runs, check the `output/{stream_id}/` directory for the
finished file (`stream_id` is whatever you set in the config file).
9 changes: 9 additions & 0 deletions app/constants.py
@@ -0,0 +1,9 @@
class PipelineKeys:
YOUTUBE_URL = "youtube_url"
MAIN_FILE_PATH = "main_file_path"
INTRO_FILE_PATH = "intro_file_path"
OUTRO_FILE_PATH = "outro_file_path"
ACTIVE_FILE_PATH = "active_file_path"
FINAL_OUTPUT_PATH = "final_output_path"
DOWNLOADED_FILES = "downloaded_files"
INTERMEDIATE_FILES = "intermediate_files"
19 changes: 19 additions & 0 deletions app/data_models/pipeline_data.py
@@ -0,0 +1,19 @@
# app/data_models/pipeline_data.py
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class PipelineData:
"""
Structured data model for pipeline operations.
"""

main_file_path: Optional[str] = None
intro_file_path: Optional[str] = None
outro_file_path: Optional[str] = None

active_file_path: Optional[str] = None
final_output_path: Optional[str] = None
downloaded_files: List[str] = field(default_factory=list)
intermediate_files: List[str] = field(default_factory=list)
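
One hedged note on the two list fields: based on the commit message about
deleting intermediate, non-downloaded files, a cleanup step could use them
roughly like this (this is not the actual step implementation):

```python
# Hypothetical cleanup usage of PipelineData's file lists; inferred from the
# "delete intermediate, non-downloaded files" commit message, not actual code.
import os

from app.data_models.pipeline_data import PipelineData

data = PipelineData(
    main_file_path="cache/main.m4a",
    downloaded_files=["cache/main.m4a", "cache/intro.m4a"],  # keep these cached
    intermediate_files=["cache/main_normalized.m4a"],        # safe to delete
)

for path in data.intermediate_files:
    if path not in data.downloaded_files and os.path.exists(path):
        os.remove(path)
```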
14 changes: 14 additions & 0 deletions app/downloaders/base_downloader.py
@@ -0,0 +1,14 @@
class Downloader:

def download(self, url, destination):
"""
Download a file from the given URL to the specified destination.
Args:
url (str): The file URL to download.
destination (str): The local file path to save the downloaded file.
Raises:
NotImplementedError: Must be implemented by subclasses.
"""
raise NotImplementedError("Subclasses must implement the `download` method.")
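
For context, a concrete subclass might look something like the sketch below
(a `yt-dlp`-based downloader is just an assumption here; it is not necessarily
how the repo's own downloaders are written):

```python
# Sketch of a possible Downloader subclass; not necessarily the repo's actual
# YouTube downloader implementation.
import yt_dlp

from app.downloaders.base_downloader import Downloader


class YouTubeAudioDownloader(Downloader):
    def download(self, url, destination):
        ydl_opts = {
            "format": "bestaudio/best",  # prefer the best audio-only stream
            "outtmpl": destination,      # output path template
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return destination
```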
