From 113355804f7307a27b5c5f0b8a5b1351a33a4416 Mon Sep 17 00:00:00 2001
From: Klaas Pauly
Date: Sun, 29 Dec 2024 12:29:18 +0100
Subject: [PATCH] Initial commit for Blog Archiver project

---
 .gitignore  |  14 +++++
 Cargo.toml  |  13 ++++
 readme.md   |  44 +++++++++++++
 src/main.rs | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 247 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.toml
 create mode 100644 readme.md
 create mode 100644 src/main.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..34ff1db
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+# Ignore folders
+sample/
+output/
+output2/
+
+# Common Rust ignores
+target/
+*.rs.bk
+Cargo.lock
+
+# Other common ignores
+.DS_Store
+*.log
+
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..9e15932
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "blog_archiver"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+clap = { version = "4.5.23", features = ["derive"] }
+futures-util = { version = "0.3.31", features = ["std"] }
+indicatif = "0.17.9"
+regex = "1.11.1"
+reqwest = { version = "0.12.11", features = ["blocking"] }
+scraper = "0.22.0"
+tokio = { version = "1.42.0", features = ["full"] }
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..07adced
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,44 @@
+# Blog Archiver
+
+Blog Archiver is a command-line tool written in Rust that downloads and saves blog posts from an archived website. It fetches the HTML content of the specified blog posts, extracts the main content, and saves it as Markdown files for offline access or backup purposes.
+
+DISCLAIMER: this is a one-off project built specifically for a blog salvage effort from [web.archive.org](https://web.archive.org/). It was built entirely from ChatGPT 4o prompts (including this README) to automate the tedious task of copying 150 blog pages and to learn a few concepts along the way. It is not intended as a generic application, but it could be adapted fairly easily. Ensure you have the necessary permissions to download and store content from the target website.
+
+## Features
+
+- Fetch Base HTML: The tool fetches the HTML content of the provided base URL.
+- Extract Post Links: It parses the HTML to find links to individual blog posts matching a specific pattern.
+- Filter Links: Filters out links that don't match the desired pattern (e.g., URLs without a title or containing fragments).
+- Download and Save Posts: For each valid post link, it fetches the HTML content, extracts the title and main content, and saves them as a Markdown file in the specified output directory. Posts are processed concurrently for efficiency, and a progress bar tracks overall progress.
+
+## Use
+A stand-alone executable for Windows x64 is available as a release. No installation is required: just put it at a desired location and run it from the command line.
+
+```blog-archiver --base_url <BASE_URL> --output_dir <"OUTPUT_DIR">```
+
+where ```base_url``` is the URL of the blog archive page (the one that contains links to the individual post pages), and ```output_dir``` is where you want the resulting Markdown files to be stored (one per post page, named by post title). Use quotation marks if your output_dir path on Windows contains spaces.
+
+Example:
+```blog-archiver --base_url https://web.archive.org/web/20231202013114/http://blog.com/blog/ --output_dir "C:\user\My Documents\downloaded_posts"```
+
+## Adapt and build
+
+- Ensure you have [Rust](https://www.rust-lang.org/tools/install) installed on your system.
+- Clone the repository.
+- Navigate to the project directory.
+- Optionally, adapt the code.
+- Run the project with cargo (```cargo run -- --base_url <BASE_URL> --output_dir <"OUTPUT_DIR">```), or build it for your system (```cargo build --release```) and run the executable from ```./target/release/blog_archiver```.
+
+The project utilizes the following Rust crates:
+- clap: For command-line argument parsing.
+- futures-util: For asynchronous operations.
+- scraper: For HTML parsing and element selection.
+- reqwest: For making HTTP requests.
+- indicatif: For displaying progress bars.
+- regex: For regular expression matching.
+- tokio: For the asynchronous runtime.
+
+## Acknowledgments
+
+Special thanks to the authors of the Rust crates used in this project for providing excellent tools that made this project possible.
+
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..303cde3
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,176 @@
+use std::collections::HashSet;
+use std::fs::{create_dir_all, File};
+use std::io::Write;
+use std::path::Path;
+
+use clap::Parser;
+use futures_util::future::join_all;
+use indicatif::{ProgressBar, ProgressStyle};
+use regex::Regex;
+use scraper::{Html, Selector};
+
+/// Command-line arguments structure
+#[derive(Parser)]
+struct Args {
+    /// Base URL of the archived website
+    #[arg(long = "base_url")]
+    base_url: String,
+    /// Output directory to save the posts
+    #[arg(long = "output_dir")]
+    output_dir: String,
+}
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Parse command-line arguments
+    let args = Args::parse();
+
+    // Create HTTP client
+    let client = reqwest::Client::builder()
+        .user_agent("blog_archiver/1.0")
+        .build()?;
+
+    // Fetch and parse the base URL
+    let base_html = fetch_html(&client, &args.base_url).await?;
+    let post_links = extract_post_links(&base_html)?;
+    println!("Found {} post links", post_links.len());
+    println!("Post links: {:?}", post_links);
+
+    // Ensure the output directory exists
+    create_dir_all(&args.output_dir)?;
+
+    // Initialize the progress bar, one tick per post
+    let pb = ProgressBar::new(post_links.len() as u64);
+
+    // Set the progress bar style
+    pb.set_style(
+        ProgressStyle::default_bar()
+            .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} ({eta}) {msg}")?
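+            // ({pos}/{len} above counts completed vs. total posts, {eta} is the estimated
+            // time remaining, and {msg} carries the message set by finish_with_message)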
+            .progress_chars("#>-"),
+    );
+
+    // Process posts concurrently, one spawned task per post
+    let fetches = post_links.into_iter().map(|post_url| {
+        let client = client.clone();
+        let output_dir = args.output_dir.clone();
+        let pb = pb.clone();
+        tokio::spawn(async move {
+            if let Err(e) = process_post(&client, &post_url, &output_dir).await {
+                eprintln!("Error processing {}: {}", post_url, e);
+            }
+            pb.inc(1);
+        })
+    });
+
+    // Await all tasks
+    join_all(fetches).await;
+
+    // Finish the progress bar
+    pb.finish_with_message("Processing complete");
+
+    Ok(())
+}
+
+/// Fetches the HTML content of a given URL
+async fn fetch_html(client: &reqwest::Client, url: &str) -> Result<String, reqwest::Error> {
+    let response = client.get(url).send().await?;
+    response.text().await
+}
+
+/// Downloads a single post, extracts its title and content, and saves it as Markdown
+async fn process_post(client: &reqwest::Client, post_url: &str, output_dir: &str) -> Result<(), Box<dyn std::error::Error>> {
+    let post_html = fetch_html(client, post_url).await?;
+    if let Some((title, content)) = extract_post_content(&post_html) {
+        let filename = format_filename(post_url);
+        let filepath = Path::new(output_dir).join(filename);
+        save_as_markdown(&filepath, &title, &content)?;
+    }
+    Ok(())
+}
+
+/// Extracts post links from the base HTML by matching anchor hrefs
+/// against the expected archived post URL pattern.
+fn extract_post_links(html: &str) -> Result<Vec<String>, Box<dyn std::error::Error>> {
+    let document = Html::parse_document(html);
+    let selector = Selector::parse("a").unwrap();
+    let mut unique_links = HashSet::new();
+
+    // Define a regex pattern to match archived URLs with a date and a title slug
+    let re = Regex::new(r"https://web\.archive\.org/web/\d+/http://angaatopzoek\.be/\d{4}/\d{2}/\d{2}/[^/]+/$")?;
+
+    for element in document.select(&selector) {
+        if let Some(href) = element.value().attr("href") {
+            // Check if the href matches the desired pattern
+            if re.is_match(href) {
+                // Further filter out URLs containing a fragment identifier
+                if !href.contains('#') {
+                    unique_links.insert(href.to_string());
+                }
+            }
+        }
+    }
+
+    Ok(unique_links.into_iter().collect())
+}
+
+/// Extracts the title and main content from a post's HTML
+fn extract_post_content(html: &str) -> Option<(String, String)> {
+    let document = Html::parse_document(html);
+
+    // Extract the title from the first <h4> tag (if present)
+    let title = document
+        .select(&Selector::parse("h4").ok()?)
+        .next()
+        .map(|node| node.text().collect::<Vec<_>>().join(" "))
+        .unwrap_or_else(|| "Untitled".to_string());
+
+    // Extract content in the natural order of tags
+    let mut content_parts = Vec::new();
+    for node in document.select(&Selector::parse("h4, p, cite").unwrap()) {
+        content_parts.push(node.text().collect::<Vec<_>>().join(" "));
+    }
+
+    // Remove the duplicate title if it was extracted from the first <h4>
+    if let Some(first_content) = content_parts.first() {
+        if *first_content == title {
+            content_parts.remove(0);
+        }
+    }
+
+    let content = content_parts.join("\n\n");
+
+    Some((title, content))
+}
+
+/// Builds a filesystem-friendly Markdown filename from the post URL path
+fn format_filename(url: &str) -> String {
+    // Extract the path after the domain
+    if let Some(path_start) = url.find("angaatopzoek.be") {
+        let path = &url[path_start + "angaatopzoek.be".len()..];
+        let sanitized_path = path
+            .trim_matches('/')
+            .replace('/', "_")
+            .replace('-', "_");
+
+        if sanitized_path.is_empty() {
+            "default_post.md".to_string()
+        } else {
+            format!("{}.md", sanitized_path)
+        }
+    } else {
+        // Fallback in case the URL doesn't match the expected format
+        "unknown_post.md".to_string()
+    }
+}
+
+/// Saves the content as a Markdown file
+fn save_as_markdown(path: &Path, title: &str, content: &str) -> std::io::Result<()> {
+    let mut file = File::create(path)?;
+
+    // Write the title as a level-1 heading
+    if !title.is_empty() {
+        writeln!(file, "# {}\n", title)?;
+    }
+
+    // Write the content
+    writeln!(file, "{}", content)?;
+
+    Ok(())
+}
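
A note on adapting the tool, outside the patch itself: the blog-specific pieces are the archived-URL regex and the hard-coded angaatopzoek.be domain in extract_post_links / format_filename, plus the "h4, p, cite" selectors in extract_post_content. The sketch below is only an illustration of those two knobs for a hypothetical target; the domain example-blog.com and the selectors h1.entry-title / div.entry-content p are made-up placeholders, not anything taken from the project.

```rust
// Standalone sketch, NOT part of the patch. Everything blog-specific here is a
// hypothetical placeholder: the domain "example-blog.com" and the selectors
// "h1.entry-title" / "div.entry-content p" stand in for whatever the target blog uses.
use regex::Regex;
use scraper::{Html, Selector};

fn main() {
    // 1. The link filter: same shape as the regex in extract_post_links,
    //    with the original domain swapped for a placeholder.
    let re = Regex::new(
        r"https://web\.archive\.org/web/\d+/http://example-blog\.com/\d{4}/\d{2}/\d{2}/[^/]+/$",
    )
    .unwrap();
    let href =
        "https://web.archive.org/web/20231202013114/http://example-blog.com/2020/01/31/hello-world/";
    assert!(re.is_match(href));

    // 2. The content selectors: same extraction approach as extract_post_content,
    //    with selectors that match the placeholder markup below.
    let html = r#"<article>
        <h1 class="entry-title">Hello world</h1>
        <div class="entry-content"><p>First paragraph.</p><p>Second paragraph.</p></div>
    </article>"#;
    let doc = Html::parse_document(html);
    let title_sel = Selector::parse("h1.entry-title").unwrap();
    let body_sel = Selector::parse("div.entry-content p").unwrap();

    let title = doc
        .select(&title_sel)
        .next()
        .map(|n| n.text().collect::<Vec<_>>().join(" "))
        .unwrap_or_else(|| "Untitled".to_string());
    let body: Vec<String> = doc
        .select(&body_sel)
        .map(|n| n.text().collect::<Vec<_>>().join(" "))
        .collect();

    println!("# {}\n\n{}", title, body.join("\n\n"));
}
```

If saved as, say, examples/adapt_check.rs in this repository, it can be run with ```cargo run --example adapt_check``` against a saved copy of the archive page's markup to confirm the pattern and selectors before pointing the real tool at a different blog.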