From 39c9810d04c64941d0b14b0169a62a90b0f889fd Mon Sep 17 00:00:00 2001 From: Gerd Zellweger Date: Mon, 6 Jan 2025 19:12:44 -0800 Subject: [PATCH] Censor URLs and bad words. Signed-off-by: Gerd Zellweger --- Cargo.lock | 60 ++++++++++++++++++++++++++++++++++++++- server/Cargo.toml | 4 ++- server/src/spreadsheet.rs | 23 +++++++++++++-- 3 files changed, 83 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c48eb2b..b949f1c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -740,6 +740,12 @@ dependencies = [ "libloading", ] +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + [[package]] name = "document-features" version = "0.2.8" @@ -1059,6 +1065,12 @@ dependencies = [ "simd-adler32", ] +[[package]] +name = "finl_unicode" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94c970b525906eb37d3940083aa65b95e481fc1857d467d13374e1d925cfc163" + [[package]] name = "flate2" version = "1.0.28" @@ -1256,7 +1268,9 @@ dependencies = [ "env_logger", "futures", "log", + "regex", "reqwest", + "rustrict", "serde", "serde_json", "tokio", @@ -1714,6 +1728,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.12.1" @@ -1790,6 +1813,12 @@ dependencies = [ "arrayvec", ] +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "lebe" 
version = "0.5.2" @@ -2720,7 +2749,7 @@ dependencies = [ "built", "cfg-if", "interpolate_name", - "itertools", + "itertools 0.12.1", "libc", "libfuzzer-sys", "log", @@ -2921,6 +2950,12 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustix" version = "0.38.31" @@ -2975,6 +3010,23 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rustrict" +version = "0.7.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a566beb65e3d86654ff5cc7091d947589ef06f4bc6ba946455c2851ab1193914" +dependencies = [ + "arrayvec", + "bitflags 1.3.2", + "doc-comment", + "finl_unicode", + "itertools 0.10.5", + "lazy_static", + "rustc-hash", + "strsim", + "unicode-normalization", +] + [[package]] name = "rustversion" version = "1.0.18" @@ -3292,6 +3344,12 @@ dependencies = [ "float-cmp", ] +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + [[package]] name = "subtle" version = "2.6.1" diff --git a/server/Cargo.toml b/server/Cargo.toml index 1af1103..31d642c 100644 --- a/server/Cargo.toml +++ b/server/Cargo.toml @@ -18,4 +18,6 @@ log = "0.4.22" env_logger = "0.11.5" chrono = "0.4.38" dashmap = "6.1.0" -tower-http = { version = "0.6.2", features = ["cors"] } \ No newline at end of file +tower-http = { version = "0.6.2", features = ["cors"] } +rustrict = "0.7.33" +regex = "1.10.2" \ No newline at end of file diff --git a/server/src/spreadsheet.rs b/server/src/spreadsheet.rs index 61f0baf..7011525 100644 --- a/server/src/spreadsheet.rs +++ b/server/src/spreadsheet.rs @@ -11,7 +11,9 @@ use 
axum::http::HeaderMap; use chrono::Utc; use futures::{sink::SinkExt, stream::StreamExt}; use log::{debug, error, trace, warn}; +use regex::Regex; use reqwest::Client; +use rustrict::Censor; use serde::{Deserialize, Serialize}; use tokio::sync::{broadcast::Receiver, mpsc, watch, RwLock}; @@ -314,6 +316,21 @@ struct UpdatePayload { ts: String, } +fn replace_domain_in_urls(input: &str, new_domain: &str) -> String { + // Regex breakdown: + // (https?://) captures the protocol (http or https) + // ([^/\s]+) captures the domain portion (everything until a slash or whitespace) + // ([^\s]*) captures the remainder of the URL (path/query/etc. until whitespace) + let url_regex = Regex::new(r"(https?://)([^/\s]+)([^\s]*)").unwrap(); + + url_regex + .replace_all(input, |caps: &regex::Captures| { + // caps[1] is the scheme+://, caps[2] is the original domain, caps[3] is the path/query + format!("{}{}{}", &caps[1], new_domain, &caps[3]) + }) + .to_string() +} + pub(crate) async fn post_handler( headers: HeaderMap, ConnectInfo(addr): ConnectInfo, @@ -338,10 +355,12 @@ pub(crate) async fn post_handler( Json(serde_json::json!({"error": "Invalid cell ID"})), ); } - let raw_value = update_request.raw_value.chars().take(64).collect::<String>(); + let user_value = update_request.raw_value.chars().take(64).collect::<String>(); + let censored_urls = replace_domain_in_urls(&user_value, "*REDACTED*"); + let censored_input = Censor::new(censored_urls.chars()).censor(); let payload = UpdatePayload { id: update_request.id, - raw_value, + raw_value: censored_input, background: update_request.background, ip: client_ip, ts: Utc::now().format("%Y-%m-%d %H:%M:%S%.3f").to_string(),