diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index dbeea187..6a94c4ab 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -1,12 +1,11 @@ use casr::util; -use libcasr::{init_ignored_frames, stacktrace::*}; +use libcasr::{cluster::*, init_ignored_frames, stacktrace::*}; use anyhow::{bail, Context, Result}; use clap::{builder::FalseyValueParser, Arg, ArgAction}; -use rayon::iter::{IndexedParallelIterator, ParallelIterator}; -use rayon::iter::{IntoParallelIterator, IntoParallelRefIterator}; +use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fs; use std::path::{Path, PathBuf}; use std::sync::RwLock; @@ -42,8 +41,8 @@ fn stacktrace(path: &Path) -> Result { /// # Return value /// /// * Number of clusters -/// * Number of valid casrep before crashiline deduplication -/// * Number of valid casrep after crashiline deduplication +/// * Number of valid casreps before crashline deduplication +/// * Number of valid casreps after crashline deduplication fn make_clusters( inpath: &Path, outpath: Option<&Path>, @@ -52,110 +51,33 @@ fn make_clusters( ) -> Result<(usize, usize, usize)> { // if outpath is "None" we consider that outpath and inpath are the same let outpath = outpath.unwrap_or(inpath); - let dir = fs::read_dir(inpath).with_context(|| format!("File: {}", inpath.display()))?; - - let casreps: Vec = dir - .map(|path| path.unwrap().path()) - .filter(|s| s.extension().is_some() && s.extension().unwrap() == "casrep") - .collect(); + let casreps = util::get_reports(inpath)?; let len = casreps.len(); if len < 2 { bail!("{} reports, nothing to cluster...", len); } - // Start thread pool. 
- let custom_pool = rayon::ThreadPoolBuilder::new() - .num_threads(jobs.min(len)) - .build() - .unwrap(); - - // Report info from casreps: (casrep, (trace, crashline)) - let mut casrep_info: RwLock> = RwLock::new(Vec::new()); - // Casreps with stacktraces, that we cannot parse - let mut badreports: RwLock> = RwLock::new(Vec::new()); - custom_pool.install(|| { - (0..len).into_par_iter().for_each(|i| { - if let Ok(report) = util::report_from_file(casreps[i].as_path()) { - if let Ok(trace) = report.filtered_stacktrace() { - casrep_info - .write() - .unwrap() - .push((casreps[i].clone(), (trace, report.crashline))); - } else { - badreports.write().unwrap().push(casreps[i].clone()); - } - } else { - badreports.write().unwrap().push(casreps[i].clone()); - } - }) - }); - let casrep_info = casrep_info.get_mut().unwrap(); - let badreports = badreports.get_mut().unwrap(); - - // Sort by casrep filename - casrep_info.sort_by(|a, b| { - a.0.file_name() - .unwrap() - .to_str() - .unwrap() - .cmp(b.0.file_name().unwrap().to_str().unwrap()) - }); - - let (casreps, (stacktraces, crashlines)): (Vec<_>, (Vec<_>, Vec<_>)) = - casrep_info.iter().cloned().unzip(); + // Get casreps with stacktraces and crashlines + let (casreps, badreports) = util::reports_from_paths(&casreps, jobs); + // Handle bad reports if !badreports.is_empty() { - fs::create_dir_all(format!("{}/clerr", &outpath.display()))?; - for report in badreports { - fs::copy( - &report, - format!( - "{}/clerr/{}", - &outpath.display(), - &report.file_name().unwrap().to_str().unwrap() - ), - )?; - } + util::save_reports( + &badreports, + format!("{}/clerr", &outpath.display()).as_str(), + )?; } - if stacktraces.len() < 2 { - bail!("{} valid reports, nothing to cluster...", stacktraces.len()); + if casreps.len() < 2 { + bail!("{} valid reports, nothing to cluster...", casreps.len()); } // Get clusters - let mut clusters = cluster_stacktraces(&stacktraces)?; - - // Cluster formation - let cluster_cnt: usize = 
*clusters.iter().max().unwrap(); - for i in 1..=cluster_cnt { - fs::create_dir_all(format!("{}/cl{}", &outpath.display(), i))?; - } - - // Init before and after dedup counters - let before_cnt = casreps.len(); - let mut after_cnt = before_cnt; + let (clusters, before, after) = Cluster::cluster_reports(&casreps, 0, dedup)?; + // Save clusters + util::save_clusters(&clusters, outpath)?; - // Get clusters with crashline deduplication - if dedup { - after_cnt = dedup_crashlines(&crashlines, &mut clusters); - } - - for i in 0..clusters.len() { - // Skip casreps with duplicate crashlines - if clusters[i] == 0 { - continue; - } - fs::copy( - &casreps[i], - format!( - "{}/cl{}/{}", - &outpath.display(), - &clusters[i], - &casreps[i].file_name().unwrap().to_str().unwrap() - ), - )?; - } - Ok((cluster_cnt, before_cnt, after_cnt)) + Ok((clusters.len(), before, after)) } /// Remove duplicate casreps @@ -354,6 +276,271 @@ fn merge_or_diff(input: &Path, output: &Path, diff: Option<&Path>) -> Result Result<(usize, usize, usize, usize, usize, usize)> { + // Get new casreps + let casreps = util::get_reports(newpath)?; + let (casreps, _) = util::reports_from_paths(&casreps, jobs); + + // Get casreps from existing clusters + let mut dirs: Vec = fs::read_dir(oldpath) + .unwrap() + .map(|path| path.unwrap().path()) + .filter(|path| { + let name = path.file_name().unwrap().to_str().unwrap(); + name.starts_with("cl") && !name.starts_with("clerr") + }) + .collect(); + dirs.sort(); + + // Max cluster number + let mut max = 0usize; + // Init clusters vector + let mut clusters: HashMap = HashMap::new(); + // Init cluster paths vector + let mut paths: HashMap = HashMap::new(); + // Get casreps from each existing cluster + for dir in &dirs { + // Get cluster + let cluster = util::load_cluster(dir, jobs)?; + // Update max cluster number + max = max.max(cluster.number); + // Add cluster path + paths.insert(cluster.number, dir); + // Fill cluster info structures + clusters.insert(cluster.number, 
cluster); + } + + // Init list of casreps, which aren't suitable for any cluster + let mut deviants: Vec = Vec::new(); + // Init added casreps counter + let mut added = 0usize; + // Init duplicates counter + let mut duplicates = 0usize; + // Init crashline duplicates counter + let mut deduplicated = 0usize; + // Try to insert each new casrep + for (casrep, (stacktrace, crashline)) in casreps { + // list of "inner" clusters for casrep + let mut inners: Vec<(usize, f64)> = Vec::new(); + // Checker if casrep is duplicate of someone else + let mut dup = false; + for cluster in clusters.values_mut() { + let relation = cluster.relation(&stacktrace); + match relation { + Relation::Dup => { + dup = true; + duplicates += 1; + break; + } + Relation::Inner(measure) => { + inners.push((cluster.number, measure)); + } + Relation::Outer => { + continue; + } + } + } + // Get cluster with min measure, a.k.a. "closest" one + let number = if dup { + continue; + } else if !inners.is_empty() { + inners.iter().min_by(|a, b| a.1.total_cmp(&b.1)).unwrap().0 + } else { + // Outer + deviants.push((casrep, (stacktrace.to_vec(), crashline.to_string()))); + continue; + }; + + // Update cluster (and dedup crashline) + if !clusters.get_mut(&number).unwrap().insert( + casrep.to_path_buf(), + stacktrace.to_vec(), + crashline.to_string(), + dedup, + ) { + deduplicated += 1; + continue; + } + + // Save casrep + added += 1; + fs::copy( + &casrep, + format!( + "{}/{}", + &paths.get(&number).unwrap().display(), + &casrep.file_name().unwrap().to_str().unwrap() + ), + )?; + } + + // Handle deviant casreps + let (result, before, after) = if !deviants.is_empty() { + // Get clusters from deviants + let (mut deviant_clusters, mut before, mut after) = + Cluster::cluster_reports(&deviants, max, dedup)?; + // Merge old and new clusters + let (moved, removed) = merge_clusters(clusters, &mut deviant_clusters, oldpath, dedup)?; + // Adjust stat + if moved != 0 || removed != 0 { + added += moved; + deduplicated += 
removed; + before = 0; // Impossible to know (proofed by @hkctkuy) + after -= moved + removed; + } + // Save deviant clusters + util::save_clusters(&deviant_clusters, oldpath)?; + (deviant_clusters.len(), before, after) + } else { + (0, 0, 0) + }; + Ok((added, duplicates, deduplicated, result, before, after)) +} + +/// Try to merge new clusters to old clusters +/// +/// # Arguments +/// +/// * `olds` - list of old clusters represented as `HashMap` of `Cluster` +/// +/// * `news` - list of new clusters represented as `HashMap` of `Cluster` +/// +/// * `dir` - out directory +/// +/// * `dedup` - deduplicate crashline, if true +/// +/// # Return value +/// +/// Number of moved to old clusters CASR reports +/// Number of removed by crashline deduplication CASR reports +fn merge_clusters( + olds: HashMap, + news: &mut HashMap, + dir: &Path, + dedup: bool, +) -> Result<(usize, usize)> { + let mut moved = 0usize; + let mut removed = 0usize; + let mut olds: Vec = olds.into_values().collect(); + olds.sort_by(|a, b| a.number.cmp(&b.number)); + for mut old in olds { + let mut merged = Vec::new(); + let mut values: Vec<&Cluster> = news.values().collect(); + values.sort_by(|a, b| a.number.cmp(&b.number)); + for new in values { + if !old.may_merge(new) { + continue; + } + // Copy casreps from new to old + for (casrep, (stacktrace, crashline)) in new.reports() { + // Update cluster (and dedup crashline) + if !old.insert( + casrep.to_path_buf(), + stacktrace.to_vec(), + crashline.to_string(), + dedup, + ) { + removed += 1; + continue; + } + // Save report + moved += 1; + fs::copy( + &casrep, + format!( + "{}/cl{}/{}", + &dir.display(), + old.number, + &casrep.file_name().unwrap().to_str().unwrap() + ), + )?; + } + // Mark merged cluster for drop + merged.push(new.number); + } + // Drop marked cluster + for number in merged { + news.remove(&number); + } + } + Ok((moved, removed)) +} + +/// Calculate silhouette coefficient +/// +/// # Arguments +/// +/// * `dir` - path to directory 
with CASR report clusters +/// +/// * `jobs` - number of jobs for calculating process +/// +/// # Return value +/// +/// Silhouette coefficient +fn calc_avg_sil(dir: &Path, jobs: usize) -> Result { + // Get cluster dirs + let mut dirs: Vec = fs::read_dir(dir) + .unwrap() + .map(|path| path.unwrap().path()) + .filter(|path| { + let name = path.file_name().unwrap().to_str().unwrap(); + name.starts_with("cl") && !name.starts_with("clerr") + }) + .collect(); + dirs.sort(); + + if dirs.len() < 2 { + bail!("{} valid cluster, nothing to calculate...", dirs.len()); + } + + // Init clusters vector + let mut clusters: Vec> = Vec::new(); + // Init casreps number counter + let mut size = 0usize; + // Get casreps from each cluster + for dir in &dirs { + // Get casreps from cluster + let casreps = util::get_reports(dir)?; + // Get stacktraces from cluster + let (casreps, _) = util::reports_from_paths(&casreps, jobs); + let (_, (stacktraces, _)): (Vec<_>, (Vec<_>, Vec<_>)) = casreps.iter().cloned().unzip(); + // Update size + size += stacktraces.len(); + // Add stacktraces + clusters.push(stacktraces); + } + if size == 0 { + bail!("{} valid reports, nothing to calculate...", size); + } + let avg_sil = avg_sil_coef(&clusters, size); + Ok(avg_sil) +} + fn main() -> Result<()> { let matches = clap::Command::new("casr-cluster") .version(clap::crate_version!()) @@ -419,6 +606,27 @@ fn main() -> Result<()> { INPUT_DIR will be added to OUTPUT_DIR.", ), ) + .arg( + Arg::new("update") + .short('u') + .long("update") + .action(ArgAction::Set) + .num_args(2) + .value_parser(clap::value_parser!(PathBuf)) + .value_names(["NEW_DIR", "OLD_DIR"]) + .help( + "Update clusters in OLD_DIR using CASR reports from NEW_DIR", + ), + ) + .arg( + Arg::new("estimate") + .short('e') + .long("estimate") + .value_name("DIR") + .action(ArgAction::Set) + .value_parser(clap::value_parser!(PathBuf)) + .help("Calculate silhouette score for clustering results"), + ) .arg( Arg::new("diff") .long("diff") @@ -503,6 
+711,32 @@ fn main() -> Result<()> { new, paths[1].display() ); + } else if matches.contains_id("update") { + let paths: Vec<&PathBuf> = matches.get_many::("update").unwrap().collect(); + + let (added, duplicates, deduplicated, result, before, after) = + update_clusters(paths[0], paths[1], jobs, dedup_crashlines)?; + println!("Number of casreps added to old clusters: {added}"); + println!("Number of duplicates: {duplicates}"); + if deduplicated != 0 { + println!("Number of casreps deduplicated by crashline: {deduplicated}"); + } + if result != 0 { + println!("Number of new clusters: {result}"); + } + // Print crashline dedup summary + if before != 0 { + println!("Number of reports before crashline deduplication in new clusters: {before}"); + } + if before != after { + println!("Number of reports after crashline deduplication in new clusters: {after}"); + } + let sil = calc_avg_sil(paths[1], jobs)?; + println!("Cluster silhouette score: {sil}"); + } else if matches.contains_id("estimate") { + let path: &PathBuf = matches.get_one::("estimate").unwrap(); + let sil = calc_avg_sil(path, jobs)?; + println!("Cluster silhouette score: {sil}"); } else if matches.contains_id("diff") { let paths: Vec<&PathBuf> = matches.get_many::("diff").unwrap().collect(); let new = merge_or_diff(paths[0], paths[1], Some(paths[2]))?; diff --git a/casr/src/util.rs b/casr/src/util.rs index 7275db62..4ef59909 100644 --- a/casr/src/util.rs +++ b/casr/src/util.rs @@ -1,6 +1,7 @@ //! Common utility functions. 
extern crate libcasr; +use libcasr::cluster::{Cluster, ReportInfo}; use libcasr::report::CrashReport; use libcasr::stacktrace::{ STACK_FRAME_FILEPATH_IGNORE_REGEXES, STACK_FRAME_FUNCTION_IGNORE_REGEXES, @@ -8,9 +9,13 @@ use libcasr::stacktrace::{ use anyhow::{bail, Context, Result}; use clap::ArgMatches; +use is_executable::IsExecutable; use log::{info, warn}; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; use simplelog::*; -use std::collections::HashSet; +use wait_timeout::ChildExt; + +use std::collections::{HashMap, HashSet}; use std::fs::{self, OpenOptions}; use std::io::Write; use std::io::{BufRead, BufReader}; @@ -19,9 +24,6 @@ use std::process::{Command, Output, Stdio}; use std::sync::RwLock; use std::time::Duration; -use is_executable::IsExecutable; -use wait_timeout::ChildExt; - /// Call casr-san with the provided options /// /// # Arguments @@ -411,3 +413,148 @@ pub fn get_path(tool: &str) -> Result { ); } } + +/// Get CASR reports from specified directory +/// +/// # Arguments +/// +/// * `dir` - directory path +/// +/// # Return value +/// +/// A vector of reports paths +pub fn get_reports(dir: &Path) -> Result> { + let dir = fs::read_dir(dir).with_context(|| format!("File: {}", dir.display()))?; + let casreps: Vec = dir + .map(|path| path.unwrap().path()) + .filter(|s| s.extension().is_some() && s.extension().unwrap() == "casrep") + .collect(); + Ok(casreps) +} + +/// Parse CASR reports from specified paths. +/// +/// # Arguments +/// +/// * `casreps` - a vector of report paths +/// +/// * `jobs` - number of jobs for parsing process +/// +/// # Return value +/// +/// * A vector of correctly parsed report info: paths, stacktraces and crashlines +/// * A vector of bad reports +pub fn reports_from_paths(casreps: &Vec, jobs: usize) -> (Vec, Vec) { + // Get len + let len = casreps.len(); + // Start thread pool. 
+ let custom_pool = rayon::ThreadPoolBuilder::new() + .num_threads(jobs.min(len)) + .build() + .unwrap(); + // Report info from casreps: (casrep, (trace, crashline)) + let mut casrep_info: RwLock> = RwLock::new(Vec::new()); + // Casreps with stacktraces, that we cannot parse + let mut badreports: RwLock> = RwLock::new(Vec::new()); + custom_pool.install(|| { + (0..len).into_par_iter().for_each(|i| { + if let Ok(report) = report_from_file(casreps[i].as_path()) { + if let Ok(trace) = report.filtered_stacktrace() { + casrep_info + .write() + .unwrap() + .push((casreps[i].clone(), (trace, report.crashline))); + } else { + badreports.write().unwrap().push(casreps[i].clone()); + } + } else { + badreports.write().unwrap().push(casreps[i].clone()); + } + }) + }); + let casrep_info = casrep_info.get_mut().unwrap(); + let badreports = badreports.get_mut().unwrap().to_vec(); + // Sort by casrep filename + casrep_info.sort_by(|a, b| { + a.0.file_name() + .unwrap() + .to_str() + .unwrap() + .cmp(b.0.file_name().unwrap().to_str().unwrap()) + }); + + (casrep_info.to_vec(), badreports) +} + +/// Get `Cluster` structure from specified directory path. 
+/// +/// # Arguments +/// +/// * `dir` - valid cluster dir path +/// +/// * `jobs` - number of jobs for parsing process +/// +/// # Return value +/// +/// `Cluster` structure +/// NOTE: Resulting cluster does not contains path info +pub fn load_cluster(dir: &Path, jobs: usize) -> Result { + // Get cluster number + let i = dir.file_name().unwrap().to_str().unwrap(); + if i.len() < 3 { + bail!("Invalid cluster path: {}", &dir.display()); + } + let i = i[2..].to_string().parse::()?; + // Get casreps from cluster + let casreps = get_reports(dir)?; + let (casreps, _) = reports_from_paths(&casreps, jobs); + let (_, (stacktraces, crashlines)): (Vec<_>, (Vec<_>, Vec<_>)) = + casreps.iter().cloned().unzip(); + // Create cluster + Ok(Cluster::new(i, Vec::new(), stacktraces, crashlines)) +} + +/// Save clusters to given directory +/// +/// # Arguments +/// +/// * `clusters` - given `Cluster` structures for saving +/// +/// * `dir` - out directory +pub fn save_clusters(clusters: &HashMap, dir: &Path) -> Result<()> { + for cluster in clusters.values() { + fs::create_dir_all(format!("{}/cl{}", &dir.display(), cluster.number))?; + for casrep in cluster.paths() { + fs::copy( + casrep, + format!( + "{}/cl{}/{}", + &dir.display(), + cluster.number, + &casrep.file_name().unwrap().to_str().unwrap() + ), + )?; + } + } + Ok(()) +} + +/// Save CASR reports to given directory +/// +/// # Arguments +/// +/// * `reports` - A vector of CASR reports +/// +/// * `dir` - out directory +pub fn save_reports(reports: &Vec, dir: &str) -> Result<()> { + if !Path::new(&dir).exists() { + fs::create_dir_all(dir)?; + } + for report in reports { + fs::copy( + report, + format!("{}/{}", dir, &report.file_name().unwrap().to_str().unwrap()), + )?; + } + Ok(()) +} diff --git a/casr/tests/casr_tests/casrep/test_clustering_small/40.casrep b/casr/tests/casr_tests/casrep/test_clustering_small/40.casrep new file mode 100644 index 00000000..ea43e532 --- /dev/null +++ 
b/casr/tests/casr_tests/casrep/test_clustering_small/40.casrep @@ -0,0 +1,87 @@ +{ + "Date": "2021-07-14T19:56:09.276635+03:00", + "Uname": "Linux titanfall 5.8.0-59-generic #66~20.04.1-Ubuntu SMP Thu Jun 17 11:14:10 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux", + "OS": "Ubuntu", + "OSRelease": "20.04", + "Architecture": "amd64", + "ExecutablePath": "/usr/local/bin/tiff2pdf", + "ProcCmdline": "tiff2pdf ./fuz3tiff2pdf/main/crashes/id:000009,sig:06,src:000040+000049,time:43718,op:splice,rep:4", + "ProcMaps": [ + " 0x555555554000 0x555555556000 0x2000 0x0 /usr/local/bin/tiff2pdf", + " 0x555555556000 0x555555561000 0xb000 0x2000 /usr/local/bin/tiff2pdf", + " 0x555555561000 0x555555565000 0x4000 0xd000 /usr/local/bin/tiff2pdf", + " 0x555555565000 0x555555566000 0x1000 0x10000 /usr/local/bin/tiff2pdf", + " 0x555555566000 0x555555567000 0x1000 0x11000 /usr/local/bin/tiff2pdf", + " 0x555555567000 0x555555588000 0x21000 0x0 [heap]", + " 0x7ffff7945000 0x7ffff7949000 0x4000 0x0 ", + " 0x7ffff7949000 0x7ffff7958000 0xf000 0x0 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff7958000 0x7ffff79ff000 0xa7000 0xf000 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff79ff000 0x7ffff7a96000 0x97000 0xb6000 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff7a96000 0x7ffff7a97000 0x1000 0x14c000 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff7a97000 0x7ffff7a98000 0x1000 0x14d000 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff7a98000 0x7ffff7a9a000 0x2000 0x0 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7a9a000 0x7ffff7aab000 0x11000 0x2000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7aab000 0x7ffff7ab1000 0x6000 0x13000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7ab1000 0x7ffff7ab2000 0x1000 0x19000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7ab2000 0x7ffff7ab3000 0x1000 0x19000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7ab3000 0x7ffff7ab4000 0x1000 0x1a000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7ab4000 
0x7ffff7ab8000 0x4000 0x0 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7ab8000 0x7ffff7afc000 0x44000 0x4000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7afc000 0x7ffff7b36000 0x3a000 0x48000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7b36000 0x7ffff7b37000 0x1000 0x82000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7b37000 0x7ffff7b38000 0x1000 0x82000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7b38000 0x7ffff7b39000 0x1000 0x83000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7b39000 0x7ffff7b44000 0xb000 0x0 /usr/lib/x86_64-linux-gnu/libjbig.so.0", + " 0x7ffff7b44000 0x7ffff7d43000 0x1ff000 0xb000 /usr/lib/x86_64-linux-gnu/libjbig.so.0", + " 0x7ffff7d43000 0x7ffff7d44000 0x1000 0xa000 /usr/lib/x86_64-linux-gnu/libjbig.so.0", + " 0x7ffff7d44000 0x7ffff7d47000 0x3000 0xb000 /usr/lib/x86_64-linux-gnu/libjbig.so.0", + " 0x7ffff7d47000 0x7ffff7d6c000 0x25000 0x0 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7d6c000 0x7ffff7ee4000 0x178000 0x25000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7ee4000 0x7ffff7f2e000 0x4a000 0x19d000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7f2e000 0x7ffff7f2f000 0x1000 0x1e7000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7f2f000 0x7ffff7f32000 0x3000 0x1e7000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7f32000 0x7ffff7f35000 0x3000 0x1ea000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7f35000 0x7ffff7f39000 0x4000 0x0 ", + " 0x7ffff7f39000 0x7ffff7f41000 0x8000 0x0 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7f41000 0x7ffff7f76000 0x35000 0x8000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7f76000 0x7ffff7f9f000 0x29000 0x3d000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7f9f000 0x7ffff7fa0000 0x1000 0x66000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7fa0000 0x7ffff7fa2000 0x2000 0x66000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7fa2000 0x7ffff7fa3000 0x1000 0x68000 /usr/local/lib/libtiff.so.3.9.6", + " 
0x7ffff7fa3000 0x7ffff7fa5000 0x2000 0x0 ", + " 0x7ffff7fc8000 0x7ffff7fc9000 0x1000 0x0 ", + " 0x7ffff7fc9000 0x7ffff7fcd000 0x4000 0x0 [vvar]", + " 0x7ffff7fcd000 0x7ffff7fcf000 0x2000 0x0 [vdso]", + " 0x7ffff7fcf000 0x7ffff7fd0000 0x1000 0x0 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7fd0000 0x7ffff7ff3000 0x23000 0x1000 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7ff3000 0x7ffff7ffb000 0x8000 0x24000 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7ffb000 0x7ffff7ffc000 0x1000 0x0 /home/avgor46/testdoc/fuz3tiff2pdf/main/crashes/id:000009,sig:06,src:000040+000049,time:43718,op:splice,rep:4", + " 0x7ffff7ffc000 0x7ffff7ffd000 0x1000 0x2c000 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7ffd000 0x7ffff7ffe000 0x1000 0x2d000 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7ffe000 0x7ffff7fff000 0x1000 0x0 ", + " 0x7ffffffde000 0x7ffffffff000 0x21000 0x0 [stack]", + " 0xffffffffff600000 0xffffffffff601000 0x1000 0x0 [vsyscall]" + ], + "CrashSeverity": { + "Type": "NOT_CRITICAL", + "ShortDescription": "SafeFunctionCheck", + "Description": "Buffer overflow in safe function", + "Explanation": "The target stopped while handling a signal that was generated by libc due to detection of buffer overflow in safe copy function." 
+ }, + "Stacktrace": [ + "#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50", + "#1 0x00007ffff7d6c859 in __GI_abort () at abort.c:79", + "#2 0x00007ffff7dd73ee in __libc_message (action=action@entry=do_abort, fmt=fmt@entry=0x7ffff7f0107c \"*** %s ***: terminated\\n\") at ../sysdeps/posix/libc_fatal.c:155", + "#3 0x00007ffff7e79b4a in __GI___fortify_fail (msg=msg@entry=0x7ffff7f01012 \"buffer overflow detected\") at fortify_fail.c:26", + "#4 0x00007ffff7e783e6 in __GI___chk_fail () at chk_fail.c:28", + "#5 0x00007ffff7dcf1cf in _IO_str_chk_overflow (fp=, c=) at iovsprintf.c:35", + "#6 0x00007ffff7da7db0 in __GI___printf_fp_l (fp=, loc=, info=, args=) at printf_fp.c:1246", + "#7 0x00007ffff7dc163a in __vfprintf_internal (s=s@entry=0x7fffffffe070, format=format@entry=0x5555555613df \"%.4f\", ap=ap@entry=0x7fffffffe1b0, mode_flags=mode_flags@entry=6) at vfprintf-internal.c:1687", + "#8 0x00007ffff7dcf279 in __vsprintf_internal (string=0x7fffffffe2a0 \"79725330432.000\", maxlen=, format=0x5555555613df \"%.4f\", args=args@entry=0x7fffffffe1b0, mode_flags=6) at iovsprintf.c:95", + "#9 0x00007ffff7e77edb in ___sprintf_chk (s=, flag=, slen=, format=) at sprintf_chk.c:40", + "#10 0x000055555555c7a1 in sprintf (__fmt=0x5555555613df \"%.4f\", __s=0x7fffffffe2a0 \"79725330432.000\") at /usr/include/x86_64-linux-gnu/bits/stdio2.h:36", + "#12 0x00005555555601b8 in t2p_write_pdf (output=0x555555568f80, input=0x555555567ea0, t2p=0x5555555672a0) at tiff2pdf.c:5175", + "#13 t2p_write_pdf (t2p=0x5555555672a0, input=0x555555567ea0, output=0x555555568f80) at tiff2pdf.c:5133", + "#14 0x00005555555568d4 in main (argc=, argv=) at tiff2pdf.c:763" + ] +} diff --git a/casr/tests/tests.rs b/casr/tests/tests.rs index 5ba7a27d..6e0a9885 100644 --- a/casr/tests/tests.rs +++ b/casr/tests/tests.rs @@ -2427,7 +2427,7 @@ fn test_casr_cluster_c() { .parse::() .unwrap(); - assert_eq!(before_cnt, 11, "Before count mismatch."); + assert_eq!(before_cnt, 12, "Before count 
mismatch."); let re = Regex::new(r"Number of reports after crashline deduplication: (?P\d+)").unwrap(); @@ -2440,15 +2440,16 @@ fn test_casr_cluster_c() { .parse::() .unwrap(); - assert_eq!(after_cnt, 10, "After count mismatch."); + assert_eq!(after_cnt, 11, "After count mismatch."); // 2.casrep and 20.caserp without crashlines => no dedup // 3.casrep and 30.caserp with crashlines => dedup - // Thus, cluster (cl8) with 2.casrep has 2 casreps and others have 1 casrep + // Thus, cluster (cl7) with 2.casrep has 2 casreps and cl9 too + // But others have 1 casrep for i in 1..clusters_cnt + 1 { let cluster_path = paths[1].to_owned() + "/cl" + &i.to_string(); let size = std::fs::read_dir(cluster_path.clone()).unwrap().count(); - let num = if i == 8 { 2 } else { 1 }; + let num = if i == 7 || i == 9 { 2 } else { 1 }; assert_eq!(size, num); } @@ -2670,6 +2671,165 @@ fn test_casr_cluster_d_and_m() { ); } +#[test] +fn test_casr_cluster_u() { + let paths = [ + abs_path("tests/casr_tests/casrep/test_clustering_small"), + abs_path("tests/tmp_tests_casr/clustering_out"), + abs_path("tests/tmp_tests_casr/clustering_out/cl7/20.casrep"), + abs_path("tests/tmp_tests_casr/clustering_out/cl8"), + abs_path("tests/tmp_tests_casr/clustering_out/cl9"), + abs_path("tests/tmp_tests_casr/clustering_out/cl9/40.casrep"), + ]; + + let _ = fs::remove_dir_all(&paths[1]); + + let output = Command::new(*EXE_CASR_CLUSTER.read().unwrap()) + .args(["-c", &paths[0], &paths[1]]) + .env("CASR_CLUSTER_UNIQUE_CRASHLINE", "1") + .output() + .expect("failed to start casr-cluster"); + + assert!( + output.status.success(), + "Stdout {}.\n Stderr: {}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + + let res = String::from_utf8_lossy(&output.stdout); + + assert!(!res.is_empty()); + + let re = Regex::new(r"Number of clusters: (?P\d+)").unwrap(); + let clusters_cnt = re + .captures(&res) + .unwrap() + .name("clusters") + .map(|x| x.as_str()) + .unwrap() + .parse::() + 
.unwrap(); + + assert_eq!(clusters_cnt, 9, "Clusters count mismatch."); + + let _ = std::fs::remove_file(&paths[2]); + let _ = std::fs::remove_file(&paths[5]); + let _ = std::fs::remove_dir_all(&paths[3]); + let _ = std::fs::rename(&paths[4], &paths[3]); + + let output = Command::new(*EXE_CASR_CLUSTER.read().unwrap()) + .args(["-u", &paths[0], &paths[1]]) + .env("CASR_CLUSTER_UNIQUE_CRASHLINE", "1") + .output() + .expect("failed to start casr-cluster"); + + assert!( + output.status.success(), + "Stdout {}.\n Stderr: {}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + + let res = String::from_utf8_lossy(&output.stdout); + + assert!(!res.is_empty()); + + let re = Regex::new(r"Number of casreps added to old clusters: (?P\d+)").unwrap(); + let added_cnt = re + .captures(&res) + .unwrap() + .name("added") + .map(|x| x.as_str()) + .unwrap() + .parse::() + .unwrap(); + + assert_eq!(added_cnt, 1, "Added count mismatch."); + + let re = Regex::new(r"Number of duplicates: (?P\d+)").unwrap(); + let duplicates_cnt = re + .captures(&res) + .unwrap() + .name("duplicates") + .map(|x| x.as_str()) + .unwrap() + .parse::() + .unwrap(); + + assert_eq!(duplicates_cnt, 9, "Duplicates count mismatch."); + + let re = Regex::new(r"Number of new clusters: (?P\d+)").unwrap(); + let clusters_cnt = re + .captures(&res) + .unwrap() + .name("clusters") + .map(|x| x.as_str()) + .unwrap() + .parse::() + .unwrap(); + + assert_eq!(clusters_cnt, 1, "Clusters count mismatch."); + + let re = Regex::new( + r"Number of reports after crashline deduplication in new clusters: (?P\d+)", + ) + .unwrap(); + let after_cnt = re + .captures(&res) + .unwrap() + .name("after") + .map(|x| x.as_str()) + .unwrap() + .parse::() + .unwrap(); + + assert_eq!(after_cnt, 1, "After count mismatch."); + + let re = Regex::new(r"Cluster silhouette score: (?P\d+.\d+)").unwrap(); + let sil = re + .captures(&res) + .unwrap() + .name("sil") + .map(|x| x.as_str()) + .unwrap() + .parse::() 
+ .unwrap(); + + assert_eq!(sil, 0.15436556855344655, "Silhouette score mismatch."); + + // Test estimation + let output = Command::new(*EXE_CASR_CLUSTER.read().unwrap()) + .args(["-e", &paths[1]]) + .output() + .expect("failed to start casr-cluster"); + + assert!( + output.status.success(), + "Stdout {}.\n Stderr: {}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + + let res = String::from_utf8_lossy(&output.stdout); + + assert!(!res.is_empty()); + + let re = Regex::new(r"Cluster silhouette score: (?P\d+.\d+)").unwrap(); + let sil = re + .captures(&res) + .unwrap() + .name("sil") + .map(|x| x.as_str()) + .unwrap() + .parse::() + .unwrap(); + + assert_eq!(sil, 0.15436556855344655, "Silhouette score mismatch."); + + let _ = std::fs::remove_dir_all(&paths[1]); +} + #[test] #[cfg(target_arch = "x86_64")] fn test_casr_san() { diff --git a/docs/usage.md b/docs/usage.md index db1e7722..ac816b0d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -261,6 +261,10 @@ Tool for clustering CASR reports -m, --merge Merge INPUT_DIR into OUTPUT_DIR. Only new CASR reports from INPUT_DIR will be added to OUTPUT_DIR. + -u, --update + Update clusters in OLD_DIR using CASR reports from NEW_DIR + -e, --estimate + Calculate silhouette score for clustering results --diff Compute report sets difference NEW_DIR \ PREV_DIR. Copy new CASR reports from NEW_DIR into DIFF_DIR. @@ -322,6 +326,15 @@ After clustering result directory will have the following structure: Similar CASR reports are inside one cluster. +Report accumulation is based on stack trace comparison, recognition similar +stack traces and clustering with merging different ones. 
+ +Example: + + $ casr-cluster -c casr/tests/casr_tests/casrep/test_clustering_small out + $ rm -f out/cl9/40.casrep out/cl7/20.casrep && rm -rf out/cl8 && mv out/cl9 out/cl8 + $ casr-cluster -u casr/tests/casr_tests/casrep/test_clustering_small out + For the **--ignore ** option, file format should be as follows: FUNCTIONS diff --git a/libcasr/src/cluster.rs b/libcasr/src/cluster.rs new file mode 100644 index 00000000..7d8e23f6 --- /dev/null +++ b/libcasr/src/cluster.rs @@ -0,0 +1,343 @@ +//! Provides API's for cluster manipulating. +use crate::error::*; +use crate::stacktrace::*; + +use core::f64::MAX; +use std::collections::HashMap; +use std::path::PathBuf; + +/// Represents the information about CASR report: path, stacktrace and crashline +pub type ReportInfo = (PathBuf, (Stacktrace, String)); + +/// Relation between a CASR report and a cluster +pub enum Relation { + /// The CASR report is a duplicate of one from cluster + Dup, + /// The CASR report is "inside" the cluster with some proximity measure + Inner(f64), + /// The CASR report is "outside" the cluster + Outer, +} + +/// Structure provides an abstraction for cluster with CASR reports +pub struct Cluster { + /// Cluster number + pub number: usize, + /// Cluster report paths + paths: Vec, + /// Cluster report stacktraces + stacktraces: Vec, + /// Cluster diameter + diam: Option, + /// Cluster report crashlines + crashlines: HashMap, +} + +impl Cluster { + /// Create new `Cluster` + pub fn new( + number: usize, + paths: Vec, + stacktraces: Vec, + crashlines: Vec, + ) -> Self { + let mut unique_crashlines: HashMap = HashMap::new(); + for (i, crashline) in crashlines.into_iter().enumerate() { + if unique_crashlines.contains_key(&crashline) { + continue; + } + unique_crashlines.insert(crashline, i); + } + Cluster { + number, + paths, + stacktraces, + diam: None, + crashlines: unique_crashlines, + } + } + /// Get CASR report paths + pub fn paths(&self) -> &Vec { + &self.paths + } + /// Get CASR report 
stacktraces
+    pub fn stacktraces(&self) -> &Vec<Stacktrace> {
+        &self.stacktraces
+    }
+    /// Perform CASR reports clustering
+    ///
+    /// # Arguments
+    ///
+    /// * `reports` - slice of `ReportInfo`
+    ///
+    /// * `offset` - cluster enumerate offset
+    ///
+    /// * `dedup` - deduplicate crashline, if true
+    ///
+    /// # Return value
+    ///
+    /// * `HashMap` of `Cluster` with cluster number as key
+    /// * Number of valid casreps before crashline deduplication
+    /// * Number of valid casreps after crashline deduplication
+    pub fn cluster_reports(
+        reports: &[ReportInfo],
+        offset: usize,
+        dedup: bool,
+    ) -> Result<(HashMap<usize, Cluster>, usize, usize)> {
+        // Unzip casrep info
+        let (casreps, (stacktraces, crashlines)): (Vec<_>, (Vec<_>, Vec<_>)) =
+            reports.iter().cloned().unzip();
+        let len = casreps.len();
+        // Get stacktraces cluster numbers
+        let mut numbers = cluster_stacktraces(&stacktraces)?;
+        // Deduplicate by crashline
+        let after = if dedup {
+            dedup_crashlines(&crashlines, &mut numbers)
+        } else {
+            len
+        };
+        // Create clusters
+        let mut clusters: HashMap<usize, Cluster> = HashMap::new();
+        for i in 0..len {
+            if numbers[i] == 0 {
+                // Skip casreps with duplicate crashlines
+                continue;
+            }
+            let number = numbers[i] + offset;
+            // Add new cluster if not exists
+            clusters
+                .entry(number)
+                .or_insert_with(|| Cluster::new(number, Vec::new(), Vec::new(), Vec::new()));
+            // Update cluster
+            clusters.get_mut(&number).unwrap().insert(
+                casreps[i].to_path_buf(),
+                stacktraces[i].to_vec(),
+                crashlines[i].to_string(),
+                dedup,
+            );
+        }
+        Ok((clusters, len, after))
+    }
+    /// Add new CASR report to cluster
+    ///
+    /// # Arguments
+    ///
+    /// * `path` - new CASR report path
+    ///
+    /// * `stacktrace` - new CASR report stacktrace
+    ///
+    /// * `crashline` - new CASR report crashline
+    ///
+    /// * `dedup` - deduplicate crashline, if true
+    ///
+    /// # Return value
+    ///
+    /// `true` if new CASR report may be added,
+    /// `false` if report is duplicate of someone else
+    pub fn insert(
+        &mut self,
+        path: PathBuf,
+        stacktrace: Stacktrace,
+        crashline: 
String,
+        dedup: bool,
+    ) -> bool {
+        if dedup && !crashline.is_empty() && self.crashlines.contains_key(&crashline) {
+            return false;
+        }
+        self.paths.push(path);
+        self.stacktraces.push(stacktrace);
+        self.diam = None;
+        self.crashlines.insert(crashline, self.paths.len() - 1);
+        true
+    }
+    /// Get cluster diameter
+    pub fn diam(&mut self) -> f64 {
+        if self.diam.is_none() {
+            self.diam = Some(diam(&self.stacktraces));
+        }
+        self.diam.unwrap()
+    }
+    /// Get "relation" between new report and specified cluster
+    ///
+    /// # Arguments
+    ///
+    /// * `new` - new report stacktrace
+    ///
+    /// # Return value
+    ///
+    /// `Relation` enum with proximity measure according to specified strategy
+    pub fn relation(&mut self, new: &Stacktrace) -> Relation {
+        let diam = self.diam();
+        let mut max = 0f64;
+        for stacktrace in self.stacktraces() {
+            let dist = 1.0 - similarity(new, stacktrace);
+            if dist == 0.0 {
+                return Relation::Dup;
+            } else if dist > THRESHOLD {
+                return Relation::Outer;
+            }
+            if dist > max {
+                max = dist;
+            }
+        }
+        if diam >= max {
+            // Inner
+            Relation::Inner(diam)
+        } else {
+            // Outer
+            Relation::Outer
+        }
+    }
+    /// Check if cluster may be merged with another one
+    pub fn may_merge(&self, cluster: &Cluster) -> bool {
+        let mut stacktraces1 = self.stacktraces.clone();
+        let mut stacktraces2 = cluster.stacktraces().clone();
+        stacktraces1.append(&mut stacktraces2);
+        diam(&stacktraces1) < THRESHOLD
+    }
+    /// Convert cluster to vector of reports
+    pub fn reports(&self) -> Vec<ReportInfo> {
+        let mut reports: Vec<ReportInfo> = Vec::new();
+        let mut crashlines = self.crashlines.clone();
+        for (i, path) in self.paths.iter().enumerate() {
+            // Get crashline for cur casrep
+            let mut crashline = String::new();
+            for (line, &number) in &crashlines {
+                if number == i {
+                    crashline = line.to_string();
+                    break;
+                }
+            }
+            // Drop cur crashline from crashlines
+            crashlines.remove(&crashline);
+            // Update results
+            reports.push((path.clone(), (self.stacktraces[i].clone(), crashline)));
+        }
+        
reports
+    }
+}
+
+/// Get diameter of specified cluster
+///
+/// # Arguments
+///
+/// * `stacktraces` - cluster represented as slice of `Stacktrace` structures
+///
+/// # Return value
+///
+/// Value of diameter
+fn diam(stacktraces: &[Stacktrace]) -> f64 {
+    let mut diam = 0f64;
+    let len = stacktraces.len();
+    for i in 0..len {
+        for j in i + 1..len {
+            let dist = 1.0 - similarity(&stacktraces[i], &stacktraces[j]);
+            if dist > diam {
+                diam = dist;
+            }
+        }
+    }
+    diam
+}
+
+/// Get "a" subcoefficient of silhouette coefficient for given stacktrace
+/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition
+///
+/// # Arguments
+///
+/// * `num` - given stacktrace number
+///
+/// * `stacktraces` - cluster represented as slice of `Stacktrace` structures
+///
+/// # Return value
+///
+/// "a" subcoefficient of silhouette coefficient
+fn sil_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 {
+    let mut sum = 0f64;
+    for (i, stacktrace) in stacktraces.iter().enumerate() {
+        if i == num {
+            continue;
+        }
+        sum += 1.0 - similarity(&stacktraces[num], stacktrace);
+    }
+    sum / (stacktraces.len() - 1) as f64
+}
+
+/// Get "b" subcoefficient of silhouette coefficient for given stacktrace
+/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition
+///
+/// # Arguments
+///
+/// * `num` - given stacktrace number
+///
+/// * `i` - cluster number of given stacktrace
+///
+/// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures
+///
+/// # Return value
+///
+/// "b" subcoefficient of silhouette coefficient
+fn sil_subcoef_b(num: usize, i: usize, clusters: &[Vec<Stacktrace>]) -> f64 {
+    let mut min = MAX;
+    for (j, cluster) in clusters.iter().enumerate() {
+        if j == i {
+            continue;
+        }
+        let mut sum = 0f64;
+        for stacktrace in cluster {
+            sum += 1.0 - similarity(&clusters[i][num], stacktrace);
+        }
+        let res = sum / cluster.len() as f64;
+        if res < min {
+            min = res;
+        }
+    }
+    min
+}
+
+/// Get 
silhouette coefficient for given stacktrace
+/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition
+///
+/// # Arguments
+///
+/// * `num` - given stacktrace number
+///
+/// * `i` - cluster number of given stacktrace
+///
+/// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures
+///
+/// # Return value
+///
+/// Silhouette coefficient
+fn sil_coef(num: usize, i: usize, clusters: &[Vec<Stacktrace>]) -> f64 {
+    if clusters[i].len() != 1 {
+        let a = sil_subcoef_a(num, &clusters[i]);
+        let b = sil_subcoef_b(num, i, clusters);
+        (b - a) / a.max(b)
+    } else {
+        0f64
+    }
+}
+
+/// Get average silhouette coefficient for given stacktraces
+/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition
+///
+/// # Arguments
+///
+/// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures
+///
+/// * `size` - total amount of elements in clusters
+///
+/// # Return value
+///
+/// Average silhouette coefficient
+pub fn avg_sil_coef(clusters: &[Vec<Stacktrace>], size: usize) -> f64 {
+    // Init sil sum
+    let mut sum = 0f64;
+    // Calculate silhouette coefficient for each casrep
+    for i in 0..clusters.len() {
+        for num in 0..clusters[i].len() {
+            let sil = sil_coef(num, i, clusters);
+            sum += sil;
+        }
+    }
+    sum / size as f64
+}
diff --git a/libcasr/src/lib.rs b/libcasr/src/lib.rs
index 2284fb57..557d7713 100644
--- a/libcasr/src/lib.rs
+++ b/libcasr/src/lib.rs
@@ -22,6 +22,7 @@
 //! collected from gdb. To save crash reports as json (.casrep/.sarif) use `serde` feature.
 
 pub mod asan;
+pub mod cluster;
 pub mod constants;
 pub mod cpp;
 pub mod error;
diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs
index 26bfb8a9..b60aae05 100644
--- a/libcasr/src/stacktrace.rs
+++ b/libcasr/src/stacktrace.rs
@@ -34,6 +34,9 @@ lazy_static::lazy_static! 
{
     Vec::new());
 }
 
+/// Threshold for clusters diameter
+pub const THRESHOLD: f64 = 0.3;
+
 /// This macro updates variables used to remove trusted functions from stack trace
 #[macro_export]
 macro_rules! init_ignored_frames {
@@ -215,15 +218,12 @@ pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result<Vec<usize>> {
     // at the beginning every node is in its own cluster
     let mut clusters = (0..len).map(|x| (x, vec![x])).collect::<HashMap<usize, Vec<usize>>>();
 
-    // Set threshold
-    let distance = 0.3;
-
     // Counter for new clusters, which are formed as unions of previous ones
     let mut counter = len;
 
     for step in dendrogram.steps() {
         // Break if threshold is reached
-        if step.dissimilarity >= distance {
+        if step.dissimilarity >= THRESHOLD {
             break;
         }
@@ -247,7 +247,8 @@ pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result<Vec<usize>> {
     let mut flat_clusters = vec![0; len];
     for (i, (_, nums)) in clusters.into_iter().enumerate() {
         for num in nums {
-            flat_clusters[num] = i + 1; // Number clusters from 1, not 0
+            // NOTE: Clusters enumerate from 1, not 0
+            flat_clusters[num] = i + 1;
         }
     }