From 05c48c44d5d9b41c7a4a9cc0cf57b4139a59bf12 Mon Sep 17 00:00:00 2001
From: hkctkuy
Date: Thu, 30 Nov 2023 15:20:58 +0300
Subject: [PATCH 01/34] [casr-cluster] Add update cluster poc

---
 casr/src/bin/casr-cluster.rs | 227 +++++++++++++++++++++++++++--------
 casr/src/util.rs             |  92 +++++++++++++-
 libcasr/src/stacktrace.rs    | 126 ++++++++++++++++++-
 3 files changed, 385 insertions(+), 60 deletions(-)

diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs
index de74990b..9d0045e6 100644
--- a/casr/src/bin/casr-cluster.rs
+++ b/casr/src/bin/casr-cluster.rs
@@ -3,8 +3,7 @@ use libcasr::{init_ignored_frames, stacktrace::*};
 
 use anyhow::{bail, Context, Result};
 use clap::{builder::FalseyValueParser, Arg, ArgAction};
-use rayon::iter::{IndexedParallelIterator, ParallelIterator};
-use rayon::iter::{IntoParallelIterator, IntoParallelRefIterator};
+use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
 
 use std::collections::HashSet;
 use std::fs;
@@ -39,6 +38,8 @@ fn stacktrace(path: &Path) -> Result<Stacktrace> {
 ///
 /// * `dedup` - deduplicate casrep by crashline for each cluster, if true
 ///
+/// * `offset` - cluster numbering offset
+///
 /// # Return value
 ///
 /// * Number of clusters
@@ -49,60 +50,18 @@ fn make_clusters(
     inpath: &Path,
     outpath: Option<&Path>,
     jobs: usize,
     dedup: bool,
+    offset: usize,
 ) -> Result<(usize, usize, usize)> {
     // if outpath is "None" we consider that outpath and inpath are the same
     let outpath = outpath.unwrap_or(inpath);
 
-    let dir = fs::read_dir(inpath).with_context(|| format!("File: {}", inpath.display()))?;
-
-    let casreps: Vec<PathBuf> = dir
-        .map(|path| path.unwrap().path())
-        .filter(|s| s.extension().is_some() && s.extension().unwrap() == "casrep")
-        .collect();
+    let casreps = util::get_reports(inpath)?;
 
     let len = casreps.len();
     if len < 2 {
         bail!("{} reports, nothing to cluster...", len);
     }
 
-    // Start thread pool.
- let custom_pool = rayon::ThreadPoolBuilder::new() - .num_threads(jobs.min(len)) - .build() - .unwrap(); - - // Report info from casreps: (casrep, (trace, crashline)) - let mut casrep_info: RwLock> = RwLock::new(Vec::new()); - // Casreps with stacktraces, that we cannot parse - let mut badreports: RwLock> = RwLock::new(Vec::new()); - custom_pool.install(|| { - (0..len).into_par_iter().for_each(|i| { - if let Ok(report) = util::report_from_file(casreps[i].as_path()) { - if let Ok(trace) = report.filtered_stacktrace() { - casrep_info - .write() - .unwrap() - .push((casreps[i].clone(), (trace, report.crashline))); - } else { - badreports.write().unwrap().push(casreps[i].clone()); - } - } else { - badreports.write().unwrap().push(casreps[i].clone()); - } - }) - }); - let casrep_info = casrep_info.get_mut().unwrap(); - let badreports = badreports.get_mut().unwrap(); - - // Sort by casrep filename - casrep_info.sort_by(|a, b| { - a.0.file_name() - .unwrap() - .to_str() - .unwrap() - .cmp(b.0.file_name().unwrap().to_str().unwrap()) - }); - - let (casreps, (stacktraces, crashlines)): (Vec<_>, (Vec<_>, Vec<_>)) = - casrep_info.iter().cloned().unzip(); + // Get casreps with stacktraces and crashlines + let (casreps, stacktraces, crashlines, badreports) = util::reports_from_dirs(casreps, jobs); if !badreports.is_empty() { fs::create_dir_all(format!("{}/clerr", &outpath.display()))?; @@ -128,7 +87,7 @@ fn make_clusters( // Cluster formation let cluster_cnt: usize = *clusters.iter().max().unwrap(); for i in 1..=cluster_cnt { - fs::create_dir_all(format!("{}/cl{}", &outpath.display(), i))?; + fs::create_dir_all(format!("{}/cl{}", &outpath.display(), i + offset))?; } // Init before and after dedup counters @@ -150,7 +109,7 @@ fn make_clusters( format!( "{}/cl{}/{}", &outpath.display(), - &clusters[i], + clusters[i] + offset, &casreps[i].file_name().unwrap().to_str().unwrap() ), )?; @@ -343,6 +302,157 @@ fn merge_dirs(input: &Path, output: &Path) -> Result { Ok(new) } +/// Perform the clustering of casreps +/// +/// # Arguments +/// +/// * `newpath` - path to directory with new CASR reports +/// +/// * `oldpath` - target directory for exiting clusters +/// +/// * `jobs` - number of jobs for cluster updating process +/// +/// * `dedup` - deduplicate casrep by crashline for each cluster, if true +/// +fn update_clusters(newpath: &Path, oldpath: &Path, jobs: usize, dedup: bool) -> Result<()> { + // Get new casreps + let casreps = util::get_reports(newpath)?; + let (casreps, stacktraces, crashlines, _) = util::reports_from_dirs(casreps, jobs); + let casreps = casreps + .iter() + .zip(stacktraces.iter().zip(crashlines.iter())); + + // Get casreps from existing clusters + let cluster_dirs: Vec = fs::read_dir(oldpath) + .unwrap() + .map(|path| path.unwrap().path()) + .filter(|path| { + path.clone() + .file_name() + .unwrap() + .to_str() + .unwrap() + .starts_with("cl") + }) + .collect(); + let len = cluster_dirs.len(); + // Init clusters vector + let mut clusters: Vec = Vec::new(); + // Init dedup crashline list for each cluster + let mut unique_crashlines: Vec> = vec![HashSet::new(); len]; + // Get casreps from each existing cluster + for cluster in &cluster_dirs { + // Get cluster number + let i = cluster.clone().file_name().unwrap().to_str().unwrap()[2..] 
+ .to_string() + .parse::() + .unwrap(); + // Get casreps from cluster + let casreps = util::get_reports(cluster)?; + let (_, stacktraces, crashlines, _) = util::reports_from_dirs(casreps, jobs); + // Fill cluster info structures + let diam = diam(&stacktraces); + clusters.push(Cluster { + number: i, + stacktraces, + diam, + }); + if dedup { + for crashline in crashlines { + // Note: Clusters enumerate from 1, not 0 + unique_crashlines[i - 1].insert(crashline); + } + } + } + + // Init list of casreps, which aren't suitable for any cluster + let mut deviants = Vec::<&PathBuf>::new(); + + // Try to insert each new casrep + for (casrep, (stacktrace, crashline)) in casreps { + // list of "inner" clusters for casrep + let mut inners: Vec<(usize, f64)> = Vec::new(); + // list of "outer" clusters for casrep + let mut outers: Vec<(usize, f64)> = Vec::new(); + // Checker if casrep is duplicate of someone else + let mut dup = false; + for cluster in &clusters { + // TODO: Add strategy options + let relation = relation( + stacktrace, + cluster, + AccumStrategy::Dist, + AccumStrategy::Dist, + ); + match relation { + Relation::Dup => { + dup = true; + break; + } + Relation::Inner(measure) => { + inners.push((cluster.number, measure)); + } + Relation::Outer(measure) => { + outers.push((cluster.number, measure)); + } + Relation::Oot => { + continue; + } + } + } + if dup { + continue; + } + // Get cluster with min measure + let number = if !inners.is_empty() { + inners.iter().min_by(|a, b| a.1.total_cmp(&b.1)).unwrap().0 + } else if !outers.is_empty() { + outers.iter().min_by(|a, b| a.1.total_cmp(&b.1)).unwrap().0 + } else { + // Out of threshold + deviants.push(casrep); + continue; + }; + + // TODO: Check crashline + // Save casrep + fs::copy( + casrep, + format!( + "{}/{}", + &cluster_dirs[number - 1].display(), + &casrep.file_name().unwrap().to_str().unwrap() + ), + )?; + + // Update cluster + let i = clusters.iter().position(|a| a.number == number).unwrap(); + clusters[i].stacktraces.push(stacktrace.to_vec()); + clusters[i].diam = diam(&clusters[i].stacktraces); + } + + // Handle deviant casreps + if !deviants.is_empty() { + // Copy casrep to tmp dir + let deviant_dir = format!("{}/deviant", &oldpath.display()); + fs::create_dir_all(&deviant_dir)?; + for casrep in deviants { + fs::copy( + casrep, + format!( + "{}/{}", + &deviant_dir, + &casrep.file_name().unwrap().to_str().unwrap() + ), + )?; + } + // Cluster deviant casreps + let (result, before, after) = + make_clusters(Path::new(&deviant_dir), Some(oldpath), jobs, dedup, len)?; + } + Ok(()) +} + fn main() -> Result<()> { let matches = clap::Command::new("casr-cluster") .version(clap::crate_version!()) @@ -408,6 +518,18 @@ fn main() -> Result<()> { INPUT_DIR will be added to OUTPUT_DIR.", ), ) + .arg( + Arg::new("update") + .short('u') + .long("update") + .action(ArgAction::Set) + .num_args(2) + .value_parser(clap::value_parser!(PathBuf)) + .value_names(["NEW_DIR", "OLD_DIR"]) + .help( + "Update clusters from OLD_DIR using CASR reports from NEW_DIR.", + ), + ) .arg( Arg::new("ignore") .long("ignore") @@ -458,6 +580,7 @@ fn main() -> Result<()> { paths.get(1).map(|x| x.as_path()), jobs, dedup_crashlines, + 0, )?; println!("Number of clusters: {result}"); // Print crashline dedup summary @@ -481,6 +604,10 @@ fn main() -> Result<()> { new, paths[1].display() ); + } else if matches.contains_id("update") { + let paths: Vec<&PathBuf> = matches.get_many::("update").unwrap().collect(); + + update_clusters(paths[0], paths[1], jobs, dedup_crashlines)?; } 
Ok(()) diff --git a/casr/src/util.rs b/casr/src/util.rs index 7275db62..4217dafb 100644 --- a/casr/src/util.rs +++ b/casr/src/util.rs @@ -3,13 +3,17 @@ extern crate libcasr; use libcasr::report::CrashReport; use libcasr::stacktrace::{ - STACK_FRAME_FILEPATH_IGNORE_REGEXES, STACK_FRAME_FUNCTION_IGNORE_REGEXES, + Stacktrace, STACK_FRAME_FILEPATH_IGNORE_REGEXES, STACK_FRAME_FUNCTION_IGNORE_REGEXES, }; use anyhow::{bail, Context, Result}; use clap::ArgMatches; +use is_executable::IsExecutable; use log::{info, warn}; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; use simplelog::*; +use wait_timeout::ChildExt; + use std::collections::HashSet; use std::fs::{self, OpenOptions}; use std::io::Write; @@ -19,9 +23,6 @@ use std::process::{Command, Output, Stdio}; use std::sync::RwLock; use std::time::Duration; -use is_executable::IsExecutable; -use wait_timeout::ChildExt; - /// Call casr-san with the provided options /// /// # Arguments @@ -301,7 +302,7 @@ pub fn get_atheris_lib() -> Result { Ok(format!("{out}/asan_with_fuzzer.so")) } -/// Create output, timeout and oom directories +/// Create output, timeout and oOLDdirectories /// /// # Arguments /// @@ -411,3 +412,84 @@ pub fn get_path(tool: &str) -> Result { ); } } + +/// Get CASR reports from specified directory +/// +/// # Arguments +/// +/// * `dir` - directory path +/// +/// # Return value +/// +/// A vector of reports paths +pub fn get_reports(dir: &Path) -> Result> { + let dir = fs::read_dir(dir).with_context(|| format!("File: {}", dir.display()))?; + let casreps: Vec = dir + .map(|path| path.unwrap().path()) + .filter(|s| s.extension().is_some() && s.extension().unwrap() == "casrep") + .collect(); + Ok(casreps) +} + +/// Parse CASR reports from specified paths. +/// +/// # Arguments +/// +/// * `casreps` - a vector of report paths +/// +/// * `jobs` - number of jobs for parsing process +/// +/// # Return value +/// +/// * A vector of reports paths +/// * A vector of reports stacktraces +/// * A vector of reports crashlines +/// * A vector of bad reports +pub fn reports_from_dirs( + casreps: Vec, + jobs: usize, +) -> (Vec, Vec, Vec, Vec) { + // Get len + let len = casreps.len(); + // Start thread pool. 
+ let custom_pool = rayon::ThreadPoolBuilder::new() + .num_threads(jobs.min(len)) + .build() + .unwrap(); + // Report info from casreps: (casrep, (trace, crashline)) + let mut casrep_info: RwLock> = RwLock::new(Vec::new()); + // Casreps with stacktraces, that we cannot parse + let mut badreports: RwLock> = RwLock::new(Vec::new()); + custom_pool.install(|| { + (0..len).into_par_iter().for_each(|i| { + if let Ok(report) = report_from_file(casreps[i].as_path()) { + if let Ok(trace) = report.filtered_stacktrace() { + casrep_info + .write() + .unwrap() + .push((casreps[i].clone(), (trace, report.crashline))); + } else { + badreports.write().unwrap().push(casreps[i].clone()); + } + } else { + badreports.write().unwrap().push(casreps[i].clone()); + } + }) + }); + let casrep_info = casrep_info.get_mut().unwrap(); + let badreports = badreports.get_mut().unwrap().to_vec(); + // Sort by casrep filename + casrep_info.sort_by(|a, b| { + a.0.file_name() + .unwrap() + .to_str() + .unwrap() + .cmp(b.0.file_name().unwrap().to_str().unwrap()) + }); + + // Unzip casrep info + let (casreps, (stacktraces, crashlines)): (Vec<_>, (Vec<_>, Vec<_>)) = + casrep_info.iter().cloned().unzip(); + + (casreps, stacktraces, crashlines, badreports) +} diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index 97e4083f..abad7fe3 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -11,6 +11,7 @@ use crate::constants::{ STACK_FRAME_FUNCTION_IGNORE_REGEXES_PYTHON, STACK_FRAME_FUNCTION_IGNORE_REGEXES_RUST, }; use crate::error::*; +use core::f64::MAX; use kodama::{linkage, Method}; use regex::Regex; use std::collections::{HashMap, HashSet}; @@ -34,6 +35,43 @@ lazy_static::lazy_static! { Vec::new()); } +/// Threshold for clusters diameter +const THRESHOLD: f64 = 0.3; + +/// Relation between a CASR report and a cluster +#[derive(Clone, Debug)] +pub enum Relation { + /// The CASR report is a duplicate of one from cluster + Dup, + /// The CASR report is "inside" the cluster with some measure + Inner(f64), + /// The CASR report is "outside" the cluster with some measure + Outer(f64), + /// The CASR report is out of threshold + Oot, +} + +/// Cluster accumulation strategy +pub enum AccumStrategy { + /// Argmin diam (cluster + {new}) + Diam, + /// Argmin (diam (cluster + {new}) - diam (cluster)) + DiamDelta, + /// Argmin dist (cluster, {new}) + Dist, +} + +/// Structure provides an interface for leverages with CASR report clusters +#[derive(Clone, Debug)] +pub struct Cluster { + /// Cluster number + pub number: usize, + /// Cluster report stacktraces + pub stacktraces: Vec, + /// Cluster diameter + pub diam: f64, +} + /// This macro updates variables used to remove trusted functions from stack trace #[macro_export] macro_rules! 
init_ignored_frames { @@ -215,15 +253,12 @@ pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result> { // at the beginning every node is in its own cluster let mut clusters = (0..len).map(|x| (x, vec![x])).collect::>(); - // Set threshold - let distance = 0.3; - // Counter for new clusters, which are formed as unions of previous ones let mut counter = len; for step in dendrogram.steps() { // Break if threshold is reached - if step.dissimilarity >= distance { + if step.dissimilarity >= THRESHOLD { break; } @@ -247,7 +282,8 @@ pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result> { let mut flat_clusters = vec![0; len]; for (i, (_, nums)) in clusters.into_iter().enumerate() { for num in nums { - flat_clusters[num] = i + 1; // Number clusters from 1, not 0 + // Note: Clusters enumerate from 1, not 0 + flat_clusters[num] = i + 1; } } @@ -292,6 +328,86 @@ pub fn dedup_crashlines(crashlines: &[String], clusters: &mut [usize]) -> usize unique_cnt } +/// Get diameter of specified cluster +/// +/// # Arguments +/// +/// * `stacktraces` - cluster represented as slice of `Stacktrace` structures +/// +/// # Return value +/// +/// Value of diameter +pub fn diam(stacktraces: &[Stacktrace]) -> f64 { + let mut diam = 0f64; + let len = stacktraces.len(); + for i in 0..len { + for j in i + 1..len { + let dist = 1.0 - similarity(&stacktraces[i], &stacktraces[j]); + if dist > diam { + diam = dist; + } + } + } + diam +} + +/// Get "relation" between new report and specified cluster +/// +/// # Arguments +/// +/// * `new` - new report stacktrace +/// +/// * `stacktraces` - cluster represented as slice of `Stacktrace` structures +/// +/// * `inner_strategy` - cluster accumulation strategy if `new` is "inner" +/// +/// * `inner_strategy` - cluster accumulation strategy if `new` is "outer" +/// +/// # Return value +/// +/// `Relation` enum with measure according specified strategy +pub fn relation( + new: &Stacktrace, + cluster: &Cluster, + inner_strategy: AccumStrategy, + outer_strategy: AccumStrategy, +) -> Relation { + let diam = cluster.diam; + let mut min = MAX; + let mut max = 0f64; + for stacktrace in &cluster.stacktraces { + let dist = 1.0 - similarity(new, stacktrace); + if dist == 0.0 { + return Relation::Dup; + } else if dist > THRESHOLD { + return Relation::Oot; + } + if dist < min { + min = dist; + } + if dist > max { + max = dist; + } + } + if diam >= max { + // Inner + let rel = match inner_strategy { + // DiamDelta is a nonsensical strategy in this case + AccumStrategy::Diam => diam, + _ => min, + }; + Relation::Inner(rel) + } else { + // Outer + let rel = match outer_strategy { + AccumStrategy::Diam => max, + AccumStrategy::DiamDelta => max - diam, + AccumStrategy::Dist => min, + }; + Relation::Outer(rel) + } +} + /// Stack trace filtering trait. pub trait Filter { /// Filter frames from the stack trace that are not related to analyzed code containing crash. 
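Patch 01 above introduces `Relation`, `AccumStrategy` and `Cluster` in libcasr and drives them from `update_clusters`. The following is a minimal sketch, not part of the patch, of how that API is meant to be used: the helper name `pick_cluster` and its `Option<usize>` return convention are illustrative assumptions, while the hard-coded `Dist`/`Dist` strategies mirror what this patch passes to `relation` before the strategies become configurable later in the series.

use libcasr::stacktrace::{relation, AccumStrategy, Cluster, Relation, Stacktrace};

/// Decide which existing cluster a new stacktrace should join, mirroring the
/// insertion loop in `update_clusters`: prefer the closest "inner" cluster,
/// then the closest "outer" one; return None for duplicates and for reports
/// that are farther than THRESHOLD from every cluster (deviants).
fn pick_cluster(new: &Stacktrace, clusters: &[Cluster]) -> Option<usize> {
    let mut inners: Vec<(usize, f64)> = Vec::new(); // clusters whose diameter already covers `new`
    let mut outers: Vec<(usize, f64)> = Vec::new(); // clusters that would have to grow to accept `new`
    for cluster in clusters {
        match relation(new, cluster, AccumStrategy::Dist, AccumStrategy::Dist) {
            Relation::Dup => return None, // already represented in some cluster
            Relation::Inner(measure) => inners.push((cluster.number, measure)),
            Relation::Outer(measure) => outers.push((cluster.number, measure)),
            Relation::Oot => continue, // out of threshold for this cluster
        }
    }
    // "Inner" candidates always win over "outer" ones; within each list the
    // smallest proximity measure wins.
    let closest = |candidates: &[(usize, f64)]| {
        candidates
            .iter()
            .min_by(|a, b| a.1.total_cmp(&b.1))
            .map(|(number, _)| *number)
    };
    closest(&inners).or_else(|| closest(&outers))
}

In the tool itself the chosen cluster number is then mapped back to a `cl<N>` directory and the casrep is copied into it, while reports for which no cluster qualifies are collected as deviants and re-clustered with `make_clusters` using the existing cluster count as the numbering offset.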
From 9f9c5c21002c2034e8f3f6c84b14db97ad132d6a Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Thu, 7 Dec 2023 13:03:42 +0300 Subject: [PATCH 02/34] add logging --- casr/src/bin/casr-cluster.rs | 54 ++++++++++++++++++++++++++++-------- libcasr/src/stacktrace.rs | 4 ++- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 9d0045e6..4b1fe3f7 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -43,8 +43,8 @@ fn stacktrace(path: &Path) -> Result { /// # Return value /// /// * Number of clusters -/// * Number of valid casrep before crashiline deduplication -/// * Number of valid casrep after crashiline deduplication +/// * Number of valid casreps before crashiline deduplication +/// * Number of valid casreps after crashiline deduplication fn make_clusters( inpath: &Path, outpath: Option<&Path>, @@ -314,7 +314,20 @@ fn merge_dirs(input: &Path, output: &Path) -> Result { /// /// * `dedup` - deduplicate casrep by crashline for each cluster, if true /// -fn update_clusters(newpath: &Path, oldpath: &Path, jobs: usize, dedup: bool) -> Result<()> { +/// # Return value +/// +/// * Number casreps of added to old clusters +/// * Number of duplicates +/// * TODO: crashlines... +/// * Number of new clusters +/// * Number of valid casreps before crashiline deduplication in new clusters +/// * Number of valid casreps after crashiline deduplication in new clusters +fn update_clusters( + newpath: &Path, + oldpath: &Path, + jobs: usize, + dedup: bool, +) -> Result<(usize, usize, usize, usize, usize)> { // Get new casreps let casreps = util::get_reports(newpath)?; let (casreps, stacktraces, crashlines, _) = util::reports_from_dirs(casreps, jobs); @@ -359,7 +372,7 @@ fn update_clusters(newpath: &Path, oldpath: &Path, jobs: usize, dedup: bool) -> }); if dedup { for crashline in crashlines { - // Note: Clusters enumerate from 1, not 0 + // NOTE: Clusters enumerate from 1, not 0 unique_crashlines[i - 1].insert(crashline); } } @@ -367,7 +380,11 @@ fn update_clusters(newpath: &Path, oldpath: &Path, jobs: usize, dedup: bool) -> // Init list of casreps, which aren't suitable for any cluster let mut deviants = Vec::<&PathBuf>::new(); - + // Init added casreps counter + let mut added = 0usize; + // Init duplicates counter + let mut duplicates = 0usize; + // TODO: Init crashline duplicates counter // Try to insert each new casrep for (casrep, (stacktrace, crashline)) in casreps { // list of "inner" clusters for casrep @@ -387,6 +404,7 @@ fn update_clusters(newpath: &Path, oldpath: &Path, jobs: usize, dedup: bool) -> match relation { Relation::Dup => { dup = true; + duplicates += 1; break; } Relation::Inner(measure) => { @@ -415,6 +433,8 @@ fn update_clusters(newpath: &Path, oldpath: &Path, jobs: usize, dedup: bool) -> }; // TODO: Check crashline + + added += 1; // Save casrep fs::copy( casrep, @@ -432,7 +452,7 @@ fn update_clusters(newpath: &Path, oldpath: &Path, jobs: usize, dedup: bool) -> } // Handle deviant casreps - if !deviants.is_empty() { + let (result, before, after) = if !deviants.is_empty() { // Copy casrep to tmp dir let deviant_dir = format!("{}/deviant", &oldpath.display()); fs::create_dir_all(&deviant_dir)?; @@ -447,10 +467,11 @@ fn update_clusters(newpath: &Path, oldpath: &Path, jobs: usize, dedup: bool) -> )?; } // Cluster deviant casreps - let (result, before, after) = - make_clusters(Path::new(&deviant_dir), Some(oldpath), jobs, dedup, len)?; - } - Ok(()) + make_clusters(Path::new(&deviant_dir), 
Some(oldpath), jobs, dedup, len)? + } else { + (0, 0, 0) + }; + Ok((added, duplicates, result, before, after)) } fn main() -> Result<()> { @@ -607,7 +628,18 @@ fn main() -> Result<()> { } else if matches.contains_id("update") { let paths: Vec<&PathBuf> = matches.get_many::("update").unwrap().collect(); - update_clusters(paths[0], paths[1], jobs, dedup_crashlines)?; + let (added, duplicates, result, before, after) = + update_clusters(paths[0], paths[1], jobs, dedup_crashlines)?; + println!("Number of casreps added to old clusters: {added}"); + println!("Number of duplicates: {duplicates}"); + if result != 0 { + println!("Number of new clusters: {result}"); + } + // Print crashline dedup summary + if before != after { + println!("Number of reports before crashline deduplication in new clusters: {before}"); + println!("Number of reports after crashline deduplication in new clusters: {after}"); + } } Ok(()) diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index abad7fe3..ea05f40e 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -61,6 +61,8 @@ pub enum AccumStrategy { Dist, } +// TODO: lazy diam +// TODO: encapsulation /// Structure provides an interface for leverages with CASR report clusters #[derive(Clone, Debug)] pub struct Cluster { @@ -282,7 +284,7 @@ pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result> { let mut flat_clusters = vec![0; len]; for (i, (_, nums)) in clusters.into_iter().enumerate() { for num in nums { - // Note: Clusters enumerate from 1, not 0 + // NOTE: Clusters enumerate from 1, not 0 flat_clusters[num] = i + 1; } } From 9f31a73a19f9988604a9dd16c2c3831c23b34fc4 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Thu, 7 Dec 2023 13:15:57 +0300 Subject: [PATCH 03/34] add crashline dedup --- casr/src/bin/casr-cluster.rs | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 4b1fe3f7..75acd107 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -316,9 +316,9 @@ fn merge_dirs(input: &Path, output: &Path) -> Result { /// /// # Return value /// -/// * Number casreps of added to old clusters +/// * Number of casreps added to old clusters /// * Number of duplicates -/// * TODO: crashlines... 
+/// * Number of casreps deduplicated by crashline /// * Number of new clusters /// * Number of valid casreps before crashiline deduplication in new clusters /// * Number of valid casreps after crashiline deduplication in new clusters @@ -327,7 +327,7 @@ fn update_clusters( oldpath: &Path, jobs: usize, dedup: bool, -) -> Result<(usize, usize, usize, usize, usize)> { +) -> Result<(usize, usize, usize, usize, usize, usize)> { // Get new casreps let casreps = util::get_reports(newpath)?; let (casreps, stacktraces, crashlines, _) = util::reports_from_dirs(casreps, jobs); @@ -384,7 +384,8 @@ fn update_clusters( let mut added = 0usize; // Init duplicates counter let mut duplicates = 0usize; - // TODO: Init crashline duplicates counter + // Init crashline duplicates counter + let mut deduplicated = 0usize; // Try to insert each new casrep for (casrep, (stacktrace, crashline)) in casreps { // list of "inner" clusters for casrep @@ -432,7 +433,11 @@ fn update_clusters( continue; }; - // TODO: Check crashline + // Make crashline deduplication + if !crashline.is_empty() && !unique_crashlines[number - 1].insert(crashline.to_string()) { + deduplicated += 1; + continue; + } added += 1; // Save casrep @@ -471,7 +476,7 @@ fn update_clusters( } else { (0, 0, 0) }; - Ok((added, duplicates, result, before, after)) + Ok((added, duplicates, deduplicated, result, before, after)) } fn main() -> Result<()> { @@ -628,10 +633,13 @@ fn main() -> Result<()> { } else if matches.contains_id("update") { let paths: Vec<&PathBuf> = matches.get_many::("update").unwrap().collect(); - let (added, duplicates, result, before, after) = + let (added, duplicates, deduplicated, result, before, after) = update_clusters(paths[0], paths[1], jobs, dedup_crashlines)?; println!("Number of casreps added to old clusters: {added}"); println!("Number of duplicates: {duplicates}"); + if deduplicated != 0 { + println!("Number of casreps deduplicated by crashline"); + } if result != 0 { println!("Number of new clusters: {result}"); } From d7485c1bf054403018d4f3de81e3f838d9fed161 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Thu, 7 Dec 2023 13:49:37 +0300 Subject: [PATCH 04/34] add test --- casr/tests/tests.rs | 127 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) diff --git a/casr/tests/tests.rs b/casr/tests/tests.rs index f49de7b7..ae9c07ee 100644 --- a/casr/tests/tests.rs +++ b/casr/tests/tests.rs @@ -2649,6 +2649,133 @@ fn test_casr_cluster_d_and_m() { let _ = std::fs::remove_dir_all(&paths[1]); } +#[test] +fn test_casr_cluster_u() { + let paths = [ + abs_path("tests/casr_tests/casrep/test_clustering_small"), + abs_path("tests/tmp_tests_casr/clustering_out"), + abs_path("tests/tmp_tests_casr/clustering_out/cl9"), + ]; + + let _ = fs::remove_dir_all(&paths[1]); + + let output = Command::new(*EXE_CASR_CLUSTER.read().unwrap()) + .args(["-c", &paths[0], &paths[1]]) + .env("CASR_CLUSTER_UNIQUE_CRASHLINE", "1") + .output() + .expect("failed to start casr-cluster"); + + assert!( + output.status.success(), + "Stdout {}.\n Stderr: {}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + + let res = String::from_utf8_lossy(&output.stdout); + + assert!(!res.is_empty()); + + let re = Regex::new(r"Number of clusters: (?P\d+)").unwrap(); + let clusters_cnt = re + .captures(&res) + .unwrap() + .name("clusters") + .map(|x| x.as_str()) + .unwrap() + .parse::() + .unwrap(); + + assert_eq!(clusters_cnt, 9, "Clusters count mismatch."); + + let _ = std::fs::remove_dir_all(&paths[2]); + + let 
output = Command::new(*EXE_CASR_CLUSTER.read().unwrap()) + .args(["-u", &paths[0], &paths[1]]) + .env("CASR_CLUSTER_UNIQUE_CRASHLINE", "1") + .output() + .expect("failed to start casr-cluster"); + + assert!( + output.status.success(), + "Stdout {}.\n Stderr: {}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + + let res = String::from_utf8_lossy(&output.stdout); + + assert!(!res.is_empty()); + + let re = Regex::new(r"Number of casreps added to old clusters: (?P\d+)").unwrap(); + let added_cnt = re + .captures(&res) + .unwrap() + .name("added") + .map(|x| x.as_str()) + .unwrap() + .parse::() + .unwrap(); + + assert_eq!(added_cnt, 0, "Added count mismatch."); + + let re = Regex::new(r"Number of duplicates: (?P\d+)").unwrap(); + let duplicates_cnt = re + .captures(&res) + .unwrap() + .name("duplicates") + .map(|x| x.as_str()) + .unwrap() + .parse::() + .unwrap(); + + assert_eq!(duplicates_cnt, 9, "Duplicates count mismatch."); + + let re = Regex::new(r"Number of new clusters: (?P\d+)").unwrap(); + let clusters_cnt = re + .captures(&res) + .unwrap() + .name("clusters") + .map(|x| x.as_str()) + .unwrap() + .parse::() + .unwrap(); + + assert_eq!(clusters_cnt, 1, "Clusters count mismatch."); + + let re = Regex::new( + r"Number of reports before crashline deduplication in new clusters: (?P\d+)", + ) + .unwrap(); + let before_cnt = re + .captures(&res) + .unwrap() + .name("before") + .map(|x| x.as_str()) + .unwrap() + .parse::() + .unwrap(); + + assert_eq!(before_cnt, 2, "Before count mismatch."); + + let re = Regex::new( + r"Number of reports after crashline deduplication in new clusters: (?P\d+)", + ) + .unwrap(); + let after_cnt = re + .captures(&res) + .unwrap() + .name("after") + .map(|x| x.as_str()) + .unwrap() + .parse::() + .unwrap(); + + assert_eq!(after_cnt, 1, "After count mismatch."); + + let _ = std::fs::remove_dir_all(&paths[1]); +} + #[test] #[cfg(target_arch = "x86_64")] fn test_casr_san() { From 2f8307d12bd1e754506e507ebb98517c2d261440 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Thu, 7 Dec 2023 15:43:13 +0300 Subject: [PATCH 05/34] Add strategy options --- casr/src/bin/casr-cluster.rs | 50 +++++++++++++++++++++++++++++++----- docs/usage.md | 8 ++++++ libcasr/src/stacktrace.rs | 10 ++++---- 3 files changed, 57 insertions(+), 11 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 75acd107..49113dc3 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -327,6 +327,8 @@ fn update_clusters( oldpath: &Path, jobs: usize, dedup: bool, + inner_strategy: AccumStrategy, + outer_strategy: AccumStrategy, ) -> Result<(usize, usize, usize, usize, usize, usize)> { // Get new casreps let casreps = util::get_reports(newpath)?; @@ -399,8 +401,8 @@ fn update_clusters( let relation = relation( stacktrace, cluster, - AccumStrategy::Dist, - AccumStrategy::Dist, + inner_strategy.clone(), + outer_strategy.clone(), ); match relation { Relation::Dup => { @@ -422,7 +424,7 @@ fn update_clusters( if dup { continue; } - // Get cluster with min measure + // Get cluster with min measure, a.k.a. 
"closest" one let number = if !inners.is_empty() { inners.iter().min_by(|a, b| a.1.total_cmp(&b.1)).unwrap().0 } else if !outers.is_empty() { @@ -556,6 +558,24 @@ fn main() -> Result<()> { "Update clusters from OLD_DIR using CASR reports from NEW_DIR.", ), ) + .arg( + Arg::new("inner-strategy") + .long("inner-strategy") + .value_name("STRATEGY") + .action(ArgAction::Set) + .value_parser(["Diam", "Dist"]) + .default_value("Dist") + .help("Strategy for inner cluster choosing when updating"), + ) + .arg( + Arg::new("outer-strategy") + .long("outer-strategy") + .value_name("STRATEGY") + .action(ArgAction::Set) + .value_parser(["Delta", "Diam", "Dist"]) + .default_value("Dist") + .help("Strategy for outer cluster choosing when updating"), + ) .arg( Arg::new("ignore") .long("ignore") @@ -633,12 +653,30 @@ fn main() -> Result<()> { } else if matches.contains_id("update") { let paths: Vec<&PathBuf> = matches.get_many::("update").unwrap().collect(); - let (added, duplicates, deduplicated, result, before, after) = - update_clusters(paths[0], paths[1], jobs, dedup_crashlines)?; + let inner_strategy = matches.get_one::("inner-strategy").unwrap(); + let inner_strategy = match inner_strategy.as_str() { + "Diam" => AccumStrategy::Diam, + _ => AccumStrategy::Dist, + }; + let outer_strategy = matches.get_one::("outer-strategy").unwrap(); + let outer_strategy = match outer_strategy.as_str() { + "Delta" => AccumStrategy::Delta, + "Diam" => AccumStrategy::Diam, + _ => AccumStrategy::Dist, + }; + + let (added, duplicates, deduplicated, result, before, after) = update_clusters( + paths[0], + paths[1], + jobs, + dedup_crashlines, + inner_strategy, + outer_strategy, + )?; println!("Number of casreps added to old clusters: {added}"); println!("Number of duplicates: {duplicates}"); if deduplicated != 0 { - println!("Number of casreps deduplicated by crashline"); + println!("Number of casreps deduplicated by crashline: {deduplicated}"); } if result != 0 { println!("Number of new clusters: {result}"); diff --git a/docs/usage.md b/docs/usage.md index 2646d9c2..5451d7d2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -233,6 +233,14 @@ Tool for clustering CASR reports -m, --merge Merge INPUT_DIR into OUTPUT_DIR. Only new CASR reports from INPUT_DIR will be added to OUTPUT_DIR. + -u, --update + Update clusters from OLD_DIR using CASR reports from NEW_DIR. + --inner-strategy + Strategy for inner cluster choosing when updating [default: Dist] [possible + values: Diam, Dist] + --outer-strategy + Strategy for outer cluster choosing when updating [default: Dist] [possible + values: Delta, Diam, Dist] --ignore File with regular expressions for functions and file paths that should be ignored diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index ea05f40e..f05e4fee 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -39,7 +39,6 @@ lazy_static::lazy_static! 
{ const THRESHOLD: f64 = 0.3; /// Relation between a CASR report and a cluster -#[derive(Clone, Debug)] pub enum Relation { /// The CASR report is a duplicate of one from cluster Dup, @@ -52,11 +51,12 @@ pub enum Relation { } /// Cluster accumulation strategy +#[derive(Clone, Debug)] pub enum AccumStrategy { + /// Argmin (diam (cluster + {new}) - diam (cluster)) + Delta, /// Argmin diam (cluster + {new}) Diam, - /// Argmin (diam (cluster + {new}) - diam (cluster)) - DiamDelta, /// Argmin dist (cluster, {new}) Dist, } @@ -394,7 +394,7 @@ pub fn relation( if diam >= max { // Inner let rel = match inner_strategy { - // DiamDelta is a nonsensical strategy in this case + // Delta is a nonsensical strategy in this case AccumStrategy::Diam => diam, _ => min, }; @@ -403,7 +403,7 @@ pub fn relation( // Outer let rel = match outer_strategy { AccumStrategy::Diam => max, - AccumStrategy::DiamDelta => max - diam, + AccumStrategy::Delta => max - diam, AccumStrategy::Dist => min, }; Relation::Outer(rel) From c62951ea3ea79625ae22ae753b678e4c35095f54 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Thu, 7 Dec 2023 17:38:41 +0300 Subject: [PATCH 06/34] Add cluster estimation --- casr/src/bin/casr-cluster.rs | 77 ++++++++++++++++++++++++++++++++++++ casr/tests/tests.rs | 16 +++++++- docs/usage.md | 2 + libcasr/src/stacktrace.rs | 55 ++++++++++++++++++++++++++ 4 files changed, 149 insertions(+), 1 deletion(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 49113dc3..9cb57303 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -481,6 +481,69 @@ fn update_clusters( Ok((added, duplicates, deduplicated, result, before, after)) } +/// Calculate silhouette coefficient +/// +/// # Arguments +/// +/// * `dir` - path to directory with CASR report clusters +/// +/// * `jobs` - number of jobs for calculating process +/// +/// # Return value +/// +/// Silhouette coefficient +fn get_sil(dir: &Path, jobs: usize) -> Result { + // Get cluster dirs + let dirs: Vec = fs::read_dir(dir) + .unwrap() + .map(|path| path.unwrap().path()) + .filter(|path| { + path.clone() + .file_name() + .unwrap() + .to_str() + .unwrap() + .starts_with("cl") + }) + .collect(); + + if dirs.len() < 2 { + bail!("{} valid cluster, nothing to calculate...", dirs.len()); + } + + // Init clusters vector + let mut clusters: Vec> = Vec::new(); + // Init casreps nuber counter + let mut size = 0usize; + // Get casreps from each cluster + for dir in &dirs { + // Get casreps from cluster + let casreps = util::get_reports(dir)?; + // Get stacktraces from cluster + let (_, stacktraces, _, _) = util::reports_from_dirs(casreps, jobs); + // Update size + size += stacktraces.len(); + // Add stacktraces + clusters.push(stacktraces); + } + // Init sil sum + let mut sum = 0f64; + // Calculate silhouette coefficient for each casrep + for i in 0..clusters.len() - 1 { + for num in 0..clusters[i].len() - 1 { + let sil = if clusters[i].len() != 1 { + let a = get_subcoef_a(num, &clusters[i]); + let b = get_subcoef_b(num, i, &clusters); + (b - a) / a.max(b) + } else { + 0f64 + }; + sum += sil; + } + } + Ok(sum / size as f64) +} + fn main() -> Result<()> { let matches = clap::Command::new("casr-cluster") .version(clap::crate_version!()) @@ -576,6 +639,14 @@ fn main() -> Result<()> { .default_value("Dist") .help("Strategy for outer cluster choosing when updating"), ) + .arg( + Arg::new("estimation") + .long("estimation") + .value_name("DIR") + .action(ArgAction::Set) + .value_parser(clap::value_parser!(PathBuf)) + .help("Make 
cluster estimation for DIR using silhouette index"), + ) .arg( Arg::new("ignore") .long("ignore") @@ -686,6 +757,12 @@ fn main() -> Result<()> { println!("Number of reports before crashline deduplication in new clusters: {before}"); println!("Number of reports after crashline deduplication in new clusters: {after}"); } + let sil = get_sil(paths[1], jobs)?; + println!("Cluster silhouette index: {sil}"); + } else if matches.contains_id("estimation") { + let path: &PathBuf = matches.get_one::("estimation").unwrap(); + let sil = get_sil(path, jobs)?; + println!("Cluster silhouette index: {sil}"); } Ok(()) diff --git a/casr/tests/tests.rs b/casr/tests/tests.rs index ae9c07ee..0b0924ec 100644 --- a/casr/tests/tests.rs +++ b/casr/tests/tests.rs @@ -2654,6 +2654,7 @@ fn test_casr_cluster_u() { let paths = [ abs_path("tests/casr_tests/casrep/test_clustering_small"), abs_path("tests/tmp_tests_casr/clustering_out"), + abs_path("tests/tmp_tests_casr/clustering_out/cl8/20.casrep"), abs_path("tests/tmp_tests_casr/clustering_out/cl9"), ]; @@ -2688,7 +2689,8 @@ fn test_casr_cluster_u() { assert_eq!(clusters_cnt, 9, "Clusters count mismatch."); - let _ = std::fs::remove_dir_all(&paths[2]); + let _ = std::fs::remove_file(&paths[2]); + let _ = std::fs::remove_dir_all(&paths[3]); let output = Command::new(*EXE_CASR_CLUSTER.read().unwrap()) .args(["-u", &paths[0], &paths[1]]) @@ -2773,6 +2775,18 @@ fn test_casr_cluster_u() { assert_eq!(after_cnt, 1, "After count mismatch."); + let re = Regex::new(r"Cluster silhouette index: (?P\d+)").unwrap(); + let sil = re + .captures(&res) + .unwrap() + .name("sil") + .map(|x| x.as_str()) + .unwrap() + .parse::() + .unwrap(); + + assert_eq!(sil, 0, "Silhouette index mismatch."); + let _ = std::fs::remove_dir_all(&paths[1]); } diff --git a/docs/usage.md b/docs/usage.md index 5451d7d2..00764f9f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -241,6 +241,8 @@ Tool for clustering CASR reports --outer-strategy Strategy for outer cluster choosing when updating [default: Dist] [possible values: Delta, Diam, Dist] + --estimation + Make cluster estimation for DIR using silhouette index --ignore File with regular expressions for functions and file paths that should be ignored diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index f05e4fee..45f329fe 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -410,6 +410,61 @@ pub fn relation( } } +/// Get "a" subcoefficient silhouette coefficient calculating for given stacktrace +/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition +/// +/// # Arguments +/// +/// * `num` - given stacktrace number +/// +/// * `stacktraces` - cluster represented as slice of `Stacktrace` structures +/// +/// # Return value +/// +/// "a" subcoefficient silhouette coefficient +pub fn get_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 { + let mut sum = 0f64; + for i in 0..stacktraces.len() - 1 { + if i == num { + continue; + } + sum += 1.0 - similarity(&stacktraces[num], &stacktraces[i]); + } + sum / (stacktraces.len() - 1) as f64 +} + +/// Get "b" subcoefficient silhouette coefficient calculating for given stacktrace +/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition +/// +/// # Arguments +/// +/// * `num` - given stacktrace number +/// +/// * `cl` - cluster number of given stacktrace +/// +/// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures +/// +/// # Return value +/// +/// "b" subcoefficient silhouette coefficient +pub 
fn get_subcoef_b(num: usize, cl: usize, clusters: &[Vec]) -> f64 { + let mut min = MAX; + for j in 0..clusters.len() - 1 { + if j == cl { + continue; + } + let mut sum = 0f64; + for i in 0..clusters[j].len() - 1 { + sum += 1.0 - similarity(&clusters[cl][num], &clusters[j][i]); + } + let res = sum / clusters[j].len() as f64; + if res < min { + min = res; + } + } + min +} + /// Stack trace filtering trait. pub trait Filter { /// Filter frames from the stack trace that are not related to analyzed code containing crash. From bfca1ac3487bfaa14d987fe99157bde719c7b4b4 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Thu, 7 Dec 2023 18:12:19 +0300 Subject: [PATCH 07/34] lazy diam calc --- casr/src/bin/casr-cluster.rs | 13 +++-------- casr/src/util.rs | 2 +- libcasr/src/stacktrace.rs | 44 ++++++++++++++++++++++++++++-------- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 9cb57303..60307802 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -366,12 +366,7 @@ fn update_clusters( let casreps = util::get_reports(cluster)?; let (_, stacktraces, crashlines, _) = util::reports_from_dirs(casreps, jobs); // Fill cluster info structures - let diam = diam(&stacktraces); - clusters.push(Cluster { - number: i, - stacktraces, - diam, - }); + clusters.push(Cluster::new(i, stacktraces)); if dedup { for crashline in crashlines { // NOTE: Clusters enumerate from 1, not 0 @@ -396,8 +391,7 @@ fn update_clusters( let mut outers: Vec<(usize, f64)> = Vec::new(); // Checker if casrep is duplicate of someone else let mut dup = false; - for cluster in &clusters { - // TODO: Add strategy options + for cluster in &mut clusters { let relation = relation( stacktrace, cluster, @@ -454,8 +448,7 @@ fn update_clusters( // Update cluster let i = clusters.iter().position(|a| a.number == number).unwrap(); - clusters[i].stacktraces.push(stacktrace.to_vec()); - clusters[i].diam = diam(&clusters[i].stacktraces); + clusters[i].push(stacktrace.to_vec()); } // Handle deviant casreps diff --git a/casr/src/util.rs b/casr/src/util.rs index 4217dafb..72cc56c9 100644 --- a/casr/src/util.rs +++ b/casr/src/util.rs @@ -302,7 +302,7 @@ pub fn get_atheris_lib() -> Result { Ok(format!("{out}/asan_with_fuzzer.so")) } -/// Create output, timeout and oOLDdirectories +/// Create output, timeout and oom directories /// /// # Arguments /// diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index 45f329fe..90031367 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -61,17 +61,43 @@ pub enum AccumStrategy { Dist, } -// TODO: lazy diam -// TODO: encapsulation /// Structure provides an interface for leverages with CASR report clusters #[derive(Clone, Debug)] pub struct Cluster { /// Cluster number pub number: usize, /// Cluster report stacktraces - pub stacktraces: Vec, + stacktraces: Vec, /// Cluster diameter - pub diam: f64, + diam: Option, +} + +impl Cluster { + /// Create new `Cluster` + pub fn new(number: usize, stacktraces: Vec) -> Self { + Cluster { + number, + stacktraces, + diam: None, + } + } + /// Get CASR report stactraces + pub fn stacktraces(&self) -> Vec { + self.stacktraces.clone() + } + /// Add CASR report stacktrace to cluster + pub fn push(&mut self, stacktrace: Stacktrace) { + self.stacktraces.push(stacktrace); + self.diam = None; + } + /// Get cluster diameter + pub fn diam(&mut self) -> f64 { + if self.diam.is_none() { + diam(&self.stacktraces) + } else { + self.diam.unwrap() + } + } } /// This 
macro updates variables used to remove trusted functions from stack trace @@ -339,7 +365,7 @@ pub fn dedup_crashlines(crashlines: &[String], clusters: &mut [usize]) -> usize /// # Return value /// /// Value of diameter -pub fn diam(stacktraces: &[Stacktrace]) -> f64 { +fn diam(stacktraces: &[Stacktrace]) -> f64 { let mut diam = 0f64; let len = stacktraces.len(); for i in 0..len { @@ -370,15 +396,15 @@ pub fn diam(stacktraces: &[Stacktrace]) -> f64 { /// `Relation` enum with measure according specified strategy pub fn relation( new: &Stacktrace, - cluster: &Cluster, + cluster: &mut Cluster, inner_strategy: AccumStrategy, outer_strategy: AccumStrategy, ) -> Relation { - let diam = cluster.diam; + let diam = cluster.diam(); let mut min = MAX; let mut max = 0f64; - for stacktrace in &cluster.stacktraces { - let dist = 1.0 - similarity(new, stacktrace); + for stacktrace in cluster.stacktraces() { + let dist = 1.0 - similarity(new, &stacktrace); if dist == 0.0 { return Relation::Dup; } else if dist > THRESHOLD { From a5b58a8db562f2a8f54ebf227d7280e50b627072 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Fri, 8 Dec 2023 15:02:31 +0300 Subject: [PATCH 08/34] Make sil func --- casr/src/bin/casr-cluster.rs | 14 +++++--------- libcasr/src/stacktrace.rs | 33 ++++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 60307802..37c687b4 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -430,7 +430,10 @@ fn update_clusters( }; // Make crashline deduplication - if !crashline.is_empty() && !unique_crashlines[number - 1].insert(crashline.to_string()) { + if dedup + && !crashline.is_empty() + && !unique_crashlines[number - 1].insert(crashline.to_string()) + { deduplicated += 1; continue; } @@ -524,14 +527,7 @@ fn get_sil(dir: &Path, jobs: usize) -> Result { // Calculate silhouette coefficient for each casrep for i in 0..clusters.len() - 1 { for num in 0..clusters[i].len() - 1 { - let sil = if clusters[i].len() != 1 { - let a = get_subcoef_a(num, &clusters[i]); - let b = get_subcoef_b(num, i, &clusters); - (b - a) / a.max(b) - } else { - 0f64 - }; - sum += sil; + sum += sil_coef(num, i, &clusters); } } Ok(sum / size as f64) diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index 90031367..2333801a 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -93,10 +93,9 @@ impl Cluster { /// Get cluster diameter pub fn diam(&mut self) -> f64 { if self.diam.is_none() { - diam(&self.stacktraces) - } else { - self.diam.unwrap() + self.diam = Some(diam(&self.stacktraces)); } + self.diam.unwrap() } } @@ -448,7 +447,7 @@ pub fn relation( /// # Return value /// /// "a" subcoefficient silhouette coefficient -pub fn get_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 { +fn sil_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 { let mut sum = 0f64; for i in 0..stacktraces.len() - 1 { if i == num { @@ -473,7 +472,7 @@ pub fn get_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 { /// # Return value /// /// "b" subcoefficient silhouette coefficient -pub fn get_subcoef_b(num: usize, cl: usize, clusters: &[Vec]) -> f64 { +fn sil_subcoef_b(num: usize, cl: usize, clusters: &[Vec]) -> f64 { let mut min = MAX; for j in 0..clusters.len() - 1 { if j == cl { @@ -491,6 +490,30 @@ pub fn get_subcoef_b(num: usize, cl: usize, clusters: &[Vec]) -> f64 min } +/// Get silhouette coefficient calculating for given stacktrace +/// Read more: 
https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition +/// +/// # Arguments +/// +/// * `num` - given stacktrace number +/// +/// * `i` - cluster number of given stacktrace +/// +/// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures +/// +/// # Return value +/// +/// "b" subcoefficient silhouette coefficient +pub fn sil_coef(num: usize, i: usize, clusters: &[Vec]) -> f64 { + if clusters[i].len() != 1 { + let a = sil_subcoef_a(num, &clusters[i]); + let b = sil_subcoef_b(num, i, clusters); + (b - a) / a.max(b) + } else { + 0f64 + } +} + /// Stack trace filtering trait. pub trait Filter { /// Filter frames from the stack trace that are not related to analyzed code containing crash. From 009863245111e06b1a779df84d93fda7630cf9ba Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Fri, 8 Dec 2023 20:46:50 +0300 Subject: [PATCH 09/34] Fix sil --- casr/src/bin/casr-cluster.rs | 36 ++++---- .../casrep/test_clustering_small/40.casrep | 87 +++++++++++++++++++ casr/tests/tests.rs | 52 +++++++++-- docs/usage.md | 2 +- libcasr/src/stacktrace.rs | 13 ++- 5 files changed, 156 insertions(+), 34 deletions(-) create mode 100644 casr/tests/casr_tests/casrep/test_clustering_small/40.casrep diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 37c687b4..f9608c37 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -338,7 +338,7 @@ fn update_clusters( .zip(stacktraces.iter().zip(crashlines.iter())); // Get casreps from existing clusters - let cluster_dirs: Vec = fs::read_dir(oldpath) + let mut cluster_dirs: Vec = fs::read_dir(oldpath) .unwrap() .map(|path| path.unwrap().path()) .filter(|path| { @@ -350,6 +350,7 @@ fn update_clusters( .starts_with("cl") }) .collect(); + cluster_dirs.sort(); let len = cluster_dirs.len(); // Init clusters vector let mut clusters: Vec = Vec::new(); @@ -415,11 +416,10 @@ fn update_clusters( } } } - if dup { - continue; - } // Get cluster with min measure, a.k.a. 
"closest" one - let number = if !inners.is_empty() { + let number = if dup { + continue; + } else if !inners.is_empty() { inners.iter().min_by(|a, b| a.1.total_cmp(&b.1)).unwrap().0 } else if !outers.is_empty() { outers.iter().min_by(|a, b| a.1.total_cmp(&b.1)).unwrap().0 @@ -438,8 +438,8 @@ fn update_clusters( continue; } - added += 1; // Save casrep + added += 1; fs::copy( casrep, format!( @@ -488,9 +488,9 @@ fn update_clusters( /// # Return value /// /// Silhouette coefficient -fn get_sil(dir: &Path, jobs: usize) -> Result { +fn avg_sil(dir: &Path, jobs: usize) -> Result { // Get cluster dirs - let dirs: Vec = fs::read_dir(dir) + let mut dirs: Vec = fs::read_dir(dir) .unwrap() .map(|path| path.unwrap().path()) .filter(|path| { @@ -502,6 +502,7 @@ fn get_sil(dir: &Path, jobs: usize) -> Result { .starts_with("cl") }) .collect(); + dirs.sort(); if dirs.len() < 2 { bail!("{} valid cluster, nothing to calculate...", dirs.len()); @@ -525,9 +526,10 @@ fn get_sil(dir: &Path, jobs: usize) -> Result { // Init sil sum let mut sum = 0f64; // Calculate silhouette coefficient for each casrep - for i in 0..clusters.len() - 1 { - for num in 0..clusters[i].len() - 1 { - sum += sil_coef(num, i, &clusters); + for i in 0..clusters.len() { + for num in 0..clusters[i].len() { + let sil = sil_coef(num, i, &clusters); + sum += sil; } } Ok(sum / size as f64) @@ -629,8 +631,8 @@ fn main() -> Result<()> { .help("Strategy for outer cluster choosing when updating"), ) .arg( - Arg::new("estimation") - .long("estimation") + Arg::new("estimate") + .long("estimate") .value_name("DIR") .action(ArgAction::Set) .value_parser(clap::value_parser!(PathBuf)) @@ -746,11 +748,11 @@ fn main() -> Result<()> { println!("Number of reports before crashline deduplication in new clusters: {before}"); println!("Number of reports after crashline deduplication in new clusters: {after}"); } - let sil = get_sil(paths[1], jobs)?; + let sil = avg_sil(paths[1], jobs)?; println!("Cluster silhouette index: {sil}"); - } else if matches.contains_id("estimation") { - let path: &PathBuf = matches.get_one::("estimation").unwrap(); - let sil = get_sil(path, jobs)?; + } else if matches.contains_id("estimate") { + let path: &PathBuf = matches.get_one::("estimate").unwrap(); + let sil = avg_sil(path, jobs)?; println!("Cluster silhouette index: {sil}"); } diff --git a/casr/tests/casr_tests/casrep/test_clustering_small/40.casrep b/casr/tests/casr_tests/casrep/test_clustering_small/40.casrep new file mode 100644 index 00000000..ea43e532 --- /dev/null +++ b/casr/tests/casr_tests/casrep/test_clustering_small/40.casrep @@ -0,0 +1,87 @@ +{ + "Date": "2021-07-14T19:56:09.276635+03:00", + "Uname": "Linux titanfall 5.8.0-59-generic #66~20.04.1-Ubuntu SMP Thu Jun 17 11:14:10 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux", + "OS": "Ubuntu", + "OSRelease": "20.04", + "Architecture": "amd64", + "ExecutablePath": "/usr/local/bin/tiff2pdf", + "ProcCmdline": "tiff2pdf ./fuz3tiff2pdf/main/crashes/id:000009,sig:06,src:000040+000049,time:43718,op:splice,rep:4", + "ProcMaps": [ + " 0x555555554000 0x555555556000 0x2000 0x0 /usr/local/bin/tiff2pdf", + " 0x555555556000 0x555555561000 0xb000 0x2000 /usr/local/bin/tiff2pdf", + " 0x555555561000 0x555555565000 0x4000 0xd000 /usr/local/bin/tiff2pdf", + " 0x555555565000 0x555555566000 0x1000 0x10000 /usr/local/bin/tiff2pdf", + " 0x555555566000 0x555555567000 0x1000 0x11000 /usr/local/bin/tiff2pdf", + " 0x555555567000 0x555555588000 0x21000 0x0 [heap]", + " 0x7ffff7945000 0x7ffff7949000 0x4000 0x0 ", + " 0x7ffff7949000 0x7ffff7958000 
0xf000 0x0 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff7958000 0x7ffff79ff000 0xa7000 0xf000 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff79ff000 0x7ffff7a96000 0x97000 0xb6000 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff7a96000 0x7ffff7a97000 0x1000 0x14c000 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff7a97000 0x7ffff7a98000 0x1000 0x14d000 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff7a98000 0x7ffff7a9a000 0x2000 0x0 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7a9a000 0x7ffff7aab000 0x11000 0x2000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7aab000 0x7ffff7ab1000 0x6000 0x13000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7ab1000 0x7ffff7ab2000 0x1000 0x19000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7ab2000 0x7ffff7ab3000 0x1000 0x19000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7ab3000 0x7ffff7ab4000 0x1000 0x1a000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7ab4000 0x7ffff7ab8000 0x4000 0x0 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7ab8000 0x7ffff7afc000 0x44000 0x4000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7afc000 0x7ffff7b36000 0x3a000 0x48000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7b36000 0x7ffff7b37000 0x1000 0x82000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7b37000 0x7ffff7b38000 0x1000 0x82000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7b38000 0x7ffff7b39000 0x1000 0x83000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7b39000 0x7ffff7b44000 0xb000 0x0 /usr/lib/x86_64-linux-gnu/libjbig.so.0", + " 0x7ffff7b44000 0x7ffff7d43000 0x1ff000 0xb000 /usr/lib/x86_64-linux-gnu/libjbig.so.0", + " 0x7ffff7d43000 0x7ffff7d44000 0x1000 0xa000 /usr/lib/x86_64-linux-gnu/libjbig.so.0", + " 0x7ffff7d44000 0x7ffff7d47000 0x3000 0xb000 /usr/lib/x86_64-linux-gnu/libjbig.so.0", + " 0x7ffff7d47000 0x7ffff7d6c000 0x25000 0x0 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7d6c000 0x7ffff7ee4000 0x178000 0x25000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7ee4000 0x7ffff7f2e000 0x4a000 0x19d000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7f2e000 0x7ffff7f2f000 0x1000 0x1e7000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7f2f000 0x7ffff7f32000 0x3000 0x1e7000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7f32000 0x7ffff7f35000 0x3000 0x1ea000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7f35000 0x7ffff7f39000 0x4000 0x0 ", + " 0x7ffff7f39000 0x7ffff7f41000 0x8000 0x0 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7f41000 0x7ffff7f76000 0x35000 0x8000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7f76000 0x7ffff7f9f000 0x29000 0x3d000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7f9f000 0x7ffff7fa0000 0x1000 0x66000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7fa0000 0x7ffff7fa2000 0x2000 0x66000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7fa2000 0x7ffff7fa3000 0x1000 0x68000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7fa3000 0x7ffff7fa5000 0x2000 0x0 ", + " 0x7ffff7fc8000 0x7ffff7fc9000 0x1000 0x0 ", + " 0x7ffff7fc9000 0x7ffff7fcd000 0x4000 0x0 [vvar]", + " 0x7ffff7fcd000 0x7ffff7fcf000 0x2000 0x0 [vdso]", + " 0x7ffff7fcf000 0x7ffff7fd0000 0x1000 0x0 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7fd0000 0x7ffff7ff3000 0x23000 0x1000 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7ff3000 0x7ffff7ffb000 0x8000 0x24000 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7ffb000 0x7ffff7ffc000 0x1000 0x0 
/home/avgor46/testdoc/fuz3tiff2pdf/main/crashes/id:000009,sig:06,src:000040+000049,time:43718,op:splice,rep:4", + " 0x7ffff7ffc000 0x7ffff7ffd000 0x1000 0x2c000 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7ffd000 0x7ffff7ffe000 0x1000 0x2d000 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7ffe000 0x7ffff7fff000 0x1000 0x0 ", + " 0x7ffffffde000 0x7ffffffff000 0x21000 0x0 [stack]", + " 0xffffffffff600000 0xffffffffff601000 0x1000 0x0 [vsyscall]" + ], + "CrashSeverity": { + "Type": "NOT_CRITICAL", + "ShortDescription": "SafeFunctionCheck", + "Description": "Buffer overflow in safe function", + "Explanation": "The target stopped while handling a signal that was generated by libc due to detection of buffer overflow in safe copy function." + }, + "Stacktrace": [ + "#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50", + "#1 0x00007ffff7d6c859 in __GI_abort () at abort.c:79", + "#2 0x00007ffff7dd73ee in __libc_message (action=action@entry=do_abort, fmt=fmt@entry=0x7ffff7f0107c \"*** %s ***: terminated\\n\") at ../sysdeps/posix/libc_fatal.c:155", + "#3 0x00007ffff7e79b4a in __GI___fortify_fail (msg=msg@entry=0x7ffff7f01012 \"buffer overflow detected\") at fortify_fail.c:26", + "#4 0x00007ffff7e783e6 in __GI___chk_fail () at chk_fail.c:28", + "#5 0x00007ffff7dcf1cf in _IO_str_chk_overflow (fp=, c=) at iovsprintf.c:35", + "#6 0x00007ffff7da7db0 in __GI___printf_fp_l (fp=, loc=, info=, args=) at printf_fp.c:1246", + "#7 0x00007ffff7dc163a in __vfprintf_internal (s=s@entry=0x7fffffffe070, format=format@entry=0x5555555613df \"%.4f\", ap=ap@entry=0x7fffffffe1b0, mode_flags=mode_flags@entry=6) at vfprintf-internal.c:1687", + "#8 0x00007ffff7dcf279 in __vsprintf_internal (string=0x7fffffffe2a0 \"79725330432.000\", maxlen=, format=0x5555555613df \"%.4f\", args=args@entry=0x7fffffffe1b0, mode_flags=6) at iovsprintf.c:95", + "#9 0x00007ffff7e77edb in ___sprintf_chk (s=, flag=, slen=, format=) at sprintf_chk.c:40", + "#10 0x000055555555c7a1 in sprintf (__fmt=0x5555555613df \"%.4f\", __s=0x7fffffffe2a0 \"79725330432.000\") at /usr/include/x86_64-linux-gnu/bits/stdio2.h:36", + "#12 0x00005555555601b8 in t2p_write_pdf (output=0x555555568f80, input=0x555555567ea0, t2p=0x5555555672a0) at tiff2pdf.c:5175", + "#13 t2p_write_pdf (t2p=0x5555555672a0, input=0x555555567ea0, output=0x555555568f80) at tiff2pdf.c:5133", + "#14 0x00005555555568d4 in main (argc=, argv=) at tiff2pdf.c:763" + ] +} diff --git a/casr/tests/tests.rs b/casr/tests/tests.rs index 0b0924ec..2ec42ab1 100644 --- a/casr/tests/tests.rs +++ b/casr/tests/tests.rs @@ -2425,7 +2425,7 @@ fn test_casr_cluster_c() { .parse::() .unwrap(); - assert_eq!(before_cnt, 11, "Before count mismatch."); + assert_eq!(before_cnt, 12, "Before count mismatch."); let re = Regex::new(r"Number of reports after crashline deduplication: (?P\d+)").unwrap(); @@ -2438,15 +2438,16 @@ fn test_casr_cluster_c() { .parse::() .unwrap(); - assert_eq!(after_cnt, 10, "After count mismatch."); + assert_eq!(after_cnt, 11, "After count mismatch."); // 2.casrep and 20.caserp without crashlines => no dedup // 3.casrep and 30.caserp with crashlines => dedup - // Thus, cluster (cl8) with 2.casrep has 2 casreps and others have 1 casrep + // Thus, cluster (cl7) with 2.casrep has 2 casreps and cl9 too + // But others have 1 casrep for i in 1..clusters_cnt + 1 { let cluster_path = paths[1].to_owned() + "/cl" + &i.to_string(); let size = std::fs::read_dir(cluster_path.clone()).unwrap().count(); - let num = if i == 8 { 2 } else { 1 }; + let num = if i == 7 || i == 9 { 2 } 
else { 1 }; assert_eq!(size, num); } @@ -2654,8 +2655,10 @@ fn test_casr_cluster_u() { let paths = [ abs_path("tests/casr_tests/casrep/test_clustering_small"), abs_path("tests/tmp_tests_casr/clustering_out"), - abs_path("tests/tmp_tests_casr/clustering_out/cl8/20.casrep"), + abs_path("tests/tmp_tests_casr/clustering_out/cl7/20.casrep"), + abs_path("tests/tmp_tests_casr/clustering_out/cl8"), abs_path("tests/tmp_tests_casr/clustering_out/cl9"), + abs_path("tests/tmp_tests_casr/clustering_out/cl9/40.casrep"), ]; let _ = fs::remove_dir_all(&paths[1]); @@ -2690,7 +2693,9 @@ fn test_casr_cluster_u() { assert_eq!(clusters_cnt, 9, "Clusters count mismatch."); let _ = std::fs::remove_file(&paths[2]); + let _ = std::fs::remove_file(&paths[5]); let _ = std::fs::remove_dir_all(&paths[3]); + let _ = std::fs::rename(&paths[4], &paths[3]); let output = Command::new(*EXE_CASR_CLUSTER.read().unwrap()) .args(["-u", &paths[0], &paths[1]]) @@ -2719,7 +2724,7 @@ fn test_casr_cluster_u() { .parse::() .unwrap(); - assert_eq!(added_cnt, 0, "Added count mismatch."); + assert_eq!(added_cnt, 1, "Added count mismatch."); let re = Regex::new(r"Number of duplicates: (?P\d+)").unwrap(); let duplicates_cnt = re @@ -2775,17 +2780,46 @@ fn test_casr_cluster_u() { assert_eq!(after_cnt, 1, "After count mismatch."); - let re = Regex::new(r"Cluster silhouette index: (?P\d+)").unwrap(); + let re = Regex::new(r"Cluster silhouette index: (?P\d+.\d+)").unwrap(); let sil = re .captures(&res) .unwrap() .name("sil") .map(|x| x.as_str()) .unwrap() - .parse::() + .parse::() + .unwrap(); + + assert_eq!(sil, 0.15436556855344655, "Silhouette index mismatch."); + + // Test estimation + let output = Command::new(*EXE_CASR_CLUSTER.read().unwrap()) + .args(["--estimate", &paths[1]]) + .output() + .expect("failed to start casr-cluster"); + + assert!( + output.status.success(), + "Stdout {}.\n Stderr: {}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + + let res = String::from_utf8_lossy(&output.stdout); + + assert!(!res.is_empty()); + + let re = Regex::new(r"Cluster silhouette index: (?P\d+.\d+)").unwrap(); + let sil = re + .captures(&res) + .unwrap() + .name("sil") + .map(|x| x.as_str()) + .unwrap() + .parse::() .unwrap(); - assert_eq!(sil, 0, "Silhouette index mismatch."); + assert_eq!(sil, 0.15436556855344655, "Silhouette index mismatch."); let _ = std::fs::remove_dir_all(&paths[1]); } diff --git a/docs/usage.md b/docs/usage.md index 00764f9f..18b8b592 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -241,7 +241,7 @@ Tool for clustering CASR reports --outer-strategy Strategy for outer cluster choosing when updating [default: Dist] [possible values: Delta, Diam, Dist] - --estimation + --estimate Make cluster estimation for DIR using silhouette index --ignore File with regular expressions for functions and file paths that should be diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index 2333801a..b7a421af 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -42,9 +42,9 @@ const THRESHOLD: f64 = 0.3; pub enum Relation { /// The CASR report is a duplicate of one from cluster Dup, - /// The CASR report is "inside" the cluster with some measure + /// The CASR report is "inside" the cluster with some proximity measure Inner(f64), - /// The CASR report is "outside" the cluster with some measure + /// The CASR report is "outside" the cluster with some proximity measure Outer(f64), /// The CASR report is out of threshold Oot, @@ -62,7 +62,6 @@ pub enum AccumStrategy { 
} /// Structure provides an interface for leverages with CASR report clusters -#[derive(Clone, Debug)] pub struct Cluster { /// Cluster number pub number: usize, @@ -392,7 +391,7 @@ fn diam(stacktraces: &[Stacktrace]) -> f64 { /// /// # Return value /// -/// `Relation` enum with measure according specified strategy +/// `Relation` enum with proximity measure according specified strategy pub fn relation( new: &Stacktrace, cluster: &mut Cluster, @@ -449,7 +448,7 @@ pub fn relation( /// "a" subcoefficient silhouette coefficient fn sil_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 { let mut sum = 0f64; - for i in 0..stacktraces.len() - 1 { + for i in 0..stacktraces.len() { if i == num { continue; } @@ -474,12 +473,12 @@ fn sil_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 { /// "b" subcoefficient silhouette coefficient fn sil_subcoef_b(num: usize, cl: usize, clusters: &[Vec]) -> f64 { let mut min = MAX; - for j in 0..clusters.len() - 1 { + for j in 0..clusters.len() { if j == cl { continue; } let mut sum = 0f64; - for i in 0..clusters[j].len() - 1 { + for i in 0..clusters[j].len() { sum += 1.0 - similarity(&clusters[cl][num], &clusters[j][i]); } let res = sum / clusters[j].len() as f64; From 576c61bf8746cb1dab9e2704c12c57d5fbeca49e Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Wed, 13 Dec 2023 14:13:11 +0300 Subject: [PATCH 10/34] Fixes --- casr/src/bin/casr-cluster.rs | 38 +++++------ casr/src/util.rs | 2 +- casr/tests/tests.rs | 8 +-- docs/usage.md | 23 ++++++- libcasr/src/stacktrace.rs | 119 +++++++++++++++++------------------ 5 files changed, 104 insertions(+), 86 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index f9608c37..a3f96e4c 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -302,7 +302,7 @@ fn merge_dirs(input: &Path, output: &Path) -> Result { Ok(new) } -/// Perform the clustering of casreps +/// Add new reports to existing clustering structure /// /// # Arguments /// @@ -314,6 +314,10 @@ fn merge_dirs(input: &Path, output: &Path) -> Result { /// /// * `dedup` - deduplicate casrep by crashline for each cluster, if true /// +/// * `inner_strategy` - strategy for "inner" report case +/// +/// * `outer_strategy` - strategy for "outer" report case +/// /// # Return value /// /// * Number of casreps added to old clusters @@ -359,20 +363,20 @@ fn update_clusters( // Get casreps from each existing cluster for cluster in &cluster_dirs { // Get cluster number - let i = cluster.clone().file_name().unwrap().to_str().unwrap()[2..] + let Ok(i) = cluster.clone().file_name().unwrap().to_str().unwrap()[2..] 
.to_string() .parse::() - .unwrap(); + else { + continue; + }; // Get casreps from cluster let casreps = util::get_reports(cluster)?; let (_, stacktraces, crashlines, _) = util::reports_from_dirs(casreps, jobs); // Fill cluster info structures clusters.push(Cluster::new(i, stacktraces)); if dedup { - for crashline in crashlines { - // NOTE: Clusters enumerate from 1, not 0 - unique_crashlines[i - 1].insert(crashline); - } + // NOTE: Clusters enumerate from 1, not 0 + unique_crashlines[i - 1].extend(crashlines); } } @@ -393,12 +397,7 @@ fn update_clusters( // Checker if casrep is duplicate of someone else let mut dup = false; for cluster in &mut clusters { - let relation = relation( - stacktrace, - cluster, - inner_strategy.clone(), - outer_strategy.clone(), - ); + let relation = cluster.relation(stacktrace, inner_strategy, outer_strategy); match relation { Relation::Dup => { dup = true; @@ -470,7 +469,10 @@ fn update_clusters( )?; } // Cluster deviant casreps - make_clusters(Path::new(&deviant_dir), Some(oldpath), jobs, dedup, len)? + let (result, before, after) = + make_clusters(Path::new(&deviant_dir), Some(oldpath), jobs, dedup, len)?; + let _ = fs::remove_dir_all(&deviant_dir); + (result, before, after) } else { (0, 0, 0) }; @@ -609,7 +611,7 @@ fn main() -> Result<()> { .value_parser(clap::value_parser!(PathBuf)) .value_names(["NEW_DIR", "OLD_DIR"]) .help( - "Update clusters from OLD_DIR using CASR reports from NEW_DIR.", + "Update clusters from OLD_DIR using CASR reports from NEW_DIR", ), ) .arg( @@ -636,7 +638,7 @@ fn main() -> Result<()> { .value_name("DIR") .action(ArgAction::Set) .value_parser(clap::value_parser!(PathBuf)) - .help("Make cluster estimation for DIR using silhouette index"), + .help("Calculate silhouette score for clustering results"), ) .arg( Arg::new("ignore") @@ -749,11 +751,11 @@ fn main() -> Result<()> { println!("Number of reports after crashline deduplication in new clusters: {after}"); } let sil = avg_sil(paths[1], jobs)?; - println!("Cluster silhouette index: {sil}"); + println!("Cluster silhouette score: {sil}"); } else if matches.contains_id("estimate") { let path: &PathBuf = matches.get_one::("estimate").unwrap(); let sil = avg_sil(path, jobs)?; - println!("Cluster silhouette index: {sil}"); + println!("Cluster silhouette score: {sil}"); } Ok(()) diff --git a/casr/src/util.rs b/casr/src/util.rs index 72cc56c9..8d0e5c38 100644 --- a/casr/src/util.rs +++ b/casr/src/util.rs @@ -441,7 +441,7 @@ pub fn get_reports(dir: &Path) -> Result> { /// /// # Return value /// -/// * A vector of reports paths +/// * A vector of paths to correctly parsed reports /// * A vector of reports stacktraces /// * A vector of reports crashlines /// * A vector of bad reports diff --git a/casr/tests/tests.rs b/casr/tests/tests.rs index 2ec42ab1..3b49fe58 100644 --- a/casr/tests/tests.rs +++ b/casr/tests/tests.rs @@ -2780,7 +2780,7 @@ fn test_casr_cluster_u() { assert_eq!(after_cnt, 1, "After count mismatch."); - let re = Regex::new(r"Cluster silhouette index: (?P\d+.\d+)").unwrap(); + let re = Regex::new(r"Cluster silhouette score: (?P\d+.\d+)").unwrap(); let sil = re .captures(&res) .unwrap() @@ -2790,7 +2790,7 @@ fn test_casr_cluster_u() { .parse::() .unwrap(); - assert_eq!(sil, 0.15436556855344655, "Silhouette index mismatch."); + assert_eq!(sil, 0.15436556855344655, "Silhouette score mismatch."); // Test estimation let output = Command::new(*EXE_CASR_CLUSTER.read().unwrap()) @@ -2809,7 +2809,7 @@ fn test_casr_cluster_u() { assert!(!res.is_empty()); - let re = Regex::new(r"Cluster 
silhouette index: (?P<sil>\d+.\d+)").unwrap(); + let re = Regex::new(r"Cluster silhouette score: (?P<sil>\d+.\d+)").unwrap(); let sil = re .captures(&res) .unwrap() .name("sil") .map(|x| x.as_str()) .unwrap() .parse::<f64>() .unwrap(); - assert_eq!(sil, 0.15436556855344655, "Silhouette index mismatch."); + assert_eq!(sil, 0.15436556855344655, "Silhouette score mismatch."); let _ = std::fs::remove_dir_all(&paths[1]); } diff --git a/docs/usage.md b/docs/usage.md index 18b8b592..b6b713e9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -234,7 +234,7 @@ Tool for clustering CASR reports Merge INPUT_DIR into OUTPUT_DIR. Only new CASR reports from INPUT_DIR will be added to OUTPUT_DIR. -u, --update - Update clusters from OLD_DIR using CASR reports from NEW_DIR. + Update clusters from OLD_DIR using CASR reports from NEW_DIR --inner-strategy Strategy for inner cluster choosing when updating [default: Dist] [possible values: Diam, Dist] @@ -242,7 +242,7 @@ Tool for clustering CASR reports Strategy for outer cluster choosing when updating [default: Dist] [possible values: Delta, Diam, Dist] --estimate - Make cluster estimation for DIR using silhouette index + Calculate silhouette score for clustering results --ignore File with regular expressions for functions and file paths that should be ignored @@ -315,6 +315,25 @@ For `CASR_CLUSTER_UNIQUE_CRASHLINE` a `false` literal is `n`, `no`, `f`, `false`, `off` or `0`. An absent environment variable will also be considered as `false`. Anything else will considered as true. +For updating clusters we use the following strategy: +Every CASR report may be in one of several states: + * `Duplicate` - the report is a duplicate of one from some cluster, + * `Inner` - the report is "inside" some cluster with some proximity measure, + * `Outer` - the report is "outside" some cluster with some proximity measure, + * `Out` of threshold - the report is out of threshold for any cluster. +If a report is `Duplicate`, we do nothing. +If a report is `Oot`, we perform clustering for all such reports. +If a report is `Inner` or `Outer` for a single cluster, we update that cluster. +If there are several `Inner` or `Outer` clusters for the report we choose the +"closest" one according to one of the following strategies: + * `Delta` - we choose the cluster with minimal diameter change, + i.e. `Argmin (diam (cluster + {new}) - diam (cluster))` + * `Diam` - we choose the cluster with minimal diameter, + i.e. `Argmin diam (cluster + {new})` + * `Dist` - we choose the cluster with minimal distance between the cluster and the report, + i.e. `Argmin dist (cluster, {new})` +N.B.
`Delta` strategy is a nonsensical strategy in `Inner` case + ## casr-cli App provides text-based user interface to view CASR reports, prints joint statistics for diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index b7a421af..c2f5d3e9 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -51,7 +51,7 @@ pub enum Relation { } /// Cluster accumulation strategy -#[derive(Clone, Debug)] +#[derive(Clone, Copy, Debug)] pub enum AccumStrategy { /// Argmin (diam (cluster + {new}) - diam (cluster)) Delta, @@ -61,7 +61,7 @@ pub enum AccumStrategy { Dist, } -/// Structure provides an interface for leverages with CASR report clusters +/// Structure provides an abstraction for cluster with CASR reports pub struct Cluster { /// Cluster number pub number: usize, @@ -81,8 +81,8 @@ impl Cluster { } } /// Get CASR report stactraces - pub fn stacktraces(&self) -> Vec { - self.stacktraces.clone() + pub fn stacktraces(&self) -> &Vec { + &self.stacktraces } /// Add CASR report stacktrace to cluster pub fn push(&mut self, stacktrace: Stacktrace) { @@ -96,6 +96,60 @@ impl Cluster { } self.diam.unwrap() } + /// Get "relation" between new report and specified cluster + /// + /// # Arguments + /// + /// * `new` - new report stacktrace + /// + /// * `inner_strategy` - cluster accumulation strategy if `new` is "inner" + /// + /// * `inner_strategy` - cluster accumulation strategy if `new` is "outer" + /// + /// # Return value + /// + /// `Relation` enum with proximity measure according specified strategy + pub fn relation( + &mut self, + new: &Stacktrace, + inner_strategy: AccumStrategy, + outer_strategy: AccumStrategy, + ) -> Relation { + let diam = self.diam(); + let mut min = MAX; + let mut max = 0f64; + for stacktrace in self.stacktraces() { + let dist = 1.0 - similarity(new, stacktrace); + if dist == 0.0 { + return Relation::Dup; + } else if dist > THRESHOLD { + return Relation::Oot; + } + if dist < min { + min = dist; + } + if dist > max { + max = dist; + } + } + if diam >= max { + // Inner + let rel = match inner_strategy { + // Delta is a nonsensical strategy in this case + AccumStrategy::Diam => diam, + _ => min, + }; + Relation::Inner(rel) + } else { + // Outer + let rel = match outer_strategy { + AccumStrategy::Diam => max, + AccumStrategy::Delta => max - diam, + AccumStrategy::Dist => min, + }; + Relation::Outer(rel) + } + } } /// This macro updates variables used to remove trusted functions from stack trace @@ -377,63 +431,6 @@ fn diam(stacktraces: &[Stacktrace]) -> f64 { diam } -/// Get "relation" between new report and specified cluster -/// -/// # Arguments -/// -/// * `new` - new report stacktrace -/// -/// * `stacktraces` - cluster represented as slice of `Stacktrace` structures -/// -/// * `inner_strategy` - cluster accumulation strategy if `new` is "inner" -/// -/// * `inner_strategy` - cluster accumulation strategy if `new` is "outer" -/// -/// # Return value -/// -/// `Relation` enum with proximity measure according specified strategy -pub fn relation( - new: &Stacktrace, - cluster: &mut Cluster, - inner_strategy: AccumStrategy, - outer_strategy: AccumStrategy, -) -> Relation { - let diam = cluster.diam(); - let mut min = MAX; - let mut max = 0f64; - for stacktrace in cluster.stacktraces() { - let dist = 1.0 - similarity(new, &stacktrace); - if dist == 0.0 { - return Relation::Dup; - } else if dist > THRESHOLD { - return Relation::Oot; - } - if dist < min { - min = dist; - } - if dist > max { - max = dist; - } - } - if diam >= max { - // Inner - let rel = 
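// Illustrative aside, not part of the upstream patches: a hypothetical numeric example of
// how the proximity measures computed here (and later minimized when picking a cluster)
// can disagree for an "outer" report. The clusters A and B and all numbers below are
// assumptions made up for the sketch; every member distance is assumed below THRESHOLD.
//   A: diam = 0.05, nearest member at 0.08, farthest at 0.10 => Outer, since 0.05 < 0.10
//   B: diam = 0.11, nearest member at 0.09, farthest at 0.12 => Outer, since 0.11 < 0.12
// The measure each strategy yields, and the cluster with the minimal value:
//   Delta: A -> 0.10 - 0.05 = 0.05, B -> 0.12 - 0.11 = 0.01  => B is chosen
//   Diam:  A -> 0.10,               B -> 0.12                => A is chosen
//   Dist:  A -> 0.08,               B -> 0.09                => A is chosen
// so the selected cluster really does depend on the --outer-strategy option.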
match inner_strategy { - // Delta is a nonsensical strategy in this case - AccumStrategy::Diam => diam, - _ => min, - }; - Relation::Inner(rel) - } else { - // Outer - let rel = match outer_strategy { - AccumStrategy::Diam => max, - AccumStrategy::Delta => max - diam, - AccumStrategy::Dist => min, - }; - Relation::Outer(rel) - } -} - /// Get "a" subcoefficient silhouette coefficient calculating for given stacktrace /// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition /// From 85d7b7fa06118a72b86aa7381f1ef4a080e2a169 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Wed, 13 Dec 2023 14:53:13 +0300 Subject: [PATCH 11/34] Fixes --- casr/src/bin/casr-cluster.rs | 33 +++++++++++++-------------------- casr/src/util.rs | 2 +- libcasr/src/stacktrace.rs | 20 ++++++++++---------- 3 files changed, 24 insertions(+), 31 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index a3f96e4c..23da7615 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -61,7 +61,7 @@ fn make_clusters( } // Get casreps with stacktraces and crashlines - let (casreps, stacktraces, crashlines, badreports) = util::reports_from_dirs(casreps, jobs); + let (casreps, stacktraces, crashlines, badreports) = util::reports_from_paths(casreps, jobs); if !badreports.is_empty() { fs::create_dir_all(format!("{}/clerr", &outpath.display()))?; @@ -336,7 +336,7 @@ fn update_clusters( ) -> Result<(usize, usize, usize, usize, usize, usize)> { // Get new casreps let casreps = util::get_reports(newpath)?; - let (casreps, stacktraces, crashlines, _) = util::reports_from_dirs(casreps, jobs); + let (casreps, stacktraces, crashlines, _) = util::reports_from_paths(casreps, jobs); let casreps = casreps .iter() .zip(stacktraces.iter().zip(crashlines.iter())); @@ -346,12 +346,8 @@ fn update_clusters( .unwrap() .map(|path| path.unwrap().path()) .filter(|path| { - path.clone() - .file_name() - .unwrap() - .to_str() - .unwrap() - .starts_with("cl") + let name = path.file_name().unwrap().to_str().unwrap(); + name.starts_with("cl") && !name.starts_with("clerr") }) .collect(); cluster_dirs.sort(); @@ -363,15 +359,13 @@ fn update_clusters( // Get casreps from each existing cluster for cluster in &cluster_dirs { // Get cluster number - let Ok(i) = cluster.clone().file_name().unwrap().to_str().unwrap()[2..] + let i = cluster.clone().file_name().unwrap().to_str().unwrap()[2..] 
.to_string() .parse::() - else { - continue; - }; + .unwrap(); // Get casreps from cluster let casreps = util::get_reports(cluster)?; - let (_, stacktraces, crashlines, _) = util::reports_from_dirs(casreps, jobs); + let (_, stacktraces, crashlines, _) = util::reports_from_paths(casreps, jobs); // Fill cluster info structures clusters.push(Cluster::new(i, stacktraces)); if dedup { @@ -496,12 +490,8 @@ fn avg_sil(dir: &Path, jobs: usize) -> Result { .unwrap() .map(|path| path.unwrap().path()) .filter(|path| { - path.clone() - .file_name() - .unwrap() - .to_str() - .unwrap() - .starts_with("cl") + let name = path.file_name().unwrap().to_str().unwrap(); + name.starts_with("cl") && !name.starts_with("clerr") }) .collect(); dirs.sort(); @@ -519,12 +509,15 @@ fn avg_sil(dir: &Path, jobs: usize) -> Result { // Get casreps from cluster let casreps = util::get_reports(dir)?; // Get stacktraces from cluster - let (_, stacktraces, _, _) = util::reports_from_dirs(casreps, jobs); + let (_, stacktraces, _, _) = util::reports_from_paths(casreps, jobs); // Update size size += stacktraces.len(); // Add stacktraces clusters.push(stacktraces); } + if size == 0 { + bail!("{} valid reports, nothing to calculate...", size); + } // Init sil sum let mut sum = 0f64; // Calculate silhouette coefficient for each casrep diff --git a/casr/src/util.rs b/casr/src/util.rs index 8d0e5c38..d7646896 100644 --- a/casr/src/util.rs +++ b/casr/src/util.rs @@ -445,7 +445,7 @@ pub fn get_reports(dir: &Path) -> Result> { /// * A vector of reports stacktraces /// * A vector of reports crashlines /// * A vector of bad reports -pub fn reports_from_dirs( +pub fn reports_from_paths( casreps: Vec, jobs: usize, ) -> (Vec, Vec, Vec, Vec) { diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index c2f5d3e9..fc3a9e84 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -445,11 +445,11 @@ fn diam(stacktraces: &[Stacktrace]) -> f64 { /// "a" subcoefficient silhouette coefficient fn sil_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 { let mut sum = 0f64; - for i in 0..stacktraces.len() { + for (i, stacktrace) in stacktraces.iter().enumerate() { if i == num { continue; } - sum += 1.0 - similarity(&stacktraces[num], &stacktraces[i]); + sum += 1.0 - similarity(&stacktraces[num], stacktrace); } sum / (stacktraces.len() - 1) as f64 } @@ -461,24 +461,24 @@ fn sil_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 { /// /// * `num` - given stacktrace number /// -/// * `cl` - cluster number of given stacktrace +/// * `i` - cluster number of given stacktrace /// /// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures /// /// # Return value /// /// "b" subcoefficient silhouette coefficient -fn sil_subcoef_b(num: usize, cl: usize, clusters: &[Vec]) -> f64 { +fn sil_subcoef_b(num: usize, i: usize, clusters: &[Vec]) -> f64 { let mut min = MAX; - for j in 0..clusters.len() { - if j == cl { + for (j, cluster) in clusters.iter().enumerate() { + if j == i { continue; } let mut sum = 0f64; - for i in 0..clusters[j].len() { - sum += 1.0 - similarity(&clusters[cl][num], &clusters[j][i]); + for stacktrace in cluster { + sum += 1.0 - similarity(&clusters[i][num], stacktrace); } - let res = sum / clusters[j].len() as f64; + let res = sum / cluster.len() as f64; if res < min { min = res; } @@ -499,7 +499,7 @@ fn sil_subcoef_b(num: usize, cl: usize, clusters: &[Vec]) -> f64 { /// /// # Return value /// -/// "b" subcoefficient silhouette coefficient +/// Silhouette coefficient pub fn 
sil_coef(num: usize, i: usize, clusters: &[Vec]) -> f64 { if clusters[i].len() != 1 { let a = sil_subcoef_a(num, &clusters[i]); From 742ec1d4cc8e7b96fd00e00b501bc9b84e5b9a44 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Wed, 13 Dec 2023 15:10:20 +0300 Subject: [PATCH 12/34] Fixes --- casr/src/bin/casr-cluster.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 23da7615..a421f294 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -5,7 +5,7 @@ use anyhow::{bail, Context, Result}; use clap::{builder::FalseyValueParser, Arg, ArgAction}; use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fs; use std::path::{Path, PathBuf}; use std::sync::RwLock; @@ -355,7 +355,7 @@ fn update_clusters( // Init clusters vector let mut clusters: Vec = Vec::new(); // Init dedup crashline list for each cluster - let mut unique_crashlines: Vec> = vec![HashSet::new(); len]; + let mut unique_crashlines: HashMap> = HashMap::new(); // Get casreps from each existing cluster for cluster in &cluster_dirs { // Get cluster number @@ -370,7 +370,11 @@ fn update_clusters( clusters.push(Cluster::new(i, stacktraces)); if dedup { // NOTE: Clusters enumerate from 1, not 0 - unique_crashlines[i - 1].extend(crashlines); + unique_crashlines.insert(i - 1, HashSet::new()); + unique_crashlines + .get_mut(&(i - 1)) + .unwrap() + .extend(crashlines); } } @@ -425,7 +429,10 @@ fn update_clusters( // Make crashline deduplication if dedup && !crashline.is_empty() - && !unique_crashlines[number - 1].insert(crashline.to_string()) + && !unique_crashlines + .get_mut(&(number - 1)) + .unwrap() + .insert(crashline.to_string()) { deduplicated += 1; continue; From 72be2f55526ac79b66777d0983aa9fd0c3cce5d2 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Wed, 13 Dec 2023 15:26:19 +0300 Subject: [PATCH 13/34] Fixes --- casr/src/bin/casr-cluster.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index a421f294..9df1c1c4 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -351,7 +351,9 @@ fn update_clusters( }) .collect(); cluster_dirs.sort(); - let len = cluster_dirs.len(); + + // Max cluster number + let mut max = 0usize; // Init clusters vector let mut clusters: Vec = Vec::new(); // Init dedup crashline list for each cluster @@ -363,18 +365,16 @@ fn update_clusters( .to_string() .parse::() .unwrap(); + // Update max cluster number + max = max.max(i); // Get casreps from cluster let casreps = util::get_reports(cluster)?; let (_, stacktraces, crashlines, _) = util::reports_from_paths(casreps, jobs); // Fill cluster info structures clusters.push(Cluster::new(i, stacktraces)); if dedup { - // NOTE: Clusters enumerate from 1, not 0 - unique_crashlines.insert(i - 1, HashSet::new()); - unique_crashlines - .get_mut(&(i - 1)) - .unwrap() - .extend(crashlines); + unique_crashlines.insert(i, HashSet::new()); + unique_crashlines.get_mut(&(i)).unwrap().extend(crashlines); } } @@ -430,7 +430,7 @@ fn update_clusters( if dedup && !crashline.is_empty() && !unique_crashlines - .get_mut(&(number - 1)) + .get_mut(&(number)) .unwrap() .insert(crashline.to_string()) { @@ -471,7 +471,7 @@ fn update_clusters( } // Cluster deviant casreps let (result, before, after) = - make_clusters(Path::new(&deviant_dir), 
Some(oldpath), jobs, dedup, len)?; + make_clusters(Path::new(&deviant_dir), Some(oldpath), jobs, dedup, max)?; let _ = fs::remove_dir_all(&deviant_dir); (result, before, after) } else { From d99d025bc3950800d5af2bc84608146f090d4520 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Wed, 13 Dec 2023 16:33:00 +0300 Subject: [PATCH 14/34] Fix usage --- docs/usage.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index b6b713e9..24133441 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -321,6 +321,7 @@ Every CASR report may be in one of several states: * `Inner` - the report is "inside" a some cluster with some proximity measure, * `Outer` - the report is "outside" a some cluster with some proximity measure, * `Out` of threshold - the report is out of threshold for any cluster. + If report is `Duplicate` we do nothing. If report is `Oot` we perform clustering for all such reports. If report is `Inner` or `Outer` for a single cluster we update the cluster. @@ -332,6 +333,7 @@ If there are several `Inner` or `Outer` clusters for the report we choose the i.e. `Argmin diam (cluster + {new})` * `Dist` - we choose cluster with minimal distance between cluster and report, i.e. `Argmin dist (cluster, {new})` + N.B. `Delta` strategy is a nonsensical strategy in `Inner` case ## casr-cli From a847ff3e46a7691c7ddc42b5a539d5c8cd6c3959 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Thu, 21 Dec 2023 16:00:54 +0300 Subject: [PATCH 15/34] Add tolerance-level option --- casr/src/bin/casr-cluster.rs | 30 +++++++++++++++++++++++++++++- casr/tests/tests.rs | 2 +- libcasr/src/stacktrace.rs | 12 ++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 9df1c1c4..ac536679 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -318,6 +318,8 @@ fn merge_dirs(input: &Path, output: &Path) -> Result { /// /// * `outer_strategy` - strategy for "outer" report case /// +/// * `tolerance_level` - cluster tolerance level to "outer" reports +/// /// # Return value /// /// * Number of casreps added to old clusters @@ -333,6 +335,7 @@ fn update_clusters( dedup: bool, inner_strategy: AccumStrategy, outer_strategy: AccumStrategy, + tolerance_level: ToleranceLevel, ) -> Result<(usize, usize, usize, usize, usize, usize)> { // Get new casreps let casreps = util::get_reports(newpath)?; @@ -406,7 +409,15 @@ fn update_clusters( inners.push((cluster.number, measure)); } Relation::Outer(measure) => { - outers.push((cluster.number, measure)); + match tolerance_level { + ToleranceLevel::Loyal => { + outers.push((cluster.number, measure)); + } + // TODO: Add "Soft" + _ => { + deviants.push(casrep); + } + } } Relation::Oot => { continue; @@ -632,8 +643,18 @@ fn main() -> Result<()> { .default_value("Dist") .help("Strategy for outer cluster choosing when updating"), ) + .arg( + Arg::new("tolerance-level") + .long("tolerance-level") + .value_name("LEVEL") + .action(ArgAction::Set) + .value_parser(["Loyal", "Hard"]) // TODO: Add "Soft" + .default_value("Loyal") + .help("Cluster tolerance level to new CASR reports") + ) .arg( Arg::new("estimate") + .short('e') .long("estimate") .value_name("DIR") .action(ArgAction::Set) @@ -728,6 +749,12 @@ fn main() -> Result<()> { "Diam" => AccumStrategy::Diam, _ => AccumStrategy::Dist, }; + let tolerance_level = matches.get_one::("tolerance-level").unwrap(); + let tolerance_level = match tolerance_level.as_str() { + "Loyal" => ToleranceLevel::Loyal, + // TODO: Add "Soft" + _ 
=> ToleranceLevel::Hard, + }; let (added, duplicates, deduplicated, result, before, after) = update_clusters( paths[0], @@ -736,6 +763,7 @@ fn main() -> Result<()> { dedup_crashlines, inner_strategy, outer_strategy, + tolerance_level, )?; println!("Number of casreps added to old clusters: {added}"); println!("Number of duplicates: {duplicates}"); diff --git a/casr/tests/tests.rs b/casr/tests/tests.rs index 3b49fe58..f222fe8f 100644 --- a/casr/tests/tests.rs +++ b/casr/tests/tests.rs @@ -2794,7 +2794,7 @@ fn test_casr_cluster_u() { // Test estimation let output = Command::new(*EXE_CASR_CLUSTER.read().unwrap()) - .args(["--estimate", &paths[1]]) + .args(["-e", &paths[1]]) .output() .expect("failed to start casr-cluster"); diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index fc3a9e84..de077e92 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -61,6 +61,18 @@ pub enum AccumStrategy { Dist, } +/// Cluster tolerance level to new CASR reports +#[derive(Clone, Copy, Debug)] +pub enum ToleranceLevel { + /// May insert any "Inner" and "Outer" CASR reports + Loyal, + /// May insert only "Inner" CASR reports + Hard, + /// May insert any "Inner" CASR reports + /// But "Outers" may be added only as subclusters after their clustering + Soft, +} + /// Structure provides an abstraction for cluster with CASR reports pub struct Cluster { /// Cluster number From 3be856151ab5472edcac1614c5de838cd468fee1 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Fri, 22 Dec 2023 15:07:32 +0300 Subject: [PATCH 16/34] Move crashlines to Cluster struct --- casr/src/bin/casr-cluster.rs | 53 ++++++++++++++---------------------- libcasr/src/stacktrace.rs | 28 +++++++++++++++++-- 2 files changed, 45 insertions(+), 36 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index ac536679..7363132c 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -358,9 +358,7 @@ fn update_clusters( // Max cluster number let mut max = 0usize; // Init clusters vector - let mut clusters: Vec = Vec::new(); - // Init dedup crashline list for each cluster - let mut unique_crashlines: HashMap> = HashMap::new(); + let mut clusters: HashMap = HashMap::new(); // Get casreps from each existing cluster for cluster in &cluster_dirs { // Get cluster number @@ -373,12 +371,10 @@ fn update_clusters( // Get casreps from cluster let casreps = util::get_reports(cluster)?; let (_, stacktraces, crashlines, _) = util::reports_from_paths(casreps, jobs); + // Drop crashlines if they're unused + let crashlines = if dedup { crashlines } else { Vec::new() }; // Fill cluster info structures - clusters.push(Cluster::new(i, stacktraces)); - if dedup { - unique_crashlines.insert(i, HashSet::new()); - unique_crashlines.get_mut(&(i)).unwrap().extend(crashlines); - } + clusters.insert(i, Cluster::new(i, stacktraces, crashlines)); } // Init list of casreps, which aren't suitable for any cluster @@ -397,7 +393,7 @@ fn update_clusters( let mut outers: Vec<(usize, f64)> = Vec::new(); // Checker if casrep is duplicate of someone else let mut dup = false; - for cluster in &mut clusters { + for cluster in clusters.values_mut() { let relation = cluster.relation(stacktrace, inner_strategy, outer_strategy); match relation { Relation::Dup => { @@ -408,17 +404,14 @@ fn update_clusters( Relation::Inner(measure) => { inners.push((cluster.number, measure)); } - Relation::Outer(measure) => { - match tolerance_level { - ToleranceLevel::Loyal => { - outers.push((cluster.number, measure)); - } - // 
TODO: Add "Soft" - _ => { - deviants.push(casrep); - } + Relation::Outer(measure) => match tolerance_level { + ToleranceLevel::Loyal => { + outers.push((cluster.number, measure)); } - } + _ => { + deviants.push(casrep); + } + }, Relation::Oot => { continue; } @@ -437,14 +430,12 @@ fn update_clusters( continue; }; - // Make crashline deduplication - if dedup - && !crashline.is_empty() - && !unique_crashlines - .get_mut(&(number)) - .unwrap() - .insert(crashline.to_string()) - { + // Update cluster (and dedup crashline) + if !clusters.get_mut(&(number)).unwrap().insert( + stacktrace.to_vec(), + crashline.to_string(), + dedup, + ) { deduplicated += 1; continue; } @@ -459,10 +450,6 @@ fn update_clusters( &casrep.file_name().unwrap().to_str().unwrap() ), )?; - - // Update cluster - let i = clusters.iter().position(|a| a.number == number).unwrap(); - clusters[i].push(stacktrace.to_vec()); } // Handle deviant casreps @@ -648,7 +635,7 @@ fn main() -> Result<()> { .long("tolerance-level") .value_name("LEVEL") .action(ArgAction::Set) - .value_parser(["Loyal", "Hard"]) // TODO: Add "Soft" + .value_parser(["Loyal", "Soft", "Hard"]) .default_value("Loyal") .help("Cluster tolerance level to new CASR reports") ) @@ -752,7 +739,7 @@ fn main() -> Result<()> { let tolerance_level = matches.get_one::("tolerance-level").unwrap(); let tolerance_level = match tolerance_level.as_str() { "Loyal" => ToleranceLevel::Loyal, - // TODO: Add "Soft" + "Soft" => ToleranceLevel::Soft, _ => ToleranceLevel::Hard, }; diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index de077e92..7d603855 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -81,25 +81,47 @@ pub struct Cluster { stacktraces: Vec, /// Cluster diameter diam: Option, + /// Cluster report crashlines + crashlines: HashSet, } impl Cluster { /// Create new `Cluster` - pub fn new(number: usize, stacktraces: Vec) -> Self { + pub fn new(number: usize, stacktraces: Vec, crashlines: Vec) -> Self { + let mut unique_crashlines: HashSet = HashSet::new(); + unique_crashlines.extend(crashlines); Cluster { number, stacktraces, diam: None, + crashlines: unique_crashlines, } } /// Get CASR report stactraces pub fn stacktraces(&self) -> &Vec { &self.stacktraces } - /// Add CASR report stacktrace to cluster - pub fn push(&mut self, stacktrace: Stacktrace) { + /// Add new CASR report to cluster + /// + /// # Arguments + /// + /// * `stacktrace` - new CASR report stacktrace + /// + /// * `crashline` - new CASR report crashline + /// + /// * `dedup` - deduplicate crashline, if true + /// + /// # Return value + /// + /// `true` if new CASR report may be added, + /// `false` if report is duplicate of someone else + pub fn insert(&mut self, stacktrace: Stacktrace, crashline: String, dedup: bool) -> bool { + if dedup && !crashline.is_empty() && !self.crashlines.insert(crashline.to_string()) { + return false; + } self.stacktraces.push(stacktrace); self.diam = None; + true } /// Get cluster diameter pub fn diam(&mut self) -> f64 { From 9c7274998190ce8d749e6a64ce3e6e7863d7c204 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Fri, 22 Dec 2023 17:19:37 +0300 Subject: [PATCH 17/34] Add interlayer for clustering --- casr/src/bin/casr-cluster.rs | 50 +++++++--------------- casr/src/util.rs | 29 ++++++++++++- libcasr/src/stacktrace.rs | 81 +++++++++++++++++++++++++++++++++++- 3 files changed, 122 insertions(+), 38 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 7363132c..1077f8d4 100644 --- 
a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -38,8 +38,6 @@ fn stacktrace(path: &Path) -> Result { /// /// * `dedup` - deduplicate casrep by crashline for each cluster, if true /// -/// * `offset` - cluster enumerate offset -/// /// # Return value /// /// * Number of clusters @@ -50,7 +48,6 @@ fn make_clusters( outpath: Option<&Path>, jobs: usize, dedup: bool, - offset: usize, ) -> Result<(usize, usize, usize)> { // if outpath is "None" we consider that outpath and inpath are the same let outpath = outpath.unwrap_or(inpath); @@ -87,7 +84,7 @@ fn make_clusters( // Cluster formation let cluster_cnt: usize = *clusters.iter().max().unwrap(); for i in 1..=cluster_cnt { - fs::create_dir_all(format!("{}/cl{}", &outpath.display(), i + offset))?; + fs::create_dir_all(format!("{}/cl{}", &outpath.display(), i))?; } // Init before and after dedup counters @@ -109,7 +106,7 @@ fn make_clusters( format!( "{}/cl{}/{}", &outpath.display(), - clusters[i] + offset, + clusters[i], &casreps[i].file_name().unwrap().to_str().unwrap() ), )?; @@ -374,11 +371,12 @@ fn update_clusters( // Drop crashlines if they're unused let crashlines = if dedup { crashlines } else { Vec::new() }; // Fill cluster info structures - clusters.insert(i, Cluster::new(i, stacktraces, crashlines)); + // NOTE: We don't care about paths of casreps from existing clusters + clusters.insert(i, Cluster::new(i, Vec::new(), stacktraces, crashlines)); } // Init list of casreps, which aren't suitable for any cluster - let mut deviants = Vec::<&PathBuf>::new(); + let mut deviants: Vec<(&PathBuf, (Stacktrace, String))> = Vec::new(); // Init added casreps counter let mut added = 0usize; // Init duplicates counter @@ -404,14 +402,11 @@ fn update_clusters( Relation::Inner(measure) => { inners.push((cluster.number, measure)); } - Relation::Outer(measure) => match tolerance_level { - ToleranceLevel::Loyal => { + Relation::Outer(measure) => { + if let ToleranceLevel::Loyal = tolerance_level { outers.push((cluster.number, measure)); } - _ => { - deviants.push(casrep); - } - }, + } Relation::Oot => { continue; } @@ -426,12 +421,13 @@ fn update_clusters( outers.iter().min_by(|a, b| a.1.total_cmp(&b.1)).unwrap().0 } else { // Out of threshold - deviants.push(casrep); + deviants.push((casrep, (stacktrace.to_vec(), crashline.to_string()))); continue; }; // Update cluster (and dedup crashline) - if !clusters.get_mut(&(number)).unwrap().insert( + if !clusters.get_mut(&number).unwrap().insert( + casrep.to_path_buf(), stacktrace.to_vec(), crashline.to_string(), dedup, @@ -454,24 +450,11 @@ fn update_clusters( // Handle deviant casreps let (result, before, after) = if !deviants.is_empty() { - // Copy casrep to tmp dir - let deviant_dir = format!("{}/deviant", &oldpath.display()); - fs::create_dir_all(&deviant_dir)?; - for casrep in deviants { - fs::copy( - casrep, - format!( - "{}/{}", - &deviant_dir, - &casrep.file_name().unwrap().to_str().unwrap() - ), - )?; - } - // Cluster deviant casreps - let (result, before, after) = - make_clusters(Path::new(&deviant_dir), Some(oldpath), jobs, dedup, max)?; - let _ = fs::remove_dir_all(&deviant_dir); - (result, before, after) + // Get clusters from deviants + let (deviant_clusters, before, after) = gen_clusters(&deviants, max, dedup)?; + // Save deviant clusters + util::save_clusters(&deviant_clusters, oldpath)?; + (deviant_clusters.len(), before, after) } else { (0, 0, 0) }; @@ -698,7 +681,6 @@ fn main() -> Result<()> { paths.get(1).map(|x| x.as_path()), jobs, dedup_crashlines, - 0, )?; 
println!("Number of clusters: {result}"); // Print crashline dedup summary diff --git a/casr/src/util.rs b/casr/src/util.rs index d7646896..e65d3ed3 100644 --- a/casr/src/util.rs +++ b/casr/src/util.rs @@ -3,7 +3,7 @@ extern crate libcasr; use libcasr::report::CrashReport; use libcasr::stacktrace::{ - Stacktrace, STACK_FRAME_FILEPATH_IGNORE_REGEXES, STACK_FRAME_FUNCTION_IGNORE_REGEXES, + Cluster, Stacktrace, STACK_FRAME_FILEPATH_IGNORE_REGEXES, STACK_FRAME_FUNCTION_IGNORE_REGEXES, }; use anyhow::{bail, Context, Result}; @@ -14,7 +14,7 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator}; use simplelog::*; use wait_timeout::ChildExt; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fs::{self, OpenOptions}; use std::io::Write; use std::io::{BufRead, BufReader}; @@ -493,3 +493,28 @@ pub fn reports_from_paths( (casreps, stacktraces, crashlines, badreports) } + +/// Save clusters to directory +/// +/// # Arguments +/// +/// * `clusters` - given `Cluster` structures for saving +/// +/// * `dir` - out directory +pub fn save_clusters(clusters: &HashMap, dir: &Path) -> Result<()> { + for cluster in clusters.values() { + fs::create_dir_all(format!("{}/cl{}", &dir.display(), cluster.number))?; + for casrep in cluster.paths() { + fs::copy( + casrep, + format!( + "{}/cl{}/{}", + &dir.display(), + cluster.number, + &casrep.file_name().unwrap().to_str().unwrap() + ), + )?; + } + } + Ok(()) +} diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index 7d603855..65446363 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -16,6 +16,7 @@ use kodama::{linkage, Method}; use regex::Regex; use std::collections::{HashMap, HashSet}; use std::fmt::{self, Write}; +use std::path::PathBuf; use std::sync::RwLock; // Re-export types from gdb_command for convenient use from Casr library @@ -77,6 +78,8 @@ pub enum ToleranceLevel { pub struct Cluster { /// Cluster number pub number: usize, + /// Cluster report paths + paths: Vec, /// Cluster report stacktraces stacktraces: Vec, /// Cluster diameter @@ -87,16 +90,26 @@ pub struct Cluster { impl Cluster { /// Create new `Cluster` - pub fn new(number: usize, stacktraces: Vec, crashlines: Vec) -> Self { + pub fn new( + number: usize, + paths: Vec, + stacktraces: Vec, + crashlines: Vec, + ) -> Self { let mut unique_crashlines: HashSet = HashSet::new(); unique_crashlines.extend(crashlines); Cluster { number, + paths, stacktraces, diam: None, crashlines: unique_crashlines, } } + /// Get CASR report paths + pub fn paths(&self) -> &Vec { + &self.paths + } /// Get CASR report stactraces pub fn stacktraces(&self) -> &Vec { &self.stacktraces @@ -115,10 +128,17 @@ impl Cluster { /// /// `true` if new CASR report may be added, /// `false` if report is duplicate of someone else - pub fn insert(&mut self, stacktrace: Stacktrace, crashline: String, dedup: bool) -> bool { + pub fn insert( + &mut self, + path: PathBuf, + stacktrace: Stacktrace, + crashline: String, + dedup: bool, + ) -> bool { if dedup && !crashline.is_empty() && !self.crashlines.insert(crashline.to_string()) { return false; } + self.paths.push(path); self.stacktraces.push(stacktrace); self.diam = None; true @@ -186,6 +206,63 @@ impl Cluster { } } +// TODO: Write a better description... 
+// NOTE: It's just interlayer between `Cluster` and `cluster_stacktrace` fn +/// Generate clusters from CASR report info +/// +/// # Arguments +/// +/// * `reports` - slice of report info: path, stacktrace, crashline +/// +/// * `offset` - cluster enumerate offset +/// +/// * `dedup` - deduplicate crashline, if true +/// +/// # Return value +/// +/// * `HashMap` of `Cluster` +/// * Number of valid casreps before crashiline deduplication +/// * Number of valid casreps after crashiline deduplication +pub fn gen_clusters( + reports: &[(&PathBuf, (Stacktrace, String))], + offset: usize, + dedup: bool, +) -> Result<(HashMap, usize, usize)> { + // Unzip casrep info + let (casreps, (stacktraces, crashlines)): (Vec<_>, (Vec<_>, Vec<_>)) = + reports.iter().cloned().unzip(); + let len = casreps.len(); + // Get stacktraces cluster numbers + let mut numbers = cluster_stacktraces(&stacktraces)?; + // Deduplicate by crashiline + let after = if dedup { + dedup_crashlines(&crashlines, &mut numbers) + } else { + len + }; + // Create clusters + let mut clusters: HashMap = HashMap::new(); + for i in 0..len { + if numbers[i] == 0 { + // Skip casreps with duplicate crashlines + continue; + } + let number = numbers[i] + offset; + // Add new cluster if not exists + clusters + .entry(number) + .or_insert_with(|| Cluster::new(number, Vec::new(), Vec::new(), Vec::new())); + // Update cluster + clusters.get_mut(&number).unwrap().insert( + casreps[i].to_path_buf(), + stacktraces[i].to_vec(), + crashlines[i].to_string(), + dedup, + ); + } + Ok((clusters, len, after)) +} + /// This macro updates variables used to remove trusted functions from stack trace #[macro_export] macro_rules! init_ignored_frames { From 6b0eb622396b46502a492b9980092f1df7564c78 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Fri, 22 Dec 2023 19:01:01 +0300 Subject: [PATCH 18/34] Add Soft level --- casr/src/bin/casr-cluster.rs | 75 +++++++++++++++++++++++++++++++++++- libcasr/src/stacktrace.rs | 46 +++++++++++++++++++--- 2 files changed, 114 insertions(+), 7 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 1077f8d4..fabe04d6 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -451,7 +451,18 @@ fn update_clusters( // Handle deviant casreps let (result, before, after) = if !deviants.is_empty() { // Get clusters from deviants - let (deviant_clusters, before, after) = gen_clusters(&deviants, max, dedup)?; + let (mut deviant_clusters, before, mut after) = gen_clusters(&deviants, max, dedup)?; + if let ToleranceLevel::Soft = tolerance_level { + // Merge old and new clusters + let removed = if let Ok(removed) = + merge_clusters(&mut clusters, &mut deviant_clusters, oldpath, dedup) + { + removed + } else { + 0 + }; + after -= removed; + } // Save deviant clusters util::save_clusters(&deviant_clusters, oldpath)?; (deviant_clusters.len(), before, after) @@ -461,6 +472,68 @@ fn update_clusters( Ok((added, duplicates, deduplicated, result, before, after)) } +/// Try to merge new clusters to old clusters +/// +/// # Arguments +/// +/// * `olds` - list of old clusters represented as `HashMap` of `Cluster` +/// +/// * `news` - list of new clusters represented as `HashMap` of `Cluster` +/// +/// * `dir` - out directory +/// +/// * `dedup` - deduplicate crashline, if true +/// +/// # Return value +/// +/// Number of removed by crashline deduplication CASR reports +pub fn merge_clusters( + olds: &mut HashMap, + news: &mut HashMap, + dir: &Path, + dedup: bool, +) -> Result { + let mut duplicate = 
0usize; + for old in olds.values_mut() { + let mut merged = Vec::new(); + for new in news.values() { + if !old.may_merge(new) { + continue; + } + // Copy casreps from new to old + for (casrep, stacktrace, crashline) in new.reports() { + // Update cluster (and dedup crashline) + if !old.insert( + casrep.to_path_buf(), + stacktrace.to_vec(), + crashline.to_string(), + dedup, + ) { + duplicate += 1; + continue; + } + // Save report + fs::copy( + &casrep, + format!( + "{}/cl{}/{}", + &dir.display(), + old.number, + &casrep.file_name().unwrap().to_str().unwrap() + ), + )?; + } + // Mark merged cluster for drop + merged.push(new.number); + } + // Drop marked cluster + for number in merged { + news.remove(&number); + } + } + Ok(duplicate) +} + /// Calculate silhouette coefficient /// /// # Arguments diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index 65446363..793e9b63 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -85,7 +85,7 @@ pub struct Cluster { /// Cluster diameter diam: Option, /// Cluster report crashlines - crashlines: HashSet, + crashlines: HashMap, } impl Cluster { @@ -96,8 +96,10 @@ impl Cluster { stacktraces: Vec, crashlines: Vec, ) -> Self { - let mut unique_crashlines: HashSet = HashSet::new(); - unique_crashlines.extend(crashlines); + let mut unique_crashlines: HashMap = HashMap::new(); + for (i, crashline) in crashlines.iter().enumerate().take(crashlines.len()) { + unique_crashlines.insert(crashline.clone(), i); + } Cluster { number, paths, @@ -135,12 +137,14 @@ impl Cluster { crashline: String, dedup: bool, ) -> bool { - if dedup && !crashline.is_empty() && !self.crashlines.insert(crashline.to_string()) { + if dedup && !crashline.is_empty() && self.crashlines.contains_key(&crashline) { return false; } self.paths.push(path); self.stacktraces.push(stacktrace); self.diam = None; + self.crashlines + .insert(crashline.to_string(), self.paths.len()); true } /// Get cluster diameter @@ -204,10 +208,40 @@ impl Cluster { Relation::Outer(rel) } } + /// Check if cluster may be merged with another one + pub fn may_merge(&self, cluster: &Cluster) -> bool { + let mut stacktraces1 = self.stacktraces.clone(); + let mut stacktraces2 = cluster.stacktraces().clone(); + stacktraces1.append(&mut stacktraces2); + diam(&stacktraces1) < THRESHOLD + } + // TODO: change type + /// Convert cluster to iterator + pub fn reports(&self) -> Vec<(PathBuf, Stacktrace, String)> { + let mut reports: Vec<(PathBuf, Stacktrace, String)> = Vec::new(); + let mut crashlines = self.crashlines.clone(); + for i in 0..self.paths.len() { + // Get crashline for cur casrep + let mut crashline = String::new(); + for (line, &number) in &crashlines { + if number == i { + crashline = line.to_string(); + break; + } + } + // Drop cur crashline from crashlines + crashlines.remove(&crashline); + // Update results + reports.push(( + self.paths[i].clone(), + self.stacktraces[i].clone(), + crashline, + )); + } + reports + } } -// TODO: Write a better description... 
-// NOTE: It's just interlayer between `Cluster` and `cluster_stacktrace` fn /// Generate clusters from CASR report info /// /// # Arguments From 4a4251e182a0b9b2800b660724e5bcb1eedcf97c Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Fri, 22 Dec 2023 23:07:34 +0300 Subject: [PATCH 19/34] Adjust Soft stat --- casr/src/bin/casr-cluster.rs | 51 ++++++++++++++++-------------------- casr/src/util.rs | 25 ++++++++++++++++++ docs/usage.md | 5 +++- 3 files changed, 52 insertions(+), 29 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index fabe04d6..017c5e4d 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -357,22 +357,13 @@ fn update_clusters( // Init clusters vector let mut clusters: HashMap = HashMap::new(); // Get casreps from each existing cluster - for cluster in &cluster_dirs { - // Get cluster number - let i = cluster.clone().file_name().unwrap().to_str().unwrap()[2..] - .to_string() - .parse::() - .unwrap(); + for cluster_dir in &cluster_dirs { + // Get cluster + let cluster = util::cluster_from_dir(cluster_dir, jobs)?; // Update max cluster number - max = max.max(i); - // Get casreps from cluster - let casreps = util::get_reports(cluster)?; - let (_, stacktraces, crashlines, _) = util::reports_from_paths(casreps, jobs); - // Drop crashlines if they're unused - let crashlines = if dedup { crashlines } else { Vec::new() }; + max = max.max(cluster.number); // Fill cluster info structures - // NOTE: We don't care about paths of casreps from existing clusters - clusters.insert(i, Cluster::new(i, Vec::new(), stacktraces, crashlines)); + clusters.insert(cluster.number, cluster); } // Init list of casreps, which aren't suitable for any cluster @@ -451,17 +442,16 @@ fn update_clusters( // Handle deviant casreps let (result, before, after) = if !deviants.is_empty() { // Get clusters from deviants - let (mut deviant_clusters, before, mut after) = gen_clusters(&deviants, max, dedup)?; + let (mut deviant_clusters, mut before, mut after) = gen_clusters(&deviants, max, dedup)?; if let ToleranceLevel::Soft = tolerance_level { // Merge old and new clusters - let removed = if let Ok(removed) = - merge_clusters(&mut clusters, &mut deviant_clusters, oldpath, dedup) - { - removed - } else { - 0 - }; - after -= removed; + let (moved, removed) = + merge_clusters(&mut clusters, &mut deviant_clusters, oldpath, dedup)?; + // Adjust stat + added += moved; + deduplicated += removed; + before = 0; // Impossible to know (proofed by @hkctkuy) + after -= moved + removed; } // Save deviant clusters util::save_clusters(&deviant_clusters, oldpath)?; @@ -486,14 +476,16 @@ fn update_clusters( /// /// # Return value /// +/// Number of moved to old clusters CASR reports /// Number of removed by crashline deduplication CASR reports pub fn merge_clusters( olds: &mut HashMap, news: &mut HashMap, dir: &Path, dedup: bool, -) -> Result { - let mut duplicate = 0usize; +) -> Result<(usize, usize)> { + let mut moved = 0usize; + let mut removed = 0usize; for old in olds.values_mut() { let mut merged = Vec::new(); for new in news.values() { @@ -509,10 +501,11 @@ pub fn merge_clusters( crashline.to_string(), dedup, ) { - duplicate += 1; + removed += 1; continue; } // Save report + moved += 1; fs::copy( &casrep, format!( @@ -531,7 +524,7 @@ pub fn merge_clusters( news.remove(&number); } } - Ok(duplicate) + Ok((moved, removed)) } /// Calculate silhouette coefficient @@ -816,8 +809,10 @@ fn main() -> Result<()> { println!("Number of new clusters: {result}"); } // Print 
crashline dedup summary - if before != after { + if before != 0 { println!("Number of reports before crashline deduplication in new clusters: {before}"); + } + if before != after { println!("Number of reports after crashline deduplication in new clusters: {after}"); } let sil = avg_sil(paths[1], jobs)?; diff --git a/casr/src/util.rs b/casr/src/util.rs index e65d3ed3..1cb3dee6 100644 --- a/casr/src/util.rs +++ b/casr/src/util.rs @@ -494,6 +494,31 @@ pub fn reports_from_paths( (casreps, stacktraces, crashlines, badreports) } +/// Get `Cluster` structure from specified directory path. +/// +/// # Arguments +/// +/// * `dir` - valid cluster dir path +/// +/// * `jobs` - number of jobs for parsing process +/// +/// # Return value +/// +/// `Cluster` structure +pub fn cluster_from_dir(dir: &Path, jobs: usize) -> Result { + // Get cluster number + let i = dir.file_name().unwrap().to_str().unwrap()[2..] + .to_string() + .parse::() + .unwrap(); + // Get casreps from cluster + let casreps = get_reports(dir)?; + let (_, stacktraces, crashlines, _) = reports_from_paths(casreps, jobs); + // Create cluster + // NOTE: We don't care about paths of casreps from existing clusters + Ok(Cluster::new(i, Vec::new(), stacktraces, crashlines)) +} + /// Save clusters to directory /// /// # Arguments diff --git a/docs/usage.md b/docs/usage.md index 24133441..93fad32d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -241,7 +241,10 @@ Tool for clustering CASR reports --outer-strategy Strategy for outer cluster choosing when updating [default: Dist] [possible values: Delta, Diam, Dist] - --estimate + --tolerance-level + Cluster tolerance level to new CASR reports [default: Loyal] [possible values: + Loyal, Soft, Hard] + -e, --estimate Calculate silhouette score for clustering results --ignore File with regular expressions for functions and file paths that should be From 450ef8f74a7981a6729bc4f283ff50431296f701 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Sat, 23 Dec 2023 00:09:00 +0300 Subject: [PATCH 20/34] Reduce repetitive code --- casr/src/bin/casr-cluster.rs | 56 +++++++++--------------------------- casr/src/util.rs | 23 ++++++--------- libcasr/src/stacktrace.rs | 6 ++-- 3 files changed, 25 insertions(+), 60 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 017c5e4d..db60dcd5 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -58,7 +58,7 @@ fn make_clusters( } // Get casreps with stacktraces and crashlines - let (casreps, stacktraces, crashlines, badreports) = util::reports_from_paths(casreps, jobs); + let (casreps, badreports) = util::reports_from_paths(casreps, jobs); if !badreports.is_empty() { fs::create_dir_all(format!("{}/clerr", &outpath.display()))?; @@ -74,44 +74,16 @@ fn make_clusters( } } - if stacktraces.len() < 2 { - bail!("{} valid reports, nothing to cluster...", stacktraces.len()); + if casreps.len() < 2 { + bail!("{} valid reports, nothing to cluster...", casreps.len()); } // Get clusters - let mut clusters = cluster_stacktraces(&stacktraces)?; + let (clusters, before, after) = gen_clusters(&casreps, 0, dedup)?; + // Save clusters + util::save_clusters(&clusters, outpath)?; - // Cluster formation - let cluster_cnt: usize = *clusters.iter().max().unwrap(); - for i in 1..=cluster_cnt { - fs::create_dir_all(format!("{}/cl{}", &outpath.display(), i))?; - } - - // Init before and after dedup counters - let before_cnt = casreps.len(); - let mut after_cnt = before_cnt; - - // Get clusters with crashline deduplication - if dedup { - 
after_cnt = dedup_crashlines(&crashlines, &mut clusters); - } - - for i in 0..clusters.len() { - // Skip casreps with duplicate crashlines - if clusters[i] == 0 { - continue; - } - fs::copy( - &casreps[i], - format!( - "{}/cl{}/{}", - &outpath.display(), - clusters[i], - &casreps[i].file_name().unwrap().to_str().unwrap() - ), - )?; - } - Ok((cluster_cnt, before_cnt, after_cnt)) + Ok((clusters.len(), before, after)) } /// Remove duplicate casreps @@ -336,10 +308,7 @@ fn update_clusters( ) -> Result<(usize, usize, usize, usize, usize, usize)> { // Get new casreps let casreps = util::get_reports(newpath)?; - let (casreps, stacktraces, crashlines, _) = util::reports_from_paths(casreps, jobs); - let casreps = casreps - .iter() - .zip(stacktraces.iter().zip(crashlines.iter())); + let (casreps, _) = util::reports_from_paths(casreps, jobs); // Get casreps from existing clusters let mut cluster_dirs: Vec = fs::read_dir(oldpath) @@ -367,7 +336,7 @@ fn update_clusters( } // Init list of casreps, which aren't suitable for any cluster - let mut deviants: Vec<(&PathBuf, (Stacktrace, String))> = Vec::new(); + let mut deviants: Vec = Vec::new(); // Init added casreps counter let mut added = 0usize; // Init duplicates counter @@ -383,7 +352,7 @@ fn update_clusters( // Checker if casrep is duplicate of someone else let mut dup = false; for cluster in clusters.values_mut() { - let relation = cluster.relation(stacktrace, inner_strategy, outer_strategy); + let relation = cluster.relation(&stacktrace, inner_strategy, outer_strategy); match relation { Relation::Dup => { dup = true; @@ -430,7 +399,7 @@ fn update_clusters( // Save casrep added += 1; fs::copy( - casrep, + &casrep, format!( "{}/{}", &cluster_dirs[number - 1].display(), @@ -563,7 +532,8 @@ fn avg_sil(dir: &Path, jobs: usize) -> Result { // Get casreps from cluster let casreps = util::get_reports(dir)?; // Get stacktraces from cluster - let (_, stacktraces, _, _) = util::reports_from_paths(casreps, jobs); + let (casreps, _) = util::reports_from_paths(casreps, jobs); + let (_, (stacktraces, _)): (Vec<_>, (Vec<_>, Vec<_>)) = casreps.iter().cloned().unzip(); // Update size size += stacktraces.len(); // Add stacktraces diff --git a/casr/src/util.rs b/casr/src/util.rs index 1cb3dee6..809505a5 100644 --- a/casr/src/util.rs +++ b/casr/src/util.rs @@ -3,7 +3,7 @@ extern crate libcasr; use libcasr::report::CrashReport; use libcasr::stacktrace::{ - Cluster, Stacktrace, STACK_FRAME_FILEPATH_IGNORE_REGEXES, STACK_FRAME_FUNCTION_IGNORE_REGEXES, + Cluster, ReportInfo, STACK_FRAME_FILEPATH_IGNORE_REGEXES, STACK_FRAME_FUNCTION_IGNORE_REGEXES, }; use anyhow::{bail, Context, Result}; @@ -441,14 +441,9 @@ pub fn get_reports(dir: &Path) -> Result> { /// /// # Return value /// -/// * A vector of paths to correctly parsed reports -/// * A vector of reports stacktraces -/// * A vector of reports crashlines +/// * A vector of correctly parsed report info: paths, stacktraces and crashlines /// * A vector of bad reports -pub fn reports_from_paths( - casreps: Vec, - jobs: usize, -) -> (Vec, Vec, Vec, Vec) { +pub fn reports_from_paths(casreps: Vec, jobs: usize) -> (Vec, Vec) { // Get len let len = casreps.len(); // Start thread pool. 
@@ -457,7 +452,7 @@ pub fn reports_from_paths( .build() .unwrap(); // Report info from casreps: (casrep, (trace, crashline)) - let mut casrep_info: RwLock> = RwLock::new(Vec::new()); + let mut casrep_info: RwLock> = RwLock::new(Vec::new()); // Casreps with stacktraces, that we cannot parse let mut badreports: RwLock> = RwLock::new(Vec::new()); custom_pool.install(|| { @@ -487,11 +482,7 @@ pub fn reports_from_paths( .cmp(b.0.file_name().unwrap().to_str().unwrap()) }); - // Unzip casrep info - let (casreps, (stacktraces, crashlines)): (Vec<_>, (Vec<_>, Vec<_>)) = - casrep_info.iter().cloned().unzip(); - - (casreps, stacktraces, crashlines, badreports) + (casrep_info.to_vec(), badreports) } /// Get `Cluster` structure from specified directory path. @@ -513,7 +504,9 @@ pub fn cluster_from_dir(dir: &Path, jobs: usize) -> Result { .unwrap(); // Get casreps from cluster let casreps = get_reports(dir)?; - let (_, stacktraces, crashlines, _) = reports_from_paths(casreps, jobs); + let (casreps, _) = reports_from_paths(casreps, jobs); + let (_, (stacktraces, crashlines)): (Vec<_>, (Vec<_>, Vec<_>)) = + casreps.iter().cloned().unzip(); // Create cluster // NOTE: We don't care about paths of casreps from existing clusters Ok(Cluster::new(i, Vec::new(), stacktraces, crashlines)) diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index 793e9b63..946e8159 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -27,6 +27,9 @@ pub type DebugInfo = gdb_command::stacktrace::DebugInfo; /// Represents the information about one line of the stack trace. pub type StacktraceEntry = gdb_command::stacktrace::StacktraceEntry; +/// Represents the information about CASR report +pub type ReportInfo = (PathBuf, (Stacktrace, String)); + lazy_static::lazy_static! { /// Regular expressions for functions to be ignored. 
pub static ref STACK_FRAME_FUNCTION_IGNORE_REGEXES: RwLock> = RwLock::new( @@ -215,7 +218,6 @@ impl Cluster { stacktraces1.append(&mut stacktraces2); diam(&stacktraces1) < THRESHOLD } - // TODO: change type /// Convert cluster to iterator pub fn reports(&self) -> Vec<(PathBuf, Stacktrace, String)> { let mut reports: Vec<(PathBuf, Stacktrace, String)> = Vec::new(); @@ -258,7 +260,7 @@ impl Cluster { /// * Number of valid casreps before crashiline deduplication /// * Number of valid casreps after crashiline deduplication pub fn gen_clusters( - reports: &[(&PathBuf, (Stacktrace, String))], + reports: &[ReportInfo], offset: usize, dedup: bool, ) -> Result<(HashMap, usize, usize)> { From 36ad312ae16f75deea3ab711a237cc3ff7cb001f Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Sat, 23 Dec 2023 15:09:12 +0300 Subject: [PATCH 21/34] Add bad rep handler --- casr/src/bin/casr-cluster.rs | 23 ++++++++--------------- casr/src/util.rs | 22 +++++++++++++++++++++- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index db60dcd5..cb2c395f 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -60,18 +60,9 @@ fn make_clusters( // Get casreps with stacktraces and crashlines let (casreps, badreports) = util::reports_from_paths(casreps, jobs); + // Handle bad reports if !badreports.is_empty() { - fs::create_dir_all(format!("{}/clerr", &outpath.display()))?; - for report in badreports { - fs::copy( - &report, - format!( - "{}/clerr/{}", - &outpath.display(), - &report.file_name().unwrap().to_str().unwrap() - ), - )?; - } + util::save_badreports(badreports, format!("{}/clerr", &outpath.display()))?; } if casreps.len() < 2 { @@ -417,10 +408,12 @@ fn update_clusters( let (moved, removed) = merge_clusters(&mut clusters, &mut deviant_clusters, oldpath, dedup)?; // Adjust stat - added += moved; - deduplicated += removed; - before = 0; // Impossible to know (proofed by @hkctkuy) - after -= moved + removed; + if moved != 0 || removed != 0 { + added += moved; + deduplicated += removed; + before = 0; // Impossible to know (proofed by @hkctkuy) + after -= moved + removed; + } } // Save deviant clusters util::save_clusters(&deviant_clusters, oldpath)?; diff --git a/casr/src/util.rs b/casr/src/util.rs index 809505a5..cc2a84f6 100644 --- a/casr/src/util.rs +++ b/casr/src/util.rs @@ -512,7 +512,7 @@ pub fn cluster_from_dir(dir: &Path, jobs: usize) -> Result { Ok(Cluster::new(i, Vec::new(), stacktraces, crashlines)) } -/// Save clusters to directory +/// Save clusters to given directory /// /// # Arguments /// @@ -536,3 +536,23 @@ pub fn save_clusters(clusters: &HashMap, dir: &Path) -> Result<( } Ok(()) } + +/// Save invalid CASR reports to given directory +/// +/// # Arguments +/// +/// * `badreports` - A vector of invalid CASR reports +/// +/// * `dir` - out directory +pub fn save_badreports(badreports: Vec, dir: String) -> Result<()> { + if !Path::new(&dir).exists() { + fs::create_dir_all(&dir)?; + } + for report in badreports { + fs::copy( + &report, + format!("{}/{}", dir, &report.file_name().unwrap().to_str().unwrap()), + )?; + } + Ok(()) +} From 83d1c6e914f38ccadc0338e018d328348d284c71 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Wed, 27 Dec 2023 17:38:21 +0300 Subject: [PATCH 22/34] Fix insertion to cluster --- libcasr/src/stacktrace.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index 946e8159..b83fdbac 100644 --- a/libcasr/src/stacktrace.rs +++ 
b/libcasr/src/stacktrace.rs @@ -100,7 +100,7 @@ impl Cluster { crashlines: Vec, ) -> Self { let mut unique_crashlines: HashMap = HashMap::new(); - for (i, crashline) in crashlines.iter().enumerate().take(crashlines.len()) { + for (i, crashline) in crashlines.iter().enumerate() { unique_crashlines.insert(crashline.clone(), i); } Cluster { @@ -147,7 +147,7 @@ impl Cluster { self.stacktraces.push(stacktrace); self.diam = None; self.crashlines - .insert(crashline.to_string(), self.paths.len()); + .insert(crashline.to_string(), self.paths.len() - 1); true } /// Get cluster diameter From 818de14b254001511be9697897a8990bb0280a35 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Wed, 27 Dec 2023 18:26:05 +0300 Subject: [PATCH 23/34] Make merging determistic --- casr/src/bin/casr-cluster.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index cb2c395f..f9560681 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -405,8 +405,7 @@ fn update_clusters( let (mut deviant_clusters, mut before, mut after) = gen_clusters(&deviants, max, dedup)?; if let ToleranceLevel::Soft = tolerance_level { // Merge old and new clusters - let (moved, removed) = - merge_clusters(&mut clusters, &mut deviant_clusters, oldpath, dedup)?; + let (moved, removed) = merge_clusters(clusters, &mut deviant_clusters, oldpath, dedup)?; // Adjust stat if moved != 0 || removed != 0 { added += moved; @@ -441,16 +440,20 @@ fn update_clusters( /// Number of moved to old clusters CASR reports /// Number of removed by crashline deduplication CASR reports pub fn merge_clusters( - olds: &mut HashMap, + olds: HashMap, news: &mut HashMap, dir: &Path, dedup: bool, ) -> Result<(usize, usize)> { let mut moved = 0usize; let mut removed = 0usize; - for old in olds.values_mut() { + let mut olds: Vec = olds.into_values().collect(); + olds.sort_by(|a, b| a.number.cmp(&b.number)); + for mut old in olds { let mut merged = Vec::new(); - for new in news.values() { + let mut values: Vec<&Cluster> = news.values().collect(); + values.sort_by(|a, b| a.number.cmp(&b.number)); + for new in values { if !old.may_merge(new) { continue; } From 3e7857dd03c602a68578b817f9c7dc1cb2b1cd85 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Wed, 10 Jan 2024 13:30:33 +0300 Subject: [PATCH 24/34] Fixes --- casr/src/bin/casr-cluster.rs | 8 ++++---- casr/src/util.rs | 8 ++++---- docs/usage.md | 6 ++++++ libcasr/src/stacktrace.rs | 21 ++++++++------------- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index f9560681..97ad3e89 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -62,7 +62,7 @@ fn make_clusters( // Handle bad reports if !badreports.is_empty() { - util::save_badreports(badreports, format!("{}/clerr", &outpath.display()))?; + util::save_reports(badreports, format!("{}/clerr", &outpath.display()))?; } if casreps.len() < 2 { @@ -268,7 +268,7 @@ fn merge_dirs(input: &Path, output: &Path) -> Result { /// /// * `newpath` - path to directory with new CASR reports /// -/// * `oldpath` - target directory for exiting clusters +/// * `oldpath` - target directory for existing clusters /// /// * `jobs` - number of jobs for cluster updating process /// @@ -439,7 +439,7 @@ fn update_clusters( /// /// Number of moved to old clusters CASR reports /// Number of removed by crashline deduplication CASR reports -pub fn merge_clusters( +fn merge_clusters( olds: HashMap, news: 
&mut HashMap, dir: &Path, @@ -458,7 +458,7 @@ pub fn merge_clusters( continue; } // Copy casreps from new to old - for (casrep, stacktrace, crashline) in new.reports() { + for (casrep, (stacktrace, crashline)) in new.reports() { // Update cluster (and dedup crashline) if !old.insert( casrep.to_path_buf(), diff --git a/casr/src/util.rs b/casr/src/util.rs index cc2a84f6..eb2c0af8 100644 --- a/casr/src/util.rs +++ b/casr/src/util.rs @@ -537,18 +537,18 @@ pub fn save_clusters(clusters: &HashMap, dir: &Path) -> Result<( Ok(()) } -/// Save invalid CASR reports to given directory +/// Save CASR reports to given directory /// /// # Arguments /// -/// * `badreports` - A vector of invalid CASR reports +/// * `reports` - A vector of CASR reports /// /// * `dir` - out directory -pub fn save_badreports(badreports: Vec, dir: String) -> Result<()> { +pub fn save_reports(reports: Vec, dir: String) -> Result<()> { if !Path::new(&dir).exists() { fs::create_dir_all(&dir)?; } - for report in badreports { + for report in reports { fs::copy( &report, format!("{}/{}", dir, &report.file_name().unwrap().to_str().unwrap()), diff --git a/docs/usage.md b/docs/usage.md index 93fad32d..aa169040 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -339,6 +339,12 @@ If there are several `Inner` or `Outer` clusters for the report we choose the N.B. `Delta` strategy is a nonsensical strategy in `Inner` case +Example: + + $ casr-cluster -c casr/tests/casr_tests/casrep/test_clustering_small out + $ rm -f out/cl9/40.casrep out/cl7/20.casrep && rm -rf out/cl8 && mv out/cl9 out/cl8 + $ casr-cluster -u casr/tests/casr_tests/casrep/test_clustering_small out + ## casr-cli App provides text-based user interface to view CASR reports, prints joint statistics for diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index b83fdbac..119434ca 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -100,8 +100,8 @@ impl Cluster { crashlines: Vec, ) -> Self { let mut unique_crashlines: HashMap = HashMap::new(); - for (i, crashline) in crashlines.iter().enumerate() { - unique_crashlines.insert(crashline.clone(), i); + for (i, crashline) in crashlines.into_iter().enumerate() { + unique_crashlines.insert(crashline, i); } Cluster { number, @@ -146,8 +146,7 @@ impl Cluster { self.paths.push(path); self.stacktraces.push(stacktrace); self.diam = None; - self.crashlines - .insert(crashline.to_string(), self.paths.len() - 1); + self.crashlines.insert(crashline, self.paths.len() - 1); true } /// Get cluster diameter @@ -218,11 +217,11 @@ impl Cluster { stacktraces1.append(&mut stacktraces2); diam(&stacktraces1) < THRESHOLD } - /// Convert cluster to iterator - pub fn reports(&self) -> Vec<(PathBuf, Stacktrace, String)> { - let mut reports: Vec<(PathBuf, Stacktrace, String)> = Vec::new(); + /// Convert cluster to vector of reports + pub fn reports(&self) -> Vec { + let mut reports: Vec = Vec::new(); let mut crashlines = self.crashlines.clone(); - for i in 0..self.paths.len() { + for (i, path) in self.paths.iter().enumerate() { // Get crashline for cur casrep let mut crashline = String::new(); for (line, &number) in &crashlines { @@ -234,11 +233,7 @@ impl Cluster { // Drop cur crashline from crashlines crashlines.remove(&crashline); // Update results - reports.push(( - self.paths[i].clone(), - self.stacktraces[i].clone(), - crashline, - )); + reports.push((path.clone(), (self.stacktraces[i].clone(), crashline))); } reports } From 8110daf3e3de0b9d89684b0011314fd22bb2c99a Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Wed, 10 
Jan 2024 15:40:19 +0300 Subject: [PATCH 25/34] Fixes --- casr/src/bin/casr-cluster.rs | 18 +++++------------- libcasr/src/stacktrace.rs | 27 ++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 97ad3e89..672846d1 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -503,7 +503,7 @@ fn merge_clusters( /// # Return value /// /// Silhouette coefficient -fn avg_sil(dir: &Path, jobs: usize) -> Result { +fn calc_avg_sil(dir: &Path, jobs: usize) -> Result { // Get cluster dirs let mut dirs: Vec = fs::read_dir(dir) .unwrap() @@ -538,16 +538,8 @@ fn avg_sil(dir: &Path, jobs: usize) -> Result { if size == 0 { bail!("{} valid reports, nothing to calculate...", size); } - // Init sil sum - let mut sum = 0f64; - // Calculate silhouette coefficient for each casrep - for i in 0..clusters.len() { - for num in 0..clusters[i].len() { - let sil = sil_coef(num, i, &clusters); - sum += sil; - } - } - Ok(sum / size as f64) + let avg_sil = avg_sil_ceof(&clusters, size); + Ok(avg_sil) } fn main() -> Result<()> { @@ -781,11 +773,11 @@ fn main() -> Result<()> { if before != after { println!("Number of reports after crashline deduplication in new clusters: {after}"); } - let sil = avg_sil(paths[1], jobs)?; + let sil = calc_avg_sil(paths[1], jobs)?; println!("Cluster silhouette score: {sil}"); } else if matches.contains_id("estimate") { let path: &PathBuf = matches.get_one::("estimate").unwrap(); - let sil = avg_sil(path, jobs)?; + let sil = calc_avg_sil(path, jobs)?; println!("Cluster silhouette score: {sil}"); } diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index 119434ca..9cfd866d 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -642,7 +642,7 @@ fn sil_subcoef_b(num: usize, i: usize, clusters: &[Vec]) -> f64 { /// # Return value /// /// Silhouette coefficient -pub fn sil_coef(num: usize, i: usize, clusters: &[Vec]) -> f64 { +fn sil_coef(num: usize, i: usize, clusters: &[Vec]) -> f64 { if clusters[i].len() != 1 { let a = sil_subcoef_a(num, &clusters[i]); let b = sil_subcoef_b(num, i, clusters); @@ -652,6 +652,31 @@ pub fn sil_coef(num: usize, i: usize, clusters: &[Vec]) -> f64 { } } +/// Get average silhouette coefficient calculating for given stacktraces +/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition +/// +/// # Arguments +/// +/// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures +/// +/// * `size` - total amount of elements in clusters +/// +/// # Return value +/// +/// Average silhouette coefficient +pub fn avg_sil_ceof(clusters: &[Vec], size: usize) -> f64 { + // Init sil sum + let mut sum = 0f64; + // Calculate silhouette coefficient for each casrep + for i in 0..clusters.len() { + for num in 0..clusters[i].len() { + let sil = sil_coef(num, i, clusters); + sum += sil; + } + } + sum / size as f64 +} + /// Stack trace filtering trait. pub trait Filter { /// Filter frames from the stack trace that are not related to analyzed code containing crash. 
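For reference, the silhouette score wired up above follows the textbook definition: for each report, `a` is its mean distance to the rest of its own cluster, `b` is the smallest mean distance to any other cluster, the per-report score is `(b - a) / max(a, b)` (0 for singleton clusters), and the average over all reports is what `calc_avg_sil` prints. The following self-contained sketch shows that computation only; the `avg_sil` name, the `u32` items, the toy `dist` closure (standing in for `1.0 - similarity(..)` over stacktraces) and the example data are illustrative, not part of the patch.

fn avg_sil(clusters: &[Vec<u32>], dist: impl Fn(&u32, &u32) -> f64) -> f64 {
    let size: usize = clusters.iter().map(|c| c.len()).sum();
    let mut sum = 0.0;
    for (i, cluster) in clusters.iter().enumerate() {
        if cluster.len() == 1 {
            continue; // a singleton cluster contributes 0 to the sum by definition
        }
        for (num, item) in cluster.iter().enumerate() {
            // a: mean distance to the other members of the same cluster
            let a = cluster
                .iter()
                .enumerate()
                .filter(|(j, _)| *j != num)
                .map(|(_, other)| dist(item, other))
                .sum::<f64>()
                / (cluster.len() - 1) as f64;
            // b: smallest mean distance to any other cluster
            let b = clusters
                .iter()
                .enumerate()
                .filter(|(j, _)| *j != i)
                .map(|(_, other)| {
                    other.iter().map(|o| dist(item, o)).sum::<f64>() / other.len() as f64
                })
                .fold(f64::MAX, f64::min);
            sum += (b - a) / a.max(b);
        }
    }
    sum / size as f64
}

fn main() {
    // Toy data: two clusters of "reports", distance is a scaled absolute difference.
    let clusters = vec![vec![1u32, 2, 2], vec![10, 11]];
    let dist = |a: &u32, b: &u32| (*a as f64 - *b as f64).abs() / 10.0;
    println!("average silhouette: {:.3}", avg_sil(&clusters, dist));
}

Scores close to 1 mean reports sit firmly inside their clusters; values near 0 or below suggest clusters overlap.
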
From a1f0ae443e74fa1d4779837c5315a19a86ecdeb4 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Wed, 10 Jan 2024 15:55:06 +0300 Subject: [PATCH 26/34] update clippy --- libcasr/src/stacktrace.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index 9cfd866d..fe8087aa 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -365,7 +365,7 @@ impl CrashLineExt for Stacktrace { let mut trace = self.clone(); trace.filter(); - let Some(crash_entry) = trace.get(0) else { + let Some(crash_entry) = trace.first() else { return Err(Error::Casr( "No stack trace entries after filtering".to_string(), )); From d3cec56e2c49da6d1ce01cf60b9dc49329b995d6 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Wed, 10 Jan 2024 16:42:07 +0300 Subject: [PATCH 27/34] Change default options --- casr/src/bin/casr-cluster.rs | 6 +++--- casr/tests/tests.rs | 15 --------------- docs/usage.md | 6 +++--- 3 files changed, 6 insertions(+), 21 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 672846d1..b93c9c8f 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -625,7 +625,7 @@ fn main() -> Result<()> { .value_name("STRATEGY") .action(ArgAction::Set) .value_parser(["Diam", "Dist"]) - .default_value("Dist") + .default_value("Diam") .help("Strategy for inner cluster choosing when updating"), ) .arg( @@ -634,7 +634,7 @@ fn main() -> Result<()> { .value_name("STRATEGY") .action(ArgAction::Set) .value_parser(["Delta", "Diam", "Dist"]) - .default_value("Dist") + .default_value("Diam") .help("Strategy for outer cluster choosing when updating"), ) .arg( @@ -643,7 +643,7 @@ fn main() -> Result<()> { .value_name("LEVEL") .action(ArgAction::Set) .value_parser(["Loyal", "Soft", "Hard"]) - .default_value("Loyal") + .default_value("Soft") .help("Cluster tolerance level to new CASR reports") ) .arg( diff --git a/casr/tests/tests.rs b/casr/tests/tests.rs index f222fe8f..24946363 100644 --- a/casr/tests/tests.rs +++ b/casr/tests/tests.rs @@ -2750,21 +2750,6 @@ fn test_casr_cluster_u() { assert_eq!(clusters_cnt, 1, "Clusters count mismatch."); - let re = Regex::new( - r"Number of reports before crashline deduplication in new clusters: (?P\d+)", - ) - .unwrap(); - let before_cnt = re - .captures(&res) - .unwrap() - .name("before") - .map(|x| x.as_str()) - .unwrap() - .parse::() - .unwrap(); - - assert_eq!(before_cnt, 2, "Before count mismatch."); - let re = Regex::new( r"Number of reports after crashline deduplication in new clusters: (?P\d+)", ) diff --git a/docs/usage.md b/docs/usage.md index aa169040..2fb51515 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -236,13 +236,13 @@ Tool for clustering CASR reports -u, --update Update clusters from OLD_DIR using CASR reports from NEW_DIR --inner-strategy - Strategy for inner cluster choosing when updating [default: Dist] [possible + Strategy for inner cluster choosing when updating [default: Diam] [possible values: Diam, Dist] --outer-strategy - Strategy for outer cluster choosing when updating [default: Dist] [possible + Strategy for outer cluster choosing when updating [default: Diam] [possible values: Delta, Diam, Dist] --tolerance-level - Cluster tolerance level to new CASR reports [default: Loyal] [possible values: + Cluster tolerance level to new CASR reports [default: Soft] [possible values: Loyal, Soft, Hard] -e, --estimate Calculate silhouette score for clustering results From 023759c084890187e15baba0bd763d3891d7d17c Mon Sep 17 00:00:00 2001 
From: hkctkuy Date: Fri, 26 Jan 2024 12:10:48 +0300 Subject: [PATCH 28/34] Remove sub options --- casr/src/bin/casr-cluster.rs | 98 +++++------------------------------- libcasr/src/stacktrace.rs | 60 +++------------------- 2 files changed, 19 insertions(+), 139 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index b93c9c8f..f6b2d1d3 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -274,12 +274,6 @@ fn merge_dirs(input: &Path, output: &Path) -> Result { /// /// * `dedup` - deduplicate casrep by crashline for each cluster, if true /// -/// * `inner_strategy` - strategy for "inner" report case -/// -/// * `outer_strategy` - strategy for "outer" report case -/// -/// * `tolerance_level` - cluster tolerance level to "outer" reports -/// /// # Return value /// /// * Number of casreps added to old clusters @@ -293,9 +287,6 @@ fn update_clusters( oldpath: &Path, jobs: usize, dedup: bool, - inner_strategy: AccumStrategy, - outer_strategy: AccumStrategy, - tolerance_level: ToleranceLevel, ) -> Result<(usize, usize, usize, usize, usize, usize)> { // Get new casreps let casreps = util::get_reports(newpath)?; @@ -338,12 +329,10 @@ fn update_clusters( for (casrep, (stacktrace, crashline)) in casreps { // list of "inner" clusters for casrep let mut inners: Vec<(usize, f64)> = Vec::new(); - // list of "outer" clusters for casrep - let mut outers: Vec<(usize, f64)> = Vec::new(); // Checker if casrep is duplicate of someone else let mut dup = false; for cluster in clusters.values_mut() { - let relation = cluster.relation(&stacktrace, inner_strategy, outer_strategy); + let relation = cluster.relation(&stacktrace); match relation { Relation::Dup => { dup = true; @@ -353,12 +342,7 @@ fn update_clusters( Relation::Inner(measure) => { inners.push((cluster.number, measure)); } - Relation::Outer(measure) => { - if let ToleranceLevel::Loyal = tolerance_level { - outers.push((cluster.number, measure)); - } - } - Relation::Oot => { + Relation::Outer => { continue; } } @@ -368,10 +352,8 @@ fn update_clusters( continue; } else if !inners.is_empty() { inners.iter().min_by(|a, b| a.1.total_cmp(&b.1)).unwrap().0 - } else if !outers.is_empty() { - outers.iter().min_by(|a, b| a.1.total_cmp(&b.1)).unwrap().0 } else { - // Out of threshold + // Outer deviants.push((casrep, (stacktrace.to_vec(), crashline.to_string()))); continue; }; @@ -403,16 +385,14 @@ fn update_clusters( let (result, before, after) = if !deviants.is_empty() { // Get clusters from deviants let (mut deviant_clusters, mut before, mut after) = gen_clusters(&deviants, max, dedup)?; - if let ToleranceLevel::Soft = tolerance_level { - // Merge old and new clusters - let (moved, removed) = merge_clusters(clusters, &mut deviant_clusters, oldpath, dedup)?; - // Adjust stat - if moved != 0 || removed != 0 { - added += moved; - deduplicated += removed; - before = 0; // Impossible to know (proofed by @hkctkuy) - after -= moved + removed; - } + // Merge old and new clusters + let (moved, removed) = merge_clusters(clusters, &mut deviant_clusters, oldpath, dedup)?; + // Adjust stat + if moved != 0 || removed != 0 { + added += moved; + deduplicated += removed; + before = 0; // Impossible to know (proofed by @hkctkuy) + after -= moved + removed; } // Save deviant clusters util::save_clusters(&deviant_clusters, oldpath)?; @@ -619,33 +599,6 @@ fn main() -> Result<()> { "Update clusters from OLD_DIR using CASR reports from NEW_DIR", ), ) - .arg( - Arg::new("inner-strategy") - .long("inner-strategy") - 
.value_name("STRATEGY") - .action(ArgAction::Set) - .value_parser(["Diam", "Dist"]) - .default_value("Diam") - .help("Strategy for inner cluster choosing when updating"), - ) - .arg( - Arg::new("outer-strategy") - .long("outer-strategy") - .value_name("STRATEGY") - .action(ArgAction::Set) - .value_parser(["Delta", "Diam", "Dist"]) - .default_value("Diam") - .help("Strategy for outer cluster choosing when updating"), - ) - .arg( - Arg::new("tolerance-level") - .long("tolerance-level") - .value_name("LEVEL") - .action(ArgAction::Set) - .value_parser(["Loyal", "Soft", "Hard"]) - .default_value("Soft") - .help("Cluster tolerance level to new CASR reports") - ) .arg( Arg::new("estimate") .short('e') @@ -731,33 +684,8 @@ fn main() -> Result<()> { } else if matches.contains_id("update") { let paths: Vec<&PathBuf> = matches.get_many::("update").unwrap().collect(); - let inner_strategy = matches.get_one::("inner-strategy").unwrap(); - let inner_strategy = match inner_strategy.as_str() { - "Diam" => AccumStrategy::Diam, - _ => AccumStrategy::Dist, - }; - let outer_strategy = matches.get_one::("outer-strategy").unwrap(); - let outer_strategy = match outer_strategy.as_str() { - "Delta" => AccumStrategy::Delta, - "Diam" => AccumStrategy::Diam, - _ => AccumStrategy::Dist, - }; - let tolerance_level = matches.get_one::("tolerance-level").unwrap(); - let tolerance_level = match tolerance_level.as_str() { - "Loyal" => ToleranceLevel::Loyal, - "Soft" => ToleranceLevel::Soft, - _ => ToleranceLevel::Hard, - }; - - let (added, duplicates, deduplicated, result, before, after) = update_clusters( - paths[0], - paths[1], - jobs, - dedup_crashlines, - inner_strategy, - outer_strategy, - tolerance_level, - )?; + let (added, duplicates, deduplicated, result, before, after) = + update_clusters(paths[0], paths[1], jobs, dedup_crashlines)?; println!("Number of casreps added to old clusters: {added}"); println!("Number of duplicates: {duplicates}"); if deduplicated != 0 { diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index fe8087aa..6a9c0fb6 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -48,33 +48,8 @@ pub enum Relation { Dup, /// The CASR report is "inside" the cluster with some proximity measure Inner(f64), - /// The CASR report is "outside" the cluster with some proximity measure - Outer(f64), - /// The CASR report is out of threshold - Oot, -} - -/// Cluster accumulation strategy -#[derive(Clone, Copy, Debug)] -pub enum AccumStrategy { - /// Argmin (diam (cluster + {new}) - diam (cluster)) - Delta, - /// Argmin diam (cluster + {new}) - Diam, - /// Argmin dist (cluster, {new}) - Dist, -} - -/// Cluster tolerance level to new CASR reports -#[derive(Clone, Copy, Debug)] -pub enum ToleranceLevel { - /// May insert any "Inner" and "Outer" CASR reports - Loyal, - /// May insert only "Inner" CASR reports - Hard, - /// May insert any "Inner" CASR reports - /// But "Outers" may be added only as subclusters after their clustering - Soft, + /// The CASR report is "outside" the cluster + Outer, } /// Structure provides an abstraction for cluster with CASR reports @@ -162,31 +137,18 @@ impl Cluster { /// /// * `new` - new report stacktrace /// - /// * `inner_strategy` - cluster accumulation strategy if `new` is "inner" - /// - /// * `inner_strategy` - cluster accumulation strategy if `new` is "outer" - /// /// # Return value /// /// `Relation` enum with proximity measure according specified strategy - pub fn relation( - &mut self, - new: &Stacktrace, - inner_strategy: 
AccumStrategy, - outer_strategy: AccumStrategy, - ) -> Relation { + pub fn relation(&mut self, new: &Stacktrace) -> Relation { let diam = self.diam(); - let mut min = MAX; let mut max = 0f64; for stacktrace in self.stacktraces() { let dist = 1.0 - similarity(new, stacktrace); if dist == 0.0 { return Relation::Dup; } else if dist > THRESHOLD { - return Relation::Oot; - } - if dist < min { - min = dist; + return Relation::Outer; } if dist > max { max = dist; @@ -194,20 +156,10 @@ impl Cluster { } if diam >= max { // Inner - let rel = match inner_strategy { - // Delta is a nonsensical strategy in this case - AccumStrategy::Diam => diam, - _ => min, - }; - Relation::Inner(rel) + Relation::Inner(diam) } else { // Outer - let rel = match outer_strategy { - AccumStrategy::Diam => max, - AccumStrategy::Delta => max - diam, - AccumStrategy::Dist => min, - }; - Relation::Outer(rel) + Relation::Outer } } /// Check if cluster may be merged with another one From 37637b2f1862493128852b3fe25e130427a6ef01 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Fri, 26 Jan 2024 12:33:07 +0300 Subject: [PATCH 29/34] Create cluster module --- casr/src/bin/casr-cluster.rs | 7 +- casr/src/util.rs | 3 +- docs/usage.md | 30 ---- libcasr/src/cluster.rs | 340 +++++++++++++++++++++++++++++++++++ libcasr/src/lib.rs | 1 + libcasr/src/stacktrace.rs | 338 +--------------------------------- 6 files changed, 348 insertions(+), 371 deletions(-) create mode 100644 libcasr/src/cluster.rs diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index f6b2d1d3..9ec7a1e6 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -1,5 +1,5 @@ use casr::util; -use libcasr::{init_ignored_frames, stacktrace::*}; +use libcasr::{cluster::*, init_ignored_frames, stacktrace::*}; use anyhow::{bail, Context, Result}; use clap::{builder::FalseyValueParser, Arg, ArgAction}; @@ -70,7 +70,7 @@ fn make_clusters( } // Get clusters - let (clusters, before, after) = gen_clusters(&casreps, 0, dedup)?; + let (clusters, before, after) = Cluster::gen_clusters(&casreps, 0, dedup)?; // Save clusters util::save_clusters(&clusters, outpath)?; @@ -384,7 +384,8 @@ fn update_clusters( // Handle deviant casreps let (result, before, after) = if !deviants.is_empty() { // Get clusters from deviants - let (mut deviant_clusters, mut before, mut after) = gen_clusters(&deviants, max, dedup)?; + let (mut deviant_clusters, mut before, mut after) = + Cluster::gen_clusters(&deviants, max, dedup)?; // Merge old and new clusters let (moved, removed) = merge_clusters(clusters, &mut deviant_clusters, oldpath, dedup)?; // Adjust stat diff --git a/casr/src/util.rs b/casr/src/util.rs index eb2c0af8..e782aa2b 100644 --- a/casr/src/util.rs +++ b/casr/src/util.rs @@ -1,9 +1,10 @@ //! Common utility functions. extern crate libcasr; +use libcasr::cluster::{Cluster, ReportInfo}; use libcasr::report::CrashReport; use libcasr::stacktrace::{ - Cluster, ReportInfo, STACK_FRAME_FILEPATH_IGNORE_REGEXES, STACK_FRAME_FUNCTION_IGNORE_REGEXES, + STACK_FRAME_FILEPATH_IGNORE_REGEXES, STACK_FRAME_FUNCTION_IGNORE_REGEXES, }; use anyhow::{bail, Context, Result}; diff --git a/docs/usage.md b/docs/usage.md index 2fb51515..ff0067ab 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -235,15 +235,6 @@ Tool for clustering CASR reports added to OUTPUT_DIR. 
-u, --update Update clusters from OLD_DIR using CASR reports from NEW_DIR - --inner-strategy - Strategy for inner cluster choosing when updating [default: Diam] [possible - values: Diam, Dist] - --outer-strategy - Strategy for outer cluster choosing when updating [default: Diam] [possible - values: Delta, Diam, Dist] - --tolerance-level - Cluster tolerance level to new CASR reports [default: Soft] [possible values: - Loyal, Soft, Hard] -e, --estimate Calculate silhouette score for clustering results --ignore @@ -318,27 +309,6 @@ For `CASR_CLUSTER_UNIQUE_CRASHLINE` a `false` literal is `n`, `no`, `f`, `false`, `off` or `0`. An absent environment variable will also be considered as `false`. Anything else will considered as true. -For updating clusters we use the following strategy: -Every CASR report may be in one of several states: - * `Duplicate` - the report is a duplicate of one from cluster, - * `Inner` - the report is "inside" a some cluster with some proximity measure, - * `Outer` - the report is "outside" a some cluster with some proximity measure, - * `Out` of threshold - the report is out of threshold for any cluster. - -If report is `Duplicate` we do nothing. -If report is `Oot` we perform clustering for all such reports. -If report is `Inner` or `Outer` for a single cluster we update the cluster. -If there are several `Inner` or `Outer` clusters for the report we choose the -"closest" according according to one of the following strategies: - * `Delta` - we choose cluster with minimal diameter change, - i.e. `Argmin (diam (cluster + {new}) - diam (cluster))` - * `Diam` - we choose cluster with minimal diameter, - i.e. `Argmin diam (cluster + {new})` - * `Dist` - we choose cluster with minimal distance between cluster and report, - i.e. `Argmin dist (cluster, {new})` - -N.B. `Delta` strategy is a nonsensical strategy in `Inner` case - Example: $ casr-cluster -c casr/tests/casr_tests/casrep/test_clustering_small out diff --git a/libcasr/src/cluster.rs b/libcasr/src/cluster.rs new file mode 100644 index 00000000..ef3faf9e --- /dev/null +++ b/libcasr/src/cluster.rs @@ -0,0 +1,340 @@ +//! Provides API's for cluster manipulating. 
+use crate::error::*; +use crate::stacktrace::*; + +use core::f64::MAX; +use std::collections::HashMap; +use std::path::PathBuf; + +/// Represents the information about CASR report +pub type ReportInfo = (PathBuf, (Stacktrace, String)); + +/// Relation between a CASR report and a cluster +pub enum Relation { + /// The CASR report is a duplicate of one from cluster + Dup, + /// The CASR report is "inside" the cluster with some proximity measure + Inner(f64), + /// The CASR report is "outside" the cluster + Outer, +} + +/// Structure provides an abstraction for cluster with CASR reports +pub struct Cluster { + /// Cluster number + pub number: usize, + /// Cluster report paths + paths: Vec, + /// Cluster report stacktraces + stacktraces: Vec, + /// Cluster diameter + diam: Option, + /// Cluster report crashlines + crashlines: HashMap, +} + +impl Cluster { + /// Create new `Cluster` + pub fn new( + number: usize, + paths: Vec, + stacktraces: Vec, + crashlines: Vec, + ) -> Self { + let mut unique_crashlines: HashMap = HashMap::new(); + for (i, crashline) in crashlines.into_iter().enumerate() { + unique_crashlines.insert(crashline, i); + } + Cluster { + number, + paths, + stacktraces, + diam: None, + crashlines: unique_crashlines, + } + } + /// Get CASR report paths + pub fn paths(&self) -> &Vec { + &self.paths + } + /// Get CASR report stactraces + pub fn stacktraces(&self) -> &Vec { + &self.stacktraces + } + /// Generate clusters from CASR report info + /// + /// # Arguments + /// + /// * `reports` - slice of report info: path, stacktrace, crashline + /// + /// * `offset` - cluster enumerate offset + /// + /// * `dedup` - deduplicate crashline, if true + /// + /// # Return value + /// + /// * `HashMap` of `Cluster` + /// * Number of valid casreps before crashiline deduplication + /// * Number of valid casreps after crashiline deduplication + pub fn gen_clusters( + reports: &[ReportInfo], + offset: usize, + dedup: bool, + ) -> Result<(HashMap, usize, usize)> { + // Unzip casrep info + let (casreps, (stacktraces, crashlines)): (Vec<_>, (Vec<_>, Vec<_>)) = + reports.iter().cloned().unzip(); + let len = casreps.len(); + // Get stacktraces cluster numbers + let mut numbers = cluster_stacktraces(&stacktraces)?; + // Deduplicate by crashiline + let after = if dedup { + dedup_crashlines(&crashlines, &mut numbers) + } else { + len + }; + // Create clusters + let mut clusters: HashMap = HashMap::new(); + for i in 0..len { + if numbers[i] == 0 { + // Skip casreps with duplicate crashlines + continue; + } + let number = numbers[i] + offset; + // Add new cluster if not exists + clusters + .entry(number) + .or_insert_with(|| Cluster::new(number, Vec::new(), Vec::new(), Vec::new())); + // Update cluster + clusters.get_mut(&number).unwrap().insert( + casreps[i].to_path_buf(), + stacktraces[i].to_vec(), + crashlines[i].to_string(), + dedup, + ); + } + Ok((clusters, len, after)) + } + /// Add new CASR report to cluster + /// + /// # Arguments + /// + /// * `stacktrace` - new CASR report stacktrace + /// + /// * `crashline` - new CASR report crashline + /// + /// * `dedup` - deduplicate crashline, if true + /// + /// # Return value + /// + /// `true` if new CASR report may be added, + /// `false` if report is duplicate of someone else + pub fn insert( + &mut self, + path: PathBuf, + stacktrace: Stacktrace, + crashline: String, + dedup: bool, + ) -> bool { + if dedup && !crashline.is_empty() && self.crashlines.contains_key(&crashline) { + return false; + } + self.paths.push(path); + 
self.stacktraces.push(stacktrace); + self.diam = None; + self.crashlines.insert(crashline, self.paths.len() - 1); + true + } + /// Get cluster diameter + pub fn diam(&mut self) -> f64 { + if self.diam.is_none() { + self.diam = Some(diam(&self.stacktraces)); + } + self.diam.unwrap() + } + /// Get "relation" between new report and specified cluster + /// + /// # Arguments + /// + /// * `new` - new report stacktrace + /// + /// # Return value + /// + /// `Relation` enum with proximity measure according specified strategy + pub fn relation(&mut self, new: &Stacktrace) -> Relation { + let diam = self.diam(); + let mut max = 0f64; + for stacktrace in self.stacktraces() { + let dist = 1.0 - similarity(new, stacktrace); + if dist == 0.0 { + return Relation::Dup; + } else if dist > THRESHOLD { + return Relation::Outer; + } + if dist > max { + max = dist; + } + } + if diam >= max { + // Inner + Relation::Inner(diam) + } else { + // Outer + Relation::Outer + } + } + /// Check if cluster may be merged with another one + pub fn may_merge(&self, cluster: &Cluster) -> bool { + let mut stacktraces1 = self.stacktraces.clone(); + let mut stacktraces2 = cluster.stacktraces().clone(); + stacktraces1.append(&mut stacktraces2); + diam(&stacktraces1) < THRESHOLD + } + /// Convert cluster to vector of reports + pub fn reports(&self) -> Vec { + let mut reports: Vec = Vec::new(); + let mut crashlines = self.crashlines.clone(); + for (i, path) in self.paths.iter().enumerate() { + // Get crashline for cur casrep + let mut crashline = String::new(); + for (line, &number) in &crashlines { + if number == i { + crashline = line.to_string(); + break; + } + } + // Drop cur crashline from crashlines + crashlines.remove(&crashline); + // Update results + reports.push((path.clone(), (self.stacktraces[i].clone(), crashline))); + } + reports + } +} + +/// Get diameter of specified cluster +/// +/// # Arguments +/// +/// * `stacktraces` - cluster represented as slice of `Stacktrace` structures +/// +/// # Return value +/// +/// Value of diameter +fn diam(stacktraces: &[Stacktrace]) -> f64 { + let mut diam = 0f64; + let len = stacktraces.len(); + for i in 0..len { + for j in i + 1..len { + let dist = 1.0 - similarity(&stacktraces[i], &stacktraces[j]); + if dist > diam { + diam = dist; + } + } + } + diam +} + +/// Get "a" subcoefficient silhouette coefficient calculating for given stacktrace +/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition +/// +/// # Arguments +/// +/// * `num` - given stacktrace number +/// +/// * `stacktraces` - cluster represented as slice of `Stacktrace` structures +/// +/// # Return value +/// +/// "a" subcoefficient silhouette coefficient +fn sil_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 { + let mut sum = 0f64; + for (i, stacktrace) in stacktraces.iter().enumerate() { + if i == num { + continue; + } + sum += 1.0 - similarity(&stacktraces[num], stacktrace); + } + sum / (stacktraces.len() - 1) as f64 +} + +/// Get "b" subcoefficient silhouette coefficient calculating for given stacktrace +/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition +/// +/// # Arguments +/// +/// * `num` - given stacktrace number +/// +/// * `i` - cluster number of given stacktrace +/// +/// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures +/// +/// # Return value +/// +/// "b" subcoefficient silhouette coefficient +fn sil_subcoef_b(num: usize, i: usize, clusters: &[Vec]) -> f64 { + let mut min = MAX; + for (j, cluster) in 
clusters.iter().enumerate() { + if j == i { + continue; + } + let mut sum = 0f64; + for stacktrace in cluster { + sum += 1.0 - similarity(&clusters[i][num], stacktrace); + } + let res = sum / cluster.len() as f64; + if res < min { + min = res; + } + } + min +} + +/// Get silhouette coefficient calculating for given stacktrace +/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition +/// +/// # Arguments +/// +/// * `num` - given stacktrace number +/// +/// * `i` - cluster number of given stacktrace +/// +/// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures +/// +/// # Return value +/// +/// Silhouette coefficient +fn sil_coef(num: usize, i: usize, clusters: &[Vec]) -> f64 { + if clusters[i].len() != 1 { + let a = sil_subcoef_a(num, &clusters[i]); + let b = sil_subcoef_b(num, i, clusters); + (b - a) / a.max(b) + } else { + 0f64 + } +} + +/// Get average silhouette coefficient calculating for given stacktraces +/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition +/// +/// # Arguments +/// +/// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures +/// +/// * `size` - total amount of elements in clusters +/// +/// # Return value +/// +/// Average silhouette coefficient +pub fn avg_sil_ceof(clusters: &[Vec], size: usize) -> f64 { + // Init sil sum + let mut sum = 0f64; + // Calculate silhouette coefficient for each casrep + for i in 0..clusters.len() { + for num in 0..clusters[i].len() { + let sil = sil_coef(num, i, clusters); + sum += sil; + } + } + sum / size as f64 +} diff --git a/libcasr/src/lib.rs b/libcasr/src/lib.rs index 2284fb57..557d7713 100644 --- a/libcasr/src/lib.rs +++ b/libcasr/src/lib.rs @@ -22,6 +22,7 @@ //! collected from gdb. To save crash reports as json (.casrep/.sarif) use `serde` feature. pub mod asan; +pub mod cluster; pub mod constants; pub mod cpp; pub mod error; diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index 6a9c0fb6..52736a2c 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -11,12 +11,10 @@ use crate::constants::{ STACK_FRAME_FUNCTION_IGNORE_REGEXES_PYTHON, STACK_FRAME_FUNCTION_IGNORE_REGEXES_RUST, }; use crate::error::*; -use core::f64::MAX; use kodama::{linkage, Method}; use regex::Regex; use std::collections::{HashMap, HashSet}; use std::fmt::{self, Write}; -use std::path::PathBuf; use std::sync::RwLock; // Re-export types from gdb_command for convenient use from Casr library @@ -27,9 +25,6 @@ pub type DebugInfo = gdb_command::stacktrace::DebugInfo; /// Represents the information about one line of the stack trace. pub type StacktraceEntry = gdb_command::stacktrace::StacktraceEntry; -/// Represents the information about CASR report -pub type ReportInfo = (PathBuf, (Stacktrace, String)); - lazy_static::lazy_static! { /// Regular expressions for functions to be ignored. pub static ref STACK_FRAME_FUNCTION_IGNORE_REGEXES: RwLock> = RwLock::new( @@ -40,211 +35,7 @@ lazy_static::lazy_static! 
{ } /// Threshold for clusters diameter -const THRESHOLD: f64 = 0.3; - -/// Relation between a CASR report and a cluster -pub enum Relation { - /// The CASR report is a duplicate of one from cluster - Dup, - /// The CASR report is "inside" the cluster with some proximity measure - Inner(f64), - /// The CASR report is "outside" the cluster - Outer, -} - -/// Structure provides an abstraction for cluster with CASR reports -pub struct Cluster { - /// Cluster number - pub number: usize, - /// Cluster report paths - paths: Vec, - /// Cluster report stacktraces - stacktraces: Vec, - /// Cluster diameter - diam: Option, - /// Cluster report crashlines - crashlines: HashMap, -} - -impl Cluster { - /// Create new `Cluster` - pub fn new( - number: usize, - paths: Vec, - stacktraces: Vec, - crashlines: Vec, - ) -> Self { - let mut unique_crashlines: HashMap = HashMap::new(); - for (i, crashline) in crashlines.into_iter().enumerate() { - unique_crashlines.insert(crashline, i); - } - Cluster { - number, - paths, - stacktraces, - diam: None, - crashlines: unique_crashlines, - } - } - /// Get CASR report paths - pub fn paths(&self) -> &Vec { - &self.paths - } - /// Get CASR report stactraces - pub fn stacktraces(&self) -> &Vec { - &self.stacktraces - } - /// Add new CASR report to cluster - /// - /// # Arguments - /// - /// * `stacktrace` - new CASR report stacktrace - /// - /// * `crashline` - new CASR report crashline - /// - /// * `dedup` - deduplicate crashline, if true - /// - /// # Return value - /// - /// `true` if new CASR report may be added, - /// `false` if report is duplicate of someone else - pub fn insert( - &mut self, - path: PathBuf, - stacktrace: Stacktrace, - crashline: String, - dedup: bool, - ) -> bool { - if dedup && !crashline.is_empty() && self.crashlines.contains_key(&crashline) { - return false; - } - self.paths.push(path); - self.stacktraces.push(stacktrace); - self.diam = None; - self.crashlines.insert(crashline, self.paths.len() - 1); - true - } - /// Get cluster diameter - pub fn diam(&mut self) -> f64 { - if self.diam.is_none() { - self.diam = Some(diam(&self.stacktraces)); - } - self.diam.unwrap() - } - /// Get "relation" between new report and specified cluster - /// - /// # Arguments - /// - /// * `new` - new report stacktrace - /// - /// # Return value - /// - /// `Relation` enum with proximity measure according specified strategy - pub fn relation(&mut self, new: &Stacktrace) -> Relation { - let diam = self.diam(); - let mut max = 0f64; - for stacktrace in self.stacktraces() { - let dist = 1.0 - similarity(new, stacktrace); - if dist == 0.0 { - return Relation::Dup; - } else if dist > THRESHOLD { - return Relation::Outer; - } - if dist > max { - max = dist; - } - } - if diam >= max { - // Inner - Relation::Inner(diam) - } else { - // Outer - Relation::Outer - } - } - /// Check if cluster may be merged with another one - pub fn may_merge(&self, cluster: &Cluster) -> bool { - let mut stacktraces1 = self.stacktraces.clone(); - let mut stacktraces2 = cluster.stacktraces().clone(); - stacktraces1.append(&mut stacktraces2); - diam(&stacktraces1) < THRESHOLD - } - /// Convert cluster to vector of reports - pub fn reports(&self) -> Vec { - let mut reports: Vec = Vec::new(); - let mut crashlines = self.crashlines.clone(); - for (i, path) in self.paths.iter().enumerate() { - // Get crashline for cur casrep - let mut crashline = String::new(); - for (line, &number) in &crashlines { - if number == i { - crashline = line.to_string(); - break; - } - } - // Drop cur crashline from 
crashlines - crashlines.remove(&crashline); - // Update results - reports.push((path.clone(), (self.stacktraces[i].clone(), crashline))); - } - reports - } -} - -/// Generate clusters from CASR report info -/// -/// # Arguments -/// -/// * `reports` - slice of report info: path, stacktrace, crashline -/// -/// * `offset` - cluster enumerate offset -/// -/// * `dedup` - deduplicate crashline, if true -/// -/// # Return value -/// -/// * `HashMap` of `Cluster` -/// * Number of valid casreps before crashiline deduplication -/// * Number of valid casreps after crashiline deduplication -pub fn gen_clusters( - reports: &[ReportInfo], - offset: usize, - dedup: bool, -) -> Result<(HashMap, usize, usize)> { - // Unzip casrep info - let (casreps, (stacktraces, crashlines)): (Vec<_>, (Vec<_>, Vec<_>)) = - reports.iter().cloned().unzip(); - let len = casreps.len(); - // Get stacktraces cluster numbers - let mut numbers = cluster_stacktraces(&stacktraces)?; - // Deduplicate by crashiline - let after = if dedup { - dedup_crashlines(&crashlines, &mut numbers) - } else { - len - }; - // Create clusters - let mut clusters: HashMap = HashMap::new(); - for i in 0..len { - if numbers[i] == 0 { - // Skip casreps with duplicate crashlines - continue; - } - let number = numbers[i] + offset; - // Add new cluster if not exists - clusters - .entry(number) - .or_insert_with(|| Cluster::new(number, Vec::new(), Vec::new(), Vec::new())); - // Update cluster - clusters.get_mut(&number).unwrap().insert( - casreps[i].to_path_buf(), - stacktraces[i].to_vec(), - crashlines[i].to_string(), - dedup, - ); - } - Ok((clusters, len, after)) -} +pub const THRESHOLD: f64 = 0.3; /// This macro updates variables used to remove trusted functions from stack trace #[macro_export] @@ -502,133 +293,6 @@ pub fn dedup_crashlines(crashlines: &[String], clusters: &mut [usize]) -> usize unique_cnt } -/// Get diameter of specified cluster -/// -/// # Arguments -/// -/// * `stacktraces` - cluster represented as slice of `Stacktrace` structures -/// -/// # Return value -/// -/// Value of diameter -fn diam(stacktraces: &[Stacktrace]) -> f64 { - let mut diam = 0f64; - let len = stacktraces.len(); - for i in 0..len { - for j in i + 1..len { - let dist = 1.0 - similarity(&stacktraces[i], &stacktraces[j]); - if dist > diam { - diam = dist; - } - } - } - diam -} - -/// Get "a" subcoefficient silhouette coefficient calculating for given stacktrace -/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition -/// -/// # Arguments -/// -/// * `num` - given stacktrace number -/// -/// * `stacktraces` - cluster represented as slice of `Stacktrace` structures -/// -/// # Return value -/// -/// "a" subcoefficient silhouette coefficient -fn sil_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 { - let mut sum = 0f64; - for (i, stacktrace) in stacktraces.iter().enumerate() { - if i == num { - continue; - } - sum += 1.0 - similarity(&stacktraces[num], stacktrace); - } - sum / (stacktraces.len() - 1) as f64 -} - -/// Get "b" subcoefficient silhouette coefficient calculating for given stacktrace -/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition -/// -/// # Arguments -/// -/// * `num` - given stacktrace number -/// -/// * `i` - cluster number of given stacktrace -/// -/// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures -/// -/// # Return value -/// -/// "b" subcoefficient silhouette coefficient -fn sil_subcoef_b(num: usize, i: usize, clusters: &[Vec]) -> f64 { - let mut 
min = MAX; - for (j, cluster) in clusters.iter().enumerate() { - if j == i { - continue; - } - let mut sum = 0f64; - for stacktrace in cluster { - sum += 1.0 - similarity(&clusters[i][num], stacktrace); - } - let res = sum / cluster.len() as f64; - if res < min { - min = res; - } - } - min -} - -/// Get silhouette coefficient calculating for given stacktrace -/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition -/// -/// # Arguments -/// -/// * `num` - given stacktrace number -/// -/// * `i` - cluster number of given stacktrace -/// -/// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures -/// -/// # Return value -/// -/// Silhouette coefficient -fn sil_coef(num: usize, i: usize, clusters: &[Vec]) -> f64 { - if clusters[i].len() != 1 { - let a = sil_subcoef_a(num, &clusters[i]); - let b = sil_subcoef_b(num, i, clusters); - (b - a) / a.max(b) - } else { - 0f64 - } -} - -/// Get average silhouette coefficient calculating for given stacktraces -/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition -/// -/// # Arguments -/// -/// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures -/// -/// * `size` - total amount of elements in clusters -/// -/// # Return value -/// -/// Average silhouette coefficient -pub fn avg_sil_ceof(clusters: &[Vec], size: usize) -> f64 { - // Init sil sum - let mut sum = 0f64; - // Calculate silhouette coefficient for each casrep - for i in 0..clusters.len() { - for num in 0..clusters[i].len() { - let sil = sil_coef(num, i, clusters); - sum += sil; - } - } - sum / size as f64 -} - /// Stack trace filtering trait. pub trait Filter { /// Filter frames from the stack trace that are not related to analyzed code containing crash. 
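With the cluster module in place, updating reduces to one routing decision per incoming report: a `Dup` is dropped, an `Inner` report joins the cluster with the smallest proximity measure, and everything else is set aside as a deviant to be clustered separately (via `Cluster::gen_clusters` above) and merged back afterwards. The sketch below condenses that decision only; the `route` helper, the frame-hash `Stacktrace` stand-in, and the toy `relation` heuristic are illustrative and replace the real libcasr types and the THRESHOLD/diameter check.

use std::collections::HashMap;

// Relation between a new report and an existing cluster (mirrors the enum above).
enum Relation {
    Dup,
    Inner(f64),
    Outer,
}

// Stand-in for a CASR stacktrace: a list of frame hashes.
type Stacktrace = Vec<u64>;

struct Cluster {
    number: usize,
    stacktraces: Vec<Stacktrace>,
}

impl Cluster {
    // Toy relation: Dup on exact match, Inner if the top frame matches, Outer otherwise.
    // (The real check compares 1.0 - similarity() against THRESHOLD and the cluster diameter.)
    fn relation(&self, new: &Stacktrace) -> Relation {
        if self.stacktraces.iter().any(|s| s == new) {
            Relation::Dup
        } else if self.stacktraces.iter().any(|s| s.first() == new.first()) {
            Relation::Inner(self.stacktraces.len() as f64)
        } else {
            Relation::Outer
        }
    }
    fn insert(&mut self, new: Stacktrace) {
        self.stacktraces.push(new);
    }
}

// Route one new report: Some(cluster number) if it was placed or is a duplicate,
// None if it is a deviant that must be clustered separately and merged later.
fn route(clusters: &mut HashMap<usize, Cluster>, new: Stacktrace) -> Option<usize> {
    let mut inners: Vec<(usize, f64)> = Vec::new();
    for cluster in clusters.values() {
        match cluster.relation(&new) {
            Relation::Dup => return Some(cluster.number), // duplicate: nothing to copy
            Relation::Inner(measure) => inners.push((cluster.number, measure)),
            Relation::Outer => continue,
        }
    }
    // Pick the "closest" inner cluster, like the argmin in update_clusters
    let number = inners.iter().min_by(|a, b| a.1.total_cmp(&b.1))?.0;
    clusters.get_mut(&number).unwrap().insert(new);
    Some(number)
}

fn main() {
    let mut clusters = HashMap::from([
        (1, Cluster { number: 1, stacktraces: vec![vec![0xA, 0xB]] }),
        (2, Cluster { number: 2, stacktraces: vec![vec![0xC, 0xD]] }),
    ]);
    println!("{:?}", route(&mut clusters, vec![0xA, 0xE])); // Some(1): joins cluster 1
    println!("{:?}", route(&mut clusters, vec![0xF, 0xF])); // None: deviant
}

Keeping the per-report routing separate from the clustering of deviants is what lets the update path reuse the same clustering entry point for the leftovers instead of re-clustering everything.
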
From 7537004d54a50b284bc5558cb3c774aa2af8c81b Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Fri, 26 Jan 2024 20:23:31 +0300 Subject: [PATCH 30/34] Fixes --- casr/src/bin/casr-cluster.rs | 12 ++++++------ casr/src/util.rs | 8 ++++---- docs/usage.md | 15 +++++++++------ libcasr/src/cluster.rs | 2 +- 4 files changed, 20 insertions(+), 17 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index c90dd0ac..fcfbb66f 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -58,11 +58,11 @@ fn make_clusters( } // Get casreps with stacktraces and crashlines - let (casreps, badreports) = util::reports_from_paths(casreps, jobs); + let (casreps, badreports) = util::reports_from_paths(&casreps, jobs); // Handle bad reports if !badreports.is_empty() { - util::save_reports(badreports, format!("{}/clerr", &outpath.display()))?; + util::save_reports(&badreports, format!("{}/clerr", &outpath.display()))?; } if casreps.len() < 2 { @@ -70,7 +70,7 @@ fn make_clusters( } // Get clusters - let (clusters, before, after) = Cluster::gen_clusters(&casreps, 0, dedup)?; + let (clusters, before, after) = Cluster::cluster_reports(&casreps, 0, dedup)?; // Save clusters util::save_clusters(&clusters, outpath)?; @@ -301,7 +301,7 @@ fn update_clusters( ) -> Result<(usize, usize, usize, usize, usize, usize)> { // Get new casreps let casreps = util::get_reports(newpath)?; - let (casreps, _) = util::reports_from_paths(casreps, jobs); + let (casreps, _) = util::reports_from_paths(&casreps, jobs); // Get casreps from existing clusters let mut cluster_dirs: Vec = fs::read_dir(oldpath) @@ -396,7 +396,7 @@ fn update_clusters( let (result, before, after) = if !deviants.is_empty() { // Get clusters from deviants let (mut deviant_clusters, mut before, mut after) = - Cluster::gen_clusters(&deviants, max, dedup)?; + Cluster::cluster_reports(&deviants, max, dedup)?; // Merge old and new clusters let (moved, removed) = merge_clusters(clusters, &mut deviant_clusters, oldpath, dedup)?; // Adjust stat @@ -520,7 +520,7 @@ fn calc_avg_sil(dir: &Path, jobs: usize) -> Result { // Get casreps from cluster let casreps = util::get_reports(dir)?; // Get stacktraces from cluster - let (casreps, _) = util::reports_from_paths(casreps, jobs); + let (casreps, _) = util::reports_from_paths(&casreps, jobs); let (_, (stacktraces, _)): (Vec<_>, (Vec<_>, Vec<_>)) = casreps.iter().cloned().unzip(); // Update size size += stacktraces.len(); diff --git a/casr/src/util.rs b/casr/src/util.rs index e782aa2b..932055a0 100644 --- a/casr/src/util.rs +++ b/casr/src/util.rs @@ -444,7 +444,7 @@ pub fn get_reports(dir: &Path) -> Result> { /// /// * A vector of correctly parsed report info: paths, stacktraces and crashlines /// * A vector of bad reports -pub fn reports_from_paths(casreps: Vec, jobs: usize) -> (Vec, Vec) { +pub fn reports_from_paths(casreps: &Vec, jobs: usize) -> (Vec, Vec) { // Get len let len = casreps.len(); // Start thread pool. 
@@ -505,7 +505,7 @@ pub fn cluster_from_dir(dir: &Path, jobs: usize) -> Result { .unwrap(); // Get casreps from cluster let casreps = get_reports(dir)?; - let (casreps, _) = reports_from_paths(casreps, jobs); + let (casreps, _) = reports_from_paths(&casreps, jobs); let (_, (stacktraces, crashlines)): (Vec<_>, (Vec<_>, Vec<_>)) = casreps.iter().cloned().unzip(); // Create cluster @@ -545,13 +545,13 @@ pub fn save_clusters(clusters: &HashMap, dir: &Path) -> Result<( /// * `reports` - A vector of CASR reports /// /// * `dir` - out directory -pub fn save_reports(reports: Vec, dir: String) -> Result<()> { +pub fn save_reports(reports: &Vec, dir: String) -> Result<()> { if !Path::new(&dir).exists() { fs::create_dir_all(&dir)?; } for report in reports { fs::copy( - &report, + report, format!("{}/{}", dir, &report.file_name().unwrap().to_str().unwrap()), )?; } diff --git a/docs/usage.md b/docs/usage.md index 74a06395..4a3cd863 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -326,6 +326,15 @@ After clustering result directory will have the following structure: Similar CASR reports are inside one cluster. +Report accumulation is based on stack trace comparison, recognition similar +stack traces and clustering with merging different ones. + +Example: + + $ casr-cluster -c casr/tests/casr_tests/casrep/test_clustering_small out + $ rm -f out/cl9/40.casrep out/cl7/20.casrep && rm -rf out/cl8 && mv out/cl9 out/cl8 + $ casr-cluster -u casr/tests/casr_tests/casrep/test_clustering_small out + For the **--ignore ** option, file format should be as follows: FUNCTIONS @@ -340,12 +349,6 @@ For `CASR_CLUSTER_UNIQUE_CRASHLINE` a `false` literal is `n`, `no`, `f`, `false`, `off` or `0`. An absent environment variable will also be considered as `false`. Anything else will considered as true. 
-Example:
-
-    $ casr-cluster -c casr/tests/casr_tests/casrep/test_clustering_small out
-    $ rm -f out/cl9/40.casrep out/cl7/20.casrep && rm -rf out/cl8 && mv out/cl9 out/cl8
-    $ casr-cluster -u casr/tests/casr_tests/casrep/test_clustering_small out
-
 ## casr-cli
 
 App provides text-based user interface to view CASR reports, prints joint statistics for
diff --git a/libcasr/src/cluster.rs b/libcasr/src/cluster.rs
index ef3faf9e..166c0285 100644
--- a/libcasr/src/cluster.rs
+++ b/libcasr/src/cluster.rs
@@ -76,7 +76,7 @@ impl Cluster {
     /// * `HashMap` of `Cluster`
     /// * Number of valid casreps before crashiline deduplication
     /// * Number of valid casreps after crashiline deduplication
-    pub fn gen_clusters(
+    pub fn cluster_reports(
         reports: &[ReportInfo],
         offset: usize,
         dedup: bool,

From 37b40a2d46debdd91dba995ed470b1554096a857 Mon Sep 17 00:00:00 2001
From: hkctkuy
Date: Mon, 29 Jan 2024 14:49:27 +0300
Subject: [PATCH 31/34] Add cluster paths hashmap

---
 casr/src/bin/casr-cluster.rs | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs
index fcfbb66f..50693b4a 100644
--- a/casr/src/bin/casr-cluster.rs
+++ b/casr/src/bin/casr-cluster.rs
@@ -304,7 +304,7 @@ fn update_clusters(
     let (casreps, _) = util::reports_from_paths(&casreps, jobs);
 
     // Get casreps from existing clusters
-    let mut cluster_dirs: Vec<PathBuf> = fs::read_dir(oldpath)
+    let mut dirs: Vec<PathBuf> = fs::read_dir(oldpath)
         .unwrap()
         .map(|path| path.unwrap().path())
         .filter(|path| {
@@ -312,18 +312,22 @@ fn update_clusters(
             name.starts_with("cl") && !name.starts_with("clerr")
         })
         .collect();
-    cluster_dirs.sort();
+    dirs.sort();
     // Max cluster number
     let mut max = 0usize;
     // Init clusters vector
    let mut clusters: HashMap<usize, Cluster> = HashMap::new();
+    // Init cluster paths vector
+    let mut paths: HashMap<usize, &PathBuf> = HashMap::new();
     // Get casreps from each existing cluster
-    for cluster_dir in &cluster_dirs {
+    for dir in &dirs {
         // Get cluster
-        let cluster = util::cluster_from_dir(cluster_dir, jobs)?;
+        let cluster = util::cluster_from_dir(dir, jobs)?;
         // Update max cluster number
         max = max.max(cluster.number);
+        // Add cluster path
+        paths.insert(cluster.number, dir);
         // Fill cluster info structures
         clusters.insert(cluster.number, cluster);
     }
@@ -386,7 +390,7 @@ fn update_clusters(
                 &casrep,
                 format!(
                     "{}/{}",
-                    &cluster_dirs[number - 1].display(),
+                    &paths.get(&number).unwrap().display(),
                     &casrep.file_name().unwrap().to_str().unwrap()
                 ),
             )?;

From dba74b644f16d96355967b22cfb4d849687a97d8 Mon Sep 17 00:00:00 2001
From: hkctkuy
Date: Thu, 1 Feb 2024 16:59:34 +0300
Subject: [PATCH 32/34] Fixes

---
 casr/src/bin/casr-cluster.rs | 12 ++++++------
 docs/usage.md                |  2 +-
 libcasr/src/cluster.rs       |  6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs
index 50693b4a..ce240212 100644
--- a/casr/src/bin/casr-cluster.rs
+++ b/casr/src/bin/casr-cluster.rs
@@ -41,8 +41,8 @@ fn stacktrace(path: &Path) -> Result<Stacktrace> {
 /// # Return value
 ///
 /// * Number of clusters
-/// * Number of valid casreps before crashiline deduplication
-/// * Number of valid casreps after crashiline deduplication
+/// * Number of valid casreps before crashline deduplication
+/// * Number of valid casreps after crashline deduplication
 fn make_clusters(
     inpath: &Path,
     outpath: Option<&Path>,
@@ -291,8 +291,8 @@ fn merge_or_diff(input: &Path, output: &Path, diff: Option<&Path>) -> Result Result {
     // Init clusters vector
     let mut clusters: Vec<Vec<Stacktrace>> = Vec::new();
-    // Init casreps nuber counter
+    // Init casreps number counter
     let mut size = 0usize;
     // Get casreps from each cluster
     for dir in &dirs {
@@ -612,7 +612,7 @@ fn main() -> Result<()> {
                 .value_parser(clap::value_parser!(PathBuf))
                 .value_names(["NEW_DIR", "OLD_DIR"])
                 .help(
-                    "Update clusters from OLD_DIR using CASR reports from NEW_DIR",
+                    "Update clusters in OLD_DIR using CASR reports from NEW_DIR",
                 ),
         )
         .arg(
diff --git a/docs/usage.md b/docs/usage.md
index 4a3cd863..ac816b0d 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -262,7 +262,7 @@ Tool for clustering CASR reports
           Merge INPUT_DIR into OUTPUT_DIR. Only new CASR reports from INPUT_DIR
           will be added to OUTPUT_DIR.
   -u, --update <NEW_DIR> <OLD_DIR>
-          Update clusters from OLD_DIR using CASR reports from NEW_DIR
+          Update clusters in OLD_DIR using CASR reports from NEW_DIR
   -e, --estimate
           Calculate silhouette score for clustering results
   --diff
diff --git a/libcasr/src/cluster.rs b/libcasr/src/cluster.rs
index 166c0285..8a857303 100644
--- a/libcasr/src/cluster.rs
+++ b/libcasr/src/cluster.rs
@@ -74,8 +74,8 @@ impl Cluster {
     /// # Return value
     ///
     /// * `HashMap` of `Cluster`
-    /// * Number of valid casreps before crashiline deduplication
-    /// * Number of valid casreps after crashiline deduplication
+    /// * Number of valid casreps before crashline deduplication
+    /// * Number of valid casreps after crashline deduplication
     pub fn cluster_reports(
         reports: &[ReportInfo],
         offset: usize,
@@ -87,7 +87,7 @@ impl Cluster {
         let len = casreps.len();
         // Get stacktraces cluster numbers
         let mut numbers = cluster_stacktraces(&stacktraces)?;
-        // Deduplicate by crashiline
+        // Deduplicate by crashline
         let after = if dedup {
             dedup_crashlines(&crashlines, &mut numbers)
         } else {

From 4457e95198d86a5eaee4a2c1854e61d93d5ebc6f Mon Sep 17 00:00:00 2001
From: hkctkuy
Date: Fri, 2 Feb 2024 13:24:29 +0300
Subject: [PATCH 33/34] Fixes

---
 casr/src/bin/casr-cluster.rs |  7 +++++--
 casr/src/util.rs             | 17 +++++++++--------
 libcasr/src/cluster.rs       | 11 +++++++----
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs
index ce240212..7b21f5d7 100644
--- a/casr/src/bin/casr-cluster.rs
+++ b/casr/src/bin/casr-cluster.rs
@@ -62,7 +62,10 @@ fn make_clusters(
 
     // Handle bad reports
     if !badreports.is_empty() {
-        util::save_reports(&badreports, format!("{}/clerr", &outpath.display()))?;
+        util::save_reports(
+            &badreports,
+            format!("{}/clerr", &outpath.display()).as_str(),
+        )?;
     }
 
     if casreps.len() < 2 {
@@ -323,7 +326,7 @@ fn update_clusters(
     // Get casreps from each existing cluster
     for dir in &dirs {
         // Get cluster
-        let cluster = util::cluster_from_dir(dir, jobs)?;
+        let cluster = util::load_cluster(dir, jobs)?;
         // Update max cluster number
         max = max.max(cluster.number);
         // Add cluster path
diff --git a/casr/src/util.rs b/casr/src/util.rs
index 932055a0..4ef59909 100644
--- a/casr/src/util.rs
+++ b/casr/src/util.rs
@@ -497,19 +497,20 @@ pub fn reports_from_paths(casreps: &Vec, jobs: usize) -> (Vec Result {
+/// NOTE: Resulting cluster does not contain path info
+pub fn load_cluster(dir: &Path, jobs: usize) -> Result<Cluster> {
     // Get cluster number
-    let i = dir.file_name().unwrap().to_str().unwrap()[2..]
-        .to_string()
-        .parse::<usize>()
-        .unwrap();
+    let i = dir.file_name().unwrap().to_str().unwrap();
+    if i.len() < 3 {
+        bail!("Invalid cluster path: {}", &dir.display());
+    }
+    let i = i[2..].to_string().parse::<usize>()?;
     // Get casreps from cluster
     let casreps = get_reports(dir)?;
     let (casreps, _) = reports_from_paths(&casreps, jobs);
     let (_, (stacktraces, crashlines)): (Vec<_>, (Vec<_>, Vec<_>)) =
         casreps.iter().cloned().unzip();
     // Create cluster
-    // NOTE: We don't care about paths of casreps from existing clusters
     Ok(Cluster::new(i, Vec::new(), stacktraces, crashlines))
 }
@@ -545,9 +546,9 @@ pub fn save_clusters(clusters: &HashMap<usize, Cluster>, dir: &Path) -> Result<(
 /// * `reports` - A vector of CASR reports
 ///
 /// * `dir` - out directory
-pub fn save_reports(reports: &Vec<PathBuf>, dir: String) -> Result<()> {
+pub fn save_reports(reports: &Vec<PathBuf>, dir: &str) -> Result<()> {
     if !Path::new(&dir).exists() {
-        fs::create_dir_all(&dir)?;
+        fs::create_dir_all(dir)?;
     }
     for report in reports {
         fs::copy(
diff --git a/libcasr/src/cluster.rs b/libcasr/src/cluster.rs
index 8a857303..6a365110 100644
--- a/libcasr/src/cluster.rs
+++ b/libcasr/src/cluster.rs
@@ -6,7 +6,7 @@ use core::f64::MAX;
 use std::collections::HashMap;
 use std::path::PathBuf;
 
-/// Represents the information about CASR report
+/// Represents the information about CASR report: path, stacktrace and crashline
 pub type ReportInfo = (PathBuf, (Stacktrace, String));
 
 /// Relation between a CASR report and a cluster
@@ -43,6 +43,9 @@ impl Cluster {
     ) -> Self {
         let mut unique_crashlines: HashMap<String, usize> = HashMap::new();
         for (i, crashline) in crashlines.into_iter().enumerate() {
+            if unique_crashlines.contains_key(&crashline) {
+                continue;
+            }
             unique_crashlines.insert(crashline, i);
         }
         Cluster {
@@ -61,11 +64,11 @@ impl Cluster {
     pub fn stacktraces(&self) -> &Vec<Stacktrace> {
         &self.stacktraces
     }
-    /// Generate clusters from CASR report info
+    /// Perform CASR reports clustering
     ///
     /// # Arguments
     ///
-    /// * `reports` - slice of report info: path, stacktrace, crashline
+    /// * `reports` - slice of `ReportInfo`
     ///
     /// * `offset` - cluster enumerate offset
     ///
@@ -73,7 +76,7 @@ impl Cluster {
     ///
     /// # Return value
     ///
-    /// * `HashMap` of `Cluster`
+    /// * `HashMap` of `Cluster` with cluster number as key
     /// * Number of valid casreps before crashline deduplication
     /// * Number of valid casreps after crashline deduplication
     pub fn cluster_reports(

From 227d4fd60aa045bd7d139bdda88c0ec683a0e77a Mon Sep 17 00:00:00 2001
From: hkctkuy
Date: Fri, 2 Feb 2024 14:19:28 +0300
Subject: [PATCH 34/34] Fixes

---
 casr/src/bin/casr-cluster.rs | 2 +-
 libcasr/src/cluster.rs       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs
index 7b21f5d7..6a94c4ab 100644
--- a/casr/src/bin/casr-cluster.rs
+++ b/casr/src/bin/casr-cluster.rs
@@ -537,7 +537,7 @@ fn calc_avg_sil(dir: &Path, jobs: usize) -> Result<f64> {
     if size == 0 {
         bail!("{} valid reports, nothing to calculate...", size);
     }
-    let avg_sil = avg_sil_ceof(&clusters, size);
+    let avg_sil = avg_sil_coef(&clusters, size);
     Ok(avg_sil)
 }
diff --git a/libcasr/src/cluster.rs b/libcasr/src/cluster.rs
index 6a365110..7d8e23f6 100644
--- a/libcasr/src/cluster.rs
+++ b/libcasr/src/cluster.rs
@@ -329,7 +329,7 @@ fn sil_coef(num: usize, i: usize, clusters: &[Vec<Stacktrace>]) -> f64 {
 /// # Return value
 ///
 /// Average silhouette coefficient
-pub fn avg_sil_ceof(clusters: &[Vec<Stacktrace>], size: usize) -> f64 {
+pub fn avg_sil_coef(clusters: &[Vec<Stacktrace>], size: usize) -> f64 {
     // Init sil sum
     let mut sum = 0f64;
     // Calculate silhouette coefficient for each casrep