From c2ce4c8a0648073b8d5dc4e17344764e41ea0177 Mon Sep 17 00:00:00 2001 From: Ilya Yegorov Date: Mon, 26 Feb 2024 14:41:00 +0300 Subject: [PATCH] Add hierarchical accumulation (#202) --- casr/src/bin/casr-cluster.rs | 157 ++++++++++++++++++++++------------- casr/tests/tests.rs | 10 +-- libcasr/src/cluster.rs | 42 +++++----- libcasr/src/stacktrace.rs | 49 +++++++---- 4 files changed, 153 insertions(+), 105 deletions(-) diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 6a94c4ab..83971af2 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -401,34 +401,27 @@ fn update_clusters( // Handle deviant casreps let (result, before, after) = if !deviants.is_empty() { - // Get clusters from deviants - let (mut deviant_clusters, mut before, mut after) = - Cluster::cluster_reports(&deviants, max, dedup)?; - // Merge old and new clusters - let (moved, removed) = merge_clusters(clusters, &mut deviant_clusters, oldpath, dedup)?; + let (moved, removed, result, before, after) = + hierarchical_accumulation(clusters, deviants, max, oldpath, dedup)?; // Adjust stat - if moved != 0 || removed != 0 { - added += moved; - deduplicated += removed; - before = 0; // Impossible to know (proofed by @hkctkuy) - after -= moved + removed; - } - // Save deviant clusters - util::save_clusters(&deviant_clusters, oldpath)?; - (deviant_clusters.len(), before, after) + added += moved; + deduplicated += removed; + (result, before, after) } else { (0, 0, 0) }; Ok((added, duplicates, deduplicated, result, before, after)) } -/// Try to merge new clusters to old clusters +/// Perform CASR report accumulation to old clusters using hierarchical clustering /// /// # Arguments /// /// * `olds` - list of old clusters represented as `HashMap` of `Cluster` /// -/// * `news` - list of new clusters represented as `HashMap` of `Cluster` +/// * `deviants` - list of deviant reports represented as `Vec` of `ReportInfo` +/// +/// * `max` - old clusters max number /// /// * `dir` - out directory /// @@ -436,59 +429,103 @@ fn update_clusters( /// /// # Return value /// -/// Number of moved to old clusters CASR reports -/// Number of removed by crashline deduplication CASR reports -fn merge_clusters( - olds: HashMap, - news: &mut HashMap, +/// * Number of moved to old clusters CASR reports +/// * Number of removed from old clusters by crashline deduplication CASR reports +/// * Number of new clusters +/// * Number of valid casreps before crashline deduplication in new clusters +/// * Number of valid casreps after crashline deduplication in new clusters +fn hierarchical_accumulation( + mut olds: HashMap, + deviants: Vec, + max: usize, dir: &Path, dedup: bool, -) -> Result<(usize, usize)> { +) -> Result<(usize, usize, usize, usize, usize)> { let mut moved = 0usize; let mut removed = 0usize; - let mut olds: Vec = olds.into_values().collect(); - olds.sort_by(|a, b| a.number.cmp(&b.number)); - for mut old in olds { - let mut merged = Vec::new(); - let mut values: Vec<&Cluster> = news.values().collect(); - values.sort_by(|a, b| a.number.cmp(&b.number)); - for new in values { - if !old.may_merge(new) { + let mut before = 0usize; + let mut deduplicated = 0usize; + // Forming condensed dissimilarity matrix + let mut matrix = vec![]; + let keys: Vec<_> = olds.keys().collect(); + let clusters: Vec<_> = olds.values().collect(); + for i in 0..clusters.len() { + // Write cluster-cluster dist + for j in i + 1..clusters.len() { + matrix.push(Cluster::dist(clusters[i], clusters[j])); + } + // Write cluster-report dist + for deviant in &deviants { + matrix.push(Cluster::dist_rep(clusters[i], deviant)); + } + } + // Write report-report dist + for i in 0..deviants.len() { + let (_, (stacktrace1, _)) = &deviants[i]; + for deviant2 in deviants.iter().skip(i + 1) { + let (_, (stacktrace2, _)) = &deviant2; + matrix.push(1.0 - similarity(stacktrace1, stacktrace2)); + } + } + + // Clustering + let res = cluster(matrix, clusters.len() + deviants.len())?; + + // Sync real cluster numbers with resulting numbers + let mut numbers: HashMap = HashMap::new(); + for i in 0..clusters.len() { + numbers.insert(res[i], *keys[i]); + } + // New clusters + let mut news: HashMap = HashMap::new(); + let mut new_num = max; + for &num in res.iter().skip(clusters.len()) { + if numbers.contains_key(&num) { + continue; + } + new_num += 1; + numbers.insert(num, new_num); + // Create new cluster + let new = Cluster::new(new_num, vec![], vec![], vec![]); + news.insert(new_num, new); + } + + // Save reports + for i in 0..deviants.len() { + // Get cluster number + let number = *numbers.get(&res[i + olds.len()]).unwrap(); + // NOTE: We need not to track stacktraces + let (casrep, (_, crashline)) = &deviants[i]; + if number > max { + // New cluster + before += 1; + let cluster = news.get_mut(&number).unwrap(); + if !cluster.insert(casrep.to_path_buf(), vec![], crashline.to_string(), dedup) { + deduplicated += 1; continue; } - // Copy casreps from new to old - for (casrep, (stacktrace, crashline)) in new.reports() { - // Update cluster (and dedup crashline) - if !old.insert( - casrep.to_path_buf(), - stacktrace.to_vec(), - crashline.to_string(), - dedup, - ) { - removed += 1; - continue; - } - // Save report - moved += 1; - fs::copy( - &casrep, - format!( - "{}/cl{}/{}", - &dir.display(), - old.number, - &casrep.file_name().unwrap().to_str().unwrap() - ), - )?; + } else { + // Old cluster + let cluster = olds.get_mut(&number).unwrap(); + if !cluster.insert(casrep.to_path_buf(), vec![], crashline.to_string(), dedup) { + removed += 1; + continue; } - // Mark merged cluster for drop - merged.push(new.number); - } - // Drop marked cluster - for number in merged { - news.remove(&number); + // Save report + moved += 1; + fs::copy( + casrep, + format!( + "{}/cl{}/{}", + &dir.display(), + cluster.number, + &casrep.file_name().unwrap().to_str().unwrap() + ), + )?; } } - Ok((moved, removed)) + util::save_clusters(&news, dir)?; + Ok((moved, removed, news.len(), before, before - deduplicated)) } /// Calculate silhouette coefficient diff --git a/casr/tests/tests.rs b/casr/tests/tests.rs index 02ddde68..a426f461 100644 --- a/casr/tests/tests.rs +++ b/casr/tests/tests.rs @@ -2675,11 +2675,11 @@ fn test_casr_cluster_d_and_m() { fn test_casr_cluster_u() { let paths = [ abs_path("tests/casr_tests/casrep/test_clustering_small"), - abs_path("tests/tmp_tests_casr/clustering_out"), - abs_path("tests/tmp_tests_casr/clustering_out/cl7/20.casrep"), - abs_path("tests/tmp_tests_casr/clustering_out/cl8"), - abs_path("tests/tmp_tests_casr/clustering_out/cl9"), - abs_path("tests/tmp_tests_casr/clustering_out/cl9/40.casrep"), + abs_path("tests/tmp_tests_casr/updating_out"), + abs_path("tests/tmp_tests_casr/updating_out/cl7/20.casrep"), + abs_path("tests/tmp_tests_casr/updating_out/cl8"), + abs_path("tests/tmp_tests_casr/updating_out/cl9"), + abs_path("tests/tmp_tests_casr/updating_out/cl9/40.casrep"), ]; let _ = fs::remove_dir_all(&paths[1]); diff --git a/libcasr/src/cluster.rs b/libcasr/src/cluster.rs index 7d8e23f6..d0e45663 100644 --- a/libcasr/src/cluster.rs +++ b/libcasr/src/cluster.rs @@ -186,32 +186,28 @@ impl Cluster { Relation::Outer } } - /// Check if cluster may be merged with another one - pub fn may_merge(&self, cluster: &Cluster) -> bool { - let mut stacktraces1 = self.stacktraces.clone(); - let mut stacktraces2 = cluster.stacktraces().clone(); + /// Get complete distance between clusters + /// NOTE: Result also can be interpreted as diameter of cluster merge result + pub fn dist(cluster1: &Cluster, cluster2: &Cluster) -> f64 { + let mut stacktraces1 = cluster1.stacktraces().clone(); + let mut stacktraces2 = cluster2.stacktraces().clone(); stacktraces1.append(&mut stacktraces2); - diam(&stacktraces1) < THRESHOLD + diam(&stacktraces1) } - /// Convert cluster to vector of reports - pub fn reports(&self) -> Vec { - let mut reports: Vec = Vec::new(); - let mut crashlines = self.crashlines.clone(); - for (i, path) in self.paths.iter().enumerate() { - // Get crashline for cur casrep - let mut crashline = String::new(); - for (line, &number) in &crashlines { - if number == i { - crashline = line.to_string(); - break; - } - } - // Drop cur crashline from crashlines - crashlines.remove(&crashline); - // Update results - reports.push((path.clone(), (self.stacktraces[i].clone(), crashline))); + /// Get complete distance between cluster and report + /// NOTE: Result also can be interpreted as diameter of cluster merge result + pub fn dist_rep(cluster: &Cluster, report: &ReportInfo) -> f64 { + let (_, (trace, _)) = report; + if let Some(max) = cluster + .stacktraces() + .iter() + .map(|s| 1.0 - similarity(s, trace)) + .max_by(|a, b| a.total_cmp(b)) + { + max + } else { + 0f64 } - reports } } diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index b60aae05..f68c2959 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -189,30 +189,21 @@ pub fn dedup_stacktraces(stacktraces: &[Stacktrace]) -> Vec { .collect() } -/// Perform the clustering of stack traces +/// Perform the clustering by condensed dissimilarity matrix /// /// # Arguments /// -/// * `stacktraces` - slice of `Stacktrace` structures +/// * `matrix` - condensed dissimilarity matrix +/// +/// * `len` - number of observations that are being clustered /// /// # Return value /// -/// A vector of the same length as `stacktraces`. -/// Vec\[i\] is the flat cluster number to which original stack trace i belongs. -pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result> { - // Writing distance matrix - // Only the values in the upper triangle are explicitly represented, - // not including the diagonal - let len = stacktraces.len(); - let mut condensed_dissimilarity_matrix = vec![]; - for i in 0..len { - for j in i + 1..len { - condensed_dissimilarity_matrix.push(1.0 - similarity(&stacktraces[i], &stacktraces[j])); - } - } - +/// A vector of the `len` length. +/// Vec\[i\] is the flat cluster number to which original object i belongs. +pub fn cluster(mut matrix: Vec, len: usize) -> Result> { // Get hierarchical clustering binary tree - let dendrogram = linkage(&mut condensed_dissimilarity_matrix, len, Method::Complete); + let dendrogram = linkage(&mut matrix, len, Method::Complete); // Iterate through merging step until threshold is reached // at the beginning every node is in its own cluster @@ -255,6 +246,30 @@ pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result> { Ok(flat_clusters) } +/// Perform the clustering of stack traces +/// +/// # Arguments +/// +/// * `stacktraces` - slice of `Stacktrace` structures +/// +/// # Return value +/// +/// A vector of the same length as `stacktraces`. +/// Vec\[i\] is the flat cluster number to which original stack trace i belongs. +pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result> { + // Writing distance matrix + // Only the values in the upper triangle are explicitly represented, + // not including the diagonal + let len = stacktraces.len(); + let mut condensed_dissimilarity_matrix = vec![]; + for i in 0..len { + for j in i + 1..len { + condensed_dissimilarity_matrix.push(1.0 - similarity(&stacktraces[i], &stacktraces[j])); + } + } + cluster(condensed_dissimilarity_matrix, len) +} + /// Perform crashline deduplication for each cluster: /// Reset Vec\[i\] to 0 if report crashline is duplicate of some other. ///