Skip to content

Commit

Permalink
Add hierarchical accumulation (#202)
Browse files Browse the repository at this point in the history
  • Loading branch information
hkctkuy authored Feb 26, 2024
1 parent f8ff34b commit c2ce4c8
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 105 deletions.
157 changes: 97 additions & 60 deletions casr/src/bin/casr-cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -401,94 +401,131 @@ fn update_clusters(

// Handle deviant casreps
let (result, before, after) = if !deviants.is_empty() {
// Get clusters from deviants
let (mut deviant_clusters, mut before, mut after) =
Cluster::cluster_reports(&deviants, max, dedup)?;
// Merge old and new clusters
let (moved, removed) = merge_clusters(clusters, &mut deviant_clusters, oldpath, dedup)?;
let (moved, removed, result, before, after) =
hierarchical_accumulation(clusters, deviants, max, oldpath, dedup)?;
// Adjust stat
if moved != 0 || removed != 0 {
added += moved;
deduplicated += removed;
before = 0; // Impossible to know (proofed by @hkctkuy)
after -= moved + removed;
}
// Save deviant clusters
util::save_clusters(&deviant_clusters, oldpath)?;
(deviant_clusters.len(), before, after)
added += moved;
deduplicated += removed;
(result, before, after)
} else {
(0, 0, 0)
};
Ok((added, duplicates, deduplicated, result, before, after))
}

/// Try to merge new clusters to old clusters
/// Perform CASR report accumulation to old clusters using hierarchical clustering
///
/// # Arguments
///
/// * `olds` - list of old clusters represented as `HashMap` of `Cluster`
///
/// * `news` - list of new clusters represented as `HashMap` of `Cluster`
/// * `deviants` - list of deviant reports represented as `Vec` of `ReportInfo`
///
/// * `max` - maximum number among the old clusters
///
/// * `dir` - out directory
///
/// * `dedup` - deduplicate crashline, if true
///
/// # Return value
///
/// Number of moved to old clusters CASR reports
/// Number of removed by crashline deduplication CASR reports
fn merge_clusters(
olds: HashMap<usize, Cluster>,
news: &mut HashMap<usize, Cluster>,
/// * Number of CASR reports moved to old clusters
/// * Number of CASR reports removed from old clusters by crashline deduplication
/// * Number of new clusters
/// * Number of valid casreps before crashline deduplication in new clusters
/// * Number of valid casreps after crashline deduplication in new clusters
fn hierarchical_accumulation(
mut olds: HashMap<usize, Cluster>,
deviants: Vec<ReportInfo>,
max: usize,
dir: &Path,
dedup: bool,
) -> Result<(usize, usize)> {
) -> Result<(usize, usize, usize, usize, usize)> {
let mut moved = 0usize;
let mut removed = 0usize;
let mut olds: Vec<Cluster> = olds.into_values().collect();
olds.sort_by(|a, b| a.number.cmp(&b.number));
for mut old in olds {
let mut merged = Vec::new();
let mut values: Vec<&Cluster> = news.values().collect();
values.sort_by(|a, b| a.number.cmp(&b.number));
for new in values {
if !old.may_merge(new) {
let mut before = 0usize;
let mut deduplicated = 0usize;
// Forming condensed dissimilarity matrix
let mut matrix = vec![];
let keys: Vec<_> = olds.keys().collect();
let clusters: Vec<_> = olds.values().collect();
for i in 0..clusters.len() {
// Write cluster-cluster dist
for j in i + 1..clusters.len() {
matrix.push(Cluster::dist(clusters[i], clusters[j]));
}
// Write cluster-report dist
for deviant in &deviants {
matrix.push(Cluster::dist_rep(clusters[i], deviant));
}
}
// Write report-report dist
for i in 0..deviants.len() {
let (_, (stacktrace1, _)) = &deviants[i];
for deviant2 in deviants.iter().skip(i + 1) {
let (_, (stacktrace2, _)) = &deviant2;
matrix.push(1.0 - similarity(stacktrace1, stacktrace2));
}
}

// Clustering
let res = cluster(matrix, clusters.len() + deviants.len())?;

// Sync real cluster numbers with resulting numbers
let mut numbers: HashMap<usize, usize> = HashMap::new();
for i in 0..clusters.len() {
numbers.insert(res[i], *keys[i]);
}
// New clusters
let mut news: HashMap<usize, Cluster> = HashMap::new();
let mut new_num = max;
for &num in res.iter().skip(clusters.len()) {
if numbers.contains_key(&num) {
continue;
}
new_num += 1;
numbers.insert(num, new_num);
// Create new cluster
let new = Cluster::new(new_num, vec![], vec![], vec![]);
news.insert(new_num, new);
}

// Save reports
for i in 0..deviants.len() {
// Get cluster number
let number = *numbers.get(&res[i + olds.len()]).unwrap();
// NOTE: We do not need to track stacktraces
let (casrep, (_, crashline)) = &deviants[i];
if number > max {
// New cluster
before += 1;
let cluster = news.get_mut(&number).unwrap();
if !cluster.insert(casrep.to_path_buf(), vec![], crashline.to_string(), dedup) {
deduplicated += 1;
continue;
}
// Copy casreps from new to old
for (casrep, (stacktrace, crashline)) in new.reports() {
// Update cluster (and dedup crashline)
if !old.insert(
casrep.to_path_buf(),
stacktrace.to_vec(),
crashline.to_string(),
dedup,
) {
removed += 1;
continue;
}
// Save report
moved += 1;
fs::copy(
&casrep,
format!(
"{}/cl{}/{}",
&dir.display(),
old.number,
&casrep.file_name().unwrap().to_str().unwrap()
),
)?;
} else {
// Old cluster
let cluster = olds.get_mut(&number).unwrap();
if !cluster.insert(casrep.to_path_buf(), vec![], crashline.to_string(), dedup) {
removed += 1;
continue;
}
// Mark merged cluster for drop
merged.push(new.number);
}
// Drop marked cluster
for number in merged {
news.remove(&number);
// Save report
moved += 1;
fs::copy(
casrep,
format!(
"{}/cl{}/{}",
&dir.display(),
cluster.number,
&casrep.file_name().unwrap().to_str().unwrap()
),
)?;
}
}
Ok((moved, removed))
util::save_clusters(&news, dir)?;
Ok((moved, removed, news.len(), before, before - deduplicated))
}

/// Calculate silhouette coefficient
Expand Down
10 changes: 5 additions & 5 deletions casr/tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2675,11 +2675,11 @@ fn test_casr_cluster_d_and_m() {
fn test_casr_cluster_u() {
let paths = [
abs_path("tests/casr_tests/casrep/test_clustering_small"),
abs_path("tests/tmp_tests_casr/clustering_out"),
abs_path("tests/tmp_tests_casr/clustering_out/cl7/20.casrep"),
abs_path("tests/tmp_tests_casr/clustering_out/cl8"),
abs_path("tests/tmp_tests_casr/clustering_out/cl9"),
abs_path("tests/tmp_tests_casr/clustering_out/cl9/40.casrep"),
abs_path("tests/tmp_tests_casr/updating_out"),
abs_path("tests/tmp_tests_casr/updating_out/cl7/20.casrep"),
abs_path("tests/tmp_tests_casr/updating_out/cl8"),
abs_path("tests/tmp_tests_casr/updating_out/cl9"),
abs_path("tests/tmp_tests_casr/updating_out/cl9/40.casrep"),
];

let _ = fs::remove_dir_all(&paths[1]);
Expand Down
42 changes: 19 additions & 23 deletions libcasr/src/cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -186,32 +186,28 @@ impl Cluster {
Relation::Outer
}
}
/// Check if cluster may be merged with another one
pub fn may_merge(&self, cluster: &Cluster) -> bool {
let mut stacktraces1 = self.stacktraces.clone();
let mut stacktraces2 = cluster.stacktraces().clone();
/// Get complete distance between clusters
/// NOTE: The result can also be interpreted as the diameter of the merged cluster
pub fn dist(cluster1: &Cluster, cluster2: &Cluster) -> f64 {
let mut stacktraces1 = cluster1.stacktraces().clone();
let mut stacktraces2 = cluster2.stacktraces().clone();
stacktraces1.append(&mut stacktraces2);
diam(&stacktraces1) < THRESHOLD
diam(&stacktraces1)
}
/// Convert cluster to vector of reports
pub fn reports(&self) -> Vec<ReportInfo> {
let mut reports: Vec<ReportInfo> = Vec::new();
let mut crashlines = self.crashlines.clone();
for (i, path) in self.paths.iter().enumerate() {
// Get crashline for cur casrep
let mut crashline = String::new();
for (line, &number) in &crashlines {
if number == i {
crashline = line.to_string();
break;
}
}
// Drop cur crashline from crashlines
crashlines.remove(&crashline);
// Update results
reports.push((path.clone(), (self.stacktraces[i].clone(), crashline)));
/// Get complete distance between cluster and report
/// NOTE: The result can also be interpreted as the diameter of the cluster merge result
pub fn dist_rep(cluster: &Cluster, report: &ReportInfo) -> f64 {
let (_, (trace, _)) = report;
if let Some(max) = cluster
.stacktraces()
.iter()
.map(|s| 1.0 - similarity(s, trace))
.max_by(|a, b| a.total_cmp(b))
{
max
} else {
0f64
}
reports
}
}

Expand Down
49 changes: 32 additions & 17 deletions libcasr/src/stacktrace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,30 +189,21 @@ pub fn dedup_stacktraces(stacktraces: &[Stacktrace]) -> Vec<bool> {
.collect()
}

/// Perform the clustering of stack traces
/// Perform the clustering by condensed dissimilarity matrix
///
/// # Arguments
///
/// * `stacktraces` - slice of `Stacktrace` structures
/// * `matrix` - condensed dissimilarity matrix
///
/// * `len` - number of observations that are being clustered
///
/// # Return value
///
/// A vector of the same length as `stacktraces`.
/// Vec\[i\] is the flat cluster number to which original stack trace i belongs.
pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result<Vec<usize>> {
// Writing distance matrix
// Only the values in the upper triangle are explicitly represented,
// not including the diagonal
let len = stacktraces.len();
let mut condensed_dissimilarity_matrix = vec![];
for i in 0..len {
for j in i + 1..len {
condensed_dissimilarity_matrix.push(1.0 - similarity(&stacktraces[i], &stacktraces[j]));
}
}

/// A vector of length `len`.
/// Vec\[i\] is the flat cluster number to which original object i belongs.
pub fn cluster(mut matrix: Vec<f64>, len: usize) -> Result<Vec<usize>> {
// Get hierarchical clustering binary tree
let dendrogram = linkage(&mut condensed_dissimilarity_matrix, len, Method::Complete);
let dendrogram = linkage(&mut matrix, len, Method::Complete);

// Iterate through merging step until threshold is reached
// at the beginning every node is in its own cluster
Expand Down Expand Up @@ -255,6 +246,30 @@ pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result<Vec<usize>> {
Ok(flat_clusters)
}

/// Cluster stack traces by their pairwise dissimilarity.
///
/// Builds the condensed dissimilarity matrix for `stacktraces` and hands it
/// to `cluster` together with the number of observations.
///
/// # Arguments
///
/// * `stacktraces` - slice of `Stacktrace` structures
///
/// # Return value
///
/// A vector of the same length as `stacktraces`.
/// Vec\[i\] is the flat cluster number to which original stack trace i belongs.
pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result<Vec<usize>> {
    let count = stacktraces.len();
    // Condensed form: only the upper triangle is stored, row by row,
    // without the diagonal.
    let mut distances = Vec::new();
    for (row, first) in stacktraces.iter().enumerate() {
        // Pair the current trace with every trace that follows it.
        for second in &stacktraces[row + 1..] {
            distances.push(1.0 - similarity(first, second));
        }
    }
    cluster(distances, count)
}

/// Perform crashline deduplication for each cluster:
/// Reset Vec\[i\] to 0 if report crashline is duplicate of some other.
///
Expand Down

0 comments on commit c2ce4c8

Please sign in to comment.