Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add hierarchical accumulation #202

Merged
merged 4 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 97 additions & 60 deletions casr/src/bin/casr-cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -401,94 +401,131 @@

// Handle deviant casreps
let (result, before, after) = if !deviants.is_empty() {
// Get clusters from deviants
let (mut deviant_clusters, mut before, mut after) =
Cluster::cluster_reports(&deviants, max, dedup)?;
// Merge old and new clusters
let (moved, removed) = merge_clusters(clusters, &mut deviant_clusters, oldpath, dedup)?;
let (moved, removed, result, before, after) =
hierarchical_accumulation(clusters, deviants, max, oldpath, dedup)?;
// Adjust stat
if moved != 0 || removed != 0 {
added += moved;
deduplicated += removed;
before = 0; // Impossible to know (proofed by @hkctkuy)
after -= moved + removed;
}
// Save deviant clusters
util::save_clusters(&deviant_clusters, oldpath)?;
(deviant_clusters.len(), before, after)
added += moved;
deduplicated += removed;
(result, before, after)
} else {
(0, 0, 0)
};
Ok((added, duplicates, deduplicated, result, before, after))
}

/// Try to merge new clusters to old clusters
/// Perform CASR report accumulation to old clusters using hierarchical clustering
///
/// # Arguments
///
/// * `olds` - list of old clusters represented as `HashMap` of `Cluster`
///
/// * `news` - list of new clusters represented as `HashMap` of `Cluster`
/// * `deviants` - list of deviant reports represented as `Vec` of `ReportInfo`
///
/// * `max` - largest cluster number among the old clusters (new clusters are numbered starting from `max + 1`)
///
/// * `dir` - output directory
///
/// * `dedup` - deduplicate crashline, if true
///
/// # Return value
///
/// Number of moved to old clusters CASR reports
/// Number of removed by crashline deduplication CASR reports
fn merge_clusters(
olds: HashMap<usize, Cluster>,
news: &mut HashMap<usize, Cluster>,
/// * Number of CASR reports moved to old clusters
/// * Number of CASR reports removed from old clusters by crashline deduplication
/// * Number of new clusters
/// * Number of valid casreps before crashline deduplication in new clusters
/// * Number of valid casreps after crashline deduplication in new clusters
fn hierarchical_accumulation(
mut olds: HashMap<usize, Cluster>,
deviants: Vec<ReportInfo>,
max: usize,
dir: &Path,
dedup: bool,
) -> Result<(usize, usize)> {
) -> Result<(usize, usize, usize, usize, usize)> {
let mut moved = 0usize;
let mut removed = 0usize;
let mut olds: Vec<Cluster> = olds.into_values().collect();
olds.sort_by(|a, b| a.number.cmp(&b.number));
for mut old in olds {
let mut merged = Vec::new();
let mut values: Vec<&Cluster> = news.values().collect();
values.sort_by(|a, b| a.number.cmp(&b.number));
for new in values {
if !old.may_merge(new) {
let mut before = 0usize;
let mut deduplicated = 0usize;
// Forming condensed dissimilarity matrix
let mut matrix = vec![];
let keys: Vec<_> = olds.keys().collect();
let clusters: Vec<_> = olds.values().collect();
for i in 0..clusters.len() {
// Write cluster-cluster dist
for j in i + 1..clusters.len() {
matrix.push(Cluster::dist(clusters[i], clusters[j]));
}
// Write cluster-report dist
for deviant in &deviants {
matrix.push(Cluster::dist_rep(clusters[i], deviant));
}
}
// Write report-report dist
for i in 0..deviants.len() {
let (_, (stacktrace1, _)) = &deviants[i];
for deviant2 in deviants.iter().skip(i + 1) {
let (_, (stacktrace2, _)) = &deviant2;
matrix.push(1.0 - similarity(stacktrace1, stacktrace2));
}
}

// Clustering
let res = cluster(matrix, clusters.len() + deviants.len())?;

// Sync real cluster numbers with resulting numbers
let mut numbers: HashMap<usize, usize> = HashMap::new();
for i in 0..clusters.len() {
numbers.insert(res[i], *keys[i]);
}
// New clusters
let mut news: HashMap<usize, Cluster> = HashMap::new();
let mut new_num = max;
for &num in res.iter().skip(clusters.len()).take(deviants.len()) {
Avgor46 marked this conversation as resolved.
Show resolved Hide resolved
if numbers.contains_key(&num) {
continue;
}
new_num += 1;
numbers.insert(num, new_num);
// Create new cluster
let new = Cluster::new(new_num, vec![], vec![], vec![]);
news.insert(new_num, new);
}

// Save reports
for i in 0..deviants.len() {
// Get cluster number
let number = *numbers.get(&res[i + olds.len()]).unwrap();
// NOTE: We do not need to track stacktraces here
let (casrep, (_, crashline)) = &deviants[i];
if number > max {
// New cluster
before += 1;
let cluster = news.get_mut(&number).unwrap();
if !cluster.insert(casrep.to_path_buf(), vec![], crashline.to_string(), dedup) {
deduplicated += 1;
continue;
}
// Copy casreps from new to old
for (casrep, (stacktrace, crashline)) in new.reports() {
// Update cluster (and dedup crashline)
if !old.insert(
casrep.to_path_buf(),
stacktrace.to_vec(),
crashline.to_string(),
dedup,
) {
removed += 1;
continue;
}
// Save report
moved += 1;
fs::copy(
&casrep,
format!(
"{}/cl{}/{}",
&dir.display(),
old.number,
&casrep.file_name().unwrap().to_str().unwrap()
),
)?;
} else {
// Old cluster
let cluster = olds.get_mut(&number).unwrap();
if !cluster.insert(casrep.to_path_buf(), vec![], crashline.to_string(), dedup) {
removed += 1;

Check warning on line 511 in casr/src/bin/casr-cluster.rs

View check run for this annotation

Codecov / codecov/patch

casr/src/bin/casr-cluster.rs#L511

Added line #L511 was not covered by tests
continue;
}
// Mark merged cluster for drop
merged.push(new.number);
}
// Drop marked cluster
for number in merged {
news.remove(&number);
// Save report
moved += 1;
fs::copy(
casrep,
format!(
"{}/cl{}/{}",
&dir.display(),
cluster.number,
&casrep.file_name().unwrap().to_str().unwrap()
),
)?;

Check warning on line 524 in casr/src/bin/casr-cluster.rs

View check run for this annotation

Codecov / codecov/patch

casr/src/bin/casr-cluster.rs#L524

Added line #L524 was not covered by tests
}
}
Ok((moved, removed))
util::save_clusters(&news, dir)?;
Ok((moved, removed, news.len(), before, before - deduplicated))
}

/// Calculate silhouette coefficient
Expand Down
10 changes: 5 additions & 5 deletions casr/tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2675,11 +2675,11 @@ fn test_casr_cluster_d_and_m() {
fn test_casr_cluster_u() {
let paths = [
abs_path("tests/casr_tests/casrep/test_clustering_small"),
abs_path("tests/tmp_tests_casr/clustering_out"),
abs_path("tests/tmp_tests_casr/clustering_out/cl7/20.casrep"),
abs_path("tests/tmp_tests_casr/clustering_out/cl8"),
abs_path("tests/tmp_tests_casr/clustering_out/cl9"),
abs_path("tests/tmp_tests_casr/clustering_out/cl9/40.casrep"),
abs_path("tests/tmp_tests_casr/updating_out"),
abs_path("tests/tmp_tests_casr/updating_out/cl7/20.casrep"),
abs_path("tests/tmp_tests_casr/updating_out/cl8"),
abs_path("tests/tmp_tests_casr/updating_out/cl9"),
abs_path("tests/tmp_tests_casr/updating_out/cl9/40.casrep"),
];

let _ = fs::remove_dir_all(&paths[1]);
Expand Down
24 changes: 19 additions & 5 deletions libcasr/src/cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,26 @@ impl Cluster {
Relation::Outer
}
}
/// Check if cluster may be merged with another one
pub fn may_merge(&self, cluster: &Cluster) -> bool {
let mut stacktraces1 = self.stacktraces.clone();
let mut stacktraces2 = cluster.stacktraces().clone();
/// Get complete distance between clusters
/// NOTE: The result can also be interpreted as the diameter of the merged cluster
pub fn dist(cluster1: &Cluster, cluster2: &Cluster) -> f64 {
let mut stacktraces1 = cluster1.stacktraces().clone();
let mut stacktraces2 = cluster2.stacktraces().clone();
stacktraces1.append(&mut stacktraces2);
diam(&stacktraces1) < THRESHOLD
diam(&stacktraces1)
}
/// Get complete distance between cluster and report
/// NOTE: The result can also be interpreted as the diameter of the cluster after adding the report
pub fn dist_rep(cluster: &Cluster, report: &ReportInfo) -> f64 {
let mut max = 0f64;
let (_, (trace, _)) = report;
for stacktrace in cluster.stacktraces() {
Avgor46 marked this conversation as resolved.
Show resolved Hide resolved
let dist = 1.0 - similarity(stacktrace, trace);
if dist > max {
max = dist;
}
}
max
}
/// Convert cluster to vector of reports
pub fn reports(&self) -> Vec<ReportInfo> {
Expand Down
49 changes: 32 additions & 17 deletions libcasr/src/stacktrace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,30 +189,21 @@ pub fn dedup_stacktraces(stacktraces: &[Stacktrace]) -> Vec<bool> {
.collect()
}

/// Perform the clustering of stack traces
/// Perform the clustering by condensed dissimilarity matrix
///
/// # Arguments
///
/// * `stacktraces` - slice of `Stacktrace` structures
/// * `matrix` - condensed dissimilarity matrix
SweetVishnya marked this conversation as resolved.
Show resolved Hide resolved
///
/// * `len` - number of objects being clustered (the condensed matrix holds `len * (len - 1) / 2` distances)
Avgor46 marked this conversation as resolved.
Show resolved Hide resolved
///
/// # Return value
///
/// A vector of the same length as `stacktraces`.
/// Vec\[i\] is the flat cluster number to which original stack trace i belongs.
pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result<Vec<usize>> {
// Writing distance matrix
// Only the values in the upper triangle are explicitly represented,
// not including the diagonal
let len = stacktraces.len();
let mut condensed_dissimilarity_matrix = vec![];
for i in 0..len {
for j in i + 1..len {
condensed_dissimilarity_matrix.push(1.0 - similarity(&stacktraces[i], &stacktraces[j]));
}
}

/// A vector of length `len`.
/// Vec\[i\] is the flat cluster number to which original object i belongs.
pub fn cluster(mut matrix: Vec<f64>, len: usize) -> Result<Vec<usize>> {
// Get hierarchical clustering binary tree
let dendrogram = linkage(&mut condensed_dissimilarity_matrix, len, Method::Complete);
let dendrogram = linkage(&mut matrix, len, Method::Complete);

// Iterate through merging step until threshold is reached
// at the beginning every node is in its own cluster
Expand Down Expand Up @@ -255,6 +246,30 @@ pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result<Vec<usize>> {
Ok(flat_clusters)
}

/// Cluster stack traces by their pairwise dissimilarity.
///
/// # Arguments
///
/// * `stacktraces` - slice of `Stacktrace` structures
///
/// # Return value
///
/// A vector of the same length as `stacktraces`.
/// Vec\[i\] is the flat cluster number to which original stack trace i belongs.
pub fn cluster_stacktraces(stacktraces: &[Stacktrace]) -> Result<Vec<usize>> {
    // Condensed dissimilarity matrix: only the upper triangle is stored,
    // row by row, without the diagonal.
    let mut dissimilarities = Vec::new();
    for (row, first) in stacktraces.iter().enumerate() {
        // Pair the current trace with every trace that follows it.
        dissimilarities.extend(
            stacktraces[row + 1..]
                .iter()
                .map(|second| 1.0 - similarity(first, second)),
        );
    }
    cluster(dissimilarities, stacktraces.len())
}

/// Perform crashline deduplication for each cluster:
/// Reset Vec\[i\] to 0 if report crashline is duplicate of some other.
///
Expand Down
Loading