Skip to content

Commit

Permalink
Fix #123 - implement a lockfile system
Browse files Browse the repository at this point in the history
  • Loading branch information
Lars T Hansen committed Nov 6, 2023
1 parent 5380b4a commit 194b6f0
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 1 deletion.
6 changes: 6 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ enum Commands {
/// Exclude records whose commands start with these comma-separated names [default: none]
#[arg(long)]
exclude_commands: Option<String>,

/// Create a per-host lockfile in this directory and exit early if the file exists on startup [default: none]
#[arg(long)]
lockdir: Option<String>,
},
/// Not yet implemented
Analyze {},
Expand All @@ -87,6 +91,7 @@ fn main() {
exclude_system_jobs,
exclude_users,
exclude_commands,
lockdir,
} => {
let opts = ps::PsOptions {
rollup: *rollup,
Expand All @@ -105,6 +110,7 @@ fn main() {
} else {
vec![]
},
lockdir: lockdir.clone(),
};
if *batchless {
let mut jm = batchless::BatchlessJobManager::new();
Expand Down
66 changes: 65 additions & 1 deletion src/ps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@ use crate::procfsapi;

use csv::{Writer, WriterBuilder};
use std::collections::{HashMap, HashSet};
use std::io;
use std::env;
use std::io::{self, Write};
use std::path::PathBuf;
use std::thread;
use std::time;

// The GpuSet has three states:
//
Expand Down Expand Up @@ -161,9 +165,69 @@ pub struct PsOptions<'a> {
pub exclude_system_jobs: bool,
pub exclude_users: Vec<&'a str>,
pub exclude_commands: Vec<&'a str>,
pub lockdir: Option<String>,
}

pub fn create_snapshot(jobs: &mut dyn jobs::JobManager, opts: &PsOptions, timestamp: &str) {
if let Some(ref dirname) = opts.lockdir {
let mut created = false;
let mut failed = false;
let mut skip = false;
let hostname = hostname::get().unwrap().into_string().unwrap();

let mut p = PathBuf::new();
p.push(dirname);
p.push("sonar-lock.".to_string() + &hostname);

// create_new() requests atomic creation, if the file exists we'll error out.
match std::fs::File::options().write(true).create_new(true).open(&p) {
Ok(mut f) => {
created = true;
let pid = std::process::id();
match f.write(format!("{}", pid).as_bytes()) {
Ok(_) => {}
Err(_) => { failed = true; }
}
}
Err(e) if e.kind() == io::ErrorKind::AlreadyExists => {
skip = true;
}
Err(_) => {
failed = true;
}
}

if !failed && !skip {
do_create_snapshot(jobs, opts, timestamp);

// Testing code: If we got the lockfile and produced a report, wait 10s after producing
// it while holding onto the lockfile. It is then possible to run sonar in that window
// while the lockfile is being held, to ensure the second process exits immediately.
match std::env::var("SONARTEST_WAIT_LOCKFILE") {
Ok(_) => { thread::sleep(time::Duration::new(10, 0)); }
Err(_) => {}
}
}

if created {
match std::fs::remove_file(p) {
Ok(_) => {}
Err(_) => { failed = true; }
}
}

if skip {
log::info!("Lockfile present, exiting");
}
if failed {
log::error!("Unable to properly manage or delete lockfile");
}
} else {
do_create_snapshot(jobs, opts, timestamp);
}
}

fn do_create_snapshot(jobs: &mut dyn jobs::JobManager, opts: &PsOptions, timestamp: &str) {
let no_gpus = empty_gpuset();
let mut proc_by_pid = ProcTable::new();

Expand Down

0 comments on commit 194b6f0

Please sign in to comment.