Skip to content

Commit

Permalink
Add --checkpoint command-line option
Browse files Browse the repository at this point in the history
This makes it possible to select a checkpoint from the command-line when using the --run-job option.
  • Loading branch information
ato committed Nov 24, 2024
1 parent 18c69de commit 585fed8
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 2 deletions.
3 changes: 3 additions & 0 deletions docs/operating.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ Command-line Options
-b, --web-bind-hosts HOST
Specifies a comma-separated list of hostnames/IP-addresses to bind to the Web UI. You may use '/' as a
shorthand for 'all addresses'. **Default:** ``localhost/127.0.0.1``
-c, --checkpoint ARG
Recovers from the given checkpoint. May only be used with the --run-job option. The special value 'latest'
recovers from the most recent checkpoint or, if none exists, launches a new crawl.
-j, --jobs-dir PATH
Sets the directory Heritrix stores jobs in. **Default:** ``$HERITRIX_HOME/jobs``
-l, --logging-properties PATH
Expand Down
22 changes: 20 additions & 2 deletions engine/src/main/java/org/archive/crawler/Heritrix.java
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ private static Options options() {
"\"password\" (which leaves username as the default 'admin'), " +
"\"username:password\", or \"@filename\" for a file that " +
"includes the single line \"username:password\". ");
options.addOption("c", "checkpoint", true,
"Recovers from the given checkpoint. May only be used with the " +
"--run-job option. The special value 'latest' will recover the " +
"last checkpoint or if none exist will launch a new crawl.");
options.addOption("j", "jobs-dir", true, "The jobs directory. " +
"Defaults to ./jobs");
options.addOption("l", "logging-properties", true,
Expand Down Expand Up @@ -265,6 +269,11 @@ public void instanceMain(String[] args)
System.exit(1);
authPassword = ""; // suppresses uninitialized warning
}

if (cl.hasOption('c') && !cl.hasOption('r')) {
System.err.println("Cannot use --checkpoint without --run-job.");
System.exit(1);
}

File jobsDir = null;
if (cl.hasOption('j')) {
Expand Down Expand Up @@ -374,12 +383,21 @@ public void instanceMain(String[] args)
}
if (cl.hasOption('r')) {
String jobName = cl.getOptionValue('r');
engine.requestLaunch(jobName);
CrawlJob job = engine.getJob(jobName);
if (job == null || job.getCrawlController() == null) {
if (job == null) {
System.err.println("Job not found: " + jobName);
System.exit(1);
}
job.validateConfiguration();
if (cl.hasOption('c')) {
job.getCheckpointService().setRecoveryCheckpointByName(cl.getOptionValue('c'));
}
job.launch();
if (job.getCrawlController() == null) {
System.err.println("Failed to launch job: " + jobName);
System.exit(1);
}

job.getCrawlController().requestCrawlResume();
engine.waitForNoRunningJobs(0);
engine.shutdown();
Expand Down

0 comments on commit 585fed8

Please sign in to comment.