Skip to content

Commit

Permalink
Merge pull request #460 from MarcusBarnes/issue-421
Browse files Browse the repository at this point in the history
Issue 421 - allow page-level OCR files in MIK input for CSV Books
  • Loading branch information
bondjimbond authored Mar 20, 2018
2 parents 4a871e7 + 4cfe6b5 commit e7d7ece
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 10 deletions.
77 changes: 71 additions & 6 deletions src/inputvalidators/CsvBooks.php
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ public function __construct($settings)
$this->page_sequence_separator = '-';
}
$this->page_sequence_separator = preg_quote($this->page_sequence_separator);

$this->ocr_extension = '.txt';
// Default is to not log the absence of page-level OCR files.
if (isset($settings['WRITER']['log_missing_ocr_files'])) {
$this->log_missing_ocr_files = $settings['WRITER']['log_missing_ocr_files'];
} else {
$this->log_missing_ocr_files = false;
}
}

/**
Expand Down Expand Up @@ -74,7 +82,7 @@ public function validateAll()
* The package's record key.
*
* @param $package_path string
* The the package's input directory name (not full path).
* The package's input directory name (not full path).
*
* @return boolean
* True if all tests pass for the package, false if any tests failed.
Expand Down Expand Up @@ -135,6 +143,18 @@ public function validatePackage($record_key, $package_path)
$cumulative_validation_results[] = false;
}

if (!$this->checkOcrFiles($package_path, $pages)) {
$this->log->addError(
"Input validation failed",
array(
'record ID' => $record_key,
'issue directory' => $package_path,
'error' => 'Book directory is missing one or more OCR files'
)
);
$cumulative_validation_results[] = false;
}

// Files in book directory must be named such that their last
// filename segment is numeric.
if (!$this->checkPageSequenceNumbers($pages)) {
Expand Down Expand Up @@ -170,28 +190,35 @@ public function validatePackage($record_key, $package_path)
}

/**
* Gets the filenames of the page files in the book-level directory.
* Gets the filenames of the files in the book-level directory.
*
* @param $dir string
* The full path to the book-level directory.
*
* @return array
* A list of all the page file names.
* A list of all the file names (not just page images).
*/
private function getPageFiles($dir)
{
$page_files = array();
$files = $this->readDir($dir);
foreach ($files as &$file) {
$file = basename($file);
foreach ($files as $file) {
$pathinfo = pathinfo($file);
$page_file = $pathinfo['basename'];
$page_files[] = $page_file;
}
}
return $files;
return $page_files;
}

/**
* Validates the extensions of the pages in the book-level directory.
*
* @param $files array
* A list of all the page file names.
* A list of all the page file names. Files must have one of the
* following extensions: tif, tiff, jp2.
*
* @return boolean
* True if all files have an allowed file extension, false if not.
Expand All @@ -202,7 +229,16 @@ private function checkPageExtensions($files)
foreach ($files as $file) {
$pathinfo = pathinfo($file);
$ext = $pathinfo['extension'];
if (!in_array($ext, $this->fileGetter->allowed_file_extensions_for_OBJ)) {
if ($this->log_missing_ocr_files) {
$ocr_extension = ltrim($this->ocr_extension, '.');
$allowed_extensions = array_merge(
$this->fileGetter->allowed_file_extensions_for_OBJ,
array($ocr_extension)
);
} else {
$allowed_extensions = $this->fileGetter->allowed_file_extensions_for_OBJ;
}
if (!in_array($ext, $allowed_extensions)) {
$valid = false;
}
}
Expand Down Expand Up @@ -232,4 +268,33 @@ private function checkPageSequenceNumbers($files)
}
return $valid;
}

/**
 * Checks for the existence of page-level OCR files.
 *
 * For every entry in $files, an OCR file with the same base name and the
 * configured OCR extension ($this->ocr_extension) must exist in the book
 * directory. The check is skipped entirely, and passes, when
 * $this->log_missing_ocr_files is not enabled.
 *
 * @param $book_directory_path string
 *    The path to the book-level directory. It is resolved with realpath(),
 *    so it must point to an existing directory.
 *    NOTE(review): the caller's docblock describes this value as a directory
 *    name rather than a full path - confirm which one is actually passed.
 * @param $files array
 *    A list of all the page file names in the directory.
 *
 * @return boolean
 *    True if every listed file has a corresponding OCR file (or if the
 *    check is disabled, or $files is empty); false otherwise, including
 *    when the book directory cannot be resolved.
 */
private function checkOcrFiles($book_directory_path, $files)
{
    if (!$this->log_missing_ocr_files) {
        return true;
    }

    // Resolve the directory once; it does not change between iterations.
    $book_directory = realpath($book_directory_path);

    foreach ($files as $file) {
        // realpath() returns false for a path that cannot be resolved.
        // Concatenating false would silently build a path rooted at the
        // filesystem root, so fail the check explicitly instead.
        if ($book_directory === false) {
            return false;
        }

        $path_to_ocr_file = $book_directory . DIRECTORY_SEPARATOR .
            pathinfo($file, PATHINFO_FILENAME) . $this->ocr_extension;

        // One missing OCR file is enough to fail the whole package.
        if (!file_exists($path_to_ocr_file)) {
            return false;
        }
    }

    return true;
}
}
36 changes: 32 additions & 4 deletions src/writers/CsvBooks.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ public function __construct($settings)
Logger::INFO
);
$this->log->pushHandler($this->logStreamHandler);

$this->ocr_extension = '.txt';
// Default is to not log the absence of page-level OCR files.
if (isset($settings['WRITER']['log_missing_ocr_files'])) {
$this->log_missing_ocr_files= $settings['WRITER']['log_missing_ocr_files'];
} else {
$this->log_missing_ocr_files = false;
}
}

/**
Expand Down Expand Up @@ -113,9 +121,8 @@ public function writePackages($metadata, $pages, $record_id)
}

// @todo: Add error handling on mkdir and copy.
// @todo: Write page level MODS.xml file, after testing ingest as is.
foreach ($pages as $page_path) {
// Get the page number from the filename. It is the last segment.
// Get the sequence number from the last segment of the filename.
$pathinfo = pathinfo($page_path);
$filename_segments = explode($this->page_sequence_separator, $pathinfo['filename']);

Expand All @@ -126,16 +133,37 @@ public function writePackages($metadata, $pages, $record_id)
$OBJ_expected = in_array('OBJ', $this->datastreams);
if ($OBJ_expected xor $no_datastreams_setting_flag) {
$extension = $pathinfo['extension'];
$page_output_file_path = $page_level_output_dir . DIRECTORY_SEPARATOR .
$page_output_path = $page_level_output_dir . DIRECTORY_SEPARATOR .
'OBJ.' . $extension;
copy($page_path, $page_output_file_path);
copy($page_path, $page_output_path);
}

if ($MODS_expected xor $no_datastreams_setting_flag) {
if ($this->generate_page_modsxml) {
$this->writePageMetadataFile($metadata, $page_number, $page_level_output_dir);
}
}

// If the datastreams list is comprised of only 'MODS' we're generating metadata only.
if ($this->datastreams != array('MODS')) {
$OCR_expected = in_array('OCR', $this->datastreams);
if ($OCR_expected xor $no_datastreams_setting_flag) {
$ocr_input_path = $pathinfo['dirname'] . DIRECTORY_SEPARATOR .
$pathinfo['filename'] . $this->ocr_extension;
$ocr_output_path = $page_level_output_dir . DIRECTORY_SEPARATOR .
'OCR' . $this->ocr_extension;
if (file_exists($ocr_input_path)) {
copy($ocr_input_path, $ocr_output_path);
} else {
if ($this->log_missing_ocr_files) {
$this->log->addWarning(
"CSV Books warning",
array('Page-level OCR file does not exist' => $ocr_input_path)
);
}
}
}
}
}
}

Expand Down
5 changes: 5 additions & 0 deletions tests/inputvalidators/CsvInputValidatorsTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,11 @@ public function testCsvBooksInputValidator()
$log_file_entries[2],
"CSV Books input validator did not detect unwanted files"
);
$this->assertContains(
'files/book3","error":"Some files in the book object directory have invalid extensions"',
$log_file_entries[3],
"CSV Books input validator did not find invalid page file extensions"
);
$this->assertContains(
'files/book4","error":"Book object directory not found"',
$log_file_entries[4],
Expand Down

0 comments on commit e7d7ece

Please sign in to comment.