diff --git a/src/inputvalidators/CsvBooks.php b/src/inputvalidators/CsvBooks.php index 82eefc1..7ee6c71 100644 --- a/src/inputvalidators/CsvBooks.php +++ b/src/inputvalidators/CsvBooks.php @@ -35,6 +35,14 @@ public function __construct($settings) $this->page_sequence_separator = '-'; } $this->page_sequence_separator = preg_quote($this->page_sequence_separator); + + $this->ocr_extension = '.txt'; + // Default is to not log the absence of page-level OCR files. + if (isset($settings['WRITER']['log_missing_ocr_files'])) { + $this->log_missing_ocr_files = $settings['WRITER']['log_missing_ocr_files']; + } else { + $this->log_missing_ocr_files = false; + } } /** @@ -74,7 +82,7 @@ public function validateAll() * The package's record key. * * @param $package_path string - * The the package's input directory name (not full path). + * The package's input directory name (not full path). * * @return boolean * True if all tests pass for the package, false if any tests failed. @@ -135,6 +143,18 @@ public function validatePackage($record_key, $package_path) $cumulative_validation_results[] = false; } + if (!$this->checkOcrFiles($package_path, $pages)) { + $this->log->addError( + "Input validation failed", + array( + 'record ID' => $record_key, + 'issue directory' => $package_path, + 'error' => 'Book directory is missing one or more OCR files' + ) + ); + $cumulative_validation_results[] = false; + } + // Files in book directory must be named such that their last // filename segment is numeric. if (!$this->checkPageSequenceNumbers($pages)) { @@ -170,28 +190,35 @@ public function validatePackage($record_key, $package_path) } /** - * Gets the filenames of the page files in the book-level directory. + * Gets the filenames of the files in the book-level directory. * * @param $dir string * The full path to the book-level directory. * * @return array - * A list of all the page file names. + * A list of all the file names (not just page images). 
*/ private function getPageFiles($dir) { + $page_files = array(); $files = $this->readDir($dir); foreach ($files as &$file) { $file = basename($file); + foreach ($files as $file) { + $pathinfo = pathinfo($file); + $page_file = $pathinfo['basename']; + $page_files[] = $page_file; + } } - return $files; + return $page_files; } /** * Validates the extensions of the pages in the book-level directory. * * @param $files array - * A list of all the page file names. + * A list of all the page file names. Files must have one of the + * following extensions: tif, tiff, jp2. * * @return boolean * True if all files have an allowed file extension, false if not. @@ -202,7 +229,16 @@ private function checkPageExtensions($files) foreach ($files as $file) { $pathinfo = pathinfo($file); $ext = $pathinfo['extension']; - if (!in_array($ext, $this->fileGetter->allowed_file_extensions_for_OBJ)) { + if ($this->log_missing_ocr_files) { + $ocr_extension = ltrim($this->ocr_extension, '.'); + $allowed_extensions = array_merge( + $this->fileGetter->allowed_file_extensions_for_OBJ, + array($ocr_extension) + ); + } else { + $allowed_extensions = $this->fileGetter->allowed_file_extensions_for_OBJ; + } + if (!in_array($ext, $allowed_extensions)) { $valid = false; } } @@ -232,4 +268,33 @@ private function checkPageSequenceNumbers($files) } return $valid; } + + /** + * Checks for the existence of page-level OCR files. + * + * @param $book_directory_path string + * The absolute path to the book-level directory. + * @param $files array + * A list of all the page file names in the directory. + * + * @return boolean + * True if all image files have corresponding OCR files. + */ + private function checkOcrFiles($book_directory_path, $files) + { + $valid = true; + if (!$this->log_missing_ocr_files) { + return $valid; + } + foreach ($files as $file) { + $pathinfo = pathinfo($file); + $filename = $pathinfo['filename']; + $path_to_ocr_file = realpath($book_directory_path) . DIRECTORY_SEPARATOR . + $filename . 
$this->ocr_extension; + if (!file_exists($path_to_ocr_file)) { + $valid = false; + } + } + return $valid; + } } diff --git a/src/writers/CsvBooks.php b/src/writers/CsvBooks.php index a7ea726..86b1b22 100644 --- a/src/writers/CsvBooks.php +++ b/src/writers/CsvBooks.php @@ -54,6 +54,14 @@ public function __construct($settings) Logger::INFO ); $this->log->pushHandler($this->logStreamHandler); + + $this->ocr_extension = '.txt'; + // Default is to not log the absence of page-level OCR files. + if (isset($settings['WRITER']['log_missing_ocr_files'])) { + $this->log_missing_ocr_files= $settings['WRITER']['log_missing_ocr_files']; + } else { + $this->log_missing_ocr_files = false; + } } /** @@ -113,9 +121,8 @@ public function writePackages($metadata, $pages, $record_id) } // @todo: Add error handling on mkdir and copy. - // @todo: Write page level MODS.xml file, after testing ingest as is. foreach ($pages as $page_path) { - // Get the page number from the filename. It is the last segment. + // Get the sequence number from the last segment of the filename. $pathinfo = pathinfo($page_path); $filename_segments = explode($this->page_sequence_separator, $pathinfo['filename']); @@ -126,9 +133,9 @@ public function writePackages($metadata, $pages, $record_id) $OBJ_expected = in_array('OBJ', $this->datastreams); if ($OBJ_expected xor $no_datastreams_setting_flag) { $extension = $pathinfo['extension']; - $page_output_file_path = $page_level_output_dir . DIRECTORY_SEPARATOR . + $page_output_path = $page_level_output_dir . DIRECTORY_SEPARATOR . 'OBJ.' . $extension; - copy($page_path, $page_output_file_path); + copy($page_path, $page_output_path); } if ($MODS_expected xor $no_datastreams_setting_flag) { @@ -136,6 +143,27 @@ public function writePackages($metadata, $pages, $record_id) $this->writePageMetadataFile($metadata, $page_number, $page_level_output_dir); } } + + // If the datastreams list is comprised of only 'MODS' we're generating metadata only. 
+ if ($this->datastreams != array('MODS')) { + $OCR_expected = in_array('OCR', $this->datastreams); + if ($OCR_expected xor $no_datastreams_setting_flag) { + $ocr_input_path = $pathinfo['dirname'] . DIRECTORY_SEPARATOR . + $pathinfo['filename'] . $this->ocr_extension; + $ocr_output_path = $page_level_output_dir . DIRECTORY_SEPARATOR . + 'OCR' . $this->ocr_extension; + if (file_exists($ocr_input_path)) { + copy($ocr_input_path, $ocr_output_path); + } else { + if ($this->log_missing_ocr_files) { + $this->log->addWarning( + "CSV Books warning", + array('Page-level OCR file does not exist' => $ocr_input_path) + ); + } + } + } + } } } diff --git a/tests/inputvalidators/CsvInputValidatorsTest.php b/tests/inputvalidators/CsvInputValidatorsTest.php index 280f23b..2f6f225 100644 --- a/tests/inputvalidators/CsvInputValidatorsTest.php +++ b/tests/inputvalidators/CsvInputValidatorsTest.php @@ -216,6 +216,11 @@ public function testCsvBooksInputValidator() $log_file_entries[2], "CSV Books input validator did not detect unwanted files" ); + $this->assertContains( + 'files/book3","error":"Some files in the book object directory have invalid extensions"', + $log_file_entries[3], + "CSV Books input validator did not find invalid page file extensions" + ); $this->assertContains( 'files/book4","error":"Book object directory not found"', $log_file_entries[4],