/**
 * Data integrity repair tool.
 *
 * Adds missing `object` rows, regenerates slugs, re-parents orphan terms and
 * then reports on (and optionally fixes or deletes) descriptions whose object
 * row, parent or publication status is missing.
 *
 * Numeric ids hard-coded in the SQL below, as used by this task:
 *   - 1   : root information object (excluded from every check, used as the
 *           fallback parent)
 *   - 110 : root term (fallback parent for terms)
 *   - 158 : status type used for the publication status
 *   - 159 : status id written as the "draft" publication status
 */
class dataIntegrityRepairTask extends arBaseTask
{
    /**
     * Declare the task arguments, options, name and help text.
     */
    protected function configure()
    {
        $this->addArguments([
            new sfCommandArgument('filename', sfCommandArgument::OPTIONAL, 'A name for the generated CSV report file', 'affected-records.csv'),
        ]);

        $this->addOptions([
            new sfCommandOption('application', null, sfCommandOption::PARAMETER_OPTIONAL, 'The application name', true),
            new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'cli'),
            new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'propel'),
            new sfCommandOption('mode', null, sfCommandOption::PARAMETER_OPTIONAL, 'The chosen mode for how to proceed with fix: report(default), fix, delete', 'report'),
        ]);

        $this->namespace = 'tools';
        $this->name = 'data-integrity-repair';
        $this->briefDescription = 'Attempt data integrity repair';
        $this->detailedDescription = <<<'EOF'
Attempt to repair data integrity. It does the following:
- Adds missing object rows for all resources extending QubitObject
- Regenerates slugs to use them in CSV report
- Adds missing parent ids to terms
- Checks descriptions with missing data and provides options for attempting to generate a list, fix them, or delete them

To use the data integrity repair tool:
  php symfony tools:data-integrity-repair file/path/to/report.csv
  Any results will be written to the csv at the supplied file path

The data integrity repair tool has 3 modes.
By default it only generates reports, but it can also attempt to fix or delete affected records:
  php symfony tools:data-integrity-repair file/path/to/report.csv --mode=delete
  or
  php symfony tools:data-integrity-repair file/path/to/report.csv --mode=fix
EOF;
    }

    /**
     * Task entry point.
     *
     * @param array $arguments task arguments ('filename')
     * @param array $options   task options ('mode', ...)
     */
    protected function execute($arguments = [], $options = [])
    {
        parent::execute($arguments, $options);
        $this->performDataIntegrityChecks($arguments['filename'], $options);
    }

    /**
     * Run every repair step in order and handle affected descriptions
     * according to the selected mode.
     *
     * @param string $filename path of the CSV report to write
     * @param array  $options  task options; 'mode' is one of report, fix, delete
     */
    private function performDataIntegrityChecks($filename, $options = [])
    {
        QubitSearch::disable();

        $this->addMissingObjectRows();

        $this->logSection('data-integrity-repair', "Regenerating slugs ...\n");
        $this->runSubTask('propelGenerateSlugsTask');

        // Set root term as parent for terms without one
        $sql = 'UPDATE term SET parent_id=110 WHERE parent_id IS NULL AND id<>110;';
        $updated = QubitPdo::modify($sql);
        $this->logSection('data-integrity-repair', sprintf("Updating terms without parent id: %d\n", $updated));

        $this->logDescriptionIssueCounts();

        $affectedIos = $this->fetchAffectedDescriptions();
        $this->log(sprintf(" - Affected descriptions: %d\n", count($affectedIos)));

        if (0 === count($affectedIos)) {
            $this->log("All descriptions seem to be okay.\n");
        } else {
            // Each affected IO is an array like:
            // Array
            // (
            //   [id] => information_object row id (can't be missing)
            //   [object_id] => object row id (if missing, a new object row is needed)
            //   [parent_id] => information_object row parent_id (if missing, set to root IO id)
            //   [parent] => parent row id (if missing, set to root IO id)
            //   [status] => status row id (if missing, an entire new status row is needed, as draft)
            //   [status_id] => status row status_id (if missing, update to draft id)
            // )

            // Get descendants (without duplicates), we can't fully trust the nested set due to
            // the missing parents and we can't use recursive CTE as this may be needed for 2.5.
            // The generated array will contain the description trees flattened and reversed
            // (children before their parents).
            // At the same time, normalize the affected IOs array to be able to access them by id.
            $affectedIosAndDescendantIds = [];
            $affectedIosById = [];
            $visitedIds = [];
            foreach (array_reverse($affectedIos) as $io) {
                $this->populateAffectedIosAndDescendantIds($io['id'], $affectedIosAndDescendantIds, $visitedIds);
                $affectedIosById[$io['id']] = $io;
            }
            $this->logSection('data-integrity-repair', sprintf(" - Affected descriptions (including descendants): %d\n", count($affectedIosAndDescendantIds)));

            // The report is written in every mode, and always before any
            // record is modified, so there is a trace of what was touched.
            $this->report($filename, $affectedIosById, $affectedIosAndDescendantIds);

            $mode = isset($options['mode']) ? $options['mode'] : 'report';

            switch ($mode) {
                case 'fix':
                    $this->fix($affectedIosById);

                    break;

                case 'delete':
                    $this->deleteDescriptions($affectedIosById, $affectedIosAndDescendantIds);

                    break;

                case 'report':
                    break;

                default:
                    // Unknown modes never modify data: the report is all they get
                    $this->logSection('data-integrity-repair', sprintf("Unrecognized mode '%s', only the report was generated.\n", $mode));

                    break;
            }
        }

        $this->logSection('data-integrity-repair', "Rebuilding nested set ...\n");
        $this->runSubTask('propelBuildNestedSetTask');

        $this->logSection('data-integrity-repair', "The ES index has not been updated! Run the search:populate task to do so.\n");
    }

    /**
     * Insert an object row for every resource row (descriptions excluded)
     * that does not have one, logging how many were added per class.
     */
    private function addMissingObjectRows()
    {
        $this->logSection('data-integrity-repair', "Adding missing object rows (except for descriptions):\n");

        // List of classes with a related object row
        $classes = [
            // The first four extend QubitActor too and need to go before it
            'QubitRepository',
            'QubitRightsHolder',
            'QubitUser',
            'QubitDonor',
            'QubitActor',
            'QubitAip',
            'QubitJob',
            'QubitDigitalObject',
            'QubitEvent',
            // Use QubitFunction for AtoM 2.5.x
            'QubitFunctionObject',
            // Skipping IOs as they will be addressed later
            // 'QubitInformationObject',
            'QubitObjectTermRelation',
            'QubitPhysicalObject',
            'QubitPremisObject',
            'QubitRelation',
            'QubitRights',
            'QubitStaticPage',
            'QubitTaxonomy',
            'QubitTerm',
            'QubitAccession',
            'QubitDeaccession',
        ];

        foreach ($classes as $class) {
            // Find resources without object row
            $sql = 'SELECT tb.id
                FROM '.$class::TABLE_NAME.' tb
                LEFT JOIN object o ON tb.id=o.id
                WHERE o.id IS NULL;';
            $noObjectIds = QubitPdo::fetchAll(
                $sql,
                [],
                ['fetchMode' => PDO::FETCH_COLUMN]
            );

            foreach ($noObjectIds as $id) {
                $this->insertObjectRow($id, $class);
            }

            $this->log(sprintf(" - %s: %d\n", $class, count($noObjectIds)));
        }
    }

    /**
     * Instantiate another task with this task's dispatcher, formatter and
     * configuration, and run it without arguments.
     *
     * @param string $taskClass class name of the task to run
     */
    private function runSubTask($taskClass)
    {
        $task = new $taskClass($this->dispatcher, $this->formatter);
        $task->setConfiguration($this->configuration);
        $task->run();
    }

    /**
     * Log, per kind of issue, how many descriptions (root excluded) are affected.
     */
    private function logDescriptionIssueCounts()
    {
        $this->log("Checking descriptions integrity:\n");

        $sql = 'SELECT COUNT(io.id)
            FROM information_object io
            LEFT JOIN object o ON io.id=o.id
            WHERE io.id<>1
            AND o.id IS NULL;';
        $this->log(sprintf(" - Descriptions without object row: %d\n", QubitPdo::fetchColumn($sql)));

        $sql = 'SELECT COUNT(id)
            FROM information_object
            WHERE id<>1
            AND parent_id IS NULL;';
        $this->log(sprintf(" - Descriptions without parent id: %d\n", QubitPdo::fetchColumn($sql)));

        $sql = 'SELECT COUNT(io.id)
            FROM information_object io
            LEFT JOIN information_object p ON io.parent_id=p.id
            WHERE io.id<>1
            AND p.id IS NULL;';
        $this->log(sprintf(" - Descriptions without parent: %d\n", QubitPdo::fetchColumn($sql)));

        $sql = 'SELECT COUNT(io.id)
            FROM information_object io
            LEFT JOIN status st ON io.id=st.object_id AND st.type_id=158
            WHERE io.id<>1
            AND st.status_id IS NULL;';
        $this->log(sprintf(" - Descriptions without publication status: %d\n", QubitPdo::fetchColumn($sql)));
    }

    /**
     * Fetch every description (root excluded) that has at least one issue.
     *
     * @return array rows with keys id, object_id, parent_id, parent, status
     *               and status_id; a null value marks the missing piece
     */
    private function fetchAffectedDescriptions()
    {
        $sql = 'SELECT io.id, o.id as object_id, io.parent_id, p.id as parent, st.id as status, st.status_id
            FROM information_object io
            LEFT JOIN object o ON io.id=o.id
            LEFT JOIN information_object p ON io.parent_id=p.id
            LEFT JOIN status st ON io.id=st.object_id AND st.type_id=158
            WHERE io.id<>1
            AND (o.id IS NULL OR io.parent_id IS NULL OR p.id IS NULL OR st.id IS NULL OR st.status_id IS NULL);';

        return QubitPdo::fetchAll(
            $sql,
            [],
            ['fetchMode' => PDO::FETCH_ASSOC]
        );
    }

    /**
     * Insert the object row for an existing resource row.
     *
     * @param int|string $id    id shared by the resource row and the new object row
     * @param string     $class value stored in object.class_name
     */
    private function insertObjectRow($id, $class)
    {
        $sql = 'INSERT INTO object
            (id, class_name, created_at, updated_at, serial_number)
            VALUES
            (:id, :class, now(), now(), 0);';
        QubitPdo::modify(
            $sql,
            [':id' => $id, ':class' => $class]
        );
    }

    /**
     * Append a description and all of its descendants to the given list,
     * descendants first, skipping ids that were already handled.
     *
     * Children are looked up through parent_id because the nested set values
     * can't be trusted at this point.
     *
     * @param int|string $id                          description id to add
     * @param array      $affectedIosAndDescendantIds list being built (by reference)
     * @param null|array $visitedIds                  ids already handled, as array keys (by
     *                                                reference); when omitted it is seeded
     *                                                from the ids already in the list
     */
    private function populateAffectedIosAndDescendantIds($id, &$affectedIosAndDescendantIds, &$visitedIds = null)
    {
        if (null === $visitedIds) {
            $visitedIds = array_fill_keys($affectedIosAndDescendantIds, true);
        }

        // Skip already added IOs. The id is marked before descending so that
        // a parent_id cycle in corrupt data can't cause endless recursion.
        if (isset($visitedIds[$id])) {
            return;
        }
        $visitedIds[$id] = true;

        // Find children
        $sql = 'SELECT id FROM information_object WHERE parent_id=:id;';
        $childrenIds = QubitPdo::fetchAll(
            $sql,
            [':id' => $id],
            ['fetchMode' => PDO::FETCH_COLUMN]
        );

        // Add descendants first
        foreach ($childrenIds as $childId) {
            $this->populateAffectedIosAndDescendantIds($childId, $affectedIosAndDescendantIds, $visitedIds);
        }

        $affectedIosAndDescendantIds[] = $id;
    }

    /**
     * Tell whether a string ends with the given (case-sensitive) pattern.
     *
     * @param string $string  string to inspect
     * @param string $pattern expected ending
     *
     * @return bool
     */
    private function stringEndsWith($string, $pattern)
    {
        $length = strlen($pattern);

        if (0 === $length) {
            return true;
        }

        // A negative offset beyond the start of the string is an error for
        // substr_compare() before PHP 8, so rule that case out first.
        if ($length > strlen($string)) {
            return false;
        }

        return 0 === substr_compare($string, $pattern, -$length);
    }

    /**
     * Write the CSV report: one line per affected description or descendant,
     * ancestors first, with the detected issues.
     *
     * @param string $filename                    report path ('.csv' is appended when missing)
     * @param array  $affectedIosById             affected description rows indexed by id
     * @param array  $affectedIosAndDescendantIds ids to report, children before parents
     *
     * @throws RuntimeException when the report file can't be opened for writing
     */
    private function report($filename, $affectedIosById, $affectedIosAndDescendantIds)
    {
        if (!($this->stringEndsWith($filename, '.csv') || $this->stringEndsWith($filename, '.CSV'))) {
            $filename = sprintf('%s.csv', $filename);
        }

        $csvFile = fopen($filename, 'w');
        if (false === $csvFile) {
            // Stop here: fix and delete modes must not run without a report
            throw new RuntimeException(sprintf("Unable to open '%s' for writing.", $filename));
        }

        fputcsv($csvFile, ['id', 'parent_id', 'slug', 'issue(s)']);

        // Reverse IOs to show ancestors first on the report
        foreach (array_reverse($affectedIosAndDescendantIds) as $id) {
            // Get current IO data
            $sql = 'SELECT io.id, io.parent_id, slug
                FROM information_object io
                LEFT JOIN slug ON io.id=slug.object_id
                WHERE io.id=:id;';
            $stmt = QubitPdo::prepareAndExecute($sql, [':id' => $id]);
            $result = $stmt->fetch(PDO::FETCH_NUM);

            if (false === $result) {
                // Row vanished since it was listed; keep the CSV columns aligned
                $result = [$id, null, null];
            }

            // Check issues
            $issues = [];
            if (isset($affectedIosById[$id])) {
                $io = $affectedIosById[$id];

                if (!isset($io['object_id'])) {
                    $issues[] = 'missing object row';
                }
                if (!isset($io['parent'])) {
                    $issues[] = 'parent does not exist';
                }
                if (!isset($io['parent_id'])) {
                    $issues[] = 'parent not set';
                }
                if (!isset($io['status_id']) || !isset($io['status'])) {
                    $issues[] = 'missing publication status';
                }
            } else {
                $issues[] = 'descendant';
            }

            $result[] = implode(' | ', $issues);
            fputcsv($csvFile, $result);
        }

        fclose($csvFile);
        $this->logSection('data-integrity-repair', sprintf("CSV generated: '%s'.\n", $filename));
    }

    /**
     * Repair every affected description in place: add the object row, attach
     * it to the root description and give it a draft publication status, as
     * needed.
     *
     * @param array $affectedIosById affected description rows indexed by id
     */
    private function fix($affectedIosById)
    {
        $count = 0;
        $this->logSection('data-integrity-repair', "Fixing descriptions ...\n");

        foreach ($affectedIosById as $id => $io) {
            // Fix missing object row
            if (!isset($io['object_id'])) {
                $this->insertObjectRow($id, 'QubitInformationObject');
            }

            // Set root IO as parent
            if (!isset($io['parent']) || !isset($io['parent_id'])) {
                $sql = 'UPDATE information_object SET parent_id=1 WHERE id=:id;';
                QubitPdo::modify($sql, [':id' => $id]);
            }

            if (!isset($io['status'])) {
                // Add publication status row
                $sql = "INSERT INTO status
                    (object_id, type_id, status_id, serial_number)
                    VALUES (:id, '158', '159', '0');";
                QubitPdo::modify($sql, [':id' => $id]);
            } elseif (!isset($io['status_id'])) {
                // Set publication status to draft
                $sql = 'UPDATE status SET status_id=159 WHERE type_id=158 AND object_id=:id;';
                QubitPdo::modify($sql, [':id' => $id]);
            }

            ++$count;
            if (0 === $count % 100) {
                $this->logSection('data-integrity-repair', sprintf("%d descriptions fixed ...\n", $count));
            }
        }

        $this->logSection('data-integrity-repair', sprintf("%d descriptions fixed.\n", $count));
    }

    /**
     * Delete every affected description together with its descendants.
     *
     * @param array $affectedIosById             affected description rows indexed by id
     * @param array $affectedIosAndDescendantIds ids to delete, children before parents
     */
    private function deleteDescriptions($affectedIosById, $affectedIosAndDescendantIds)
    {
        $count = 0;
        $this->logSection('data-integrity-repair', "Deleting descriptions ...\n");

        // Description trees are already flattened and reversed to avoid foreign key issues
        foreach ($affectedIosAndDescendantIds as $id) {
            // Fix object row if needed
            if (isset($affectedIosById[$id]) && !isset($affectedIosById[$id]['object_id'])) {
                $this->insertObjectRow($id, 'QubitInformationObject');
            }

            $io = QubitInformationObject::getById($id);
            if (null === $io) {
                // Nothing to delete through the ORM; leave it for the report
                $this->logSection('data-integrity-repair', sprintf("Description %s could not be loaded, skipping.\n", $id));

                continue;
            }

            // Delete IO without updating nested set
            $io->disableNestedSetUpdating = true;
            $io->delete();

            // Avoid high memory usage
            Qubit::clearClassCaches();

            ++$count;
            if (0 === $count % 100) {
                $this->logSection('data-integrity-repair', sprintf("%d descriptions deleted ...\n", $count));
            }
        }

        $this->logSection('data-integrity-repair', sprintf("%d descriptions deleted.\n", $count));
    }
}