Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/language_guessing'
Browse files Browse the repository at this point in the history
Conflicts:
	.gitignore
	composer.json
  • Loading branch information
cbleek committed Mar 29, 2019
2 parents 4dd7bcc + af10f71 commit b0ccd05
Show file tree
Hide file tree
Showing 11 changed files with 536 additions and 28 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ composer.lock
node_modules/
var/
package-lock.json
.idea/
66 changes: 62 additions & 4 deletions config/module.config.php
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,22 @@
'options' => [
'SimpleImport/Options/Module' => [
'class' => Options\ModuleOptions::class
]
],
Options\LanguageGuesserOptions::class => []
],
'service_manager' => [
'factories' => [
'SimpleImport/CrawlerProcessorManager' => Factory\CrawlerProcessor\ManagerFactory::class,
'SimpleImport/JobGeocodeLocation' => Factory\Job\GeocodeLocationFactory::class
'SimpleImport/JobGeocodeLocation' => Factory\Job\GeocodeLocationFactory::class,
Service\LanguageGuesser::class => Service\LanguageGuesserFactory::class,
]
],
'controllers' => [
'factories' => [
'SimpleImport/ConsoleController' => Factory\Controller\ConsoleControllerFactory::class,
Controller\DeleteCrawlerConsoleController::class => Factory\Controller\DeleteCrawlerConsoleControllerFactory::class,
Controller\UpdateCrawlerConsoleController::class => Factory\Controller\UpdateCrawlerConsoleControllerFactory::class,
Controller\GuessLanguageConsoleController::class => Factory\Controller\GuessLanguageConsoleControllerFactory::class,
]
],
'controller_plugins' => [
Expand All @@ -56,6 +59,31 @@
'siLoadCrawler' => Controller\Plugin\LoadCrawler::class,
],
],
'slm_queue' => [
'queues' => [
'simpleimport' => [
'collection' => 'simpleimport.queue',
],
],
'worker_strategies' => [
'queues' => [
'simpleimport' => [
\Core\Queue\Strategy\LogStrategy::class => ['log' => 'Log/SimpleImport/Queue'],
\SlmQueue\Strategy\ProcessQueueStrategy::class,
],
],
],
'queue_manager' => [
'factories' => [
'simpleimport' => \Core\Queue\MongoQueueFactory::class,
],
],
'job_manager' => [
'factories' => [
Queue\GuessLanguageJob::class => Queue\GuessLanguageJobFactory::class,
],
],
],

'console' => [
'router' => [
Expand Down Expand Up @@ -107,6 +135,15 @@
],
],
],
'simpleimport-guess-language' => [
'options' => [
'route' => 'simpleimport guess-language [--limit=]',
'defaults' => [
'controller' => Controller\GuessLanguageConsoleController::class,
'action' => 'index',
],
],
],
],
],
],
Expand All @@ -116,11 +153,32 @@
[
'name' => 'stream',
'options' => [
'stream' => __DIR__ . '/../../../var/log/simple-import.log'
'stream' => getcwd() . '/var/log/simple-import.log'
]
]
]
]
],
'Log/SimpleImport/Queue' => [
'writers' => [
[
'name' => 'stream',
'priority' => 1000,
'options' => [
'stream' => getcwd().'/var/log/simpleimport.queue.log',
'formatter' => [
'name' => 'simple',
'options' => [
'format' => '%timestamp% (%pid%) %priorityName%: %message% %extra%',
'dateTimeFormat' => 'd.m.Y H:i:s',
],
],
],
],
],
'processors' => [
['name' => \Core\Log\Processor\ProcessId::class],
],
],
],
'input_filters' => [
'factories' => [
Expand Down
69 changes: 69 additions & 0 deletions src/Controller/GuessLanguageConsoleController.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
<?php
/**
* YAWIK SimpleImport
*
* @filesource
* @license MIT
* @copyright 2013 - 2018 Cross Solution <http://cross-solution.de>
*/

/** */
namespace SimpleImport\Controller;

use Jobs\Entity\Status;
use SimpleImport\Queue\GuessLanguageJob;
use Zend\Console\ColorInterface;
use Zend\Mvc\Console\Controller\AbstractConsoleController;

/**
 * Console controller that schedules language guessing for jobs.
 *
 * Looks up jobs that have no language set and pushes one
 * {@see GuessLanguageJob} per job into the "simpleimport" queue.
 *
 * @author Mathias Gelhausen <[email protected]>
 */
class GuessLanguageConsoleController extends AbstractConsoleController
{
    /**
     * Number of jobs fetched when the --limit option is not given.
     *
     * Matches the value that was previously hard-coded in indexAction().
     */
    const DEFAULT_LIMIT = 10;

    /**
     * Repository used to build the query for jobs without a language.
     *
     * @var \Jobs\Repository\Job
     */
    private $repository;

    /**
     * Usage lines for this controller's console route.
     *
     * @return array
     */
    public static function getConsoleUsage()
    {
        return [
            'simpleimport guess-language [--limit]' => 'Find jobs without language set and pushes a guess-language job into the queue for each.',
            ['--limit=INT', 'Maximum number of jobs to fetch. 0 means fetch all.'],
            ''
        ];
    }

    /**
     * @param \Jobs\Repository\Job $repository
     */
    public function __construct(\Jobs\Repository\Job $repository)
    {
        $this->repository = $repository;
    }

    /**
     * Pushes a guess-language queue job for every job lacking a language.
     *
     * Only jobs whose status is ACTIVE, WAITING_FOR_APPROVAL or CREATED and
     * whose "language" field is missing or an empty string are considered.
     * The number of fetched jobs is capped by the --limit route parameter
     * (default: DEFAULT_LIMIT); a value of 0 fetches all matching jobs,
     * as advertised in the usage text.
     *
     * @return string|null
     */
    public function indexAction()
    {
        $limit = (int) $this->params('limit', self::DEFAULT_LIMIT);

        $qb = $this->repository->createQueryBuilder();
        $qb->field('status.name')->in([
            Status::ACTIVE, Status::WAITING_FOR_APPROVAL, Status::CREATED
        ]);
        $qb->addOr(
            $qb->expr()->field('language')->exists(false),
            $qb->expr()->field('language')->equals('')
        );

        // Only a positive limit restricts the result; 0 (and anything
        // non-positive) means "no limit".
        if ($limit > 0) {
            $qb->limit($limit);
        }

        $jobs = $qb->getQuery()->execute();

        if (!count($jobs)) {
            echo "Nothing to do. No jobs without language found.\n\n";
            return;
        }

        $queue = $this->queue('simpleimport');
        foreach ($jobs as $job) {
            $queue->push(GuessLanguageJob::class, ['jobId' => $job->getId()]);
            printf('Pushed job %s in the queue.' . PHP_EOL, $job->getId());
        }
    }
}
33 changes: 32 additions & 1 deletion src/CrawlerProcessor/JobProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
*/
namespace SimpleImport\CrawlerProcessor;

use Jobs\Entity\Job;
use SimpleImport\Entity\Crawler;
use SimpleImport\DataFetch\JsonFetch;
use SimpleImport\DataFetch\PlainTextFetch;
use SimpleImport\Queue\GuessLanguageJob;
use Zend\Json\Json;
use Zend\Log\LoggerInterface;
use SimpleImport\Entity\Item;
Expand Down Expand Up @@ -48,6 +50,11 @@ class JobProcessor implements ProcessorInterface
* @var InputFilterInterface
*/
private $dataInputFilter;

/**
* @var \SlmQueue\Controller\Plugin\QueuePlugin
*/
private $queuePlugin;

/**
* @param JsonFetch $jsonFetch
Expand All @@ -69,7 +76,22 @@ public function __construct(
$this->jobHydrator = $jobHydrator;
$this->dataInputFilter = $dataInputFilter;
}


/**
* Injects the queue plugin this processor uses to push language-guessing jobs.
*
* The plugin is stored on the processor and then invoked once with the
* queue name 'simpleimport'. Without an injected plugin, guessLanguage()
* does nothing.
*
* @param \SlmQueue\Controller\Plugin\QueuePlugin $queuePlugin
*
* @return self
*/
public function setQueuePlugin(\SlmQueue\Controller\Plugin\QueuePlugin $queuePlugin)
{
$this->queuePlugin = $queuePlugin;
// NOTE(review): presumably invoking the plugin selects the 'simpleimport'
// queue as the target of later push() calls - confirm against SlmQueue's
// QueuePlugin::__invoke(). The return value is intentionally not used here.
$queuePlugin('simpleimport');

return $this;
}



/**
* {@inheritDoc}
* @see \SimpleImport\CrawlerProcessor\ProcessorInterface::execute()
Expand Down Expand Up @@ -164,6 +186,7 @@ private function syncChanges(Crawler $crawler, Result $result, LoggerInterface $
// update the job
$job->setStatus($crawler->getOptions()->getRecoverState());
$this->jobHydrator->hydrate($item->getImportData(), $job);
$this->guessLanguage($job);
$result->incrementUpdated();
}
} else {
Expand Down Expand Up @@ -202,6 +225,7 @@ private function syncChanges(Crawler $crawler, Result $result, LoggerInterface $
if (false !== $plainText) { $job->setMetaData('plainText', $plainText); }
$this->jobHydrator->hydrate($importData, $job);
$this->jobRepository->store($job);
$this->guessLanguage($job);
$item->setDocumentId($job->getId());
$result->incrementInserted();
}
Expand All @@ -224,4 +248,11 @@ private function formatMessages(array $messages)

return $formatted;
}

/**
 * Queues a language-guessing job for the given job, if needed.
 *
 * Nothing is pushed when no queue plugin was injected or when the job
 * already reports a language.
 *
 * @param Job $job
 */
private function guessLanguage(Job $job)
{
    $queueAvailable = (bool) $this->queuePlugin;

    // The language is only inspected when a queue is available
    // (short-circuit), exactly like the guard it replaces.
    if ($queueAvailable && !$job->getLanguage()) {
        $payload = ['jobId' => $job->getId()];
        $this->queuePlugin->push(GuessLanguageJob::class, $payload);
    }
}
}
32 changes: 32 additions & 0 deletions src/Factory/Controller/GuessLanguageConsoleControllerFactory.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<?php
/**
* YAWIK-SimpleImport
*
* @filesource
* @license MIT
* @copyright 2013 - 2019 Cross Solution <http://cross-solution.de>
*/

/** */
namespace SimpleImport\Factory\Controller;

use SimpleImport\Controller\GuessLanguageConsoleController;
use Interop\Container\ContainerInterface;
use Zend\ServiceManager\Factory\FactoryInterface;

/**
 * Builds the console controller that schedules language guessing.
 *
 * @see \SimpleImport\Controller\GuessLanguageConsoleController
 *
 * @author Mathias Gelhausen <[email protected]>
 * @todo write test
 */
class GuessLanguageConsoleControllerFactory implements FactoryInterface
{
    /**
     * Creates the controller with the "Jobs" repository injected.
     *
     * The repository is looked up through the 'repositories' service of
     * the container.
     *
     * @param ContainerInterface $container
     * @param string             $requestedName
     * @param array|null         $options
     *
     * @return GuessLanguageConsoleController
     */
    public function __invoke(ContainerInterface $container, $requestedName, array $options = null)
    {
        $repositories  = $container->get('repositories');
        $jobRepository = $repositories->get('Jobs');

        return new GuessLanguageConsoleController($jobRepository);
    }
}
50 changes: 27 additions & 23 deletions src/Module.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
namespace SimpleImport;

use Core\ModuleManager\ModuleConfigLoader;
use SimpleImport\Controller\GuessLanguageConsoleController;
use Zend\ModuleManager\Feature\ConsoleBannerProviderInterface;
use Zend\ModuleManager\Feature\DependencyIndicatorInterface;
use Zend\Console\Adapter\AdapterInterface as Console;
Expand Down Expand Up @@ -53,28 +54,31 @@ public function getConsoleBanner(Console $console)
*/
public function getConsoleUsage(Console $console)
{
    // The module's own usage lines come first. The guess-language usage is
    // supplied by its controller and appended, so that text lives next to
    // the command it describes. A stale pre-merge "return [...]" used to
    // precede this statement and made the merged version unreachable.
    return array_merge(
        [
            'Simple import operations',
            'simpleimport import [--limit] [--name] [--id]' => 'Executes a data import for all registered crawlers',
            'simpleimport add-crawler --name --organization --feed-uri [--runDelay] [--type] [--jobInitialState] [--jobRecoverState]' => 'Adds a new import crawler',
            ['--limit=INT', 'Number of crawlers to check per run. Default 3. 0 means no limit'],
            ['--name=STRING', 'The name of a crawler'],
            ['--id=STRING', 'The Mongo object id of a crawler'],
            // Fixed help-text typo: was "--organization==STRING".
            ['--organization=STRING', 'The ID of an organization'],
            ['--feed-uri=STRING', 'The URI pointing to a data to import'],
            ['--runDelay=INT', 'The number of minutes the next import run will be proceeded again'],
            ['--type=STRING', 'The type of an import (e.g. job)'],
            ['--jobInitialState=STRING', 'The initial state of an imported job'],
            ['--jobRecoverState=STRING', 'The state a job gets, if it was deleted, but found again in later runs.'],
            '',
            'simpleimport info' => 'Displays a list of all available crawlers.',
            'simpleimport info [--id] <name>' => 'Shows information for a crawler',
            // NOTE(review): "--jobInitalState" differs from "--jobInitialState"
            // used by add-crawler; left untouched until the route definition
            // is checked.
            'simpleimport update-crawler [--id] <name> [--rename] [--limit] [--organization] [--feed-uri] [--runDelay] [--type] [--jobInitalState] [--jobRecoverState]'
                => 'Updates configuration for a crawler. ',
            'simpleimport delete-crawler [--id] <name>' => 'Deletes an import crawler',
            ['<name>', 'The name of the crawler to delete.'],
            ['--id', 'Treat <name> as the MongoID of the crawler'],
            ['--rename=STRING', 'Set a new name for the crawler.'],
            '',
        ],
        GuessLanguageConsoleController::getConsoleUsage()
    );
}
}
Loading

0 comments on commit b0ccd05

Please sign in to comment.