From 015bb012df7853c6ca4faddd78e91e5be69523c6 Mon Sep 17 00:00:00 2001 From: Andre Wyrwa Date: Sun, 16 Sep 2018 12:10:00 +1000 Subject: [PATCH 1/2] Enable lastname remapping - fixes #11 Names like "SUJAN MASTER", "JAMES J MA", "PETER K MA" had the 'Master' or 'Ma' parts as special parts (salutations, suffixes) where they should be lastnames. In "PAUL M LEWIS MR", the lastname should be 'Lewis Mr'. This change does three things to fix this: Firstly, it prevents parsing for salutations beyond the first half of words in the given string. It also introduces a `setMaxSalutationIndex()` method to allow overriding this with a fixed maximum word index. E.g. setting it to 2 will require salutations to appear in the first two words. Secondly, if the lastname mapper does not derive a lastname, but has skipped ignored parts like suffix, nickname or salutation, it will convert these into lastname parts. Thirdly, the lastname mapper will now map more than one lastname part if the already mapped lastname parts are shorter than 3 characters and there will be at least one part left after mapping. This effectively maps 'Lewis' in 'Paul M Lewis Mr' as lastname instead of previously as middlename. --- README.md | 11 +++++ src/Mapper/LastnameMapper.php | 66 +++++++++++++++++++++++++++- src/Mapper/SalutationMapper.php | 11 ++++- src/Parser.php | 30 +++++++++++-- tests/Mapper/FirstnameMapperTest.php | 10 +++++ tests/ParserTest.php | 33 +++++++++++++- 6 files changed, 153 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 3500f1d..6f7eafc 100644 --- a/README.md +++ b/README.md @@ -124,6 +124,17 @@ $parser = new TheIconic\NameParser\Parser(); $parser->setWhitespace("\t _."); ``` +### Limiting the position of salutations +```php +$parser = new TheIconic\NameParser\Parser(); +$parser->setMaxSalutationIndex(2); +``` +This will require salutations to appear within the +first two words of the given input string. 
+This defaults to half the number of words in the input string (rounded down), +meaning that effectively the salutation may occur within +the first half of the name parts. + ## License THE ICONIC Name Parser library for PHP is released under the MIT License. diff --git a/src/Mapper/LastnameMapper.php b/src/Mapper/LastnameMapper.php index 7ab8f8a..ecf95c2 100644 --- a/src/Mapper/LastnameMapper.php +++ b/src/Mapper/LastnameMapper.php @@ -42,15 +42,19 @@ public function map(array $parts): array } /** + * we map the parts in reverse order because it makes more + * sense to parse for the lastname starting from the end + * * @param array $parts * @return array */ protected function mapReversedParts(array $parts): array { $length = count($parts); + $remapIgnored = true; foreach ($parts as $k => $part) { - if ($part instanceof Suffix || $part instanceof Nickname || $part instanceof Salutation) { + if ($this->isIgnoredPart($part)) { continue; } @@ -66,6 +70,66 @@ protected function mapReversedParts(array $parts): array $parts[$k] = new LastnamePrefix($part, $this->prefixes[$this->getKey($part)]); continue; } + + if ($this->shouldStopMapping($parts, $k)) { + break; + } + } + + $parts[$k] = new Lastname($part); + $remapIgnored = false; + } + + if ($remapIgnored) { + $parts = $this->remapIgnored($parts); + } + + return $parts; + } + + /** + * indicates if we should stop mapping at the give index $k + * + * the assumption is that lastname parts have already been found + * but we want to see if we should add more parts + * + * @param array $parts + * @param int $k + * @return bool + */ + protected function shouldStopMapping(array $parts, int $k): bool + { + if ($k + 2 > count($parts)) { + return true; + } + + return strlen($parts[$k - 1]->getValue()) >= 3; + } + + /** + * indicates if the given part should be ignored (skipped) during mapping + * + * @param $part + * @return bool + */ + protected function isIgnoredPart($part) { + return $part instanceof Suffix || $part instanceof Nickname || 
$part instanceof Salutation; + } + + /** + * remap ignored parts as lastname + * + * if the mapping did not derive any lastname this is called to transform + * any previously ignored parts into lastname parts + * the parts array is still reversed at this point + * + * @param array $parts + * @return array + */ + protected function remapIgnored(array $parts): array + { + foreach ($parts as $k => $part) { + if (!$this->isIgnoredPart($part)) { break; } diff --git a/src/Mapper/SalutationMapper.php b/src/Mapper/SalutationMapper.php index 239ba46..5486fe3 100644 --- a/src/Mapper/SalutationMapper.php +++ b/src/Mapper/SalutationMapper.php @@ -9,9 +9,12 @@ class SalutationMapper extends AbstractMapper { protected $salutations = []; - public function __construct(array $salutations) + protected $maxIndex = 0; + + public function __construct(array $salutations, $maxIndex = 0) { $this->salutations = $salutations; + $this->maxIndex = $maxIndex; } /** @@ -22,7 +25,11 @@ public function __construct(array $salutations) */ public function map(array $parts): array { - foreach ($parts as $k => $part) { + $max = ($this->maxIndex > 0) ? 
$this->maxIndex : floor(count($parts) / 2); + + for ($k = 0; $k < $max; $k++) { + $part = $parts[$k]; + if ($part instanceof AbstractPart) { break; } diff --git a/src/Parser.php b/src/Parser.php index c73f3fe..a99f5fb 100644 --- a/src/Parser.php +++ b/src/Parser.php @@ -33,6 +33,11 @@ class Parser */ protected $nicknameDelimiters = []; + /** + * @var int + */ + protected $maxSalutationIndex = 0; + public function __construct(array $languages = []) { if (empty($languages)) { @@ -99,7 +104,7 @@ protected function getFirstSegmentParser(): Parser $parser = new Parser(); $parser->setMappers([ - new SalutationMapper($this->getSalutations()), + new SalutationMapper($this->getSalutations(), $this->getMaxSalutationIndex()), new SuffixMapper($this->getSuffixes()), new LastnameMapper($this->getPrefixes(), true), new FirstnameMapper(), @@ -117,7 +122,7 @@ protected function getSecondSegmentParser(): Parser $parser = new Parser(); $parser->setMappers([ - new SalutationMapper($this->getSalutations()), + new SalutationMapper($this->getSalutations(), $this->getMaxSalutationIndex()), new SuffixMapper($this->getSuffixes(), true), new NicknameMapper($this->getNicknameDelimiters()), new InitialMapper(true), @@ -149,7 +154,7 @@ public function getMappers(): array if (empty($this->mappers)) { $this->setMappers([ new NicknameMapper($this->getNicknameDelimiters()), - new SalutationMapper($this->getSalutations()), + new SalutationMapper($this->getSalutations(), $this->getMaxSalutationIndex()), new SuffixMapper($this->getSuffixes()), new InitialMapper(), new LastnameMapper($this->getPrefixes()), @@ -275,4 +280,23 @@ public function setNicknameDelimiters(array $nicknameDelimiters): Parser return $this; } + + /** + * @return int + */ + public function getMaxSalutationIndex(): int + { + return $this->maxSalutationIndex; + } + + /** + * @param int $maxSalutationIndex + * @return Parser + */ + public function setMaxSalutationIndex(int $maxSalutationIndex): Parser + { + $this->maxSalutationIndex 
= $maxSalutationIndex; + + return $this; + } } diff --git a/tests/Mapper/FirstnameMapperTest.php b/tests/Mapper/FirstnameMapperTest.php index 21ff38b..cf9defb 100644 --- a/tests/Mapper/FirstnameMapperTest.php +++ b/tests/Mapper/FirstnameMapperTest.php @@ -48,6 +48,16 @@ public function provider() new Lastname('Pan'), ], ], + [ + 'input' => [ + 'Alfonso', + new Salutation('Mr'), + ], + 'expectation' => [ + new Firstname('Alfonso'), + new Salutation('Mr'), + ] + ] ]; } diff --git a/tests/ParserTest.php b/tests/ParserTest.php index aca8426..dc6e5be 100644 --- a/tests/ParserTest.php +++ b/tests/ParserTest.php @@ -434,10 +434,24 @@ public function provider() [ 'PAUL M LEWIS MR', [ - 'salutation' => 'Mr.', 'firstname' => 'Paul', 'initials' => 'M', - 'lastname' => 'Lewis', + 'lastname' => 'Lewis Mr', + ] + ], + [ + 'SUJAN MASTER', + [ + 'firstname' => 'Sujan', + 'lastname' => 'Master', + ], + ], + [ + 'JAMES J MA', + [ + 'firstname' => 'James', + 'initials' => 'J', + 'lastname' => 'Ma' ] ] ]; @@ -494,4 +508,19 @@ public function testSetGetNicknameDelimiters() $this->assertSame('Jim', $parser->parse('[Jim]')->getNickname()); $this->assertNotSame('Jim', $parser->parse('(Jim)')->getNickname()); } + + public function testSetMaxSalutationIndex() + { + $parser = new Parser(); + $this->assertSame(0, $parser->getMaxSalutationIndex()); + $parser->setMaxSalutationIndex(1); + $this->assertSame(1, $parser->getMaxSalutationIndex()); + $this->assertSame('', $parser->parse('Francis Mr')->getSalutation()); + + $parser = new Parser(); + $this->assertSame(0, $parser->getMaxSalutationIndex()); + $parser->setMaxSalutationIndex(2); + $this->assertSame(2, $parser->getMaxSalutationIndex()); + $this->assertSame('Mr.', $parser->parse('Francis Mr')->getSalutation()); + } } From c8501749a499bdb548002d40bca76ac7d7dc215c Mon Sep 17 00:00:00 2001 From: Andre Wyrwa Date: Sun, 16 Sep 2018 12:45:43 +1000 Subject: [PATCH 2/2] Simplify lastname parser by getting rid of array reversals --- 
src/Mapper/LastnameMapper.php | 56 +++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/src/Mapper/LastnameMapper.php b/src/Mapper/LastnameMapper.php index ecf95c2..05d0d11 100644 --- a/src/Mapper/LastnameMapper.php +++ b/src/Mapper/LastnameMapper.php @@ -34,11 +34,7 @@ public function map(array $parts): array return $parts; } - $parts = array_reverse($parts); - - $parts = $this->mapReversedParts($parts); - - return array_reverse($parts); + return $this->mapParts($parts); } /** @@ -48,25 +44,20 @@ public function map(array $parts): array * @param array $parts * @return array */ - protected function mapReversedParts(array $parts): array + protected function mapParts(array $parts): array { - $length = count($parts); + $k = $this->skipIgnoredParts($parts) + 1; $remapIgnored = true; - foreach ($parts as $k => $part) { - if ($this->isIgnoredPart($part)) { - continue; - } + while (--$k >= 0) { + $part = $parts[$k]; if ($part instanceof AbstractPart) { break; } - $originalIndex = $length - $k - 1; - $originalParts = array_reverse($parts); - - if ($this->isFollowedByLastnamePart($originalParts, $originalIndex)) { - if ($this->isApplicablePrefix($originalParts, $originalIndex)) { + if ($this->isFollowedByLastnamePart($parts, $k)) { + if ($this->isApplicablePrefix($parts, $k)) { $parts[$k] = new LastnamePrefix($part, $this->prefixes[$this->getKey($part)]); continue; } @@ -87,6 +78,25 @@ protected function mapReversedParts(array $parts): array return $parts; } + /** + * skip through the parts we want to ignore and return the start index + * + * @param array $parts + * @return int + */ + protected function skipIgnoredParts(array $parts): int + { + $k = count($parts); + + while (--$k >= 0) { + if (!$this->isIgnoredPart($parts[$k])) { + break; + } + } + + return $k; + } + /** * indicates if we should stop mapping at the give index $k * @@ -99,11 +109,15 @@ protected function mapReversedParts(array $parts): array */ protected function 
shouldStopMapping(array $parts, int $k): bool { - if ($k + 2 > count($parts)) { + if ($k < 1) { return true; } - return strlen($parts[$k - 1]->getValue()) >= 3; + if ($parts[$k + 1] instanceof LastnamePrefix) { + return true; + } + + return strlen($parts[$k + 1]->getValue()) >= 3; } /** @@ -128,7 +142,11 @@ protected function isIgnoredPart($part) { */ protected function remapIgnored(array $parts): array { - foreach ($parts as $k => $part) { + $k = count($parts); + + while (--$k >= 0) { + $part = $parts[$k]; + if (!$this->isIgnoredPart($part)) { break; }