From 7c1fc5a79e89da17a95f8bd840453c047822f69b Mon Sep 17 00:00:00 2001
From: Peter Johnson
Date: Mon, 9 Dec 2024 14:26:51 +0100
Subject: [PATCH 1/2] feat(parser): remove URL regex caret anchor

---
 stream/parser.js      |  8 +++-----
 test/stream/parser.js | 21 +++++++++++++++++++++
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/stream/parser.js b/stream/parser.js
index 4c3bc68..fbda09f 100644
--- a/stream/parser.js
+++ b/stream/parser.js
@@ -56,11 +56,9 @@ function selectName( names ){
   // filter out URLs
   // then return the longest name
   // @todo: can we improve this logic?
-  return names.filter( function ( name) {
-    return !name.match(/^http(s)?:\/\//);
-  }).reduce( function( a, b ){
-    return a.length > b.length ? a : b;
-  }, '');
+  return names
+    .filter(name => !/http(s)?:\/\//.test(name))
+    .reduce((a, b) => a.length > b.length ? a : b, '');
 }

 module.exports = parser;
diff --git a/test/stream/parser.js b/test/stream/parser.js
index 7493272..1345409 100644
--- a/test/stream/parser.js
+++ b/test/stream/parser.js
@@ -111,6 +111,27 @@ module.exports.tests.filter_url = function(test, common) {
     stream.write(row);
     stream.end();
   });
+
+  // real-world example where the URL was included with a valid name
+  // (ie. was preceded by a space rather than a NULL character).
+  test('parse: filter URL within name', (t) => {
+    const stream = parser(6);
+    const row = [
+      'i{s~{AqubwJ{TxV{BlDmBnCiGhJgCbCs@dAaCfHmAnCoBpB',
+      'Sentier des Chasupes',
+      'Mairie Bouxières http://www.mairie-bouxieres-aux-dames.fr/wp-content/uploads/2005/01/Les-sentiers-de-Bouxi%C3%A8res-aux-Dames.pdf',
+    ].join('\0');
+    const expected = 'Sentier des Chasupes';
+
+    const assert = ( actual, enc, next ) => {
+      t.deepEqual( actual.properties.name, expected, 'longest non-URL name selected' );
+      next();
+    };
+
+    stream.pipe( through.obj( assert, () => t.end() ) );
+    stream.write(row);
+    stream.end();
+  });
 };

 module.exports.tests.filter_only_url = function(test, common) {

From e1f80ec6f2a601612a26ea772edabeb7270b2cd0 Mon Sep 17 00:00:00 2001
From: Peter Johnson
Date: Mon, 9 Dec 2024 14:40:17 +0100
Subject: [PATCH 2/2] feat(parser): remove URLs from names (ie. try to save them)

---
 stream/parser.js      |  9 ++++-----
 test/stream/parser.js | 20 ++++++++++++++++++++
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/stream/parser.js b/stream/parser.js
index fbda09f..c7f7399 100644
--- a/stream/parser.js
+++ b/stream/parser.js
@@ -53,12 +53,11 @@ function parser( precision ){
 // each connected road can have one or more names
 // we select one name to be the default.
 function selectName( names ){
-  // filter out URLs
-  // then return the longest name
-  // @todo: can we improve this logic?
+  // remove URLs then return the longest name
   return names
-    .filter(name => !/http(s)?:\/\//.test(name))
-    .reduce((a, b) => a.length > b.length ? a : b, '');
+    .map(name => name.replace(/(?:https?|ftp):\/\/\S*/g, '').trim())
+    .sort((a, b) => b.length - a.length)
+    .at(0);
 }

 module.exports = parser;
diff --git a/test/stream/parser.js b/test/stream/parser.js
index 1345409..a707d0a 100644
--- a/test/stream/parser.js
+++ b/test/stream/parser.js
@@ -132,6 +132,26 @@ module.exports.tests.filter_url = function(test, common) {
     stream.write(row);
     stream.end();
   });
+
+  test('parse: URL removal', (t) => {
+    const stream = parser(6);
+    const row = [
+      'i{s~{AqubwJ{TxV{BlDmBnCiGhJgCbCs@dAaCfHmAnCoBpB',
+      'http://foo.com/bar.pdf',
+      'Short Example https://foo.com/bar.pdf',
+      'Longer Example ftp://foo.com/bar.pdf',
+    ].join('\0');
+    const expected = 'Longer Example';
+
+    const assert = ( actual, enc, next ) => {
+      t.deepEqual( actual.properties.name, expected, 'longest non-URL name selected' );
+      next();
+    };
+
+    stream.pipe( through.obj( assert, () => t.end() ) );
+    stream.write(row);
+    stream.end();
+  });
 };

 module.exports.tests.filter_only_url = function(test, common) {
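
For reference, below is a minimal standalone sketch (not part of either patch) of how selectName() behaves after PATCH 2/2: URLs are stripped from every candidate name and the longest remainder is returned, instead of discarding any name that contains a URL as PATCH 1/2 did. The function body mirrors the patched code; the sample inputs reuse the values from the new 'parse: URL removal' test.

// selectName() as of PATCH 2/2: strip URLs, keep the longest remainder.
function selectName( names ){
  return names
    .map(name => name.replace(/(?:https?|ftp):\/\/\S*/g, '').trim())
    .sort((a, b) => b.length - a.length)
    .at(0);
}

// Sample inputs taken from the 'parse: URL removal' test above.
// The PATCH 1/2 filter would have dropped the first two names (http/https)
// and returned the third with its ftp:// URL still embedded; PATCH 2/2
// strips the URL and returns only the readable prefix.
console.log(selectName([
  'http://foo.com/bar.pdf',
  'Short Example https://foo.com/bar.pdf',
  'Longer Example ftp://foo.com/bar.pdf',
])); // => 'Longer Example'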