From 7c1fc5a79e89da17a95f8bd840453c047822f69b Mon Sep 17 00:00:00 2001
From: Peter Johnson
Date: Mon, 9 Dec 2024 14:26:51 +0100
Subject: [PATCH 1/2] feat(parser): remove URL regex caret anchor

---
 stream/parser.js      |  8 +++-----
 test/stream/parser.js | 21 +++++++++++++++++++++
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/stream/parser.js b/stream/parser.js
index 4c3bc68..fbda09f 100644
--- a/stream/parser.js
+++ b/stream/parser.js
@@ -56,11 +56,9 @@ function selectName( names ){
   // filter out URLs
   // then return the longest name
   // @todo: can we improve this logic?
-  return names.filter( function ( name) {
-    return !name.match(/^http(s)?:\/\//);
-  }).reduce( function( a, b ){
-    return a.length > b.length ? a : b;
-  }, '');
+  return names
+    .filter(name => !/http(s)?:\/\//.test(name))
+    .reduce((a, b) => a.length > b.length ? a : b, '');
 }

 module.exports = parser;
diff --git a/test/stream/parser.js b/test/stream/parser.js
index 7493272..1345409 100644
--- a/test/stream/parser.js
+++ b/test/stream/parser.js
@@ -111,6 +111,27 @@ module.exports.tests.filter_url = function(test, common) {
     stream.write(row);
     stream.end();
   });
+
+  // real-world example where the URL was included with a valid name
+  // (ie. was preceded by a space rather than a NULL character).
+  test('parse: filter URL within name', (t) => {
+    const stream = parser(6);
+    const row = [
+      'i{s~{AqubwJ{TxV{BlDmBnCiGhJgCbCs@dAaCfHmAnCoBpB',
+      'Sentier des Chasupes',
+      'Mairie Bouxières http://www.mairie-bouxieres-aux-dames.fr/wp-content/uploads/2005/01/Les-sentiers-de-Bouxi%C3%A8res-aux-Dames.pdf',
+    ].join('\0');
+    const expected = 'Sentier des Chasupes';
+
+    const assert = ( actual, enc, next ) => {
+      t.deepEqual( actual.properties.name, expected, 'longest non-URL name selected' );
+      next();
+    };
+
+    stream.pipe( through.obj( assert, () => t.end() ) );
+    stream.write(row);
+    stream.end();
+  });
 };

 module.exports.tests.filter_only_url = function(test, common) {

From e1f80ec6f2a601612a26ea772edabeb7270b2cd0 Mon Sep 17 00:00:00 2001
From: Peter Johnson
Date: Mon, 9 Dec 2024 14:40:17 +0100
Subject: [PATCH 2/2] feat(parser): remove URLs from names (ie. try to save them)

---
 stream/parser.js      |  9 ++++-----
 test/stream/parser.js | 20 ++++++++++++++++++++
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/stream/parser.js b/stream/parser.js
index fbda09f..c7f7399 100644
--- a/stream/parser.js
+++ b/stream/parser.js
@@ -53,12 +53,11 @@ function parser( precision ){
 // each connected road can have one or more names
 // we select one name to be the default.
 function selectName( names ){
-  // filter out URLs
-  // then return the longest name
-  // @todo: can we improve this logic?
+  // remove URLs then return the longest name
   return names
-    .filter(name => !/http(s)?:\/\//.test(name))
-    .reduce((a, b) => a.length > b.length ? a : b, '');
+    .map(name => name.replace(/(?:https?|ftp):\/\/\S*/g, '').trim())
+    .sort((a, b) => b.length - a.length)
+    .at(0);
 }

 module.exports = parser;
diff --git a/test/stream/parser.js b/test/stream/parser.js
index 1345409..a707d0a 100644
--- a/test/stream/parser.js
+++ b/test/stream/parser.js
@@ -132,6 +132,26 @@ module.exports.tests.filter_url = function(test, common) {
     stream.write(row);
     stream.end();
   });
+
+  test('parse: URL removal', (t) => {
+    const stream = parser(6);
+    const row = [
+      'i{s~{AqubwJ{TxV{BlDmBnCiGhJgCbCs@dAaCfHmAnCoBpB',
+      'http://foo.com/bar.pdf',
+      'Short Example https://foo.com/bar.pdf',
+      'Longer Example ftp://foo.com/bar.pdf',
+    ].join('\0');
+    const expected = 'Longer Example';
+
+    const assert = ( actual, enc, next ) => {
+      t.deepEqual( actual.properties.name, expected, 'longest non-URL name selected' );
+      next();
+    };
+
+    stream.pipe( through.obj( assert, () => t.end() ) );
+    stream.write(row);
+    stream.end();
+  });
 };

 module.exports.tests.filter_only_url = function(test, common) {
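
For reference, below is a minimal standalone sketch (not part of either patch) of how selectName() behaves after PATCH 2/2: URLs are stripped from every candidate name and the longest remainder is returned, instead of discarding any name that contains a URL as PATCH 1/2 did. The function body mirrors the patched code; the sample inputs reuse the values from the new 'parse: URL removal' test.

// selectName() as of PATCH 2/2: strip URLs, keep the longest remainder.
function selectName( names ){
  return names
    .map(name => name.replace(/(?:https?|ftp):\/\/\S*/g, '').trim())
    .sort((a, b) => b.length - a.length)
    .at(0);
}

// Sample inputs taken from the 'parse: URL removal' test above.
// The PATCH 1/2 filter would have dropped the first two names (http/https)
// and returned the third with its ftp:// URL still embedded; PATCH 2/2
// strips the URL and returns only the readable prefix.
console.log(selectName([
  'http://foo.com/bar.pdf',
  'Short Example https://foo.com/bar.pdf',
  'Longer Example ftp://foo.com/bar.pdf',
])); // => 'Longer Example'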