From 3fe113645f1805564bb1bf8085b709644455947f Mon Sep 17 00:00:00 2001 From: Diana Shkolnikov Date: Thu, 13 Oct 2016 16:45:24 -0400 Subject: [PATCH 1/2] feat: check for preferred record when dupe found --- middleware/dedupe.js | 49 ++++++++++++- test/unit/middleware/dedupe.js | 130 +++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+), 2 deletions(-) diff --git a/middleware/dedupe.js b/middleware/dedupe.js index b40f18060..030827105 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -16,11 +16,41 @@ function dedupeResults(req, res, next) { var uniqueResults = []; _.some(res.data, function (hit) { - if (uniqueResults.length === 0 || _.every(uniqueResults, isDifferent.bind(null, hit)) ) { + + if (_.isEmpty(uniqueResults)) { uniqueResults.push(hit); } else { - logger.info('[dupe]', { query: req.clean.text, hit: hit.name.default + ' ' + hit.source + ':' + hit._id }); + // if there are multiple items in results, loop through them to find a dupe + // save off the index of the dupe if found + var dupeIndex = uniqueResults.findIndex(function (elem, index, array) { + return !isDifferent(elem, hit); + }); + + // if a dupe is not found, just add to results and move on + if (dupeIndex === -1) { + uniqueResults.push(hit); + } + // if dupe was found, we need to check which of the records is preferred + // since the order in which Elasticsearch returns identical text matches is arbitrary + // of course, if the new one is preferred we should replace previous with new + else if (isPreferred(uniqueResults[dupeIndex], hit)) { + logger.info('[dupe][replacing]', { + query: req.clean.text, + previous: uniqueResults[dupeIndex].source, + hit: hit.name.default + ' ' + hit.source + ':' + hit._id + }); + // replace previous dupe item with current hit + uniqueResults.splice(dupeIndex, 1, hit); + } + // if not preferred over existing, just log and move on + else { + logger.info('[dupe][skipping]', { + query: req.clean.text, + previous: uniqueResults[dupeIndex].source, + hit: hit.name.default + ' ' + hit.source + ':' + hit._id + }); + } } // stop looping when requested size has been reached in uniqueResults @@ -32,4 +62,19 @@ function dedupeResults(req, res, next) { next(); } +function isPreferred(existing, candidateReplacement) { + // NOTE: we are assuming here that the layer for both records is the same + + //bind the trumps function to the data items to keep the rest of the function clean + var trumpsFunc = trumps.bind(null, existing, candidateReplacement); + + return trumpsFunc('geonames', 'whosonfirst') || // WOF has bbox and is generally preferred + trumpsFunc('openstreetmap', 'openaddresses') || // addresses are better in OA + trumpsFunc('whosonfirst', 'openstreetmap'); // venues are better in OSM, at this time +} + +function trumps(existing, candidateReplacement, loserSource, winnerSource) { + return existing.source === loserSource && candidateReplacement.source === winnerSource; +} + module.exports = setup; diff --git a/test/unit/middleware/dedupe.js b/test/unit/middleware/dedupe.js index b81009558..291d404ab 100644 --- a/test/unit/middleware/dedupe.js +++ b/test/unit/middleware/dedupe.js @@ -58,6 +58,136 @@ module.exports.tests.dedupe = function(test, common) { }); }; +module.exports.tests.trump = function(test, common) { + test('whosonfirst trumps geonames, replace', function (t) { + var req = { + clean: { + text: 'Lancaster', + size: 100 + } + }; + var res = { + data: [ + { + 'name': { 'default': 'Lancaster' }, + 'source': 'geonames', + 'source_id': '123456', + 'layer': 'locality' + }, + { + 'name': { 'default': 'Lancaster' }, + 'source': 'whosonfirst', + 'source_id': '654321', + 'layer': 'locality' + } + ] + }; + + var expectedCount = 1; + dedupe(req, res, function () { + t.equal(res.data.length, expectedCount, 'results have fewer items than before'); + t.deepEqual(res.data[0].source, 'whosonfirst', 'whosonfirst result won'); + t.end(); + }); + }); + + test('whosonfirst trumps geonames, no replace', function (t) { + var req = { + clean: { + text: 'Lancaster', + size: 100 + } + }; + var res = { + data: [ + { + 'name': { 'default': 'Lancaster' }, + 'source': 'whosonfirst', + 'source_id': '123456', + 'layer': 'locality' + }, + { + 'name': { 'default': 'Lancaster' }, + 'source': 'geonames', + 'source_id': '654321', + 'layer': 'locality' + } + ] + }; + + var expectedCount = 1; + dedupe(req, res, function () { + t.equal(res.data.length, expectedCount, 'results have fewer items than before'); + t.deepEqual(res.data[0].source, 'whosonfirst', 'whosonfirst result won'); + t.end(); + }); + }); + + test('openstreetmap trumps whosonfirst venues', function (t) { + var req = { + clean: { + text: 'Lancaster Dairy Farm', + size: 100 + } + }; + var res = { + data: [ + { + 'name': { 'default': 'Lancaster Dairy Farm' }, + 'source': 'openstreetmap', + 'source_id': '123456', + 'layer': 'venue' + }, + { + 'name': { 'default': 'Lancaster Dairy Farm' }, + 'source': 'whosonfirst', + 'source_id': '654321', + 'layer': 'venue' + } + ] + }; + + var expectedCount = 1; + dedupe(req, res, function () { + t.equal(res.data.length, expectedCount, 'results have fewer items than before'); + t.deepEqual(res.data[0].source, 'openstreetmap', 'openstreetmap result won'); + t.end(); + }); + }); + + test('openaddresses trumps openstreetmap', function (t) { + var req = { + clean: { + text: '100 Main St', + size: 100 + } + }; + var res = { + data: [ + { + 'name': { 'default': '100 Main St' }, + 'source': 'openstreetmap', + 'source_id': '123456', + 'layer': 'address' + }, + { + 'name': { 'default': '100 Main St' }, + 'source': 'openaddresses', + 'source_id': '654321', + 'layer': 'address' + } + ] + }; + + var expectedCount = 1; + dedupe(req, res, function () { + t.equal(res.data.length, expectedCount, 'results have fewer items than before'); + t.deepEqual(res.data[0].source, 'openaddresses', 'openaddresses result won'); + t.end(); + }); + }); +}; + module.exports.all = function (tape, common) { function test(name, testFunction) { From a3fa49ee52d5ceec5164e5793102acdfcac3a56f Mon Sep 17 00:00:00 2001 From: Diana Shkolnikov Date: Mon, 17 Oct 2016 15:09:58 -0400 Subject: [PATCH 2/2] remove splice and replace with direct assignment --- middleware/dedupe.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/middleware/dedupe.js b/middleware/dedupe.js index 030827105..3b125d0da 100644 --- a/middleware/dedupe.js +++ b/middleware/dedupe.js @@ -41,7 +41,7 @@ function dedupeResults(req, res, next) { hit: hit.name.default + ' ' + hit.source + ':' + hit._id }); // replace previous dupe item with current hit - uniqueResults.splice(dupeIndex, 1, hit); + uniqueResults[dupeIndex] = hit; } // if not preferred over existing, just log and move on else {