Skip to content

Commit

Permalink
Merge pull request #687 from pelias/smart-dedupe
Browse files Browse the repository at this point in the history
Rock-Paper-Scissors: check which record is preferred when dupe is found
  • Loading branch information
Diana Shkolnikov authored Oct 17, 2016
2 parents 0f2cbde + a3fa49e commit bcd25ce
Show file tree
Hide file tree
Showing 2 changed files with 177 additions and 2 deletions.
49 changes: 47 additions & 2 deletions middleware/dedupe.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,41 @@ function dedupeResults(req, res, next) {
var uniqueResults = [];

_.some(res.data, function (hit) {
if (uniqueResults.length === 0 || _.every(uniqueResults, isDifferent.bind(null, hit)) ) {

if (_.isEmpty(uniqueResults)) {
uniqueResults.push(hit);
}
else {
logger.info('[dupe]', { query: req.clean.text, hit: hit.name.default + ' ' + hit.source + ':' + hit._id });
// if there are multiple items in results, loop through them to find a dupe
// save off the index of the dupe if found
var dupeIndex = uniqueResults.findIndex(function (elem, index, array) {
return !isDifferent(elem, hit);
});

// if a dupe is not found, just add to results and move on
if (dupeIndex === -1) {
uniqueResults.push(hit);
}
// if dupe was found, we need to check which of the records is preferred
// since the order in which Elasticsearch returns identical text matches is arbitrary
// of course, if the new one is preferred we should replace previous with new
else if (isPreferred(uniqueResults[dupeIndex], hit)) {
logger.info('[dupe][replacing]', {
query: req.clean.text,
previous: uniqueResults[dupeIndex].source,
hit: hit.name.default + ' ' + hit.source + ':' + hit._id
});
// replace previous dupe item with current hit
uniqueResults[dupeIndex] = hit;
}
// if not preferred over existing, just log and move on
else {
logger.info('[dupe][skipping]', {
query: req.clean.text,
previous: uniqueResults[dupeIndex].source,
hit: hit.name.default + ' ' + hit.source + ':' + hit._id
});
}
}

// stop looping when requested size has been reached in uniqueResults
Expand All @@ -32,4 +62,19 @@ function dedupeResults(req, res, next) {
next();
}

function isPreferred(existing, candidateReplacement) {
// NOTE: we are assuming here that the layer for both records is the same

//bind the trumps function to the data items to keep the rest of the function clean
var trumpsFunc = trumps.bind(null, existing, candidateReplacement);

return trumpsFunc('geonames', 'whosonfirst') || // WOF has bbox and is generally preferred
trumpsFunc('openstreetmap', 'openaddresses') || // addresses are better in OA
trumpsFunc('whosonfirst', 'openstreetmap'); // venues are better in OSM, at this time
}

function trumps(existing, candidateReplacement, loserSource, winnerSource) {
return existing.source === loserSource && candidateReplacement.source === winnerSource;
}

module.exports = setup;
130 changes: 130 additions & 0 deletions test/unit/middleware/dedupe.js
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,136 @@ module.exports.tests.dedupe = function(test, common) {
});
};

module.exports.tests.trump = function(test, common) {
test('whosonfirst trumps geonames, replace', function (t) {
var req = {
clean: {
text: 'Lancaster',
size: 100
}
};
var res = {
data: [
{
'name': { 'default': 'Lancaster' },
'source': 'geonames',
'source_id': '123456',
'layer': 'locality'
},
{
'name': { 'default': 'Lancaster' },
'source': 'whosonfirst',
'source_id': '654321',
'layer': 'locality'
}
]
};

var expectedCount = 1;
dedupe(req, res, function () {
t.equal(res.data.length, expectedCount, 'results have fewer items than before');
t.deepEqual(res.data[0].source, 'whosonfirst', 'whosonfirst result won');
t.end();
});
});

test('whosonfirst trumps geonames, no replace', function (t) {
var req = {
clean: {
text: 'Lancaster',
size: 100
}
};
var res = {
data: [
{
'name': { 'default': 'Lancaster' },
'source': 'whosonfirst',
'source_id': '123456',
'layer': 'locality'
},
{
'name': { 'default': 'Lancaster' },
'source': 'geonames',
'source_id': '654321',
'layer': 'locality'
}
]
};

var expectedCount = 1;
dedupe(req, res, function () {
t.equal(res.data.length, expectedCount, 'results have fewer items than before');
t.deepEqual(res.data[0].source, 'whosonfirst', 'whosonfirst result won');
t.end();
});
});

test('openstreetmap trumps whosonfirst venues', function (t) {
var req = {
clean: {
text: 'Lancaster Dairy Farm',
size: 100
}
};
var res = {
data: [
{
'name': { 'default': 'Lancaster Dairy Farm' },
'source': 'openstreetmap',
'source_id': '123456',
'layer': 'venue'
},
{
'name': { 'default': 'Lancaster Dairy Farm' },
'source': 'whosonfirst',
'source_id': '654321',
'layer': 'venue'
}
]
};

var expectedCount = 1;
dedupe(req, res, function () {
t.equal(res.data.length, expectedCount, 'results have fewer items than before');
t.deepEqual(res.data[0].source, 'openstreetmap', 'openstreetmap result won');
t.end();
});
});

test('openaddresses trumps openstreetmap', function (t) {
var req = {
clean: {
text: '100 Main St',
size: 100
}
};
var res = {
data: [
{
'name': { 'default': '100 Main St' },
'source': 'openstreetmap',
'source_id': '123456',
'layer': 'address'
},
{
'name': { 'default': '100 Main St' },
'source': 'openaddresses',
'source_id': '654321',
'layer': 'address'
}
]
};

var expectedCount = 1;
dedupe(req, res, function () {
t.equal(res.data.length, expectedCount, 'results have fewer items than before');
t.deepEqual(res.data[0].source, 'openaddresses', 'openaddresses result won');
t.end();
});
});
};

module.exports.all = function (tape, common) {

function test(name, testFunction) {
Expand Down

0 comments on commit bcd25ce

Please sign in to comment.