Skip to content

Commit

Permalink
Add tests for clustering
Browse files Browse the repository at this point in the history
  • Loading branch information
go2starr committed Oct 30, 2012
1 parent 452af5c commit 912606a
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 53 deletions.
56 changes: 3 additions & 53 deletions lsh.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class MinHashSignature(Signature):
def hash_functions(self):
    """Return dim different hash functions.

    Each returned callable hashes the string form of its argument,
    wrapped in a fixed salt and tagged with the function's own index, so
    the same input hashes differently under each of the ``self.dim``
    functions.
    """
    def hash_factory(n):
        # Build the lambda inside a factory call so each one captures its
        # own n instead of sharing the comprehension's loop variable.
        return lambda x: hash("salt" + str(n) + str(x) + "salt")
    return [hash_factory(index) for index in range(self.dim)]

def sign(self, s):
Expand Down Expand Up @@ -93,7 +93,7 @@ class Cluster:
def __init__(self, width=10, threshold=0.5):
    """Set up an empty clusterer.

    width is stored on the instance and passed to both the signer and
    the LSH hasher; threshold is passed to the LSH hasher only.
    """
    self.width = width
    self.unionfind = UnionFind()
    # Sign with MinHashSignature directly; the earlier Signature(width)
    # assignment was overwritten immediately and never used.
    self.signer = MinHashSignature(width)
    self.hasher = LSH(width, threshold)
    self.hashmap = {}

Expand Down Expand Up @@ -124,7 +124,7 @@ def shingle(s, k):
yield s[i:i+k]

def hshingle(s, k):
    """Generate the k-length shingles of s, yielding the hash of each.

    Shingles come from shingle(s, k); every one is passed through the
    built-in hash() before being yielded.
    """
    # A distinct loop name keeps the parameter s from being rebound.
    for piece in shingle(s, k):
        yield hash(piece)

Expand All @@ -135,53 +135,3 @@ def jaccard_sim(X, Y):

def jaccard_dist(X, Y):
    """Return one minus jaccard_sim(X, Y)."""
    similarity = jaccard_sim(X, Y)
    return 1 - similarity

# myset = [
# (1,2,3,4,6), #
# (1,2,3,4,5) # Jaccard similarity: 4 / 6 ~= 0.67
# ]

# N = 100
# K = 1
# T = 0.70

# clusterer = Cluster(N, T)

# print "True threshold: %f" % clusterer.hasher.get_threshold()
# print clusterer.hasher.bandwidth

# for s in myset:
# clusterer.add_set(s, "".join(str(_) for _ in s))

# for s in clusterer.get_sets():
# print s

# if __name__ == "__main__":
# # Run some tests :)
# N = 1000
# thresholds = [ 0.25, 0.50 ]
# trials = 10
# setwidth = 10
# setmax = 100

# import random

# def randset(setmax, setwidth):
# return tuple( random.choice(range(setmax)) for _ in range(setwidth) )

# for n in range(1, 100, 10):
# err = 0
# for trial in range(trials):
# x = randset(setmax, setwidth)
# y = randset(setmax, setwidth)
# sim = jaccard_sim(x, y)

# for threshold in range(1, 100):
# threshold = float(threshold) / 100
# clusterer = ClusterFuck(n, threshold)
# clusterer.add_set(x)
# clusterer.add_set(y)
# if len(clusterer.get_sets()) == 2:
# err += abs(sim - threshold)
# break
# print "%d: %f" % (n, 100 * err / trials)
49 changes: 49 additions & 0 deletions test/test_cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from utils import *
from ..lsh import Cluster, jaccard_sim

def test_same_set():
    """Adding the same set twice must leave exactly one cluster."""
    sample = randset()
    clusterer = Cluster()
    for _ in range(2):
        clusterer.add_set(sample)
    groups = clusterer.get_sets()
    assert len(groups) == 1

def test_similar_sets():
    """Two inputs that share most of their elements end up in one cluster."""
    clusterer = Cluster()
    for text in ("abcdefg", "abcdefghi"):
        clusterer.add_set(text)
    groups = clusterer.get_sets()
    assert len(groups) == 1

def test_dissimilar_sets():
    """Two non-similar sets should not be clustered"""
    cluster = Cluster()
    cluster.add_set("12345abcdef")
    cluster.add_set("1234567890z")
    clusters = cluster.get_sets()
    # Parenthesized single-argument print behaves the same on Python 2 and
    # Python 3; the bare print statement is a SyntaxError on Python 3.
    print(clusters)
    assert len(clusters) == 2

def test_cluster_threshold():
    """Average gap between split threshold and Jaccard similarity stays small.

    For each random pair of sets, thresholds 0.01, 0.06, ..., 0.96 are
    tried in increasing order; the first one at which the pair lands in
    two separate clusters is compared with the pair's Jaccard similarity.
    Pairs that never split contribute no error.
    """
    n_tests = 50
    dim = 15
    expected_error = 0.20

    total_error = 0
    for _ in range(n_tests):
        # Draw a pair of random sets and measure their true similarity
        first, second = randset(), randset()
        similarity = jaccard_sim(first, second)

        # Sweep thresholds upward until the pair is no longer merged
        for percent in range(1, 100, 5):
            threshold = float(percent) / 100
            clusterer = Cluster(dim, threshold)
            clusterer.add_set(first)
            clusterer.add_set(second)
            if len(clusterer.get_sets()) != 2:
                continue
            total_error += abs(similarity - threshold)
            break

    average_error = float(total_error) / n_tests
    assert average_error <= expected_error
19 changes: 19 additions & 0 deletions test/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""
utils.py
Testing utilities
"""
import random
import operator

def randset():
    """Build a random tuple of distinct integers drawn from range(10).

    Between 5 and 19 draws are made with replacement and duplicates are
    collapsed through a set, so the result holds 1 to 10 values. These
    values of n and k give wide-ranging similarities between pairs.
    """
    n = random.choice(range(5, 20))
    k = 10
    members = set()
    for _ in range(n):
        members.add(random.choice(range(k)))
    return tuple(members)

def sigsim(X, Y, dim):
    """Return the fraction of positions at which signatures X and Y agree.

    Positions are compared pairwise with operator.eq and the number of
    matches is divided by dim.
    """
    agreements = map(operator.eq, X, Y)
    matching = sum(agreements)
    return matching / float(dim)

0 comments on commit 912606a

Please sign in to comment.