# samples/dbscan.rb
#
# Provenance: this file is the result of a two-commit patch series by
# Benson Muite:
#   [PATCH 1/2] initial skeleton           (aa62a3e, Thu, 1 Jul 2021)
#   [PATCH 2/2] initial version of dbscan  (612e9cd, Sun, 11 Jul 2021)
#
# The program is based on examples from
#
# M. Götz, C. Bodenstein, M. Riedel,
# HPDBSCAN: highly parallel DBSCAN,
# Proceedings of the Workshop on Machine Learning in High-Performance Computing Environments, ACM, 2015.
# https://github.com/Markus-Goetz/hpdbscan
#
# X. Hu, J. Huang, M. Qiu, C. Chen, W. Chu
# "PS-DBSCAN: An Efficient Parallel DBSCAN Algorithm Based on Platform Of AI (PAI)"
# https://arxiv.org/abs/1711.01034
#
# D. Han, A. Agrawal, W-k. Liao, A. Choudhary
# A Fast DBSCAN Algorithm with Spark Implementation
# http://cucis.eecs.northwestern.edu/publications/pdf/HAL18.pdf
#
# Md. Mostofa Ali Patwary, D. Palsetia, A. Agrawal, W.-k. Liao, F. Manne, A. Choudhary,
# A New Scalable Parallel DBSCAN Algorithm Using the Disjoint Set Data Structure
# Proceedings of the International Conference on High Performance Computing, Networking,
# Storage and Analysis (Supercomputing, SC'12), pp.62:1-62:11, 2012.
# http://cucis.eecs.northwestern.edu/publications/pdf/PatPal12.pdf
# http://cucis.ece.northwestern.edu/projects/Clustering/download_code_dbscan.html
# https://github.com/ContinuumIO/parallel_dbscan/tree/master/dbscan-v1.0.0
#
# DBSCAN
# Matias Insaurralde
# https://github.com/matiasinsaurralde/dbscan
#
# dbscan
# ningjingzhiyuan
# https://github.com/ningjingzhiyuande/dbscan
#
# dbscan
# Atsushi Tatsuma
# https://github.com/yoshoku/rumale/blob/main/lib/rumale/clustering/dbscan.rb
#
# https://en.wikipedia.org/wiki/DBSCAN
#
# The current parallelization method will not work on large data sets,
# because every rank holds a full copy of all points.
# Possible options to allow this include pre-partitioning the data
# set and then using unions on local clusters.

require "mpi"
if defined?(NumRu::NArray)
  include NumRu
  include NMath
end

# Generate +count+ random two-dimensional points.
#
# count:: number of points to generate
#
# Returns a two-element array [x, y] of float NArrays, each of length
# +count+, filled by NArray#random.
def generate_points(count)
  x = NArray.float(count).random
  y = NArray.float(count).random
  [x, y]
end

MPI.Init

world = MPI::Comm::WORLD

size = world.size
rank = world.rank

# Print the usage message on rank 0, shut MPI down and terminate.
#
# rank:: rank of the calling process; only rank 0 prints
#
# Every rank is expected to call this (the argument checks below are
# evaluated identically everywhere), so every rank finalizes and exits.
# Never returns.
def usage(rank)
  if rank == 0
    print <<EOF
Usage: mpirun -np numproc ruby #{$PROGRAM_NAME} numpoints minpoints epsilon
       numpoints and minpoints must be integers > 0 with
       minpoints <= numpoints and numproc < numpoints
       1 > epsilon > 0 must be a real number, with an upper
       limit of 1 because all generated points are greater than
       zero and less than one.
EOF
  end
  MPI.Finalize
  exit(-1)
end

# --- Argument validation ---------------------------------------------------
# \A and \z anchor the whole argument; ^ and $ only anchor a single line and
# would let an argument such as "5\nabc" through.
usage(rank) if ARGV.length != 3
usage(rank) if ARGV[0] !~ /\A\d+\z/
usage(rank) if ARGV[1] !~ /\A\d+\z/
usage(rank) if ARGV[2] !~ /\A0?\.\d+\z/
n_points = ARGV[0].to_i
min_points = ARGV[1].to_i
epsilon = ARGV[2].to_f
# Strictly more points than ranks: guarantees every rank owns at least one
# point and that at least two points exist, which the slicing and the
# nearest-neighbour search below rely on.
usage(rank) unless n_points > size
usage(rank) unless min_points > 0
usage(rank) unless epsilon > 0
usage(rank) unless epsilon < 1
usage(rank) unless n_points >= min_points

# --- Block distribution of the points over the ranks -----------------------
# The first (n_points % size) ranks own one extra point.
base_count = n_points.div(size)
remainder = n_points % size
points_count = Array.new(size) { |r| base_count + (r < remainder ? 1 : 0) }
points_start = []
offset = 0
points_count.each do |c|
  points_start << offset
  offset += c
end
my_count = points_count[rank]
my_start = points_start[rank]

my_x, my_y = generate_points(my_count)

# --- Replicate all coordinates on every rank -------------------------------
x = NArray.float(n_points)
y = NArray.float(n_points)
# Broadcast buffers are sized for the largest block any rank can own.
temp_x = NArray.float(base_count + 1)
temp_y = NArray.float(base_count + 1)

size.times do |root|
  first = points_start[root]
  last_local = points_count[root] - 1 # >= 0 because n_points > size
  if rank == root
    temp_x[0..last_local] = my_x
    temp_y[0..last_local] = my_y
  end
  world.Barrier
  world.Bcast(temp_x, root)
  world.Bcast(temp_y, root)
  x[first..(first + last_local)] = temp_x[0..last_local]
  y[first..(first + last_local)] = temp_y[0..last_local]
end

# --- Nearest-neighbour search ----------------------------------------------
# Find closest neighbors by creating
# something similar to a minimum spanning
# tree: a point may not choose a point (known to this rank) that has
# already chosen it.
count = NArray.int(n_points).indgen(0, 1) # global index of each point
mask = NArray.byte(n_points).fill(1)      # 1 = still a candidate
my_nearest_neighbors = NArray.int(n_points).fill(-1)
my_nearest_neighbors_distance = NArray.float(n_points)

(my_start...(my_start + my_count)).each do |j|
  local = j - my_start
  distances = (x - my_x[local])**2 + (y - my_y[local])**2

  mask[j] = 0 # remove own location
  # remove any points closest to this one
  neighbor_loc = my_nearest_neighbors.eq(j)
  mask[neighbor_loc] = 0
  excluded_neighbors = true
  if mask.count_true == 0
    # Every other point already chose this one (e.g. two points on one
    # rank); fall back to excluding only the point itself so that the
    # candidate set is never empty.
    mask[neighbor_loc] = 1
    excluded_neighbors = false
  end

  # Restrict both the distances and the indices to the candidates, so an
  # excluded point that happens to tie with the minimum is never selected.
  open_distances = distances[mask]
  open_indices = count[mask]
  min_value = open_distances.min
  # On ties the highest index wins.
  my_nearest_neighbors[j] = open_indices[open_distances.eq(min_value)].max
  # min_value is a plain Float, so Math.sqrt works whether or not NMath
  # was included above.
  my_nearest_neighbors_distance[j] = Math.sqrt(min_value)

  # Restore the mask for the next point.
  mask[j] = 1
  mask[neighbor_loc] = 1 if excluded_neighbors
end

# --- Exchange neighbor data ------------------------------------------------
nearest_neighbors = NArray.int(n_points)
nearest_neighbors_distance = NArray.float(n_points)
temp_neighbors = NArray.int(base_count + 1)
temp_distance = NArray.float(base_count + 1)

size.times do |root|
  first = points_start[root]
  last_local = points_count[root] - 1
  last = first + last_local
  if rank == root
    temp_neighbors[0..last_local] = my_nearest_neighbors[first..last]
    temp_distance[0..last_local] = my_nearest_neighbors_distance[first..last]
  end
  world.Bcast(temp_neighbors, root)
  world.Bcast(temp_distance, root)
  nearest_neighbors[first..last] = temp_neighbors[0..last_local]
  nearest_neighbors_distance[first..last] = temp_distance[0..last_local]
end

# --- Label clusters ----------------------------------------------------------
# 0 is unlabelled. Two points share a cluster when one is the recorded
# nearest neighbour of the other and they are closer than epsilon.
cluster_label = NArray.int(n_points)
cluster_num = 1
n_points.times do |j|
  next unless nearest_neighbors_distance[j] < epsilon

  k = nearest_neighbors[j]
  label_j = cluster_label[j]
  label_k = cluster_label[k]
  if label_j == 0 && label_k == 0
    cluster_label[j] = cluster_num
    cluster_label[k] = cluster_num
    cluster_num += 1
  elsif label_k == 0
    cluster_label[k] = label_j
  elsif label_j == 0
    cluster_label[j] = label_k
  elsif label_j != label_k
    # Both points were labelled independently: the link joins the two
    # clusters, so fold k's cluster into j's.
    cluster_label[cluster_label.eq(label_k)] = label_j
  end
end

# --- Remove clusters that are too small --------------------------------------
(1...cluster_num).each do |label|
  members = cluster_label.eq(label)
  member_count = members.count_true
  # Labels emptied by a merge have no members left; nothing to clear.
  next if member_count == 0 || member_count >= min_points

  cluster_label[members] = 0
end

# Print out results, cluster label of 0 is noise
if rank == 0
  p x
  p y
  p nearest_neighbors
  p nearest_neighbors_distance
  p cluster_label
end

MPI.Finalize