-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdbscan.py
More file actions
106 lines (83 loc) · 3.64 KB
/
dbscan.py
File metadata and controls
106 lines (83 loc) · 3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import math
import itertools
import time
import numpy as np
import pandas as pd
from sklearn import datasets
# ***************************************************************
# Function: dbscan
# Variables/input: objects.dataset
# int: number of datapoints
# float: epsilon, radius of neighborhood
# int: minimum number of points in neighborhood
# Output: pandas dataframe: cluster assignment results
# Usage/Purpose: Function performs classic dbscan by identifying
# core objects, clusters, and noise.
# ***************************************************************
def dbscan(ds, numSamples, epsilon, minPts):
# prepare dataframe for function work
df = ds.df.copy()
df = df[:numSamples]
clusterCount = 0
# add functional columns to dataframe
df["neighbors"] = [[] for _ in range(numSamples)]
df["visited"] = 0
df["cluster"] = -1
# get a list of neighbors of each datapoint
count_neighbors(df, ds.distanceArray[:numSamples, :numSamples], epsilon)
# continue while there are unvisited datapoints
while 0 in df["visited"].values:
# get next datapoint that needs processing
index = df[df.visited == 0].first_valid_index()
# mark datapoint as visited
df.at[index, "visited"] = 1
# check if this datapoint has enough neighbors
if len(df.iloc[index]["neighbors"]) < minPts:
continue
# add datapoint to a new cluster
df.at[index, "cluster"] = clusterCount
# queue of neighbors to check
queue = []
# add current datapoint's neighbors to queue
for item in df.iloc[index]["neighbors"]:
if df.iloc[item]["visited"] == 0:
queue.append(item)
# continue while queue is not empty
while len(queue) > 0:
index = queue.pop(0)
# skip if datapoint has been visited
if df.iloc[index]["visited"] == 1:
continue
# mark datapoint as visited
df.at[index, "visited"] = 1
# add this points neighbors to queue if the
# point meets the minimum points requirement
if len(df.iloc[index]["neighbors"]) >= minPts:
for item in df.iloc[index]["neighbors"]:
if df.iloc[item]["visited"] == 0:
queue.append(item)
# assign a cluster number to datapoint
df.at[index, "cluster"] = clusterCount
# clusterCount assigns the cluster number to a datapoint
clusterCount += 1
return pd.DataFrame(df["cluster"], columns=["cluster"])
# ***************************************************************
# Function: count_neighbors
# Variables/input: pandas dataframe: dataset
# numpy 2d array: distance array
# float: epsilon, radius of neighborhood
# Output: writes list of neighbors to dataframe
# Usage/Purpose: Function finds a list of neighbors for each
# point based on epsilon value.
# ***************************************************************
def count_neighbors(df, distanceArray, epsilon):
# loop through each datapoint
for i in range(df.shape[0]):
# pull out all distances from this point to all others
row = distanceArray[i, :]
# create a list of neighbors who are within epsilon
nestedList = np.argwhere(row < epsilon).tolist()
# flatten list
flatList = list(itertools.chain.from_iterable(nestedList))
# write list to datapoint["neighbors"]
df.iloc[i]["neighbors"].extend(flatList)