-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclusters.py
More file actions
306 lines (239 loc) · 10.6 KB
/
clusters.py
File metadata and controls
306 lines (239 loc) · 10.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
#!/usr/bin/env python
# Clustering experiment driver: wires together dataset preparation,
# the in-house k-means/k-medoids/DBSCAN implementations, their sklearn
# counterparts, and result compilation/analysis.
# python libraries
import argparse
import math
import sys
import time
# program imports
import settings
from objects import experiment, dataset
from dataprep import calculate_distances, build_dataset, ready_datasets
from dbscan import dbscan
from KBRAIN import run_kbrain, autoplot
from sklearn_algs import sklearn_kmeans, sklearn_kmedoids, sklearn_dbscan
from metrics import calculate_groundtruth_accuracy, calculate_sklearn_accuracy
from results_analysis import save_results, compile_results
# data libraries
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.metrics import silhouette_score, pairwise_distances
# display all columns of a dataframe
# (unbounded rows/columns so the full results table prints uncut)
pd.set_option("display.max_columns", None)
pd.set_option("expand_frame_repr", False)
pd.set_option("display.max_rows", None)
def main():
    """Entry point: parse CLI arguments, prepare datasets, run the selected
    clustering algorithm(s), then compile, analyse, print, and save results."""
    # process command line arguments and return arguments as args
    args = run_parser()
    # run_parser historically returns -1 (an int, not an argparse.Namespace)
    # when no arguments were supplied -- bail out instead of crashing on
    # args.experiment below
    if args == -1:
        return
    # load or build datasets according to arguments; named loaded_datasets so
    # it does not shadow the module-level `from sklearn import datasets`
    loaded_datasets = ready_datasets(args)
    # build experiment object, including datasets
    exp = experiment(loaded_datasets, settings.algorithms)
    # calculate distances for each dataset (precomputed once, shared by algos)
    calculate_distances(exp)
    # run an experiment with all algorithms and datasets
    if args.experiment:
        run_experiment(exp)
    # run k-means algorithm on the first dataset only, with the first k
    if args.kmeans:
        clusters = run_kbrain(settings.k[0], "k-means", exp.datasets[0])
        exp.results["k-means"].append(
            (exp.datasets[0].name, settings.maxSamples, 1, settings.k[0], clusters)
        )
    # run k-medoids algorithm on the first dataset only, with the first k
    if args.kmedoids:
        clusters = run_kbrain(settings.k[0], "k-medoids", exp.datasets[0])
        exp.results["k-medoids"].append(
            (exp.datasets[0].name, settings.maxSamples, 1, settings.k[0], clusters)
        )
    # run DBSCAN on the first dataset with the first epsilon/minPts pair
    if args.dbscan:
        # call dbscan wrapper function
        results = dbscan(
            exp.datasets[0],
            settings.maxSamples,
            settings.epsilons[0],
            settings.minPts[0],
        )
        # save results of each experiment
        exp.results["DBSCAN"].append(
            (
                exp.datasets[0].name,
                settings.maxSamples,
                1,
                settings.epsilons[0],
                settings.minPts[0],
                results,
            )
        )
    # compile results into a dataframe
    resultsDF = compile_results(exp)
    # print(resultsDF.drop(columns=["cluster_list"]))
    # calculate accuracy of our clustering algorithms' results
    # compared to sklearn.dataset dataset labels
    calculate_groundtruth_accuracy(resultsDF, exp)
    # calculate accuracy of our clustering algorithms' results
    # compared to sklearn clustering algorithm labels
    # calculate_sklearn_accuracy(resultsDF, exp)
    print(resultsDF.drop(columns=["cluster_list", "dataset"]))
    save_results(resultsDF)
# ***************************************************************
# Function: run_experiment
# Variables/input: objects.exp
# Output: appends results to objects.exp
# Usage/Purpose: Function loops through all permutations of
# algorithm parameters.
# ***************************************************************
# ***************************************************************
# Function: run_experiment
# Variables/input: objects.exp
# Output: appends results to objects.exp
# Usage/Purpose: Function loops through all permutations of
#                algorithm parameters.
# ***************************************************************
def run_experiment(exp):
    """Run every algorithm in exp.algorithms against every dataset, sample
    size (settings.numSamples), and trial (settings.numRuns), sweeping each
    algorithm's own parameter grid. Results are appended to exp.results[algo]
    as tuples; DBSCAN-family tuples carry (name, num, trial, eps, minPts,
    result), k-family tuples carry (name, num, trial, k, result)."""
    # loop through each clustering algorithm
    for algo in exp.algorithms:
        # loop through each dataset
        for ds in exp.datasets:
            # loop through the number of datapoints to be used
            for num in settings.numSamples:
                # loop for each trial run
                for i in range(1, settings.numRuns + 1):
                    print(
                        "algo: {0}, ds: {1}, size: {2}".format(algo, ds.name, num),
                        end="",
                    )
                    startTime = time.perf_counter()
                    # density-based algorithms share the eps/minPts grid;
                    # dispatch to ours or sklearn's implementation
                    if algo in ("DBSCAN", "sklearn_dbscan"):
                        runner = dbscan if algo == "DBSCAN" else sklearn_dbscan
                        # loop parameters unique to dbscan
                        for eps in settings.epsilons:
                            for mp in settings.minPts:
                                # call dbscan with parameters
                                results = runner(ds, num, eps, mp)
                                # save results of each experiment
                                exp.results[algo].append(
                                    (ds.name, num, i, eps, mp, results)
                                )
                    # both in-house k-algorithms go through run_kbrain,
                    # which selects the algorithm by name
                    elif algo in ("k-means", "k-medoids"):
                        for k in range(3, 5):
                            clusters = run_kbrain(k, algo, ds)
                            exp.results[algo].append((ds.name, num, i, k, clusters))
                    # sklearn k-algorithms share the same cluster-count sweep
                    elif algo in ("sklearn_kmeans", "sklearn_kmedoids"):
                        runner = (
                            sklearn_kmeans
                            if algo == "sklearn_kmeans"
                            else sklearn_kmedoids
                        )
                        for numClusters in range(3, 5):
                            results = runner(ds, numClusters, num)
                            exp.results[algo].append(
                                (ds.name, num, i, numClusters, results)
                            )
                    stopTime = time.perf_counter()
                    print(" {0:3.2} minutes".format((stopTime - startTime) / 60))
# ***************************************************************
# Function: print_results
# Variables/input: objects.exp
# Output: prints to screen
# Usage/Purpose: Function pretty prints an experiment object
# to the screen.
# ***************************************************************
def print_results(exp):
    """Pretty-print every stored result in exp.results to the screen.

    DBSCAN-family tuples are (name, num, trial, eps, minPts, clusters);
    k-family tuples are (name, num, trial, k, clusters)."""
    print("Analyse Results\n")
    for algo in exp.results.keys():
        if algo == "DBSCAN" or algo == "sklearn_dbscan":
            for results in exp.results[algo]:
                print(
                    "Experiment:\n\tAlgorithm: {0}\n\tDataset Name: {1}\n\tNum Datapoints: {2}\n\tTrial Number: {3}\n\tEpsilon: {4}\n\tMin Points: {5}".format(
                        algo, results[0], results[1], results[2], results[3], results[4]
                    )
                )
                print("Cluster Assignments:\n")
                print(results[5])
        # the k-means and k-medoids branches were byte-identical in the
        # original -- all four k-family algorithms share one report format
        elif algo in ("k-means", "sklearn_kmeans", "k-medoids", "sklearn_kmedoids"):
            for results in exp.results[algo]:
                print(
                    "Experiment:\n\tAlgorithm: {0}\n\tDataset Name: {1}\n\tNum Datapoints: {2}\n\tTrial Number: {3}\n\tNumber Clusters: {4}".format(
                        algo, results[0], results[1], results[2], results[3]
                    )
                )
                print("Cluster Assignments:\n")
                print(results[4])
# ***************************************************************
# Function: run_parser
# Variables/input: none
# Output: argparse.arguments object
# Usage/Purpose: Function checks command line arguments for
# correct state and returns an object with
# argument values.
# ***************************************************************
# ***************************************************************
# Function: run_parser
# Variables/input: none
# Output: argparse.arguments object
# Usage/Purpose: Function checks command line arguments for
#                correct state and returns an object with
#                argument values.
# ***************************************************************
def run_parser():
    """Parse and validate command line arguments.

    Requires a data source flag (-d/--dataset or -g/--generate) and an
    action flag (-e/--experiment or one of -m/-o/-s). Exits the process
    with status 1 when the arguments are missing or inconsistent;
    otherwise returns the argparse.Namespace."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d", "--dataset", action="store", help="path to a csv dataset {./dataset.csv}"
    )
    parser.add_argument(
        "-g", "--generate", action="store_true", help="generate all dataset types"
    )
    parser.add_argument(
        "-e", "--experiment", action="store_true", help="run all clustering algorithms"
    )
    parser.add_argument(
        "-m", "--kmeans", action="store_true", help="run only the k-means algorithm"
    )
    parser.add_argument(
        "-o", "--kmedoids", action="store_true", help="run only the k-medoids algorithm"
    )
    parser.add_argument(
        "-s", "--dbscan", action="store_true", help="run only the dbscan algorithm"
    )
    if len(sys.argv) == 1:
        print("\nPlease provide command line arguments")
        print("Choose one of the following:")
        print("-d or --dataset {./dataset.csv}")
        print("-g or --generate to generate several dataset types\n")
        print("Choose one of the following:")
        print("-e or --experiment to run all algorithms")
        print("-m or --kmeans to run only the k-means algorithm")
        print("-o or --kmedoids to run only the k-medoid algorithm")
        print("-s or --dbscan to run only the dbscan algorithm\n")
        # previously returned -1, which the caller then treated as a
        # Namespace and crashed; exit like the other validation failures
        sys.exit(1)
    args = parser.parse_args()
    # a data source is mandatory: either an existing csv or generation
    if not args.dataset and not args.generate:
        print(
            "Please specify file to open {-d ./dataset.csv} or to generate datasets {-g}"
        )
        sys.exit(1)
    # an action is mandatory: the full experiment or a single algorithm
    if not args.experiment and not (args.kmeans or args.kmedoids or args.dbscan):
        print("Please specify experiment {-e} or one of the following algorithms:")
        print("\tk-means {-m}\n\tk-medoid {-o}\n\tdbscan {-s}")
        sys.exit(1)
    if args.dataset:
        print("Path to csv: {0}".format(args.dataset))
    if args.generate:
        print("The following datasets will be generated:")
        for dsType in settings.datasetTypes:
            print("\t{0}".format(dsType))
    return args
main()