mleiwe · Elsword016 · May 8, 2024 · May 8, 2024 · May 8, 2024 · May 8, 2024
diff --git a/.DS_Store b/.DS_Store
diff --git a/Python_implementation/.DS_Store b/Python_implementation/.DS_Store
diff --git a/Python_implementation/Readme.md b/Python_implementation/Readme.md
@@ -1 +1,31 @@
+## Python implementation details
 
+This is the Python 3 implementation of dCrawler, a fully independent clustering algorithm that only requires a distance threshold (Th(d)) to perform clustering.
+
+## Dependencies
+Main libraries. Python version `3.9.18`
+
+````
+matplotlib==3.8.3
+
+numpy==1.26.4
+
+pandas==2.0.3
+
+scikit_learn==1.3.2
+
+scipy==1.11.3
+````
+
+## Usage
+The main implementation is in `dCrawler_api.py` and can be easily used as:
+````python
+from dCrawler_api import dCrawler
+crawler = dCrawler(threshold=1.0)
+crawler.fit(data)
+centroids,clusters = crawler.centroids,crawler.clusters
+````
+
+
+## Animation
+![clustering_process](https://github.com/Elsword016/dCrawler/assets/29883365/dd47a04e-43ab-4f59-80a7-18b270d9d135)
diff --git a/Python_implementation/__pycache__/dCrawler_api.cpython-311.pyc b/Python_implementation/__pycache__/dCrawler_api.cpython-311.pyc
diff --git a/Python_implementation/clustering_process.gif b/Python_implementation/clustering_process.gif
diff --git a/Python_implementation/crawler_animate.ipynb b/Python_implementation/crawler_animate.ipynb
diff --git a/Python_implementation/dCrawler_api.py b/Python_implementation/dCrawler_api.py
@@ -0,0 +1,158 @@
+# %%
+from scipy.spatial import cKDTree
+import numpy as np
+from scipy.spatial import distance_matrix
+from sklearn.metrics import pairwise_distances
+import pandas as pd 
+import matplotlib.pyplot as plt
+from scipy.spatial import cKDTree
+
+# %%
+class dCrawler:
+    """Clustering driven by a single distance threshold.
+
+    ``fit`` runs four stages in order: an initial greedy "crawl" that grows
+    clusters around seed points, a nearest-centroid reassignment loop, a
+    merge pass over centroids that lie close together, and a final cleanup
+    that relabels clusters and recomputes centroids.
+
+    Attributes:
+        threshold: Distance threshold used by the crawl and merge stages.
+        centroids: List of centroid coordinate arrays (``None`` before fit).
+        clusters: List of integer index arrays into the fitted points, one
+            per centroid (``None`` before fit).
+        points: The array passed to the last ``fit`` call (``None`` before).
+    """
+
+    # The merge stage stops once left-over points have been re-crawled more
+    # than this many times.
+    _MAX_RECRAWL_LOOPS = 20
+    # Factor applied to the merge distance threshold after each merge pass.
+    _MERGE_THRESHOLD_DECAY = 0.95
+
+    def __init__(self, threshold=1.5):
+        """Create an unfitted model.
+
+        Args:
+            threshold: Distance threshold. The default of 1.5 is only a
+                placeholder; callers are expected to pass their own value.
+        """
+        self.threshold = threshold
+        self.centroids = None
+        self.clusters = None
+        # Initialised here so that _cleanup_clusters can rely on the
+        # attribute existing; fit() fills it in.
+        self.points = None
+
+    def fit(self, points):
+        """Cluster ``points`` and store ``centroids``/``clusters`` on self.
+
+        Args:
+            points: Array-like of shape (n_samples, n_features).
+
+        Returns:
+            The fitted instance (``self``).
+
+        Raises:
+            ValueError: If ``points`` is empty or not two-dimensional.
+        """
+        # Boolean-mask and fancy indexing below need a real ndarray.
+        points = np.asarray(points)
+        if points.ndim != 2 or points.shape[0] == 0:
+            raise ValueError(
+                "points must be a non-empty 2-D array of shape "
+                "(n_samples, n_features)."
+            )
+
+        self.points = points
+        self.centroids, self.clusters = self._crawler(points)
+        self.centroids, self.clusters = self._adjust_clusters(points)
+        self.centroids, self.clusters = self._merge_clusters(points)
+        self._cleanup_clusters()
+        return self
+
+    def predict(self, points):
+        """Return, for each row of ``points``, the index of the nearest centroid.
+
+        Raises:
+            ValueError: If the model has not been fitted.
+        """
+        if self.centroids is None or self.clusters is None:
+            raise ValueError("Model not fitted yet. Call 'fit' before 'predict'.")
+
+        _, labels = cKDTree(self.centroids).query(np.asarray(points))
+        return labels
+
+    def _distance(self, point1, point2):
+        """Return the Euclidean distance between two points."""
+        diff = np.asarray(point1) - np.asarray(point2)
+        return np.sqrt(np.sum(diff ** 2))
+
+    def _crawler(self, points):
+        """Greedily grow clusters around not-yet-assigned seed points.
+
+        Each unassigned point in turn seeds a cluster. The nearest
+        unassigned point is absorbed while it lies within ``self.threshold``
+        of the running centroid; after every absorption the centroid is
+        recomputed and members farther than the threshold from it are
+        dropped from the cluster.
+
+        Args:
+            points: ndarray of shape (n_samples, n_features).
+
+        Returns:
+            ``(centroids, clusters)`` where ``clusters[k]`` is a list of row
+            indices into ``points``.
+        """
+        n = len(points)
+        centroids = []
+        clusters = []
+        assigned = np.zeros(n, dtype=bool)
+
+        for seed in range(n):
+            if assigned[seed]:
+                continue
+
+            centroid = points[seed]
+            cluster = [seed]
+            assigned[seed] = True
+
+            while True:
+                free = np.flatnonzero(~assigned)
+                if free.size == 0:
+                    break
+
+                dists = np.sqrt(np.sum((points[free] - centroid) ** 2, axis=1))
+                nearest = np.argmin(dists)
+
+                if dists[nearest] <= self.threshold:
+                    new_member = int(free[nearest])
+                    cluster.append(new_member)
+                    assigned[new_member] = True
+                    centroid = np.mean(points[cluster], axis=0)
+
+                    member_d = np.sqrt(
+                        np.sum((points[cluster] - centroid) ** 2, axis=1)
+                    )
+                    # NOTE(review): members dropped here stay flagged as
+                    # assigned, so this stage never revisits them. Within
+                    # fit() they are picked up again by _adjust_clusters.
+                    # Un-flagging them could make this loop oscillate, so
+                    # the behaviour is kept as is.
+                    cluster = [
+                        c for c, d in zip(cluster, member_d)
+                        if d <= self.threshold
+                    ]
+                else:
+                    break
+
+            centroids.append(centroid)
+            clusters.append(cluster)
+
+        return centroids, clusters
+
+    def _adjust_clusters(self, points):
+        """Reassign points to their nearest centroid until labels stop changing.
+
+        After each assignment every centroid that received at least one
+        point is moved to the mean of those points. ``self.centroids`` is
+        updated in place and ``self.clusters`` is rebuilt from the final
+        labels (a centroid that attracts no points gets an empty array).
+
+        Returns:
+            ``(self.centroids, self.clusters)``.
+        """
+        # None (rather than an all-zero array) so the first pass can never
+        # look "already converged" when every point maps to centroid 0.
+        previous = None
+
+        while True:
+            _, labels = cKDTree(self.centroids).query(points)
+
+            if previous is not None and np.array_equal(previous, labels):
+                break
+
+            previous = labels
+
+            for i in range(len(self.centroids)):
+                members = points[labels == i]
+                if len(members) > 0:
+                    self.centroids[i] = np.mean(members, axis=0)
+
+        self.clusters = [
+            np.where(labels == i)[0] for i in range(len(self.centroids))
+        ]
+        return self.centroids, self.clusters
+
+    def _merge_clusters(self, points):
+        """Merge clusters whose centroids lie within the merge threshold.
+
+        In each pass, every centroid whose nearest neighbour is within the
+        current merge threshold is merged with its nearest live neighbour:
+        the centroid becomes the midpoint of the two, the member lists are
+        joined, and members farther than ``self.threshold`` from the new
+        centroid are dropped. Points left without a cluster are re-crawled,
+        then ``_adjust_clusters`` is run. The merge threshold shrinks by
+        ``_MERGE_THRESHOLD_DECAY`` after every pass, and the loop ends when
+        a pass merges nothing or the re-crawl budget is exhausted.
+
+        Returns:
+            ``(self.centroids, self.clusters)``.
+        """
+        cluster_thresh = self.threshold
+        n_loop = 0
+
+        while True:
+            merged = False
+            # k=2: column 0 is each centroid itself, column 1 its nearest
+            # other centroid (inf when there is only one centroid).
+            nn_dist, _ = cKDTree(self.centroids).query(self.centroids, k=2)
+            candidates = np.where(nn_dist[:, 1] <= cluster_thresh)[0]
+
+            # Centroids already absorbed in this pass; their entries are
+            # stale and must be neither merged from nor merged into.
+            removed = set()
+
+            for i in candidates:
+                i = int(i)
+                if i in removed:
+                    continue
+
+                ds = np.sqrt(
+                    np.sum(
+                        (np.asarray(self.centroids) - self.centroids[i]) ** 2,
+                        axis=1,
+                    )
+                )
+                # Mask self and stale centroids with inf so argmin can only
+                # pick a different, still-live centroid.
+                ds[i] = np.inf
+                if removed:
+                    ds[list(removed)] = np.inf
+                j = int(np.argmin(ds))
+
+                # Centroids move during the pass, so re-check the distance.
+                if not ds[j] <= cluster_thresh:
+                    continue
+
+                self.centroids[i] = np.mean(
+                    [self.centroids[i], self.centroids[j]], axis=0
+                )
+                # Clusters may be lists or arrays; normalise to int arrays
+                # so concatenation never silently produces floats.
+                members = np.concatenate(
+                    (
+                        np.asarray(self.clusters[i], dtype=int),
+                        np.asarray(self.clusters[j], dtype=int),
+                    )
+                )
+                member_d = np.sqrt(
+                    np.sum((points[members] - self.centroids[i]) ** 2, axis=1)
+                )
+                self.clusters[i] = members[member_d <= self.threshold]
+                self.clusters[j] = np.empty(0, dtype=int)
+                removed.add(j)
+                merged = True
+
+            if not merged:
+                break
+
+            kept = [
+                (c, cl)
+                for c, cl in zip(self.centroids, self.clusters)
+                if len(cl) > 0
+            ]
+            self.centroids = [c for c, _ in kept]
+            self.clusters = [cl for _, cl in kept]
+
+            # np.concatenate raises on an empty list, so guard that case.
+            if self.clusters:
+                covered = np.concatenate(self.clusters)
+            else:
+                covered = np.empty(0, dtype=int)
+            unassigned_points = np.where(
+                np.isin(np.arange(len(points)), covered, invert=True)
+            )[0]
+
+            if len(unassigned_points) > 0:
+                new_centroids, new_clusters = self._crawler(
+                    points[unassigned_points]
+                )
+                self.centroids.extend(new_centroids)
+                # _crawler indexes into the subset it was given; map those
+                # positions back to row indices of the full point array.
+                self.clusters.extend(
+                    unassigned_points[np.asarray(c, dtype=int)]
+                    for c in new_clusters
+                )
+                n_loop += 1
+
+            # Add in the adjust step
+            self.centroids, self.clusters = self._adjust_clusters(points)
+
+            if n_loop > self._MAX_RECRAWL_LOOPS:
+                break
+
+            cluster_thresh *= self._MERGE_THRESHOLD_DECAY
+
+        return self.centroids, self.clusters
+
+    def _cleanup_clusters(self):
+        """Relabel clusters, drop empty ones, and recompute centroids.
+
+        Every point gets the label of the last cluster that lists it. The
+        resulting non-empty clusters are stored in ascending label order as
+        sorted index arrays, and each centroid is set to the mean of its
+        members taken from ``self.points``.
+        """
+        cluster_ids = np.zeros(len(self.points), dtype=int)  # 0 = no cluster
+        for label, cluster in enumerate(self.clusters, start=1):
+            cluster_ids[np.asarray(cluster, dtype=int)] = label
+
+        # Only labels that actually own points survive, so no empty cluster
+        # (and hence no NaN centroid) is emitted.
+        used_labels = np.unique(cluster_ids[cluster_ids > 0])
+        self.clusters = [
+            np.where(cluster_ids == label)[0] for label in used_labels
+        ]
+        self.centroids = [
+            np.mean(self.points[cluster], axis=0) for cluster in self.clusters
+        ]
diff --git a/Python_implementation/test.py b/Python_implementation/test.py
@@ -0,0 +1,13 @@
+"""Demo script: cluster the demo CSV with dCrawler and print the clusters."""
+from pathlib import Path
+
+import pandas as pd
+
+from dCrawler_api import dCrawler
+
+# Resolve the CSV relative to this file rather than the current working
+# directory, so the script can be launched from anywhere.
+data_path = Path(__file__).resolve().parent.parent / "DemoData" / "demodata.csv"
+df = pd.read_csv(data_path)
+data = df.to_numpy()[:, :2]  # only the first two columns are clustered
+
+crawler = dCrawler(threshold=1.0)
+crawler.fit(data)
+centroids, clusters = crawler.centroids, crawler.clusters
+print(clusters)
+
+
diff --git a/dCrawler/LICENSE b/dCrawler/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) <year> Marcus Leiwe et al.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/dCrawler/README.md b/dCrawler/README.md
@@ -0,0 +1,58 @@
+## dCrawler 
+
+This is the Python 3 implementation of dCrawler, a fully independent clustering algorithm that only requires a distance threshold (Th(d)) to perform clustering.
+
+Featured in the preprint: **Automated neuronal reconstruction with super-multicolour fluorescence imaging**, Leiwe et al. (2022) [bioRxiv](https://www.biorxiv.org/content/10.1101/2022.10.20.512984v1)
+
+## Implementation details
+
+A few things are different in the Python version to make it a bit more efficient; the results are the same as those of the MATLAB code.
+- Vectorized operations are used wherever possible to speed up computations.
+- The `cKDTree` from the `scipy.spatial` module is used for efficient nearest neighbor search. Instead of calculating distances to all centroids for each point, the k-d tree is used to find the nearest centroid quickly.
+
+
+![clustering_process](https://github.com/Elsword016/dCrawler/assets/29883365/2f7e6394-50e5-452a-b398-4e3022bf2ce1)
+
+## Installation
+
+```bash
+pip install dCrawler
+```
+
+## Build from source - local development
+It is recommended to create a separate environment to prevent any possible errors.
+- Clone the repository
+- Build the package with the command `python setup.py sdist`
+- Then `pip install .`
+
+## Usage
+
+```python
+from dCrawler import dCrawler
+# Initialize the dCrawler object
+crawler = dCrawler(threshold=1.0)
+crawler.fit(data)
+centroids,clusters = crawler.centroids,crawler.clusters
+```
+
+
+
+
+## Contributing
+
+Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
+
+## Cite
+```bibtex
+@article {Leiwe2022.10.20.512984,
+	author = {Marcus N. Leiwe and Satoshi Fujimoto and Toshikazu Baba and Daichi Moriyasu and Biswanath Saha and Richi Sakaguchi and Shigenori Inagaki and Takeshi Imai},
+	title = {Automated neuronal reconstruction with super-multicolour fluorescence imaging},
+	elocation-id = {2022.10.20.512984},
+	year = {2022},
+	doi = {10.1101/2022.10.20.512984},
+	abstract = {Fluorescence imaging is widely used for the mesoscopic mapping of neuronal connectivity. However, neurite reconstruction is challenging, especially when neurons are densely labelled. Here we report a strategy for the fully automated reconstruction of densely labelled neuronal circuits. Firstly, we established stochastic {\textquotedblleft}super-multicolour{\textquotedblright} labelling with up to seven different fluorescent proteins using the Tetbow method. With this method, each neuron was labelled with a unique combination of fluorescent proteins, which were then imaged and separated by linear unmixing. We also established an automated neurite reconstruction pipeline based on the quantitative analysis of multiple dyes (QDyeFinder). To classify colour combinations, we used a newly developed unsupervised clustering algorithm, dCrawler, in which data points in multi-dimensional space were clustered based on a given threshold distance. Our new strategy allows for the reconstruction of neurites for up to hundreds of neurons at a millimetre scale without manual tracing. Competing Interest StatementTI, MNL, and SF has filed a patent application for QDyeFinder.},
+	URL = {https://www.biorxiv.org/content/early/2022/10/20/2022.10.20.512984},
+	eprint = {https://www.biorxiv.org/content/early/2022/10/20/2022.10.20.512984.full.pdf},
+	journal = {bioRxiv}
+}
+```
diff --git a/dCrawler/dCrawler/__init__.py b/dCrawler/dCrawler/__init__.py
@@ -0,0 +1 @@
+from .dCrawler import dCrawler
diff --git a/dCrawler/dCrawler/__pycache__/__init__.cpython-311.pyc b/dCrawler/dCrawler/__pycache__/__init__.cpython-311.pyc
diff --git a/dCrawler/dCrawler/__pycache__/dCrawler.cpython-311.pyc b/dCrawler/dCrawler/__pycache__/dCrawler.cpython-311.pyc