Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
Binary file added Python_implementation/.DS_Store
Binary file not shown.
30 changes: 30 additions & 0 deletions Python_implementation/Readme.md
Original file line number Diff line number Diff line change
@@ -1 +1,31 @@
## Python implementation details

This is the Python 3 implementation of dCrawler, a fully independent clustering algorithm that only requires a distance threshold (Th(d)) to perform clustering.

## Dependencies
Main libraries. Python version `3.9.18`

````
matplotlib==3.8.3

numpy==1.26.4

pandas==2.0.3

scikit_learn==1.3.2

scipy==1.11.3
````

## Usage
The main implementation is in `dCrawler_api.py` and can be easily used as:
````python
from dCrawler_api import*
crawler = dCrawler(threshold=1.0)
crawler.fit(data)
centroids,clusters = crawler.centroids,crawler.clusters
````


## Animation
![clustering_process](https://github.com/Elsword016/dCrawler/assets/29883365/dd47a04e-43ab-4f59-80a7-18b270d9d135)
Binary file not shown.
Binary file added Python_implementation/clustering_process.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
425 changes: 425 additions & 0 deletions Python_implementation/crawler_animate.ipynb

Large diffs are not rendered by default.

158 changes: 158 additions & 0 deletions Python_implementation/dCrawler_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# %%
from scipy.spatial import cKDTree
import numpy as np
from scipy.spatial import distance_matrix
from sklearn.metrics import pairwise_distances
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree

# %%
class dCrawler:
    """Distance-threshold clustering ("crawler") with k-means-style refinement.

    The only tuning parameter is ``threshold``: the maximum Euclidean distance
    allowed between a point and the centroid of the cluster it joins.

    After :meth:`fit`:

    * ``centroids`` is a list of 1-D arrays, one per cluster;
    * ``clusters`` is a list of integer index arrays into the fitted points;
    * ``points`` is the fitted data as a 2-D float array.
    """

    # Upper bound on the number of merge passes and the factor by which the
    # merge threshold shrinks after every pass.
    _MAX_MERGE_PASSES = 20
    _MERGE_THRESHOLD_DECAY = 0.95

    def __init__(self, threshold=1.5):
        """Store the distance threshold; 1.5 is only a placeholder default."""
        self.threshold = threshold
        self.centroids = None
        self.clusters = None
        self.points = None  # set by fit(); read by _cleanup_clusters()

    def fit(self, points):
        """Cluster ``points`` and return ``self``.

        Args:
            points: array-like of shape (n_samples, n_features).

        Raises:
            ValueError: if ``points`` is empty or not two-dimensional.
        """
        points = np.asarray(points, dtype=float)
        if points.ndim != 2 or points.shape[0] == 0:
            raise ValueError(
                "points must be a non-empty 2-D array of shape "
                "(n_samples, n_features)."
            )

        self.points = points
        self.centroids, self.clusters = self._crawler(points)
        self.centroids, self.clusters = self._adjust_clusters(points)
        self.centroids, self.clusters = self._merge_clusters(points)
        self._cleanup_clusters()
        return self

    def predict(self, points):
        """Return, for each row of ``points``, the index of the nearest centroid.

        Raises:
            ValueError: if the model has not been fitted.
        """
        if self.centroids is None or self.clusters is None:
            raise ValueError("Model not fitted yet. Call 'fit' before 'predict'.")

        kdtree = cKDTree(self.centroids)
        _, labels = kdtree.query(np.asarray(points, dtype=float))
        return labels

    def _distance(self, point1, point2):
        """Euclidean distance between two points."""
        return np.sqrt(np.sum((point1 - point2) ** 2))

    @staticmethod
    def _distances_to(points, target):
        """Euclidean distance from every row of ``points`` to ``target``."""
        return np.sqrt(np.sum((points - target) ** 2, axis=1))

    def _crawler(self, points):
        """Greedy first pass: grow one cluster at a time from unassigned seeds.

        Returns:
            ``(centroids, clusters)`` where ``clusters`` holds lists of row
            indices into ``points`` (the array passed in, which may be a
            subset of the fitted data).
        """
        n = len(points)
        centroids = []
        clusters = []
        assigned = np.zeros(n, dtype=bool)

        for seed in range(n):
            if assigned[seed]:
                continue

            centroid = points[seed]
            cluster = [seed]
            assigned[seed] = True

            while True:
                free = np.flatnonzero(~assigned)
                if len(free) == 0:
                    break

                dists = self._distances_to(points[free], centroid)
                nearest = int(np.argmin(dists))
                # Written as "not <=" so a NaN distance also stops the growth.
                if not dists[nearest] <= self.threshold:
                    break

                member = int(free[nearest])
                cluster.append(member)
                assigned[member] = True
                centroid = np.mean(points[cluster], axis=0)

                # The centroid moved, so evict members that are now too far.
                # NOTE(review): evicted points stay flagged as assigned, so
                # this pass never revisits them; they are only picked up
                # again by the nearest-centroid step in _adjust_clusters.
                spread = self._distances_to(points[cluster], centroid)
                cluster = [m for m, d in zip(cluster, spread) if d <= self.threshold]

            centroids.append(centroid)
            clusters.append(cluster)

        return centroids, clusters

    def _adjust_clusters(self, points):
        """Alternate nearest-centroid assignment and centroid update until stable.

        Centroids that attract no points keep their position and get an empty
        index array in ``clusters``.
        """
        self.centroids = list(self.centroids)
        previous = None  # no earlier labelling yet, so run at least one update

        while True:
            _, labels = cKDTree(self.centroids).query(points)
            if previous is not None and np.array_equal(previous, labels):
                break
            previous = labels

            for idx in range(len(self.centroids)):
                members = points[labels == idx]
                if len(members) > 0:
                    self.centroids[idx] = np.mean(members, axis=0)

        self.clusters = [
            np.flatnonzero(labels == idx) for idx in range(len(self.centroids))
        ]
        return self.centroids, self.clusters

    def _merge_clusters(self, points):
        """Merge centroids closer than a shrinking threshold, then re-balance.

        Each pass merges qualifying centroid pairs, drops emptied clusters,
        re-crawls points left without a cluster and re-runs
        :meth:`_adjust_clusters`. The loop stops when a pass merges nothing or
        after more than ``_MAX_MERGE_PASSES`` passes.
        """
        cluster_thresh = self.threshold
        n_loop = 0

        while True:
            merged = False
            self.centroids = list(self.centroids)
            self.clusters = list(self.clusters)

            # Column 1 is the distance to the nearest *other* centroid
            # (column 0 is the centroid itself; inf when there is only one).
            nearest, _ = cKDTree(self.centroids).query(self.centroids, k=2)
            merge_indices = np.flatnonzero(nearest[:, 1] <= cluster_thresh)

            absorbed = set()  # clusters already merged into another this pass
            for i in merge_indices:
                i = int(i)
                if i in absorbed:
                    continue

                ds = self._distances_to(np.asarray(self.centroids), self.centroids[i])
                ds[i] = np.inf  # never pick the centroid itself as its partner
                j = int(np.argmin(ds))
                if not ds[j] <= cluster_thresh:
                    continue

                # NOTE(review): j may itself have been absorbed earlier in this
                # pass; its stale centroid is still averaged in, as before.
                self.centroids[i] = np.mean(
                    [self.centroids[i], self.centroids[j]], axis=0
                )
                members = np.concatenate(
                    (
                        np.asarray(self.clusters[i], dtype=int),
                        np.asarray(self.clusters[j], dtype=int),
                    )
                )
                self.clusters[j] = []
                merged = True

                if len(members) > 0:
                    # Keep only members still within the original threshold.
                    spread = self._distances_to(points[members], self.centroids[i])
                    members = members[spread <= self.threshold]
                self.clusters[i] = members
                absorbed.add(j)

            if not merged:
                break

            keep = [k for k, members in enumerate(self.clusters) if len(members) > 0]
            self.centroids = [self.centroids[k] for k in keep]
            self.clusters = [self.clusters[k] for k in keep]

            # A boolean mask copes with the case where no cluster survived.
            covered = np.zeros(len(points), dtype=bool)
            for members in self.clusters:
                covered[np.asarray(members, dtype=int)] = True
            unassigned_points = np.flatnonzero(~covered)

            if len(unassigned_points) > 0:
                new_centroids, new_clusters = self._crawler(points[unassigned_points])
                self.centroids.extend(new_centroids)
                # _crawler indexes into the subset; map back to global rows.
                self.clusters.extend(
                    unassigned_points[np.asarray(c, dtype=int)] for c in new_clusters
                )
            n_loop += 1

            self.centroids, self.clusters = self._adjust_clusters(points)

            if n_loop > self._MAX_MERGE_PASSES:
                break

            cluster_thresh *= self._MERGE_THRESHOLD_DECAY

        return self.centroids, self.clusters

    def _cleanup_clusters(self):
        """Drop empty clusters, renumber the rest and recompute their centroids.

        Each final centroid is the mean of the fitted points in its cluster.
        """
        labels = np.zeros(len(self.points), dtype=int)  # 0 means "no cluster"
        for cluster_id, members in enumerate(self.clusters, start=1):
            labels[np.asarray(members, dtype=int)] = cluster_id

        # Only ids that actually own points survive, so no NaN centroids.
        used_ids = np.unique(labels[labels > 0])
        self.clusters = [np.flatnonzero(labels == cid) for cid in used_ids]
        self.centroids = [
            np.mean(self.points[members], axis=0) for members in self.clusters
        ]
13 changes: 13 additions & 0 deletions Python_implementation/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Smoke run: cluster the demo data set with dCrawler and print the clusters.
# Import the one name that is used instead of a wildcard, so the script's
# namespace is explicit.
from dCrawler_api import dCrawler

import pandas as pd

# Relative path: resolved against the current working directory.
data_path = "../DemoData/demodata.csv"
df = pd.read_csv(data_path)
# Only the first two columns of the CSV are clustered.
data = df.to_numpy()[:, :2]

crawler = dCrawler(threshold=1.0)
crawler.fit(data)
centroids, clusters = crawler.centroids, crawler.clusters
print(clusters)


21 changes: 21 additions & 0 deletions dCrawler/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) <year> Marcus Leiwe et al.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
58 changes: 58 additions & 0 deletions dCrawler/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
## dCrawler

This is the Python 3 implementation of dCrawler, a fully independent clustering algorithm that only requires a distance threshold (Th(d)) to perform clustering.

Featured in the preprint: **Automated neuronal reconstruction with super-multicolour fluorescence imaging**, Leiwe et al. (2022) [bioRxiv](https://www.biorxiv.org/content/10.1101/2022.10.20.512984v1)

## Implementation details

A few things are different in the Python version to make it a bit more efficient; the results are the same as those of the MATLAB code.
- Vectorized operations are used wherever possible to speed up computations.
- The `cKDTree` from the `scipy.spatial` module is used for efficient nearest neighbor search. Instead of calculating distances to all centroids for each point, the k-d tree is used to find the nearest centroid quickly.


![clustering_process](https://github.com/Elsword016/dCrawler/assets/29883365/2f7e6394-50e5-452a-b398-4e3022bf2ce1)

## Installation

```bash
pip install dCrawler
```

## Build from source - local development
It is recommended to create a separate environment to prevent any possible errors:
- Clone the repository
- Build the package with the command `python setup.py sdist`
- Then `pip install .`

## Usage

```python
from dCrawler import dCrawler
# Initialize the dCrawler object
crawler = dCrawler(threshold=1.0)
crawler.fit(data)
centroids,clusters = crawler.centroids,crawler.clusters
```




## Contributing

Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.

## Cite
```bibtex
@article {Leiwe2022.10.20.512984,
author = {Marcus N. Leiwe and Satoshi Fujimoto and Toshikazu Baba and Daichi Moriyasu and Biswanath Saha and Richi Sakaguchi and Shigenori Inagaki and Takeshi Imai},
title = {Automated neuronal reconstruction with super-multicolour fluorescence imaging},
elocation-id = {2022.10.20.512984},
year = {2022},
doi = {10.1101/2022.10.20.512984},
abstract = {Fluorescence imaging is widely used for the mesoscopic mapping of neuronal connectivity. However, neurite reconstruction is challenging, especially when neurons are densely labelled. Here we report a strategy for the fully automated reconstruction of densely labelled neuronal circuits. Firstly, we established stochastic {\textquotedblleft}super-multicolour{\textquotedblright} labelling with up to seven different fluorescent proteins using the Tetbow method. With this method, each neuron was labelled with a unique combination of fluorescent proteins, which were then imaged and separated by linear unmixing. We also established an automated neurite reconstruction pipeline based on the quantitative analysis of multiple dyes (QDyeFinder). To classify colour combinations, we used a newly developed unsupervised clustering algorithm, dCrawler, in which data points in multi-dimensional space were clustered based on a given threshold distance. Our new strategy allows for the reconstruction of neurites for up to hundreds of neurons at a millimetre scale without manual tracing. Competing Interest StatementTI, MNL, and SF has filed a patent application for QDyeFinder.},
URL = {https://www.biorxiv.org/content/early/2022/10/20/2022.10.20.512984},
eprint = {https://www.biorxiv.org/content/early/2022/10/20/2022.10.20.512984.full.pdf},
journal = {bioRxiv}
}
```
1 change: 1 addition & 0 deletions dCrawler/dCrawler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .dCrawler import dCrawler
Binary file not shown.
Binary file not shown.
Loading