-
Notifications
You must be signed in to change notification settings - Fork 10
Update Duplicate Detection #77
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
5f2e901
994e99f
ea6430f
cf0dbb5
a861fb4
b4634f3
63a4dc2
22e55a3
4b656a4
fa28afa
81a1d2f
e624d40
b0e000b
0874365
01d918a
42a65f1
b281037
ee0bca7
0c9eb2c
394764e
9b8cc19
17156b7
648e30a
29640a1
0fa62d3
d3b571c
06bae81
13fd4b7
e83c15f
070cfe3
8795aaf
2e948d8
4c347be
d177522
7b49a5b
e9c10a5
b6abca7
8c86b5f
63fa5ba
d0d4f54
3b86627
6b83c67
66029f5
fce7a2c
b9defb8
636814e
495597e
ab71333
65e4458
68d1c68
8ba449e
2be1d22
814fcfa
8917ae2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -64,3 +64,4 @@ perf.data.old | |
| .idea/ | ||
| .vscode/ | ||
| centrallix-os/tmp/* | ||
| centrallix-os/datasets/ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like you need to run make in the centrallix-doc/Widgets directory to propagate the XML change to the relevant HTML files. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,148 @@ | ||
| #ifndef CLUSTERS_H | ||
| #define CLUSTERS_H | ||
|
|
||
| /************************************************************************/ | ||
| /* Centrallix Application Server System */ | ||
| /* Centrallix Core */ | ||
| /* */ | ||
| /* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe this is meant to be through the year the file was last changed (though it seems we generally do a bad job of updating it). I would at least put it through the date it was created, though. |
||
| /* */ | ||
| /* This program is free software; you can redistribute it and/or modify */ | ||
| /* it under the terms of the GNU General Public License as published by */ | ||
| /* the Free Software Foundation; either version 2 of the License, or */ | ||
| /* (at your option) any later version. */ | ||
| /* */ | ||
| /* This program is distributed in the hope that it will be useful, */ | ||
| /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
| /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
| /* GNU General Public License for more details. */ | ||
| /* */ | ||
| /* You should have received a copy of the GNU General Public License */ | ||
| /* along with this program; if not, write to the Free Software */ | ||
| /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ | ||
| /* 02111-1307 USA */ | ||
| /* */ | ||
| /* A copy of the GNU General Public License has been included in this */ | ||
| /* distribution in the file "COPYING". */ | ||
| /* */ | ||
| /* Module: lib_cluster.c, lib_cluster.h */ | ||
| /* Author: Israel Fuller */ | ||
| /* Creation: September 29, 2025 */ | ||
| /* Description: Clustering library used to cluster and search data with */ | ||
| /* cosine or Levenshtein (aka. edit distance) similarity */ | ||
| /* measures. Used by the "clustering driver". */ | ||
| /* For more information on how to use this library, see */ | ||
| /* string-similarity.md in the centrallix-sysdoc folder. */ | ||
| /************************************************************************/ | ||
|
|
||
| #include <stdlib.h> | ||
| #include <stdbool.h> | ||
|
|
||
| #ifdef CXLIB_INTERNAL | ||
| #include "xarray.h" | ||
| #else | ||
| #include "cxlib/xarray.h" | ||
| #endif | ||
|
|
||
| /** This file has additional documentation in string_similarity.md. **/ | ||
|
|
||
|
|
||
| /*** This value defines the number of dimensions used for a sparse | ||
| *** vector. The higher the number, the fewer collisions will be | ||
| *** encountered when using these vectors for cosine comparisons. | ||
| *** This is also called the vector table size, if viewing the | ||
| *** vector as a hash table of character pairs. | ||
| *** | ||
| *** 2147483647 is the signed int max, and is also a prime number. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this should be 2147483647, unless you meant the largest prime less than the signed int max? |
||
| *** Using this value ensures that the longest run of 0s will not | ||
| *** cause an int underflow with the current encoding scheme. | ||
| *** | ||
| *** Unfortunately, we can't use a number this large yet because | ||
| *** the kmeans algorithm creates densely allocated centroids with | ||
| *** `CA_NUM_DIMS` dimensions, so a large number causes it to fail. | ||
| *** Thus, we use 251 as the largest prime number less than 256, | ||
| *** giving us a decent balance between collision reduction and | ||
| *** kmeans centroid performance/memory overhead. | ||
| ***/ | ||
| #define CA_NUM_DIMS 251 | ||
|
|
||
| /** The character used to create a pair with the first and last characters of a string. **/ | ||
| /** The whole expansion is parenthesized so the cast stays a single unit wherever the macro is used (e.g. under sizeof). **/ | ||
| #define CA_BOUNDARY_CHAR ((unsigned char)('a' - 1)) | ||
|
|
||
| /** Types. **/ | ||
| typedef int* pVector; /* Sparse vector. */ | ||
| typedef double* pCentroid; /* Dense centroid. */ | ||
| /** Size in bytes of one dense centroid: CA_NUM_DIMS doubles. Parenthesized so it expands safely inside larger expressions. **/ | ||
| #define pCentroidSize (CA_NUM_DIMS * sizeof(double)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider renaming it to I think it would be best to wrap the value in parenthesis. It seems unlikely that any of the C operators with precedence above multiplication would be used on the value, but it's best practice nonetheless. |
||
|
|
||
| /** Duplicate information. **/ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. *duplicate |
||
| typedef struct | ||
| { | ||
| void* key1; /* Opaque pointer for one member of the pair (presumably its key; confirm against the search functions). */ | ||
| void* key2; /* Opaque pointer for the other member of the pair (same assumption as key1). */ | ||
| double similarity; /* Presumably the similarity score computed for the pair; confirm range against the compare functions. */ | ||
| } | ||
| Dup, *pDup; | ||
|
|
||
| /** Registering all defined types for debugging. **/ | ||
| /** Wrapped in do/while(0) so the four registrations act as one statement, e.g. under an unbraced if. Invoke as `ca_init();`. **/ | ||
| #define ca_init() \ | ||

Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see the advantage of keeping the register logic near the relevant declarations, but I feel like it complicates things too. Consider changing this into a function. |
||
| do \ | ||
| { \ | ||
| nmRegister(sizeof(pVector), "pVector"); \ | ||
| nmRegister(sizeof(pCentroid), "pCentroid"); \ | ||
| nmRegister(pCentroidSize, "Centroid"); \ | ||
| nmRegister(sizeof(Dup), "Dup"); \ | ||
| } \ | ||
| while (0) | ||
|
|
||
| /** Edit distance function. **/ | ||
| int ca_edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length); | ||
|
|
||
| /** Vector functions. **/ | ||
| pVector ca_build_vector(const char* str); | ||
| unsigned int ca_sparse_len(const pVector vector); | ||
| void ca_print_vector(const pVector vector); | ||
| void ca_free_vector(pVector sparse_vector); | ||
|
|
||
| /** Kmeans function. **/ | ||
| int ca_kmeans( | ||
| pVector* vectors, | ||
| const unsigned int num_vectors, | ||
| const unsigned int num_clusters, | ||
| const unsigned int max_iter, | ||
| const double min_improvement, | ||
| unsigned int* labels, | ||
| double* vector_sims); | ||
|
|
||
| /** Vector helper macros. **/ | ||
| /** Evaluates to true when the first element of `vector` equals -CA_NUM_DIMS. The argument is parenthesized so expression arguments (e.g. `*list`) are indexed as a whole. **/ | ||
| #define ca_is_empty(vector) ((vector)[0] == -CA_NUM_DIMS) | ||
| #define ca_has_no_pairs(vector) \ | ||
| ({ \ | ||
| __typeof__ (vector) _v = (vector); \ | ||
| _v[0] == -172 && _v[1] == 11 && _v[2] == -78; \ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the value of CA_NUM_DIMS changes, this function will not work properly. It could also use a comment to explain what the 3 numbers are meant to represent (I assume a number of empty slots, the value produced by the pair made by start/end, and then the remaining empty slots) |
||
| }) | ||
|
|
||
| /** Comparison functions (see ca_search()). **/ | ||
| double ca_cos_compare(void* v1, void* v2); | ||
| double ca_lev_compare(void* str1, void* str2); | ||
| bool ca_eql(pVector v1, pVector v2); | ||
|
|
||
| /** Similarity search functions. **/ | ||
| void* ca_most_similar( | ||
| void* target, | ||
| void** data, | ||
| const unsigned int num_data, | ||
| const double (*similarity)(void*, void*), | ||
| const double threshold); | ||
| pXArray ca_sliding_search( | ||
| void** data, | ||
| const unsigned int num_data, | ||
| const unsigned int window_size, | ||
| const double (*similarity)(void*, void*), | ||
| const double dupe_threshold, | ||
| void** maybe_keys, | ||
| pXArray dups); | ||
| pXArray ca_complete_search( | ||
| void** data, | ||
| const unsigned int num_data, | ||
| const double (*similarity)(void*, void*), | ||
| const double dupe_threshold, | ||
| void** maybe_keys, | ||
| pXArray dups); | ||
|
|
||
| #endif /* End of .h file. */ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| #ifndef GLYPH_H | ||
| #define GLYPH_H | ||
|
|
||
| /************************************************************************/ | ||
| /* Centrallix Application Server System */ | ||
| /* Centrallix Core */ | ||
| /* */ | ||
| /* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */ | ||
| /* */ | ||
| /* This program is free software; you can redistribute it and/or modify */ | ||
| /* it under the terms of the GNU General Public License as published by */ | ||
| /* the Free Software Foundation; either version 2 of the License, or */ | ||
| /* (at your option) any later version. */ | ||
| /* */ | ||
| /* This program is distributed in the hope that it will be useful, */ | ||
| /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ | ||
| /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ | ||
| /* GNU General Public License for more details. */ | ||
| /* */ | ||
| /* You should have received a copy of the GNU General Public License */ | ||
| /* along with this program; if not, write to the Free Software */ | ||
| /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ | ||
| /* 02111-1307 USA */ | ||
| /* */ | ||
| /* A copy of the GNU General Public License has been included in this */ | ||
| /* distribution in the file "COPYING". */ | ||
| /* */ | ||
| /* Module: glyph.h */ | ||
| /* Author: Israel Fuller */ | ||
| /* Creation: October 27, 2025 */ | ||
| /* Description: A simple debug visualizer to make pretty patterns in */ | ||
| /* developer's terminal which can be surprisingly useful */ | ||
| /* for debugging algorithms. */ | ||
| /************************************************************************/ | ||
|
|
||
| #include <stdlib.h> | ||
|
|
||
| /** Uncomment to activate glyphs. **/ | ||
| /** Should not be enabled in production code on the master branch. */ | ||
| // #define ENABLE_GLYPHS | ||
|
|
||
| #ifdef ENABLE_GLYPHS | ||
| /** The enabled macros use bool, printf(), fflush(), and stdout, so pull in their headers here. **/ | ||
| #include <stdbool.h> | ||
| #include <stdio.h> | ||
|
|
||
| /** Print a string to stdout. Note that the expansion carries its own trailing semicolon. **/ | ||
| #define glyph_print(s) printf("%s", (s)); | ||
|
|
||
||
| /*** Initialize a simple debug visualizer to make pretty patterns in the | ||
| *** developer's terminal. Great for when you need to run a long task and | ||
| *** want a super simple way to make sure it's still working. | ||
| *** | ||
| *** @attention - Relies on storing data in variables in scope, so calling | ||
| *** glyph() requires a call to glyph_init() previously in the same scope. | ||
| *** | ||
| *** @attention - `interval` must be nonzero; glyph() uses it as a modulus. | ||
| *** | ||
| *** @attention - Expands to four declarations and ends with its own | ||
| *** semicolon, so it can only be used where declarations are allowed. | ||
| *** | ||
| *** @param name The symbol name of the visualizer. | ||
| *** @param str The string printed for the visualization. | ||
| *** @param interval The number of invocations of glyph() required to print. | ||
| *** @param flush Whether to flush on output. | ||
| ***/ | ||
| #define glyph_init(name, str, interval, flush) \ | ||
| const char* vis_##name##_str = (str); \ | ||
| const unsigned int vis_##name##_interval = (interval); \ | ||
| const bool vis_##name##_flush = (flush); \ | ||
| unsigned int vis_##name##_i = 0u; | ||
|
|
||
||
| /*** Invoke a visualizer: increments its counter and, on every | ||
| *** `interval`-th invocation, prints its string (flushing stdout if the | ||
| *** visualizer was initialized with flush enabled). | ||
| *** | ||
| *** @attention - Expands to a bare `if` block, so do not place it directly | ||
| *** before an `else` or as the body of an unbraced `if`. | ||
| *** | ||
| *** @param name The name of the visualizer to invoke. | ||
| ***/ | ||
| #define glyph(name) \ | ||
| if (++vis_##name##_i % vis_##name##_interval == 0) \ | ||
| { \ | ||
| glyph_print(vis_##name##_str); \ | ||
| if (vis_##name##_flush) fflush(stdout); \ | ||
| } | ||
| #else | ||
| /** Glyphs disabled: every macro expands to nothing. **/ | ||
| #define glyph_print(str) | ||
| #define glyph_init(name, str, interval, flush) | ||
| #define glyph(name) | ||
| #endif | ||
|
|
||
| #endif /* End of .h file. */ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The centrallix-os/datasets folder does not exist in the repo, and it's not used anywhere by anything in centrallix that I can tell. I would recommend either removing it from the git ignore if it's only used by you, or if it's needed by something (perhaps a Kardia branch?) adding a simple readme so the folder exists in the repo.