From 5f2e901a458d4d69beeb46735879389b37d238b5 Mon Sep 17 00:00:00 2001 From: Israel Date: Mon, 13 Oct 2025 09:53:43 -0600 Subject: [PATCH 01/30] Checkpoint: Switching to DM UI project. --- .gitignore | 1 + centrallix-lib/Makefile.in | 4 +- centrallix-lib/include/clusters.h | 93 + centrallix-lib/include/util.h | 109 +- centrallix-lib/include/xhash.h | 4 +- centrallix-lib/src/clusters.c | 989 +++++ centrallix-lib/src/util.c | 170 +- centrallix-lib/src/xhash.c | 70 + centrallix-os/cluster-schema.cluster | 176 + centrallix-os/file.cluster | 64 + centrallix-os/file2.cluster | 42 + centrallix-sysdoc/OSDriver_Authoring.md | 99 +- centrallix-sysdoc/string_comparison.md | 12 +- centrallix/Makefile.in | 3 + centrallix/centrallix.c | 2 +- centrallix/etc/types.cfg | 1 + centrallix/expression/exp_compiler.c | 17 +- centrallix/expression/exp_double_metaphone.c | 1517 ++++++++ centrallix/expression/exp_functions.c | 1819 ++++++++- centrallix/include/cxss/policy.h | 2 +- centrallix/include/expression.h | 1 + centrallix/include/stparse.h | 2 +- centrallix/osdrivers/objdrv_cluster.c | 3345 +++++++++++++++++ .../tests/test_expfn_double_metaphone_00.cmp | 140 + .../tests/test_expfn_double_metaphone_00.to | 161 + 25 files changed, 8751 insertions(+), 92 deletions(-) create mode 100644 centrallix-lib/include/clusters.h create mode 100644 centrallix-lib/src/clusters.c create mode 100644 centrallix-os/cluster-schema.cluster create mode 100644 centrallix-os/file.cluster create mode 100644 centrallix-os/file2.cluster create mode 100644 centrallix/expression/exp_double_metaphone.c create mode 100644 centrallix/osdrivers/objdrv_cluster.c create mode 100644 centrallix/tests/test_expfn_double_metaphone_00.cmp create mode 100644 centrallix/tests/test_expfn_double_metaphone_00.to diff --git a/.gitignore b/.gitignore index cbfe20f1d..bddd6b099 100644 --- a/.gitignore +++ b/.gitignore @@ -62,3 +62,4 @@ perf.data.old .idea/ .vscode/ centrallix-os/tmp/* +centrallix-os/datasets/ diff --git 
a/centrallix-lib/Makefile.in b/centrallix-lib/Makefile.in index a7197622b..20c57c11f 100644 --- a/centrallix-lib/Makefile.in +++ b/centrallix-lib/Makefile.in @@ -63,10 +63,10 @@ CFLAGS=@CFLAGS@ @DEFS@ -Iinclude -DCXLIB_INTERNAL -DNM_USE_SYSMALLOC -Wall $(PRO MTCFLAGS=@CFLAGS@ @DEFS@ -Iinclude -DCXLIB_INTERNAL -DNM_USE_SYSMALLOC -Wall $(PROFILE) $(COVERAGE) -g -O0 TCFLAGS=$(patsubst -DNDEBUG,,$(CFLAGS)) -XSTATICFILES=mtask.o mtlexer.o memstr.o xarray.o xhash.o xstring.o mtsession.o newmalloc.o xhashqueue.o bdqs_transport.o xhandle.o xringqueue.o cxsec.o smmalloc.o qprintf.o strtcpy.o util.o +XSTATICFILES=mtask.o mtlexer.o memstr.o xarray.o xhash.o xstring.o mtsession.o newmalloc.o xhashqueue.o bdqs_transport.o xhandle.o xringqueue.o cxsec.o smmalloc.o clusters.o qprintf.o strtcpy.o util.o STATICFILES=$(patsubst %,src/%,$(XSTATICFILES)) -XDYNAMICFILES=mtask.lo mtlexer.lo memstr.lo xarray.lo xhash.lo xstring.lo mtsession.lo newmalloc.lo xhashqueue.lo bdqs_transport.lo xhandle.lo xringqueue.lo cxsec.lo smmalloc.lo qprintf.lo strtcpy.lo util.lo +XDYNAMICFILES=mtask.lo mtlexer.lo memstr.lo xarray.lo xhash.lo xstring.lo mtsession.lo newmalloc.lo xhashqueue.lo bdqs_transport.lo xhandle.lo xringqueue.lo cxsec.lo smmalloc.lo clusters.o qprintf.lo strtcpy.lo util.lo DYNAMICFILES=$(patsubst %,src/%,$(XDYNAMICFILES)) INCLUDEFILES:=$(wildcard include/*.h) diff --git a/centrallix-lib/include/clusters.h b/centrallix-lib/include/clusters.h new file mode 100644 index 000000000..2605b4314 --- /dev/null +++ b/centrallix-lib/include/clusters.h @@ -0,0 +1,93 @@ + +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Core */ +/* */ +/* Copyright (C) 1998-2012 LightSys Technology Services, Inc. 
*/ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". */ +/* */ +/* Module: lib_cluster.c */ +/* Author: Israel Fuller */ +/* Creation: September 29, 2025 */ +/* Description: Internal algorithms for the cluster object driver. */ +/* See centrallix-sysdoc/EAV_Pivot.md for more information. */ +/************************************************************************/ + +#include + +#ifdef CXLIB_INTERNAL +#include "xarray.h" +#else +#include "cxlib/xarray.h" +#endif + +#define CA_NUM_DIMS 251 /* aka. The vector table size. */ + +/** The character used to create a pair with the first and last characters of a string. **/ +#define CA_BOUNDARY_CHAR ('a' - 1) + +/** Types. **/ +typedef int* pVector; /* Sparse vector. */ +typedef double* pCentroid; /* Dense centroid. */ +#define pCentroidSize CA_NUM_DIMS * sizeof(double) + +/** Duplocate information. 
**/ +typedef struct + { + unsigned int id1; + unsigned int id2; + double similarity; + } + Dup, *pDup; + +pVector ca_build_vector(const char* str); +unsigned int ca_sparse_len(const pVector vector); +void ca_free_vector(pVector sparse_vector); +void ca_kmeans( + pVector* vectors, + const unsigned int num_vectors, + unsigned int* labels, + const unsigned int num_clusters, + const unsigned int max_iter, + const double improvement_threshold +); +pXArray ca_search( + pVector* vectors, + const unsigned int num_vectors, + const unsigned int* labels, + const double dupe_threshold +); +pXArray ca_lightning_search( + pVector* vectors, + const unsigned int num_vectors, + const double dupe_threshold +); +unsigned int ca_edit_dist( + const char* str1, + const char* str2, + const size_t str1_length, + const size_t str2_length +); +pXArray ca_phone_search( + char dataset[][10u], + const unsigned int dataset_size, + const double dupe_threshold +); +void ca_init(); diff --git a/centrallix-lib/include/util.h b/centrallix-lib/include/util.h index df4ba0d58..2b9d7b26f 100644 --- a/centrallix-lib/include/util.h +++ b/centrallix-lib/include/util.h @@ -21,14 +21,119 @@ extern "C" { #endif - int strtoi(const char *nptr, char **endptr, int base); unsigned int strtoui(const char *nptr, char **endptr, int base); + char* snprint_bytes(char* buf, const size_t buf_size, unsigned int bytes); + void fprint_mem(FILE* out); + + typedef struct + { + double start, end; + } + Timer, *pTimer; + + pTimer timer_init(pTimer timer); + pTimer timer_new(void); + pTimer timer_start(pTimer timer); + pTimer timer_stop(pTimer timer); + double timer_get(pTimer timer); + void timer_de_init(pTimer timer); + void timer_free(pTimer timer); #ifdef __cplusplus } #endif -#endif /* UTILITY_H */ +#ifndef __cplusplus + +/** TODO: Greg, is the __typeof__ syntax from GCC a portability concern? **/ + +/*** @brief Returns the smaller of two values. + *** + *** @param a The first value. + *** @param b The second value. 
+ *** @return The smaller of the two values. + *** + *** @note This macro uses GCC extensions to enusre type safety. + ***/ +#define min(a, b) \ + ({ \ + __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + (_a < _b) ? _a : _b; \ + }) + +/*** @brief Returns the larger of two values. + *** + *** @param a The first value. + *** @param b The second value. + *** @return The larger of the two values. + *** + *** @note This macro uses GCC extensions to enusre type safety. + ***/ +#define max(a, b) \ + ({ \ + __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + (_a > _b) ? _a : _b; \ + }) + +/** Error Handling. **/ +void fail(const char* function_name, int code); +/*** Helper function for compact error handling on library & system function calls. + *** Any non-zero value is treated as an error, exiting the program. + *** + *** @param result The result of the function we're checking. + *** @returns result + ***/ +#define check(result) \ + ({ \ + __typeof__ (result) _r = (result); \ + if (_r != 0) fail(#result, _r); \ + _r; \ + }) + +/*** Helper function for compact error handling on library & system function calls. + *** Any negative is treated as an error, exiting the program. + *** + *** @param result The result of the function we're checking. + *** @returns result + ***/ +#define check_neg(result) \ + ({ \ + __typeof__ (result) _r = (result); \ + if (_r < 0) fail(#result, _r); \ + _r; \ + }) + +/*** Helper function for compact error handling on library & system function calls. + *** Any value of -1 is treated as an error, exiting the program. + *** + *** @param result The result of the function we're checking. + *** @returns result + ***/ +#define check_strict(result) \ + ({ \ + __typeof__ (result) _r = (result); \ + if (_r == -1) fail(#result, _r); \ + _r; \ + }) + +/*** Helper function for compact error handling on library & system function calls. + *** Any null value is treated as an error, exiting the program. 
+ *** + *** @param result The result of the function we're checking + *** @returns result + ***/ +#define check_ptr(result) \ + ({ \ + __typeof__ (result) _r = (result); \ + if (_r == NULL) fail(#result, 0); \ + _r; \ + }) + +#endif /* __cplusplus */ + +#endif /* UTILITY_H */ diff --git a/centrallix-lib/include/xhash.h b/centrallix-lib/include/xhash.h index 1b5d8459a..65b900570 100644 --- a/centrallix-lib/include/xhash.h +++ b/centrallix-lib/include/xhash.h @@ -1,7 +1,6 @@ #ifndef _XHASH_H #define _XHASH_H - /************************************************************************/ /* Centrallix Application Server System */ /* Centrallix Base Library */ @@ -55,6 +54,7 @@ int xhAdd(pXHashTable this, char* key, char* data); int xhRemove(pXHashTable this, char* key); char* xhLookup(pXHashTable this, char* key); int xhClear(pXHashTable this, int (*free_fn)(), void* free_arg); +int xhForEach(pXHashTable this, int (*callback_fn)(pXHashEntry, void*), void* each_arg); +int xhClearKeySafe(pXHashTable this, void (*free_fn)(pXHashEntry, void*), void* free_arg); #endif /* _XHASH_H */ - diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c new file mode 100644 index 000000000..4e41d449d --- /dev/null +++ b/centrallix-lib/src/clusters.c @@ -0,0 +1,989 @@ + +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Core */ +/* */ +/* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". */ +/* */ +/* Module: lib_cluster.c */ +/* Author: Israel Fuller */ +/* Creation: September 29, 2025 */ +/* Description: Internal algorithms for the cluster object driver. */ +/* See centrallix-sysdoc/EAV_Pivot.md for more information. */ +/************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "clusters.h" +#include "newmalloc.h" +#include "util.h" +#include "xarray.h" + +/*** Gets the hash, representing a pair of ASCII characters, represented by unsigned ints. + *** Thank you to professor John Delano for this hashing algorithm. + *** + *** @param num1 The first character in the pair. + *** @param num1 The second character in the pair. + *** @returns The resulting hash. + ***/ +static unsigned int hash_char_pair(const unsigned int num1, const unsigned int num2) + { + if (num1 == CA_BOUNDARY_CHAR && num2 == CA_BOUNDARY_CHAR) + { + // fprintf(stderr, + // "hash_char_pair(%u, %u) - Warning: Pair of boundary characters.\n", + // num1, num2 + // ); + } + const double sum = (num1 * num1 * num1) + (num2 * num2 * num2); + const double scale = ((double)num1 + 1.0) / ((double)num2 + 1.0); + const unsigned int hash = (unsigned int)round(sum * scale) - 1u; + return hash % CA_NUM_DIMS; + } + +/*** Builds a vector using a string. + *** + *** Vectors are based on the frequencies of character pairs in the string. 
+ *** Space characters and punctuation characters (see code for list) are ignored, + *** and all characters are converted to lowercase. Character 96, which is just + *** before 'a' in the ASCII table (and maps to '`') is used to make pairs on the + *** start and end of strings. The only supported characters for the passed char* + *** are spaces, punctuation, uppercase and lowercase letters, and numbers. + *** + *** This results in the following modified ASCII table: + *** ```csv + *** #, char, #, char, #, char + *** 97, a, 109, m, 121, y + *** 98, b, 110, n, 122, z + *** 99, c, 111, o, 123, 0 + *** 100, d, 112, p, 124, 1 + *** 101, e, 113, q, 125, 2 + *** 102, f, 114, r, 126, 3 + *** 103, g, 115, s, 127, 4 + *** 104, h, 116, t, 128, 5 + *** 105, i, 117, u, 129, 6 + *** 106, j, 118, v, 130, 7 + *** 107, k, 119, w, 131, 8 + *** 108, l, 120, x, 132, 9 + *** ``` + *** Thus, any number from 96 (the start/end character) to 132 ('9') is a valid + *** input to get_char_pair_hash(). + *** + *** After hashing each character pair, we add some number from 1 to 13 to the + *** coresponding dimention. However, for most names, this results in a lot of + *** zeros and a FEW positive numbers. Thus, after creating the dense vector, + *** we convert it to a sparse vector in which a negative number replaces a run + *** of that many zeros. Consider the following example: + *** + *** Dense pVector: `[1,0,0,0,3,0]` + *** + *** Sparse pVector: `[1,-3,3,-1]` + *** + *** Using these sparse vectors greatly reduces the required memory and gives + *** aproximately an x5 boost to performance when traversing vectors, at the + *** cost of more algorithmically complex code. + *** + *** @param str The string to be divided into pairs and hashed to make the vector. + *** @returns The sparse vector built using the hashed character pairs. + ***/ +pVector ca_build_vector(const char* str) + { + /** Allocate space for a dense vector. 
**/ + unsigned int dense_vector[CA_NUM_DIMS] = {0u}; + + /** j is the former character, i is the latter. **/ + const unsigned int num_chars = (unsigned int)strlen(str); + for (unsigned int j = 65535u, i = 0u; i <= num_chars; i++) + { + /** isspace: space, \n, \v, \f, \r **/ + if (isspace(str[i])) continue; + + /** ispunct: !"#$%&'()*+,-./:;<=>?@[\]^_{|}~ **/ + if (ispunct(str[i]) && str[i] != CA_BOUNDARY_CHAR) continue; + + /*** iscntrl (0-8): NULL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS + *** (14-31): SO, SI, DLE, DC1-4, NAK, SYN, ETB, CAN EM, + *** SUB, ESC, FS, GS, RS, US + ***/ + if (iscntrl(str[i]) && i != num_chars) + { + fprintf(stderr, + "ca_build_vector(%s) - Warning: Skipping unknown character #%u.\n", + str, (unsigned int)str[i] + ); + continue; + } + + /** First and last character should fall one before 'a' in the ASCII table. **/ + unsigned int temp1 = (j == 65535u) ? CA_BOUNDARY_CHAR : (unsigned int)tolower(str[j]); + unsigned int temp2 = (i == num_chars) ? CA_BOUNDARY_CHAR : (unsigned int)tolower(str[i]); + + /** Shift numbers to the end of the lowercase letters. **/ + if ('0' <= temp1 && temp1 <= '9') temp1 += 75u; + if ('0' <= temp2 && temp2 <= '9') temp2 += 75u; + + /** Hash the character pair into an index (dimension). **/ + /** Note that temp will be between 97 ('a') and 132 ('9'). **/ + unsigned int dim = hash_char_pair(temp1, temp2); + + /** Increment the dimension of the dense vector by a number from 1 to 13. **/ + dense_vector[dim] += (temp1 + temp2) % 13u + 1u; + + j = i; + } + + /** Count how much space is needed for a sparse vector. **/ + bool zero_prev = false; + size_t size = 0u; + for (unsigned int dim = 0u; dim < CA_NUM_DIMS; dim++) + { + if (dense_vector[dim] == 0u) + { + size += (zero_prev) ? 0u : 1u; + zero_prev = true; + } + else + { + size++; + zero_prev = false; + } + } + + /*** Check compression size. + *** If this check fails, I doubt anything will break. 
However, the longest + *** word I know (supercalifragilisticexpialidocious) has only 35 character + *** pairs, so it shouldn't reach half this size (and it'd be even shorter + *** if the hash generates at least one collision). + *** + *** Bad vector compression will result in degraded performace and increased + *** memory usage. This indicates a likely bug in the code. Thus, if this + *** warning is ever generated, it is definitely worth investigating. + ***/ + const size_t expected_max_size = 64u; + if (size > expected_max_size) + { + fprintf(stderr, + "cli_build_vector(%s) - Warning: Sparse vector larger than expected.\n" + " > Size: %lu\n" + " > #Dims: %u\n", + str, + size, + CA_NUM_DIMS + ); + } + + /** Allocate space for sparse vector. **/ + const size_t sparse_vector_size = size * sizeof(int); + pVector sparse_vector = (pVector)nmSysMalloc(sparse_vector_size); + if (sparse_vector == NULL) + { + fprintf(stderr, + "cli_build_vector(%s) - nmSysMalloc(%lu) failed.\n", + str, sparse_vector_size + ); + return NULL; + } + + /** Convert the dense vector above to a sparse vector. **/ + unsigned int j = 0u, sparse_idx = 0u; + while (j < CA_NUM_DIMS) + { + if (dense_vector[j] == 0u) + { + /*** Count and store consecutive zeros, except the first one, + *** which we already know is zero. + ***/ + unsigned int zero_count = 1u; + j++; + while (j < CA_NUM_DIMS && dense_vector[j] == 0u) + { + zero_count++; + j++; + } + sparse_vector[sparse_idx++] = (int)-zero_count; + } + else + { + /** Store the value. **/ + sparse_vector[sparse_idx++] = (int)dense_vector[j++]; + } + } + + return sparse_vector; + } + +/*** Free memory allocated to store a sparse vector. + *** + *** @param sparse_vector The sparse vector being freed. + ***/ +void ca_free_vector(pVector sparse_vector) + { + nmSysFree(sparse_vector); + } + +/*** Compute the magnitude of a sparsely allocated vector. + *** + *** @param vector The vector. + *** @returns The computed magnitude. 
+ ***/ +static double magnitude_sparse(const pVector vector) + { + unsigned int magnitude = 0u; + for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;) + { + const int val = vector[i++]; + + /** Negative val represents -val 0s in the array, so skip that many values. **/ + if (val < 0) dim += (unsigned)(-val); + + /** We have a param_value, so square it and add it to the magnitude. **/ + else { magnitude += (unsigned)(val * val); dim++; } + } + return sqrt((double)magnitude); + } + +/*** Compute the length of a sparsely allocated vector. + *** + *** @param vector The vector. + *** @returns The computed length. + ***/ +unsigned int ca_sparse_len(const pVector vector) + { + unsigned int i = 0u; + for (unsigned int dim = 0u; dim < CA_NUM_DIMS;) + { + const int val = vector[i++]; + + /** Negative val represents -val 0s in the array, so skip that many values. **/ + if (val < 0) dim += (unsigned)(-val); + + /** We have a param_value, but we don't need to do anything with it. **/ + else dim++; + } + return i; + } + +/*** Compute the magnitude of a densely allocated centroid. + *** + *** @param centroid The centroid. + *** @returns The computed magnitude. + ***/ +static double magnitude_dense(const pCentroid centroid) + { + double magnitude = 0.0; + for (int i = 0; i < CA_NUM_DIMS; i++) + magnitude += centroid[i] * centroid[i]; + return sqrt(magnitude); + } + +/*** Parse a token from a sparsely allocated vector and write the param_value and + *** number of remaining values to the passed locations. + *** + *** @param token The sparse vector token being parsed. + *** @param remaining The location to save the remaining number of characters. + *** @param param_value The location to save the param_value of the token. + ***/ +static void parse_vector_token(const int token, unsigned int* remaining, unsigned int* param_value) + { + if (token < 0) + { + /** This run contains -token zeros. 
**/ + *remaining = (unsigned)(-token); + *param_value = 0u; + } + else + { + /** This run contains one param_value. **/ + *remaining = 1u; + *param_value = (unsigned)(token); + } + } + +/*** Calculate the similarity on sparcely allocated vectors. Comparing + *** any string to an empty string should always return 0.5 (untested). + *** + *** @param v1 Sparse vector #1. + *** @param v2 Sparse vector #2. + *** @returns Similarity between 0 and 1 where + *** 1 indicates identical and + *** 0 indicates completely different. + ***/ +static double sparse_similarity(const pVector v1, const pVector v2) + { + /** Calculate dot product. **/ + unsigned int vec1_remaining = 0u, vec2_remaining = 0u; + unsigned int dim = 0u, i1 = 0u, i2 = 0u, dot_product = 0u; + while (dim < CA_NUM_DIMS) + { + unsigned int val1 = 0u, val2 = 0u; + if (vec1_remaining == 0u) parse_vector_token(v1[i1++], &vec1_remaining, &val1); + if (vec2_remaining == 0u) parse_vector_token(v2[i2++], &vec2_remaining, &val2); + + /*** Accumulate the dot_product. If either vector is 0 here, + *** the total is 0 and this statement does nothing. + ***/ + dot_product += val1 * val2; + + /** Consume overlap from both runs. **/ + unsigned int overlap = min(vec1_remaining, vec2_remaining); + vec1_remaining -= overlap; + vec2_remaining -= overlap; + dim += overlap; + } + + /** Optional optimization to speed up nonsimilar vectors. **/ + if (dot_product == 0u) return 0.0; + + /** Return the difference score. **/ + return (double)dot_product / (magnitude_sparse(v1) * magnitude_sparse(v2)); + } + +/*** Calculate the difference on sparcely allocated vectors. Comparing + *** any string to an empty string should always return 0.5 (untested). + *** + *** @param v1 Sparse vector #1. + *** @param v2 Sparse vector #2. + *** @returns Similarity between 0 and 1 where + *** 1 indicates completely different and + *** 0 indicates identical. 
+ ***/ +#define sparse_dif(v1, v2) (1.0 - sparse_similarity(v1, v2)) + +/*** Calculate the similarity between a sparsely allocated vector + *** and a densely allocated centroid. Comparing any string to an + *** empty string should always return 0.5 (untested). + *** + *** @param v1 Sparse vector #1. + *** @param c1 Dense centroid #2. + *** @returns Similarity between 0 and 1 where + *** 1 indicates identical and + *** 0 indicates completely different. + ***/ +static double sparse_similarity_to_centroid(const pVector v1, const pCentroid c2) + { + /** Calculate dot product. **/ + double dot_product = 0.0; + for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;) + { + const int val = v1[i++]; + + /** Negative val represents -val 0s in the array, so skip that many values. **/ + if (val < 0) dim += (unsigned)(-val); + + /** We have a param_value, so square it and add it to the magnitude. **/ + else dot_product += (double)val * c2[dim++]; + } + + /** Return the difference score. **/ + return dot_product / (magnitude_sparse(v1) * magnitude_dense(c2)); + } + +/*** Calculate the difference between a sparsely allocated vector + *** and a densely allocated centroid. Comparing any string to an + *** empty string should always return 0.5 (untested). + *** + *** @param v1 Sparse vector #1. + *** @param c1 Dense centroid #2. + *** @returns Difference between 0 and 1 where + *** 1 indicates completely different and + *** 0 indicates identical. + ***/ +#define sparse_dif_to_centroid(v1, c2) (1.0 - sparse_similarity_to_centroid(v1, c2)) + +/*** Calculate the average size of all clusters in a set of vectors. + *** + *** @param vectors The vectors of the dataset (allocated sparsely). + *** @param num_vectors The number of vectors in the dataset. + *** @param labels The clusters to which vectors are assigned. + *** @param centroids The locations of the centroids (allocated densely). + *** @param num_clusters The number of centroids (k). + *** @returns The average cluster size. 
+ ***/ +static double get_cluster_size( + pVector* vectors, + const unsigned int num_vectors, + unsigned int* labels, + pCentroid* centroids, + const unsigned int num_clusters) + { + /** Could be up to around 1KB on the stack, but I think that's fine. **/ + double cluster_sums[num_clusters]; + unsigned int cluster_counts[num_clusters]; + for (unsigned int i = 0u; i < num_clusters; i++) + cluster_sums[i] = 0.0; + memset(cluster_counts, 0, sizeof(cluster_counts)); + + /** Sum the difference from each vector to its cluster centroid. **/ + for (unsigned int i = 0u; i < num_vectors; i++) + { + const unsigned int label = labels[i]; + cluster_sums[label] += sparse_dif_to_centroid(vectors[i], centroids[label]); + cluster_counts[label]++; + } + + /** Add up the average cluster size. **/ + double cluster_total = 0.0; + unsigned int num_valid_clusters = 0u; + for (unsigned int label = 0u; label < num_clusters; label++) + { + const unsigned int cluster_count = cluster_counts[label]; + if (cluster_count == 0u) continue; + + cluster_total += cluster_sums[label] / cluster_count; + num_valid_clusters++; + } + + /** Return average sizes. **/ + return cluster_total / num_valid_clusters; + } + +/*** Compute the param_value for `k` (number of clusters), given a dataset of with + *** a size of `n`. + *** + *** The following table shows data sizes vs.selected cluster size. In testing, + *** these numbers tended to givea good balance of accuracy and dulocates detected. + *** + *** ```csv + *** Data Size, Actual + *** 10k, 12 + *** 100k, 33 + *** 1M, 67 + *** 4M, 93 + *** ``` + *** + *** This function is not intended for datasets smaller than (`n < ~2000`). + *** These should be handled using complete search. + *** + *** LaTeX Notation: \log_{36}\left(n\right)^{3.1}-8 + *** + *** @param n The size of the dataset. + *** @returns k, the number of clusters to use. 
+ *** + *** Complexity: `O(1)` + ***/ +unsigned int compute_k(const unsigned int n) + { + return (unsigned)max(2, pow(log(n) / log(36), 3.2) - 8); + } + +/*** Executes the k-means clustering algorithm. Selects NUM_CLUSTERS random + *** vectors as initial centroids. Then points are assigned to the nearest + *** centroid, after which centroids are moved to the center of their points. + *** + *** @param vectors The vectors to cluster. + *** @param num_vectors The number of vectors to cluster. + *** @param labels Stores the final cluster identities of the vectors after + *** clustering is completed. + *** @param centroids Stores the locations of the centroids used for the clusters + *** of the data. + *** @param iterations The number of iterations that actually executed is stored + *** here. Leave this NULL if you don't care. + *** @param max_iter The max number of iterations. + *** @param num_clusters The number of clusters to generate. + *** + *** @attention - Assumes: num_vectors is the length of vectors. + *** @attention - Assumes: num_clusters is the length of labels. + *** + *** @attention - Issue: At larger numbers of clustering iterations, some + *** clusters have a size of negative infinity. In this implementation, + *** the bug is mitigated by setting a small number of max iterations, + *** such as 16 instead of 100. + *** @attention - Issue: Clusters do not apear to improve much after the first + *** iteration, which puts the efficacy of the algorithm into question. This + *** may be due to the uneven density of a typical dataset. However, the + *** clusters still offer useful information. 
+ *** + *** Complexity: + *** + *** - `O(kd + k + i*(k + n*(k+d) + kd))` + *** + *** - `O(kd + k + ik + ink + ind + ikd)` + *** + *** - `O(nk + nd)` + ***/ +void ca_kmeans( + pVector* vectors, + const unsigned int num_vectors, + unsigned int* labels, + const unsigned int num_clusters, + const unsigned int max_iter, + const double improvement_threshold) + { + /** Ensure labels is clean. **/ + memset(labels, 0, num_clusters * sizeof(unsigned int)); + + /** Allocate space to store centroids and new_centroids. **/ + /** Dynamic allocation is required because these densely allocated arrays might be up to 500KB! **/ + pCentroid* centroids = (pCentroid*)nmMalloc(num_clusters * sizeof(pCentroid)); + if (centroids == NULL) + { + fprintf(stderr, "ca_kmeans() - nmMalloc(%lu) failed.\n", num_clusters * sizeof(pCentroid)); + assert(false); + } + pCentroid* new_centroids = (pCentroid*)nmMalloc(num_clusters * sizeof(pCentroid)); + if (new_centroids == NULL) + { + fprintf(stderr, "ca_kmeans() - nmMalloc(%lu) failed.\n", num_clusters * sizeof(pCentroid)); + assert(false); + } + for (unsigned int i = 0u; i < num_clusters; i++) + { + /** Malloc each centroid. **/ + centroids[i] = (pCentroid)nmMalloc(pCentroidSize); + if (centroids[i] == NULL) + { + fprintf(stderr, "ca_kmeans() - nmMalloc(%lu) failed.\n", pCentroidSize); + assert(false); + } + memset(centroids[i], 0, pCentroidSize); + + /** Malloc each new centroid. **/ + new_centroids[i] = (pCentroid)nmMalloc(pCentroidSize); + if (new_centroids[i] == NULL) + { + fprintf(stderr, "ca_kmeans() - nmMalloc(%lu) failed.\n", pCentroidSize); + assert(false); + } + memset(new_centroids[i], 0, pCentroidSize); + } + + /** Select random vectors to use as the initial centroids. **/ + srand(time(NULL)); + for (unsigned int i = 0u; i < num_clusters; i++) + { + // Pick a random vector. + const pVector vector = vectors[rand() % num_vectors]; + + // Sparse copy the vector to expand it into a densely allocated centroid. 
+ pCentroid centroid = centroids[i]; + for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;) + { + const int token = vector[i++]; + if (token > 0) centroid[dim++] = (double)token; + else for (unsigned int j = 0u; j < -token; j++) centroid[dim++] = 0.0; + } + } + + /** Main kmeans loop. **/ + double old_average_cluster_size = 1.0; + unsigned int cluster_counts[num_clusters]; + for (unsigned int iter = 0u; iter < max_iter; iter++) + { + bool changed = false; + + /** Reset new centroids. **/ + for (unsigned int i = 0u; i < num_clusters; i++) + { + cluster_counts[i] = 0u; + for (unsigned int dim = 0; dim < CA_NUM_DIMS; dim++) + new_centroids[i][dim] = 0.0; + } + + /** Assign each point to the nearest centroid. **/ + for (unsigned int i = 0u; i < num_vectors; i++) + { + const pVector vector = vectors[i]; + double min_dist = DBL_MAX; + unsigned int best_centroid_label = 0u; + + // Find nearest centroid. + for (unsigned int j = 0u; j < num_clusters; j++) + { + const double dist = sparse_dif_to_centroid(vector, centroids[j]); + if (dist < min_dist) + { + min_dist = dist; + best_centroid_label = j; + } + } + + /** Update label to new centroid, if necessary. **/ + if (labels[i] != best_centroid_label) + { + labels[i] = best_centroid_label; + changed = true; + } + + /** Accumulate values for new centroid calculation. **/ + pCentroid best_centroid = new_centroids[best_centroid_label]; + for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;) + { + const int val = vector[i++]; + if (val < 0) dim += (unsigned)(-val); + else best_centroid[dim++] += (double)val; + } + cluster_counts[best_centroid_label]++; + } + + /** Stop if centroids didn't change. **/ + if (!changed) break; + + /** Update centroids. 
**/ + for (unsigned int i = 0u; i < num_clusters; i++) + { + if (cluster_counts[i] == 0u) continue; + pCentroid centroid = centroids[i]; + const pCentroid new_centroid = new_centroids[i]; + const unsigned int cluster_count = cluster_counts[i]; + for (unsigned int dim = 0u; dim < CA_NUM_DIMS; dim++) + centroid[dim] = new_centroid[dim] / cluster_count; + } + + /** Is there enough improvement? **/ + const double average_cluster_size = get_cluster_size(vectors, num_vectors, labels, centroids, num_clusters); + const double improvement = old_average_cluster_size - average_cluster_size; + if (improvement < improvement_threshold) break; + old_average_cluster_size = average_cluster_size; + } + + /** Clean up. **/ + for (unsigned int i = 0u; i < num_clusters; i++) + { + nmFree(centroids[i], pCentroidSize); + nmFree(new_centroids[i], pCentroidSize); + } + nmFree(centroids, num_clusters * sizeof(pCentroid)); + nmFree(new_centroids, num_clusters * sizeof(pCentroid)); + } + +pXArray ca_search( + pVector* vectors, + const unsigned int num_vectors, + const unsigned int* labels, + const double dupe_threshold) + { + /** Allocate space for dups. **/ + pXArray dups = xaNew(num_vectors); + if (dups == NULL) + { + fprintf(stderr, "ca_search() - xaNew(%u) failed.\n", num_vectors); + return NULL; + } + + unsigned int a = 0, b = 0, c = 0, d = 0; + for (unsigned int i = 0u; i < num_vectors; i++) + { + const pVector v1 = vectors[i]; + const unsigned int label = labels[i]; + for (unsigned int j = i + 1u; j < num_vectors; j++) + { + if (b++ % 100 == 0) printf("."); + if (labels[j] != label) continue; + if (c++ % 100 == 0) printf(":"); + const pVector v2 = vectors[j]; + const double similarity = sparse_similarity(v1, v2); + if (similarity > dupe_threshold) /* Dup found! 
*/ + { + Dup* dup = (Dup*)nmMalloc(sizeof(Dup)); + if (dup == NULL) + { + fprintf(stderr, + "ca_search() - nmMalloc(%lu) failed.\n", + sizeof(Dup) + ); + goto err_free_dups; + } + + dup->id1 = i; + dup->id2 = j; + dup->similarity = similarity; + xaAddItem(dups, (void*)dup); + if (d++ % 4 == 0) printf("!"); + } + } + if (a++ % 4 == 0) printf("\n"); + } + + return dups; + + /** Free dups. **/ + err_free_dups:; + const size_t num_dups = dups->nItems; + for (unsigned int i = 0u; i < num_dups; i++) + { + nmFree(dups->Items[i], sizeof(Dup)); + dups->Items[i] = NULL; + } + xaDeInit(dups); + return NULL; + } + +/*** Runs complete search to find duplocates if `num_vectors < MAX_COMPLETE_SEARCH` + *** and runs a search using k-means clustering on larger amounts of data. + *** + *** @param vectors Array of precomputed frequency vectors for all dataset strings. + *** @param num_vectors The number of vectors to be scanned. + *** @param dupe_threshold The similarity threshold, below which dups are ignored. + *** @returns The duplicates in pDup structs. + ***/ +pXArray ca_lightning_search(pVector* vectors, const unsigned int num_vectors, const double dupe_threshold) + { + /** Allocate space for dups. **/ + const size_t guess_size = num_vectors * 2u; + pXArray dups = xaNew(guess_size); + if (dups == NULL) + { + fprintf(stderr, "ca_lightning_search() - xaNew(%lu) failed.\n", guess_size); + return NULL; + } + + /** Descide which algorithm to use. **/ + if (num_vectors <= 50 * 1000) + { /** Do a complete search. **/ + for (unsigned int i = 0u; i < num_vectors; i++) + { + const pVector v1 = vectors[i]; + for (unsigned int j = i + 1u; j < num_vectors; j++) + { + const pVector v2 = vectors[j]; + const double similarity = sparse_similarity(v1, v2); + if (similarity > dupe_threshold) // Dup found! 
+                    {
+                    Dup* dup = (Dup*)nmMalloc(sizeof(Dup));
+                    if (dup == NULL)
+                        {
+                        fprintf(stderr, "ca_lightning_search() - nmMalloc(%lu) failed.\n", sizeof(Dup));
+                        goto err_free_dups;
+                        }
+
+                    dup->id1 = i;
+                    dup->id2 = j;
+                    dup->similarity = similarity;
+                    xaAddItem(dups, (void*)dup);
+                    }
+                }
+            }
+        }
+    else
+        { /** Do a k-means search. **/
+        /** Define constants for the algorithm. **/
+        const unsigned int max_iter = 64u; /** Hardcode value because idk. **/
+        const unsigned int num_clusters = compute_k(num_vectors);
+
+        /** Allocate static memory for finding clusters. **/
+        unsigned int labels[num_vectors];
+        memset(labels, 0u, sizeof(labels));
+
+        /** Execute kmeans clustering. **/
+        ca_kmeans(vectors, num_vectors, labels, num_clusters, max_iter, 0.0002);
+
+        /** Find duplicates in clusters. **/
+        for (unsigned int i = 0u; i < num_vectors; i++)
+            {
+            const pVector v1 = vectors[i];
+            const unsigned int label = labels[i];
+            for (unsigned int j = i + 1u; j < num_vectors; j++)
+                {
+                if (labels[j] != label) continue;
+                const pVector v2 = vectors[j];
+                const double similarity = sparse_similarity(v1, v2);
+                if (similarity > dupe_threshold) /* Dup found! */
+                    {
+                    Dup* dup = (Dup*)nmMalloc(sizeof(Dup));
+                    if (dup == NULL)
+                        {
+                        fprintf(stderr,
+                            "ca_lightning_search() - nmMalloc(%lu) failed.\n",
+                            sizeof(Dup)
+                        );
+                        goto err_free_dups;
+                        }
+
+                    dup->id1 = i;
+                    dup->id2 = j;
+                    dup->similarity = similarity;
+                    xaAddItem(dups, (void*)dup);
+                    }
+                }
+            }
+        }
+
+    /** Done **/
+    return dups;
+
+    /** Free dups. **/
+    err_free_dups:;
+    const size_t num_dups = dups->nItems;
+    for (unsigned int i = 0u; i < num_dups; i++)
+        {
+        nmFree(dups->Items[i], sizeof(Dup));
+        dups->Items[i] = NULL;
+        }
+    xaDeInit(dups);
+    return NULL;
+    }
+
+/*** Computes Levenshtein distance between two strings.
+ ***
+ *** @param str1 The first string.
+ *** @param str2 The second string.
+ *** @param length1 The length of the first string.
+ *** @param length2 The length of the second string.
+ ***
+ *** @attention - Tip: Pass 0 for the length of either string to infer it
+ ***     using the null terminating character. Thus, strings with no null
+ ***     terminator are supported if you pass explicit lengths.
+ ***
+ *** Complexity: O(length1 * length2).
+ ***
+ *** @see centrallix-sysdoc/string_comparison.md
+ ***/
+unsigned int ca_edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length)
+    {
+    /*** lev_matrix:
+     *** For all i and j, d[i][j] will hold the Levenshtein distance between
+     *** the first i characters of s and the first j characters of t.
+     ***
+     *** As they say, no dynamic programming algorithm is complete without a
+     *** matrix that you fill out and it has the answer in the final location.
+     ***/
+    const size_t str1_len = (str1_length == 0u) ? strlen(str1) : str1_length;
+    const size_t str2_len = (str2_length == 0u) ? strlen(str2) : str2_length;
+    unsigned int lev_matrix[str1_len + 1][str2_len + 1];
+
+    /*** Base case #0:
+     ***     Transforming an empty string into an empty string has 0 cost.
+     ***/
+    lev_matrix[0][0] = 0u;
+
+    /*** Base case #1:
+     ***     Any source prefix can be transformed into an empty string by
+     ***     dropping each character.
+     ***/
+    for (unsigned int i = 1u; i <= str1_len; i++)
+        lev_matrix[i][0] = i;
+
+    /*** Base case #2:
+     ***     An empty string can be transformed into any target prefix by
+     ***     inserting each character.
+     ***/
+    for (unsigned int j = 1u; j <= str2_len; j++)
+        lev_matrix[0][j] = j;
+
+    /** General Case **/
+    for (unsigned int i = 1u; i <= str1_len; i++)
+        {
+        for (unsigned int j = 1u; j <= str2_len; j++)
+            {
+            /** Equal characters need no changes. **/
+            if (str1[i - 1] == str2[j - 1])
+                lev_matrix[i][j] = lev_matrix[i - 1][j - 1];
+
+            /*** We need to make a change, so use the operation with the
+             *** lowest cost out of delete, insert, replace, or swap.
+ ***/ + else + { + unsigned int cost_delete = lev_matrix[i - 1][j] + 1u; + unsigned int cost_insert = lev_matrix[i][j - 1] + 1u; + unsigned int cost_replace = lev_matrix[i-1][j-1] + 1u; + + /** If a swap is possible, calculate the cost. **/ + bool can_swap = ( + i > 1 && j > 1 && + str1[i - 1] == str2[j - 2] && + str1[i - 2] == str2[j - 1] + ); + unsigned int cost_swap = (can_swap) ? lev_matrix[i - 2][j - 2] + 1 : UINT_MAX; + + // Find the best operation. + lev_matrix[i][j] = min(min(min(cost_delete, cost_insert), cost_replace), cost_swap); + } + } + } + + return lev_matrix[str1_len][str2_len]; + } + +/*** Runs complete search to find duplocates in phone numbers using the + *** levenshtein min edit distance algorithm. + *** + *** @param dataset An array of characters for all dataset strings. + *** @param dataset_size The number of phone numbers to be scanned. + *** @param dupe_threshold The similarity threshold, below which dups are ignored. + *** @returns The duplicates in pDup structs. + ***/ +pXArray ca_phone_search(char dataset[][10u], const unsigned int dataset_size, const double dupe_threshold) + { + /** Allocate space for dups. **/ + const size_t guess_size = dataset_size * 2u; + pXArray dups = xaNew(guess_size); + if (dups == NULL) + { + fprintf(stderr, "ca_phone_search() - xaNew(%lu) failed.\n", guess_size); + return NULL; + } + + /** Search for dups using edit distance. **/ + for (unsigned int i = 0u; i < dataset_size; i++) + { + const char* v1 = dataset[i]; + for (unsigned int j = i + 1u; j < dataset_size; j++) + { + const char* v2 = dataset[j]; + const unsigned int dist = ca_edit_dist(v1, v2, 10u, 10u); + const double similarity = (double)dist / 10.0; + if (similarity > dupe_threshold) /* Dup found! */ + { + Dup* dup = (Dup*)nmMalloc(sizeof(Dup)); + if (dup == NULL) + { + fprintf(stderr, "ca_phone_search() - nmMalloc(%lu) failed.\n", sizeof(Dup)); + + /** Free data before returning. 
**/ + const size_t num_dups = dups->nItems; + for (unsigned int i = 0u; i < num_dups; i++) + { + void* dup = dups->Items[i]; + nmFree(dup, sizeof(Dup)); + } + xaDeInit(dups); + return NULL; + } + + dup->id1 = i; + dup->id2 = j; + dup->similarity = similarity; + xaAddItem(dups, (void*)dup); + } + } + } + + return dups; + } + +void ca_init() + { + nmRegister(sizeof(Dup), "Dup"); + } + +/** Scope cleanup. **/ +#undef sparse_dif +#undef sparse_dif_to_centroid diff --git a/centrallix-lib/src/util.c b/centrallix-lib/src/util.c index 629b59c79..ec1d87bcf 100644 --- a/centrallix-lib/src/util.c +++ b/centrallix-lib/src/util.c @@ -14,11 +14,17 @@ /* Description: Collection of utilities */ /************************************************************************/ +#include +#include +#include +#include +#include #include - #include -#include -#include +#include +#include + +#include "newmalloc.h" #include "util.h" /** @@ -77,3 +83,161 @@ unsigned int strtoui(const char *nptr, char **endptr, int base){ //return as tmp; return (unsigned int)tmp; } + +/*** Detects the optimal number of threads to use on this system. + *** Note: Multithreading is not currently supported, so this funciton + *** will always return 1, for now. + *** + *** @returns The number of threads that should be used on this system. + ***/ +int util_detect_num_threads(void) + { + /** Centrallix does not support multithreading. **/ + return 1; + + long num_procs = sysconf(_SC_NPROCESSORS_ONLN); + if (num_procs < 1 || INT_MAX < num_procs) + { + fprintf(stderr, "Warning: Detected strange number of processors (assuming 1): %ld\n", num_procs); + return 1; + } + else return (int)num_procs; + } + +/*** snprint_bytes() allows one to pick between CS units, where the kibibyte + *** (KiB) is 1024 bytes, and metric units where the kilobyte (KB) is 1000 bytes. + *** Fun Fact: Windows uses kibibytes, but displays them as KB. 
+ ***/ +#define USE_METRIC false +#define nUnits 6u +static char* units_cs[nUnits] = {"bytes", "KiB", "MiB", "GiB", "TiB", "PiB"}; +static char* units_metric[nUnits] = {"bytes", "KB", "MB", "GB", "TB", "PB"}; +/*** Displays a size in bytes using the largest unit where the result would be + *** at least 1.0. + *** + *** @param buf The buffer to which new text will be written, using snprintf(). + *** @param buf_size The amount of space in the buffer, passed to snprintf(). + *** It is recomended to have at least 12 characters available. + *** @param bytes The number of bytes, which will be formatted and written + *** to the buffer.. + *** @returns buf, for chaining. + ***/ +char* snprint_bytes(char* buf, const size_t buf_size, unsigned int bytes) + { + char** units = (USE_METRIC) ? units_metric : units_cs; + const double unit_size = (USE_METRIC) ? 1000.0 : 1024.0; + + /** Search for the largest unit where the value would be at least 1. **/ + const double size = (double)bytes; + for (unsigned char i = nUnits; i >= 1u; i--) + { + const double denominator = pow(unit_size, i); + if (size >= denominator) + { + const double converted_size = size / denominator; + if (converted_size >= 100.0) + snprintf(buf, buf_size, "%.5g %s", converted_size, units[i]); + else if (converted_size >= 10.0) + snprintf(buf, buf_size, "%.4g %s", converted_size, units[i]); + else /* if (converted_size >= 1.0) - Always true. */ + snprintf(buf, buf_size, "%.3g %s", converted_size, units[i]); + return buf; + } + } + + /** None of the larger units work, so we just use bytes. 
**/
+        snprintf(buf, buf_size, "%u %s", bytes, units[0]);
+
+    return buf;
+    }
+#undef nUnits
+
+void fprint_mem(FILE* out)
+    {
+    FILE* fp = fopen("/proc/self/statm", "r");
+    if (fp == NULL) { perror("fopen()"); return; }
+
+    long size, resident, share, text, lib, data, dt;
+    if (fscanf(fp, "%ld %ld %ld %ld %ld %ld %ld",
+        &size, &resident, &share, &text, &lib, &data, &dt) != 7)
+        {
+        fprintf(stderr, "Failed to read memory info\n");
+        fclose(fp);
+        return;
+        }
+    fclose(fp);
+
+    long page_size = sysconf(_SC_PAGESIZE); // in bytes
+    long resident_bytes = resident * page_size;
+
+    const size_t buf_siz = 16u;
+    char buf[buf_siz];
+    snprint_bytes(buf, buf_siz, (unsigned int)resident_bytes);
+
+    fprintf(out, "Memory used: %ld bytes (%s)\n", resident_bytes, buf);
+    fprintf(out, "Share %ldb, Text %ldb, Lib %ldb, Data %ldb\n", share, text, lib, data);
+    }
+
+static double get_time(void)
+    {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return (double)ts.tv_sec + (double)ts.tv_nsec / 1.0e9f;
+    }
+
+pTimer timer_init(pTimer timer)
+    {
+    if (timer == NULL) return NULL;
+    timer->start = NAN;
+    timer->end = NAN;
+    return timer;
+    }
+
+pTimer timer_new(void)
+    {
+    return timer_init(nmMalloc(sizeof(Timer)));
+    }
+
+pTimer timer_start(pTimer timer)
+    {
+    if (!timer) return timer;
+    timer->start = get_time();
+    return timer;
+    }
+
+pTimer timer_stop(pTimer timer)
+    {
+    if (!timer) return timer;
+    timer->end = get_time();
+    return timer;
+    }
+
+double timer_get(pTimer timer)
+    {
+    return (timer) ? timer->end - timer->start : NAN;
+    }
+
+void timer_de_init(pTimer timer) {}
+
+void timer_free(pTimer timer)
+    {
+    timer_de_init(timer);
+    nmFree(timer, sizeof(Timer));
+    }
+
+/*** Function for failing on error, assuming the error came from a library or
+ *** system function call, so that the error buffer is set to a valid value.
+ ***/
+void fail(const char* function_name, int code)
+    {
+    /** Create the most descriptive error message we can.
**/ + char error_buf[BUFSIZ]; + snprintf(error_buf, sizeof(error_buf), "kmeans.c: Fail - %s", function_name); + if (errno != 0) perror(error_buf); + else if (code != 0) fprintf(stderr, "%s (error code %d)\n", error_buf, code); + else fprintf(stderr, "%s", error_buf); + + /** Throw error for easier locating in a debugger. **/ + fprintf(stderr, "Program will now crash.\n"); + raise(SIGSEGV); + } diff --git a/centrallix-lib/src/xhash.c b/centrallix-lib/src/xhash.c index afeb432b5..32a4a35eb 100644 --- a/centrallix-lib/src/xhash.c +++ b/centrallix-lib/src/xhash.c @@ -290,4 +290,74 @@ xhClear(pXHashTable this, int (*free_fn)(), void* free_arg) return 0; } +/*** Executes an operation on each entry of the hash table entry. + *** + *** @param this The affected hash table. + *** @param callback_fn A callback function to be called on each hash table + *** entry. It takes 2 parameters: the current hash table entry and a void* + *** argument specified using each_arg. If any invokation of the callback + *** function returns a value other than 0, xhForEach() will immediately + *** fail, returning that value as the error code. + *** @param each_arg An aditional argument which will be passed to each + *** invokation of the callback function. + *** @returns 0 if the function executes successfully. + *** 1 if the callback function is NULL. + *** n (where n != 0) if the callback function returns n. + ***/ +int +xhForEach(pXHashTable this, int (*callback_fn)(pXHashEntry, void*), void* each_arg) + { + if (callback_fn == NULL) return 1; + + for (int row = 0; row < this->nRows; row++) + { + pXHashEntry entry = (pXHashEntry)(this->Rows.Items[row]); + while (entry != NULL) + { + pXHashEntry next = entry->Next; + const int ret = callback_fn(entry, each_arg); + if (ret != 0) return ret; + entry = next; + } + } + + return 0; + } + +static int +xhiFreeEntry(pXHashEntry entry, void* arg) + { + /*** The passed void* actually points to a void* array with 2 elements. 
+ *** The first element is a function pointer to the free function, which + *** we invoke using the provided entry and the free_arg, specified as the + *** second element of the array. + *** + *** Interestingly, you can write this code in one line like this: + *** ((void (*)(pXHashEntry, void*))((void**)arg)[0])(entry, ((void**)arg)[1]); + *** But I value code readability, so fortunately, I can't be THAT cleaver... + ***/ + void** args = (void**)arg; + void (*free_fn)(pXHashEntry, void*) = args[0]; + free_fn(entry, args[1]); + + /** Free the entry. **/ + nmFree(entry, sizeof(XHashEntry)); + + return 0; + } +int +xhClearKeySafe(pXHashTable this, void (*free_fn)(pXHashEntry, void*), void* free_arg) + { + /** Free each row. **/ + void* args[2] = {free_fn, free_arg}; + const int ret = xhForEach(this, xhiFreeEntry, args); + + /** Mark all rows as empty. **/ + for (int i = 0; i < this->nRows; i++) + this->Rows.Items[i] = NULL; + this->nItems = 0; + + /** We are successful only if the free function didn't fail. **/ + return ret; + } diff --git a/centrallix-os/cluster-schema.cluster b/centrallix-os/cluster-schema.cluster new file mode 100644 index 000000000..201c41255 --- /dev/null +++ b/centrallix-os/cluster-schema.cluster @@ -0,0 +1,176 @@ +// Input schema +$Version=2$ +file_name "system/cluster" + { + name "cluster/parameter" + { + type : DATA_T // See datatypes.h + ?default : type + ?name : String // Overrides the name above. + ?style : StyleObj // idk where to find docs for this. + } + // Access with :parameters:name. Accessing dynamic data (e.g. parameters) + // should be managed within a runserver() call. + ... + + source : DataSourcePath + attr_name : string ⊂ DataSourcePath/columns + + cluster_name "cluster/cluster" + { + algorithm : "none" | "sliding-window" | "k-means" + | "k-means++" | "k-medoids" |"db-scan" // dbscan not implemented + similarity_measure : "cosine" | "levenshtein" // levenshtein not implemented. 
+ num_clusters : uint > 1 // (probably a parameter) + ?min_improvement : double && 0.0 < x < 1.0 | "none" // default: 0.0001 + ?max_iterations : uint // default: 64 + + // Not implemented + sub_cluster_name "cluster/cluster" + { + // Same as above. + } + } + ... + + search_name "system/search" + { + source : string ⊂ [cluster_name, ...] + threshold : double && 0.0 < x < 1.0 // optimization. + similarity_measure : "cosine" | "levenshtein" // levenshtein not implemented. + } + ... + } + +// Output schema + +- /{arbitrary uint} + ? /sub_cluster_name + ? /{arbitrary uint} + ... + - /average_similarity : double && 0.0 < x < 1.0 + - /size = average_similarity + - /{arbitrary uint} + - /val : typeof(attr_name) // The value of the data point. + - /label : uint < num_clusters // id of the cluster to which this data point belongs. + - /sim : double && 0.0 < x <= threshold // Similarity to cluster centroid. +... +/search_name +- /{arbitrary uint} + - /id1 : uint // The id of the first data point. + - /id2 : uint // The id of the second data point. + - /val1 : typeof(attr_name) // The value of the first data point. + - /val2 : typeof(attr_name) // The value of the second data point. + - /sim : double && 0.0 < x <= threshold // The similarity of the two data points. +... + +// Other notes + +// This means centrallix scripts will have to chose when to switch +// from complete search to clustered search. I think this is a good +// thing, because that feels like a higher-level responsibility. + +// Invoke file: +// select * from /file.cl + +// Driver-authoring.md +// Comprehend stparse.c (lib vs. centrallix?) +// Design what a .cluster file looks like. +// +// Figure out how to invoke the object system. 
+ +// Random queries + +// Names +SELECT CONCAT(p_given_name, ' ', p_surname) AS full_name, + COUNT(*) AS num_dups +FROM p_partner +WHERE p_given_name is not null +AND p_surname is not null +AND p_given_name != "" +AND p_surname != "" +AND p_given_name != " " +AND p_surname != " " +GROUP BY full_name +ORDER BY num_dups DESC +LIMIT 1; +// Result: Ine Bradley with 4 dups + +// Phone Numbers +SELECT CONCAT(ci.p_phone_country, ci.p_phone_area_city, ci.p_contact_data) AS phone_number, + COUNT(*) AS num_dups +FROM p_partner AS p +JOIN p_contact_info AS ci + ON p.p_partner_key = ci.p_partner_key +WHERE ci.p_contact_data != ' ' +AND ci.p_contact_data != '' +AND (ci.p_contact_type = 'P' OR ci.p_contact_type = 'C') +GROUP BY phone_number +ORDER BY num_dups DESC +LIMIT 1; +// Result: 1813762-2274 with 2 dups + +// Emails and Addresses +SELECT CONCAT(ci.p_contact_data, ' ', + l.p_in_care_of, ' ', + l.p_address_1, ' ', + l.p_address_2, ' ', + l.p_address_3, ' ', + l.p_city, ' ', + l.p_state_province, ' ', + l.p_country_code, ' ', + l.p_postal_code) AS email_and_address, + COUNT(*) AS duplicate_count +FROM p_partner AS p +JOIN p_contact_info AS ci + ON p.p_partner_key = ci.p_partner_key +JOIN p_location AS l + ON p.p_partner_key = l.p_partner_key +WHERE ci.p_contact_type = 'E' +GROUP BY email_and_address +ORDER BY duplicate_count DESC +LIMIT 1; +// Result: richard.aypofblcsg@iipr.yeen with 2 dups + +// Email +SELECT ci.p_contact_data AS email, + COUNT(*) AS duplicate_count +FROM p_partner AS p +JOIN p_contact_info AS ci + ON p.p_partner_key = ci.p_partner_key +WHERE ci.p_contact_type = 'E' +GROUP BY email +ORDER BY duplicate_count DESC +LIMIT 1; + +// Result: uoehtbtjvqh20@ltirs.zese with 2 dups + +// Address +SELECT CONCAT(l.p_in_care_of, ' ', + l.p_address_1, ' ', + l.p_address_2, ' ', + l.p_address_3, ' ', + l.p_city, ' ', + l.p_state_province, ' ', + l.p_country_code, ' ', + l.p_postal_code) AS address, + COUNT(*) AS duplicate_count +FROM p_partner AS p +JOIN p_location AS l 
+ ON p.p_partner_key = l.p_partner_key +WHERE l.p_address_1 != ' ' +GROUP BY address +ORDER BY duplicate_count DESC +LIMIT 1; +// Result: "742 1ben Sc E Adams FL US 49152" with 4 + + +// Output to dataset +INTO OUTFILE '/var/lib/mysql/db_output.csv' +LINES TERMINATED BY '|' + +// Output to CSV +INTO OUTFILE '/var/lib/mysql/db_output.csv' +FIELDS TERMINATED BY ',' +ENCLOSED BY '"' +LINES TERMINATED BY '\n'; diff --git a/centrallix-os/file.cluster b/centrallix-os/file.cluster new file mode 100644 index 000000000..929efdd03 --- /dev/null +++ b/centrallix-os/file.cluster @@ -0,0 +1,64 @@ +$Version=2$ +file_name "system/cluster" + { + // Developer can specify parameters to improve file reuseability. + // TIP: Improve performance by declairing frequently used parameters first. + k "cluster/parameter" { type = integer; style=notnull; } + str "cluster/parameter" { type = string; } + int "cluster/parameter" { type = integer; default = runserver(:parameters:k); } + dbl "cluster/parameter" { type = double; default=4.2; } + // conversion "cluster/parameter" { type=double; default=4; } + + null_str "cluster/parameter" { type = string; default = null; } + null_int "cluster/parameter" { type = integer; default = null; } + null_dbl "cluster/parameter" { type = double; default = null; } + + // We calculate k in a centrallix script using: + // k = max(2, pow(log(n) / log(36), 3.2) - 8) + // where n is the number of records passed. + + // Specify the data source at the top of the file. + // How do we pass distinct data? Should the driver + // handle that for us? + source = "/apps/kardia/data/Kardia_DB/p_partner/rows"; + attr_name = p_given_name; // runserver(:parameters:str) + + // Clustering object specifies properties for clustering. + kmeans_cluster "cluster/cluster" + { + algorithm = "k-means"; + similarity_measure = "cosine"; + num_clusters = runserver(:parameters:k); + min_improvement = 0.0001; + max_iterations = 48; + + // Create subclusters. 
(Not implemented) + sub_cluster "cluster/cluster" + { + algorithm = "none"; + similarity_measure = "cosine"; + num_clusters = 7; + min_improvement = "max"; + } + } + + // Complete search. + no_clustering "cluster/cluster" + { + algorithm = "none"; + } + + dups "cluster/search" + { + source = kmeans_cluster; + threshold = 0.75; + similarity_measure = "cosine"; + } + + dups2 "cluster/search" + { + source = no_clustering; + threshold = 0.75; + similarity_measure = "cosine"; + } + } diff --git a/centrallix-os/file2.cluster b/centrallix-os/file2.cluster new file mode 100644 index 000000000..a55c37f85 --- /dev/null +++ b/centrallix-os/file2.cluster @@ -0,0 +1,42 @@ +$Version=2$ +file_name "system/cluster" + { + // Developer can specify parameters to improve file reuseability. + // TIP: Improve performance by declairing frequently used parameters first. + k "cluster/parameter" { type = integer; style=notnull; } + str "cluster/parameter" { type = string; default="k-means"; } + int "cluster/parameter" { type=integer; default=:parameters:k; } + dbl "cluster/parameter" { type=double; default=4.2; } + // conversion "cluster/parameter" { type=double; default=4; } + + null_str "cluster/parameter" { type = string; default = null; } + null_int "cluster/parameter" { type = integer; default = null; } + null_dbl "cluster/parameter" { type = double; default = null; } + + // We calculate k in a centrallix script using: + // k = max(2, pow(log(n) / log(36), 3.2) - 8) + // where n is the number of records passed. + + // Specify the data source at the top of the file. + // How do we pass distinct data? Should the driver + // handle that for us? + source = "/apps/kardia/data/Kardia_DB/p_partner/rows"; + attr_name = "p_given_name"; + + // Clustering object specifies properties for clustering. 
+ kmeans_cluster "cluster/cluster" + { + algorithm = "k-means"; + similarity_measure = "cosine"; + num_clusters = :parameters:k; + min_improvement = 0.0001; + max_iterations = 48; + } + + dups "cluster/search" + { + source = kmeans_cluster; + threshold = 0.75; + similarity_measure = "cosine"; + } + } diff --git a/centrallix-sysdoc/OSDriver_Authoring.md b/centrallix-sysdoc/OSDriver_Authoring.md index c167fce26..5755d15c5 100644 --- a/centrallix-sysdoc/OSDriver_Authoring.md +++ b/centrallix-sysdoc/OSDriver_Authoring.md @@ -166,9 +166,11 @@ Within the initialization function, the driver should initialize all necessary g To register with the OSML, the driver must first allocate an ObjDriver structure and fill in its contents. +```c pObjDriver drv; drv = (pObjDriver)nmMalloc(sizeof(ObjDriver)); +``` This involves setting a large number of fields to the appropriate entry points within the OS Driver, as well as telling the OSML what object type(s) are handled by the driver and giving the OSML a description of the driver. A list of the required entry point functions / fields follows: @@ -208,14 +210,17 @@ Another field in the driver structure is the Capabilities field. This field is a The 'Name' field should be filled in with a description of the OS driver, with a maximum length of 63 characters (plus the string null terminator). Normally, the 2-4 letter prefix of the driver is included at the beginning of 'Name', such as "UXD - UNIX filesystem driver". Finally, the 'RootContentTypes' field is an XArray containing a list of strings, each of which specifies the node object types that the driver will handle. 
Such types are added to this XArray using the normal XArray utility functions, such as: - +```c xaInit(&drv->RootContentTypes, 16); xaAddItem(&drv->RootContentTypes, "system/file"); xaAddItem(&drv->RootContentTypes, "system/directory"); +``` When the structure has been filled out, the os driver should call the OSML to register itself, using the objRegisterDriver function: +```c objRegisterDriver(drv); +``` The initialization function should return 0 to indicate success, or -1 on failure. Currently, initialization success/failure is not verified by lsmain.c. @@ -234,54 +239,58 @@ As an overview, the normal procedure for the open routine to follow is this: The first basic part of the OS driver consists of the Open and Close routines, normally named 'xxxOpen' and 'xxxClose' within the driver, where 'xxx' is the driver's prefix. The Close routine is normally fairly simple, but the Open routine is one of the most complicated routines in a typical OS driver, for the Open routine must parse the subtree pathname beneath the node object. For example, if the node object had a pathname like: +```sh /datasources/OMSS_DB +``` and the user opened an object called: +```sh /datasources/OMSS_DB/JNetHelp/rows/1 +``` the OS driver would have to determine what the subtree pathname 'JNetHelp/rows/1' means, since this path will mean different things to different os drivers. -The Open routine also must determine whether the object already exists or not, and if not, whether to create a new object. This logic is largely dependent on the obj->Mode flags, as if O_CREAT is included, the driver must attempt to create the object if it does not already exist, and if O_EXCL is included, the driver must refuse to open the object if it already exists, as with the UNIX open() system call semantics. +The Open routine also must determine whether the object already exists or not, and if not, whether to create a new object. 
This logic is largely dependent on the `obj->Mode` flags, as if `O_CREAT` is included, the driver must attempt to create the object if it does not already exist, and if `O_EXCL` is included, the driver must refuse to open the object if it already exists, as with the UNIX `open()` system call semantics. -Finally, if the os driver specified a capability of OBJDRV_C_TRANS, it must pay attention to the current state of the end-user's trans- action. If a new object is being created, an object is being deleted, or other modifications/additions are being performed, and if the OXT layer indicates a transaction is in process, the driver must either complete the current transaction and then complete the current call, or else add the current delete/create/modify call to the transaction tree (in which case the tree item is preallocated; all the driver needs to do is fill it in). The transaction layer will be discussed in depth later in this document. +Finally, if the os driver specified a capability of `OBJDRV_C_TRANS`, it must pay attention to the current state of the end-user's transaction. If a new object is being created, an object is being deleted, or other modifications/additions are being performed, and if the OXT layer indicates a transaction is in process, the driver must either complete the current transaction and then complete the current call, or else add the current delete/create/modify call to the transaction tree (in which case the tree item is preallocated; all the driver needs to do is fill it in). The transaction layer will be discussed in depth later in this document. -As a part of the Open process, the OS driver will normally allocate an internal structure to represent the current open object, and will return that structure as a void* data type in the return value. This pointer will be then passed to each of the other driver entry point functions, with the exception of QueryFetch, QueryDelete, and Query- Close, which will be discussed later. 
+As a part of the Open process, the OS driver will normally allocate an internal structure to represent the current open object, and will return that structure as a `void*` data type in the return value. This pointer will be then passed to each of the other driver entry point functions, with the exception of QueryFetch, QueryDelete, and Query- Close, which will be discussed later. The Open() routine is called with five parameters: -- obj (pObject) +- `obj` (pObject) This is a pointer to the Object sturcture maintained by the OSML. This structure will contain some important fields for processing the open() request. - obj->Mode is a bitmask of the O_* flags, which include O_RDONLY, O_WRONLY, O_RDWR, O_CREAT, O_TRUNC, and O_EXCL. + - `obj->Mode` is a bitmask of the O_* flags, which include `O_RDONLY`, `O_WRONLY`, `O_RDWR`, `O_CREAT`, `O_TRUNC`, and `O_EXCL`. - obj->Pathname is a Pathname structure which contains the complete parsed pathname for the object. This structure is defined in the file include/obj.h, and has a buffer for the pathname as well as an array of pointers to the pathname's components. The function obj_internal_PathPart() can be used to obtain at will any component or series of components of the pathname. + - `obj->Pathname` is a Pathname structure which contains the complete parsed pathname for the object. This structure is defined in the file `include/obj.h`, and has a buffer for the pathname as well as an array of pointers to the pathname's components. The function `obj_internal_PathPart()` can be used to obtain at will any component or series of components of the pathname. - obj->Pathname->OpenCtl[] contains parameters to the open() operation. Frequently these params provide additional information on how to open the object. The use of these parameters is determined by the author of the objectsystem driver. The parameters are those passed in normal URL fasion (?param=value, etc.). 
Typically, the only OpenCtl of interest is going to be obj->Pathname->OpenCtl[obj->SubPtr] (see below for SubPtr meaning). + - `obj->Pathname->OpenCtl[]` contains parameters to the open() operation. Frequently these params provide additional information on how to open the object. The use of these parameters is determined by the author of the objectsystem driver. The parameters are those passed in normal URL fashion (?param=value, etc.). Typically, the only OpenCtl of interest is going to be `obj->Pathname->OpenCtl[obj->SubPtr]` (see below for SubPtr meaning). - obj->SubPtr is the number of components in the path that are a part of the node object's path. For example, in the above path of '/datasources/OMSS_DB', the path would be internally represented as './datasources/ OMSS_DB', and the SubPtr would be 3. + - `obj->SubPtr` is the number of components in the path that are a part of the node object's path. For example, in the above path of '/datasources/OMSS_DB', the path would be internally represented as './datasources/ OMSS_DB', and the SubPtr would be 3. - obj->SubCnt reflects the number of components of the path which are under the control of the current driver. This includes the node object, so SubCnt will always be at least 1. For example, when opening '/data/file.csv/rows/1', and the driver in question is the CSV driver, SubPtr would be 3 (includes an "invisible" first component), from '/data/file.csv', and SubCnt would be 3, from 'file.csv/rows/1'. The driver will need to SET THE SUBCNT value in its Open function. SubPtr is already set. + - `obj->SubCnt` reflects the number of components of the path which are under the control of the current driver. This includes the node object, so SubCnt will always be at least 1. For example, when opening '/data/file.csv/rows/1', and the driver in question is the CSV driver, SubPtr would be 3 (includes an "invisible" first component), from '/data/file.csv', and SubCnt would be 3, from 'file.csv/rows/1'.
The driver will need to SET THE SUBCNT value in its Open function. SubPtr is already set. - obj->Prev is the underlying object as opened by the next-lower-level driver. It is the duty of this driver to parse the content of that object and do something meaningful with it. + - `obj->Prev` is the underlying object as opened by the next-lower-level driver. It is the duty of this driver to parse the content of that object and do something meaningful with it. - obj->Prev->Flags contains some critical infor- mation about the underlying object. If it contains the flag OBJ_F_CREATED, then the underlying object was just created by this open() operation. In that case, this driver is expected to create the node with snNewNode() (see later in this document) as long as obj->Mode contains O_CREAT. + - `obj->Prev->Flags` contains some critical information about the underlying object. If it contains the flag `OBJ_F_CREATED`, then the underlying object was just created by this open() operation. In that case, this driver is expected to create the node with snNewNode() (see later in this document) as long as obj->Mode contains O_CREAT. -- mask (int) +- `mask` (int) Indicates the security mask to be given to the object if it is being created. Typically, this will only apply to files and directories. The values are the same as UNIX chmod() type values. -- systype (pContentType) +- `systype` (pContentType) This param indicates the content type of the node object as determined by the OSML. The ContentType structure is defined in include/ obj.h, and includes among other things the name of the content type. For example, for the reporting driver, this type would be "system/report". -- usrtype (char*) +- `usrtype` (char*) This param is the requested object type by the user and is normally used when creating a new object, though under some circumstances it may change the way the open operates on an existing object. 
For example, the reporting driver can change whether it generates HTML report text or plaintext reports based on usrtype being either "text/html" or "text/plain". -- oxt (pObjTrxTree*) +- `oxt` (pObjTrxTree*) This param is only used by object drivers that specified a capability of OBJDRV_C_TRANS. More on this field later. For non-transaction-aware drivers, this field can be safely ignored. Yes, this param *is* a pointer to a pointer. Essentially, a pointer passed by reference. -The Open routine should return its internal structure pointer on success, or NULL on failure. It is normal to allocate one such structure per Open call, and for the structure to point, among other things, to shared data describing the node object. Accessing the node object is described later in this document. +The Open routine should return its internal structure pointer on success, or `NULL` on failure. It is normal to allocate one such structure per Open call, and for the structure to point, among other things, to shared data describing the node object. Accessing the node object is described later in this document. It is important to know what kinds of fields normally are placed in the allocated data structure returned by Open. These fields are all determined by the driver author, but here are a few typical ones that are helpful to have ("inf" is the pointer to the structure here): @@ -307,7 +316,7 @@ Before exiting, the Close routine should make sure it decrements the Open Count ### C. Creating and Deleting Objects. The Create and Delete functions are used for creating and deleting objects. Normally, the os driver will process the Pathname in the same manner for Create and Delete as for Open, thus such functionality could be placed in another function. -As a side note, within Centrallix, the standard function naming convention is to use xxx_internal_FunctionName for functions that are more or less internal to the module and not a part of any standard interface. 
+As a side note, within Centrallix, the standard function naming convention is to use `xxx_internal_FunctionName()` for functions that are more or less internal to the module and not a part of any standard interface. The Create routine has parameters identical to the Open routine. It should return 0 on success and -1 on error. @@ -349,19 +358,19 @@ The query mechanism can also be used to delete a set of child objects, optionall The first main function for handling queries is OpenQuery. This function is passed three arguments: -- inf_v (void*) The value returned from Open for this object. +- `inf_v` (void*) The value returned from Open for this object. -- query (pObjQuery) The query structure setup by the OSML. It will contain several key fields: +- `query` (pObjQuery) The query structure setup by the OSML. It will contain several key fields: - query->QyText: the text of the criteria (i.e., the WHERE clause, in Centrallix SQL syntax) + - `query->QyText`: the text of the criteria (i.e., the WHERE clause, in Centrallix SQL syntax) - query->Tree: the compiled expression tree, which evaluates to nonzero for true or zero for false as the WHERE clause condition. + - `query->Tree`: the compiled expression tree, which evaluates to nonzero for true or zero for false as the WHERE clause condition. - query->SortBy[]: an array of expressions giving the various components of the sorting criteria. + - `query->SortBy[]`: an array of expressions giving the various components of the sorting criteria. - query->Flags: the driver should set and/or clear the flags OBJ_QY_F_FULLQUERY and OBJ_QY_F_FULLSORT if need be. The former indicates that the driver is willing to handle the full WHERE clause (the query->Tree). The latter indicates that the driver is willing to handle the sorting of the data as well (in query->SortBy[]). If the driver can easily have the sorting/selection done (as when querying an RDBMS), it should set these flags. 
Otherwise, it should let the OSML take care of the ORDER BY and WHERE conditions. + - `query->Flags`: the driver should set and/or clear the flags `OBJ_QY_F_FULLQUERY` and `OBJ_QY_F_FULLSORT` if need be. The former indicates that the driver is willing to handle the full WHERE clause (the query->Tree). The latter indicates that the driver is willing to handle the sorting of the data as well (in query->SortBy[]). If the driver can easily have the sorting/selection done (as when querying an RDBMS), it should set these flags. Otherwise, it should let the OSML take care of the ORDER BY and WHERE conditions. -- oxt (pObjTrxTree*) The transaction tree pointer. +- `oxt` (pObjTrxTree*) The transaction tree pointer. The OpenQuery function should return a void* value, which will within the driver point to a structure used for managing the query. This structure will normally have a pointer to the inf_v value returned by Open as well, since inf_v is never passed to QueryFetch, QueryDelete or QueryClose. OpenQuery should return NULL if the object does not support queries or if some other error condition occurs that will prevent the execution of the query. @@ -378,6 +387,7 @@ The QueryFetch routine should return an inf_v pointer to the child object, or NU All object drivers will need to add an element to the obj->Pathname structure to indicate the path to the child object being returned. 
This will involve a process somewhat like this: (given that new_name is the new object's name, qy is the current query structure, which contains a field 'Parent' that points to the inf_v originally returned by Open, and where the inf_v contains a field Obj that points to the Object structure containing a Pathname structure) +```c int cnt; pObject obj; char* new_name; @@ -389,6 +399,7 @@ All object drivers will need to add an element to the obj->Pathname structure to if (cnt < 0 || cnt >= 256) return NULL; obj->Pathname->Elements[obj->Pathname->nElements++] = strrchr(obj->Pathname->Pathbuf,'/')+1; +``` QueryDelete is passed the qy_v void* parameter, and an oxt parameter. It should return 0 on successful deletion, and -1 on failure. @@ -456,22 +467,28 @@ The driver's first course of action to obtain node object data is to open the no ### pSnNode snReadNode(pObject obj) This function reads a Structure File from the already-open node object which is passed in the "obj" parameter in the xxxOpen() routine. The "obj" parameter has an element, obj->Prev, which is a link to the node object as opened by the previous driver in the OSML's chain of drivers for handling this open(). All you need to know to get the parsed node object is the following: +```c pSnNode node; node = snReadNode(obj->Prev); +``` The returned node structure is managed by the SN module and need not be nmFree()ed. The only thing that must be done is that the driver should increment the node structure's link count like this: +```c node->OpenCnt++; +``` When closing an object (and thus releasing a reference to the Node structure), the driver should decrement the link count. ### pSnNode snNewNode(pObject obj, char* content_type) This function creates a new node object with a given content type. The open link count should be incremented as appropriate, as before with snReadNode(). 
+```c pSnNode node; node = snNewNode(obj->Prev, "system/structure"); +``` The "system/structure" argument is the type that will be assigned to the newly created node object. Note that the underlying object must already exist in order for this to create a node object as that object's content. Normally the OSML does this for you by commanding the previous driver (handling obj->Prev) to create the underlying object in question. @@ -512,6 +529,7 @@ This function adds a node of type ST_T_SUBGROUP to either a ST_T_SUBGROUP or ST_ ### int stAddValue(pStructInf inf, char* strval, int intval) This function adds a value to an attribute, and can be called multiple times on an attribute to add a list of values. If 'strval' is not null, a string value is added, otherwise an integer value is added. The string is NOT copied, but is simply pointed-to. If the string is non-static, and has a lifetime less than the ST_T_ATTRIB tree node, then the following procedure must be used: +```c char* ptr; char* nptr; pStructInf attr_inf; @@ -522,6 +540,7 @@ This function adds a value to an attribute, and can be called multiple times on strcpy(nptr, ptr); stAddValue(attr_inf, nptr, 0); attr_inf->StrAlloc[0] = 1; +``` By following this method (making a copy of the string and then setting the StrAlloc value for that string), when the StructInf tree node is freed by the stparse module, the string will auto- matically be freed as well. @@ -533,6 +552,7 @@ This function returns the value of the given attribute in an ST_T_ATTRIB tree no It is common practice to use the stLookup and stAttrValue functions together to retrieve values, and search for an attribute StructInf and retrieve its value in one operation: +```c pStructInf inf; char* ptr; @@ -540,12 +560,14 @@ It is common practice to use the stLookup and stAttrValue functions together to { printf("%s is the value\n", ptr); } +``` ### int stFreeInf(pStructInf this) This function is used to free a StructInf tree node. 
It will free any sub-nodes first, so if that is not desired, be sure to disconnect them by removing them from the SubInf array and appropriately adjusting the nSubInf counter, and setting the SubInf array position to NULL. This function also disconnects the tree node from its parent, if any, so if the parent is already free()'d, be sure to set the node's Parent pointer to NULL. Any strings marked allocated with the StrAlloc flags will be free()'d. It is also common practice to bypass the stXxx() functions entirely and access the elements of the StructInf structures themselves. This is not forbidden, and may be done. See the file stparse.h for a description of the structure. For example, +```c pStructInf inf; int i; @@ -556,6 +578,7 @@ It is also common practice to bypass the stXxx() functions entirely and access t /** do stuff with attribute... **/ } } +``` ## IV Memory Management in Centrallix Centrallix has its own memory manager that caches freshly-deallocated blocks of memory in lists according to size so that they can be quickly reallocated. This memory manager also catches double-freeing of blocks, making debugging of memory problems a little easier. @@ -625,6 +648,7 @@ This adds an item to the xarray, and keeps the array sorted. The value for sort #### xaFindItem(pXArray this, void* item) This returns the offset into the array's items of the given value. An exact match is required. The array's items are given below: +```c XArray xa; pStructInf inf; int item_id; @@ -639,6 +663,7 @@ This returns the offset into the array's items of the given value. An exact matc item_id = xaFindItem(&xa, inf); inf == xa.Items[item_id]; +``` #### xaRemoveItem(pXArray this, int index) This function removes an item from the xarray at the given index. @@ -682,22 +707,27 @@ Copies the string 'text' into the XString. Like xsConcatenate, except that the #### char* xsStringEnd(pXString this) Returns a pointer to the end of the string. 
Useful for finding the end of the string without performing: +```c pXString xs; xs->String + strlen(xs->String) +``` since the xs module already knows the string length and does not have to search for the null terminator. Furthermore, since the string can contain nulls, the above statement could produce incorrect results in those situations. The contents of the XString can be easily referenced via: +```c pXString xs; printf("This string is %s\n", xs->String); +``` IMPORTANT NOTE: Do not store pointers to values within the string while you are still adding text to the end of the string. If the string ends up realloc()ing, your pointers will be incorrect. Instead, if data in the middle of the string needs to be pointed to, store offsets from the beginning of the string, not pointers to the string. For example, this is WRONG: +```c pXString xs; char* ptr; @@ -706,9 +736,11 @@ For example, this is WRONG: ptr = xsStringEnd(&xs); xsConcatenate(&xs, "This is the second sentence.", -1); printf("A pointer to the second sentence is '%s'\n", ptr); +``` Instead, use pointer aritmetic and do this: +```c pXString xs; int offset; @@ -717,6 +749,7 @@ Instead, use pointer aritmetic and do this: offset = xsStringEnd(&xs) - xs->String; xsConcatenate(&xs, "This is the second sentence.", -1); printf("A pointer to the second sentence is '%s'\n",xs->String+offset); +``` ### D. Expression (EXP) - Expression Trees @@ -726,7 +759,9 @@ Expressions can be stand-alone expression trees, or they can take parameter obje Expression evaluation results in the top-level expression tree node having the final value of the expression, which may be NULL, and may be an integer, string, datetime, money, or double data type. For example, the final value of +``` :myobject:oneattribute == 'yes' +``` would be integer 1 (true) if the attribute's value is indeed 'yes'. @@ -777,8 +812,10 @@ Frees a parameter object list. 
#### int expAddParamToList(pParamObjects this, char* name, pObject obj, int flags) Adds a parameter to the parameter object list. The 'obj' pointer may be left NULL during the expCompileExpression state of operation but must be set to a value before expEvalTree is called. Otherwise the attributes that reference that parameter object will result in NULL values in the expression (it's technically not an error). Flags can be EXPR_O_CURRENT if the object is to be marked as the current one, or EXPR_O_PARENT if it is to be marked as the parent object. Current and Parent objects can be referenced in an expression like this: +``` :currentobjattr ::parentobjattr +``` and is thus a shortcut to typing the full object name. @@ -851,15 +888,21 @@ drivers. Most of them are named obj_internal_XxxYyy or similar. #### char* obj_internal_PathPart(pPathname path, int start, int length) The Pathname structure breaks down a pathname into path elements, which are text strings separated by the directory separator '/'. This function takes the given Pathname structure, and returns the number of path elements requested. For instance, if you have a path: +``` /apps/kardia/data/Kardia_DB/p_partner/rows/1 +``` that path would be stored internally in Centrallix as: +``` ./apps/kardia/data/Kardia_DB/p_partner/rows/1 +``` To just return "Kardia_DB/p_partner", you could call: +``` obj_internal_PathPart(pathstruct, 4, 2); +``` Note that return values from obj_internal_PathPart are only valid until the next call to PathPart on the given pathname structure. @@ -886,9 +929,9 @@ This function closes a network connection, and optionally waits up to 'linger_ms ### int fdWrite(pFile filedesc, char* buffer, int length, int offset, int flags) This function writes data to a file descriptor, from a given buffer and length, and to an optional seek offset and with some optional flags. Flags can be the following: -- FD_U_NOBLOCK - If the write can't be performed immediately, don't perform it at all. 
-- FD_U_SEEK - The 'offset' value is valid. Seek to it before writing. Not valid for network connections. -- FD_U_PACKET - ALL of the data of 'length' in 'buffer' must be written. Normal write() semantics in UNIX state that not all data has to be written, and the number of bytes actually written is returned. Setting this flag makes sure all data is really written before returning. +- `FD_U_NOBLOCK` - If the write can't be performed immediately, don't perform it at all. +- `FD_U_SEEK` - The 'offset' value is valid. Seek to it before writing. Not valid for network connections. +- `FD_U_PACKET` - ALL of the data of 'length' in 'buffer' must be written. Normal write() semantics in UNIX state that not all data has to be written, and the number of bytes actually written is returned. Setting this flag makes sure all data is really written before returning. #### int fdRead(pFile filedesc, char* buffer, int maxlen, int offset, int flags) The complement to the above routine. Takes the same flags as the above routine, except FD_U_PACKET means that all of 'maxlen' bytes must be read before returning. This is good for reading a packet that is known to be exactly 'maxlen' bytes long, but which might be broken up into fragments by the network (TCP/IP has a maximum frame transmission size of about 1450 bytes). diff --git a/centrallix-sysdoc/string_comparison.md b/centrallix-sysdoc/string_comparison.md index 222e3e6d9..dac13d544 100644 --- a/centrallix-sysdoc/string_comparison.md +++ b/centrallix-sysdoc/string_comparison.md @@ -31,9 +31,9 @@ int exp_fn_fuzzy_compare(pExpression tree, pParamObjects objlist, pExpression i0 ``` Returns a value between 0.0 (complete match) and 1.0 (complete difference) between strings a and b, based on the (levenshtein distance) / (max len of input strings). Some alterations to the calculation are as follows: -- matching an empty string against anything returns 0.5. 
-- a string that only required insertions to become the other string has its (lev_dist)/(strlen) value halved before returning -The parameter max_field_width is required, but not used. +- Matching an empty string against anything returns 0.5. +- A string that only required insertions to become the other string has its `(lev_dist)/(strlen)` value halved before returning. +- The parameter `max_field_width` is required, but not used. ## Cosine Similarity @@ -46,6 +46,7 @@ const char *CHAR_SET ... `CHAR_SET` represents all of the characters that should be considered during the calculation of similarity. `CHAR_SET` can be extended to include additional characters, as necessary. ### Frequency Table + ```c int exp_fn_i_frequency_table(double *table, char *term) ``` @@ -70,6 +71,7 @@ Helper function for similarity(). Creates a TF x IDF vector from a frequency tab The `frequency_table` parameter must have been created using the `exp_fn_i_frequency_table` function above. ### Dot Product + ```c int exp_fn_i_dot_product(double *dot_product, double *r_freq_table1, double *r_freq_table2) ``` @@ -78,6 +80,7 @@ Helper function for similarity(). Calculates the dot product of two relative fre The `dot_product` parameter should be initialized to 0 before calling the function. The table parameters must contain relative frequency tables that are generated from the `exp_fn_i_relative_frequency_table` function. The lengths of both tables must equal the length of `CHAR_SET`. ### Magnitude + ```c int exp_fn_i_magnitude(double *magnitude, double *r_freq_table) ``` @@ -86,6 +89,7 @@ Helper function for similarity(). Calculates the magnitude of a relative frequen The `magnitude` parameter should be initialized to 0 before calling the function. The table parameter must contain a relative frequency table that was generated from the `exp_fn_i_relative_frequency_table` function. The length of the frequency table must equal the length of `CHAR_SET`. 
### Similarity + ```c int exp_fn_similarity(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) ``` @@ -95,5 +99,3 @@ Returns a value between 0.0 (completely different) and 1.0 (complete match) refl ### Inverse Document Frequency (IDF) In text mining, the most common metric to use in the cosine similarity function is the [TF x IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) metric. Our approach uses only TF (term frequency). Inverse document frequency calculates a weighting factor for each character. This could increase precision a small amount by weighting characters that appear on many records as less important in distinguishing matches, and weighting characters that appear on only certain records as more important. IDF could be calculated by iterating through the entire partner dataset each time. The current approach uses the relative frequency of each letter used in the English language on [Wikipedia](https://en.wikipedia.org/wiki/Letter_frequency), which may not be consistent with the data in the partner database. 
- - diff --git a/centrallix/Makefile.in b/centrallix/Makefile.in index 7d2b1e238..0d13843de 100644 --- a/centrallix/Makefile.in +++ b/centrallix/Makefile.in @@ -115,6 +115,7 @@ XOBJDRIVERS=objdrv_ux.o \ objdrv_uxprint.o \ objdrv_qytree.o \ objdrv_qypivot.o \ + objdrv_cluster.o \ objdrv_datafile.o \ objdrv_audio.o \ objdrv_link.o \ @@ -133,6 +134,7 @@ XV3OBJDRIVERS= \ objdrv_uxprint_v3.o \ objdrv_qytree.o \ objdrv_qypivot.o \ + objdrv_cluster.o \ objdrv_query.o \ objdrv_datafile.o \ objdrv_audio.o \ @@ -314,6 +316,7 @@ XEXPRMODS=exp_main.o \ exp_compiler.o \ exp_evaluate.o \ exp_functions.o \ + exp_double_metaphone.o \ exp_generator.o EXPRMODS=$(patsubst %,expression/%,$(XEXPRMODS)) diff --git a/centrallix/centrallix.c b/centrallix/centrallix.c index 6467ab2b7..75e19d12d 100644 --- a/centrallix/centrallix.c +++ b/centrallix/centrallix.c @@ -440,6 +440,7 @@ cxDriverInit() stxInitialize(); /* Structure file driver */ qytInitialize(); /* Query Tree driver */ qypInitialize(); /* Query Pivot driver */ + clusterInitialize(); /* Cluster driver */ qyInitialize(); /* stored query (aka view) driver */ rptInitialize(); /* report writer driver */ uxpInitialize(); /* UNIX printer access driver */ @@ -694,4 +695,3 @@ cxLinkSigningSetup(pStructInf my_config) return 0; } - diff --git a/centrallix/etc/types.cfg b/centrallix/etc/types.cfg index 11ebc3e3e..6cbac5ae6 100644 --- a/centrallix/etc/types.cfg +++ b/centrallix/etc/types.cfg @@ -51,6 +51,7 @@ "system/symbolic-link" "Symbolic Link" lnk "" "text/plain" "text/css" "CSS File" css "" "text/plain" "system/querypivot" "Query Pivot Object" qyp "" "system/structure" +"system/cluster" "Clustering Object" cluster "" "system/structure" "application/json" "JSON data" json "" "text/plain" "text/json" "JSON data" "" "" "application/json" "text/x-json" "JSON data" "" "" "application/json" diff --git a/centrallix/expression/exp_compiler.c b/centrallix/expression/exp_compiler.c index 9455d6676..bcda38f71 100644 --- 
a/centrallix/expression/exp_compiler.c +++ b/centrallix/expression/exp_compiler.c @@ -1022,8 +1022,8 @@ expCompileExpression(char* text, pParamObjects objlist, int lxflags, int cmpflag /*** expBindExpression - do late binding of an expression tree to an *** object list. 'domain' specifies the requested bind domain, whether - *** runstatic (EXP_F_RUNSTATIC), runserver (EXP_F_RUNSERVER), or runclient - *** (EXP_F_RUNCLIENT). 'domain' can also be -0-, in which case we rebind + *** runstatic (EXPR_F_RUNSTATIC), runserver (EXPR_F_RUNSERVER), or runclient + *** (EXPR_F_RUNCLIENT). 'domain' can also be -0-, in which case we rebind *** a domainless expression. ***/ int @@ -1051,16 +1051,10 @@ expBindExpression(pExpression exp, pParamObjects objlist, int flags) break; } } - if (exp->ObjID == -1) - { - cm |= EXPR_MASK_EXTREF; - } - } - else if (exp->ObjID == -2 || exp->ObjID == -3) - { - if (exp->ObjID == -2) cm |= (1<<(objlist->CurrentID)); - if (exp->ObjID == -3) cm |= (1<<(objlist->ParentID)); + cm |= EXPR_MASK_EXTREF; } + else if (exp->ObjID == EXPR_CTL_CURRENT) cm |= (1<<(objlist->CurrentID)); + else if (exp->ObjID == EXPR_CTL_PARENT) cm |= (1<<(objlist->ParentID)); else if (exp->ObjID >= 0) { cm |= (1<<(exp->ObjID)); @@ -1084,4 +1078,3 @@ expBindExpression(pExpression exp, pParamObjects objlist, int flags) return cm; } - diff --git a/centrallix/expression/exp_double_metaphone.c b/centrallix/expression/exp_double_metaphone.c new file mode 100644 index 000000000..f3d76c49b --- /dev/null +++ b/centrallix/expression/exp_double_metaphone.c @@ -0,0 +1,1517 @@ +/************************************************************************/ +/* Text-DoubleMetaphone */ +/* Centrallix Core */ +/* */ +/* Copyright 2000, Maurice Aubrey . */ +/* All rights reserved. */ +/* */ +/* This code is copied for redistribution with modification, from the */ +/* gitpan/Text-DoubleMetaphone implementation on GitHub (1), which is */ +/* under the following license. 
*/ +/* */ +/* This code is based heavily on the C++ implementation by Lawrence */ +/* Philips and incorporates several bug fixes courtesy of Kevin */ +/* Atkinson . */ +/* */ +/* This module is free software; you may redistribute it and/or */ +/* modify it under the same terms as Perl itself. */ +/* */ +/* A summary of the relevant content from https://dev.perl.org/licenses */ +/* has been included below for the convenience of the reader. This */ +/* was collected and saved on September 5th, 2025 and may not reflect */ +/* current information. For the most up to date information, please use */ +/* the link above. */ +/* */ +/* Perl5 is Copyright © 1993 and later, by Larry Wall and others. */ +/* */ +/* It is free software; you can redistribute it and/or modify it */ +/* under the terms of either: */ +/* */ +/* a) the GNU General Public License (2) as published by the Free */ +/* Software Foundation (3); either version 1 (2), or (at your */ +/* option) any later version (4), or */ +/* */ +/* b) the "Artistic License" (5). */ +/* */ +/* Citations: */ +/* 1: https://github.com/gitpan/Text-meta_double_metaphone */ +/* 2: https://dev.perl.org/licenses/gpl1.html */ +/* 3: http://www.fsf.org */ +/* 4: http://www.fsf.org/licenses/licenses.html#GNUGPL */ +/* 5: https://dev.perl.org/licenses/artistic.html */ +/* */ +/* Centrallix is published under the GNU General Public License, */ +/* satisfying the above requirement. A summary of this is included */ +/* below for the convenience of the reader. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". */ +/* */ +/* Module: exp_double_metaphone.c */ +/* Author: Maurice Aubrey */ +/* Description: This module implements a "sounds like" algorithm */ +/* developed by Lawrence Philips which he published */ +/* in the June, 2000 issue of C/C++ Users Journal. */ +/* Double Metaphone is an improved version of Philips' */ +/* original Metaphone algorithm. */ +/************************************************************************/ + +/*** Note to future programmers reading this file (by Israel Fuller): + *** + *** This file was copied from a GitHub Repo with proper licensing (in case + *** you didn't read the legal stuff above), so feel free to check it out. + *** + *** As for this code, I've modified it to use styling and memory allocation + *** consistent with the rest of the Centrallix codebase. Also, I have added + *** documentation comments and extensive test cases (at the end of the file), + *** however, these reflect my own (possibly incorrect) understanding, which + *** might not line up with the original author. + *** + *** To be honest, though, trying to make this code as readable as possible + *** was very challenging due to all the messy boolean algebra. If there is + *** ever a professional linguist reading this, please factor out some of the + *** logic into local variables with descriptive names so that the rest of us + *** can read this code without our eyes glazing over. + *** + *** If you have any questions, please feel free to reach out to me or Greg.
+ *** + *** Original Source: https://github.com/gitpan/Text-meta_double_metaphone + ***/ + +#include +#include +#include +#include +#include +#include + +/*** If running in a testing environment, newmalloc is not + *** available, so we fall back to default C memory allocation. + ***/ +#ifndef TESTING +#include "cxlib/newmalloc.h" +#define META_MALLOC(size) nmSysMalloc(size) +#define META_REALLOC(ptr, size) nmSysRealloc(ptr, size) +#define META_FREE(ptr) nmSysFree(ptr) +#else +#include +#define META_MALLOC(size) malloc(size) +#define META_REALLOC(ptr, size) realloc(ptr, size) +#define META_FREE(ptr) free(ptr) +#endif + +/*** Helper function to handle checking for failed memory allocation + *** Author: Israel Fuller. + *** + *** @param ptr Pointer to the memory that should be allocated. + *** @param fname The name of the function invoked to allocate memory. + *** @param size The amount of memory being allocated. + *** @returns The pointer, for chaining. + ***/ +void* meta_check_allocation(void* ptr, const char* fname, const size_t size) + { + if (ptr == NULL) + { + /** Create the most descriptive error message we can. **/ + char error_buf[BUFSIZ]; + snprintf(error_buf, sizeof(error_buf), "exp_double_metaphone.c: Fail - %s(%lu)", fname, size); + perror(error_buf); + + // Throw error for easier locating in a debugger. + fprintf(stderr, "Program will now crash.\n"); + assert(0); + } + return ptr; + } + +/** Malloc shortcut macros. **/ +#define SAFE_MALLOC(size) \ + ({ \ + const size_t sz = (size); \ + memset(meta_check_allocation(META_MALLOC(sz), "META_MALLOC", sz), 0, sz); \ + }) +#define SAFE_REALLOC(ptr, size) \ + ({ \ + const size_t sz = (size); \ + meta_check_allocation(META_REALLOC(ptr, sz), "META_REALLOC", sz); \ + }) + +typedef struct + { + char* str; + size_t length; + size_t bufsize; + int free_str_on_destroy; + } +MetaString; + +/*** Allocates a new MetaString. + *** + *** @param init_str The initial size of the string. + *** @returns The new MetaString. 
+ ***/ +MetaString* meta_new_string(const char* init_str) + { + MetaString *s; + char empty_string[] = ""; + + s = (MetaString*)SAFE_MALLOC(sizeof(MetaString)); + + if (init_str == NULL) + init_str = empty_string; + + s->length = strlen(init_str); + /** Preallocate a bit more for potential growth. **/ + s->bufsize = s->length + 7u; + + s->str = (char*)SAFE_MALLOC(s->bufsize * sizeof(char)); + + strncpy(s->str, init_str, s->length + 1); + s->free_str_on_destroy = 1; + + return s; + } + +/*** Frees a MetaString. + *** + *** @param s The MetaString. + ***/ +void meta_destroy_string(MetaString* s) + { + if (s == NULL) + return; + + if (s->free_str_on_destroy && s->str != NULL) + META_FREE(s->str); + + META_FREE(s); + } + +/*** Increases a MetaString's buffer size. + *** + *** @param s The MetaString* being modified. + *** @param chars_needed Minimumn number of characters to increase buffer size. + ***/ +void meta_increase_buffer(MetaString* s, const size_t chars_needed) + { + s->bufsize += chars_needed + 8u; + s->str = SAFE_REALLOC(s->str, s->bufsize * sizeof(char)); + } + +/*** Convert all characters of a MetaString to uppercase. + *** + *** @param s The MetaString being modified. + ***/ +void meta_make_upper(MetaString* s) + { + for (char* i = s->str; i[0] != '\0'; i++) + *i = (char)toupper(*i); + } + +/*** @param s The MetaString being checked. + *** @param pos The character location to check within the MetaString. + *** @returns 1 if the location is out of bounds for the MetaString, + *** 0 otherwise. + ***/ +bool meta_is_out_of_bounds(MetaString* s, unsigned int pos) + { + return (s->length <= pos); + } + +/*** Checks if a character in a MetaString is a vowel. + *** + *** @param s The MetaString being checked. + *** @param pos The character location to check within the MetaString. 
+ ***/ +bool meta_is_vowel(MetaString* s, unsigned int pos) + { + if (meta_is_out_of_bounds(s, pos)) return 0; + + const char c = *(s->str + pos); + return ((c == 'A') || (c == 'E') || (c == 'I') || + (c == 'O') || (c == 'U') || (c == 'Y')); + } + +/*** Search a MetaString for "W", "K", "CZ", or "WITZ", which indicate that the + *** string is Slavo Germanic. + *** + *** @param s The MetaString to be searched. + *** @returns 1 if the MetaString is Slavo Germanic, or 0 otherwise. + ***/ +bool meta_is_slavo_germanic(MetaString* s) + { + return (strstr(s->str, "W") != NULL) + || (strstr(s->str, "K") != NULL) + || (strstr(s->str, "CZ") != NULL) + || (strstr(s->str, "WITZ") != NULL); + } + +/*** @param s The MetaString being checked. + *** @param pos The character location to check within the MetaString. + *** @returns The character at the position in the MetaString, or + *** '\0' if the position is not in the MetaString. + ***/ +char meta_get_char_at(MetaString* s, unsigned int pos) + { + return (meta_is_out_of_bounds(s, pos)) ? '\0' : ((char) *(s->str + pos)); + } + +/*** Checks for to see if any of a list of strings appear in a the given + *** MetaString after the given start position. + *** + *** @attention - Note that the START value is 0 based. + *** + *** @param s The MetaString being modified. + *** @param start The zero-based start of at which to begin searching + *** within the MetaString. + *** @param length The length of the character strings being checked. + *** @returns 1 if any of the character sequences appear after the start + *** in the MetaString and 0 otherwise. + ***/ +bool meta_is_str_at(MetaString* s, unsigned int start, ...) + { + va_list ap; + + /** Should never happen. 
**/ + if (meta_is_out_of_bounds(s, start)) + return 0; + + const char* pos = (s->str + start); + va_start(ap, start); + + char* test; + do + { + test = va_arg(ap, char*); + if (*test && (strncmp(pos, test, strlen(test)) == 0)) + return true; + } + while (test[0] != '\0'); + + va_end(ap); + + return false; + } + +/*** Adds a string to a MetaString, expanding the MetaString if needed. + *** + *** @param s The MetaString being modified. + *** @param new_str The string being added. + ***/ +void meta_add_str(MetaString* s, const char* new_str) + { + if (new_str == NULL) + return; + + const size_t add_length = strlen(new_str); + if ((s->length + add_length) > (s->bufsize - 1)) + meta_increase_buffer(s, add_length); + + strcat(s->str, new_str); + s->length += add_length; + } + +/*** Computes double metaphone. + *** + *** Example Usage: + *** ```c + *** char* primary_code; + *** char* secondary_code; + *** meta_double_metaphone(input, &primary_code, &secondary_code); + *** ``` + *** + *** @param str The string to compute. + *** @param primary_code A pointer to a buffer where the pointer to a string + *** containing the produced primary code will be stored. + *** @param secondary_code A pointer to a buffer where the pointer to a string + *** containing the produced secondary code will be stored. + ***/ +void meta_double_metaphone(const char* str, char** primary_code, char** secondary_code) + { + size_t length; + if (str == NULL || (length = strlen(str)) == 0u) { + fprintf(stderr, "Warning: Call to meta_double_metaphone() with invalid string.\n"); + + /** Double Metaphone on an invalid string yeilds two empty strings. **/ + *primary_code = (char*)SAFE_MALLOC(sizeof(char)); + *secondary_code = (char*)SAFE_MALLOC(sizeof(char)); + return; + } + unsigned int current = 0; + unsigned int last = (unsigned int)(length - 1); + + /** Pad original so we can index beyond end. 
**/ + MetaString* original = meta_new_string(str); + meta_make_upper(original); + meta_add_str(original, " "); + + MetaString* primary = meta_new_string(""); + MetaString* secondary = meta_new_string(""); + primary->free_str_on_destroy = 0; + secondary->free_str_on_destroy = 0; + + /** Skip these if they are at start of a word. **/ + if (meta_is_str_at(original, 0, "GN", "KN", "PN", "WR", "PS", "")) + current += 1; + + /** Initial 'X' is pronounced 'Z' e.g. 'Xavier' **/ + const char first_char = meta_get_char_at(original, 0); + if (first_char == 'X') + { + meta_add_str(primary, "S"); /* 'Z' maps to 'S' */ + meta_add_str(secondary, "S"); + current += 1; + } + + /** Precomputing this is useful. **/ + const bool is_slavo_germanic = meta_is_slavo_germanic(original); + + /** Main loop. **/ + while (current < length) + { + const char cur_char = meta_get_char_at(original, current); + const char next_char = meta_get_char_at(original, current + 1); + switch (cur_char) + { + case 'A': + case 'E': + case 'I': + case 'O': + case 'U': + case 'Y': + { + if (current == 0) + { + /** All init vowels now map to 'A'. **/ + meta_add_str(primary, "A"); + meta_add_str(secondary, "A"); + } + current += 1; + break; + } + + case 'B': + { + /** "-mb", e.g", "dumb", already skipped over... **/ + meta_add_str(primary, "P"); + meta_add_str(secondary, "P"); + + current += (next_char == 'B') ? 2 : 1; + break; + } + + case 'C': + { + /** Various germanic. 
**/ + if ( + (current > 1) + && !meta_is_vowel(original, current - 2) + && meta_is_str_at(original, (current - 1), "ACH", "") + && meta_get_char_at(original, current + 2) != 'I' + && ( + meta_get_char_at(original, current + 2) != 'E' + || meta_is_str_at(original, (current - 2), "BACHER", "MACHER", "") + ) + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + /** Special case 'caesar' **/ + if (current == 0 && meta_is_str_at(original, current, "CAESAR", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "S"); + current += 2; + break; + } + + /** Italian 'chianti' **/ + if (meta_is_str_at(original, current, "CHIA", "")) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + if (meta_is_str_at(original, current, "CH", "")) + { + /** Find 'michael' **/ + if (current > 0 && meta_is_str_at(original, current, "CHAE", "")) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "X"); + current += 2; + break; + } + + /** Greek roots e.g. 'chemistry', 'chorus' **/ + if ( + current == 0 + && meta_is_str_at(original, (current + 1), "HOR", "HYM", "HIA", "HEM", "HARAC", "HARIS", "") + && !meta_is_str_at(original, 0, "CHORE", "") + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + /** Germanic, greek, or otherwise 'ch' for 'kh' sound. 
*/ + if ( + meta_is_str_at(original, 0, "SCH", "VAN ", "VON ", "") + /** 'architect but not 'arch', 'orchestra', 'orchid' **/ + || meta_is_str_at(original, (current - 2), "ORCHES", "ARCHIT", "ORCHID", "") + || meta_is_str_at(original, (current + 2), "T", "S", "") + || ( + (current == 0 || meta_is_str_at(original, (current - 1), "A", "O", "U", "E", "")) + /** e.g., 'wachtler', 'wechsler', but not 'tichner' **/ + && meta_is_str_at(original, (current + 2), "L", "R", "N", "M", "B", "H", "F", "V", "W", " ", "") + ) + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + else + { + if (current > 0) + { + if (meta_is_str_at(original, 0, "MC", "")) + { + /* e.g., "McHugh" */ + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + else + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "K"); + } + } + else + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "X"); + } + } + current += 2; + break; + } + + /** e.g, 'czerny' **/ + if (meta_is_str_at(original, current, "CZ", "") + && !meta_is_str_at(original, (current - 2), "WICZ", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "X"); + current += 2; + break; + } + + /** e.g., 'focaccia' **/ + if (meta_is_str_at(original, (current + 1), "CIA", "")) + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "X"); + current += 3; + break; + } + + /** Double 'C' rule. **/ + if ( + meta_is_str_at(original, current, "CC", "") + && !(current == 1 && first_char == 'M') /* McClellan exception. 
*/ + ) + { + /** 'bellocchio' but not 'bacchus' **/ + if ( + meta_is_str_at(original, (current + 2), "I", "E", "H", "") + && !meta_is_str_at(original, (current + 2), "HU", "") + ) + { + /** 'accident', 'accede' 'succeed' **/ + if ( + (current == 1 && meta_get_char_at(original, current - 1) == 'A') + || meta_is_str_at(original, (current - 1), "UCCEE", "UCCES", "") + ) + { + meta_add_str(primary, "KS"); + meta_add_str(secondary, "KS"); + /** 'bacci', 'bertucci', other italian **/ + } + else + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "X"); + } + current += 3; + break; + } + else + { /** Pierce's rule **/ + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + } + + if (meta_is_str_at(original, current, "CK", "CG", "CQ", "")) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + if (meta_is_str_at(original, current, "CI", "CE", "CY", "")) + { + /* Italian vs. English */ + if (meta_is_str_at(original, current, "CIO", "CIE", "CIA", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "X"); + } + else + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "S"); + } + current += 2; + break; + } + + /** else **/ + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + + /** Name sent in 'mac caffrey', 'mac gregor **/ + if (meta_is_str_at(original, (current + 1), " C", " Q", " G", "")) + current += 3; + else if (meta_is_str_at(original, (current + 1), "C", "K", "Q", "") + && !meta_is_str_at(original, (current + 1), "CE", "CI", "")) + current += 2; + else + current += 1; + break; + } + + case 'D': + { + if (meta_is_str_at(original, current, "DG", "")) + { + if (meta_is_str_at(original, (current + 2), "I", "E", "Y", "")) + { + /** e.g. 'edge' **/ + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + current += 3; + break; + } + else + { + /** e.g. 
'edgar' **/ + meta_add_str(primary, "TK"); + meta_add_str(secondary, "TK"); + current += 2; + break; + } + } + + if (meta_is_str_at(original, current, "DT", "DD", "")) + { + meta_add_str(primary, "T"); + meta_add_str(secondary, "T"); + current += 2; + break; + } + + /** else **/ + meta_add_str(primary, "T"); + meta_add_str(secondary, "T"); + current += 1; + break; + } + + case 'F': + { + current += (next_char == 'F') ? 2 : 1; + meta_add_str(primary, "F"); + meta_add_str(secondary, "F"); + break; + } + + case 'G': + { + if (next_char == 'H') + { + /** 'Vghee' */ + if (current > 0 && !meta_is_vowel(original, (current - 1))) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + current += 2; + break; + } + + if (current < 3) + { + /** 'ghislane', 'ghiradelli' **/ + if (current == 0) + { + if (meta_get_char_at(original, (current + 2)) == 'I') + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + } + else + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + current += 2; + break; + } + } + + if ( + /** Parker's rule (with some further refinements) - e.g., 'hugh' **/ + (current > 1 && meta_is_str_at(original, (current - 2), "B", "H", "D", "")) + /** e.g., 'bough' **/ + || (current > 2 && meta_is_str_at(original, (current - 3), "B", "H", "D", "")) + /** e.g., 'broughton' **/ + || (current > 3 && meta_is_str_at(original, (current - 4), "B", "H", "")) + ) + { + current += 2; + break; + } + else + { + /** e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' **/ + if ( + current > 2 + && meta_get_char_at(original, (current - 1)) == 'U' + && meta_is_str_at(original, (current - 3), "C", "G", "L", "R", "T", "") + ) + { + meta_add_str(primary, "F"); + meta_add_str(secondary, "F"); + } + else if (current > 0 && meta_get_char_at(original, (current - 1)) != 'I') + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + + current += 2; + break; + } + } + + if (next_char == 'N') + { + if (current == 1 && 
!is_slavo_germanic && meta_is_vowel(original, 0)) + { + meta_add_str(primary, "KN"); + meta_add_str(secondary, "N"); + } + else + /** not e.g. 'cagney' **/ + if ( + next_char != 'Y' + && !is_slavo_germanic + && !meta_is_str_at(original, (current + 2), "EY", "") + ) + { + meta_add_str(primary, "N"); + meta_add_str(secondary, "KN"); + } + else + { + meta_add_str(primary, "KN"); + meta_add_str(secondary, "KN"); + } + current += 2; + break; + } + + /** 'tagliaro' **/ + if ( + !is_slavo_germanic + && meta_is_str_at(original, (current + 1), "LI", "") + ) + { + meta_add_str(primary, "KL"); + meta_add_str(secondary, "L"); + current += 2; + break; + } + + /** -ges-,-gep-,-gel-, -gie- at beginning **/ + if ( + current == 0 + && ( + next_char == 'Y' + || meta_is_str_at( + original, (current + 1), + "ES", "EP", "EB", "EL", "EY", "IB", + "IL", "IN", "IE", "EI", "ER", "" + ) + ) + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "J"); + current += 2; + break; + } + + /** -ger-, -gy- **/ + if ( + (next_char == 'Y' || meta_is_str_at(original, (current + 1), "ER", "")) + /** Exceptions. **/ + && !meta_is_str_at(original, 0, "DANGER", "RANGER", "MANGER", "") + && !meta_is_str_at(original, (current - 1), "E", "I", "RGY", "OGY", "") + ) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "J"); + current += 2; + break; + } + + /** Italian e.g, 'biaggi' **/ + if ( + meta_is_str_at(original, (current + 1), "E", "I", "Y", "") + || meta_is_str_at(original, (current - 1), "AGGI", "OGGI", "") + ) + { + /** Obvious germanic. **/ + if (meta_is_str_at(original, 0, "SCH", "VAN ", "VON ", "") + || meta_is_str_at(original, (current + 1), "ET", "")) + { + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + } + else + { + /** Always soft, if french ending. 
**/ + if (meta_is_str_at(original, (current + 1), "IER ", "")) + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + } + else + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "K"); + } + } + current += 2; + break; + } + + current += (next_char == 'G') ? 2 : 1; + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + break; + } + + case 'H': + { + /** Only keep if first & before vowel or between 2 vowels. **/ + if ( + (current == 0 || meta_is_vowel(original, (current - 1))) + && meta_is_vowel(original, current + 1) + ) + { + meta_add_str(primary, "H"); + meta_add_str(secondary, "H"); + current += 2; + } + else /* also takes care of 'HH' */ + current += 1; + break; + } + + case 'J': + { + /** Obvious spanish, 'jose', 'san jacinto' **/ + const bool has_jose_next = meta_is_str_at(original, current, "JOSE", ""); + const bool starts_with_san = meta_is_str_at(original, 0, "SAN ", ""); + if (has_jose_next || starts_with_san) + { + if ( + starts_with_san + /** I don't know what this condition means. **/ + || (current == 0 && meta_get_char_at(original, current + 4) == ' ') + ) + { + meta_add_str(primary, "H"); + meta_add_str(secondary, "H"); + } + else + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "H"); + } + current += 1; + break; + } + + if (current == 0 && !has_jose_next) + { + meta_add_str(primary, "J"); /* Yankelovich/Jankelowicz */ + meta_add_str(secondary, "A"); + } + else + { + /** spanish pron. of e.g. 
'bajador' **/ + if ( + !is_slavo_germanic + && (next_char == 'A' || next_char == 'O') + && meta_is_vowel(original, (current - 1)) + ) + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "H"); + } + else + { + if (current == last) + { + meta_add_str(primary, "J"); + meta_add_str(secondary, ""); + } + else + { + if ( + !meta_is_str_at(original, (current + 1), "L", "T", "K", "S", "N", "M", "B", "Z", "") + && !meta_is_str_at(original, (current - 1), "S", "K", "L", "") + ) + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + } + } + } + } + + current += (next_char == 'J') ? 2 : 1; + break; + } + + case 'K': + { + current += (next_char == 'K') ? 2 : 1; + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + break; + } + + case 'L': + { + if (next_char == 'L') + { + /** Spanish e.g. 'cabrillo', 'gallegos' **/ + if ( + ( + current == length - 3 + && meta_is_str_at(original, (current - 1), "ILLO", "ILLA", "ALLE", "") + ) + || ( + meta_is_str_at(original, (current - 1), "ALLE", "") + && ( + meta_is_str_at(original, (last - 1), "AS", "OS", "") + || meta_is_str_at(original, last, "A", "O", "") + ) + ) + ) + { + meta_add_str(primary, "L"); + meta_add_str(secondary, ""); + current += 2; + break; + } + current += 2; + } + else + current += 1; + meta_add_str(primary, "L"); + meta_add_str(secondary, "L"); + break; + } + + case 'M': + { + current += ( + ( + meta_is_str_at(original, (current - 1), "UMB", "") + && (current + 1 == last || meta_is_str_at(original, (current + 2), "ER", "")) + ) + /** 'dumb','thumb' **/ + || next_char == 'M' + ) ? 2 : 1; + meta_add_str(primary, "M"); + meta_add_str(secondary, "M"); + break; + } + + case 'N': + { + current += (next_char == 'N') ? 
2 : 1; + meta_add_str(primary, "N"); + meta_add_str(secondary, "N"); + break; + } + + case 'P': + { + if (next_char == 'H') + { + meta_add_str(primary, "F"); + meta_add_str(secondary, "F"); + current += 2; + break; + } + + /** Also account for "campbell", "raspberry" **/ + current += (meta_is_str_at(original, (current + 1), "P", "B", "")) ? 2 : 1; + meta_add_str(primary, "P"); + meta_add_str(secondary, "P"); + break; + } + + case 'Q': + { + current += (next_char == 'Q') ? 2 : 1; + meta_add_str(primary, "K"); + meta_add_str(secondary, "K"); + break; + } + + case 'R': + { + /** French e.g. 'rogier', but exclude 'hochmeier' **/ + const bool no_primary = ( + !is_slavo_germanic + && current == last + && meta_is_str_at(original, (current - 2), "IE", "") + && !meta_is_str_at(original, (current - 4), "ME", "MA", "") + ); + + meta_add_str(primary, (no_primary) ? "" : "R"); + meta_add_str(secondary, "R"); + current += (next_char == 'R') ? 2 : 1; + break; + } + + case 'S': + { + /** Special cases 'island', 'isle', 'carlisle', 'carlysle' **/ + if (meta_is_str_at(original, (current - 1), "ISL", "YSL", "")) + { + current += 1; + break; + } + + /** Special case 'sugar-' **/ + if (current == 0 && meta_is_str_at(original, current, "SUGAR", "")) + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "S"); + current += 1; + break; + } + + if (meta_is_str_at(original, current, "SH", "")) + { + const bool germanic = meta_is_str_at(original, (current + 1), "HEIM", "HOEK", "HOLM", "HOLZ", ""); + const char* sound = (germanic) ? "S" : "X"; + meta_add_str(primary, sound); + meta_add_str(secondary, sound); + current += 2; + break; + } + + /** Italian & Armenian. **/ + if (meta_is_str_at(original, current, "SIO", "SIA", "SIAN", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, (is_slavo_germanic) ? "S" : "X"); + current += 3; + break; + } + + /** german & anglicisations, e.g. 
'smith' match 'schmidt', 'snider' match 'schneider' **/ + /** also, -sz- in slavic language altho in hungarian it is pronounced 's' **/ + if (current == 0 && meta_is_str_at(original, (current + 1), "M", "N", "L", "W", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "X"); + current += 1; + break; + } + if (meta_is_str_at(original, (current + 1), "Z", "")) + { + meta_add_str(primary, "S"); + meta_add_str(secondary, "X"); + current += 2; + break; + } + + if (meta_is_str_at(original, current, "SC", "")) + { + /** Schlesinger's rule. **/ + if (meta_get_char_at(original, current + 2) == 'H') + { + /** Dutch origin, e.g. 'school', 'schooner' **/ + if (meta_is_str_at(original, (current + 3), "OO", "ER", "EN", "UY", "ED", "EM", "")) + { + /** 'schermerhorn', 'schenker' **/ + const bool x_sound = meta_is_str_at(original, (current + 3), "ER", "EN", ""); + meta_add_str(primary, (x_sound) ? "X" : "SK"); + meta_add_str(secondary, "SK"); + current += 3; + break; + } + else + { + const bool s_sound = ( + current == 0 + && !meta_is_vowel(original, 3) + && meta_get_char_at(original, 3) != 'W' + ); + meta_add_str(primary, "X"); + meta_add_str(secondary, (s_sound) ? "S" : "X"); + current += 3; + break; + } + } + + /** Default case. **/ + const char* sound = (meta_is_str_at(original, (current + 2), "E", "I", "Y", "")) ? "S" : "SK"; + meta_add_str(primary, sound); + meta_add_str(secondary, sound); + current += 3; + break; + } + + /** French e.g. 'resnais', 'artois' **/ + const bool no_primary = (current == last && meta_is_str_at(original, (current - 2), "AI", "OI", "")); + meta_add_str(primary, (no_primary) ? "" : "S"); + meta_add_str(secondary, "S"); + current += (meta_is_str_at(original, (current + 1), "S", "Z", "")) ? 
2 : 1; + break; + } + + case 'T': + { + if (meta_is_str_at(original, current, "TIA", "TCH", "TION", "")) + { + meta_add_str(primary, "X"); + meta_add_str(secondary, "X"); + current += 3; + break; + } + + if (meta_is_str_at(original, current, "TH", "TTH", "")) + { + /** Special case 'thomas', 'thames' or germanic. **/ + if ( + meta_is_str_at(original, (current + 2), "OM", "AM", "") + || meta_is_str_at(original, 0, "SCH", "VAN ", "VON ", "") + ) + meta_add_str(primary, "T"); + else + meta_add_str(primary, "0"); /* Yes, zero. */ + meta_add_str(secondary, "T"); + current += 2; + break; + } + + meta_add_str(primary, "T"); + meta_add_str(secondary, "T"); + current += (meta_is_str_at(original, (current + 1), "T", "D", "")) ? 2 : 1; + break; + } + + case 'V': + { + meta_add_str(primary, "F"); + meta_add_str(secondary, "F"); + current += (next_char == 'V') ? 2 : 1; + break; + } + + case 'W': + { + /** Can also be in middle of word. **/ + if (meta_is_str_at(original, current, "WR", "")) + { + meta_add_str(primary, "R"); + meta_add_str(secondary, "R"); + current += 2; + break; + } + + const bool next_is_vowel = meta_is_vowel(original, current + 1); + if (current == 0 && (next_is_vowel || meta_is_str_at(original, current, "WH", ""))) + { + /** Wasserman should match Vasserman. **/ + meta_add_str(primary, "A"); + meta_add_str(secondary, (next_is_vowel) ? "F" : "A"); + } + + /** Arnow should match Arnoff. **/ + if ((current == last && meta_is_vowel(original, current - 1)) + || meta_is_str_at(original, (current - 1), "EWSKI", "EWSKY", "OWSKI", "OWSKY", "") + || meta_is_str_at(original, 0, "SCH", "") + ) + { + meta_add_str(primary, ""); + meta_add_str(secondary, "F"); + current += 1; + break; + } + + /** Polish e.g. 'filipowicz' **/ + if (meta_is_str_at(original, current, "WICZ", "WITZ", "")) + { + meta_add_str(primary, "TS"); + meta_add_str(secondary, "FX"); + current += 4; + break; + } + + /** Else skip it. **/ + current += 1; + break; + } + + case 'X': + { + /** French e.g. 
breaux **/ + const bool silent = ( + current == last + && ( + meta_is_str_at(original, (current - 2), "AU", "OU", "") + || meta_is_str_at(original, (current - 3), "IAU", "EAU", "") + ) + ); + if (!silent) + { + meta_add_str(primary, "KS"); + meta_add_str(secondary, "KS"); + } + + current += (meta_is_str_at(original, (current + 1), "C", "X", "")) ? 2 : 1; + break; + } + + case 'Z': + { + /** Chinese pinyin e.g. 'zhao' **/ + if (next_char == 'H') + { + meta_add_str(primary, "J"); + meta_add_str(secondary, "J"); + current += 2; + break; + } + + const bool has_t_sound = ( + meta_is_str_at(original, (current + 1), "ZO", "ZI", "ZA", "") + || (is_slavo_germanic && current > 0 && meta_get_char_at(original, (current - 1)) != 'T') + ); + meta_add_str(primary, "S"); + meta_add_str(secondary, (has_t_sound) ? "TS" : "S"); + current += (next_char == 'Z') ? 2 : 1; + break; + } + + default: + current += 1; + } + } + + *primary_code = primary->str; + *secondary_code = secondary->str; + + meta_destroy_string(original); + meta_destroy_string(primary); + meta_destroy_string(secondary); + } + +#ifdef TESTING +/*** Built in test cases. + *** + *** These tests have been integrated into the Centrallix testing environment, + *** where they can be run using `export TONLY=expfn_double_metaphone_00`, + *** followed by make test, in the Centrallix directory. + *** + *** The can also be run here by executing the following commands in the + *** centrallix/expression directory, which aditionally generates a coverage + *** report. These tests cover all parts of the double metaphone algorithm, + *** although some of the error cases in various helper functions (such as + *** meta_destroy_string(null)) are not covered by testing. + *** + *** Commands: + *** gcc exp_double_metaphone.c -o exp_double_metaphone.o -I .. 
-DTESTING -fprofile-arcs -ftest-coverage -O0 + *** ./exp_double_metaphone.o + *** gcov exp_double_metaphone.c + ***/ + +unsigned int num_tests_passed = 0u, num_tests_failed = 0u; + +void test(const char* input, const char* expected_primary, const char* expected_secondary) { + char* codes[2]; + + /** Run DoubleMetaphone() and extract results. **/ + char* actual_primary; + char* actual_secondary; + meta_double_metaphone( + input, + memset(&actual_primary, 0, sizeof(actual_primary)), + memset(&actual_secondary, 0, sizeof(actual_secondary)) + ); + + /** Test for correct value. **/ + if (!strcmp(expected_primary, actual_primary) && + !strcmp(expected_secondary, actual_secondary)) + num_tests_passed++; + else + { + printf( + "\nTEST FAILED: \"%s\"\n" + "Expected: %s %s\n" + "Actual: %s %s\n", + input, + expected_primary, expected_secondary, + actual_primary, actual_secondary + ); + num_tests_failed++; + } + } + +// Special thanks to the following websites for double checking the correct results: +// 1: https://words.github.io/double-metaphone +// 2: https://mainegenealogy.net/metaphone_converter.asp +// 3: https://en.toolpage.org/tool/metaphone +void run_tests(void) { + printf("\nRunning tests...\n"); + + /** Test that always fails. **/ + // test("This", "test", "fails."); + + /** Invalid string tests, by Israel. **/ + fprintf(stderr, "There should be two warnings after this line.\n"); + test(NULL, "", ""); + test("", "", ""); + + /** Basic tests, by Israel. **/ + test("Test", "TST", "TST"); + test("Basic", "PSK", "PSK"); + test("Centrallix", "SNTRLKS", "SNTRLKS"); + test("Lawrence", "LRNS", "LRNS"); + test("Philips", "FLPS", "FLPS"); + test("Acceptingness", "AKSPTNNS", "AKSPTNKNS"); + test("Supercalifragilisticexpialidocious", "SPRKLFRJLSTSKSPLTSS", "SPRKLFRKLSTSKSPLTXS"); + test("Suoicodilaipxecitsiligarfilacrepus", "SKTLPKSSTSLKRFLKRPS", "SKTLPKSSTSLKRFLKRPS"); + + /** Match tests from code comments above. 
**/ + test("Smith", "SM0", "XMT"); + test("Schmidt", "XMT", "SMT"); + test("Snider", "SNTR", "XNTR"); + test("Schneider", "XNTR", "SNTR"); + test("Arnow", "ARN", "ARNF"); + test("Arnoff", "ARNF", "ARNF"); + + /** Tests from examples in code comments above. **/ + test("Accede", "AKST", "AKST"); + test("Accident", "AKSTNT", "AKSTNT"); + test("Actually", "AKTL", "AKTL"); + test("Arch", "ARX", "ARK"); + test("Artois", "ART", "ARTS"); + test("Bacchus", "PKS", "PKS"); + test("Bacci", "PX", "PX"); + test("Bajador", "PJTR", "PHTR"); + test("Bellocchio", "PLX", "PLX"); + test("Bertucci", "PRTX", "PRTX"); + test("Biaggi", "PJ", "PK"); + test("Bough", "P", "P"); + test("Breaux", "PR", "PR"); + test("Broughton", "PRTN", "PRTN"); + test("Cabrillo", "KPRL", "KPR"); + test("Caesar", "SSR", "SSR"); + test("Cagney", "KKN", "KKN"); + test("Campbell", "KMPL", "KMPL"); + test("Carlisle", "KRLL", "KRLL"); + test("Carlysle", "KRLL", "KRLL"); + test("Chemistry", "KMSTR", "KMSTR"); + test("Chianti", "KNT", "KNT"); + test("Chorus", "KRS", "KRS"); + test("Cough", "KF", "KF"); + test("Czerny", "SRN", "XRN"); + test("Dumb", "TM", "TM"); + test("Edgar", "ATKR", "ATKR"); + test("Edge", "AJ", "AJ"); + test("Filipowicz", "FLPTS", "FLPFX"); + test("Focaccia", "FKX", "FKX"); + test("Gallegos", "KLKS", "KKS"); + test("Germanic", "KRMNK", "JRMNK"); + test("Ghiradelli", "JRTL", "JRTL"); + test("Ghislane", "JLN", "JLN"); + test("Gospel", "KSPL", "KSPL"); + test("Gough", "KF", "KF"); + test("Greek", "KRK", "KRK"); + test("Hochmeier", "HKMR", "HKMR"); + test("Hugh", "H", "H"); + test("Island", "ALNT", "ALNT"); + test("Isle", "AL", "AL"); + test("Italian", "ATLN", "ATLN"); + test("Jankelowicz", "JNKLTS", "ANKLFX"); + test("Jose", "HS", "HS"); + test("Laugh", "LF", "LF"); + test("Mac Caffrey", "MKFR", "MKFR"); + test("Mac Gregor", "MKRKR", "MKRKR"); + test("Manager", "MNKR", "MNJR"); + test("McHugh", "MK", "MK"); + test("McLaughlin", "MKLFLN", "MKLFLN"); + test("Michael", "MKL", "MXL"); + test("Middle", 
"MTL", "MTL"); + test("Orchestra", "ARKSTR", "ARKSTR"); + test("Orchid", "ARKT", "ARKT"); + test("Pinyin", "PNN", "PNN"); + test("Raspberry", "RSPR", "RSPR"); + test("Resnais", "RSN", "RSNS"); + test("Rogier", "RJ", "RJR"); + test("Rough", "RF", "RF"); + test("Salvador", "SLFTR", "SLFTR"); + test("San jacinto", "SNHSNT", "SNHSNT"); + test("Schenker", "XNKR", "SKNKR"); + test("Schermerhorn", "XRMRRN", "SKRMRRN"); + test("Schlesinger", "XLSNKR", "SLSNJR"); + test("School", "SKL", "SKL"); + test("Schooner", "SKNR", "SKNR"); + test("Succeed", "SKST", "SKST"); + test("Sugar", "XKR", "SKR"); + test("Sugary", "XKR", "SKR"); + test("Tagliaro", "TKLR", "TLR"); + test("Thames", "TMS", "TMS"); + test("Thomas", "TMS", "TMS"); + test("Thumb", "0M", "TM"); + test("Tichner", "TXNR", "TKNR"); + test("Tough", "TF", "TF"); + test("Vghee", "FK", "FK"); + test("Wachtler", "AKTLR", "FKTLR"); + test("Wechsler", "AKSLR", "FKSLR"); + test("Word", "ART", "FRT"); + test("Xavier", "SF", "SFR"); + test("Yankelovich", "ANKLFX", "ANKLFK"); + test("Zhao", "J", "J"); + + /** Intereesting Edge Case: "McClellan" **/ + /*** Note: Sources (1) and (3) both include a double K ("MKKLLN"), but the + *** original code on GitHub and mainegenealogy.net do not. I chose "MKLLN" + *** to be correct because I personally do not pronounce the second c. + ***/ + test("McClellan", "MKLLN", "MKLLN"); + + /** Maurice Aubrey's Tests. 
**/ + /** Source: https://github.com/gitpan/Text-DoubleMetaphone/blob/master/t/words.txt **/ + test("maurice", "MRS", "MRS"); + test("aubrey", "APR", "APR"); + test("cambrillo", "KMPRL", "KMPR"); + test("heidi", "HT", "HT"); + test("katherine", "K0RN", "KTRN"); + test("catherine", "K0RN", "KTRN"); + test("richard", "RXRT", "RKRT"); + test("bob", "PP", "PP"); + test("eric", "ARK", "ARK"); + test("geoff", "JF", "KF"); + test("dave", "TF", "TF"); + test("ray", "R", "R"); + test("steven", "STFN", "STFN"); + test("bryce", "PRS", "PRS"); + test("randy", "RNT", "RNT"); + test("bryan", "PRN", "PRN"); + test("brian", "PRN", "PRN"); + test("otto", "AT", "AT"); + test("auto", "AT", "AT"); + + /** GPT-5 Coverage Tests. **/ + /*** GPT-5 mini (Preview) running in GitHub Copilot suggested the words + *** after analizing a generated coverage report, and I (Israel) used + *** them to write the tests below. I kept the AI's reasoning for tests, + *** while removing tests that did not contribute any coverage, but after + *** a few reprompts, the AI started just giving words without reasoning. + *** I guess we were both getting pretty tired. + ***/ + test("Abbott", "APT", "APT"); /* double-B ("BB") handling. */ + test("Back", "PK", "PK"); /* "CK"/"CG"/"CQ" branch. */ + test("Bacher", "PKR", "PKR"); /* matches "...BACHER" / ACH special-case. */ + test("Charles", "XRLS", "XRLS"); /* initial "CH" -> the branch that maps to "X"/"X" at start. */ + test("Ghana", "KN", "KN"); /* initial "GH" special-start handling. */ + test("Gnome", "NM", "NM"); /* "GN" sequence handling. */ + test("Raj", "RJ", "R"); /* J at end (exercise J-last behavior). */ + test("Quentin", "KNTN", "KNTN"); /* Q case (Q -> K mapping). */ + test("Who", "A", "A"); /* "WH" at start handling. */ + test("Shoemaker", "XMKR", "XMKR"); /* "SH" general mapping paths. */ + test("Sian", "SN", "XN"); /* "SIO"/"SIA"/"SIAN" branch. */ + test("Scold", "SKLT", "SKLT"); /* "SC" default / "SK" vs other SC subcases. 
*/ + test("Station", "STXN", "STXN"); /* "TION" -> X mapping. */ + test("Match", "MX", "MX"); /* "TCH"/"TIA" -> X mapping. */ + test("Pizza", "PS", "PTS"); /* double-Z ("ZZ") handling. */ + test("Agnes", "AKNS", "ANS"); /* "GN" at index 1 (GN handling that yields KN / N). */ + test("Science", "SNS", "SNS"); /* "SC" followed by I (SC + I/E/Y branch). */ + test("Van Gogh", "FNKK", "FNKK"); + test("Josef", "JSF", "HSF"); + test("Object", "APJKT", "APJKT"); + test("Sholz", "SLS", "SLS"); + test("Scharf", "XRF", "XRF"); + test("Kasia", "KS", "KS"); + test("Van Geller", "FNKLR", "FNKLR"); + + const unsigned int total_tests = num_tests_passed + num_tests_failed; + printf("\nTests completed!\n"); + printf(" > Failed: %u\n", num_tests_failed); + printf(" > Skipped: %u\n", 0u); /* Implementation removed. */ + printf(" > Passed: %u/%u\n", num_tests_passed, total_tests); +} + +int main(void) { + run_tests(); + return 0; +} + +/** Prevent scope leak. **/ +#undef META_FREE +#undef META_MALLOC +#undef META_REALLOC +#undef SAFE_MALLOC +#undef SAFE_REALLOC + +#endif diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index 6425114db..df55559be 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -1,27 +1,3 @@ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include "obj.h" -#include "cxlib/mtask.h" -#include "cxlib/xarray.h" -#include "cxlib/xhash.h" -#include "cxlib/mtlexer.h" -#include "expression.h" -#include "cxlib/mtsession.h" -#include "cxss/cxss.h" -#include -#include -#include -#include -#include - - /************************************************************************/ /* Centrallix Application Server System */ /* Centrallix Core */ @@ -65,6 +41,48 @@ /* that issue in exp_evaluate.c */ /************************************************************************/ +#define _GNU_SOURCE +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxlib/clusters.h" +#include "cxlib/mtask.h" +#include "cxlib/mtlexer.h" +#include "cxlib/mtsession.h" +#include "cxlib/newmalloc.h" +#include "cxlib/xarray.h" +#include "cxlib/xhash.h" +#include "cxss/cxss.h" +#include "expression.h" +#include "obj.h" + +/** Duplocate detection settings. **/ +// #define SEPARATOR "|" +// #define SEPARATOR_CHAR '|' +// #define DBL_BUF_SIZE 16u +// #define USE_PARALLEL_COMPLETE_SEARCH true +// #define MIN_PARALLEL_COMPLETE_SEARCH 1000 +// #define MAX_COMPLETE_SEARCH 50 * 1000 // Default: 100 * 1000 +// #define KMEANS_IMPROVEMENT_THRESHOLD 0.0002 +#define EXP_NUM_DIMS 251 /* aka. The size of the vector table. */ +const int EXP_VECTOR_TABLE_SIZE = EXP_NUM_DIMS; /* Should probably be removed. */ /****** Evaluator functions follow for expEvalFunction ******/ @@ -1111,7 +1129,7 @@ int exp_fn_reverse(pExpression tree, pParamObjects objlist, pExpression i0, pExp return 0; } - +/** Leading zero trim. */ int exp_fn_lztrim(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) { char* ptr; @@ -1337,6 +1355,7 @@ int exp_fn_ralign(pExpression tree, pParamObjects objlist, pExpression i0, pExpr tree->Alloc = 0; tree->String = tree->Types.StringBuf; } + /** Possible overflow? **/ sprintf(tree->String,"%*.*s",i1->Integer,i1->Integer,i0->String); } return 0; @@ -4119,9 +4138,6 @@ int exp_fn_lev_compare(pExpression tree, pParamObjects objlist, pExpression i0, return 0; } -// This is the size of the vector table. It is also used in calculating the table indices. -const int EXP_VECTOR_TABLE_SIZE = 251; - /* * hash_char_pair * This method creates an vector table index based a given character pair. 
The characters are represented @@ -4151,6 +4167,8 @@ int exp_fn_i_hash_char_pair(double num1, double num2) * * Returns: * 0 + * + * LINK ../../centrallix-sysdoc/string_comparison.md#exp_fn_i_frequency_table */ int exp_fn_i_frequency_table(unsigned short *table, char *term) { @@ -4233,7 +4251,9 @@ int exp_fn_i_frequency_table(unsigned short *table, char *term) * r_freq_table2 : the second vector (unsigned short) * * Returns: - * 0 + * 0 + * + * LINK ../../centrallix-sysdoc/string_comparison.md#exp_fn_i_dot_product */ int exp_fn_i_dot_product(double *dot_product, unsigned short *r_freq_table1, unsigned short *r_freq_table2) { @@ -4252,6 +4272,8 @@ int exp_fn_i_dot_product(double *dot_product, unsigned short *r_freq_table1, uns * Parameters: * magnitude : the place where the result is stored (double) * r_freq_table : the vector (unsigned short) + * + * LINK ../../centrallix-sysdoc/string_comparison.md#exp_fn_i_magnitude */ int exp_fn_i_magnitude(double *magnitude, unsigned short *r_freq_table) { @@ -4271,13 +4293,15 @@ int exp_fn_i_magnitude(double *magnitude, unsigned short *r_freq_table) * * Parameters: * tree : structure where output is stored - * objlist: + * objlist : unused * i0 : first data entry (pExpression) * i1 : second data entry (pExpression) - * i2 : + * i2 : unused * * Returns: - * 0 + * 0 + * + * LINK ../../centrallix-sysdoc/string_comparison.md#exp_fn_similarity */ int exp_fn_cos_compare(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) { @@ -4343,6 +4367,1722 @@ int exp_fn_cos_compare(pExpression tree, pParamObjects objlist, pExpression i0, return 0; } +// /*** ========================= +// *** DUPE SECTION +// *** By: Israel Fuller +// *** Last Updated: September, 2025 +// *** +// *** This section of the file deals with finding duplocates. +// ***/ + +// /*** @brief Returns the smaller of two values. +// *** +// *** @param a The first value. +// *** @param b The second value. 
+// *** @return The smaller of the two values. +// *** +// *** @note This macro uses GNU C extensions and is type-safe. +// ***/ +// #define min(a, b) ({ \ +// __typeof__ (a) _a = (a); \ +// __typeof__ (b) _b = (b); \ +// (_a < _b) ? _a : _b; \ +// }) + +// /*** @brief Returns the larger of two values. +// *** +// *** @param a The first value. +// *** @param b The second value. +// *** @return The larger of the two values. +// *** +// *** @note This macro uses GNU C extensions and is type-safe. +// ***/ +// #define max(a, b) ({ \ +// __typeof__ (a) _a = (a); \ +// __typeof__ (b) _b = (b); \ +// (_a > _b) ? _a : _b; \ +// }) + +// /** The character used to create a pair with the first and last characters of a string. **/ +// #define EXP_BOUNDARY_CHAR ('a' - 1) + +// /*** Helpful error handling function. **/ +// void mssErrorf(int clr, char* module, const char* format, ...); + +// /*** Gets the hash, representing a pair of ASCII characters, represented by unsigned ints. +// *** +// *** @param num1 The first character in the pair. +// *** @param num1 The second character in the pair. +// *** @returns The resulting hash. +// ***/ +// unsigned int exp_fn_get_char_pair_hash(const unsigned int num1, const unsigned int num2) +// { +// if (num1 == EXP_BOUNDARY_CHAR && num2 == EXP_BOUNDARY_CHAR) +// { +// mssErrorf(1, "EXP", +// "exp_fn_get_char_pair_hash(%u, %u) - Warning: Pair of boundary characters.", +// num1, num2 +// ); +// } +// const double sum = (num1 * num1 * num1) + (num2 * num2 * num2); +// const double scale = ((double)num1 + 1.0) / ((double)num2 + 1.0); +// const unsigned int hash = (unsigned int)round(sum * scale) - 1u; +// return hash % EXP_NUM_DIMS; +// } + +// /*** Builds a vector using a string. +// *** +// *** Vectors are based on the frequencies of character pairs in the string. +// *** Space characters and punctuation characters (see code for list) are ignored, +// *** and all characters are converted to lowercase. 
Character 96, which is just +// *** before 'a' in the ASCII table (and maps to '`') is used to make pairs on the +// *** start and end of strings. The only supported characters for the passed char* +// *** are spaces, punctuation, uppercase and lowercase letters, and numbers. +// *** +// *** This results in the following modified ASCII table: +// *** ```csv +// *** #, char, #, char, #, char +// *** 97, a, 109, m, 121, y +// *** 98, b, 110, n, 122, z +// *** 99, c, 111, o, 123, 0 +// *** 100, d, 112, p, 124, 1 +// *** 101, e, 113, q, 125, 2 +// *** 102, f, 114, r, 126, 3 +// *** 103, g, 115, s, 127, 4 +// *** 104, h, 116, t, 128, 5 +// *** 105, i, 117, u, 129, 6 +// *** 106, j, 118, v, 130, 7 +// *** 107, k, 119, w, 131, 8 +// *** 108, l, 120, x, 132, 9 +// *** ``` +// *** Thus, any number from 96 (the start/end character) to 132 ('9') is a valid +// *** input to get_char_pair_hash(). +// *** +// *** After hashing each character pair, we add some number from 1 to 13 to the +// *** coresponding dimention. However, for most names, this results in a lot of +// *** zeros and a FEW positive numbers. Thus, after creating the dense vector, +// *** we convert it to a sparse vector in which a negative number replaces a run +// *** of that many zeros. Consider the following example: +// *** +// *** Dense Vector: `[1,0,0,0,3,0]` +// *** +// *** Sparse Vector: `[1,-3,3,-1]` +// *** +// *** Using these sparse vectors greatly reduces the required memory and gives +// *** aproximately an x5 boost to performance when traversing vectors, at the +// *** cost of more algorithmically complex code. +// *** +// *** @param str The string to be divided into pairs and hashed to make the vector. +// *** @returns The sparse vector built using the hashed character pairs. +// ***/ +// int* build_vector(char* str) { +// /** Allocate space for a dense vector. **/ +// unsigned int dense_vector[EXP_NUM_DIMS] = {0u}; + +// /** j is the former character, i is the latter. 
**/ +// const unsigned int num_chars = (unsigned int)strlen(str); +// for (unsigned int j = 65535u, i = 0u; i <= num_chars; i++) +// { +// /** isspace: space, \n, \v, \f, \r **/ +// if (isspace(str[i])) continue; + +// /** ispunct: !"#$%&'()*+,-./:;<=>?@[\]^_{|}~ **/ +// if (ispunct(str[i]) && str[i] != EXP_BOUNDARY_CHAR) continue; + +// /*** iscntrl (0-8): SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS +// *** (14-31): SO, SI, DLE, DC1-4, NAK, SYN, ETB, CAN +// *** EM, SUB, ESC, FS, GS, RS, US +// ***/ +// if (iscntrl(str[i]) && i != num_chars) { +// mssErrorf(1, "EXP", +// "build_vector(%s) - Warning: Skipping unknown character #%u.\n", +// str, (unsigned int)str[i] +// ); +// continue; +// } + +// /** First and last character should fall one before 'a' in the ASCII table. **/ +// unsigned int temp1 = (j == 65535u) ? EXP_BOUNDARY_CHAR : (unsigned int)tolower(str[j]); +// unsigned int temp2 = (i == num_chars) ? EXP_BOUNDARY_CHAR : (unsigned int)tolower(str[i]); + +// /** Shift numbers to the end of the lowercase letters. **/ +// if ('0' <= temp1 && temp1 <= '9') temp1 += 75u; +// if ('0' <= temp2 && temp2 <= '9') temp2 += 75u; + +// /** Hash the character pair into an index (dimension). **/ +// /** Note that temp will be between 97 ('a') and 132 ('9'). **/ +// unsigned int dim = exp_fn_get_char_pair_hash(temp1, temp2); + +// /** Increment the dimension of the dense vector by a number from 1 to 13. **/ +// dense_vector[dim] += (temp1 + temp2) % 13u + 1u; + +// j = i; +// } + +// /** Count how much space is needed for a sparse vector. **/ +// bool zero_prev = false; +// size_t size = 0u; +// for (unsigned int dim = 0u; dim < EXP_NUM_DIMS; dim++) +// { +// if (dense_vector[dim] == 0u) +// { +// size += (zero_prev) ? 0u : 1u; +// zero_prev = true; +// } +// else +// { +// size++; +// zero_prev = false; +// } +// } + +// /*** Check compression size. +// *** If this check fails, I doubt anything will break. 
However, the longest +// *** word I know (supercalifragilisticexpialidocious) has only 35 character +// *** pairs, so it shouldn't reach half this size (and it'd be even shorter +// *** if the hash generates at least one collision). +// *** +// *** Bad vector compression will result in degraded performace and increased +// *** memory usage, and likely also indicates a bug or modified assumption +// *** elsewhere in the code. +// *** +// *** If this warning is ever generated, it's definitely worth investigating. +// ***/ +// const size_t expected_max_size = 64u; +// if (size > expected_max_size) +// { +// mssErrorf(1, "EXP" +// "build_vector(%s) - Warning: Sparse vector larger than expected.\n" +// " > Size: %lu\n" +// " > #Dims: %u\n", +// str, +// size, +// EXP_NUM_DIMS +// ); +// } + +// /** Allocate space for sparse vector. **/ +// const size_t sparse_vector_size = size * sizeof(int); +// int* sparse_vector = (int*)nmSysMalloc(sparse_vector_size); +// if (sparse_vector == NULL) { +// mssErrorf(1, "EXP", +// "build_vector(%s) - nmSysMalloc(%lu) failed.", +// str, sparse_vector_size +// ); +// return NULL; +// } + +// /** Convert the dense vector above to a sparse vector. **/ +// unsigned int j = 0u, sparse_idx = 0u; +// while (j < EXP_NUM_DIMS) +// { +// if (dense_vector[j] == 0u) +// { +// /*** Count and store consecutive zeros, except the first one, +// *** which we already know is zero. +// ***/ +// unsigned int zero_count = 1u; +// j++; +// while (j < EXP_NUM_DIMS && dense_vector[j] == 0u) +// { +// zero_count++; +// j++; +// } +// sparse_vector[sparse_idx++] = (int)-zero_count; +// } +// else +// { +// /** Store the value. **/ +// sparse_vector[sparse_idx++] = (int)dense_vector[j++]; +// } +// } + +// return sparse_vector; +// } + +// /*** Compute the magnitude of a sparsely allocated vector. +// *** +// *** @param vector The vector. +// *** @returns The computed magnitude. 
+// ***/ +// double exp_fn_magnitude_sparse(const int* vector) +// { +// unsigned int magnitude = 0u; +// for (unsigned int i = 0u, dim = 0u; dim < EXP_NUM_DIMS;) +// { +// const int val = vector[i++]; + +// /** Negative val represents -val 0s in the array, so skip that many values. **/ +// if (val < 0) dim += (unsigned)(-val); + +// /** We have a param_value, so square it and add it to the magnitude. **/ +// else { magnitude += (unsigned)(val * val); dim++; } +// } +// return sqrt((double)magnitude); +// } + +// /*** Compute the magnitude of a densely allocated centroid. +// *** +// *** @param centroid The centroid. +// *** @returns The computed magnitude. +// ***/ +// double exp_fn_magnitude_dense(const double* centroid) +// { +// double magnitude = 0.0; +// for (int i = 0; i < EXP_NUM_DIMS; i++) +// magnitude += centroid[i] * centroid[i]; +// return sqrt(magnitude); +// } + +// /*** Parse a token from a sparsely allocated vector and write the param_value and +// *** number of remaining values to the passed locations. +// *** +// *** @param token The sparse vector token being parsed. +// *** @param remaining The location to save the remaining number of characters. +// *** @param param_value The location to save the param_value of the token. +// ***/ +// void exp_fn_parse_token(const int token, unsigned int* remaining, unsigned int* param_value) { +// if (token < 0) +// { +// /** This run contains -token zeros. **/ +// *remaining = (unsigned)(-token); +// *param_value = 0u; +// } +// else +// { +// /** This run contains one param_value. **/ +// *remaining = 1u; +// *param_value = (unsigned)(token); +// } +// } + +// /*** Calculate the similarity on sparcely allocated vectors. Comparing +// *** any string to an empty string should always return 0.5 (untested). +// *** +// *** @param v1 Sparse vector #1. +// *** @param v2 Sparse vector #2. +// *** @returns Similarity between 0 and 1 where +// *** 1 indicates identical and +// *** 0 indicates completely different. 
+// ***/ +// double exp_fn_sparse_similarity(const int* v1, const int* v2) +// { +// /** Calculate dot product. **/ +// unsigned int vec1_remaining = 0u, vec2_remaining = 0u; +// unsigned int dim = 0u, i1 = 0u, i2 = 0u, dot_product = 0u; +// while (dim < EXP_NUM_DIMS) +// { +// unsigned int val1 = 0u, val2 = 0u; +// if (vec1_remaining == 0u) exp_fn_parse_token(v1[i1++], &vec1_remaining, &val1); +// if (vec2_remaining == 0u) exp_fn_parse_token(v2[i2++], &vec2_remaining, &val2); + +// /*** Accumulate the dot_product. If either vector is 0 here, +// *** the total is 0 and this statement does nothing. +// ***/ +// dot_product += val1 * val2; + +// /** Consume overlap from both runs. **/ +// unsigned int overlap = min(vec1_remaining, vec2_remaining); +// vec1_remaining -= overlap; +// vec2_remaining -= overlap; +// dim += overlap; +// } + +// /** Optional optimization to speed up nonsimilar vectors. **/ +// if (dot_product == 0u) return 0.0; + +// /** Return the difference score. **/ +// return (double)dot_product / (exp_fn_magnitude_sparse(v1) * exp_fn_magnitude_sparse(v2)); +// } + +// /*** Calculate the difference on sparcely allocated vectors. Comparing +// *** any string to an empty string should always return 0.5 (untested). +// *** +// *** @param v1 Sparse vector #1. +// *** @param v2 Sparse vector #2. +// *** @returns Similarity between 0 and 1 where +// *** 1 indicates completely different and +// *** 0 indicates identical. +// ***/ +// #define exp_fn_sparse_dif(v1, v2) (1.0 - exp_fn_sparse_similarity(v1, v2)) + +// /*** Calculate the similarity between a sparsely allocated vector +// *** and a densely allocated centroid. Comparing any string to an +// *** empty string should always return 0.5 (untested). +// *** +// *** @param v1 Sparse vector #1. +// *** @param c1 Dense centroid #2. +// *** @returns Similarity between 0 and 1 where +// *** 1 indicates identical and +// *** 0 indicates completely different. 
+// ***/ +// double exp_fn_sparse_similarity_c(const int* v1, const double* c2) +// { +// /** Calculate dot product. **/ +// double dot_product = 0.0; +// for (unsigned int i = 0u, dim = 0u; dim < EXP_NUM_DIMS;) +// { +// const int val = v1[i++]; + +// /** Negative val represents -val 0s in the array, so skip that many values. **/ +// if (val < 0) dim += (unsigned)(-val); + +// /** We have a param_value, so square it and add it to the magnitude. **/ +// else dot_product += (double)val * c2[dim++]; +// } + +// /** Return the difference score. **/ +// return dot_product / (exp_fn_magnitude_sparse(v1) * exp_fn_magnitude_dense(c2)); +// } + +// /*** Calculate the difference between a sparsely allocated vector +// *** and a densely allocated centroid. Comparing any string to an +// *** empty string should always return 0.5 (untested). +// *** +// *** @param v1 Sparse vector #1. +// *** @param c1 Dense centroid #2. +// *** @returns Difference between 0 and 1 where +// *** 1 indicates completely different and +// *** 0 indicates identical. +// ***/ +// #define exp_fn_sparse_dif_c(v1, c2) (1.0 - exp_fn_sparse_similarity_c(v1, c2)) + +// /*** Calculate the average size of all clusters in a set of vectors. +// *** +// *** @param vectors The vectors of the dataset (allocated sparsely). +// *** @param num_vectors The number of vectors in the dataset. +// *** @param labels The clusters to which vectors are assigned. +// *** @param centroids The locations of the centroids (allocated densely). +// *** @param num_clusters The number of centroids (k). +// *** @returns The average cluster size. 
+// ***/ +// double exp_fn_get_cluster_size( +// int** vectors, +// const unsigned int num_vectors, +// unsigned int* labels, +// double centroids[][EXP_NUM_DIMS], +// const unsigned int num_clusters +// ) +// { +// double cluster_sums[num_clusters]; +// unsigned int cluster_counts[num_clusters]; +// for (unsigned int i = 0u; i < num_clusters; i++) +// cluster_sums[i] = 0.0; +// memset(cluster_counts, 0, sizeof(cluster_counts)); + +// /** Sum the difference from each vector to its cluster centroid. **/ +// for (unsigned int i = 0u; i < num_vectors; i++) +// { +// const unsigned int label = labels[i]; +// cluster_sums[label] += exp_fn_sparse_dif_c(vectors[i], centroids[label]); +// cluster_counts[label]++; +// } + +// /** Add up the average cluster size. **/ +// double cluster_total = 0.0; +// unsigned int num_valid_clusters = 0u; +// for (unsigned int label = 0u; label < num_clusters; label++) +// { +// const unsigned int cluster_count = cluster_counts[label]; +// if (cluster_count == 0u) continue; + +// cluster_total += cluster_sums[label] / cluster_count; +// num_valid_clusters++; +// } + +// /** Return average sizes. **/ +// return cluster_total / num_valid_clusters; +// } + +// /*** Compute the param_value for `k` (number of clusters), given a dataset of with +// *** a size of `n`. +// *** +// *** The following table shows data sizes vs.selected cluster size. In testing, +// *** these numbers tended to givea good balance of accuracy and dulocates detected. +// *** +// *** ```csv +// *** Data Size, Actual +// *** 10k, 12 +// *** 100k, 33 +// *** 1M, 67 +// *** 4M, 93 +// *** ``` +// *** +// *** This function is not intended for datasets smaller than (`n < ~2000`). +// *** These should be handled using complete search. +// *** +// *** LaTeX Notation: \log_{36}\left(n\right)^{3.1}-8 +// *** +// *** @param n The size of the dataset. +// *** @returns k, the number of clusters to use. 
+// *** +// *** Complexity: `O(1)` +// ***/ +// unsigned int exp_fn_compute_k(const unsigned int n) +// { +// return (unsigned)max(2, pow(log(n) / log(36), 3.2) - 8); +// } + +// /*** Executes the k-means clustering algorithm. Selects NUM_CLUSTERS random +// *** vectors as initial centroids. Then points are assigned to the nearest +// *** centroid, after which centroids are moved to the center of their points. +// *** +// *** @param vectors The vectors to cluster. +// *** @param num_vectors The number of vectors to cluster. +// *** @param labels Stores the final cluster identities of the vectors after +// *** clustering is completed. +// *** @param centroids Stores the locations of the centroids used for the clusters +// *** of the data. +// *** @param iterations The number of iterations that actually executed is stored +// *** here. Leave this NULL if you don't care. +// *** @param max_iter The max number of iterations. +// *** @param num_clusters The number of clusters to generate. +// *** +// *** @attention - Assumes: num_vectors is the length of vectors. +// *** @attention - Assumes: num_clusters is the length of labels. +// *** +// *** @attention - Issue: At larger numbers of clustering iterations, some +// *** clusters have a size of negative infinity. In this implementation, +// *** the bug is mitigated by setting a small number of max iterations, +// *** such as 16 instead of 100. +// *** @attention - Issue: Clusters do not apear to improve much after the first +// *** iteration, which puts the efficacy of the algorithm into question. This +// *** may be due to the uneven density of a typical dataset. However, the +// *** clusters still offer useful information. 
+// *** +// *** Complexity: +// *** +// *** - `O(kd + k + i*(k + n*(k+d) + kd))` +// *** +// *** - `O(kd + k + ik + ink + ind + ikd)` +// *** +// *** - `O(nk + nd)` +// ***/ +// void exp_fn_kmeans( +// int** vectors, +// const unsigned int num_vectors, +// unsigned int* labels, +// const unsigned int num_clusters, +// const unsigned int max_iter +// ) +// { +// // const size_t centroids_size = num_clusters * sizeof(double*); +// // const size_t centroid_size = EXP_NUM_DIMS * sizeof(double); +// // double** centroids = (double**)nmMalloc(centroids_size); +// // if (centroids == NULL) +// // { +// // fprintf(stderr, "exp_fn_kmeans() - nmMalloc(%u) failed.\n", centroids_size); +// // return; +// // } +// // for (int i = 0; i < num_clusters; i++) +// // { +// // double* centroid = centroids[i] = (double*)nmMalloc(centroid_size); +// // if (centroid == NULL) +// // { +// // fprintf(stderr, "exp_fn_kmeans() - nmMalloc(%u) failed.\n", centroid_size); +// // return; +// // } +// // memset(centroids[i], 0, centroid_size); +// // } +// double centroids[num_clusters][EXP_NUM_DIMS]; +// memset(centroids, 0, sizeof(centroids)); + +// /** Select random vectors to use as the initial centroids. **/ +// srand(time(NULL)); +// for (unsigned int i = 0u; i < num_clusters; i++) +// { +// // Pick a random vector. +// const unsigned int random_index = (unsigned int)rand() % num_vectors; + +// // Sparse copy the vector into a densely allocated centroid. +// double* centroid = centroids[i]; +// const int* vector = vectors[random_index]; +// for (unsigned int i = 0u, dim = 0u; dim < EXP_NUM_DIMS;) +// { +// const int token = vector[i++]; +// if (token > 0) centroid[dim++] = (double)token; +// else for (unsigned int j = 0u; j < -token; j++) centroid[dim++] = 0.0; +// } +// } + +// /** Allocate memory for new centroids. **/ +// double new_centroids[num_clusters][EXP_NUM_DIMS]; + +// /** Main exp_fn_kmeans loop. 
**/ +// double old_average_cluster_size = 1.0; +// unsigned int cluster_counts[num_clusters]; +// for (unsigned int iter = 0u; iter < max_iter; iter++) +// { +// bool changed = false; + +// /** Reset new centroids. **/ +// for (unsigned int i = 0u; i < num_clusters; i++) +// { +// cluster_counts[i] = 0u; +// for (unsigned int dim = 0; dim < EXP_NUM_DIMS; dim++) +// new_centroids[i][dim] = 0.0; +// } + +// /** Assign each point to the nearest centroid. **/ +// for (unsigned int i = 0u; i < num_vectors; i++) +// { +// const int* vector = vectors[i]; +// double min_dist = DBL_MAX; +// unsigned int best_centroid_label = 0u; + +// // Find nearest centroid. +// for (unsigned int j = 0u; j < num_clusters; j++) +// { +// const double dist = exp_fn_sparse_dif_c(vector, centroids[j]); +// if (dist < min_dist) +// { +// min_dist = dist; +// best_centroid_label = j; +// } +// } + +// /** Update label to new centroid, if necessary. **/ +// if (labels[i] != best_centroid_label) +// { +// labels[i] = best_centroid_label; +// changed = true; +// } + +// /** Accumulate values for new centroid calculation. **/ +// double* best_centroid = new_centroids[best_centroid_label]; +// for (unsigned int i = 0u, dim = 0u; dim < EXP_NUM_DIMS;) +// { +// const int val = vector[i++]; +// if (val < 0) dim += (unsigned)(-val); +// else best_centroid[dim++] += (double)val; +// } +// cluster_counts[best_centroid_label]++; +// } + +// /** Stop if centroids didn't change. **/ +// if (!changed) break; + +// /** Update centroids. **/ +// for (unsigned int i = 0u; i < num_clusters; i++) +// { +// if (cluster_counts[i] == 0u) continue; +// double* centroid = centroids[i]; +// const double* new_centroid = new_centroids[i]; +// const unsigned int cluster_count = cluster_counts[i]; +// for (unsigned int dim = 0u; dim < EXP_NUM_DIMS; dim++) +// centroid[dim] = new_centroid[dim] / cluster_count; +// } + +// /** Print cluster size for debugging. 
**/ +// const double average_cluster_size = exp_fn_get_cluster_size(vectors, num_vectors, labels, centroids, num_clusters); + +// /** Is there enough improvement? **/ +// const double improvement = old_average_cluster_size - average_cluster_size; +// if (improvement < KMEANS_IMPROVEMENT_THRESHOLD) break; +// old_average_cluster_size = average_cluster_size; +// } + +// // Free unused memory. +// // for (int i = 0; i < num_clusters; i++) { +// // nmFree(centroids[i], centroid_size); +// // } +// // nmFree(centroids, centroids_size); +// } + +// /** Duplocate information. **/ +// typedef struct +// { +// unsigned int id1; +// unsigned int id2; +// double similarity; +// } +// Dup, *pDup; + +// /*** Runs complete search to find duplocates if `num_vectors < MAX_COMPLETE_SEARCH` +// *** and runs a search using k-means clustering on larger amounts of data. +// *** +// *** @param vectors Array of precomputed frequency vectors for all dataset strings. +// *** @param num_vectors The number of vectors to be scanned. +// *** @param dupe_threshold The similarity threshold, below which dups are ignored. +// *** @returns The duplicates in pDup structs. +// ***/ +// pXArray lightning_search(int** vectors, const unsigned int num_vectors, const double dupe_threshold) +// { +// /** Allocate space for dups. **/ +// const size_t guess_size = num_vectors * 2u; +// pXArray dups = xaNew(guess_size); +// if (dups == NULL) +// { +// mssErrorf(1, "EXP", "lightning_search() - xaNew(%lu) failed.", guess_size); +// return NULL; +// } + +// /** Descide which algorithm to use. **/ +// if (num_vectors <= MAX_COMPLETE_SEARCH) +// { /** Do a complete search. **/ +// for (unsigned int i = 0u; i < num_vectors; i++) +// { +// const int* v1 = vectors[i]; +// for (unsigned int j = i + 1u; j < num_vectors; j++) +// { +// const int* v2 = vectors[j]; +// const double similarity = exp_fn_sparse_similarity(v1, v2); +// if (similarity > dupe_threshold) // Dup found! 
+// { +// Dup* dup = (Dup*)nmMalloc(sizeof(Dup)); +// if (dup == NULL) +// { +// mssErrorf(1, "EXP", "lightning_search() - nmMalloc(%lu) failed.", sizeof(Dup)); +// goto err_free_dups; +// } + +// dup->id1 = i; +// dup->id2 = j; +// dup->similarity = similarity; +// xaAddItem(dups, (void*)dup); +// } +// } +// } +// } +// else +// { /** Do a k-means search. **/ +// /** Define constants for the algorithm. **/ +// const unsigned int max_iter = 64u; /** Hardcode value because idk. **/ +// const unsigned int num_clusters = exp_fn_compute_k(num_vectors); + +// /** Allocate static memory for finding clusters. **/ +// unsigned int labels[num_vectors]; +// memset(labels, 0u, sizeof(labels)); + +// /** Execute kmeans clustering. **/ +// exp_fn_kmeans(vectors, num_vectors, labels, num_clusters, max_iter); + +// /** Find duplocates in clusters. **/ +// for (unsigned int i = 0u; i < num_vectors; i++) +// { +// const int* v1 = vectors[i]; +// const unsigned int label = labels[i]; +// for (unsigned int j = i + 1u; j < num_vectors; j++) +// { +// if (labels[j] != label) continue; +// const int* v2 = vectors[j]; +// const double similarity = exp_fn_sparse_similarity(v1, v2); +// if (similarity > dupe_threshold) /* Dup found! */ +// { +// Dup* dup = (Dup*)nmMalloc(sizeof(Dup)); +// if (dup == NULL) +// { +// mssErrorf(1, "EXP", +// "lightning_search() - nmMalloc(%lu) failed.", +// sizeof(Dup) +// ); +// goto err_free_dups; +// } + +// dup->id1 = i; +// dup->id2 = j; +// dup->similarity = similarity; +// xaAddItem(dups, (void*)dup); +// } +// } +// } +// } + +// /** Done **/ +// return dups; + +// /** Free dups. **/ +// err_free_dups:; +// const size_t num_dups = dups->nItems; +// for (unsigned int i = 0u; i < num_dups; i++) +// { +// nmFree(dups->Items[i], sizeof(Dup)); +// dups->Items[i] = NULL; +// } +// xaDeInit(dups); +// return NULL; +// } + +// /*** Computes Levenshtein distance between two strings. +// *** +// *** @param str1 The first string. 
+// *** @param str2 The second string. +// *** @param length1 The length of the first string. +// *** @param length1 The length of the first string. +// *** +// *** @attention - Tip: Pass 0 for the length of either string to infer it +// *** using the null terminating character. Thus, strings with no null +// *** terminator are supported if you pass explicit lengths. +// *** +// *** Complexity: O(length1 * length2). +// *** +// *** @see centrallix-sysdoc/string_comparison.md +// ***/ +// unsigned int exp_fn_edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length) +// { +// /*** lev_matrix: +// *** For all i and j, d[i][j] will hold the Levenshtein distance between +// *** the first i characters of s and the first j characters of t. +// *** +// *** As they say, no dynamic programming algorithm is complete without a +// *** matrix that you fill out and it has the answer in the final location. +// ***/ +// const size_t str1_len = (str1_length == 0u) ? strlen(str1) : str1_length; +// const size_t str2_len = (str2_length == 0u) ? strlen(str2) : str2_length; +// unsigned int lev_matrix[str1_len + 1][str2_len + 1]; + +// /*** Base case #0: +// *** Transforming an empty string into an empty string has 0 cost. +// ***/ +// lev_matrix[0][0] = 0u; + +// /*** Base case #1: +// *** Any source prefixe can be transformed into an empty string by +// *** dropping each character. +// ***/ +// for (unsigned int i = 1u; i <= str1_len; i++) +// lev_matrix[i][0] = i; + +// /*** Base case #2: +// *** Any target prefixes can be transformed into an empty string by +// *** inserting each character. +// ***/ +// for (unsigned int j = 1u; j <= str2_len; j++) +// lev_matrix[0][j] = j; + +// /** General Case **/ +// for (unsigned int i = 1u; i <= str1_len; i++) +// { +// for (unsigned int j = 1u; j <= str2_len; j++) +// { +// /** Equal characters need no changes. 
**/ +// if (str1[i - 1] == str2[j - 1]) +// lev_matrix[i][j] = lev_matrix[i - 1][j - 1]; + +// /*** We need to make a change, so use the opereration with the +// *** lowest cost out of delete, insert, replace, or swap. +// ***/ +// else +// { +// unsigned int cost_delete = lev_matrix[i - 1][j] + 1u; +// unsigned int cost_insert = lev_matrix[i][j - 1] + 1u; +// unsigned int cost_replace = lev_matrix[i-1][j-1] + 1u; + +// /** If a swap is possible, calculate the cost. **/ +// bool can_swap = ( +// i > 1 && j > 1 && +// str1[i - 1] == str2[j - 2] && +// str1[i - 2] == str2[j - 1] +// ); +// unsigned int cost_swap = (can_swap) ? lev_matrix[i - 2][j - 2] + 1 : UINT_MAX; + +// // Find the best operation. +// lev_matrix[i][j] = min(min(min(cost_delete, cost_insert), cost_replace), cost_swap); +// } +// } +// } + +// return lev_matrix[str1_len][str2_len]; +// } + +// /*** Runs complete search to find duplocates in phone numbers using the +// *** levenshtein min edit distance algorithm. +// *** +// *** @param dataset An array of characters for all dataset strings. +// *** @param dataset_size The number of phone numbers to be scanned. +// *** @param dupe_threshold The similarity threshold, below which dups are ignored. +// *** @returns The duplicates in pDup structs. +// ***/ +// pXArray phone_search(char dataset[][10u], const unsigned int dataset_size, const double dupe_threshold) +// { +// /** Allocate space for dups. **/ +// const size_t guess_size = dataset_size * 2u; +// pXArray dups = xaNew(guess_size); +// if (dups == NULL) +// { +// mssErrorf(1, "EXP", "phone_search() - xaNew(%lu) failed.", guess_size); +// return NULL; +// } + +// /** Search for dups using edit distance. 
**/ +// for (unsigned int i = 0u; i < dataset_size; i++) +// { +// const char* v1 = dataset[i]; +// for (unsigned int j = i + 1u; j < dataset_size; j++) +// { +// const char* v2 = dataset[j]; +// const unsigned int dist = exp_fn_edit_dist(v1, v2, 10u, 10u); +// const double similarity = (double)dist / 10.0; +// if (similarity > dupe_threshold) /* Dup found! */ +// { +// Dup* dup = (Dup*)nmMalloc(sizeof(Dup)); +// if (dup == NULL) +// { +// mssErrorf(1, "EXP", "phone_search() - nmMalloc(%lu) failed.", sizeof(Dup)); + +// /** Free data before returning. **/ +// const size_t num_dups = dups->nItems; +// for (unsigned int i = 0u; i < num_dups; i++) +// { +// void* dup = dups->Items[i]; +// nmFree(dup, sizeof(Dup)); +// } +// xaDeInit(dups); +// return NULL; +// } + +// dup->id1 = i; +// dup->id2 = j; +// dup->similarity = similarity; +// xaAddItem(dups, (void*)dup); +// } +// } +// } + +// return dups; +// } + +// /*** Usage: get_dups(, , ) +// *** data is assumed to contain only the following characters: +// *** (Data containing ` or control characters is undefined.) +// *** \n\v\f\r 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij +// *** klmnopqrstuvwxyz!"#$%&'()*+,-./:;<=>?@[\]^_{|}~ +// ***/ +// int exp_fn_get_dups_general(pExpression tree, pParamObjects objlist, pExpression maybe_dup_threshold, pExpression maybe_out_file_path, pExpression maybe_data, const char* fn_name, bool is_phone_numbers) +// { +// /** Check number of arguments. **/ +// if (!maybe_dup_threshold || !maybe_out_file_path || !maybe_data) +// { +// mssErrorf(1, "EXP", "%s(?) expects 3 parameters.", fn_name); +// return -1; +// } +// const int num_params = tree->Children.nItems; +// if (num_params != 3) +// { +// mssErrorf(1, "EXP", "%s(?) expects 3 parameter, got %d.", fn_name, num_params); +// return -1; +// } + +// /** Magic checks. 
**/ +// ASSERTMAGIC(tree, MGK_EXPRESSION); +// ASSERTMAGIC(maybe_dup_threshold, MGK_EXPRESSION); +// ASSERTMAGIC(maybe_out_file_path, MGK_EXPRESSION); +// ASSERTMAGIC(maybe_data, MGK_EXPRESSION); + +// /** Check object list. **/ +// if (!objlist) +// { +// mssErrorf(1, "EXP", "%s(\?\?\?) no object list?", fn_name); +// return -1; +// } +// ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); + +// /** Extract dup_threshold. **/ +// if (maybe_dup_threshold->Flags & EXPR_F_NULL) +// { +// mssErrorf(1, "EXP", "%s(NULL, ...) dup_threshold cannot be NULL.", fn_name); +// return -1; +// } +// if (maybe_dup_threshold->DataType != DATA_T_DOUBLE) +// { +// mssErrorf(1, "EXP", "%s(?, ...) dup_threshold must be a doube.", fn_name); +// return -1; +// } +// double dup_threshold = maybe_dup_threshold->Types.Double; +// if (isnan(dup_threshold)) +// { +// mssErrorf(1, "EXP", "%s(NAN, ...) dup_threshold cannot be NAN.", fn_name); +// return -1; +// } +// if (dup_threshold <= 0 || 1 <= dup_threshold) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, ...) dup_threshold must be between 0 and 1 (exclusive).", +// fn_name, dup_threshold +// ); +// return -1; +// } + +// /** Extract output file path. **/ +// if (maybe_out_file_path->Flags & EXPR_F_NULL) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, NULL, ...) out_file_path cannot be NULL.", +// fn_name, dup_threshold +// ); +// return -1; +// } +// if (maybe_out_file_path->DataType != DATA_T_STRING) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \?\?\?, ...) out_file_path should be a string.", +// fn_name, dup_threshold +// ); +// return -1; +// } +// char* out_file_path = maybe_out_file_path->String; +// if (out_file_path == NULL) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, nothing?, ...) 
expected string from out_file_path " +// "(of type DataType = DATA_T_STRING), but the String was NULL " +// "or did not exist!", +// fn_name, dup_threshold +// ); +// return -1; +// } +// size_t out_path_len = strlen(out_file_path); +// if (out_path_len == 0u) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", ...) out_file_path cannot be an empty string.", +// fn_name, dup_threshold, out_file_path +// ); +// return -1; +// } +// const size_t max_len = BUFSIZ - 48u; +// if (out_path_len >= max_len) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", ...) out_file_path length (%lu) > max length (%lu).", +// fn_name, dup_threshold, out_file_path, out_path_len, max_len +// ); +// return -1; +// } +// if (strncmp(out_file_path + (out_path_len - 4u), ".csv", 4u) != 0) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", ...) out_file_path must end in .csv, " +// "because the output file is a csv.", +// fn_name, dup_threshold, out_file_path +// ); +// return -1; +// } + +// /** Extract dataset string. **/ +// if (maybe_data->Flags & EXPR_F_NULL) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", NULL) data cannot be NULL.", +// fn_name, dup_threshold, out_file_path +// ); +// return -1; +// } +// if (maybe_data->DataType != DATA_T_STRING) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \?\?\?) data must be a string.", +// fn_name, dup_threshold, out_file_path +// ); +// return -1; +// } +// char* data = maybe_data->String; +// if (data == NULL) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \?\?\?) expected string from data " +// "(of type DataType = DATA_T_STRING), but the String " +// "was NULL or did not exist!", +// fn_name, dup_threshold, out_file_path +// ); +// return -1; +// } +// if (strlen(data) == 0u) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \"%s\") data cannot be an empty string.", +// fn_name, dup_threshold, out_file_path, data +// ); +// return -1; +// } + +// /** Check number of entries in the dataset. 
**/ +// size_t dataset_size = 1; +// for (char* buf = data; *buf != '\0'; buf++) +// if (*buf == SEPARATOR_CHAR) dataset_size++; + +// /** Verify dataset is reasonable size. **/ +// if (dataset_size == 1) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \"\?\?\?\") Expected data to contain multiple " +// "values separated by \""SEPARATOR"\", but data was: \"%s\"", +// fn_name, dup_threshold, out_file_path, data +// ); +// return -1; +// } + +// /** Parse strs out of the data into the dataset. **/ +// size_t count = 0u; +// char* token = strtok(data, SEPARATOR); +// char* dataset[dataset_size]; +// memset(dataset, 0, sizeof(dataset)); +// while (token && count < dataset_size) +// { +// char* new_token = strdup(token); +// if (new_token == NULL) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \"...\") Failed to copy token \"%s\" from data.", +// fn_name, dup_threshold, out_file_path, token +// ); +// goto err_free_dataset; +// } +// dataset[count++] = new_token; +// token = strtok(NULL, SEPARATOR); +// } + +// /** Allocate memory to store dups. **/ +// pXArray dups; + +// /** Handle phone numbers. **/ +// if (is_phone_numbers) +// { +// /*** Phone number strings are always 10 characters long. Thus, they +// *** are NOT NULL TERMINATED because we can assume the length. +// ***/ +// unsigned int num_phone_numbers = 0u; +// char phone_numbers[dataset_size][10u]; + +// /** Parse the dataset. **/ +// for (unsigned int i = 0u; i < dataset_size; i++) +// { +// char* maybe_phone_number = dataset[i]; + +// /** Verify length can be a valid phone number. **/ +// const size_t len = strlen(maybe_phone_number); +// if (len < 10u) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \"...\") \"Phone number\" (\"%s\") is too short. (skipped)", +// fn_name, dup_threshold, out_file_path, maybe_phone_number +// ); +// continue; +// } +// if (len > 18u) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \"...\") \"Phone number\" (\"%s\") is too long. 
(skipped)", +// fn_name, dup_threshold, out_file_path, maybe_phone_number +// ); +// continue; +// } + +// /** Parse phone number. **/ +// char buf[11u], cur_char = maybe_phone_number[0]; +// unsigned int j = ((cur_char == '+') ? 2u : +// ((cur_char == '1') ? 1u : 0u)); +// unsigned int number_len = 0u; +// while (cur_char != '\0' && number_len <= 10u) +// { +// cur_char = maybe_phone_number[j]; + +// if ( +// cur_char == '-' || +// cur_char == ' ' || +// cur_char == '(' || +// cur_char == ')' +// ) continue; +// else if (!isdigit(cur_char)) +// { +// /** Unknown character. **/ +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \"...\") \"Phone number\" (\"%s\") contains unexpected character '%c'. (skipped)", +// fn_name, dup_threshold, out_file_path, maybe_phone_number, cur_char +// ); +// goto next_phone_number; +// } + +// /** Add the character to the phone number. */ +// buf[number_len] = cur_char; +// number_len++; + +// /** Advance to next number. **/ +// j++; +// } + +// /** Check number of digits. **/ +// if (number_len < 10u) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \"...\") \"Phone number\" (\"%s\") has less than 10 digits. (skipped)", +// fn_name, dup_threshold, out_file_path, maybe_phone_number +// ); +// continue; +// } +// if (number_len > 10u) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \"...\") \"Phone number\" (\"%s\") has more than 10 digits. (skipped)", +// fn_name, dup_threshold, out_file_path, maybe_phone_number +// ); +// continue; +// } + +// /** Copy valid phone number (with no null-terminator). **/ +// memcpy(phone_numbers[num_phone_numbers++], buf, 10u); + +// next_phone_number:; +// } + +// /** Invoke phone number search to find dups in the processed data. **/ +// dups = phone_search(phone_numbers, num_phone_numbers, dup_threshold); +// } + +// /** Handle text. **/ +// else +// { +// /** Build vectors from the strs in the dataset. 
**/ +// const size_t vectors_size = dataset_size * sizeof(int*); +// int** vectors = (int**)nmMalloc(vectors_size); +// if (vectors == NULL) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \"...\") - nmMalloc(%lu) failed.", +// fn_name, dup_threshold, out_file_path, vectors_size +// ); +// goto err_free_dataset; +// } +// for (size_t i = 0; i < dataset_size; i++) +// { +// const int* vector = vectors[i] = build_vector(dataset[i]); +// if (vector == NULL) +// { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \"...\") - build_vector(%s) failed.", +// fn_name, dup_threshold, out_file_path, dataset[i] +// ); +// goto err_free_vectors; +// } +// if (vector[0] == -EXP_NUM_DIMS) { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \"...\") - build_vector(%s) produced no character pairs.", +// fn_name, dup_threshold, out_file_path, dataset[i] +// ); +// goto err_free_vectors; +// } +// } + +// /** Invoke lightning search to find dups using the vectors. **/ +// dups = lightning_search(vectors, dataset_size, dup_threshold); +// if (dups == NULL) { +// mssErrorf(1, "EXP", +// "%s(%lg, \"%s\", \"...\") - lightning_search() failed.", +// fn_name, dup_threshold, out_file_path +// ); +// goto err_free_vectors; +// } + +// /** Free unused memory. **/ +// for (size_t i = 0; i < dataset_size; i++) +// { +// nmSysFree(vectors[i]); +// vectors[i] = NULL; +// } +// nmFree(vectors, vectors_size); +// vectors = NULL; +// goto search_done; + +// /** Free vectors, if needed. **/ +// err_free_vectors: +// if (vectors != NULL) +// { +// for (size_t i = 0; i < dataset_size; i++) +// { +// if (vectors[i] == NULL) break; +// nmSysFree(vectors[i]); +// vectors[i] = NULL; +// } +// nmFree(vectors, vectors_size); +// vectors = NULL; +// } +// goto err_free_dataset; + +// search_done:; +// } + +// /** Check number of dups found. **/ +// const int num_dups = dups->nItems; + +// // Hack where we hardcode the path to the root directory because trying to +// // track it down is way too hard. 
+// const char root_path[] = "/usr/local/src/cx-git/centrallix-os"; + +// /** Create output file path. **/ +// char out_path[BUFSIZ]; +// snprintf(memset(out_path, 0, sizeof(out_path)), sizeof(out_path), "%s/%s", root_path, out_file_path); + +// /** Write output file. **/ +// FILE* file = fopen(out_path, "w"); +// if (file == NULL) +// { +// perror("Failed to open file."); +// mssErrorf(1, "EXP", +// "%s(%lg, \"...\", ...) failed to open file: %s", +// fn_name, dup_threshold, out_path +// ); +// goto err_free_dups; +// } +// const int setvbuf_ret = setvbuf(file, NULL, _IOFBF, (1000 * 1000)); +// if (setvbuf_ret != 0) +// { +// perror("Failed to set buffering on file."); +// mssErrorf(1, "EXP", +// "%s(%lg, \"...\", ...) failed to set buffering on file: %d, %s", +// fn_name, dup_threshold, setvbuf_ret, out_path +// ); +// goto err_close_file; +// } + +// /** Write CSV header. **/ +// fprintf(file, "id1,id2,sim\n"); + +// /*** If no data was written, make sure there is at least one row in the +// *** output file since assuming this file has data makes the sql faster. +// ***/ +// if (num_dups == 0u) +// fprintf(file, "error,undefined,0.0\n"); + +// /** Write CSV data rows. **/ +// else +// { +// for (unsigned int i = 0u; i < num_dups; i++) +// { +// Dup* data = (Dup*)dups->Items[i]; +// fprintf(file, "%s,%s,%.8lf\n", dataset[data->id1], dataset[data->id2], data->similarity); +// nmFree(data, sizeof(Dup)); /* Free unused data. */ +// dups->Items[i] = NULL; +// } +// } + +// /** Free unused data. **/ +// for (unsigned int i = 0u; i < dataset_size; i++) +// { +// free(dataset[i]); +// dataset[i] = NULL; +// } +// xaDeInit(dups); +// dups = NULL; + +// /** Close file. 
**/ +// const int fclose_ret = fclose(file); +// if (fclose_ret != 0) +// { +// perror("Failed to close file."); +// mssErrorf(1, "EXP", +// "%s(%lg, \"...\") failed to close file: %d, %s", +// fn_name, dup_threshold, fclose_ret, out_path +// ); +// goto err_free_dataset; +// } +// file = NULL; + +// /** Success. **/ +// tree->DataType = DATA_T_INTEGER; +// tree->Integer = (int)num_dups; +// return 0; + +// /** Error cases. **/ + +// /** Close file, if needed. **/ +// err_close_file: +// if (file != NULL) +// { +// const int fclose_ret = fclose(file); +// if (fclose_ret != 0) +// { +// char dbl_buf[DBL_BUF_SIZE]; +// snprintf(dbl_buf, sizeof(dbl_buf), "%lg", dup_threshold); +// perror("Failed to close file."); +// mssErrorf(1, "EXP", +// "%s(%s, \"...\") failed to close file: %d, %s", +// fn_name, dbl_buf, fclose_ret, out_path +// ); +// } +// } + +// /** Free dups, if needed. **/ +// err_free_dups: +// if (dups != NULL) +// { +// for (unsigned int i = 0u; i < num_dups; i++) +// { +// nmFree(dups->Items[i], sizeof(Dup)); +// dups->Items[i] = NULL; +// } +// xaDeInit(dups); +// dups = NULL; +// } + +// /** Free dataset, if needed. **/ +// err_free_dataset: +// for (unsigned int i = 0u; i < dataset_size; i++) +// { +// if (dataset[i] == NULL) break; +// free(dataset[i]); +// dataset[i] = NULL; +// } + +// return -1; +// } + +// int exp_fn_get_dups(pExpression tree, pParamObjects objlist, pExpression p1, pExpression p2, pExpression p3) +// { +// return exp_fn_get_dups_general(tree, objlist, p1, p2, p3, "get_dups", false); +// } + +// int exp_fn_get_dups_phone(pExpression tree, pParamObjects objlist, pExpression p1, pExpression p2, pExpression p3) +// { +// return exp_fn_get_dups_general(tree, objlist, p1, p2, p3, "get_dups_phone", true); +// } + +// /** Magic values. 
**/ +// #define EXP_NUM_FIELDS 7 +// #define EXP_INDEX_FIRST_NAME 0 +// #define EXP_INDEX_FIRST_NAME_METAPHONE 1 +// #define EXP_INDEX_LAST_NAME 2 +// #define EXP_INDEX_LAST_NAME_METAPHONE 3 +// #define EXP_INDEX_EMAIL 4 +// #define EXP_INDEX_PHONE 5 +// #define EXP_INDEX_ADDRESS 6 + +// /** No-op function. **/ +// int exp_fn_do_nothing() { return 0; } + +// /*** Function to add parameters to private storage so that more than 3 parameters can be passed. +// *** Currently, doubles are the only supported param type. +// *** +// *** Usage: param(, , ) : R, +// *** where: V : Double +// *** +// *** @param tree Return param_value. +// *** @param objlist Function scope. +// *** @param maybe_array The 1st param, should be NULL or another call to param(). +// *** @param maybe_param_name The 2nd param, should be a string for the name of the param. +// *** @param maybe_param_value The 3rd param, should be the param_value of the param being set. +// ***/ +// int exp_fn_param(pExpression tree, pParamObjects objlist, pExpression maybe_param_name, pExpression maybe_param_value, pExpression maybe_array) { +// // Verify arg number. +// if (!maybe_param_name || !maybe_param_value) +// { +// mssErrorf(1, "EXP", "param(?) expects two or three parameters."); +// return -1; +// } + +// // Magic checks. +// ASSERTMAGIC(tree, MGK_EXPRESSION); +// ASSERTMAGIC(maybe_param_name, MGK_EXPRESSION); +// ASSERTMAGIC(maybe_param_value, MGK_EXPRESSION); +// ASSERTMAGIC(maybe_array, MGK_EXPRESSION); + +// // Check object list. +// if (!objlist) +// { +// mssErrorf(1, "EXP", "param(\?\?\?) no object list?"); +// return -1; +// } +// ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); + +// // Extract param name. +// if (maybe_param_name->Flags & EXPR_F_NULL) +// { +// mssErrorf(1, "EXP", "param(NULL, ...) param_name cannot be null."); +// return -1; +// } +// if (maybe_param_name->DataType != DATA_T_STRING) +// { +// mssErrorf(1, "EXP", "param(?, ...) 
param_name must be a string."); +// return -1; +// } +// const char* param_name = maybe_param_name->String; + +// // Extract param value. +// if (maybe_param_value->Flags & EXPR_F_NULL) +// { +// mssErrorf(1, "EXP", "param(\"%s\", NULL, ...) param_value cannot be null.", param_name); +// return -1; +// } +// if (maybe_param_value->DataType != DATA_T_DOUBLE) +// { +// mssErrorf(1, "EXP", "param(\"%s\", ?, ...) param_value must be a doube.", param_name); +// return -1; +// } +// double param_value = maybe_param_value->Types.Double; + +// // Verify the value being set. +// // TODO: Replace with hashmap. +// signed int index = -1; +// if (strcmp(param_name, "first_name") == 0) index = EXP_INDEX_FIRST_NAME; +// else if (strcmp(param_name, "first_name_metaphone") == 0) index = EXP_INDEX_FIRST_NAME_METAPHONE; +// else if (strcmp(param_name, "last_name") == 0) index = EXP_INDEX_LAST_NAME; +// else if (strcmp(param_name, "last_name_metaphone") == 0) index = EXP_INDEX_LAST_NAME_METAPHONE; +// else if (strcmp(param_name, "email") == 0) index = EXP_INDEX_EMAIL; +// else if (strcmp(param_name, "phone") == 0) index = EXP_INDEX_PHONE; +// else if (strcmp(param_name, "address") == 0) index = EXP_INDEX_ADDRESS; +// if (index == -1) +// { +// mssErrorf(1, "EXP", +// "param(\"%s\", %lf, ...) invalid field name %s.", +// param_name, param_value, param_name +// ); +// return -1; +// } + +// // Extract array. +// double* array; +// if (!maybe_array || maybe_array->Flags & EXPR_F_NULL) +// { +// const size_t size = EXP_NUM_FIELDS * sizeof(double); +// void* PrivateData = tree->PrivateData = memset(nmSysMalloc(size), 0, size); +// tree->PrivateDataFinalize = exp_fn_do_nothing; // DON'T FREE MY DATA UNTIL I'M READY. 
+ +// array = (double*)PrivateData; +// for (unsigned int i = 0u; i < EXP_NUM_FIELDS; i++) array[i] = NAN; +// } +// else if ( +// maybe_array->DataType == DATA_T_ARRAY && +// maybe_array->PrivateData != NULL && +// !strcmp(maybe_array->Name, "param") +// ) +// { +// tree->PrivateData = maybe_array->PrivateData; +// tree->PrivateDataFinalize = exp_fn_do_nothing; // DON'T FREE MY DATA UNTIL I'M READY. +// array = (double*)maybe_array->PrivateData; +// } +// else +// { +// mssErrorf(1, "EXP", "param(\"%s\", %lf, ...) if provided, array must be from a call to param().", param_name, param_value); +// return -1; +// } + +// // Warn on previous data. +// double old_value = array[index]; +// if (!isnan(old_value)) +// { +// fprintf(stderr, +// "Warning: Overwriting field '%s'(@ index %d) with %lf (was %lf).\n", +// param_name, index, param_value, old_value +// ); +// } + +// // Set param_value. +// array[index] = param_value; + +// // Done +// tree->DataType = DATA_T_ARRAY; +// tree->Integer = 0; +// tree->Types.Double = 0.0; +// return 0; +// } + +// int exp_fn_get_sim(pExpression tree, pParamObjects objlist, pExpression maybe_fields, pExpression unused1, pExpression unused2) +// { +// if (!maybe_fields || unused1 || unused2) +// { +// mssErrorf(1, "EXP", "get_sim(param(...)) expects one parameter, from param()."); +// return -1; +// } + +// // Magic checks. +// ASSERTMAGIC(tree, MGK_EXPRESSION); +// ASSERTMAGIC(maybe_fields, MGK_EXPRESSION); + +// // Check object list. +// if (!objlist) +// { +// mssErrorf(1, "EXP", "get_sim(\?\?\?) no object list?"); +// return -1; +// } +// ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); + +// // Verify arg. +// if (maybe_fields->Flags & EXPR_F_NULL) +// { +// mssErrorf(1, "EXP", "get_sim(NULL) fields from param() cannot be NULL."); +// return -1; +// } +// if (maybe_fields->DataType != DATA_T_ARRAY || maybe_fields->PrivateData == NULL) +// { +// mssErrorf(1, "EXP", "get_sim(\?\?\?) 
expects arg 0 to be fields from a call to param()."); +// return -1; +// } + +// // Extract arg(s?). +// double* fields = (double*)maybe_fields->PrivateData; + +// const double first_name = fields[EXP_INDEX_FIRST_NAME]; +// if (isnan(first_name)) +// { +// mssErrorf(1, "EXP", "get_sim(...) first_name similarity not set."); +// return -1; +// } + +// const double first_name_metaphone = fields[EXP_INDEX_FIRST_NAME_METAPHONE]; +// if (isnan(first_name_metaphone)) +// { +// mssErrorf(1, "EXP", "get_sim(...) first_name_metaphone similarity not set."); +// return -1; +// } + +// const double last_name = fields[EXP_INDEX_LAST_NAME]; +// if (isnan(last_name)) +// { +// mssErrorf(1, "EXP", "get_sim(...) last_name similarity not set."); +// return -1; +// } + +// const double last_name_metaphone = fields[EXP_INDEX_LAST_NAME_METAPHONE]; +// if (isnan(last_name_metaphone)) +// { +// mssErrorf(1, "EXP", "get_sim(...) last_name_metaphone similarity not set."); +// return -1; +// } + +// const double email = fields[EXP_INDEX_EMAIL]; +// if (isnan(email)) +// { +// mssErrorf(1, "EXP", "get_sim(...) email similarity not set."); +// return -1; +// } + +// const double phone = fields[EXP_INDEX_PHONE]; +// if (isnan(phone)) +// { +// mssErrorf(1, "EXP", "get_sim(...) phone similarity not set."); +// return -1; +// } + +// const double address = fields[EXP_INDEX_ADDRESS]; +// if (isnan(address)) +// { +// mssErrorf(1, "EXP", "get_sim(...) address similarity not set."); +// return -1; +// } + +// char* primary; +// char* secondary; +// meta_double_metaphone("text", &primary, &secondary); +// printf("Primary: %s, secondary: %s\n", primary, secondary); + +// // Print args. 
+// printf( +// "Sims:\n" +// "\tfirst_name: %lf\n" +// "\tfirst_name_metaphone: %lf\n" +// "\tlast_name: %lf\n" +// "\tlast_name_metaphone: %lf\n" +// "\temail: %lf\n" +// "\tphone: %lf\n" +// "\taddress: %lf\n", +// first_name, +// first_name_metaphone, +// last_name, +// last_name_metaphone, +// email, +// phone, +// address +// ); + +// // Compute total. +// const double first_name_total = max(first_name * 1.0, first_name_metaphone * 0.9); +// const double last_name_total = max(last_name * 1.0, last_name_metaphone * 0.9); +// double total = (first_name_total * last_name_total) * 0.6 + email * 0.2 + address * 0.2; + +// // Clean up. +// nmSysFree(fields); + +// // Return total. +// tree->DataType = DATA_T_DOUBLE; +// tree->Types.Double = total; +// return 0; +// } + + +int exp_fn_double_metaphone(pExpression tree, pParamObjects objlist, pExpression maybe_str, pExpression u1, pExpression u2) + { + const char fn_name[] = "double_metaphone"; + + /** Check number of arguments. **/ + if (!maybe_str || u1 || u2) + { + mssErrorf(1, "EXP", "%s(?) expects 1 parameter.", fn_name); + return -1; + } + const int num_params = tree->Children.nItems; + if (num_params != 1) + { + mssErrorf(1, "EXP", "%s(?) expects 1 parameter, got %d.", fn_name, num_params); + return -1; + } + + /** Magic checks. **/ + ASSERTMAGIC(tree, MGK_EXPRESSION); + ASSERTMAGIC(maybe_str, MGK_EXPRESSION); + + /** Check object list. **/ + if (!objlist) + { + mssErrorf(1, "EXP", "%s(\?\?\?) no object list?", fn_name); + return -1; + } + ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); + + /** Extract str. **/ + if (maybe_str->Flags & EXPR_F_NULL) + { + mssErrorf(1, "EXP", "%s(NULL) str cannot be NULL.", fn_name); + return -1; + } + if (maybe_str->DataType != DATA_T_STRING) + { + mssErrorf(1, "EXP", "%s(\?\?\?) str should be a string.", fn_name); + return -1; + } + const char* str = maybe_str->String; + if (str == NULL) + { + mssErrorf(1, "EXP", + "%s(nothing?) 
expected string from str " + "(of type DataType = DATA_T_STRING), but the String " + "was NULL or did not exist!", + fn_name + ); + return -1; + } + const size_t str_len = strlen(str); + if (str_len == 0u) + { + mssErrorf(1, "EXP", "%s(\"\") str cannot be an empty string.", fn_name); + return -1; + } + + /** Compute DoubleMetaphone. **/ + char* primary; + char* secondary; + meta_double_metaphone( + str, + memset(&primary, 0, sizeof(primary)), + memset(&secondary, 0, sizeof(secondary)) + ); + + /** Process result. **/ + const size_t primary_length = strlen(primary); + const size_t secondary_length = strlen(secondary); + char* result = nmSysMalloc(primary_length + 1u + secondary_length + 1u); + sprintf(result, "%s%c%s", primary, CA_BOUNDARY_CHAR, secondary); + + /** Return the result. **/ + tree->String = result; + tree->DataType = DATA_T_STRING; + return 0; + } + +// // Clean up. +// #undef min +// #undef max + +// // END OF DUPE SECTION +// // =================== /* * exp_fn_argon2id @@ -4521,8 +6261,8 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "log10", (char*)exp_fn_log10); xhAdd(&EXP.Functions, "power", (char*)exp_fn_power); xhAdd(&EXP.Functions, "pbkdf2", (char*)exp_fn_pbkdf2); - xhAdd(&EXP.Functions, "levenshtein", (char*)exp_fn_levenshtein); - xhAdd(&EXP.Functions, "lev_compare", (char*)exp_fn_lev_compare); + xhAdd(&EXP.Functions, "levenshtein", (char*)exp_fn_levenshtein); /* Only used in its own tests. */ + xhAdd(&EXP.Functions, "lev_compare", (char*)exp_fn_lev_compare); /* Only used in its own tests. 
*/ xhAdd(&EXP.Functions, "cos_compare", (char*)exp_fn_cos_compare); xhAdd(&EXP.Functions, "to_base64", (char*)exp_fn_to_base64); xhAdd(&EXP.Functions, "from_base64", (char*)exp_fn_from_base64); @@ -4530,7 +6270,16 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "from_hex", (char*)exp_fn_from_hex); xhAdd(&EXP.Functions, "octet_length", (char*)exp_fn_octet_length); xhAdd(&EXP.Functions, "argon2id",(char*)exp_fn_argon2id); - + + /** Duplicate Detection **/ + // xhAdd(&EXP.Functions, "get_dups", (char*)exp_fn_get_dups); + // xhAdd(&EXP.Functions, "get_dups_phone", (char*)exp_fn_get_dups_phone); + // xhAdd(&EXP.Functions, "no_op", (char*)exp_fn_do_nothing); + // xhAdd(&EXP.Functions, "do_nothing", (char*)exp_fn_do_nothing); + // xhAdd(&EXP.Functions, "param", (char*)exp_fn_param); + // xhAdd(&EXP.Functions, "total_sim", (char*)exp_fn_get_sim); + xhAdd(&EXP.Functions, "double_metaphone", (char*)exp_fn_double_metaphone); + /** Windowing **/ xhAdd(&EXP.Functions, "row_number", (char*)exp_fn_row_number); xhAdd(&EXP.Functions, "dense_rank", (char*)exp_fn_dense_rank); diff --git a/centrallix/include/cxss/policy.h b/centrallix/include/cxss/policy.h index aeee11ce8..6f9ca7d83 100644 --- a/centrallix/include/cxss/policy.h +++ b/centrallix/include/cxss/policy.h @@ -2,6 +2,7 @@ #define _CXSS_POLICY_H #include "cxss/cxss.h" +#include "obj.h" /************************************************************************/ /* Centrallix Application Server System */ @@ -89,4 +90,3 @@ typedef struct _CXSSPOL CxssPolicy, *pCxssPolicy; #endif /* defined _CXSS_POLICY_H */ - diff --git a/centrallix/include/expression.h b/centrallix/include/expression.h index 8d506f72e..3b334606b 100644 --- a/centrallix/include/expression.h +++ b/centrallix/include/expression.h @@ -307,6 +307,7 @@ int exp_internal_SetupControl(pExpression exp); pExpControl exp_internal_LinkControl(pExpControl ctl); int exp_internal_UnlinkControl(pExpControl ctl); +void meta_double_metaphone(const char* str, char** 
primary_code, char** secondary_code); /*** Evaluator functions ***/ int expEvalIsNull(pExpression tree, pParamObjects objlist); diff --git a/centrallix/include/stparse.h b/centrallix/include/stparse.h index 50d9e2c20..fad7f9604 100644 --- a/centrallix/include/stparse.h +++ b/centrallix/include/stparse.h @@ -46,7 +46,7 @@ typedef struct _SI int Magic; int LinkCnt; char* Name; /* name of attrib or group */ - char* UsrType; /* type of group, null if attrib */ + char* UsrType; /* type of group (e.g. "system/object"), null if attrib */ pExpression Value; /* value; EXPR_N_LIST if several listed */ struct _SI* Parent; /* Parent inf, null if toplevel */ struct _SI** SubInf; /* List of attrs/groups included */ diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c new file mode 100644 index 000000000..9ffbd1d22 --- /dev/null +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -0,0 +1,3345 @@ + +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Core */ +/* */ +/* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". */ +/* */ +/* Module: objdrv_cluster.c */ +/* Author: Israel Fuller */ +/* Creation: September 17, 2025 */ +/* Description: Cluster object driver. */ +/************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxlib/clusters.h" +#include "cxlib/mtask.h" +#include "cxlib/mtsession.h" +#include "cxlib/newmalloc.h" +#include "cxlib/util.h" +#include "cxlib/xarray.h" +#include "cxlib/xhash.h" +#include "expression.h" +#include "hints.h" +#include "obj.h" +#include "param.h" +#include "st_node.h" +#include "stparse.h" + +/*** File notes: + *** This file uses comment anchors, provided by the Comment Anchors VSCode + *** extension from Starlane Studios. This allows developers with the extension + *** to control click the "LINK " comments to navigate to the coresponding + *** "ANCHOR[id=]" comment. (Note: Invalid or broken links will default to + *** the first line of the file.) + *** + *** For example, this link should take you to the function signatures: + *** LINK #functions + *** + *** Any developers without this extension can safely ignore these comments, + *** although please try not to break them. :) + *** + *** Comment Anchors VSCode Extension: + *** https://marketplace.visualstudio.com/items?itemName=ExodiusStudios.comment-anchors + ***/ + + +/** Debugging **/ +// void void_func() {} +// #define tprintf void_func +#define tprintf printf + +/** Defaults for unspecified optional attributes. 
**/ +#define DEFAULT_MIN_IMPROVEMENT 0.0001 +#define DEFAULT_MAX_ITERATIONS 64u + +/** ================ Stuff That Should Be Somewhere Else ================ **/ +/** ANCHOR[id=temp] **/ + +#define INT_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c" +#define INT_TO_BINARY(int_val) \ + ((int_val) & 0b10000000000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b01000000000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00100000000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00010000000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00001000000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000100000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000010000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000001000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000100000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000010000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000001000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000100000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000010000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000001000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000100000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000010000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000001000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000100000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000010000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000001000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000100000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000010000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000001000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000100000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000010000000 ? 
'1' : '0'), \ + ((int_val) & 0b00000000000000000000000001000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000000100000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000000010000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000000001000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000000000100 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000000000010 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000000000001 ? '1' : '0') + + +/** TODO: I think this should be moved to mtsession. **/ +/*** I caused at least 10 bugs so far trying to pass format specifiers to + *** mssError without realizing that it didn't support them. Eventually, I + *** got fed up enough with the whole thing to write the following function. + ***/ +/*** Displays error text to the user. Does not print a stack trace. Does not + *** exit the program, allowing for the calling function to fail, generating + *** an error cascade which may be useful to the user since a stack trace is + *** not readily available. + *** + *** @todo I think this should be moved to somewhere else. + *** + *** @param clr Whether to clear the current error stack. As a rule of thumb, + *** if you are the first one to detec the error, clear the stack so that + *** other unrelated messages are not shown. If you are detecting an error + *** from another function that may also call an mssError() function, do + *** not clear the stack. + *** @param module The name or abbreviation of the module in which this + *** function is being called, to help developers narrow down the location + *** of the error. + *** @param format The format text for the error, which accepts any format + *** specifier that would be accepted by printf(). + *** @param ... Variables matching format specifiers in the format. + *** @returns Nothing, always succeeds. + ***/ +void mssErrorf(int clr, char* module, const char* format, ...) + { + /** Prevent interlacing with stdout flushing at a weird time. 
**/ + check(fflush(stdout)); + + /** Insert convenient newline before error stack begins. **/ + if (clr == 1) fprintf(stderr, "\n"); + + /** Process the format with all the same rules as printf(). **/ + char buf[BUFSIZ]; + va_list args; + va_start(args, format); + const int num_chars = vsnprintf(buf, sizeof(buf), format, args); + va_end(args); + + /** Error check vsnprintf, just to be safe. **/ + if (num_chars < 0) + { + perror("vsnprintf() failed"); + fprintf(stderr, "FAIL: mssErrorf(%d, \"%s\", \"%s\", ...)\n", clr, module, format); + return; + } + if (num_chars > BUFSIZ) + fprintf(stderr, "WARNING: Error truncated (length %d > buffer size %d).\n", num_chars, BUFSIZ); + + /** Print the error. **/ + const int ret = mssError(clr, module, "%s", buf); + + /** Not sure why you have to error check the error function... **/ + if (ret != 0) fprintf(stderr, "FAIL %d: mssError(%d, \"%s\", \"%%s\", \"%s\")\n", ret, clr, module, buf); + } + + +/** TODO: I think this should be moved to datatypes. **/ +/** Should maybe replace current type parsing in the presentation hints. **/ +int ci_TypeFromStr(const char* str) + { + if (str == NULL) return -1; + + /** Check string length. **/ + const size_t len = strlen(str); + if (len < 3 || 13 < len) return -1; + + /** Copy str to enable mutability. **/ + char buf[len + 1u]; + strcpy(buf, str); + + /** First character is case insensitive. **/ + buf[0] = toupper(buf[0]); + + /** Check type. 
**/ + if (strcmp(buf, "Any") == 0) return DATA_T_UNAVAILABLE; + if (strcmp(buf, "Integer") == 0) return DATA_T_INTEGER; + if (strcmp(buf, "String") == 0) return DATA_T_STRING; + if (strcmp(buf, "Double") == 0) return DATA_T_DOUBLE; + if (strcmp(buf, "DateTime") == 0) return DATA_T_DATETIME; + if (strcmp(buf, "IntVecor") == 0) return DATA_T_INTVEC; + if (strcmp(buf, "StringVector") == 0) return DATA_T_STRINGVEC; + if (strcmp(buf, "Money") == 0) return DATA_T_MONEY; + if (strcmp(buf, "Array") == 0) return DATA_T_ARRAY; + if (strcmp(buf, "Code") == 0) return DATA_T_CODE; + if (strcmp(buf, "Binary") == 0) return DATA_T_BINARY; + + /** Invalid type. **/ + return -1; + } + +/** TODO: I think this should be moved to datatypes. **/ +/** Should maybe replace duplocate functionality elsewhere. **/ +char* ci_TypeToStr(const int type) + { + switch (type) + { + case DATA_T_UNAVAILABLE: return "Unknown"; + case DATA_T_INTEGER: return "Integer"; + case DATA_T_STRING: return "String"; + case DATA_T_DOUBLE: return "Double"; + case DATA_T_DATETIME: return "DateTime"; + case DATA_T_INTVEC: return "IntVecor"; + case DATA_T_STRINGVEC: return "StringVector"; + case DATA_T_MONEY: return "Money"; + case DATA_T_ARRAY: return "Array"; + case DATA_T_CODE: return "Code"; + case DATA_T_BINARY: return "Binary"; + } + + /** Invalid type. **/ + mssErrorf(1, "Cluster", "Invalid type %d.\n", type); + return "Invalid"; + } + +/** TODO: I think this should be moved to xarray. **/ +/** Contract: Return value is null iff pXArray has 0 items. **/ +void** ci_xaToTrimmedArray(pXArray arr) + { + if (arr->nItems == 0) { + mssErrorf(1, "Cluster", "Failed to trim XArray of length 0."); + return NULL; + } + + const size_t arr_size = arr->nItems * sizeof(void*); + void** result = check_ptr(nmMalloc(arr_size)); + memcpy(result, arr->Items, arr_size); + return result; + } + +/** ================ Enum Declairations ================ **/ +/** ANCHOR[id=enums] **/ + +/** Enum representing a clustering algorithm. 
**/ +typedef unsigned char ClusterAlgorithm; +#define ALGORITHM_NULL (ClusterAlgorithm)0u +#define ALGORITHM_NONE (ClusterAlgorithm)1u +#define ALGORITHM_SLIDING_WINDOW (ClusterAlgorithm)2u +#define ALGORITHM_KMEANS (ClusterAlgorithm)3u +#define ALGORITHM_KMEANS_PLUS_PLUS (ClusterAlgorithm)4u +#define ALGORITHM_KMEDOIDS (ClusterAlgorithm)5u +#define ALGORITHM_DB_SCAN (ClusterAlgorithm)6u + +/** Converts a clustering algorithm to its string name. **/ +char* ci_ClusteringAlgorithmToString(ClusterAlgorithm clustering_algorithm) + { + switch (clustering_algorithm) + { + case ALGORITHM_NULL: return "NULL algorithm"; + case ALGORITHM_NONE: return "none"; + case ALGORITHM_SLIDING_WINDOW: return "sliding-window"; + case ALGORITHM_KMEANS: return "k-means"; + case ALGORITHM_KMEANS_PLUS_PLUS: return "k-means++"; + case ALGORITHM_KMEDOIDS: return "k-medoids"; + case ALGORITHM_DB_SCAN: return "db-scan"; + default: return "Unknown algorithm"; + } + } + +/** Enum representing a similarity measurement algorithm. **/ +typedef unsigned char SimilarityMeasure; +#define SIMILARITY_NULL (SimilarityMeasure)0u +#define SIMILARITY_COSINE (SimilarityMeasure)1u +#define SIMILARITY_LEVENSHTEIN (SimilarityMeasure)2u + +/** Converts a similarity measure to its string name. **/ +char* ci_SimilarityMeasureToString(SimilarityMeasure similarity_measure) + { + switch (similarity_measure) + { + case SIMILARITY_NULL: return "NULL similarity measure"; + case SIMILARITY_COSINE: return "cosine"; + case SIMILARITY_LEVENSHTEIN: return "levenshtein"; + default: return "Unknown similarity measure"; + } + } + +/*** Enum representing the type of data targetted by the driver, + *** set based on the path given when the driver is used to open + *** a cluster file. + *** + *** `0u` is reserved for a possible `NULL` value in the future. + *** However, there is currently no allowed `NULL` TargetType. 
+ ***/ +typedef unsigned char TargetType; +#define TARGET_ROOT (TargetType)1u +#define TARGET_CLUSTER (TargetType)2u +#define TARGET_SEARCH (TargetType)3u +#define TARGET_CLUSTER_ENTRY (TargetType)4u +#define TARGET_SEARCH_ENTRY (TargetType)5u + +/** Attribute name lists by TargetType. **/ +#define nATTR_ROOT 2u +char* const ATTR_ROOT[nATTR_ROOT] = { + "source", + "attr_name", +}; +#define nATTR_CLUSTER 5u +char* const ATTR_CLUSTER[nATTR_CLUSTER] = { + "algorithm", + "similarity_measure", + "num_clusters", + "min_improvement", + "max_iterations", +}; +#define nATTR_SEARCH 4u +char* const ATTR_SEARCH[nATTR_SEARCH] = { + "source", + "threshold", + "similarity_measure", +}; +#define nATTR_CLUSTER_ENTRY 2u +char* const ATTR_CLUSTER_ENTRY[nATTR_CLUSTER_ENTRY] = { + "val", + "sim", +}; +#define nATTR_SEARCH_ENTRY 3u +char* const ATTR_SEARCH_ENTRY[nATTR_SEARCH_ENTRY] = { + "val1", + "val2", + "sim", +}; +#define END_OF_ATTRIBUTES NULL + + +/** Method name list. **/ +#define nMETHOD_NAME 2u +char* const METHOD_NAME[nMETHOD_NAME] = { + "cache", +}; +#define END_OF_METHODS END_OF_ATTRIBUTES + + +/** ================ Struct Declarations ================ **/ +/** ANCHOR[id=structs] **/ + +/** Represents the data source which may have data already fetched. **/ +typedef struct _SOURCE + { + /** Top level attributes (specified in the .cluster file). **/ + char* Name; /* The node name, specified in the .cluster file. + * Warning: Some code makes the assumption that this + * is the first field in the struct. + */ + char* Key; /* The key associated with this object in the global SourceCache. */ + char* SourcePath; /* The path to the data source from which to retrieve data. */ + char* AttrName; /* The name of the attribute to get from the data source. */ + + /** Computed data. **/ + char** Data; /* The data strings to be clustered and searched, or NULL if they + * have not been fetched from the source. 
+ */ + pVector* Vectors; /* The cosine comparison vectors from the fetched data, or NULL if + * they haven't been computed. Note that vectors are no longer + * needed once all clusters and searches have been computed, so + * they are automatically freed in that case to save memory. + */ + unsigned int nVectors; /* The number of vectors and data strings. Note: This is not + * set to 0 if the vector array is freed, this case should be + * checked separately. + */ + } SourceData, *pSourceData; + +/** Data for each cluster. **/ +typedef struct _CLUSTER + { + /** Attribute Data. **/ + char* Name; /* The cluster name, specified in the .cluster file. + * Warning: Some code makes the assumption that this + * is the first field in the struct. + */ + char* Key; /* The key associated with this object in the global ClusterCache. */ + ClusterAlgorithm ClusterAlgorithm; /* The clustering algorithm to be used. */ + SimilarityMeasure SimilarityMeasure; /* The similarity measurse to be used when clustering. */ + unsigned int NumClusters; /* The number of clusters. 1 if algorithm = none. */ + double MinImprovement; /* The minimum amount of improvement that must be met each + * clustering iteration. If there is less improvement, the + * algorithm will stop. Specifying "max" in the .cluster + * file should be represented by a value of -inf. + */ + unsigned int MaxIterations; /* The maximum number of iterations to run clustering. */ + + /** Other data (ignored by caching). **/ + unsigned int nSubClusters; /* The number of subclusters of this cluster. */ + struct _CLUSTER** SubClusters; /* A pClusterData array, NULL if nSubClusters == 0. */ + struct _CLUSTER* Parent; /* This cluster's parent. NULL if it is not a subcluster. */ + pSourceData SourceData; /* Pointer to the source data that this cluster uses. */ + + /** Computed data. **/ + unsigned int* Labels; /* An array with one element for each vector in the data + * (aka. DriverData->nVectors). 
For vector i, Labels[i] is + * the ID of the cluster to which that data is assigned. + * NULL if the cluster has not been computed. */ + } + ClusterData, *pClusterData; + +/** Data for each search. **/ +typedef struct _SEARCH + { + char* Name; /* The search name, specified in the .cluster file. + * Warning: Some code makes the assumption that this + * is the first field in the struct. + */ + char* Key; /* The key associated with this object in the global SearchCache. */ + pClusterData Source; /* The cluster from which this search is to be derived. */ + double Threshold; /* The minimum similarity threshold for elements to be + * included in the results of the search. + */ + SimilarityMeasure SimilarityMeasure; /* The similarity measure used to compare items. */ + + /** Computed data. **/ + pDup* Dups; /* An array holding the dups found by the search, or NULL + * if the search has not been computed. + */ + unsigned int nDups; /* The number of dups found. */ + } + SearchData, *pSearchData; + +/*** Node instance data. + *** When a .cluster file is openned, there will be only one node for that + *** file. However, in the course of the query, many driver instance structs + *** may be created by functions like clusterQueryFetch(), and closed by the + *** object system using clusterClose(). + ***/ +typedef struct _NODE + { + /** Substructures. **/ + pSourceData SourceData; /* Data from the provided source. */ + pParam* Params; /* A pParam array storing the params in the .cluster file. */ + unsigned int nParams; /* The number of specified params. */ + pParamObjects ParamList; /* Functions as a "scope" for resolving values during parsing. */ + pClusterData* Clusters; /* A pCluster array storing the clusters in the .cluster file. + * Will be NULL if nClusters = 0. + */ + unsigned int nClusters; /* The number of specified clusters. */ + pSearchData* Searches; /* A SearchData array storing the searches in the .cluster file. 
*/ + unsigned int nSearches; /* The number of specified searches. */ + + /** Other stuff, idk why it's here. **/ + pSnNode Node; + pObject Obj; + char* CreateDateField; + char* ModifyDateField; + } + NodeData, *pNodeData; + +/** Driver instance data. **/ +/*** Similar to a pointer to specific, computed data in the pNodeData struct. + *** If target type is the root, a cluster, or a search, no data is guarnteed + *** to be computed yet. These three types can be returned from clusterOpen(). + *** To target a cluster entry or search entry, fetch a driver targetting a + *** cluster or search (respectively). These target types ensure that the data + *** has been computed, so the GetAttr functions do not need to ensure this. + ***/ +typedef struct _DRIVER + { + pNodeData NodeData; /* The associated node data. */ + TargetType TargetType; /* The type of data targetted by this driver instance. */ + void* TargetData; /* A pointer to the specific targetted cluster or search. */ + unsigned int TargetIndex; /* An index into the cluster or search (entries only). */ + unsigned char TargetAttrIndex; /* An index into an attribute list (for GetNextAttr()). */ + unsigned char TargetMethodIndex; /* An index into an method list (for GetNextMethod()). */ + } + DriverData, *pDriverData; + +/** Query instance data. **/ +typedef struct + { + pDriverData DriverData; /* The associated driver instance being queried. */ + unsigned int RowIndex; /* The selected row of the data targetted by the driver. */ + } + ClusterQuery, *pClusterQuery; + +/** Global storage for caches. **/ +struct + { + XHashTable SourceCache; + XHashTable ClusterCache; + XHashTable SearchCache; + } + ClusterCaches; + + +/** ================ Function Declarations ================ **/ +/** ANCHOR[id=functions] **/ + +/** Note: ci stands for "cluster_internal". **/ + +/** Parsing Functions. 
**/ +// LINK #parsing +int ci_ParseAttribute(pStructInf inf, char* attr_name, int datatype, pObjData data, pParamObjects param_list, bool required, bool print_type_error); +ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf cluster_inf, pParamObjects param_list); +SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf cluster_inf, pParamObjects param_list); +pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* path); +pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data); +pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data); +pNodeData ci_ParseNodeData(pStructInf inf, pObject obj); + +/** Freeing Functions. **/ +// LINK #freeing +void ci_FreeSourceData(pSourceData source_data); +void ci_FreeClusterData(pClusterData cluster_data, bool recursive); +void ci_FreeSearchData(pSearchData search_data); +void ci_FreeNodeData(pNodeData node_data); + +/** Deep Size Computation Functions. **/ +// LINK #sizing +unsigned int ci_SizeOfSourceData(pSourceData source_data); +unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursive); +unsigned int ci_SizeOfSearchData(pSearchData search_data); +unsigned int ci_SizeOfNodeData(pNodeData node_data); + +/** Cache Invalidation Functions. **/ +// LINK #invalidation +void ci_CacheFreeSourceData(pXHashEntry entry, void* _); +void ci_CacheFreeCluster(pXHashEntry entry, void* _); +void ci_CacheFreeSearch(pXHashEntry entry, void* _); + +/** Computation Functions. (Ensure data is computed.) **/ +// LINK #computation +int ci_ComputeSourceData(pSourceData source_data, pObjSession session); +int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data); +int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data); + +/** Parameter Functions. 
**/ +// LINK #params +int ci_GetParamType(void* inf_v, const char* attr_name); +int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val); +int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val); + +/** Driver Functions. **/ +// LINK #driver +void* clusterOpen(pObject obj, int mask, pContentType systype, char* usr_type, pObjTrxTree* oxt); +int clusterClose(void* inf_v, pObjTrxTree* oxt); +void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt); +void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt); +int clusterQueryClose(void* qy_v, pObjTrxTree* oxt); +int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt); +int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); +char* clusterGetFirstAttr(void* inf_v, pObjTrxTree oxt); +char* clusterGetNextAttr(void* inf_v, pObjTrxTree oxt); +int clusterInfo(void* inf_v, pObjectInfo info); + +/** Method Execution Functions. **/ +// LINK #method +char* clusterGetFirstMethod(void* inf_v, pObjTrxTree oxt); +char* clusterGetNextMethod(void* inf_v, pObjTrxTree oxt); +int clusterExecuteMethod(void* inf_v, char* methodname, pObjData param, pObjTrxTree oxt); + +/** Unimplemented DriverFunctions. 
 **/
// LINK #unimplemented
int clusterCreate(pObject obj, int mask, pContentType systype, char* usrtype, pObjTrxTree* oxt);
int clusterDeleteObj(void* inf_v, pObjTrxTree* oxt);
int clusterDelete(pObject obj, pObjTrxTree* oxt);
int clusterRead(void* inf_v, char* buffer, int maxcnt, int offset, int flags, pObjTrxTree* oxt);
int clusterWrite(void* inf_v, char* buffer, int cnt, int offset, int flags, pObjTrxTree* oxt);
int clusterSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree oxt);
int clusterAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree oxt);
void* clusterOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree oxt);
int clusterCommit(void* inf_v, pObjTrxTree *oxt);
pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt);

/** ================ Parsing Functions ================ **/
/** ANCHOR[id=parsing] **/
// LINK #functions

/*** Evaluates the named attribute of a structure-file group and stores its
 *** value into *data.
 ***
 *** Returns 0 for success and -1 on failure. Promises that mssError() will be
 *** invoked on failure, so the caller need not specify their own error message.
 *** Returns 1 if the attribute is NOT available (i.e. not present in the
 *** group), printing an error if the attribute was marked as required.
 ***
 *** @attention - Promises that mssError() will be invoked on failure, so the
 ***     caller is not required to specify their own error message.
 ***
 *** @param inf The parsed group to look the attribute up in.
 *** @param attr_name The attribute to evaluate.
 *** @param datatype The DATA_T_* type the value is expected to have; a
 ***     mismatch with the evaluated expression's type is an error.
 *** @param data Out-parameter receiving the evaluated value.
 *** @param param_list The param objects that function as a "scope" for
 ***     evaluating parameter variables in the structure file.
 *** @param required Whether absence of the attribute should be reported as an
 ***     error (the return value is 1 either way).
 *** @param print_type_error NOTE(review): currently unused in the active code
 ***     path; it is only referenced by the commented-out
 ***     stGetAttrValueOSML() implementation below.
 ***
 *** TODO: Greg
 *** This function took several hours of debugging before it worked at all, and I
 *** still don't know if it works correctly... or really how it works. Please
 *** review this code carefully!
 ***/
int ci_ParseAttribute(
    pStructInf inf,
    char* attr_name,
    int datatype,
    pObjData data,
    pParamObjects param_list,
    bool required,
    bool print_type_error)
    {
    int ret;

    /** Look up the attribute; absence is the "1" return, not a hard error. **/
    pStructInf attr_info = stLookup(inf, attr_name);
    if (attr_info == NULL)
	{
	if (required) mssErrorf(1, "Cluster", "'%s' must be specified for clustering.", attr_name);
	return 1;
	}
    ASSERTMAGIC(attr_info, MGK_STRUCTINF);

    /** Get the attribute by binding and evaluating its expression.
     ** NOTE(review): exp appears to be owned by attr_info (it is never freed
     ** here) — confirm stGetExpression() does not transfer ownership.
     **/
    tprintf("Invoking ci_ParseAttribute('%s')...\n", attr_name);
    pExpression exp = check_ptr(stGetExpression(attr_info, 0));
    expBindExpression(exp, param_list, EXPR_F_RUNSERVER);
    ret = expEvalTree(exp, param_list);
    if (ret != 0)
	{
	mssErrorf(0, "Cluster", "Expression evaluation failed.");
	goto err;
	}

    /** Check for data type mismatch. **/
    if (datatype != exp->DataType)
	{
	mssErrorf(1, "Cluster",
	    "Expected \"%s\" : %s, but got type %s.",
	    attr_name, ci_TypeToStr(datatype), ci_TypeToStr(exp->DataType)
	);
	goto err;
	}

    /** Get the data out of the expression. **/
    ret = expExpressionToPod(exp, datatype, data);
    if (ret != 0)
	{
	mssErrorf(1, "Cluster",
	    "Failed to get data of type \"%s\" from exp \"%s\" (error code %d).",
	    ci_TypeToStr(datatype), exp->Name, ret
	);
	goto err;
	}

    /** Earlier implementation, kept for Greg's review (see TODO above). **/
//    const int ret = stGetAttrValueOSML(
//        attr_info,
//        datatype,
//        data,
//        0,
//        param_list->Session,
//        param_list
//    );
//    if (ret == 1)
//    {
//        mssErrorf(1, "Cluster",
//            "stGetAttrValueOSML('%s') because %s cannot be null.\n"
//            "  > Hint: You might have used an undefined variable or forgot to add runserver().",
//            attr_name, attr_name
//        );
//        return 1;
//    }
//    if (ret != 0)
//    {
//        if (print_type_error)
//        {
//            mssErrorf(1, "Cluster",
//                "stGetAttrValueOSML('%s') failed (error code %d).\n"
//                "  > Hint: It might be a type mismatch, or you used an undefined variable.",
//                attr_name, ret
//            );
//        }
//        return ret;
//    }

    return 0;

    err:
    mssErrorf(0, "Cluster",
	"Failed to parse attribute \"%s\" from group \"%s\"",
	attr_name, inf->Name
    );
    return -1;
    }


/*** Parses a ClusteringAlgorithm from the algorithm field in the pStructInf
 ***
representing some structure with that attribute in a parsed structure file. + *** + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param inf A parsed pStructInf. + *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @returns The data algorithm, or ALGORITHM_NULL on failure. + ***/ +ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf inf, pParamObjects param_list) + { + /** Get the algorithm attribute. **/ + char* algorithm; + if (ci_ParseAttribute(inf, "algorithm", DATA_T_STRING, POD(&algorithm), param_list, true, true) != 0) + { + mssErrorf(0, "Cluster", "Failed to parse attribute 'algorithm' in group \"%s\".", inf->Name); + return ALGORITHM_NULL; + } + + /** Parse known clustering algorithms. **/ + if (!strcasecmp(algorithm, "none")) return ALGORITHM_NONE; + if (!strcasecmp(algorithm, "sliding-window")) return ALGORITHM_SLIDING_WINDOW; + if (!strcasecmp(algorithm, "k-means")) return ALGORITHM_KMEANS; + if (!strcasecmp(algorithm, "k-means++")) return ALGORITHM_KMEANS_PLUS_PLUS; + if (!strcasecmp(algorithm, "k-medoids")) return ALGORITHM_KMEDOIDS; + if (!strcasecmp(algorithm, "db-scan")) return ALGORITHM_DB_SCAN; + + /** Unknown value for clustering algorithm. **/ + mssErrorf(1, "Cluster", "Unknown \"clustering algorithm\": %s", algorithm); + return ALGORITHM_NULL; + } + + +/*** Parses a SimilarityMeasure from the similarity_measure field in the given + *** pStructInf parameter, which represents some structure with that attribute + *** in a parsed structure file. + *** + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param inf A parsed pStructInf. 
+ *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @returns The similarity measure, or SIMILARITY_NULL on failure. + ***/ +SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf inf, pParamObjects param_list) + { + /** Get the similarity_measure attribute. **/ + char* measure; + if (ci_ParseAttribute(inf, "similarity_measure", DATA_T_STRING, POD(&measure), param_list, true, true) != 0) + { + mssErrorf(0, "Cluster", "Failed to parse attribute 'similarity_measure' in group \"%s\".", inf->Name); + return SIMILARITY_NULL; + } + + /** Parse known clustering algorithms. **/ + if (!strcasecmp(measure, "cosine")) return SIMILARITY_COSINE; + if (!strcasecmp(measure, "levenshtein")) return SIMILARITY_LEVENSHTEIN; + + mssErrorf(1, "Cluster", "Unknown \"similarity measure\": %s", measure); + return SIMILARITY_NULL; + } + + +/*** Allocates a new pSourceData struct from a parsed pStructInf representing + *** a .cluster structure file. + *** + *** @attention - Warning: Caching in use. + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param inf A parsed pStructInf for a .cluster structure file. + *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @param path The file path to the parsed structure file, used to generate + *** cache entry keys. + *** @returns A new pSourceData struct on success, or NULL on failure. + ***/ +pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* path) + { + char* buf; + + /** Get source. **/ + if (ci_ParseAttribute(inf, "source", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err; + char* source_path = check_ptr(strdup(buf)); + + /** Get attribute name. 
**/ + if (ci_ParseAttribute(inf, "attr_name", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err; + char* attr_name = check_ptr(strdup(buf)); + + /** Create cache entry key. **/ + const size_t len = strlen(path) + strlen(source_path) + strlen(attr_name) + 3lu; + char* key = check_ptr(nmSysMalloc(len * sizeof(char))); + snprintf(key, len, "%s?%s:%s", path, source_path, attr_name); + pXHashTable source_cache = &ClusterCaches.SourceCache; + + /** Check for a cached version. **/ + pSourceData source_maybe = (pSourceData)xhLookup(source_cache, key); + if (source_maybe != NULL) + { + /** Cache hit. **/ + tprintf("# source: \"%s\"\n", key); + tprintf("--> Name: %s\n", source_maybe->Name); /* Cause invalid read if cache was incorrectly freed. */ + + /** Free data we don't need. */ + free(source_path); + free(attr_name); + nmSysFree(key); + + /** Return the cached source data. **/ + return source_maybe; + } + + /** Cache miss: Create a new source data object. **/ + pSourceData source_data = check_ptr(nmMalloc(sizeof(SourceData))); + memset(source_data, 0, sizeof(SourceData)); + source_data->Name = check_ptr(strdup(inf->Name)); + source_data->Key = key; + source_data->SourcePath = source_path; + source_data->AttrName = attr_name; + + /** Add the new object to the cache for next time. **/ + tprintf("+ source: \"%s\"\n", key); + check(xhAdd(source_cache, key, (void*)source_data)); + + return source_data; + + err: + mssErrorf(0, "Cluster", "Failed to parse source data from group \"%s\" in file: %s", inf->Name, path); + return NULL; + } + + +/*** Allocates a new pClusterData struct from a parsed pStructInf. + *** + *** @attention - Warning: Caching in use. + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param inf A parsed pStructInf for a cluster group in a structure file. 
+ *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @param source_data The pSourceData that clusters are to be built from, also + *** used to generate cache entry keys. + *** @returns A new pClusterData struct on success, or NULL on failure. + ***/ +pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) + { + int result; + + tprintf("Parsing cluster: %s\n", inf->Name); + + pParamObjects param_list = node_data->ParamList; + pSourceData source_data = node_data->SourceData; + + /** Allocate space for data struct. **/ + pClusterData cluster_data = check_ptr(nmMalloc(sizeof(ClusterData))); + memset(cluster_data, 0, sizeof(ClusterData)); + + /** Basic Properties. **/ + cluster_data->Name = check_ptr(strdup(inf->Name)); + cluster_data->SourceData = source_data; + + /** Get algorithm. **/ + cluster_data->ClusterAlgorithm = ci_ParseClusteringAlgorithm(inf, param_list); + if (cluster_data->ClusterAlgorithm == ALGORITHM_NULL) goto err; + + /** Handle no clustering case. **/ + if (cluster_data->ClusterAlgorithm == ALGORITHM_NONE) + { + cluster_data->NumClusters = 1u; + goto parsing_done; + } + + /** Get similarity_measure. **/ + cluster_data->SimilarityMeasure = ci_ParseSimilarityMeasure(inf, param_list); + if (cluster_data->SimilarityMeasure == SIMILARITY_NULL) goto err_free_cluster; + + /** Handle sliding window case. **/ + if (cluster_data->ClusterAlgorithm == ALGORITHM_SLIDING_WINDOW) + goto parsing_done; + + /** Get num_clusters. 
**/ + int num_clusters; + if (ci_ParseAttribute(inf, "num_clusters", DATA_T_INTEGER, POD(&num_clusters), param_list, true, true) != 0) goto err_free_cluster; + if (num_clusters < 2) + { + mssErrorf(1, "Cluster", "Invalid value for [num_clusters : uint > 1]: %d", num_clusters); + if (num_clusters == 1) fprintf(stderr, "HINT: Use algorithm=\"none\" to disable clustering.\n"); + goto err_free_cluster; + } + cluster_data->NumClusters = (unsigned int)num_clusters; + tprintf("Got value for num_clusters: %d\n", num_clusters); + + /** Get min_improvement. **/ + double improvement; + result = ci_ParseAttribute(inf, "min_improvement", DATA_T_DOUBLE, POD(&improvement), param_list, false, false); + if (result == 1) cluster_data->MinImprovement = DEFAULT_MIN_IMPROVEMENT; + else if (result == 0) + { + if (improvement <= 0.0 || 1.0 <= improvement) + { + mssErrorf(1, "Cluster", "Invalid value for [min_improvement : 0.0 < x < 1.0 | \"none\"]: %g", improvement); + goto err_free_cluster; + } + cluster_data->MinImprovement = improvement; + } + else if (result == -1) + { + char* str; + result = ci_ParseAttribute(inf, "min_improvement", DATA_T_STRING, POD(&str), param_list, false, true); + if (result == 0 && !strcasecmp(str, "none")) + { + /** Specify no min improvement. **/ + cluster_data->MinImprovement = -INFINITY; + } + } + if (result == -1) goto err_free_cluster; + + /** Get max_iterations. **/ + int max_iterations; + result = ci_ParseAttribute(inf, "max_iterations", DATA_T_INTEGER, POD(&max_iterations), param_list, false, true); + if (result == -1) goto err_free_cluster; + if (result == 0) + { + if (max_iterations < 0) + { + mssErrorf(1, "Cluster", "Invalid value for [max_iterations : uint]: %d", max_iterations); + goto err_free_cluster; + } + cluster_data->MaxIterations = (unsigned int)max_iterations; + } + else cluster_data->MaxIterations = DEFAULT_MAX_ITERATIONS; + + /** Search for sub-clusters. 
**/ + XArray sub_clusters; + const int ret = xaInit(&sub_clusters, 4u); + if (ret != 0) + { + mssErrorf(1, "Cluster", "FAIL - xaInit(&sub_clusters, %u): %d", 4u, ret); + goto err_free_cluster; + } + for (unsigned int i = 0u; i < inf->nSubInf; i++) + { + /** Check that this is a group (not an attribute). **/ + pStructInf group_inf = inf->SubInf[i]; + ASSERTMAGIC(group_inf, MGK_STRUCTINF); + if (stStructType(group_inf) != ST_T_SUBGROUP) continue; + + /** Select array by group type. **/ + assert(group_inf->UsrType != NULL); + if (strcmp(group_inf->UsrType, "cluster/cluster")) continue; + + /** Subcluster found. **/ + pClusterData sub_cluster = ci_ParseClusterData(group_inf, node_data); + if (sub_cluster == NULL) goto err_free_sub_clusters; + sub_cluster->Parent = cluster_data; + xaAddItem(&sub_clusters, sub_cluster); + } + cluster_data->nSubClusters = sub_clusters.nItems; + cluster_data->SubClusters = (cluster_data->nSubClusters > 0u) ? + (pClusterData*)ci_xaToTrimmedArray(&sub_clusters) + : NULL; /* No sub-clusters. */ + xaDeInit(&sub_clusters); + + /** Create the cache key. 
**/ + parsing_done:; + char* key; + switch (cluster_data->ClusterAlgorithm) + { + case ALGORITHM_NONE: + { + const size_t len = strlen(source_data->Key) + strlen(cluster_data->Name) + 5lu; + key = nmSysMalloc(len * sizeof(char)); + snprintf(key, len, "%s/%s?%u", + source_data->Key, + cluster_data->Name, + ALGORITHM_NONE + ); + break; + } + + case ALGORITHM_SLIDING_WINDOW: + { + const size_t len = strlen(source_data->Key) + strlen(cluster_data->Name) + 8lu; + key = nmSysMalloc(len * sizeof(char)); + snprintf(key, len, "%s/%s?%u&%u", + source_data->Key, + cluster_data->Name, + ALGORITHM_SLIDING_WINDOW, + cluster_data->SimilarityMeasure + ); + break; + } + + default: + { + const size_t len = strlen(source_data->Key) + strlen(cluster_data->Name) + 32lu; + key = nmSysMalloc(len * sizeof(char)); + snprintf(key, len, "%s/%s?%u&%u&%u&%g&%u", + source_data->Key, + cluster_data->Name, + cluster_data->ClusterAlgorithm, + cluster_data->SimilarityMeasure, + cluster_data->NumClusters, + cluster_data->MinImprovement, + cluster_data->MaxIterations + ); + break; + } + } + pXHashTable cluster_cache = &ClusterCaches.ClusterCache; + cluster_data->Key = key; + + /** Check for a cached version. **/ + pClusterData cluster_maybe = (pClusterData)xhLookup(cluster_cache, key); + if (cluster_maybe != NULL) + { + /** Cache hit. **/ + tprintf("# cluster: \"%s\"\n", key); + tprintf("--> Name: %s\n", cluster_maybe->Name); /* Cause invalid read if cache was incorrectly freed. */ + + /** Free the parsed cluster that we no longer need. */ + ci_FreeClusterData(cluster_data, false); + nmSysFree(key); + + /** Return the cached cluster. **/ + return cluster_maybe; + } + + /** Cache miss. **/ + tprintf("+ cluster: \"%s\"\n", key); + check(xhAdd(cluster_cache, key, (void*)cluster_data)); + return cluster_data; + + /** Error cleanup. 
**/ + err_free_sub_clusters: + for (unsigned int i = 0u; i < sub_clusters.nItems; i++) + ci_FreeClusterData(sub_clusters.Items[i], true); + xaDeInit(&sub_clusters); + + err_free_cluster: + nmFree(cluster_data, sizeof(ClusterData)); + + err: + mssErrorf(0, "Cluster", "Failed to parse cluster from group \"%s\".", inf->Name); + return NULL; + } + + +/*** Allocates a new pSearchData struct from a parsed pStructInf. + *** + *** @attention - Warning: Caching in use. + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param inf A parsed pStructInf for a search group in a structure file. + *** @param param_list The param objects that function as a kind of "scope" for + *** evaluating parameter variables in the structure file. + *** @param node_data The pNodeData, used to get the param list and to look up + *** the cluster pointed to by the source attribute. + *** @returns A new pSearchData struct on success, or NULL on failure. + ***/ +pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) + { + tprintf("Parsing search: %s\n", inf->Name); + + /** Allocate space for search struct. **/ + pSearchData search_data = nmMalloc(sizeof(SearchData)); + assert(search_data != NULL); + memset(search_data, 0, sizeof(SearchData)); + + /** Get search name. **/ + search_data->Name = check_ptr(strdup(inf->Name)); + + /** Get source. **/ + char* source_name; + if (ci_ParseAttribute(inf, "source", DATA_T_STRING, POD(&source_name), node_data->ParamList, true, true) != 0) return NULL; + for (unsigned int i = 0; i < node_data->nClusters; i++) + { + pClusterData cluster_data = node_data->Clusters[i]; + if (strcmp(source_name, cluster_data->Name) == 0) + { + /** Source found. **/ + search_data->Source = cluster_data; + break; + } + + /** Note: Subclusters not implemented here. 
**/ + } + if (search_data->Source == NULL) + { + mssErrorf(1, "Cluster", "Could not find cluster %s for search %s.", source_name, search_data->Name); + goto err_free_search; + } + + /** Get threshold attribute. **/ + if (ci_ParseAttribute(inf, "threshold", DATA_T_DOUBLE, POD(&search_data->Threshold), node_data->ParamList, true, true) != 0) goto err_free_search; + if (search_data->Threshold <= 0.0 || 1.0 <= search_data->Threshold) + { + mssErrorf(1, "Cluster", + "Invalid value for [threshold : 0.0 < x < 1.0 | \"none\"]: %g", + search_data->Threshold + ); + goto err_free_search; + } + + /** Get similarity measure. **/ + search_data->SimilarityMeasure = ci_ParseSimilarityMeasure(inf, node_data->ParamList); + if (search_data->SimilarityMeasure == SIMILARITY_NULL) goto err_free_search; + + /** Create cache entry key. **/ + char* source_key = search_data->Source->Key; + const size_t len = strlen(source_key) + strlen(search_data->Name) + 16lu; + char* key = nmSysMalloc(len * sizeof(char)); + snprintf(key, len, "%s/%s?%g&%u", + source_key, + search_data->Name, + search_data->Threshold, + search_data->SimilarityMeasure + ); + pXHashTable search_cache = &ClusterCaches.SearchCache; + + /** Check for a cached version. **/ + pSearchData search_maybe = (pSearchData)xhLookup(search_cache, key); + if (search_maybe != NULL) + { + /** Cache hit. **/ + tprintf("# search: \"%s\"\n", key); + tprintf("--> Name: %s\n", search_maybe->Name); /* Cause invalid read if cache was incorrectly freed. */ + + /** Free the parsed search that we no longer need. */ + ci_FreeSearchData(search_data); + nmSysFree(key); + + /** Return the cached search. **/ + return search_maybe; + } + + /** Cache miss. 
**/ + tprintf("+ search: \"%s\"\n", key); + check(xhAdd(search_cache, key, (void*)search_data)); + return search_data; + + err_free_search: + ci_FreeSearchData(search_data); + mssErrorf(0, "Cluster", "Failed to parse search from group \"%s\".", inf->Name); + return NULL; + } + + +/*** Allocates a new pNodeData struct from a parsed pStructInf. + *** + *** @attention - Does not use caching directly, but uses subfunctions to + *** handle caching of substructures. + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param inf A parsed pStructInf for the top level group in a .cluster + *** structure file. + *** @param obj The parent object struct. + *** @returns A new pNodeData struct on success, or NULL on failure. + ***/ +pNodeData ci_ParseNodeData(pStructInf inf, pObject obj) + { + int ret; + + /** Retrieve path so we'll know we have it later. **/ + char* path = obj_internal_PathPart(obj->Pathname, 0, obj->SubPtr); + + /** Allocate node struct data. **/ + // pNodeData node_data = NodeData |> sizeof() |> nmMalloc() |> check_ptr(); + pNodeData node_data = check_ptr(nmMalloc(sizeof(NodeData))); + memset(node_data, 0, sizeof(NodeData)); + node_data->Obj = obj; + + /** Set up param list. **/ + node_data->ParamList = check_ptr(expCreateParamList()); + node_data->ParamList->Session = obj->Session; + ret = expAddParamToList(node_data->ParamList, "parameters", (void*)node_data, 0); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to add parameters to the param list scope (error code %d).", ret); + goto err_free_node; + } + + /** Set the param functions, defined later in the file. 
**/ + ret = expSetParamFunctions( + node_data->ParamList, + "parameters", + ci_GetParamType, + ci_GetParamValue, + ci_SetParamValue + ); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to set param functions (error code %d).", ret); + goto err_free_node; + } + + /** Detect relevant groups. **/ + XArray param_infs, cluster_infs, search_infs; + check(xaInit(¶m_infs, 8)); + check(xaInit(&cluster_infs, 8)); + check(xaInit(&search_infs, 8)); + for (unsigned int i = 0u; i < inf->nSubInf; i++) + { + /** Check that this is a group (not an attribute). **/ + pStructInf group_inf = inf->SubInf[i]; + ASSERTMAGIC(group_inf, MGK_STRUCTINF); + if (stStructType(group_inf) != ST_T_SUBGROUP) continue; + + /** Select array by group type. **/ + const char* group_type = group_inf->UsrType; + if (strcmp(group_type, "cluster/parameter") == 0) check_strict(xaAddItem(¶m_infs, group_inf)); + else if (strcmp(group_type, "cluster/cluster") == 0) check_strict(xaAddItem(&cluster_infs, group_inf)); + else if (strcmp(group_type, "cluster/search") == 0) check_strict(xaAddItem(&search_infs, group_inf)); + else + { + mssErrorf(1, "Cluster", + "Unkown group type \"%s\" on group \"%s\".", + group_type, group_inf->Name + ); + goto err_free_arrs; + } + } + + /** Extract OpenCtl for use below. **/ + bool has_provided_params = obj != NULL + && obj->Pathname != NULL + && obj->Pathname->OpenCtl != NULL + && obj->Pathname->OpenCtl[obj->SubPtr - 1] != NULL + && obj->Pathname->OpenCtl[obj->SubPtr - 1]->nSubInf > 0 + && obj->Pathname->OpenCtl[obj->SubPtr - 1]->SubInf != NULL; + int num_provided_params = (has_provided_params) ? obj->Pathname->OpenCtl[obj->SubPtr - 1]->nSubInf : 0; + pStruct* provided_params = (has_provided_params) ? obj->Pathname->OpenCtl[obj->SubPtr - 1]->SubInf : NULL; + + /** Itterate over each param in the structure file. 
**/ + node_data->nParams = param_infs.nItems; + const size_t params_size = node_data->nParams * sizeof(pParam); + node_data->Params = check_ptr(nmMalloc(params_size)); + memset(node_data->Params, 0, params_size); + for (unsigned int i = 0u; i < node_data->nParams; i++) + { + pParam param = paramCreateFromInf(param_infs.Items[i]); + if (param == NULL) + { + mssErrorf(0, "Cluster", + "Failed to create param from inf for param #%u: %s", + i, ((pStructInf)param_infs.Items[i])->Name + ); + goto err_free_arrs; + } + node_data->Params[i] = param; + + /** Check each provided param to see if the user provided value. **/ + for (unsigned int j = 0u; j < num_provided_params; j++) + { + pStruct provided_param = provided_params[j]; + if (provided_param == NULL) + { + mssErrorf(1, "Cluster", "Provided param struct cannot be NULL."); + fprintf(stderr, + "Debug info: obj->Pathname->OpenCtl[%d]->SubInf[%u] is NULL", + obj->SubPtr - 1, j + ); + goto err_free_arrs; + } + + /** If this provided param value isn't for the param, ignore it. **/ + if (strcmp(provided_param->Name, param->Name) != 0) continue; + + /** Matched! The user is providing a value for this param. **/ + ret = paramSetValueFromInfNe(param, provided_param, 0, node_data->ParamList, obj->Session); + if (ret != 0) + { + mssErrorf(0, "Cluster", + "Failed to set param value from struct info.\n" + " > Param #%u: %s\n" + " > Provided Param #%u: %n\n" + " > Error code: %d", + i, param->Name, + j, provided_param->Name, + ret + ); + goto err_free_arrs; + } + tprintf("Found provided value for %s, which is now %d\n", param->Name, param->Value->Data.Integer); + + /** Provided value successfully handled, we're done. **/ + break; + } + + /** Invoke param hints parsing. 
**/ + ret = paramEvalHints(param, node_data->ParamList, obj->Session); + if (ret != 0) + { + mssErrorf(0, "Cluster", + "Failed to evaluate parameter hints for parameter \"%s\" (error code %d).", + param->Name, ret + ); + goto err_free_arrs; + } + if (strcmp("k", param->Name) == 0) tprintf("Param k is now %d\n", param->Value->Data.Integer); + } + check(xaDeInit(¶m_infs)); + param_infs.nAlloc = 0; + + /** Parse source data. **/ + node_data->SourceData = ci_ParseSourceData(inf, node_data->ParamList, path); + if (node_data->SourceData == NULL) goto err_free_node; + + /** Parse each cluster. **/ + node_data->nClusters = cluster_infs.nItems; + if (node_data->nClusters > 0) + { + const size_t clusters_size = node_data->nClusters * sizeof(pClusterData); + node_data->Clusters = check_ptr(nmMalloc(clusters_size)); + memset(node_data->Clusters, 0, clusters_size); + for (unsigned int i = 0u; i < node_data->nClusters; i++) + { + node_data->Clusters[i] = ci_ParseClusterData(cluster_infs.Items[i], node_data); + if (node_data->Clusters[i] == NULL) goto err_free_arrs; + } + } + else node_data->Clusters = NULL; + check(xaDeInit(&cluster_infs)); + cluster_infs.nAlloc = 0; + + /** Parse each search. **/ + node_data->nSearches = search_infs.nItems; + if (node_data->nSearches > 0) + { + const size_t searches_size = node_data->nSearches * sizeof(pSearchData); + node_data->Searches = check_ptr(nmMalloc(searches_size)); + memset(node_data->Searches, 0, searches_size); + for (unsigned int i = 0u; i < node_data->nSearches; i++) + { + node_data->Searches[i] = ci_ParseSearchData(search_infs.Items[i], node_data); + if (node_data->Searches[i] == NULL) goto err_free_node; /* The XArrays are already freed. */ + } + } + else node_data->Searches = NULL; + check(xaDeInit(&search_infs)); + search_infs.nAlloc = 0; + + /** Success. 
**/ + return node_data; + + err_free_arrs: + if (param_infs.nAlloc != 0) check(xaDeInit(¶m_infs)); + if (cluster_infs.nAlloc != 0) check(xaDeInit(&cluster_infs)); + if (search_infs.nAlloc != 0) check(xaDeInit(&search_infs)); + + err_free_node: + ci_FreeNodeData(node_data); + mssErrorf(0, "Cluster", "Failed to parse node from group \"%s\" in file: %s", inf->Name, path); + return NULL; + } + + +/** ================ Freeing Functions ================ **/ +/** ANCHOR[id=freeing] **/ +// LINK #functions + +/** @param source_data A pSourceData struct, freed by this function. **/ +void ci_FreeSourceData(pSourceData source_data) + { + /** Free top level attributes, if they exist. **/ + if (source_data->Name != NULL) + { + free(source_data->Name); + source_data->Name = NULL; + } + if (source_data->SourcePath != NULL) + { + free(source_data->SourcePath); + source_data->SourcePath = NULL; + } + if (source_data->AttrName != NULL) + { + free(source_data->AttrName); + source_data->AttrName = NULL; + } + + /** Free fetched data, if it exists. **/ + if (source_data->Data != NULL) + { + for (unsigned int i = 0u; i < source_data->nVectors; i++) + free(source_data->Data[i]); + nmFree(source_data->Data, source_data->nVectors * sizeof(char*)); + source_data->Data = NULL; + } + + /** Free computed vectors, if they exist. **/ + if (source_data->Vectors != NULL) + { + for (unsigned int i = 0u; i < source_data->nVectors; i++) + ca_free_vector(source_data->Vectors[i]); + nmFree(source_data->Vectors, source_data->nVectors * sizeof(pVector)); + source_data->Vectors = NULL; + } + + /** Free the source_data struct. **/ + nmFree(source_data, sizeof(SourceData)); + } + + +/*** Free pClusterData struct with an option to recursively free subclusters. + *** + *** @param cluster_data The cluster data struct to free. + *** @param recrusive Whether to recursively free subclusters. + ***/ +void ci_FreeClusterData(pClusterData cluster_data, bool recursive) + { + /** Free top level cluster data. 
**/ + if (cluster_data->Name != NULL) free(cluster_data->Name); + + /** Free computed data, if it exists. **/ + if (cluster_data->Labels != NULL) + { + const unsigned int nVectors = cluster_data->SourceData->nVectors; + nmFree(cluster_data->Labels, nVectors * sizeof(unsigned int)); + } + + /** Free subclusters recursively. **/ + if (cluster_data->SubClusters != NULL) + { + if (recursive) + { + for (unsigned int i = 0u; i < cluster_data->nSubClusters; i++) + ci_FreeClusterData(cluster_data->SubClusters[i], recursive); + } + nmFree(cluster_data->SubClusters, cluster_data->nSubClusters * sizeof(void*)); + } + + /** Free the cluster struct. **/ + nmFree(cluster_data, sizeof(ClusterData)); + } + + +/** @param search_data A pSearchData struct, freed by this function. **/ +void ci_FreeSearchData(pSearchData search_data) + { + if (search_data->Name != NULL) free(search_data->Name); + if (search_data->Dups != NULL) + { + for (unsigned int i = 0; i < search_data->nDups; i++) + nmFree(search_data->Dups[i], sizeof(Dup)); + nmFree(search_data->Dups, search_data->nDups * sizeof(void*)); + } + nmFree(search_data, sizeof(SearchData)); + } + + +/** @param node_data A pNodeData struct, freed by this function. **/ +void ci_FreeNodeData(pNodeData node_data) + { + /** Free parsed params, if they exist. **/ + if (node_data->Params != NULL) + { + for (unsigned int i = 0u; i < node_data->nParams; i++) + { + if (node_data->Params[i] == NULL) break; + paramFree(node_data->Params[i]); + } + nmFree(node_data->Params, node_data->nParams * sizeof(pParam)); + } + if (node_data->ParamList != NULL) expFreeParamList(node_data->ParamList); + + /** Free parsed clusters, if they exist. **/ + if (node_data->Clusters != NULL) + { + /*** This data is cached, so we should NOT free it! + *** The caching system is responsible for the memory. + ***/ + nmFree(node_data->Clusters, node_data->nClusters * sizeof(pClusterData)); + node_data->Clusters = NULL; + } + + /** Free parsed searches, if they exist. 
**/ + if (node_data->Searches != NULL) + { + /*** This data is cached, so we should NOT free it! + *** The caching system is responsible for the memory. + ***/ + nmFree(node_data->Searches, node_data->nSearches * sizeof(pSearchData)); + node_data->Searches = NULL; + } + + /** Free data source, if one exists. **/ + /*** Note: SourceData is freed last since other free functions may need to + *** access information from this structure when freeing data. + *** (For example, nVector which is used to determine the size of the + *** label struct in each cluster.) + ***/ + if (node_data->SourceData != NULL) + { + /*** This data is cached, so we should NOT free it! + *** The caching system is responsible for the memory. + ***/ + node_data->SourceData = NULL; + } + + /** Free the node data. **/ + nmFree(node_data, sizeof(NodeData)); + } + +/** ================ Deep Size Computation Functions ================ **/ +/** ANCHOR[id=sizing] **/ +// LINK #functions + +/*** Returns the deep size of a SourceData struct, including the size of all + *** allocated substructures. As far as I can tell, this is probably only + *** useful for cache management and debugging. + *** + *** Note that Key is ignored because it is a pointer to data managed by the + *** caching systems, so it is not technically part of the struct. + *** + *** @param source_data The source data struct to be queried. + *** @returns The size in bytes of the struct and all internal allocated data. 
+ ***/ +unsigned int ci_SizeOfSourceData(pSourceData source_data) + { + unsigned int size = 0u; + if (source_data->Name != NULL) size += strlen(source_data->Name) * sizeof(char); + if (source_data->SourcePath != NULL) size += strlen(source_data->SourcePath) * sizeof(char); + if (source_data->AttrName != NULL) size += strlen(source_data->AttrName) * sizeof(char); + if (source_data->Data != NULL) + { + for (unsigned int i = 0u; i < source_data->nVectors; i++) + size += strlen(source_data->Data[i]) * sizeof(char); + size += source_data->nVectors * sizeof(char*); + } + if (source_data->Vectors != NULL) + { + for (unsigned int i = 0u; i < source_data->nVectors; i++) + size += ca_sparse_len(source_data->Vectors[i]) * sizeof(int); + size += source_data->nVectors * sizeof(pVector); + } + size += sizeof(SourceData); + return size; + } + + +/*** Returns the deep size of a ClusterData struct, including the size of all + *** allocated substructures. As far as I can tell, this is probably only + *** useful for cache management and debugging. + *** + *** Note that Key is ignored because it is a pointer to data managed by the + *** caching systems, so it is not technically part of the struct. + *** + *** @param cluster_data The cluster data struct to be queried. + *** @param recrusive Whether to recursively free subclusters. + *** @returns The size in bytes of the struct and all internal allocated data. 
+ ***/ +unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursive) + { + unsigned int size = 0u; + if (cluster_data->Name != NULL) size += strlen(cluster_data->Name) * sizeof(char); + if (cluster_data->Labels != NULL) size += cluster_data->SourceData->nVectors * sizeof(unsigned int); + if (cluster_data->SubClusters != NULL) + { + if (recursive) + { + for (unsigned int i = 0u; i < cluster_data->nSubClusters; i++) + size += ci_SizeOfClusterData(cluster_data->SubClusters[i], recursive); + } + size += cluster_data->nSubClusters * sizeof(void*); + } + size += sizeof(ClusterData); + return size; + } + + +/*** Returns the deep size of a SearchData struct, including the size of all + *** allocated substructures. As far as I can tell, this is probably only + *** useful for cache management and debugging. + *** + *** Note that Key is ignored because it is a pointer to data managed by the + *** caching systems, so it is not technically part of the struct. + *** + *** @param search_data The search data struct to be queried. + *** @returns The size in bytes of the struct and all internal allocated data. + ***/ +unsigned int ci_SizeOfSearchData(pSearchData search_data) + { + unsigned int size = 0u; + if (search_data->Name != NULL) size += strlen(search_data->Name) * sizeof(char); + if (search_data->Dups != NULL) size += search_data->nDups * (sizeof(void*) + sizeof(Dup)); + size += sizeof(SearchData); + return size; + } + + +/*** Returns the deep size of a NodeData struct, including the size of all + *** allocated substructures. As far as I can tell, this is probably only + *** useful for cache management and debugging. + *** + *** Note that Key is ignored because it is a pointer to data managed by the + *** caching systems, so it is not technically part of the struct. + *** + *** @param node_data The cluster data struct to be queried. + *** @returns The size in bytes of the struct and all internal allocated data. 
+ ***/ +unsigned int ci_SizeOfNodeData(pNodeData node_data) + { + unsigned int size = 0u; + if (node_data->Params != NULL) + { + /** Approximate. **/ + size += node_data->nParams * (sizeof(Param) + sizeof(pParam)); + } + if (node_data->ParamList == NULL) + { + /** Approximate. **/ + size += node_data->nParams * 30u * sizeof(char); + size += sizeof(pParamObjects); + } + if (node_data->Clusters != NULL) + { + /** Note: This data is also stored in a cache. **/ + for (unsigned int i = 0u; i < node_data->nClusters; i++) + size += ci_SizeOfClusterData(node_data->Clusters[i], true); + size += node_data->nClusters * sizeof(pClusterData); + } + if (node_data->Searches != NULL) + { + /** Note: This data is also stored in a cache. **/ + for (unsigned int i = 0u; i < node_data->nSearches; i++) + size += ci_SizeOfSearchData(node_data->Searches[i]); + size += node_data->nSearches * sizeof(pSearchData); + } + if (node_data->SourceData != NULL) + { + /** Note: This data is also stored in a cache. **/ + size += ci_SizeOfSourceData(node_data->SourceData); + } + size += sizeof(NodeData); + return size; + } + + +/** ================ Cache Invalidation Functions ================ **/ +/** ANCHOR[id=invalidation] **/ +// LINK #functions + +/** Intended for use in xhClearKeySafe(). **/ +void ci_CacheFreeSourceData(pXHashEntry entry, void* _) + { + /** Extract hash entry. **/ + char* key = entry->Key; + pSourceData source_data = (pSourceData)entry->Data; + + /** Free data. **/ + tprintf("- source: \"%s\"\n", key); + ci_FreeSourceData(source_data); + nmSysFree(key); + } + +/** Intended for use in xhClearKeySafe(). **/ +void ci_CacheFreeCluster(pXHashEntry entry, void* _) + { + /** Extract hash entry. **/ + char* key = entry->Key; + pClusterData cluster_data = (pClusterData)entry->Data; + + /** Free data. **/ + tprintf("- cluster: \"%s\"\n", key); + ci_FreeClusterData(cluster_data, false); + nmSysFree(key); + } + +/** Intended for use in xhClearKeySafe(). 
**/ +void ci_CacheFreeSearch(pXHashEntry entry, void* _) + { + /** Extract hash entry. **/ + char* key = entry->Key; + pSearchData search_data = (pSearchData)entry->Data; + + /** Free data. **/ + tprintf("- search: \"%s\"\n", key); + ci_FreeSearchData(search_data); + nmSysFree(key); + } + +/** ================ Computation Functions ================ **/ +/** ANCHOR[id=computation] **/ +// LINK #functions + +/*** Ensures that the source_data->Data has been fetched from the data source + *** and that source_data->nVectors has been computed from the fetched data. + *** + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param source_data The pSourceData affected by the computation. + *** @param session The current session, used to open the data source. + *** @returns 0 if successful, or + *** -1 other value on failure. + ***/ +int ci_ComputeSourceData(pSourceData source_data, pObjSession session) + { + /** If the vectors are already computed, we're done. **/ + if (source_data->Vectors != NULL) return 0; + + /** Handle error case that happens if memory optimizations break. **/ + if (source_data->Data != NULL) + { + /*** We have data, but not vectors, which means that this function ran + *** before, but the vectors were cleared by ci_GCSourceData(). This + *** should only happen if the vectors will not be needed again. Thus, + *** clearly something has gone wrong. + ***/ + fprintf(stderr, "ERROR:" + "\tci_computeSourceData() invoked on source data \"%s\" where\n" + "\tvectors were previously freed. There is likely a bug in\n" + "\tci_GCSourceData() which caused it to free vectors when we\n" + "\tstill needed them.\n", + source_data->Name + ); + fprintf(stderr, "Resolution:\n" + "\tThe original data will be dropped and refetched, and the\n" + "\tthe vectors will be recomputed, avoiding possible issues\n" + "\tfrom stale data.\n" + ); + + /** Drop source_data->Data. 
**/ + for (unsigned int i = 0u; i < source_data->nVectors; i++) + free(source_data->Data[i]); + nmFree(source_data->Data, source_data->nVectors * sizeof(char*)); + source_data->Data = NULL; + source_data->nVectors = 0; + } + + /** Time to play shoots-and-ladders in an error-handling jungle of gotos. **/ + bool successful = false; + int ret; + + /** Open the source path specified by the .cluster file. **/ + tprintf("Openning...\n"); + pObject obj = objOpen(session, source_data->SourcePath, OBJ_O_RDONLY, 0600, "system/directory"); + if (obj == NULL) + { + mssErrorf(0, "Cluster", + "Failed to open object driver:" + " > Attribute: \"%s\" : String\n" + " > Source Path: %s", + source_data->AttrName, + source_data->SourcePath + ); + successful = false; + goto end; + } + + /** Generate a "query" for retrieving data. **/ + tprintf("Openning query...\n"); + pObjQuery query = objOpenQuery(obj, NULL, NULL, NULL, NULL, 0); + if (query == NULL) + { + mssErrorf(0, "Cluster", + "Failed to open query:\n" + " > Attribute: \"%s\" : String\n" + " > Driver Used: %s\n" + " > Source Path: %s", + source_data->AttrName, + obj->Driver->Name, + source_data->SourcePath + ); + successful = false; + goto end_close; + } + + /** Initialize an xarray to store the retrieved data. **/ + XArray data_xarray, vector_xarray; + check(xaInit(&data_xarray, 64)); + check(xaInit(&vector_xarray, 64)); + + /** Fetch data and build vectors. **/ + tprintf("Skips: "); + unsigned int i = 0u; + while (true) + { + pObject entry = objQueryFetch(query, O_RDONLY); + if (entry == NULL) break; /* Done. */ + + /** Type checking. 
**/ + const int datatype = objGetAttrType(entry, source_data->AttrName); + if (datatype == -1) + { + mssErrorf(0, "Cluster", + "Failed to get type for %uth entry:\n" + " > Attribute: \"%s\" : String\n" + " > Driver Used: %s\n" + " > Source Path: %s", + i, + source_data->AttrName, + obj->Driver->Name, + source_data->SourcePath + ); + goto end_free_data; + } + if (datatype != DATA_T_STRING) + { + mssErrorf(1, "Cluster", + "Type for %uth entry was not a string:\n" + " > Attribute: \"%s\" : %s!!\n" + " > Driver Used: %s\n" + " > Source Path: %s", + i, + source_data->AttrName, ci_TypeToStr(datatype), + obj->Driver->Name, + source_data->SourcePath + ); + goto end_free_data; + } + + /** Get value from database. **/ + char* val; + ret = objGetAttrValue(entry, source_data->AttrName, DATA_T_STRING, POD(&val)); + if (ret != 0) + { + tprintf("\n"); + mssErrorf(0, "Cluster", + "Failed to value for %uth entry:\n" + " > Attribute: \"%s\" : String\n" + " > Driver Used: %s\n" + " > Source Path: %s\n" + " > Error code: %d", + i, + source_data->AttrName, + obj->Driver->Name, + source_data->SourcePath, + ret + ); + successful = false; + goto end_free_data; + } + + /** Skip empty strings. **/ + if (strlen(val) == 0) + { + tprintf("_"); + check(fflush(stdout)); + continue; + } + + /** Convert the string to a vector. **/ + pVector vector = ca_build_vector(val); + if (vector == NULL) + { + mssErrorf(1, "Cluster", "Failed to build vectors for string \"%s\".", val); + successful = false; + goto end_free_data; + } + if (vector[0] == -CA_NUM_DIMS) + { + mssErrorf(1, "Cluster", "Vector building for string \"%s\" produced no character pairs.", val); + successful = false; + goto end_free_data; + } + if (vector[0] == -172 && vector[1] == 11 && vector[2] == -78) + { + /** Skip pVector with no pairs. **/ + tprintf("."); + check(fflush(stdout)); + ca_free_vector(vector); + continue; + } + + /** Store value. 
**/ + char* dup_val = check_ptr(strdup(val)); + check_strict(xaAddItem(&data_xarray, (void*)dup_val)); + check_strict(xaAddItem(&vector_xarray, (void*)vector)); + + /** Clean up. **/ + check(objClose(entry)); + } + tprintf("\nData aquired.\n"); + source_data->nVectors = vector_xarray.nItems; + + /** Trim data and store data. **/ + const size_t data_size = source_data->nVectors * sizeof(char*); + source_data->Data = check_ptr(nmMalloc(data_size)); + memcpy(source_data->Data, data_xarray.Items, data_size); + check(xaDeInit(&data_xarray)); + data_xarray.nAlloc = 0; + + /** Trim data and store vectors. **/ + const size_t vectors_size = source_data->nVectors * sizeof(pVector); + source_data->Vectors = check_ptr(nmMalloc(vectors_size)); + memcpy(source_data->Vectors, vector_xarray.Items, vectors_size); + check(xaDeInit(&vector_xarray)); + vector_xarray.nAlloc = 0; + + /** Success. **/ + successful = true; + + end_free_data: + if (data_xarray.nAlloc != 0) + { + for (unsigned int i = 0u; i < data_xarray.nItems; i++) + free(data_xarray.Items[i]); + check(xaDeInit(&data_xarray)); + } + if (vector_xarray.nAlloc != 0) + { + for (unsigned int i = 0u; i < vector_xarray.nItems; i++) + ca_free_vector(vector_xarray.Items[i]); + check(xaDeInit(&vector_xarray)); + } + + // end_close_query: + ret = objQueryClose(query); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to close query (error code %d).", ret); + // ret = ret; // Fall-through: Continue through failure. + } + + end_close: + ret = objClose(obj); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to close object driver (error code %d).", ret); + // ret = ret; // Fall-through: Continue through failure. + } + + end: + if (!successful) mssErrorf(0, "Cluster", "Vector computation failed."); + return (successful) ? 0 : -1; + } + +/*** Ensures that the cluster_data->Labels has been computed, running the + *** specified clustering algorithm if necessary. 
+ *** + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param cluster_data The pClusterData affected by the computation. + *** @param node_data The current pNodeData, used to get vectors to cluster. + *** @returns 0 if successful, or + *** -1 other value on failure. + ***/ +int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) + { + /** If the clusters are alreadyd computed, we're done. **/ + if (cluster_data->Labels != NULL) return 0; + + /** Make source data available. **/ + pSourceData source_data = node_data->SourceData; + + /** We need the vectors to compute clusters. **/ + if (ci_ComputeSourceData(source_data, node_data->ParamList->Session) != 0) + { + mssErrorf(0, "Cluster", "Vectors not found."); + goto err; + } + + /** Allocate static memory for finding clusters. **/ + const size_t labels_size = source_data->nVectors * sizeof(unsigned int); + cluster_data->Labels = check_ptr(nmMalloc(labels_size)); + + /** Execute clustering. **/ + switch (cluster_data->ClusterAlgorithm) + { + case ALGORITHM_NONE: + case ALGORITHM_SLIDING_WINDOW: /* Clusters are not computed separately for performance reasons. */ + tprintf("Applying no clustering...\n"); + memset(cluster_data->Labels, 0u, labels_size); + break; + + case ALGORITHM_KMEANS: + /** Check for unimplemented similarity measures. **/ + if (cluster_data->SimilarityMeasure != SIMILARITY_COSINE) + { + mssErrorf(1, "Cluster", + "The similarity meausre \"%s\" is not implemented.", + ci_SimilarityMeasureToString(cluster_data->SimilarityMeasure) + ); + goto err; + } + + /** kmeans expects clusters to be initialized. 
**/ + memset(cluster_data->Labels, 0u, labels_size); + + tprintf("Running kmeans\n"); + Timer timer_i, *timer = timer_start(timer_init(&timer_i)); + ca_kmeans( + source_data->Vectors, + source_data->nVectors, + cluster_data->Labels, + cluster_data->NumClusters, + cluster_data->MaxIterations, + cluster_data->MinImprovement + ); + timer_stop(timer); + tprintf("Done after %.4lf.\n", timer_get(timer)); + break; + + default: + mssErrorf(1, "Cluster", + "Clustering algorithm \"%s\" is not implemented.", + ci_ClusteringAlgorithmToString(cluster_data->ClusterAlgorithm) + ); + goto err; + } + + tprintf("Clustering done.\n"); + return 0; + + err: + mssErrorf(0, "Cluster", "Cluster computation failed for \"%s\".", cluster_data->Name); + return -1; + } + +/*** Ensures that the search_data->Dups has been computed, running the a + *** search with the specified similarity measure if necessary. + *** + *** @attention - Promises that mssError() will be invoked on failure, so the + *** caller is not required to specify their own error message. + *** + *** @param cluster_data The pClusterData affected by the computation. + *** @param node_data The current pNodeData, used to get vectors to cluster. + *** @returns 0 if successful, or + *** -1 other value on failure. + ***/ +int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) + { + int ret; + + /** If the clusters are already computed, we're done. **/ + if (search_data->Dups != NULL) return 0; + + /** Extract structs. **/ + pClusterData cluster_data = search_data->Source; + pSourceData source_data = node_data->SourceData; + + /** We need the clusters to be able to search them. **/ + ret = ci_ComputeClusterData(cluster_data, node_data); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Search computation failed due to missing clusters."); + goto err; + } + + /** Check for unimplemented similarity measures. 
 **/
    if (search_data->SimilarityMeasure != SIMILARITY_COSINE)
        {
        mssErrorf(1, "Cluster",
            "The similarity meausre \"%s\" is not implemented.",
            ci_SimilarityMeasureToString(search_data->SimilarityMeasure)
        );
        goto err;
        }

    /** Execute the search over the computed vectors/labels. **/
    tprintf("Invoking ca_search.\n");
    Timer timer_i, *timer = timer_start(timer_init(&timer_i));
    pXArray dups_temp = ca_search(
        source_data->Vectors,
        source_data->nVectors,
        cluster_data->Labels,
        search_data->Threshold
    );
    timer_stop(timer);
    if (dups_temp == NULL) goto err;
    tprintf("ca_search done after %.4lf.\n", timer_get(timer));

    /** Store dups. An empty result still gets a (zero-length) allocation so
     ** that Dups != NULL marks the search as computed (see the early-out above).
     **/
    search_data->nDups = dups_temp->nItems;
    search_data->Dups = (dups_temp->nItems == 0)
        ? check_ptr(nmMalloc(0))
        : ci_xaToTrimmedArray(dups_temp);

    /** Free unused data (the XArray shell; its items now live in Dups). **/
    tprintf("Cleanup.\n");
    check(xaFree(dups_temp));

    return 0;

    err:
        mssErrorf(0, "Cluster", "Search computation failed for \"%s\".", search_data->Name);
        return -1;
    }


/** ================ Parameter Functions ================ **/
/** ANCHOR[id=params] **/
// LINK #functions

/*** Get the type of a parameter. Intended for expSetParamFunctions().
 ***
 *** @param inf_v Node data containing the list of parameters.
 *** @param attr_name The name of the requested parameter.
 *** @returns The datatype, see datatypes.h for a list of valid datatypes, or
 ***     DATA_T_UNAVAILABLE when the parameter is unknown or has no value.
 ***
 *** LINK ../../centrallix-lib/include/datatypes.h:72
 ***/
int ci_GetParamType(void* inf_v, const char* attr_name)
    {
    tprintf("Call to ci_GetParamType(\"%s\")\n", attr_name);
    pNodeData node_data = (pNodeData)inf_v;

    /** Find the parameter by linear scan over the node's parameter list. **/
    for (unsigned int i = 0; i < node_data->nParams; i++)
        {
        pParam param = node_data->Params[i];
        if (strcmp(param->Name, attr_name) != 0) continue;

        /** Parameter found. **/
        return (param->Value == NULL) ? DATA_T_UNAVAILABLE : param->Value->DataType;
        }

    /** Parameter not found.
 **/
    return DATA_T_UNAVAILABLE;
    }


/*** Get the value of a parameter. Intended for `expSetParamFunctions()`.
 ***
 *** @attention - Warning: If the retrieved value is `NULL`, the pObjectData
 ***     val is not updated, and the function returns 1, indicating `NULL`.
 ***     This is intended behavior, for consistency with other Centrallix
 ***     functions, so keep it in mind so you're not surprised.
 ***
 *** @param inf_v Node data containing the list of parameters.
 *** @param attr_name The name of the requested parameter.
 *** @param datatype The expected datatype of the parameter value.
 ***     See datatypes.h for a list of valid datatypes.
 *** @param val A pointer to a location where a pointer to the requested
 ***     data should be stored. Typically, the caller creates a local variable
 ***     to store this pointer, then passes a pointer to that local variable
 ***     so that they will have a pointer to the data.
 ***     This buffer will not be modified unless the data is successfully
 ***     found. If a value other than 0 is returned, the buffer is not updated.
 *** @returns 0 if successful,
 ***     1 if the variable is null,
 ***     -1 if an error occurs.
 ***
 *** LINK ../../centrallix-lib/include/datatypes.h:72
 ***/
int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val)
    {
    tprintf("Call to ci_GetParamValue(\"%s\", %s)\n", attr_name, ci_TypeToStr(datatype));
    pNodeData node_data = (pNodeData)inf_v;

    /** Find the parameter by linear scan over the node's parameter list. **/
    for (unsigned int i = 0; i < node_data->nParams; i++)
        {
        pParam param = (pParam)node_data->Params[i];
        if (strcmp(param->Name, attr_name) != 0) continue;

        tprintf("Param found: Parsing...\n");

        /** Parameter found.
**/ + if (param->Value == NULL) return 1; + if (param->Value->Flags & DATA_TF_NULL) return 1; + if (param->Value->DataType != datatype) + { + mssErrorf(1, "Cluster", "Type mismatch accessing parameter '%s'.", param->Name); + return -1; + } + + tprintf("Param found: Copying...\n"); + /** Return param value. **/ + objCopyData(&(param->Value->Data), val, datatype); + return 0; + } + + /** Param not found. **/ + tprintf("Param not found.\n"); + return -1; + } + + +/** Not implemented. **/ +int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) + { + tprintf("Call to ci_SetParamValue(%s, %s)\n", attr_name, ci_TypeToStr(datatype)); + mssErrorf(1, "Cluster", "SetParamValue() is not implemented because clusters are imutable."); + return -1; + } + + +/** ================ Driver functions ================ **/ +/** ANCHOR[id=driver] **/ +// LINK #functions + +/*** Opens a new cluster driver instance by parsing a `.cluster` file found + *** at the path provided in obj. + *** + *** @param obj The object being opened, including the path, session, and + *** other necessary information. + *** @param mask Driver permission mask (unused). + *** @param systype ? (unused) + *** @param usr_type The object system file type being openned. Should always + *** be "system/cluster" because this driver is only registered for that + *** type of file. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** + *** @returns A pDriverData struct representing a driver instance, or + *** NULL if an error occures. + ***/ +void* clusterOpen(pObject obj, int mask, pContentType systype, char* usr_type, pObjTrxTree* oxt) + { + tprintf( + "Warning: clusterOpen(\"%s\") is under active development.\n", + obj_internal_PathPart(obj->Pathname, obj->SubPtr + obj->SubCnt, 1) + ); + + /** If CREAT and EXCL are specified, create it and fail if it already exists. 
**/ + pSnNode node_struct = NULL; + bool can_create = (obj->Mode & O_CREAT) && (obj->SubPtr == obj->Pathname->nElements); + if (can_create && (obj->Mode & O_EXCL)) + { + node_struct = snNewNode(obj->Prev, usr_type); + if (node_struct == NULL) + { + mssErrorf(0, "Cluster", "Failed to EXCL create new node struct."); + goto err; + } + } + + /** Read the node if it exists. **/ + if (node_struct == NULL) + node_struct = snReadNode(obj->Prev); + + /** If we can't read, create it (if allowed). **/ + if (node_struct == NULL && can_create) + node_struct = snNewNode(obj->Prev, usr_type); + + /** If there still isn't a node, fail early. **/ + if (node_struct == NULL) + { + mssErrorf(0, "Cluster", "Failed to create node struct."); + goto err; + } + + /** Parse node data. **/ + pNodeData node_data = ci_ParseNodeData(node_struct->Data, obj); + if (node_data == NULL) + { + mssErrorf(0, "Cluster", + "Failed to parse structure file of name %s.", + obj_internal_PathPart(obj->Pathname, obj->SubPtr + obj->SubCnt, 1) + ); + goto err; + } + node_data->Node = node_struct; + node_data->Node->OpenCnt++; + + /** Allocate driver instance data. **/ + pDriverData driver_data = check_ptr(nmMalloc(sizeof(DriverData))); + memset(driver_data, 0, sizeof(DriverData)); + driver_data->NodeData = node_data; + + /** Detect target from path. **/ + tprintf("Parsing node path: %d %d\n", obj->SubPtr, obj->SubCnt); obj->SubCnt = 0; + char* target_name = obj_internal_PathPart(obj->Pathname, obj->SubPtr + obj->SubCnt++, 1); + if (target_name == NULL) + { + /** Target found: Root **/ + tprintf("Found target: Root.\n"); + driver_data->TargetType = TARGET_ROOT; + driver_data->TargetData = (void*)driver_data->NodeData->SourceData; + return (void*)driver_data; /* Sucess. */ + } + + /** Search clusters. 
**/ + for (unsigned int i = 0u; i < node_data->nClusters; i++) + { + pClusterData cluster = node_data->Clusters[i]; + if (strcmp(cluster->Name, target_name) != 0) continue; + + /** Target found: Cluster **/ + driver_data->TargetType = TARGET_CLUSTER; + tprintf("Found target cluster: %s\n", cluster->Name); + + /** Check for sub-clusters in the path. **/ + while (true) + { + /** Decend one path part deeper into the path. **/ + const char* path_part = obj_internal_PathPart(obj->Pathname, obj->SubPtr + obj->SubCnt++, 1); + + /** If the path does not go any deeper, we're done. **/ + if (path_part == NULL) + { + driver_data->TargetData = (void*)cluster; + break; + } + + /** Need to go deeper: Search for the requested sub-cluster. **/ + for (unsigned int i = 0u; i < cluster->nSubClusters; i++) + { + pClusterData sub_cluster = cluster->SubClusters[i]; + if (strcmp(sub_cluster->Name, path_part) != 0) continue; + + /** Target found: Sub-cluster **/ + tprintf("Found target sub-cluster: %s\n", sub_cluster->Name); + cluster = sub_cluster; + goto continue_descent; + } + + /** Path names sub-cluster that does not exist. **/ + mssErrorf(1, "Cluster", "Sub-cluster \"%s\" does not exist.", path_part); + goto err_free_node; + + continue_descent:; + } + return (void*)driver_data; /* Sucess. */ + } + + /** Search searches. **/ + for (unsigned int i = 0u; i < node_data->nSearches; i++) + { + pSearchData search = node_data->Searches[i]; + if (strcmp(search->Name, target_name) != 0) continue; + + /** Target found: Search **/ + driver_data->TargetType = TARGET_SEARCH; + driver_data->TargetData = (void*)search; + + /** Check for extra, invalid path parts. **/ + char* extra_data = obj_internal_PathPart(obj->Pathname, obj->SubPtr + obj->SubCnt++, 1); + if (extra_data != NULL) + { + mssErrorf(1, "Cluster", "Unknown path part %s.", extra_data); + goto err_free_node; + } + tprintf("Found target search: %s %d %d\n", search->Name, obj->SubPtr, obj->SubCnt); + return (void*)driver_data; /* Sucess. 
*/ + } + + /** We were unable to find the requested cluster or search. **/ + mssErrorf(1, "Cluster", "\"%s\" is not the name of a declaired cluster or search.", target_name); + + /** Error cleanup. **/ + err_free_node: + ci_FreeNodeData(node_data); + nmFree(driver_data, sizeof(DriverData)); + + err: + return NULL; + } + + +/*** Close a cluster driver instance object, releasing any necessary memory + *** and closing any necessary underlying resources. However, most of that + *** data will be cached and won't be freed unless the cache is dropped. + *** + *** @param inf_v The affected driver instance. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns 0, success. + ***/ +int clusterClose(void* inf_v, pObjTrxTree* oxt) + { + tprintf("Warning: clusterClose() is under active development.\n"); + pDriverData driver_data = (pDriverData)inf_v; + + /** Entries are shallow copies so we shouldn't do a deep free. **/ + if (driver_data->TargetType == TARGET_CLUSTER_ENTRY + || driver_data->TargetType == TARGET_SEARCH_ENTRY) + { + nmFree(driver_data, sizeof(DriverData)); + return 0; + } + + /** Free the node data (which is held in cache). **/ + ci_FreeNodeData(driver_data->NodeData); + + /** Free driver data. **/ + nmFree(driver_data, sizeof(DriverData)); + + return 0; + } + + +/*** Opens a new query pointing to the first row of the data targetted by + *** the driver instance struct. The query has an internal index counter + *** that starts at the first row and increments as data is fetched. + *** + *** @param inf_v The driver instance to be queried. + *** @param query The query to use on this struct. This is assumed to be + *** handled elsewhere, so we don't read it here (unused). + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns The cluster query. 
 ***/
void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt)
    {
    tprintf("Warning: clusterOpenQuery() is under active development.\n");
    /** NOTE(review): check_ptr presumably traps/aborts on NULL — confirm. **/
    pClusterQuery cluster_query = check_ptr(nmMalloc(sizeof(ClusterQuery)));
    cluster_query->DriverData = (pDriverData)inf_v;
    cluster_query->RowIndex = 0u;   /* Fetch cursor starts at the first row. */
    return cluster_query;
    }


/*** Get the next entry as an open driver instance object.
 ***
 *** @param qy_v A query instance, storing an internal index which is
 ***     incremented once that data has been fetched.
 *** @param obj Unused.
 *** @param mode Unused.
 *** @param oxt Unused.
 *** @returns pDriverData that is either a cluster entry or search entry,
 ***     pointing to a specific target index into the relevant data.
 ***     OR NULL, indicating that all data has been fetched.
 ***/
void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt)
    {
    int ret;
    tprintf("Warning: clusterQueryFetch() is under active development.\n");
    pClusterQuery cluster_query = (pClusterQuery)qy_v;

    /** Ensure that the data being fetched exists and is computed.
**/ + TargetType target_type = cluster_query->DriverData->TargetType, new_target_type; + unsigned int data_amount = 0u; + switch (target_type) + { + case TARGET_ROOT: + mssErrorf(1, "Cluster", "Querying the root node of a cluster file is not allowed."); + fprintf(stderr, " > Hint: Try / or /\n"); + return NULL; + + case TARGET_CLUSTER: + { + new_target_type = TARGET_CLUSTER_ENTRY; + pClusterData target = (pClusterData)cluster_query->DriverData->TargetData; + ret = ci_ComputeClusterData(target, cluster_query->DriverData->NodeData); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Internal cluster computation failed."); + return NULL; + } + data_amount = cluster_query->DriverData->NodeData->SourceData->nVectors; + break; + } + + case TARGET_SEARCH: + { + new_target_type = TARGET_SEARCH_ENTRY; + pSearchData target = (pSearchData)cluster_query->DriverData->TargetData; + ret = ci_ComputeSearchData(target, cluster_query->DriverData->NodeData); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Internal search computation failed."); + return NULL; + } + data_amount = target->nDups; + break; + } + + case TARGET_CLUSTER_ENTRY: + case TARGET_SEARCH_ENTRY: + mssErrorf(1, "Cluster", "Querying a query result is not allowed."); + return NULL; + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", target_type); + return NULL; + } + tprintf("Fetch Index: %u/16 (total: %u)\n", cluster_query->RowIndex, data_amount); + + /** Cap results to 16 for faster debugging. TODO: Remove. **/ + data_amount = min(data_amount, 16); + + /** Check that the requested data exists, returning null if we've reached the end of the data. **/ + if (cluster_query->RowIndex >= data_amount) return NULL; + + /** Create the result struct. 
**/ + pDriverData driver_data = nmMalloc(sizeof(DriverData)); + assert(driver_data != NULL); + memcpy(driver_data, cluster_query->DriverData, sizeof(DriverData)); + driver_data->TargetType = new_target_type; + driver_data->TargetIndex = cluster_query->RowIndex++; + + return driver_data; + } + + +/*** Close a cluster query instance, releasing any necessary memory and + *** closing any necessary underlying resources. This does not close the + *** underlying driver instance, which must be closed with clusterClose(). + *** + *** @param qy_v The affected query instance. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns 0, success. + ***/ +int clusterQueryClose(void* qy_v, pObjTrxTree* oxt) + { + tprintf("Warning: clusterQueryClose() is under active development.\n"); + + nmFree(qy_v, sizeof(ClusterQuery)); + return 0; + } + + +/*** Get the type of a cluster driver instance attribute. + *** + *** @param inf_v The driver instance. + *** @param attr_name The name of the requested attribute. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns The datatype, see datatypes.h for a list of valid datatypes. + *** + *** LINK ../../centrallix-lib/include/datatypes.h:72 + ***/ +int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + + /** Performance shortcut for frequently requested attributes: val1, val2, and sim. **/ + if (attr_name[0] == 'v' || attr_name[0] == 's') goto handle_targets; + + /** Debug info. **/ + if (oxt == NULL) tprintf(" > "); + tprintf("Call to clusterGetAttrType(%s)\n", attr_name); + + /** Guard possible segfault. **/ + if (attr_name == NULL) + { + fprintf(stderr, "Warning: Call to clusterGetAttrType() with NULL attribute name.\n"); + return DATA_T_UNAVAILABLE; + } + + /** Types for general attributes. 
**/ + if (strcmp(attr_name, "name") == 0 + || strcmp(attr_name, "annotation") == 0 + || strcmp(attr_name,"content_type") == 0 + || strcmp(attr_name, "inner_type") == 0 + || strcmp(attr_name,"outer_type") == 0) + return DATA_T_STRING; + if (strcmp(attr_name, "last_modification") == 0) + return DATA_T_DATETIME; + + /** Types for specific data targets. **/ + handle_targets: + switch (driver_data->TargetType) + { + case TARGET_ROOT: + if (strcmp(attr_name, "source") == 0 + || strcmp(attr_name, "attr_name") == 0) + return DATA_T_STRING; + break; + + case TARGET_CLUSTER: + if (strcmp(attr_name, "algorithm") == 0 + || strcmp(attr_name, "similarity_measure") == 0) + return DATA_T_STRING; + if (strcmp(attr_name, "num_clusters") == 0 + || strcmp(attr_name, "max_iterations") == 0) + return DATA_T_INTEGER; + if (strcmp(attr_name, "min_improvement") == 0 + || strcmp(attr_name, "average_similarity") == 0 + || strcmp(attr_name, "size") == 0) + return DATA_T_DOUBLE; + break; + + case TARGET_SEARCH: + if (strcmp(attr_name, "source") == 0 + || strcmp(attr_name, "similarity_measure") == 0) + return DATA_T_STRING; + if (strcmp(attr_name, "threshold") == 0) + return DATA_T_DOUBLE; + break; + + case TARGET_CLUSTER_ENTRY: + if (strcmp(attr_name, "id") == 0) + return DATA_T_INTEGER; + if (strcmp(attr_name, "val") == 0) + { + /** TODO: Replace with type calculation. **/ + return DATA_T_STRING; + } + if (strcmp(attr_name, "sim") == 0) + return DATA_T_DOUBLE; + break; + + case TARGET_SEARCH_ENTRY: + if (strcmp(attr_name, "id1") == 0 + || strcmp(attr_name, "id2") == 0) + return DATA_T_INTEGER; + if (strcmp(attr_name, "val1") == 0 + || strcmp(attr_name, "val2") == 0) + { + /** TODO: Replace with type calculation. 
**/ + return DATA_T_STRING; + } + if (strcmp(attr_name, "sim") == 0) + return DATA_T_DOUBLE; + break; + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return DATA_T_UNAVAILABLE; + } + + return DATA_T_UNAVAILABLE; + } + + +/*** Get the value of a cluster driver instance attribute. + *** + *** @param inf_v Node data containing the list of paramenters. + *** @param attr_name The name of the requested paramenter. + *** @param datatype The expected datatype of the parameter value. + *** See datatypes.h for a list of valid datatypes. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @param val A pointer to a location where a pointer to the requested + *** data should be stored. Typically, the caller creates a local variable + *** to store this pointer, then passes a pointer to that local variable + *** so that they will have a pointer to the data. + *** This buffer will not be modified unless the data is successfully + *** found. If a value other than 0 is returned, the buffer is not updated. + *** @returns 0 if successsful, + *** -1 if an error occures. + *** + *** LINK ../../centrallix-lib/include/datatypes.h:72 + ***/ +int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt) + { + pDriverData driver_data = (pDriverData)inf_v; + + /** Performance shortcut for frequently requested attributes: val1, val2, and sim. **/ + if ( + (attr_name[0] == 'v' && datatype == DATA_T_STRING) /* val, val1, val2 : String */ + || (attr_name[0] == 's' && datatype == DATA_T_DOUBLE) /* sim : double */ + ) goto handle_targets; + + /** Debug info. **/ + tprintf("Call to clusterGetAttrValue(%s)\n", attr_name); + + /** Type check. 
**/ + const int expected_datatype = clusterGetAttrType(inf_v, attr_name, NULL); + if (datatype != expected_datatype) + { + mssErrorf(1, "Cluster", + "Type mismatch: Accessing attribute '%s' : %s as type %s.", + attr_name, ci_TypeToStr(expected_datatype), ci_TypeToStr(datatype) + ); + return -1; + } + + /** Handle name and annotation. **/ + if (strcmp(attr_name, "name") == 0) + { + switch (driver_data->TargetType) + { + case TARGET_ROOT: + val->String = ((pSourceData)driver_data->TargetData)->Name; + break; + + case TARGET_CLUSTER: + case TARGET_CLUSTER_ENTRY: + val->String = ((pClusterData)driver_data->TargetData)->Name; + break; + + case TARGET_SEARCH: + case TARGET_SEARCH_ENTRY: + val->String = ((pSearchData)driver_data->TargetData)->Name; + break; + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return -1; + } + + return 0; + } + if (strcmp(attr_name, "annotation") == 0) + { + switch (driver_data->TargetType) + { + case TARGET_ROOT: val->String = "Clustering driver."; break; + case TARGET_CLUSTER: val->String = "Clustering driver: Cluster."; break; + case TARGET_CLUSTER_ENTRY: val->String = "Clustering driver: Cluster Entry."; break; + case TARGET_SEARCH: val->String = "Clustering driver: Search."; break; + case TARGET_SEARCH_ENTRY: val->String = "Clustering driver: Cluster Entry."; break; + } + return 0; + } + + /** Return the appropriate types. **/ + if (strcmp(attr_name, "outer_type") == 0) + { + val->String = "system/row"; + return 0; + } + if (strcmp(attr_name, "content_type") == 0 + || strcmp(attr_name, "inner_type") == 0) + { + val->String = "system/void"; + return 0; + } + + /** Last modification is not implemented yet. **/ + if (strcmp(attr_name, "last_modification") == 0) return 1; /* null */ + + /** Handle attributes for specific data targets. 
**/ + handle_targets: + switch (driver_data->TargetType) + { + case TARGET_ROOT: + if (strcmp(attr_name, "source") == 0) + { + val->String = ((pSourceData)driver_data->TargetData)->SourcePath; + return 0; + } + if (strcmp(attr_name, "attr_name") == 0) + { + val->String = ((pSourceData)driver_data->TargetData)->AttrName; + return 0; + } + break; + + case TARGET_CLUSTER: + { + pClusterData target = (pClusterData)driver_data->TargetData; + + if (strcmp(attr_name, "algorithm") == 0) + { + val->String = ci_ClusteringAlgorithmToString(target->ClusterAlgorithm); + return 0; + } + if (strcmp(attr_name, "similarity_measure") == 0) + { + val->String = ci_SimilarityMeasureToString(target->SimilarityMeasure); + return 0; + } + if (strcmp(attr_name, "num_clusters") == 0) + { + if (target->NumClusters > INT_MAX) + fprintf(stderr, "Warning: num_clusters value of %u exceeds INT_MAX.\n", target->NumClusters); + val->Integer = (int)target->NumClusters; + return 0; + } + if (strcmp(attr_name, "max_iterations") == 0) + { + if (target->MaxIterations > INT_MAX) + fprintf(stderr, "Warning: max_iterations value of %u exceeds INT_MAX.\n", target->MaxIterations); + val->Integer = (int)target->MaxIterations; + return 0; + } + if (strcmp(attr_name, "min_improvement") == 0) + { + val->Double = target->MinImprovement; + return 0; + } + if (strcmp(attr_name, "average_similarity") == 0 + || strcmp(attr_name, "size") == 0) + { + mssErrorf(1, "Cluster", "average_similarity is not implemented."); + return -1; + } + break; + } + + case TARGET_SEARCH: + { + pSearchData target = (pSearchData)driver_data->TargetData; + + if (strcmp(attr_name, "source") == 0) + { + val->String = target->Source->Name; + return 0; + } + if (strcmp(attr_name, "similarity_measure") == 0) + { + val->String = ci_SimilarityMeasureToString(target->SimilarityMeasure); + return 0; + } + if (strcmp(attr_name, "threshold") == 0) + { + val->Double = target->Threshold; + return 0; + } + } + + case TARGET_CLUSTER_ENTRY: + { + 
pClusterData target = (pClusterData)driver_data->TargetData; + + if (strcmp(attr_name, "id") == 0) + { + val->Integer = (int)target->Labels[driver_data->TargetIndex]; + return 0; + } + if (strcmp(attr_name, "val") == 0) + { + val->String = driver_data->NodeData->SourceData->Data[driver_data->TargetIndex]; + return 0; + } + if (strcmp(attr_name, "sim") == 0) + { + mssErrorf(1, "Cluster", "Cluster entry similarity is not supported."); + return -1; + } + break; + } + + case TARGET_SEARCH_ENTRY: + { + pSearchData target = (pSearchData)driver_data->TargetData; + pDup target_dup = target->Dups[driver_data->TargetIndex]; + + if (strcmp(attr_name, "id1") == 0) + { + val->Integer = (int)target_dup->id1; + return 0; + } + if (strcmp(attr_name, "id2") == 0) + { + val->Integer = (int)target_dup->id2; + return 0; + } + if (strcmp(attr_name, "val1") == 0) + { + val->String = driver_data->NodeData->SourceData->Data[target_dup->id1]; + // val->Integer = (int)target_dup->id1; + return 0; + } + if (strcmp(attr_name, "val2") == 0) + { + val->String = driver_data->NodeData->SourceData->Data[target_dup->id2]; + // val->Integer = (int)target_dup->id2; + return 0; + } + if (strcmp(attr_name, "sim") == 0) + { + val->Double = target_dup->similarity; + return 0; + } + break; + } + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return -1; + } + + /** Unknown attribute. **/ + char* name; + clusterGetAttrValue(inf_v, "name", DATA_T_STRING, POD(&name), NULL); + mssErrorf(1, "Cluster", + "Unknown attribute '%s' for cluster object %s (target type: %u, \"%s\").", + attr_name, driver_data->NodeData->SourceData->Name, driver_data->TargetType, name + ); + + return -1; + } + + +/*** Returns the name of the first attribute that one can get from + *** this driver instance (using GetAttrType() and GetAttrValue()). + *** Resets the internal variable (TargetAttrIndex) used to maintain + *** itteration state for clusterGetNextAttr(). 
 ***
 *** @param inf_v The driver instance to be read.
 *** @param oxt Unused.
 *** @returns The name of the first attribute.
 ***/
char* clusterGetFirstAttr(void* inf_v, pObjTrxTree oxt)
    {
    tprintf("Warning: clusterGetFirstAttr() is under active development.\n");
    pDriverData driver_data = (pDriverData)inf_v;
    driver_data->TargetAttrIndex = 0u;
    return clusterGetNextAttr(inf_v, oxt);
    }


/*** Returns the name of the next attribute that one can get from
 *** this driver instance (using GetAttrType() and GetAttrValue()).
 *** Uses an internal variable (TargetAttrIndex) used to maintain
 *** the state of this iteration over repeated calls.
 ***
 *** @param inf_v The driver instance to be read.
 *** @param oxt Unused.
 *** @returns The name of the next attribute.
 ***/
char* clusterGetNextAttr(void* inf_v, pObjTrxTree oxt)
    {
    tprintf("Warning: clusterGetNextAttr(");
    pDriverData driver_data = (pDriverData)inf_v;
    const unsigned int i = driver_data->TargetAttrIndex++;
    tprintf("%u) is under active development.\n", i);
    /** Each target type exposes its own fixed attribute-name table. **/
    switch (driver_data->TargetType)
        {
        case TARGET_ROOT: return (i < nATTR_ROOT) ? ATTR_ROOT[i] : END_OF_ATTRIBUTES;
        case TARGET_CLUSTER: return (i < nATTR_CLUSTER) ? ATTR_CLUSTER[i] : END_OF_ATTRIBUTES;
        case TARGET_SEARCH: return (i < nATTR_SEARCH) ? ATTR_SEARCH[i] : END_OF_ATTRIBUTES;
        case TARGET_CLUSTER_ENTRY: return (i < nATTR_CLUSTER_ENTRY) ? ATTR_CLUSTER_ENTRY[i] : END_OF_ATTRIBUTES;
        case TARGET_SEARCH_ENTRY: return (i < nATTR_SEARCH_ENTRY) ? ATTR_SEARCH_ENTRY[i] : END_OF_ATTRIBUTES;
        default:
            mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType);
            return NULL;
        }
    }


/*** Get the capabilities of the driver instance object.
 ***
 *** @param inf_v The driver instance to be checked.
 *** @param info The struct to be populated with driver flags.
 *** @returns 0 if successful,
 ***     -1 if the driver is an unimplemented type (should never happen).
+ ***/ +int clusterInfo(void* inf_v, pObjectInfo info) + { + tprintf("Warning: clusterInfo() is under active development.\n"); + pDriverData driver_data = (pDriverData)inf_v; + pNodeData node_data = (pNodeData)driver_data->NodeData; + + /** Reset flags buffer. **/ + info->Flags = 0; + + /** Disallow unsupported functionality. **/ + info->Flags |= OBJ_INFO_F_CANT_ADD_ATTR; + info->Flags |= OBJ_INFO_F_CANT_HAVE_CONTENT; + info->Flags |= OBJ_INFO_F_NO_CONTENT; + + switch (driver_data->TargetType) + { + case TARGET_ROOT: + info->nSubobjects = node_data->nClusters + node_data->nSearches; + info->Flags |= OBJ_INFO_F_CAN_HAVE_SUBOBJ; + info->Flags |= OBJ_INFO_F_SUBOBJ_CNT_KNOWN; + info->Flags |= (info->nSubobjects > 0) ? OBJ_INFO_F_HAS_SUBOBJ : OBJ_INFO_F_NO_SUBOBJ; + break; + + case TARGET_CLUSTER: + info->Flags |= OBJ_INFO_F_CAN_HAVE_SUBOBJ; + info->Flags |= OBJ_INFO_F_HAS_SUBOBJ; /* Data must not be empty. */ + + /*** Clusters always have one label per vector. + *** If we know how many vectors are in the dataset, + *** we know how many labels this cluster will have, + *** even if it hasn't been computed yet. + ***/ + if (node_data->SourceData->Vectors != NULL) + { + info->Flags |= OBJ_INFO_F_SUBOBJ_CNT_KNOWN; + info->nSubobjects = node_data->SourceData->nVectors; + } + break; + + case TARGET_SEARCH: + { + pSearchData search_data = (pSearchData)driver_data->TargetData; + info->Flags |= OBJ_INFO_F_CAN_HAVE_SUBOBJ; + if (search_data->Dups != NULL) + { + info->nSubobjects = search_data->nDups; + info->Flags |= OBJ_INFO_F_SUBOBJ_CNT_KNOWN; + info->Flags |= (info->nSubobjects > 0) ? OBJ_INFO_F_HAS_SUBOBJ : OBJ_INFO_F_NO_SUBOBJ; + } + break; + } + + case TARGET_CLUSTER_ENTRY: + case TARGET_SEARCH_ENTRY: + /** No Subobjects. 
 **/
            info->Flags |= OBJ_INFO_F_CANT_HAVE_SUBOBJ;
            info->Flags |= OBJ_INFO_F_NO_SUBOBJ;
            info->Flags |= OBJ_INFO_F_SUBOBJ_CNT_KNOWN;
            info->nSubobjects = 0;
            break;

        default:
            mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType);
            return -1;
        }

    tprintf("Info result: "INT_TO_BINARY_PATTERN"\n", INT_TO_BINARY(info->Flags));
    return 0;
    }


/** ================ Method Execution Functions ================ **/
/** ANCHOR[id=method] **/
// LINK #functions

/*** Returns the name of the first method that one can execute from
 *** this driver instance (using clusterExecuteMethod()). Resets the
 *** internal variable (TargetMethodIndex) used to maintain iteration
 *** state for clusterGetNextMethod().
 ***
 *** @param inf_v The driver instance to be read.
 *** @param oxt Unused.
 *** @returns The name of the first method.
 ***/
char* clusterGetFirstMethod(void* inf_v, pObjTrxTree oxt)
    {
    tprintf("Warning: clusterGetFirstMethod() is under active development.\n");
    pDriverData driver_data = (pDriverData)inf_v;
    driver_data->TargetMethodIndex = 0u;
    return clusterGetNextMethod(inf_v, oxt);
    }


/*** Returns the name of the next method that one can get from
 *** this driver instance (using GetAttrType() and GetAttrValue()).
 *** Uses an internal variable (TargetMethodIndex) used to maintain
 *** the state of this iteration over repeated calls.
 ***
 *** @param inf_v The driver instance to be read.
 *** @param oxt Unused.
 *** @returns The name of the next method.
 ***/
char* clusterGetNextMethod(void* inf_v, pObjTrxTree oxt)
    {
    tprintf("Warning: clusterGetNextMethod(");
    pDriverData driver_data = (pDriverData)inf_v;
    const unsigned int i = driver_data->TargetMethodIndex++;
    tprintf("%u) is under active development.\n", i);
    return (i < nMETHOD_NAME) ? METHOD_NAME[i] : END_OF_METHODS;
    }

/** Intended for use in xhForEach().
**/ +static int ci_PrintEntry(pXHashEntry entry, void* arg) + { + /** Extract entry. **/ + char* key = entry->Key; + void* data = entry->Data; + + /** Extract args. **/ + void** args = (void**)arg; + unsigned int* type_id_ptr = (unsigned int*)args[0]; + unsigned int* total_bytes_ptr = (unsigned int*)args[1]; + char* path = (char*)args[2]; + + /** If a path is provided, check that it matches the start of the key. **/ +// if (path != NULL) printf("Comparing \"%s\" to \"%s\"[0,%lu].\n", path, key, strlen((char*)path)); + if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return 0; + + /** Handle type. **/ + char* type; + char* name; + unsigned int bytes; + switch (*type_id_ptr) + { + case 1u: + { + pSourceData source_data = (pSourceData)data; + type = "Source"; + name = source_data->Name; + bytes = ci_SizeOfSourceData(source_data); + break; + } + case 2u: + { + pClusterData cluster_data = (pClusterData)data; + type = "Cluster"; + name = cluster_data->Name; + bytes = ci_SizeOfClusterData(cluster_data, false); + break; + } + case 3u: + { + pSearchData search_data = (pSearchData)data; + type = "Search"; + name = search_data->Name; + bytes = ci_SizeOfSearchData(search_data); + break; + } + default: assert(false); + } + + /** Increment total bytes. **/ + *total_bytes_ptr += bytes; + + char buf[12]; + snprint_bytes(buf, sizeof(buf), bytes); + printf("%-8s %-16s %-12s \"%s\"\n", type, name, buf, key); + + return 0; + } + + +/*** Executes a method with the given name. + *** + *** @param inf_v The affected driver instance. + *** @param method_name The name of the method. + *** @param param A possibly optional param passed to the method. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + ***/ +int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrxTree oxt) + { + tprintf("Warning: clusterExecuteMethod(\"%s\") is under active development.\n", method_name); + + /** Cache management method. 
**/ + if (strcmp(method_name, "cache") == 0) + { + /** Second parameter is required. **/ + if (param->String == NULL) + { + mssErrorf(1, "Cluster", + "param : \"show\" | \"show_all\" | \"drop_all\" is required for the cache method." + ); + return -1; + } + + /** Show cache. **/ + if (strcmp(param->String, "show") == 0) + { + const pObject obj = ((pDriverData)inf_v)->NodeData->Obj; + char* path = obj_internal_PathPart(obj->Pathname, 0, obj->SubPtr); + + /** Print cache info table. **/ + unsigned int i = 1u, source_bytes = 0u, cluster_bytes = 0u, search_bytes = 0u; + printf("\nShowing cache for \"%s\":\n", path); + printf("%-8s %-16s %-12s %s\n", "Type", "Name", "Size", "Cache Entry Key"); + xhForEach(&ClusterCaches.SourceCache, ci_PrintEntry, (void*[]){&i, &source_bytes, path}); i++; + xhForEach(&ClusterCaches.ClusterCache, ci_PrintEntry, (void*[]){&i, &cluster_bytes, path}); i++; + xhForEach(&ClusterCaches.SearchCache, ci_PrintEntry, (void*[]){&i, &search_bytes, path}); i++; + + /** Print stats. **/ + char buf[16]; + printf("\nCache Stats:\n"); + printf("%-8s %-4s %-12s\n", "", "#", "Total Size"); + const int n_sources = ClusterCaches.SourceCache.nItems; + snprint_bytes(buf, sizeof(buf), source_bytes); + printf("%-8s %-4d %-12s\n", "Source", n_sources, buf); + const int n_clusters = ClusterCaches.ClusterCache.nItems; + snprint_bytes(buf, sizeof(buf), cluster_bytes); + printf("%-8s %-4d %-12s\n", "Cluster", n_clusters, buf); + const int n_searches = ClusterCaches.SearchCache.nItems; + snprint_bytes(buf, sizeof(buf), search_bytes); + printf("%-8s %-4d %-12s\n", "Search", n_searches, buf); + snprint_bytes(buf, sizeof(buf), source_bytes + cluster_bytes + search_bytes); + printf("%-8s %-4d %-12s\n\n", "Total", n_sources + n_clusters + n_searches, buf); + return 0; + } + + + /** Show all cache. **/ + if (strcmp(param->String, "show_all") == 0) + { + /** Print cache info table. 
**/ + unsigned int i = 1u, total_bytes = 0u; + tprintf("Showing cluster driver cache for all files...\n"); + printf("%-8s %-16s %-12s %s\n", "Type", "Name", "Size", "Cache Entry Key"); + xhForEach(&ClusterCaches.SourceCache, ci_PrintEntry, (void*[]){&i, &total_bytes, NULL}); i++; + xhForEach(&ClusterCaches.ClusterCache, ci_PrintEntry, (void*[]){&i, &total_bytes, NULL}); i++; + xhForEach(&ClusterCaches.SearchCache, ci_PrintEntry, (void*[]){&i, &total_bytes, NULL}); i++; + + /** Print total size. **/ + char buf[16]; + snprint_bytes(buf, sizeof(buf), total_bytes); + printf("Total cache size: %s\n", buf); + return 0; + } + + /** Drop allcache. **/ + if (strcmp(param->String, "drop_all") == 0) + { + tprintf("Dropping cluster driver cache for all files...\n"); + /*** Free caches in reverse of the order they are created in case + *** cached data relies on its source during the freeing process. + ***/ + xhClearKeySafe(&ClusterCaches.SearchCache, ci_CacheFreeSearch, NULL); + xhClearKeySafe(&ClusterCaches.ClusterCache, ci_CacheFreeCluster, NULL); + xhClearKeySafe(&ClusterCaches.SourceCache, ci_CacheFreeSourceData, NULL); + printf("Cache dropped.\n"); + return 0; + } + + /** Unknown parameter. **/ + mssErrorf(1, "Cluster", + "Expected param : \"show\" | \"show_all\" | \"drop_all\" the cache method, but got: \"%s\"", + param->String + ); + return -1; + } + + return -1; + } + +/** ================ Unimplemented Functions ================ **/ +/** ANCHOR[id=unimplemented] **/ +// LINK #functions + +/** Not implemented. **/ +int clusterCreate(pObject obj, int mask, pContentType systype, char* usrtype, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterCreate() is not implemented."); + return -ENOSYS; + } +/** Not implemented. **/ +int clusterDeleteObj(void* inf_v, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterDeleteObj() is not implemented."); + return -1; + } +/** Not implemented. 
**/ +int clusterDelete(pObject obj, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterDelete() is not implemented."); + return -1; + } +/** Not implemented. **/ +int clusterRead(void* inf_v, char* buffer, int maxcnt, int offset, int flags, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterRead() not implemented."); + fprintf(stderr, "HINT: Use queries instead, (e.g. clusterOpenQuery()).\n"); + return -1; + } +/** Not implemented. **/ +int clusterWrite(void* inf_v, char* buffer, int cnt, int offset, int flags, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterWrite() not implemented because clusters are imutable."); + return -1; + } +/** Not implemented. **/ +int clusterSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree oxt) + { + mssErrorf(1, "Cluster", "clusterSetAttrValue() not implemented because clusters are imutable."); + return -1; + } +/** Not implemented. **/ +int clusterAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree oxt) + { + mssErrorf(1, "Cluster", "clusterAddAttr() not implemented because clusters are imutable."); + return -1; + } +/** Not implemented. **/ +void* clusterOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree oxt) + { + mssErrorf(1, "Cluster", "clusterOpenAttr() not implemented."); + return NULL; + } +/** Not implemented. **/ +int clusterCommit(void* inf_v, pObjTrxTree *oxt) + { + mssErrorf(1, "Cluster", "clusterCommit() not implemented because clusters are imutable."); + return 0; + } +/** Not implemented. **/ +pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt) + { + mssErrorf(1, "Cluster", "clusterPresentationHints() not implemented."); + return NULL; + } + + +/*** Initialize the driver. This includes: + *** - Registering the driver with the objectsystem. + *** - Registering structs with newmalloc for debugging. + *** - Initializing global data needed for the driver. 
+ *** + *** @returns 0 if successful, or + *** a negative value if an error occured. + ***/ +int clusterInitialize(void) + { + int ret; + /** Initialize library. **/ + ca_init(); + + /** Allocate the driver. **/ + pObjDriver drv = (pObjDriver)nmMalloc(sizeof(ObjDriver)); + if (drv == NULL) return -1; + memset(drv, 0, sizeof(ObjDriver)); + + /** Initialize globals. **/ + memset(&ClusterCaches, 0, sizeof(ClusterCaches)); + ret = xhInit(&ClusterCaches.SourceCache, 251, 0); + if (ret < 0) return ret; + ret = xhInit(&ClusterCaches.ClusterCache, 251, 0); + if (ret < 0) return ret; + ret = xhInit(&ClusterCaches.SearchCache, 251, 0); + if (ret < 0) return ret; + + /** Setup the structure. **/ + strcpy(drv->Name, "clu - Clustering Driver"); + drv->Capabilities = OBJDRV_C_TRANS | OBJDRV_C_FULLQUERY; // OBJDRV_C_TRANS | OBJDRV_C_FULLQUERY; + ret = xaInit(&(drv->RootContentTypes), 1); + if (ret < 0) return ret; + ret = xaAddItem(&(drv->RootContentTypes), "system/cluster"); + if (ret < 0) return ret; + + /** Setup the function references. **/ + drv->Open = clusterOpen; + drv->Close = clusterClose; + drv->Create = clusterCreate; + drv->Delete = clusterDelete; + drv->DeleteObj = clusterDeleteObj; + drv->OpenQuery = clusterOpenQuery; + drv->QueryDelete = NULL; + drv->QueryFetch = clusterQueryFetch; + drv->QueryClose = clusterQueryClose; + drv->Read = clusterRead; + drv->Write = clusterWrite; + drv->GetAttrType = clusterGetAttrType; + drv->GetAttrValue = clusterGetAttrValue; + drv->GetFirstAttr = clusterGetFirstAttr; + drv->GetNextAttr = clusterGetNextAttr; + drv->SetAttrValue = clusterSetAttrValue; + drv->AddAttr = clusterAddAttr; + drv->OpenAttr = clusterOpenAttr; + drv->GetFirstMethod = clusterGetFirstMethod; + drv->GetNextMethod = clusterGetNextMethod; + drv->ExecuteMethod = clusterExecuteMethod; + drv->Commit = clusterCommit; + drv->Info = clusterInfo; + drv->PresentationHints = clusterPresentationHints; + + /** Register some structures. 
**/ + nmRegister(sizeof(ClusterData), "ClusterData"); + nmRegister(sizeof(SearchData), "ClusterSearch"); + nmRegister(sizeof(SourceData), "ClusterSourceData"); + nmRegister(sizeof(NodeData), "ClusterNodeData"); + nmRegister(sizeof(DriverData), "ClusterDriverData"); + nmRegister(sizeof(ClusterQuery), "ClusterQuery"); + nmRegister(sizeof(ClusterCaches), "ClusterCaches"); + + /** Print debug size info. **/ + char cluster_size_buf[16]; + char search_size_buf[16]; + char source_size_buf[16]; + char node_size_buf[16]; + char driver_size_buf[16]; + char query_size_buf[16]; + char caches_size_buf[16]; + tprintf( + "Cluster driver struct sizes:\n" + " > sizeof(ClusterData): %s\n" + " > sizeof(SearchData): %s\n" + " > sizeof(SourceData): %s\n" + " > sizeof(NodeData): %s\n" + " > sizeof(DriverData): %s\n" + " > sizeof(ClusterQuery): %s\n" + " > sizeof(ClusterCaches): %s\n", + snprint_bytes(cluster_size_buf, sizeof(cluster_size_buf), sizeof(ClusterData)), + snprint_bytes(search_size_buf, sizeof(search_size_buf), sizeof(SearchData)), + snprint_bytes(source_size_buf, sizeof(source_size_buf), sizeof(SourceData)), + snprint_bytes(node_size_buf, sizeof(node_size_buf), sizeof(NodeData)), + snprint_bytes(driver_size_buf, sizeof(driver_size_buf), sizeof(DriverData)), + snprint_bytes(query_size_buf, sizeof(query_size_buf), sizeof(ClusterQuery)), + snprint_bytes(caches_size_buf, sizeof(caches_size_buf), sizeof(ClusterCaches)) + ); + + /** Register the driver. 
**/ + ret = objRegisterDriver(drv); + if (ret < 0) return ret; + + return 0; + } diff --git a/centrallix/tests/test_expfn_double_metaphone_00.cmp b/centrallix/tests/test_expfn_double_metaphone_00.cmp new file mode 100644 index 000000000..d13cf05ca --- /dev/null +++ b/centrallix/tests/test_expfn_double_metaphone_00.cmp @@ -0,0 +1,140 @@ +Attribute [result]: string "TST`TST" +Attribute [result]: string "PSK`PSK" +Attribute [result]: string "SNTRLKS`SNTRLKS" +Attribute [result]: string "LRNS`LRNS" +Attribute [result]: string "FLPS`FLPS" +Attribute [result]: string "AKSPTNNS`AKSPTNKNS" +Attribute [result]: string "SPRKLFRJLSTSKSPLTSS`SPRKLFRKLSTSKSPLTXS" +Attribute [result]: string "SKTLPKSSTSLKRFLKRPS`SKTLPKSSTSLKRFLKRPS" +Attribute [result]: string "SM0`XMT" +Attribute [result]: string "XMT`SMT" +Attribute [result]: string "SNTR`XNTR" +Attribute [result]: string "XNTR`SNTR" +Attribute [result]: string "ARN`ARNF" +Attribute [result]: string "ARNF`ARNF" +Attribute [result]: string "AKST`AKST" +Attribute [result]: string "AKSTNT`AKSTNT" +Attribute [result]: string "AKTL`AKTL" +Attribute [result]: string "ARX`ARK" +Attribute [result]: string "ART`ARTS" +Attribute [result]: string "PKS`PKS" +Attribute [result]: string "PX`PX" +Attribute [result]: string "PJTR`PHTR" +Attribute [result]: string "PLX`PLX" +Attribute [result]: string "PRTX`PRTX" +Attribute [result]: string "PJ`PK" +Attribute [result]: string "P`P" +Attribute [result]: string "PR`PR" +Attribute [result]: string "PRTN`PRTN" +Attribute [result]: string "KPRL`KPR" +Attribute [result]: string "SSR`SSR" +Attribute [result]: string "KKN`KKN" +Attribute [result]: string "KMPL`KMPL" +Attribute [result]: string "KRLL`KRLL" +Attribute [result]: string "KRLL`KRLL" +Attribute [result]: string "KMSTR`KMSTR" +Attribute [result]: string "KNT`KNT" +Attribute [result]: string "KRS`KRS" +Attribute [result]: string "KF`KF" +Attribute [result]: string "SRN`XRN" +Attribute [result]: string "TM`TM" +Attribute [result]: string 
"ATKR`ATKR" +Attribute [result]: string "AJ`AJ" +Attribute [result]: string "FLPTS`FLPFX" +Attribute [result]: string "FKX`FKX" +Attribute [result]: string "KLKS`KKS" +Attribute [result]: string "KRMNK`JRMNK" +Attribute [result]: string "JRTL`JRTL" +Attribute [result]: string "JLN`JLN" +Attribute [result]: string "KSPL`KSPL" +Attribute [result]: string "KF`KF" +Attribute [result]: string "KRK`KRK" +Attribute [result]: string "HKMR`HKMR" +Attribute [result]: string "H`H" +Attribute [result]: string "ALNT`ALNT" +Attribute [result]: string "AL`AL" +Attribute [result]: string "ATLN`ATLN" +Attribute [result]: string "JNKLTS`ANKLFX" +Attribute [result]: string "HS`HS" +Attribute [result]: string "LF`LF" +Attribute [result]: string "MKFR`MKFR" +Attribute [result]: string "MKRKR`MKRKR" +Attribute [result]: string "MNKR`MNJR" +Attribute [result]: string "MK`MK" +Attribute [result]: string "MKLFLN`MKLFLN" +Attribute [result]: string "MKL`MXL" +Attribute [result]: string "MTL`MTL" +Attribute [result]: string "ARKSTR`ARKSTR" +Attribute [result]: string "ARKT`ARKT" +Attribute [result]: string "PNN`PNN" +Attribute [result]: string "RSPR`RSPR" +Attribute [result]: string "RSN`RSNS" +Attribute [result]: string "RJ`RJR" +Attribute [result]: string "RF`RF" +Attribute [result]: string "SLFTR`SLFTR" +Attribute [result]: string "SNHSNT`SNHSNT" +Attribute [result]: string "XNKR`SKNKR" +Attribute [result]: string "XRMRRN`SKRMRRN" +Attribute [result]: string "XLSNKR`SLSNJR" +Attribute [result]: string "SKL`SKL" +Attribute [result]: string "SKNR`SKNR" +Attribute [result]: string "SKST`SKST" +Attribute [result]: string "XKR`SKR" +Attribute [result]: string "XKR`SKR" +Attribute [result]: string "TKLR`TLR" +Attribute [result]: string "TMS`TMS" +Attribute [result]: string "TMS`TMS" +Attribute [result]: string "0M`TM" +Attribute [result]: string "TXNR`TKNR" +Attribute [result]: string "TF`TF" +Attribute [result]: string "FK`FK" +Attribute [result]: string "AKTLR`FKTLR" +Attribute [result]: 
string "AKSLR`FKSLR" +Attribute [result]: string "ART`FRT" +Attribute [result]: string "SF`SFR" +Attribute [result]: string "ANKLFX`ANKLFK" +Attribute [result]: string "J`J" +Attribute [result]: string "MKLLN`MKLLN" +Attribute [result]: string "MRS`MRS" +Attribute [result]: string "APR`APR" +Attribute [result]: string "KMPRL`KMPR" +Attribute [result]: string "HT`HT" +Attribute [result]: string "K0RN`KTRN" +Attribute [result]: string "K0RN`KTRN" +Attribute [result]: string "RXRT`RKRT" +Attribute [result]: string "PP`PP" +Attribute [result]: string "ARK`ARK" +Attribute [result]: string "JF`KF" +Attribute [result]: string "TF`TF" +Attribute [result]: string "R`R" +Attribute [result]: string "STFN`STFN" +Attribute [result]: string "PRS`PRS" +Attribute [result]: string "RNT`RNT" +Attribute [result]: string "PRN`PRN" +Attribute [result]: string "PRN`PRN" +Attribute [result]: string "AT`AT" +Attribute [result]: string "AT`AT" +Attribute [result]: string "APT`APT" +Attribute [result]: string "PK`PK" +Attribute [result]: string "PKR`PKR" +Attribute [result]: string "XRLS`XRLS" +Attribute [result]: string "KN`KN" +Attribute [result]: string "NM`NM" +Attribute [result]: string "RJ`R" +Attribute [result]: string "KNTN`KNTN" +Attribute [result]: string "A`A" +Attribute [result]: string "XMKR`XMKR" +Attribute [result]: string "SN`XN" +Attribute [result]: string "SKLT`SKLT" +Attribute [result]: string "STXN`STXN" +Attribute [result]: string "MX`MX" +Attribute [result]: string "PS`PTS" +Attribute [result]: string "AKNS`ANS" +Attribute [result]: string "SNS`SNS" +Attribute [result]: string "FNKK`FNKK" +Attribute [result]: string "JSF`HSF" +Attribute [result]: string "APJKT`APJKT" +Attribute [result]: string "SLS`SLS" +Attribute [result]: string "XRF`XRF" +Attribute [result]: string "KS`KS" +Attribute [result]: string "FNKLR`FNKLR" diff --git a/centrallix/tests/test_expfn_double_metaphone_00.to b/centrallix/tests/test_expfn_double_metaphone_00.to new file mode 100644 index 
000000000..efd7548cc --- /dev/null +++ b/centrallix/tests/test_expfn_double_metaphone_00.to @@ -0,0 +1,161 @@ +##NAME double_metaphone() function + +# Special thanks to the following websites for double checking the correct results: +# 1: https://words.github.io/double-metaphone +# 2: https://mainegenealogy.net/metaphone_converter.asp +# 3: https://en.toolpage.org/tool/metaphone + +# These tests were collected from the following sources: +# - Example comments in the source code of exp_double_metaphone.c +# - Maurice Aubrey's Tests* +# - Tests manually written by Israel Fuller +# - Tests written by prompting ChatGPT-5 (preview)** +# +# *Source: https://github.com/gitpan/Text-DoubleMetaphone/blob/master/t/words.txt +# **GPT-5 mini (Preview) was run in GitHub Copilot to suggest the words +# for some tests after analyzing a generated coverage report. I (Israel) +# used the suggestions to write some "AI generated" test cases. +# +# For more information, see the manual test suite implementation at the +# end of the exp_double_metaphone.c file.
+ +query select result = double_metaphone("Test") +query select result = double_metaphone("Basic") +query select result = double_metaphone("Centrallix") +query select result = double_metaphone("Lawrence") +query select result = double_metaphone("Philips") +query select result = double_metaphone("Acceptingness") +query select result = double_metaphone("Supercalifragilisticexpialidocious") +query select result = double_metaphone("Suoicodilaipxecitsiligarfilacrepus") +query select result = double_metaphone("Smith") +query select result = double_metaphone("Schmidt") +query select result = double_metaphone("Snider") +query select result = double_metaphone("Schneider") +query select result = double_metaphone("Arnow") +query select result = double_metaphone("Arnoff") +query select result = double_metaphone("Accede") +query select result = double_metaphone("Accident") +query select result = double_metaphone("Actually") +query select result = double_metaphone("Arch") +query select result = double_metaphone("Artois") +query select result = double_metaphone("Bacchus") +query select result = double_metaphone("Bacci") +query select result = double_metaphone("Bajador") +query select result = double_metaphone("Bellocchio") +query select result = double_metaphone("Bertucci") +query select result = double_metaphone("Biaggi") +query select result = double_metaphone("Bough") +query select result = double_metaphone("Breaux") +query select result = double_metaphone("Broughton") +query select result = double_metaphone("Cabrillo") +query select result = double_metaphone("Caesar") +query select result = double_metaphone("Cagney") +query select result = double_metaphone("Campbell") +query select result = double_metaphone("Carlisle") +query select result = double_metaphone("Carlysle") +query select result = double_metaphone("Chemistry") +query select result = double_metaphone("Chianti") +query select result = double_metaphone("Chorus") +query select result = double_metaphone("Cough") +query 
select result = double_metaphone("Czerny") +query select result = double_metaphone("Dumb") +query select result = double_metaphone("Edgar") +query select result = double_metaphone("Edge") +query select result = double_metaphone("Filipowicz") +query select result = double_metaphone("Focaccia") +query select result = double_metaphone("Gallegos") +query select result = double_metaphone("Germanic") +query select result = double_metaphone("Ghiradelli") +query select result = double_metaphone("Ghislane") +query select result = double_metaphone("Gospel") +query select result = double_metaphone("Gough") +query select result = double_metaphone("Greek") +query select result = double_metaphone("Hochmeier") +query select result = double_metaphone("Hugh") +query select result = double_metaphone("Island") +query select result = double_metaphone("Isle") +query select result = double_metaphone("Italian") +query select result = double_metaphone("Jankelowicz") +query select result = double_metaphone("Jose") +query select result = double_metaphone("Laugh") +query select result = double_metaphone("Mac Caffrey") +query select result = double_metaphone("Mac Gregor") +query select result = double_metaphone("Manager") +query select result = double_metaphone("McHugh") +query select result = double_metaphone("McLaughlin") +query select result = double_metaphone("Michael") +query select result = double_metaphone("Middle") +query select result = double_metaphone("Orchestra") +query select result = double_metaphone("Orchid") +query select result = double_metaphone("Pinyin") +query select result = double_metaphone("Raspberry") +query select result = double_metaphone("Resnais") +query select result = double_metaphone("Rogier") +query select result = double_metaphone("Rough") +query select result = double_metaphone("Salvador") +query select result = double_metaphone("San jacinto") +query select result = double_metaphone("Schenker") +query select result = double_metaphone("Schermerhorn") +query 
select result = double_metaphone("Schlesinger") +query select result = double_metaphone("School") +query select result = double_metaphone("Schooner") +query select result = double_metaphone("Succeed") +query select result = double_metaphone("Sugar") +query select result = double_metaphone("Sugary") +query select result = double_metaphone("Tagliaro") +query select result = double_metaphone("Thames") +query select result = double_metaphone("Thomas") +query select result = double_metaphone("Thumb") +query select result = double_metaphone("Tichner") +query select result = double_metaphone("Tough") +query select result = double_metaphone("Vghee") +query select result = double_metaphone("Wachtler") +query select result = double_metaphone("Wechsler") +query select result = double_metaphone("Word") +query select result = double_metaphone("Xavier") +query select result = double_metaphone("Yankelovich") +query select result = double_metaphone("Zhao") +query select result = double_metaphone("McClellan") +query select result = double_metaphone("maurice") +query select result = double_metaphone("aubrey") +query select result = double_metaphone("cambrillo") +query select result = double_metaphone("heidi") +query select result = double_metaphone("katherine") +query select result = double_metaphone("catherine") +query select result = double_metaphone("richard") +query select result = double_metaphone("bob") +query select result = double_metaphone("eric") +query select result = double_metaphone("geoff") +query select result = double_metaphone("dave") +query select result = double_metaphone("ray") +query select result = double_metaphone("steven") +query select result = double_metaphone("bryce") +query select result = double_metaphone("randy") +query select result = double_metaphone("bryan") +query select result = double_metaphone("brian") +query select result = double_metaphone("otto") +query select result = double_metaphone("auto") +query select result = double_metaphone("Abbott") 
+query select result = double_metaphone("Back") +query select result = double_metaphone("Bacher") +query select result = double_metaphone("Charles") +query select result = double_metaphone("Ghana") +query select result = double_metaphone("Gnome") +query select result = double_metaphone("Raj") +query select result = double_metaphone("Quentin") +query select result = double_metaphone("Who") +query select result = double_metaphone("Shoemaker") +query select result = double_metaphone("Sian") +query select result = double_metaphone("Scold") +query select result = double_metaphone("Station") +query select result = double_metaphone("Match") +query select result = double_metaphone("Pizza") +query select result = double_metaphone("Agnes") +query select result = double_metaphone("Science") +query select result = double_metaphone("Van Gogh") +query select result = double_metaphone("Josef") +query select result = double_metaphone("Object") +query select result = double_metaphone("Sholz") +query select result = double_metaphone("Scharf") +query select result = double_metaphone("Kasia") +query select result = double_metaphone("Van Geller") From 994e99fa9c6f494b9709f4dac99d9924b5b1b95e Mon Sep 17 00:00:00 2001 From: Israel Date: Tue, 14 Oct 2025 11:41:41 -0600 Subject: [PATCH 02/30] Checkpoint: Switching to DM project. 
--- centrallix-lib/Makefile.in | 2 +- centrallix-os/cluster-schema.cluster | 10 +- centrallix/osdrivers/objdrv_cluster.c | 699 ++++++++++++++++++++++---- 3 files changed, 602 insertions(+), 109 deletions(-) diff --git a/centrallix-lib/Makefile.in b/centrallix-lib/Makefile.in index 20c57c11f..0daf7e568 100644 --- a/centrallix-lib/Makefile.in +++ b/centrallix-lib/Makefile.in @@ -66,7 +66,7 @@ TCFLAGS=$(patsubst -DNDEBUG,,$(CFLAGS)) XSTATICFILES=mtask.o mtlexer.o memstr.o xarray.o xhash.o xstring.o mtsession.o newmalloc.o xhashqueue.o bdqs_transport.o xhandle.o xringqueue.o cxsec.o smmalloc.o clusters.o qprintf.o strtcpy.o util.o STATICFILES=$(patsubst %,src/%,$(XSTATICFILES)) -XDYNAMICFILES=mtask.lo mtlexer.lo memstr.lo xarray.lo xhash.lo xstring.lo mtsession.lo newmalloc.lo xhashqueue.lo bdqs_transport.lo xhandle.lo xringqueue.lo cxsec.lo smmalloc.lo clusters.o qprintf.lo strtcpy.lo util.lo +XDYNAMICFILES=mtask.lo mtlexer.lo memstr.lo xarray.lo xhash.lo xstring.lo mtsession.lo newmalloc.lo xhashqueue.lo bdqs_transport.lo xhandle.lo xringqueue.lo cxsec.lo smmalloc.lo clusters.lo qprintf.lo strtcpy.lo util.lo DYNAMICFILES=$(patsubst %,src/%,$(XDYNAMICFILES)) INCLUDEFILES:=$(wildcard include/*.h) diff --git a/centrallix-os/cluster-schema.cluster b/centrallix-os/cluster-schema.cluster index 201c41255..a97d7f9ba 100644 --- a/centrallix-os/cluster-schema.cluster +++ b/centrallix-os/cluster-schema.cluster @@ -51,16 +51,16 @@ file_name "system/cluster" - /average_similarity : double && 0.0 < x < 1.0 - /size = average_similarity - /{arbitrary uint} - - /val : typeof(attr_name) // The value of the data point. + - /val : string // The value of the data point. - /label : uint < num_clusters // id of the cluster to which this data point belongs. - /sim : double && 0.0 < x <= threshold // Similarity to cluster centroid. ... /search_name - /{arbitrary uint} - - /id1 : uint // The id of the first data point. - - /id2 : uint // The id of the second data point. 
- - /val1 : typeof(attr_name) // The value of the first data point. - - /val2 : typeof(attr_name) // The value of the second data point. + - /id1 : uint < sizeof(source/attr_name) // The id of the first data point. + - /id2 : uint < sizeof(source/attr_name) // The id of the second data point. + - /val1 : string // The value of the first data point. + - /val2 : string // The value of the second data point. - /sim : double && 0.0 < x <= threshold // The similarity of the two data points. ... diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index 9ffbd1d22..2369bc1fb 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -74,9 +74,9 @@ /** Debugging **/ -// void void_func() {} -// #define tprintf void_func -#define tprintf printf +void void_func() {} +#define tprintf void_func +// #define tprintf printf /** Defaults for unspecified optional attributes. **/ #define DEFAULT_MIN_IMPROVEMENT 0.0001 @@ -181,33 +181,62 @@ void mssErrorf(int clr, char* module, const char* format, ...) /** TODO: I think this should be moved to datatypes. **/ /** Should maybe replace current type parsing in the presentation hints. **/ +/*** Parse the given string into a datatype. The case of the first character + *** is ignored, but all other characters must be capitalized correctly. + *** + *** @attention - This function is optimized to prevent performance hits + *** situations where it may need to be called many thousands of times. + *** + *** @param str The string to be parsed to a datatype. + *** @returns The datatype. + *** + *** LINK ../../centrallix-lib/include/datatypes.h:72 + ***/ int ci_TypeFromStr(const char* str) { - if (str == NULL) return -1; - - /** Check string length. **/ - const size_t len = strlen(str); - if (len < 3 || 13 < len) return -1; - - /** Copy str to enable mutability. **/ - char buf[len + 1u]; - strcpy(buf, str); - - /** First character is case insensitive. 
**/ - buf[0] = toupper(buf[0]); + /** All valid types are non-null strings, at least 2 characters long. **/ + if (str == NULL || str[0] == '\0' || str[1] == '\0') return -1; /** Check type. **/ - if (strcmp(buf, "Any") == 0) return DATA_T_UNAVAILABLE; - if (strcmp(buf, "Integer") == 0) return DATA_T_INTEGER; - if (strcmp(buf, "String") == 0) return DATA_T_STRING; - if (strcmp(buf, "Double") == 0) return DATA_T_DOUBLE; - if (strcmp(buf, "DateTime") == 0) return DATA_T_DATETIME; - if (strcmp(buf, "IntVecor") == 0) return DATA_T_INTVEC; - if (strcmp(buf, "StringVector") == 0) return DATA_T_STRINGVEC; - if (strcmp(buf, "Money") == 0) return DATA_T_MONEY; - if (strcmp(buf, "Array") == 0) return DATA_T_ARRAY; - if (strcmp(buf, "Code") == 0) return DATA_T_CODE; - if (strcmp(buf, "Binary") == 0) return DATA_T_BINARY; + switch (str[0]) + { + case 'A': case 'a': + if (strcmp(str+1, "Array"+1) == 0) return DATA_T_ARRAY; + if (strcmp(str+1, "Any"+1) == 0) return DATA_T_ANY; + break; + + case 'B': case 'b': + if (strcmp(str+1, "Binary"+1) == 0) return DATA_T_BINARY; + break; + + case 'C': case 'c': + if (strcmp(str+1, "Code"+1) == 0) return DATA_T_CODE; + break; + + case 'D': case 'd': + if (strcmp(str+1, "Double"+1) == 0) return DATA_T_DOUBLE; + if (strcmp(str+1, "DateTime"+1) == 0) return DATA_T_DATETIME; + break; + + case 'I': case 'i': + if (strcmp(str+1, "Integer"+1) == 0) return DATA_T_INTEGER; + if (strcmp(str+1, "IntVecor"+1) == 0) return DATA_T_INTVEC; + break; + + case 'M': case 'm': + if (strcmp(str+1, "Money"+1) == 0) return DATA_T_MONEY; + break; + + case 'S': case 's': + if (strcmp(str+1, "String"+1) == 0) return DATA_T_STRING; + if (strcmp(str+1, "StringVector"+1) == 0) return DATA_T_STRINGVEC; + break; + + case 'U': case 'u': + if (strcmp(str+1, "Unknown"+1) == 0) return DATA_T_UNAVAILABLE; + if (strcmp(str+1, "Unavailable"+1) == 0) return DATA_T_UNAVAILABLE; + break; + } /** Invalid type. 
**/ return -1; @@ -220,21 +249,21 @@ char* ci_TypeToStr(const int type) switch (type) { case DATA_T_UNAVAILABLE: return "Unknown"; - case DATA_T_INTEGER: return "Integer"; - case DATA_T_STRING: return "String"; - case DATA_T_DOUBLE: return "Double"; - case DATA_T_DATETIME: return "DateTime"; - case DATA_T_INTVEC: return "IntVecor"; - case DATA_T_STRINGVEC: return "StringVector"; - case DATA_T_MONEY: return "Money"; - case DATA_T_ARRAY: return "Array"; - case DATA_T_CODE: return "Code"; - case DATA_T_BINARY: return "Binary"; + case DATA_T_INTEGER: return "Integer"; + case DATA_T_STRING: return "String"; + case DATA_T_DOUBLE: return "Double"; + case DATA_T_DATETIME: return "DateTime"; + case DATA_T_INTVEC: return "IntVecor"; + case DATA_T_STRINGVEC: return "StringVector"; + case DATA_T_MONEY: return "Money"; + case DATA_T_ARRAY: return "Array"; + case DATA_T_CODE: return "Code"; + case DATA_T_BINARY: return "Binary"; } /** Invalid type. **/ mssErrorf(1, "Cluster", "Invalid type %d.\n", type); - return "Invalid"; + return "Invalid"; /* Shall not parse to a valid type in ci_TypeFromStr(). */ } /** TODO: I think this should be moved to xarray. **/ @@ -252,6 +281,19 @@ void** ci_xaToTrimmedArray(pXArray arr) return result; } +/** I got tired of forgetting how to do these. 
**/ +#define ci_file_name(obj) \ + ({ \ + __typeof__ (obj) _obj = (obj); \ + obj_internal_PathPart(_obj->Pathname, _obj->SubPtr - 1, 1); \ + }) +#define ci_file_path(obj) \ + ({ \ + __typeof__ (obj) _obj = (obj); \ + obj_internal_PathPart(_obj->Pathname, 0, _obj->SubPtr); \ + }) + + /** ================ Enum Declairations ================ **/ /** ANCHOR[id=enums] **/ @@ -265,6 +307,18 @@ typedef unsigned char ClusterAlgorithm; #define ALGORITHM_KMEDOIDS (ClusterAlgorithm)5u #define ALGORITHM_DB_SCAN (ClusterAlgorithm)6u +#define nClusteringAlgorithms 7u +ClusterAlgorithm ALL_CLUSTERING_ALGORITHMS[nClusteringAlgorithms] = + { + ALGORITHM_NULL, + ALGORITHM_NONE, + ALGORITHM_SLIDING_WINDOW, + ALGORITHM_KMEANS, + ALGORITHM_KMEANS_PLUS_PLUS, + ALGORITHM_KMEDOIDS, + ALGORITHM_DB_SCAN, + }; + /** Converts a clustering algorithm to its string name. **/ char* ci_ClusteringAlgorithmToString(ClusterAlgorithm clustering_algorithm) { @@ -287,6 +341,14 @@ typedef unsigned char SimilarityMeasure; #define SIMILARITY_COSINE (SimilarityMeasure)1u #define SIMILARITY_LEVENSHTEIN (SimilarityMeasure)2u +#define nSimilarityMeasures 3u +SimilarityMeasure ALL_SIMILARITY_MEASURES[nSimilarityMeasures] = + { + SIMILARITY_NULL, + SIMILARITY_COSINE, + SIMILARITY_LEVENSHTEIN, + }; + /** Converts a similarity measure to its string name. 
**/ char* ci_SimilarityMeasureToString(SimilarityMeasure similarity_measure) { @@ -319,39 +381,48 @@ char* const ATTR_ROOT[nATTR_ROOT] = { "source", "attr_name", }; -#define nATTR_CLUSTER 5u -char* const ATTR_CLUSTER[nATTR_CLUSTER] = { +#define nATTR_CLUSTER 7u +char* const ATTR_CLUSTER[nATTR_CLUSTER] = + { "algorithm", "similarity_measure", "num_clusters", "min_improvement", "max_iterations", -}; -#define nATTR_SEARCH 4u -char* const ATTR_SEARCH[nATTR_SEARCH] = { + "date_created", + "date_computed", + }; +#define nATTR_SEARCH 5u +char* const ATTR_SEARCH[nATTR_SEARCH] = + { "source", "threshold", "similarity_measure", -}; + "date_created", + "date_computed", + }; #define nATTR_CLUSTER_ENTRY 2u -char* const ATTR_CLUSTER_ENTRY[nATTR_CLUSTER_ENTRY] = { +char* const ATTR_CLUSTER_ENTRY[nATTR_CLUSTER_ENTRY] = + { "val", "sim", -}; + }; #define nATTR_SEARCH_ENTRY 3u -char* const ATTR_SEARCH_ENTRY[nATTR_SEARCH_ENTRY] = { +char* const ATTR_SEARCH_ENTRY[nATTR_SEARCH_ENTRY] = + { "val1", "val2", "sim", -}; + }; #define END_OF_ATTRIBUTES NULL /** Method name list. **/ #define nMETHOD_NAME 2u -char* const METHOD_NAME[nMETHOD_NAME] = { +char* const METHOD_NAME[nMETHOD_NAME] = + { "cache", -}; + }; #define END_OF_METHODS END_OF_ATTRIBUTES @@ -362,27 +433,31 @@ char* const METHOD_NAME[nMETHOD_NAME] = { typedef struct _SOURCE { /** Top level attributes (specified in the .cluster file). **/ - char* Name; /* The node name, specified in the .cluster file. - * Warning: Some code makes the assumption that this - * is the first field in the struct. - */ - char* Key; /* The key associated with this object in the global SourceCache. */ - char* SourcePath; /* The path to the data source from which to retrieve data. */ - char* AttrName; /* The name of the attribute to get from the data source. */ + char* Name; /* The node name, specified in the .cluster file. + * Warning: Some code makes the assumption that this + * is the first field in the struct. 
+ */ + char* Key; /* The key associated with this object in the global SourceCache. */ + char* SourcePath; /* The path to the data source from which to retrieve data. */ + char* AttrName; /* The name of the attribute to get from the data source. */ /** Computed data. **/ - char** Data; /* The data strings to be clustered and searched, or NULL if they - * have not been fetched from the source. - */ - pVector* Vectors; /* The cosine comparison vectors from the fetched data, or NULL if - * they haven't been computed. Note that vectors are no longer - * needed once all clusters and searches have been computed, so - * they are automatically freed in that case to save memory. - */ - unsigned int nVectors; /* The number of vectors and data strings. Note: This is not - * set to 0 if the vector array is freed, this case should be - * checked separately. - */ + char** Data; /* The data strings to be clustered and searched, or NULL if they + * have not been fetched from the source. + */ + pVector* Vectors; /* The cosine comparison vectors from the fetched data, or NULL if + * they haven't been computed. Note that vectors are no longer + * needed once all clusters and searches have been computed, so + * they are automatically freed in that case to save memory. + */ + unsigned int nVectors; /* The number of vectors and data strings. Note: This is not + * set to 0 if the vector array is freed, this case should be + * checked separately. + */ + + /** Time. **/ + DateTime DateCreated; /* The date and time that this object was created and initialized. */ + DateTime DateComputed; /* The date and time that the Data and Vectors fields were computed. */ } SourceData, *pSourceData; /** Data for each cluster. **/ @@ -415,6 +490,10 @@ typedef struct _CLUSTER * (aka. DriverData->nVectors). For vector i, Labels[i] is * the ID of the cluster to which that data is assigned. * NULL if the cluster has not been computed. */ + + /** Time. 
**/ + DateTime DateCreated; /* The date and time that this object was created and initialized. */ + DateTime DateComputed; /* The date and time that the Labels field was computed. */ } ClusterData, *pClusterData; @@ -437,6 +516,10 @@ typedef struct _SEARCH * if the search has not been computed. */ unsigned int nDups; /* The number of dups found. */ + + /** Time. **/ + DateTime DateCreated; /* The date and time that this object was created and initialized. */ + DateTime DateComputed; /* The date and time that the Dups field was computed. */ } SearchData, *pSearchData; @@ -463,8 +546,6 @@ typedef struct _NODE /** Other stuff, idk why it's here. **/ pSnNode Node; pObject Obj; - char* CreateDateField; - char* ModifyDateField; } NodeData, *pNodeData; @@ -694,6 +775,7 @@ int ci_ParseAttribute( } +// LINK #functions /*** Parses a ClusteringAlgorithm from the algorithm field in the pStructInf *** representing some structure with that attribute in a parsed structure file. *** @@ -729,6 +811,7 @@ ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf inf, pParamObjects param } +// LINK #functions /*** Parses a SimilarityMeasure from the similarity_measure field in the given *** pStructInf parameter, which represents some structure with that attribute *** in a parsed structure file. @@ -760,6 +843,7 @@ SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf inf, pParamObjects param_ } +// LINK #functions /*** Allocates a new pSourceData struct from a parsed pStructInf representing *** a .cluster structure file. *** @@ -816,6 +900,7 @@ pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* p source_data->Key = key; source_data->SourcePath = source_path; source_data->AttrName = attr_name; + check(objCurrentDate(&source_data->DateCreated)); /** Add the new object to the cache for next time. 
**/ tprintf("+ source: \"%s\"\n", key); @@ -829,6 +914,7 @@ pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* p } +// LINK #functions /*** Allocates a new pClusterData struct from a parsed pStructInf. *** *** @attention - Warning: Caching in use. @@ -858,6 +944,7 @@ pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) /** Basic Properties. **/ cluster_data->Name = check_ptr(strdup(inf->Name)); cluster_data->SourceData = source_data; + check(objCurrentDate(&cluster_data->DateCreated)); /** Get algorithm. **/ cluster_data->ClusterAlgorithm = ci_ParseClusteringAlgorithm(inf, param_list); @@ -1046,6 +1133,7 @@ pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) } +// LINK #functions /*** Allocates a new pSearchData struct from a parsed pStructInf. *** *** @attention - Warning: Caching in use. @@ -1068,8 +1156,9 @@ pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) assert(search_data != NULL); memset(search_data, 0, sizeof(SearchData)); - /** Get search name. **/ + /** Get basic information. **/ search_data->Name = check_ptr(strdup(inf->Name)); + check(objCurrentDate(&search_data->DateCreated)); /** Get source. **/ char* source_name; @@ -1147,6 +1236,7 @@ pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) } +// LINK #functions /*** Allocates a new pNodeData struct from a parsed pStructInf. *** *** @attention - Does not use caching directly, but uses subfunctions to @@ -1164,7 +1254,7 @@ pNodeData ci_ParseNodeData(pStructInf inf, pObject obj) int ret; /** Retrieve path so we'll know we have it later. **/ - char* path = obj_internal_PathPart(obj->Pathname, 0, obj->SubPtr); + char* path = ci_file_path(obj); /** Allocate node struct data. 
**/ // pNodeData node_data = NodeData |> sizeof() |> nmMalloc() |> check_ptr(); @@ -1404,6 +1494,7 @@ void ci_FreeSourceData(pSourceData source_data) } +// LINK #functions /*** Free pClusterData struct with an option to recursively free subclusters. *** *** @param cluster_data The cluster data struct to free. @@ -1437,6 +1528,7 @@ void ci_FreeClusterData(pClusterData cluster_data, bool recursive) } +// LINK #functions /** @param search_data A pSearchData struct, freed by this function. **/ void ci_FreeSearchData(pSearchData search_data) { @@ -1451,6 +1543,7 @@ void ci_FreeSearchData(pSearchData search_data) } +// LINK #functions /** @param node_data A pNodeData struct, freed by this function. **/ void ci_FreeNodeData(pNodeData node_data) { @@ -1541,6 +1634,7 @@ unsigned int ci_SizeOfSourceData(pSourceData source_data) } +// LINK #functions /*** Returns the deep size of a ClusterData struct, including the size of all *** allocated substructures. As far as I can tell, this is probably only *** useful for cache management and debugging. @@ -1571,6 +1665,7 @@ unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursive) } +// LINK #functions /*** Returns the deep size of a SearchData struct, including the size of all *** allocated substructures. As far as I can tell, this is probably only *** useful for cache management and debugging. @@ -1591,6 +1686,7 @@ unsigned int ci_SizeOfSearchData(pSearchData search_data) } +// LINK #functions /*** Returns the deep size of a NodeData struct, including the size of all *** allocated substructures. As far as I can tell, this is probably only *** useful for cache management and debugging. @@ -1656,6 +1752,7 @@ void ci_CacheFreeSourceData(pXHashEntry entry, void* _) nmSysFree(key); } +// LINK #functions /** Intended for use in xhClearKeySafe(). 
**/ void ci_CacheFreeCluster(pXHashEntry entry, void* _) { @@ -1669,6 +1766,7 @@ void ci_CacheFreeCluster(pXHashEntry entry, void* _) nmSysFree(key); } +// LINK #functions /** Intended for use in xhClearKeySafe(). **/ void ci_CacheFreeSearch(pXHashEntry entry, void* _) { @@ -1731,6 +1829,10 @@ int ci_ComputeSourceData(pSourceData source_data, pObjSession session) source_data->nVectors = 0; } + /** Record the date and time. **/ + /** Even if this computation fails, we may want this information. **/ + check(objCurrentDate(&source_data->DateComputed)); + /** Time to play shoots-and-ladders in an error-handling jungle of gotos. **/ bool successful = false; int ret; @@ -1929,6 +2031,8 @@ int ci_ComputeSourceData(pSourceData source_data, pObjSession session) return (successful) ? 0 : -1; } + +// LINK #functions /*** Ensures that the cluster_data->Labels has been computed, running the *** specified clustering algorithm if necessary. *** @@ -1955,6 +2059,10 @@ int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) goto err; } + /** Record the date and time. **/ + /** Even if this computation fails, we may want this information. **/ + check(objCurrentDate(&cluster_data->DateComputed)); + /** Allocate static memory for finding clusters. **/ const size_t labels_size = source_data->nVectors * sizeof(unsigned int); cluster_data->Labels = check_ptr(nmMalloc(labels_size)); @@ -2012,6 +2120,8 @@ int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) return -1; } + +// LINK #functions /*** Ensures that the search_data->Dups has been computed, running the a *** search with the specified similarity measure if necessary. *** @@ -2052,6 +2162,10 @@ int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) goto err; } + /** Record the date and time. **/ + /** Even if this computation fails, we may want this information. **/ + check(objCurrentDate(&search_data->DateComputed)); + /** Execute the search. 
**/ tprintf("Invoking ca_search.\n"); Timer timer_i, *timer = timer_start(timer_init(&timer_i)); @@ -2115,6 +2229,7 @@ int ci_GetParamType(void* inf_v, const char* attr_name) } +// LINK #functions /*** Get the value of a parameter. Intended for `expSetParamFunctions()`. *** *** @attention - Warning: If the retrieved value is `NULL`, the pObjectData @@ -2191,7 +2306,7 @@ int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) *** @param obj The object being opened, including the path, session, and *** other necessary information. *** @param mask Driver permission mask (unused). - *** @param systype ? (unused) + *** @param sys_type ? (unused) *** @param usr_type The object system file type being openned. Should always *** be "system/cluster" because this driver is only registered for that *** type of file. @@ -2200,12 +2315,9 @@ int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) *** @returns A pDriverData struct representing a driver instance, or *** NULL if an error occures. ***/ -void* clusterOpen(pObject obj, int mask, pContentType systype, char* usr_type, pObjTrxTree* oxt) +void* clusterOpen(pObject obj, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt) { - tprintf( - "Warning: clusterOpen(\"%s\") is under active development.\n", - obj_internal_PathPart(obj->Pathname, obj->SubPtr + obj->SubCnt, 1) - ); + tprintf("Warning: clusterOpen(\"%s\") is under active development.\n", ci_file_name(obj)); /** If CREAT and EXCL are specified, create it and fail if it already exists. 
**/ pSnNode node_struct = NULL; @@ -2239,10 +2351,7 @@ void* clusterOpen(pObject obj, int mask, pContentType systype, char* usr_type, p pNodeData node_data = ci_ParseNodeData(node_struct->Data, obj); if (node_data == NULL) { - mssErrorf(0, "Cluster", - "Failed to parse structure file of name %s.", - obj_internal_PathPart(obj->Pathname, obj->SubPtr + obj->SubCnt, 1) - ); + mssErrorf(0, "Cluster", "Failed to parse structure file \"%s\".", ci_file_name(obj)); goto err; } node_data->Node = node_struct; @@ -2343,6 +2452,7 @@ void* clusterOpen(pObject obj, int mask, pContentType systype, char* usr_type, p } +// LINK #functions /*** Close a cluster driver instance object, releasing any necessary memory *** and closing any necessary underlying resources. However, most of that *** data will be cached and won't be freed unless the cache is dropped. @@ -2374,6 +2484,7 @@ int clusterClose(void* inf_v, pObjTrxTree* oxt) } +// LINK #functions /*** Opens a new query pointing to the first row of the data targetted by *** the driver instance struct. The query has an internal index counter *** that starts at the first row and increments as data is fetched. @@ -2394,6 +2505,7 @@ void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt) } +// LINK #functions /*** Get the next entry as an open driver instance object. *** *** @param qy_v A query instance, storing an internal index which is @@ -2477,6 +2589,7 @@ void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) } +// LINK #functions /*** Close a cluster query instance, releasing any necessary memory and *** closing any necessary underlying resources. This does not close the *** underlying driver instance, which must be closed with clusterClose(). @@ -2494,6 +2607,7 @@ int clusterQueryClose(void* qy_v, pObjTrxTree* oxt) } +// LINK #functions /*** Get the type of a cluster driver instance attribute. *** *** @param inf_v The driver instance. 
@@ -2507,13 +2621,6 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) { pDriverData driver_data = (pDriverData)inf_v; - /** Performance shortcut for frequently requested attributes: val1, val2, and sim. **/ - if (attr_name[0] == 'v' || attr_name[0] == 's') goto handle_targets; - - /** Debug info. **/ - if (oxt == NULL) tprintf(" > "); - tprintf("Call to clusterGetAttrType(%s)\n", attr_name); - /** Guard possible segfault. **/ if (attr_name == NULL) { @@ -2521,6 +2628,13 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) return DATA_T_UNAVAILABLE; } + /** Performance shortcut for frequently requested attributes: val, val1, val2, and sim. **/ + if (attr_name[0] == 'v' || attr_name[0] == 's') goto handle_targets; + + /** Debug info. **/ + if (oxt == NULL) tprintf(" > "); + tprintf("Call to clusterGetAttrType(%s)\n", attr_name); + /** Types for general attributes. **/ if (strcmp(attr_name, "name") == 0 || strcmp(attr_name, "annotation") == 0 @@ -2530,6 +2644,12 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) return DATA_T_STRING; if (strcmp(attr_name, "last_modification") == 0) return DATA_T_DATETIME; + if ((strcmp(attr_name, "date_created") == 0 + || strcmp(attr_name, "date_computed") == 0) + && + (driver_data->TargetType == TARGET_CLUSTER + || driver_data->TargetType == TARGET_SEARCH)) + return DATA_T_DATETIME; /** Types for specific data targets. **/ handle_targets: @@ -2566,10 +2686,7 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) if (strcmp(attr_name, "id") == 0) return DATA_T_INTEGER; if (strcmp(attr_name, "val") == 0) - { - /** TODO: Replace with type calculation. 
**/ return DATA_T_STRING; - } if (strcmp(attr_name, "sim") == 0) return DATA_T_DOUBLE; break; @@ -2580,10 +2697,7 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) return DATA_T_INTEGER; if (strcmp(attr_name, "val1") == 0 || strcmp(attr_name, "val2") == 0) - { - /** TODO: Replace with type calculation. **/ return DATA_T_STRING; - } if (strcmp(attr_name, "sim") == 0) return DATA_T_DOUBLE; break; @@ -2597,6 +2711,7 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) } +// LINK #functions /*** Get the value of a cluster driver instance attribute. *** *** @param inf_v Node data containing the list of paramenters. @@ -2619,7 +2734,14 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val { pDriverData driver_data = (pDriverData)inf_v; - /** Performance shortcut for frequently requested attributes: val1, val2, and sim. **/ + /** Guard possible segfault. **/ + if (attr_name == NULL) + { + fprintf(stderr, "Warning: Call to clusterGetAttrType() with NULL attribute name.\n"); + return DATA_T_UNAVAILABLE; + } + + /** Performance shortcut for frequently requested attributes: val, val1, val2, and sim. **/ if ( (attr_name[0] == 'v' && datatype == DATA_T_STRING) /* val, val1, val2 : String */ || (attr_name[0] == 's' && datatype == DATA_T_DOUBLE) /* sim : double */ @@ -2674,6 +2796,10 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val case TARGET_CLUSTER_ENTRY: val->String = "Clustering driver: Cluster Entry."; break; case TARGET_SEARCH: val->String = "Clustering driver: Search."; break; case TARGET_SEARCH_ENTRY: val->String = "Clustering driver: Cluster Entry."; break; + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return -1; } return 0; } @@ -2694,6 +2820,61 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val /** Last modification is not implemented yet. 
**/ if (strcmp(attr_name, "last_modification") == 0) return 1; /* null */ + /** Handle creation and computation dates. **/ + if (strcmp(attr_name, "date_created") == 0) + { + switch (driver_data->TargetType) + { + case TARGET_ROOT: + case TARGET_CLUSTER_ENTRY: + case TARGET_SEARCH_ENTRY: + /** Field is not defined for this target type. **/ + return -1; + + case TARGET_CLUSTER: + val->DateTime = &((pClusterData)driver_data->TargetData)->DateCreated; + return 0; + + case TARGET_SEARCH: + val->DateTime = &((pSearchData)driver_data->TargetData)->DateCreated; + return 0; + } + return -1; + } + if (strcmp(attr_name, "date_computed") == 0) + { + switch (driver_data->TargetType) + { + case TARGET_ROOT: + case TARGET_CLUSTER_ENTRY: + case TARGET_SEARCH_ENTRY: + /** Field is not defined for this target type. **/ + return -1; + + case TARGET_CLUSTER: + { + pClusterData target = (pClusterData)driver_data->TargetData; + pDateTime date_time = &target->DateComputed; + if (date_time->Value == 0) return 1; /* null */ + else val->DateTime = date_time; + return 0; + } + + case TARGET_SEARCH: + { + pSearchData target = (pSearchData)driver_data->TargetData; + pDateTime date_time = &target->DateComputed; + if (date_time->Value == 0) return 1; /* null */ + else val->DateTime = date_time; + return 0; + } + } + + /** Default: Unknown type. **/ + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return -1; + } + /** Handle attributes for specific data targets. **/ handle_targets: switch (driver_data->TargetType) @@ -2848,6 +3029,318 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val } +// LINK #functions +/** Not implemented. **/ +pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt) + { + tprintf("Warning: clusterPresentationHints(\"%s\") is under active development.", attr_name); + pDriverData driver_data = (pDriverData)inf_v; + + /** Malloc presentation hints struct. 
**/ + pObjPresentationHints hints = check_ptr(nmMalloc(sizeof(ObjPresentationHints))); + memset(hints, 0, sizeof(ObjPresentationHints)); + + /** Hints that are the same for all fields */ + hints->GroupID = -1; + hints->VisualLength2 = 1; + hints->Style |= OBJ_PH_STYLE_READONLY | OBJ_PH_STYLE_CREATEONLY | OBJ_PH_STYLE_NOTNULL; + hints->StyleMask |= OBJ_PH_STYLE_READONLY | OBJ_PH_STYLE_CREATEONLY | OBJ_PH_STYLE_NOTNULL; + + /** Temporary param list for compiling expressions. **/ + pParamObjects tmp_list = check_ptr(expCreateParamList()); + + if (strcmp(attr_name, "name") == 0) + { + hints->Length = 32; + hints->VisualLength = 16; + goto end; + } + if (strcmp(attr_name, "annotation") == 0) + { + hints->Length = 36; + hints->VisualLength = 36; + goto end; + } + if (strcmp(attr_name, "inner_type") == 0 + || strcmp(attr_name, "inner_type") == 0 + || strcmp(attr_name, "outer_type") == 0 + || strcmp(attr_name, "content_type") == 0 + || strcmp(attr_name, "last_modification") == 0) + { + hints->VisualLength = 30; + goto end; + } + + if (strcmp(attr_name, "date_created") == 0 + || strcmp(attr_name, "date_computed") == 0) + { + hints->Length = 24; + hints->VisualLength = 20; + hints->Format = nmSysStrdup("datetime"); + goto end; + } + + switch (driver_data->TargetType) + { + case TARGET_ROOT: + if (strcmp(attr_name, "source") == 0) + { + hints->Length = _PC_PATH_MAX; + hints->VisualLength = 64; + hints->FriendlyName = "Source Path"; + goto end; + } + if (strcmp(attr_name, "attr_name") == 0) + { + hints->Length = 255; + hints->VisualLength = 32; + hints->FriendlyName = "Attribute Name"; + goto end; + } + break; + + case TARGET_CLUSTER: + if (strcmp(attr_name, "num_clusters") == 0) + { + /** Min and max values. **/ + hints->MinValue = expCompileExpression("2", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("2147483647", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. 
**/ + hints->Length = 8; + hints->VisualLength = 4; + hints->FriendlyName = nmSysStrdup("Number of Clusters"); + goto end; + } + if (strcmp(attr_name, "min_improvement") == 0) + { + /** Min and max values. **/ + hints->DefaultExpr = expCompileExpression("0.0001", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. **/ + hints->Length = 16; + hints->VisualLength = 8; + hints->FriendlyName = nmSysStrdup("Minimum Improvement Threshold"); + goto end; + } + if (strcmp(attr_name, "max_iterations") == 0) + { + /** Min and max values. **/ + hints->DefaultExpr = expCompileExpression("64", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("2147483647", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. **/ + hints->Length = 8; + hints->VisualLength = 4; + hints->FriendlyName = nmSysStrdup("Maximum Number of Clustering Iterations"); + goto end; + } + if (strcmp(attr_name, "average_similarity") == 0) + { + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. **/ + hints->Length = 16; + hints->VisualLength = 8; + hints->FriendlyName = nmSysStrdup("Average Similarity"); + goto end; + } + if (strcmp(attr_name, "size") == 0) + { + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. 
**/ + hints->Length = 16; + hints->VisualLength = 8; + hints->FriendlyName = nmSysStrdup("Average Cluster Size"); + goto end; + } + if (strcmp(attr_name, "algorithm") == 0) + { + /** Enum values. **/ + check(xaInit(&(hints->EnumList), nClusteringAlgorithms)); + for (unsigned int i = 0u; i < nClusteringAlgorithms; i++) + check_neg(xaAddItem(&(hints->EnumList), &ALL_CLUSTERING_ALGORITHMS[i])); + + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + char buf[4u]; + snprintf(buf, sizeof(buf), "%d", nClusteringAlgorithms); + hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Display flags. **/ + hints->Style |= OBJ_PH_STYLE_BUTTONS; + hints->StyleMask |= OBJ_PH_STYLE_BUTTONS; + + /** Other hints. **/ + hints->Length = 24; + hints->VisualLength = 20; + hints->FriendlyName = nmSysStrdup("Clustering Algorithm"); + goto end; + } + /** Fall-through: Start of overlapping region. **/ + + case TARGET_SEARCH: + if (strcmp(attr_name, "similarity_measure") == 0) + { + /** Enum values. **/ + check(xaInit(&(hints->EnumList), nSimilarityMeasures)); + for (unsigned int i = 0u; i < nSimilarityMeasures; i++) + check_neg(xaAddItem(&(hints->EnumList), &ALL_SIMILARITY_MEASURES[i])); + + /** Display flags. **/ + hints->Style |= OBJ_PH_STYLE_BUTTONS; + hints->StyleMask |= OBJ_PH_STYLE_BUTTONS; + + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + char buf[4u]; + snprintf(buf, sizeof(buf), "%d", nSimilarityMeasures); + hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. **/ + hints->Length = 32; + hints->VisualLength = 20; + hints->FriendlyName = nmSysStrdup("Similarity Measure"); + goto end; + } + + /** End of overlapping region. 
**/ + if (driver_data->TargetType == TARGET_CLUSTER) break; + + if (strcmp(attr_name, "source") == 0) + { + hints->Length = 64; + hints->VisualLength = 32; + hints->FriendlyName = nmSysStrdup("Source Cluster Name"); + goto end; + } + if (strcmp(attr_name, "threshold") == 0) + { + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. **/ + hints->Length = 16; + hints->VisualLength = 8; + hints->FriendlyName = nmSysStrdup("Similarity Threshold"); + goto end; + } + break; + + case TARGET_CLUSTER_ENTRY: + { + pClusterData target = (pClusterData)driver_data->TargetData; + + if (strcmp(attr_name, "id") == 0) + { + pSourceData source_data = (pSourceData)target->SourceData; + + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + if (source_data->Vectors != NULL) + { + char buf[16u]; + snprintf(buf, sizeof(buf), "%u", source_data->nVectors); + hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + return 0; + } + + /** Other hints. **/ + hints->Length = 8; + hints->VisualLength = 4; + goto end; + } + if (strcmp(attr_name, "val") == 0) + { + /** Other hints. **/ + hints->Length = 255; + hints->VisualLength = 32; + hints->FriendlyName = nmSysStrdup("Value"); + goto end; + } + if (strcmp(attr_name, "sim") == 0) + { + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. 
**/ + hints->Length = 16; + hints->VisualLength = 8; + hints->FriendlyName = nmSysStrdup("Similarity"); + goto end; + } + break; + } + + case TARGET_SEARCH_ENTRY: + { + pSearchData target = (pSearchData)driver_data->TargetData; + + if (strcmp(attr_name, "id1") == 0 || strcmp(attr_name, "id2") == 0) + { + pSourceData source_data = (pSourceData)target->Source->SourceData; + + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + if (source_data->Vectors != NULL) + { + char buf[16u]; + snprintf(buf, sizeof(buf), "%u", source_data->nVectors); + hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + return 0; + } + + /** Other hints. **/ + hints->Length = 8; + hints->VisualLength = 4; + goto end; + } + if (strcmp(attr_name, "val1") == 0 || strcmp(attr_name, "val2") == 0) + { + /** Other hints. **/ + hints->Length = 255; + hints->VisualLength = 32; + hints->FriendlyName = nmSysStrdup("Value"); + goto end; + } + if (strcmp(attr_name, "sim") == 0) + { + /** Min and max values. **/ + hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); + + /** Other hints. **/ + hints->Length = 16; + hints->VisualLength = 8; + hints->FriendlyName = nmSysStrdup("Similarity"); + goto end; + } + break; + } + + default: + mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); + return NULL; + } + + + end: + check(expFreeParamList(tmp_list)); + return hints; + } + + +// LINK #functions /*** Returns the name of the first attribute that one can get from *** this driver instance (using GetAttrType() and GetAttrValue()). 
*** Resets the internal variable (TargetAttrIndex) used to maintain @@ -2866,6 +3359,7 @@ char* clusterGetFirstAttr(void* inf_v, pObjTrxTree oxt) } +// LINK #functions /*** Returns the name of the next attribute that one can get from *** this driver instance (using GetAttrType() and GetAttrValue()). *** Uses an internal variable (TargetAttrIndex) used to maintain @@ -2895,6 +3389,7 @@ char* clusterGetNextAttr(void* inf_v, pObjTrxTree oxt) } +// LINK #functions /*** Get the capabilities of the driver instance object. *** *** @param inf_v The driver instance to be checked. @@ -2995,6 +3490,7 @@ char* clusterGetFirstMethod(void* inf_v, pObjTrxTree oxt) } +// LINK #functions /*** Returns the name of the next method that one can get from *** this driver instance (using GetAttrType() and GetAttrValue()). *** Uses an internal variable (TargetMethodIndex) used to maintain @@ -3013,6 +3509,7 @@ char* clusterGetNextMethod(void* inf_v, pObjTrxTree oxt) return (i < nMETHOD_NAME) ? METHOD_NAME[i] : END_OF_METHODS; } +// LINK #functions /** Intended for use in xhForEach(). **/ static int ci_PrintEntry(pXHashEntry entry, void* arg) { @@ -3074,6 +3571,7 @@ static int ci_PrintEntry(pXHashEntry entry, void* arg) } +// LINK #functions /*** Executes a method with the given name. *** *** @param inf_v The affected driver instance. @@ -3101,7 +3599,7 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx if (strcmp(param->String, "show") == 0) { const pObject obj = ((pDriverData)inf_v)->NodeData->Obj; - char* path = obj_internal_PathPart(obj->Pathname, 0, obj->SubPtr); + char* path = ci_file_path(obj); /** Print cache info table. **/ unsigned int i = 1u, source_bytes = 0u, cluster_bytes = 0u, search_bytes = 0u; @@ -3232,14 +3730,9 @@ int clusterCommit(void* inf_v, pObjTrxTree *oxt) mssErrorf(1, "Cluster", "clusterCommit() not implemented because clusters are imutable."); return 0; } -/** Not implemented. 
**/ -pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt) - { - mssErrorf(1, "Cluster", "clusterPresentationHints() not implemented."); - return NULL; - } +// LINK #functions /*** Initialize the driver. This includes: *** - Registering the driver with the objectsystem. *** - Registering structs with newmalloc for debugging. From ea6430fa8e0965aaac782b49cc73e5ceff457ddc Mon Sep 17 00:00:00 2001 From: Israel Date: Thu, 16 Oct 2025 08:55:52 -0600 Subject: [PATCH 03/30] Checkpoing: Switching to DM project. --- centrallix-lib/include/util.h | 47 +++ centrallix-os/testdir/file.cluster | 64 ++++ centrallix/expression/exp_functions.c | 1 - centrallix/multiquery/multiquery.c | 8 +- centrallix/osdrivers/objdrv_cluster.c | 451 +++++++++++--------------- centrallix/test_obj.c | 1 + 6 files changed, 314 insertions(+), 258 deletions(-) create mode 100644 centrallix-os/testdir/file.cluster diff --git a/centrallix-lib/include/util.h b/centrallix-lib/include/util.h index 2b9d7b26f..12019abfb 100644 --- a/centrallix-lib/include/util.h +++ b/centrallix-lib/include/util.h @@ -134,6 +134,53 @@ void fail(const char* function_name, int code); _r; \ }) +/** Pattern for printing a binary int using printf(). **/ +#define INT_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c" + +/*** Converts an int to the values that should be passed to printf() for the + *** INT_TO_BINARY_PATTERN pattern. + *** + *** @attention - Double evaluation is NOT HANDLED so int_val will be evaluted + *** 32 times when this macro is used. Ensure that evaluation of the value + *** passed for int_val does not have important side effects! + *** + *** @param int_val The int to be printed. + *** @returns Values for printf(). + ***/ +#define INT_TO_BINARY(int_val) \ + ((int_val) & 0b10000000000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b01000000000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00100000000000000000000000000000 ? 
'1' : '0'), \ + ((int_val) & 0b00010000000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00001000000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000100000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000010000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000001000000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000100000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000010000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000001000000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000100000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000010000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000001000000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000100000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000010000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000001000000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000100000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000010000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000001000000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000100000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000010000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000001000000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000100000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000010000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000001000000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000000100000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000000010000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000000001000 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000000000100 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000000000010 ? '1' : '0'), \ + ((int_val) & 0b00000000000000000000000000000001 ? 
'1' : '0') + #endif /* __cplusplus */ #endif /* UTILITY_H */ diff --git a/centrallix-os/testdir/file.cluster b/centrallix-os/testdir/file.cluster new file mode 100644 index 000000000..929efdd03 --- /dev/null +++ b/centrallix-os/testdir/file.cluster @@ -0,0 +1,64 @@ +$Version=2$ +file_name "system/cluster" + { + // Developer can specify parameters to improve file reuseability. + // TIP: Improve performance by declairing frequently used parameters first. + k "cluster/parameter" { type = integer; style=notnull; } + str "cluster/parameter" { type = string; } + int "cluster/parameter" { type = integer; default = runserver(:parameters:k); } + dbl "cluster/parameter" { type = double; default=4.2; } + // conversion "cluster/parameter" { type=double; default=4; } + + null_str "cluster/parameter" { type = string; default = null; } + null_int "cluster/parameter" { type = integer; default = null; } + null_dbl "cluster/parameter" { type = double; default = null; } + + // We calculate k in a centrallix script using: + // k = max(2, pow(log(n) / log(36), 3.2) - 8) + // where n is the number of records passed. + + // Specify the data source at the top of the file. + // How do we pass distinct data? Should the driver + // handle that for us? + source = "/apps/kardia/data/Kardia_DB/p_partner/rows"; + attr_name = p_given_name; // runserver(:parameters:str) + + // Clustering object specifies properties for clustering. + kmeans_cluster "cluster/cluster" + { + algorithm = "k-means"; + similarity_measure = "cosine"; + num_clusters = runserver(:parameters:k); + min_improvement = 0.0001; + max_iterations = 48; + + // Create subclusters. (Not implemented) + sub_cluster "cluster/cluster" + { + algorithm = "none"; + similarity_measure = "cosine"; + num_clusters = 7; + min_improvement = "max"; + } + } + + // Complete search. 
+ no_clustering "cluster/cluster" + { + algorithm = "none"; + } + + dups "cluster/search" + { + source = kmeans_cluster; + threshold = 0.75; + similarity_measure = "cosine"; + } + + dups2 "cluster/search" + { + source = no_clustering; + threshold = 0.75; + similarity_measure = "cosine"; + } + } diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index df55559be..a8e16ecc7 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -1355,7 +1355,6 @@ int exp_fn_ralign(pExpression tree, pParamObjects objlist, pExpression i0, pExpr tree->Alloc = 0; tree->String = tree->Types.StringBuf; } - /** Possible overflow? **/ sprintf(tree->String,"%*.*s",i1->Integer,i1->Integer,i0->String); } return 0; diff --git a/centrallix/multiquery/multiquery.c b/centrallix/multiquery/multiquery.c index 897362751..069186e80 100644 --- a/centrallix/multiquery/multiquery.c +++ b/centrallix/multiquery/multiquery.c @@ -2086,6 +2086,7 @@ mq_internal_SyntaxParse(pLxSession lxs, pQueryStatement stmt, int allow_empty, p mssError(1,"MQ","Expected equals after EXEC parameter"); mlxNoteError(lxs); xsFree(xs); + xs = NULL; break; } @@ -2098,6 +2099,7 @@ mq_internal_SyntaxParse(pLxSession lxs, pQueryStatement stmt, int allow_empty, p mssError(1,"MQ","Error in EXEC parameter"); mlxNoteError(lxs); xsFree(xs); + xs = NULL; xsFree(param); break; } @@ -2108,6 +2110,7 @@ mq_internal_SyntaxParse(pLxSession lxs, pQueryStatement stmt, int allow_empty, p mssError(1,"MQ","Could not evaluate EXEC parameter"); mlxNoteError(lxs); xsFree(xs); + xs = NULL; xsFree(param); break; } @@ -2120,7 +2123,8 @@ mq_internal_SyntaxParse(pLxSession lxs, pQueryStatement stmt, int allow_empty, p } } - strtcpy(new_qs->Source, xs->String, sizeof(new_qs->Source)); + if (xs != NULL) + strtcpy(new_qs->Source, xs->String, sizeof(new_qs->Source)); next_state = LookForClause; } else @@ -4774,5 +4778,3 @@ mqInitialize() return 0; } - - diff --git 
a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index 2369bc1fb..f56cca5de 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -72,11 +72,17 @@ *** https://marketplace.visualstudio.com/items?itemName=ExodiusStudios.comment-anchors ***/ +/** Pure Laziness **/ +#define ENABLE_TPRINTF /** Debugging **/ +#ifndef ENABLE_TPRINTF void void_func() {} #define tprintf void_func -// #define tprintf printf +#endif +#ifdef ENABLE_TPRINTF +#define tprintf printf +#endif /** Defaults for unspecified optional attributes. **/ #define DEFAULT_MIN_IMPROVEMENT 0.0001 @@ -85,42 +91,6 @@ void void_func() {} /** ================ Stuff That Should Be Somewhere Else ================ **/ /** ANCHOR[id=temp] **/ -#define INT_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c" -#define INT_TO_BINARY(int_val) \ - ((int_val) & 0b10000000000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b01000000000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00100000000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00010000000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00001000000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000100000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000010000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000001000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000100000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000010000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000001000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000100000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000010000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000001000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000100000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000010000000000000000 ? 
'1' : '0'), \ - ((int_val) & 0b00000000000000001000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000100000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000010000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000001000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000100000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000010000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000001000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000100000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000010000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000001000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000000100000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000000010000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000000001000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000000000100 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000000000010 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000000000001 ? '1' : '0') - - /** TODO: I think this should be moved to mtsession. **/ /*** I caused at least 10 bugs so far trying to pass format specifiers to *** mssError without realizing that it didn't support them. Eventually, I @@ -593,45 +563,38 @@ struct /** Parsing Functions. 
**/ // LINK #parsing -int ci_ParseAttribute(pStructInf inf, char* attr_name, int datatype, pObjData data, pParamObjects param_list, bool required, bool print_type_error); -ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf cluster_inf, pParamObjects param_list); -SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf cluster_inf, pParamObjects param_list); -pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* path); -pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data); -pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data); -pNodeData ci_ParseNodeData(pStructInf inf, pObject obj); +static int ci_ParseAttribute(pStructInf inf, char* attr_name, int datatype, pObjData data, pParamObjects param_list, bool required, bool print_type_error); +static ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf cluster_inf, pParamObjects param_list); +static SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf cluster_inf, pParamObjects param_list); +static pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* path); +static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data); +static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data); +static pNodeData ci_ParseNodeData(pStructInf inf, pObject obj); /** Freeing Functions. **/ // LINK #freeing -void ci_FreeSourceData(pSourceData source_data); -void ci_FreeClusterData(pClusterData cluster_data, bool recursive); -void ci_FreeSearchData(pSearchData search_data); -void ci_FreeNodeData(pNodeData node_data); +static void ci_FreeSourceData(pSourceData source_data); +static void ci_FreeClusterData(pClusterData cluster_data, bool recursive); +static void ci_FreeSearchData(pSearchData search_data); +static void ci_FreeNodeData(pNodeData node_data); /** Deep Size Computation Functions. 
**/ // LINK #sizing -unsigned int ci_SizeOfSourceData(pSourceData source_data); -unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursive); -unsigned int ci_SizeOfSearchData(pSearchData search_data); -unsigned int ci_SizeOfNodeData(pNodeData node_data); - -/** Cache Invalidation Functions. **/ -// LINK #invalidation -void ci_CacheFreeSourceData(pXHashEntry entry, void* _); -void ci_CacheFreeCluster(pXHashEntry entry, void* _); -void ci_CacheFreeSearch(pXHashEntry entry, void* _); +static unsigned int ci_SizeOfSourceData(pSourceData source_data); +static unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursive); +static unsigned int ci_SizeOfSearchData(pSearchData search_data); /** Computation Functions. (Ensure data is computed.) **/ // LINK #computation -int ci_ComputeSourceData(pSourceData source_data, pObjSession session); -int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data); -int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data); +static int ci_ComputeSourceData(pSourceData source_data, pObjSession session); +static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data); +static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data); /** Parameter Functions. **/ // LINK #params -int ci_GetParamType(void* inf_v, const char* attr_name); -int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val); -int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val); +static int ci_GetParamType(void* inf_v, const char* attr_name); +static int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val); +static int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val); /** Driver Functions. 
**/ // LINK #driver @@ -642,6 +605,7 @@ void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt); int clusterQueryClose(void* qy_v, pObjTrxTree* oxt); int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt); int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); +pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt); char* clusterGetFirstAttr(void* inf_v, pObjTrxTree oxt); char* clusterGetNextAttr(void* inf_v, pObjTrxTree oxt); int clusterInfo(void* inf_v, pObjectInfo info); @@ -650,6 +614,10 @@ int clusterInfo(void* inf_v, pObjectInfo info); // LINK #method char* clusterGetFirstMethod(void* inf_v, pObjTrxTree oxt); char* clusterGetNextMethod(void* inf_v, pObjTrxTree oxt); +static int ci_PrintEntry(pXHashEntry entry, void* arg); +static void ci_CacheFreeSourceData(pXHashEntry entry, void* path); +static void ci_CacheFreeCluster(pXHashEntry entry, void* path); +static void ci_CacheFreeSearch(pXHashEntry entry, void* path); int clusterExecuteMethod(void* inf_v, char* methodname, pObjData param, pObjTrxTree oxt); /** Unimplemented DriverFunctions. **/ @@ -663,7 +631,6 @@ int clusterSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val int clusterAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree oxt); void* clusterOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree oxt); int clusterCommit(void* inf_v, pObjTrxTree *oxt); -pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt); /** ================ Parsing Functions ================ **/ /** ANCHOR[id=parsing] **/ @@ -682,7 +649,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb *** still don't know if it works correctly... or really how it works. Please *** review this code carefully! 
***/ -int ci_ParseAttribute( +static int ci_ParseAttribute( pStructInf inf, char* attr_name, int datatype, @@ -787,7 +754,7 @@ int ci_ParseAttribute( *** evaluating parameter variables in the structure file. *** @returns The data algorithm, or ALGORITHM_NULL on failure. ***/ -ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf inf, pParamObjects param_list) +static ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf inf, pParamObjects param_list) { /** Get the algorithm attribute. **/ char* algorithm; @@ -824,7 +791,7 @@ ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf inf, pParamObjects param *** evaluating parameter variables in the structure file. *** @returns The similarity measure, or SIMILARITY_NULL on failure. ***/ -SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf inf, pParamObjects param_list) +static SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf inf, pParamObjects param_list) { /** Get the similarity_measure attribute. **/ char* measure; @@ -858,17 +825,17 @@ SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf inf, pParamObjects param_ *** cache entry keys. *** @returns A new pSourceData struct on success, or NULL on failure. ***/ -pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* path) +static pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* path) { char* buf; /** Get source. **/ if (ci_ParseAttribute(inf, "source", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err; - char* source_path = check_ptr(strdup(buf)); + char* source_path = check_ptr(nmSysStrdup(buf)); /** Get attribute name. **/ if (ci_ParseAttribute(inf, "attr_name", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err; - char* attr_name = check_ptr(strdup(buf)); + char* attr_name = check_ptr(nmSysStrdup(buf)); /** Create cache entry key. 
**/ const size_t len = strlen(path) + strlen(source_path) + strlen(attr_name) + 3lu; @@ -885,8 +852,8 @@ pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* p tprintf("--> Name: %s\n", source_maybe->Name); /* Cause invalid read if cache was incorrectly freed. */ /** Free data we don't need. */ - free(source_path); - free(attr_name); + nmSysFree(source_path); + nmSysFree(attr_name); nmSysFree(key); /** Return the cached source data. **/ @@ -896,7 +863,7 @@ pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* p /** Cache miss: Create a new source data object. **/ pSourceData source_data = check_ptr(nmMalloc(sizeof(SourceData))); memset(source_data, 0, sizeof(SourceData)); - source_data->Name = check_ptr(strdup(inf->Name)); + source_data->Name = check_ptr(nmSysStrdup(inf->Name)); source_data->Key = key; source_data->SourcePath = source_path; source_data->AttrName = attr_name; @@ -928,7 +895,7 @@ pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* p *** used to generate cache entry keys. *** @returns A new pClusterData struct on success, or NULL on failure. ***/ -pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) +static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) { int result; @@ -942,13 +909,13 @@ pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) memset(cluster_data, 0, sizeof(ClusterData)); /** Basic Properties. **/ - cluster_data->Name = check_ptr(strdup(inf->Name)); + cluster_data->Name = check_ptr(nmSysStrdup(inf->Name)); cluster_data->SourceData = source_data; check(objCurrentDate(&cluster_data->DateCreated)); /** Get algorithm. **/ cluster_data->ClusterAlgorithm = ci_ParseClusteringAlgorithm(inf, param_list); - if (cluster_data->ClusterAlgorithm == ALGORITHM_NULL) goto err; + if (cluster_data->ClusterAlgorithm == ALGORITHM_NULL) goto err_free_cluster; /** Handle no clustering case. 
**/ if (cluster_data->ClusterAlgorithm == ALGORITHM_NONE) @@ -1008,7 +975,7 @@ pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) if (result == -1) goto err_free_cluster; if (result == 0) { - if (max_iterations < 0) + if (max_iterations < 1) { mssErrorf(1, "Cluster", "Invalid value for [max_iterations : uint]: %d", max_iterations); goto err_free_cluster; @@ -1033,8 +1000,7 @@ pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) if (stStructType(group_inf) != ST_T_SUBGROUP) continue; /** Select array by group type. **/ - assert(group_inf->UsrType != NULL); - if (strcmp(group_inf->UsrType, "cluster/cluster")) continue; + if (strcmp(check_ptr(group_inf->UsrType), "cluster/cluster") != 0) continue; /** Subcluster found. **/ pClusterData sub_cluster = ci_ParseClusterData(group_inf, node_data); @@ -1125,9 +1091,9 @@ pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) xaDeInit(&sub_clusters); err_free_cluster: - nmFree(cluster_data, sizeof(ClusterData)); + ci_FreeClusterData(cluster_data, false); - err: + // err: mssErrorf(0, "Cluster", "Failed to parse cluster from group \"%s\".", inf->Name); return NULL; } @@ -1147,7 +1113,7 @@ pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) *** the cluster pointed to by the source attribute. *** @returns A new pSearchData struct on success, or NULL on failure. ***/ -pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) +static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) { tprintf("Parsing search: %s\n", inf->Name); @@ -1157,7 +1123,7 @@ pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) memset(search_data, 0, sizeof(SearchData)); /** Get basic information. **/ - search_data->Name = check_ptr(strdup(inf->Name)); + search_data->Name = check_ptr(nmSysStrdup(inf->Name)); check(objCurrentDate(&search_data->DateCreated)); /** Get source. 
**/ @@ -1249,7 +1215,7 @@ pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) *** @param obj The parent object struct. *** @returns A new pNodeData struct on success, or NULL on failure. ***/ -pNodeData ci_ParseNodeData(pStructInf inf, pObject obj) +static pNodeData ci_ParseNodeData(pStructInf inf, pObject obj) { int ret; @@ -1452,30 +1418,18 @@ pNodeData ci_ParseNodeData(pStructInf inf, pObject obj) // LINK #functions /** @param source_data A pSourceData struct, freed by this function. **/ -void ci_FreeSourceData(pSourceData source_data) +static void ci_FreeSourceData(pSourceData source_data) { /** Free top level attributes, if they exist. **/ - if (source_data->Name != NULL) - { - free(source_data->Name); - source_data->Name = NULL; - } - if (source_data->SourcePath != NULL) - { - free(source_data->SourcePath); - source_data->SourcePath = NULL; - } - if (source_data->AttrName != NULL) - { - free(source_data->AttrName); - source_data->AttrName = NULL; - } + if (source_data->Name != NULL) nmSysFree(source_data->Name); + if (source_data->SourcePath != NULL) nmSysFree(source_data->SourcePath); + if (source_data->AttrName != NULL) nmSysFree(source_data->AttrName); /** Free fetched data, if it exists. **/ if (source_data->Data != NULL) { for (unsigned int i = 0u; i < source_data->nVectors; i++) - free(source_data->Data[i]); + nmSysFree(source_data->Data[i]); nmFree(source_data->Data, source_data->nVectors * sizeof(char*)); source_data->Data = NULL; } @@ -1500,16 +1454,17 @@ void ci_FreeSourceData(pSourceData source_data) *** @param cluster_data The cluster data struct to free. *** @param recrusive Whether to recursively free subclusters. ***/ -void ci_FreeClusterData(pClusterData cluster_data, bool recursive) +static void ci_FreeClusterData(pClusterData cluster_data, bool recursive) { /** Free top level cluster data. 
**/ - if (cluster_data->Name != NULL) free(cluster_data->Name); + if (cluster_data->Name != NULL) nmSysFree(cluster_data->Name); /** Free computed data, if it exists. **/ if (cluster_data->Labels != NULL) { const unsigned int nVectors = cluster_data->SourceData->nVectors; nmFree(cluster_data->Labels, nVectors * sizeof(unsigned int)); + cluster_data->Labels = NULL; } /** Free subclusters recursively. **/ @@ -1521,6 +1476,7 @@ void ci_FreeClusterData(pClusterData cluster_data, bool recursive) ci_FreeClusterData(cluster_data->SubClusters[i], recursive); } nmFree(cluster_data->SubClusters, cluster_data->nSubClusters * sizeof(void*)); + cluster_data->SubClusters = NULL; } /** Free the cluster struct. **/ @@ -1530,14 +1486,15 @@ void ci_FreeClusterData(pClusterData cluster_data, bool recursive) // LINK #functions /** @param search_data A pSearchData struct, freed by this function. **/ -void ci_FreeSearchData(pSearchData search_data) +static void ci_FreeSearchData(pSearchData search_data) { - if (search_data->Name != NULL) free(search_data->Name); + if (search_data->Name != NULL) nmSysFree(search_data->Name); if (search_data->Dups != NULL) { for (unsigned int i = 0; i < search_data->nDups; i++) nmFree(search_data->Dups[i], sizeof(Dup)); nmFree(search_data->Dups, search_data->nDups * sizeof(void*)); + search_data->Dups = NULL; } nmFree(search_data, sizeof(SearchData)); } @@ -1545,7 +1502,7 @@ void ci_FreeSearchData(pSearchData search_data) // LINK #functions /** @param node_data A pNodeData struct, freed by this function. **/ -void ci_FreeNodeData(pNodeData node_data) +static void ci_FreeNodeData(pNodeData node_data) { /** Free parsed params, if they exist. **/ if (node_data->Params != NULL) @@ -1611,7 +1568,7 @@ void ci_FreeNodeData(pNodeData node_data) *** @param source_data The source data struct to be queried. *** @returns The size in bytes of the struct and all internal allocated data. 
***/ -unsigned int ci_SizeOfSourceData(pSourceData source_data) +static unsigned int ci_SizeOfSourceData(pSourceData source_data) { unsigned int size = 0u; if (source_data->Name != NULL) size += strlen(source_data->Name) * sizeof(char); @@ -1646,7 +1603,7 @@ unsigned int ci_SizeOfSourceData(pSourceData source_data) *** @param recrusive Whether to recursively free subclusters. *** @returns The size in bytes of the struct and all internal allocated data. ***/ -unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursive) +static unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursive) { unsigned int size = 0u; if (cluster_data->Name != NULL) size += strlen(cluster_data->Name) * sizeof(char); @@ -1676,7 +1633,7 @@ unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursive) *** @param search_data The search data struct to be queried. *** @returns The size in bytes of the struct and all internal allocated data. ***/ -unsigned int ci_SizeOfSearchData(pSearchData search_data) +static unsigned int ci_SizeOfSearchData(pSearchData search_data) { unsigned int size = 0u; if (search_data->Name != NULL) size += strlen(search_data->Name) * sizeof(char); @@ -1686,100 +1643,6 @@ unsigned int ci_SizeOfSearchData(pSearchData search_data) } -// LINK #functions -/*** Returns the deep size of a NodeData struct, including the size of all - *** allocated substructures. As far as I can tell, this is probably only - *** useful for cache management and debugging. - *** - *** Note that Key is ignored because it is a pointer to data managed by the - *** caching systems, so it is not technically part of the struct. - *** - *** @param node_data The cluster data struct to be queried. - *** @returns The size in bytes of the struct and all internal allocated data. - ***/ -unsigned int ci_SizeOfNodeData(pNodeData node_data) - { - unsigned int size = 0u; - if (node_data->Params != NULL) - { - /** Approximate. 
**/ - size += node_data->nParams * (sizeof(Param) + sizeof(pParam)); - } - if (node_data->ParamList == NULL) - { - /** Approximate. **/ - size += node_data->nParams * 30u * sizeof(char); - size += sizeof(pParamObjects); - } - if (node_data->Clusters != NULL) - { - /** Note: This data is also stored in a cache. **/ - for (unsigned int i = 0u; i < node_data->nClusters; i++) - size += ci_SizeOfClusterData(node_data->Clusters[i], true); - size += node_data->nClusters * sizeof(pClusterData); - } - if (node_data->Searches != NULL) - { - /** Note: This data is also stored in a cache. **/ - for (unsigned int i = 0u; i < node_data->nSearches; i++) - size += ci_SizeOfSearchData(node_data->Searches[i]); - size += node_data->nSearches * sizeof(pSearchData); - } - if (node_data->SourceData != NULL) - { - /** Note: This data is also stored in a cache. **/ - size += ci_SizeOfSourceData(node_data->SourceData); - } - size += sizeof(NodeData); - return size; - } - - -/** ================ Cache Invalidation Functions ================ **/ -/** ANCHOR[id=invalidation] **/ -// LINK #functions - -/** Intended for use in xhClearKeySafe(). **/ -void ci_CacheFreeSourceData(pXHashEntry entry, void* _) - { - /** Extract hash entry. **/ - char* key = entry->Key; - pSourceData source_data = (pSourceData)entry->Data; - - /** Free data. **/ - tprintf("- source: \"%s\"\n", key); - ci_FreeSourceData(source_data); - nmSysFree(key); - } - -// LINK #functions -/** Intended for use in xhClearKeySafe(). **/ -void ci_CacheFreeCluster(pXHashEntry entry, void* _) - { - /** Extract hash entry. **/ - char* key = entry->Key; - pClusterData cluster_data = (pClusterData)entry->Data; - - /** Free data. **/ - tprintf("- cluster: \"%s\"\n", key); - ci_FreeClusterData(cluster_data, false); - nmSysFree(key); - } - -// LINK #functions -/** Intended for use in xhClearKeySafe(). **/ -void ci_CacheFreeSearch(pXHashEntry entry, void* _) - { - /** Extract hash entry. 
**/ - char* key = entry->Key; - pSearchData search_data = (pSearchData)entry->Data; - - /** Free data. **/ - tprintf("- search: \"%s\"\n", key); - ci_FreeSearchData(search_data); - nmSysFree(key); - } - /** ================ Computation Functions ================ **/ /** ANCHOR[id=computation] **/ // LINK #functions @@ -1795,7 +1658,7 @@ void ci_CacheFreeSearch(pXHashEntry entry, void* _) *** @returns 0 if successful, or *** -1 other value on failure. ***/ -int ci_ComputeSourceData(pSourceData source_data, pObjSession session) +static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) { /** If the vectors are already computed, we're done. **/ if (source_data->Vectors != NULL) return 0; @@ -1823,7 +1686,7 @@ int ci_ComputeSourceData(pSourceData source_data, pObjSession session) /** Drop source_data->Data. **/ for (unsigned int i = 0u; i < source_data->nVectors; i++) - free(source_data->Data[i]); + nmSysFree(source_data->Data[i]); nmFree(source_data->Data, source_data->nVectors * sizeof(char*)); source_data->Data = NULL; source_data->nVectors = 0; @@ -1969,7 +1832,7 @@ int ci_ComputeSourceData(pSourceData source_data, pObjSession session) } /** Store value. **/ - char* dup_val = check_ptr(strdup(val)); + char* dup_val = check_ptr(nmSysStrdup(val)); check_strict(xaAddItem(&data_xarray, (void*)dup_val)); check_strict(xaAddItem(&vector_xarray, (void*)vector)); @@ -2000,7 +1863,7 @@ int ci_ComputeSourceData(pSourceData source_data, pObjSession session) if (data_xarray.nAlloc != 0) { for (unsigned int i = 0u; i < data_xarray.nItems; i++) - free(data_xarray.Items[i]); + nmSysFree(data_xarray.Items[i]); check(xaDeInit(&data_xarray)); } if (vector_xarray.nAlloc != 0) @@ -2044,7 +1907,7 @@ int ci_ComputeSourceData(pSourceData source_data, pObjSession session) *** @returns 0 if successful, or *** -1 other value on failure. 
***/ -int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) +static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) { /** If the clusters are alreadyd computed, we're done. **/ if (cluster_data->Labels != NULL) return 0; @@ -2133,7 +1996,7 @@ int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) *** @returns 0 if successful, or *** -1 other value on failure. ***/ -int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) +static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) { int ret; @@ -2209,7 +2072,7 @@ int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) *** *** LINK ../../centrallix-lib/include/datatypes.h:72 ***/ -int ci_GetParamType(void* inf_v, const char* attr_name) +static int ci_GetParamType(void* inf_v, const char* attr_name) { tprintf("Call to ci_GetParamType(\"%s\")\n", attr_name); pNodeData node_data = (pNodeData)inf_v; @@ -2253,7 +2116,7 @@ int ci_GetParamType(void* inf_v, const char* attr_name) *** *** LINK ../../centrallix-lib/include/datatypes.h:72 ***/ -int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) +static int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) { tprintf("Call to ci_GetParamValue(\"%s\", %s)\n", attr_name, ci_TypeToStr(datatype)); pNodeData node_data = (pNodeData)inf_v; @@ -2286,9 +2149,9 @@ int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) return -1; } - +// LINK #functions /** Not implemented. 
**/ -int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) +static int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) { tprintf("Call to ci_SetParamValue(%s, %s)\n", attr_name, ci_TypeToStr(datatype)); mssErrorf(1, "Cluster", "SetParamValue() is not implemented because clusters are imutable."); @@ -2714,9 +2577,9 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) // LINK #functions /*** Get the value of a cluster driver instance attribute. *** - *** @param inf_v Node data containing the list of paramenters. - *** @param attr_name The name of the requested paramenter. - *** @param datatype The expected datatype of the parameter value. + *** @param inf_v The driver instance to be read. + *** @param attr_name The name of the requested attribute. + *** @param datatype The expected datatype of the attribute value. *** See datatypes.h for a list of valid datatypes. *** @param oxt The object system tree, similar to a kind of "scope" (unused). 
*** @param val A pointer to a location where a pointer to the requested @@ -2909,14 +2772,14 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val if (strcmp(attr_name, "num_clusters") == 0) { if (target->NumClusters > INT_MAX) - fprintf(stderr, "Warning: num_clusters value of %u exceeds INT_MAX.\n", target->NumClusters); + fprintf(stderr, "Warning: num_clusters value of %u exceeds INT_MAX (%d).\n", target->NumClusters, INT_MAX); val->Integer = (int)target->NumClusters; return 0; } if (strcmp(attr_name, "max_iterations") == 0) { if (target->MaxIterations > INT_MAX) - fprintf(stderr, "Warning: max_iterations value of %u exceeds INT_MAX.\n", target->MaxIterations); + fprintf(stderr, "Warning: max_iterations value of %u exceeds INT_MAX (%d).\n", target->MaxIterations, INT_MAX); val->Integer = (int)target->MaxIterations; return 0; } @@ -3030,7 +2893,20 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val // LINK #functions -/** Not implemented. **/ +/*** Create a new presentation hints object, describing this attribute on the + *** provided cluster driver instance. + *** + *** Note: expCompileExpression() and nmSysStrdup() are run unchecked because + *** the worst case senario is that the fields are set to null and ignored, + *** which I consider to be better than ending the script because one of + *** them failed. + *** + *** @param inf_v The driver instance to be read. + *** @param attr_name The name of the requested attribute. + *** @param oxt The object system tree, similar to a kind of "scope" (unused). + *** @returns A presentation hints object, if successsful, + *** NULL if an error occures. 
+ ***/ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt) { tprintf("Warning: clusterPresentationHints(\"%s\") is under active development.", attr_name); @@ -3251,7 +3127,6 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb char buf[16u]; snprintf(buf, sizeof(buf), "%u", source_data->nVectors); hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); - return 0; } /** Other hints. **/ @@ -3297,7 +3172,6 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb char buf[16u]; snprintf(buf, sizeof(buf), "%u", source_data->nVectors); hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); - return 0; } /** Other hints. **/ @@ -3330,13 +3204,17 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb default: mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); - return NULL; + goto err; } end: check(expFreeParamList(tmp_list)); return hints; + + err: + mssErrorf(0, "Cluster", "Failed execute generate presentation hints."); + return NULL; } @@ -3460,11 +3338,15 @@ int clusterInfo(void* inf_v, pObjectInfo info) default: mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); - return -1; + goto err; } tprintf("Info result: "INT_TO_BINARY_PATTERN"\n", INT_TO_BINARY(info->Flags)); return 0; + + err: + mssErrorf(0, "Cluster", "Failed execute get info."); + return -1; } @@ -3509,6 +3391,7 @@ char* clusterGetNextMethod(void* inf_v, pObjTrxTree oxt) return (i < nMETHOD_NAME) ? METHOD_NAME[i] : END_OF_METHODS; } + // LINK #functions /** Intended for use in xhForEach(). **/ static int ci_PrintEntry(pXHashEntry entry, void* arg) @@ -3524,7 +3407,6 @@ static int ci_PrintEntry(pXHashEntry entry, void* arg) char* path = (char*)args[2]; /** If a path is provided, check that it matches the start of the key. 
**/ -// if (path != NULL) printf("Comparing \"%s\" to \"%s\"[0,%lu].\n", path, key, strlen((char*)path)); if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return 0; /** Handle type. **/ @@ -3571,6 +3453,60 @@ static int ci_PrintEntry(pXHashEntry entry, void* arg) } +// LINK #functions +/** Intended for use in xhClearKeySafe(). **/ +static void ci_CacheFreeSourceData(pXHashEntry entry, void* path) + { + /** Extract hash entry. **/ + char* key = entry->Key; + pSourceData source_data = (pSourceData)entry->Data; + + /** If a path is provided, check that it matches the start of the key. **/ + if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return; + + /** Free data. **/ + tprintf("- source: \"%s\"\n", key); + ci_FreeSourceData(source_data); + nmSysFree(key); + } + + +// LINK #functions +/** Intended for use in xhClearKeySafe(). **/ +static void ci_CacheFreeCluster(pXHashEntry entry, void* path) + { + /** Extract hash entry. **/ + char* key = entry->Key; + pClusterData cluster_data = (pClusterData)entry->Data; + + /** If a path is provided, check that it matches the start of the key. **/ + if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return; + + /** Free data. **/ + tprintf("- cluster: \"%s\"\n", key); + ci_FreeClusterData(cluster_data, false); + nmSysFree(key); + } + + +// LINK #functions +/** Intended for use in xhClearKeySafe(). **/ +static void ci_CacheFreeSearch(pXHashEntry entry, void* path) + { + /** Extract hash entry. **/ + char* key = entry->Key; + pSearchData search_data = (pSearchData)entry->Data; + + /** If a path is provided, check that it matches the start of the key. **/ + if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return; + + /** Free data. **/ + tprintf("- search: \"%s\"\n", key); + ci_FreeSearchData(search_data); + nmSysFree(key); + } + + // LINK #functions /*** Executes a method with the given name. 
*** @@ -3582,28 +3518,38 @@ static int ci_PrintEntry(pXHashEntry entry, void* arg) int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrxTree oxt) { tprintf("Warning: clusterExecuteMethod(\"%s\") is under active development.\n", method_name); + pDriverData driver_data = (pDriverData)inf_v; /** Cache management method. **/ if (strcmp(method_name, "cache") == 0) { + char* path = NULL; + /** Second parameter is required. **/ if (param->String == NULL) { mssErrorf(1, "Cluster", "param : \"show\" | \"show_all\" | \"drop_all\" is required for the cache method." ); - return -1; + goto err; } - /** Show cache. **/ + /** show and show_all. **/ + bool show = false; if (strcmp(param->String, "show") == 0) { - const pObject obj = ((pDriverData)inf_v)->NodeData->Obj; - char* path = ci_file_path(obj); - + show = true; + path = ci_file_path(driver_data->NodeData->Obj); + } + if (strcmp(param->String, "show_all") == 0) show = true; + + if (show) + { /** Print cache info table. **/ unsigned int i = 1u, source_bytes = 0u, cluster_bytes = 0u, search_bytes = 0u; - printf("\nShowing cache for \"%s\":\n", path); + printf("\nShowing cache for "); + if (path != NULL) printf("\"%s\":\n", path); + else printf("all files:\n"); printf("%-8s %-16s %-12s %s\n", "Type", "Name", "Size", "Cache Entry Key"); xhForEach(&ClusterCaches.SourceCache, ci_PrintEntry, (void*[]){&i, &source_bytes, path}); i++; xhForEach(&ClusterCaches.ClusterCache, ci_PrintEntry, (void*[]){&i, &cluster_bytes, path}); i++; @@ -3627,35 +3573,27 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx return 0; } - - /** Show all cache. **/ - if (strcmp(param->String, "show_all") == 0) + /** drop and drop_all. **/ + bool drop = false; + if (strcmp(param->String, "drop") == 0) { - /** Print cache info table. 
**/ - unsigned int i = 1u, total_bytes = 0u; - tprintf("Showing cluster driver cache for all files...\n"); - printf("%-8s %-16s %-12s %s\n", "Type", "Name", "Size", "Cache Entry Key"); - xhForEach(&ClusterCaches.SourceCache, ci_PrintEntry, (void*[]){&i, &total_bytes, NULL}); i++; - xhForEach(&ClusterCaches.ClusterCache, ci_PrintEntry, (void*[]){&i, &total_bytes, NULL}); i++; - xhForEach(&ClusterCaches.SearchCache, ci_PrintEntry, (void*[]){&i, &total_bytes, NULL}); i++; - - /** Print total size. **/ - char buf[16]; - snprint_bytes(buf, sizeof(buf), total_bytes); - printf("Total cache size: %s\n", buf); - return 0; + show = true; + path = ci_file_path(driver_data->NodeData->Obj); } + if (strcmp(param->String, "drop_all") == 0) drop = true; - /** Drop allcache. **/ - if (strcmp(param->String, "drop_all") == 0) + if (drop) { - tprintf("Dropping cluster driver cache for all files...\n"); + printf("\nDropping cache for "); + if (path != NULL) printf("\"%s\":\n", path); + else printf("all files:\n"); + /*** Free caches in reverse of the order they are created in case *** cached data relies on its source during the freeing process. ***/ - xhClearKeySafe(&ClusterCaches.SearchCache, ci_CacheFreeSearch, NULL); - xhClearKeySafe(&ClusterCaches.ClusterCache, ci_CacheFreeCluster, NULL); - xhClearKeySafe(&ClusterCaches.SourceCache, ci_CacheFreeSourceData, NULL); + xhClearKeySafe(&ClusterCaches.SearchCache, ci_CacheFreeSearch, path); + xhClearKeySafe(&ClusterCaches.ClusterCache, ci_CacheFreeCluster, path); + xhClearKeySafe(&ClusterCaches.SourceCache, ci_CacheFreeSourceData, path); printf("Cache dropped.\n"); return 0; } @@ -3665,9 +3603,14 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx "Expected param : \"show\" | \"show_all\" | \"drop_all\" the cache method, but got: \"%s\"", param->String ); - return -1; + goto err; } + + /** Unknown parameter. 
**/ + mssErrorf(1, "Cluster", "Unknown command: \"%s\"", method_name); + err: + mssErrorf(0, "Cluster", "Failed execute command."); return -1; } diff --git a/centrallix/test_obj.c b/centrallix/test_obj.c index c4c64e25b..5ef492de3 100644 --- a/centrallix/test_obj.c +++ b/centrallix/test_obj.c @@ -1443,6 +1443,7 @@ testobj_do_cmd(pObjSession s, char* cmd, int batch_mode, pLxSession inp_lx) else { printf("Unknown command '%s'\n",cmdname); + mlxCloseSession(ls); return -1; } From cf0dbb5fb1f061c617e65e6fd91924f4d389ec9f Mon Sep 17 00:00:00 2001 From: Israel Date: Mon, 27 Oct 2025 17:14:15 -0600 Subject: [PATCH 04/30] Finish implementing major features for the cluster driver. --- centrallix-lib/include/clusters.h | 69 +- centrallix-lib/include/glyph.h | 78 + centrallix-lib/include/util.h | 59 +- centrallix-lib/src/clusters.c | 716 ++++----- centrallix-lib/src/util.c | 25 +- centrallix-os/cluster-schema.cluster | 27 +- centrallix-os/file.cluster | 3 + centrallix-sysdoc/string_similarity.md | 167 +++ centrallix/include/obj.h | 1 + centrallix/osdrivers/objdrv_cluster.c | 1854 ++++++++++++++++-------- centrallix/test_obj.c | 7 + 11 files changed, 1936 insertions(+), 1070 deletions(-) create mode 100644 centrallix-lib/include/glyph.h create mode 100644 centrallix-sysdoc/string_similarity.md diff --git a/centrallix-lib/include/clusters.h b/centrallix-lib/include/clusters.h index 2605b4314..d8b7f97c6 100644 --- a/centrallix-lib/include/clusters.h +++ b/centrallix-lib/include/clusters.h @@ -1,3 +1,5 @@ +#ifndef CLUSTERS_H +#define CLUSTERS_H /************************************************************************/ /* Centrallix Application Server System */ @@ -23,7 +25,7 @@ /* A copy of the GNU General Public License has been included in this */ /* distribution in the file "COPYING". 
*/ /* */ -/* Module: lib_cluster.c */ +/* Module: lib_cluster.h */ /* Author: Israel Fuller */ /* Creation: September 29, 2025 */ /* Description: Internal algorithms for the cluster object driver. */ @@ -40,6 +42,7 @@ #define CA_NUM_DIMS 251 /* aka. The vector table size. */ +/// LINK ../../centrallix-sysdoc/string_comparison.md#cosine_charsets /** The character used to create a pair with the first and last characters of a string. **/ #define CA_BOUNDARY_CHAR ('a' - 1) @@ -57,37 +60,47 @@ typedef struct } Dup, *pDup; +/** Registering all defined types for debugging. **/ +#define ca_init() \ + nmRegister(sizeof(pVector), "pVector"); \ + nmRegister(sizeof(pCentroid), "pCentroid"); \ + nmRegister(pCentroidSize, "Centroid"); \ + nmRegister(sizeof(Dup), "Dup") + pVector ca_build_vector(const char* str); unsigned int ca_sparse_len(const pVector vector); void ca_free_vector(pVector sparse_vector); -void ca_kmeans( +int ca_kmeans( pVector* vectors, const unsigned int num_vectors, - unsigned int* labels, const unsigned int num_clusters, const unsigned int max_iter, - const double improvement_threshold -); -pXArray ca_search( - pVector* vectors, - const unsigned int num_vectors, - const unsigned int* labels, - const double dupe_threshold -); -pXArray ca_lightning_search( - pVector* vectors, - const unsigned int num_vectors, - const double dupe_threshold -); -unsigned int ca_edit_dist( - const char* str1, - const char* str2, - const size_t str1_length, - const size_t str2_length -); -pXArray ca_phone_search( - char dataset[][10u], - const unsigned int dataset_size, - const double dupe_threshold -); -void ca_init(); + const double min_improvement, + unsigned int* labels, + double* vector_sims); + +/** Comparison functions, for ca_search(). 
**/ +double ca_cos_compare(void* v1, void* v2); +double ca_lev_compare(void* str1, void* str2); + +void* ca_most_similar( + void* target, + void** data, + const unsigned int num_data, + const double (*similarity)(void*, void*), + const double threshold); +pXArray ca_sliding_search( + void** data, + const unsigned int num_data, + const unsigned int window_size, + const double (*similarity)(void*, void*), + const double dupe_threshold, + pXArray dups); +pXArray ca_complete_search( + void** data, + const unsigned int num_data, + const double (*similarity)(void*, void*), + const double dupe_threshold, + pXArray dups); + +#endif /* End of .h file. */ diff --git a/centrallix-lib/include/glyph.h b/centrallix-lib/include/glyph.h new file mode 100644 index 000000000..5f78eab5d --- /dev/null +++ b/centrallix-lib/include/glyph.h @@ -0,0 +1,78 @@ +#ifndef GLYPH_H +#define GLYPH_H + +/************************************************************************/ +/* Centrallix Application Server System */ +/* Centrallix Core */ +/* */ +/* Copyright (C) 1998-2012 LightSys Technology Services, Inc. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ +/* */ +/* A copy of the GNU General Public License has been included in this */ +/* distribution in the file "COPYING". 
*/ +/* */ +/* Module: glyph.h */ +/* Author: Israel Fuller */ +/* Creation: October 27, 2025 */ +/* Description: A simple debug visualizer to make pretty patterns in */ +/* developer's terminal which can be surprisingly useful */ +/* for debugging algorithms. */ +/************************************************************************/ + +#include + +/** Uncomment to use glyphs. **/ +/** TODO: Israel - Comment this out. **/ +// #define ENABLE_GLYPHS + +#ifdef ENABLE_GLYPHS +#define glyph_print(s) printf("%s", s); +/*** Initialize a simple debug visualizer to make pretty patterns in the + *** developer's terminal. Great for when you need to run a long task and + *** want a super simple way to make sure it's still working. + *** + *** @attention - Relies on storing data in variables in scope, so calling + *** glyph() requires a call to glyph_init() previously in the same scope. + *** + *** @param name The symbol name of the visualizer. + *** @param str The string printed for the visualization. + *** @param interval The number of invokations of glyph() required to print. + *** @param flush Whether to flush on output. + ***/ +#define glyph_init(name, str, interval, flush) \ + const char* vis_##name##_str = str; \ + const unsigned int vis_##name##_interval = interval; \ + const bool vis_##name##_flush = flush; \ + unsigned int vis_##name##_i = 0u; + +/*** Invoke a visualizer. + *** + *** @param name The name of the visualizer to invoke. + ***/ +#define glyph(name) \ + if (++vis_##name##_i % vis_##name##_interval == 0) \ + { \ + glyph_print(vis_##name##_str); \ + if (vis_##name##_flush) fflush(stdout); \ + } +#else +#define glyph_print(str) +#define glyph_init(name, str, interval, flush) +#define glyph(name) +#endif + +#endif /* End of .h file. 
*/ diff --git a/centrallix-lib/include/util.h b/centrallix-lib/include/util.h index 12019abfb..dd821767f 100644 --- a/centrallix-lib/include/util.h +++ b/centrallix-lib/include/util.h @@ -46,6 +46,7 @@ extern "C" { #endif #ifndef __cplusplus +#include /** TODO: Greg, is the __typeof__ syntax from GCC a portability concern? **/ @@ -79,58 +80,72 @@ extern "C" { (_a > _b) ? _a : _b; \ }) +/** File name macro, expanding functionality like __FILE__ and __LINE__. **/ +#define __FILENAME__ \ + ({ \ + const char* last_directory = strrchr(__FILE__, '/'); \ + ((last_directory != NULL) ? last_directory + 1 : __FILE__); \ + }) + /** Error Handling. **/ -void fail(const char* function_name, int code); +void print_diagnostics(int code, const char* function_name, const char* file_name, const int line_number); -/*** Helper function for compact error handling on library & system function calls. - *** Any non-zero value is treated as an error, exiting the program. +/*** Ensures that developer diagnostics are printed if the result of the + *** passed function call is not zero. Not intended for user errors. *** *** @param result The result of the function we're checking. - *** @returns result + *** @returns Whether the passed function succeeded. ***/ #define check(result) \ ({ \ + errno = 0; /* Reset errno to prevent confusion. */ \ __typeof__ (result) _r = (result); \ - if (_r != 0) fail(#result, _r); \ - _r; \ + const bool success = (_r == 0); \ + if (!success) print_diagnostics(_r, #result, __FILE__, __LINE__); \ + success; \ }) - -/*** Helper function for compact error handling on library & system function calls. - *** Any negative is treated as an error, exiting the program. + +/*** Ensures that developer diagnostics are printed if the result of the + *** passed function call is negative. Not intended for user errors. *** *** @param result The result of the function we're checking. - *** @returns result + *** @returns Whether the passed function succeeded. 
***/ #define check_neg(result) \ ({ \ + errno = 0; /* Reset errno to prevent confusion. */ \ __typeof__ (result) _r = (result); \ - if (_r < 0) fail(#result, _r); \ - _r; \ + const bool success = (_r >= 0); \ + if (!success) print_diagnostics(_r, #result, __FILE__, __LINE__); \ + success; \ }) -/*** Helper function for compact error handling on library & system function calls. - *** Any value of -1 is treated as an error, exiting the program. +/*** Ensures that developer diagnostics are printed if the result of the + *** passed function call is -1. Not intended for user errors. *** *** @param result The result of the function we're checking. - *** @returns result + *** @returns Whether the passed function succeeded. ***/ -#define check_strict(result) \ +#define check_weak(result) \ ({ \ + errno = 0; /* Reset errno to prevent confusion. */ \ __typeof__ (result) _r = (result); \ - if (_r == -1) fail(#result, _r); \ - _r; \ + const bool success = (_r != -1); \ + if (!success) print_diagnostics(_r, #result, __FILE__, __LINE__); \ + success; \ }) -/*** Helper function for compact error handling on library & system function calls. - *** Any null value is treated as an error, exiting the program. +/*** Ensures that developer diagnostics are printed if the result of the + *** passed function call is a NULL pointer. Not intended for user errors. *** - *** @param result The result of the function we're checking + *** @param result The result of the function we're checking. *** @returns result ***/ #define check_ptr(result) \ ({ \ + errno = 0; /* Reset errno to prevent confusion. 
*/ \ __typeof__ (result) _r = (result); \ - if (_r == NULL) fail(#result, 0); \ + if (_r == NULL) print_diagnostics(0, #result, __FILE__, __LINE__); \ _r; \ }) diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index 4e41d449d..90599269c 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -27,10 +27,10 @@ /* Author: Israel Fuller */ /* Creation: September 29, 2025 */ /* Description: Internal algorithms for the cluster object driver. */ -/* See centrallix-sysdoc/EAV_Pivot.md for more information. */ /************************************************************************/ -#include +/** This file has additional documentation in string_similarity.md. **/ + #include #include #include @@ -42,6 +42,7 @@ #include #include "clusters.h" +#include "glyph.h" #include "newmalloc.h" #include "util.h" #include "xarray.h" @@ -55,13 +56,6 @@ ***/ static unsigned int hash_char_pair(const unsigned int num1, const unsigned int num2) { - if (num1 == CA_BOUNDARY_CHAR && num2 == CA_BOUNDARY_CHAR) - { - // fprintf(stderr, - // "hash_char_pair(%u, %u) - Warning: Pair of boundary characters.\n", - // num1, num2 - // ); - } const double sum = (num1 * num1 * num1) + (num2 * num2 * num2); const double scale = ((double)num1 + 1.0) / ((double)num2 + 1.0); const unsigned int hash = (unsigned int)round(sum * scale) - 1u; @@ -201,15 +195,8 @@ pVector ca_build_vector(const char* str) /** Allocate space for sparse vector. **/ const size_t sparse_vector_size = size * sizeof(int); - pVector sparse_vector = (pVector)nmSysMalloc(sparse_vector_size); - if (sparse_vector == NULL) - { - fprintf(stderr, - "cli_build_vector(%s) - nmSysMalloc(%lu) failed.\n", - str, sparse_vector_size - ); - return NULL; - } + pVector sparse_vector = (pVector)check_ptr(nmSysMalloc(sparse_vector_size)); + if (sparse_vector == NULL) return NULL; /** Convert the dense vector above to a sparse vector. 
**/ unsigned int j = 0u, sparse_idx = 0u; @@ -248,46 +235,46 @@ void ca_free_vector(pVector sparse_vector) nmSysFree(sparse_vector); } -/*** Compute the magnitude of a sparsely allocated vector. +/*** Compute the length of a sparsely allocated vector. *** *** @param vector The vector. - *** @returns The computed magnitude. + *** @returns The computed length. ***/ -static double magnitude_sparse(const pVector vector) +unsigned int ca_sparse_len(const pVector vector) { - unsigned int magnitude = 0u; - for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;) + unsigned int i = 0u; + for (unsigned int dim = 0u; dim < CA_NUM_DIMS;) { const int val = vector[i++]; /** Negative val represents -val 0s in the array, so skip that many values. **/ if (val < 0) dim += (unsigned)(-val); - /** We have a param_value, so square it and add it to the magnitude. **/ - else { magnitude += (unsigned)(val * val); dim++; } + /** We have a param_value, but we don't need to do anything with it. **/ + else dim++; } - return sqrt((double)magnitude); + return i; } -/*** Compute the length of a sparsely allocated vector. +/*** Compute the magnitude of a sparsely allocated vector. *** *** @param vector The vector. - *** @returns The computed length. + *** @returns The computed magnitude. ***/ -unsigned int ca_sparse_len(const pVector vector) +static double magnitude_sparse(const pVector vector) { - unsigned int i = 0u; - for (unsigned int dim = 0u; dim < CA_NUM_DIMS;) + unsigned int magnitude = 0u; + for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;) { const int val = vector[i++]; /** Negative val represents -val 0s in the array, so skip that many values. **/ if (val < 0) dim += (unsigned)(-val); - /** We have a param_value, but we don't need to do anything with it. **/ - else dim++; + /** We have a param_value, so square it and add it to the magnitude. 
**/ + else { magnitude += (unsigned)(val * val); dim++; } } - return i; + return sqrt((double)magnitude); } /*** Compute the magnitude of a densely allocated centroid. @@ -417,6 +404,163 @@ static double sparse_similarity_to_centroid(const pVector v1, const pCentroid c2 ***/ #define sparse_dif_to_centroid(v1, c2) (1.0 - sparse_similarity_to_centroid(v1, c2)) +/*** Computes Levenshtein distance between two strings. + *** + *** @param str1 The first string. + *** @param str2 The second string. + *** @param length1 The length of the first string. + *** @param length1 The length of the first string. + *** + *** @attention - `Tip`: Pass 0 for the length of either string to infer it + *** using the null terminating character. Conversely, character arrays + *** with no null terminator are allowed if an explicit length is specified. + *** + *** @attention - `Complexity`: O(nm), where n and m are the lengths of str1 + *** and str2 (respectively). + *** + *** @skip + *** LINK ../../centrallix-sysdoc/string_comparison.md#levenshtein + ***/ +static unsigned int edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length) + { + /*** lev_matrix: + *** For all i and j, d[i][j] will hold the Levenshtein distance between + *** the first i characters of s and the first j characters of t. + *** + *** As they say, no dynamic programming algorithm is complete without a + *** matrix that you fill out and it has the answer in the final location. + ***/ + const size_t str1_len = (str1_length == 0u) ? strlen(str1) : str1_length; + const size_t str2_len = (str2_length == 0u) ? strlen(str2) : str2_length; + unsigned int* lev_matrix[str1_len + 1]; + for (unsigned int i = 0u; i < str1_len + 1u; i++) + lev_matrix[i] = nmMalloc((str2_len + 1) * sizeof(unsigned int)); + + /*** Base case #0: + *** Transforming an empty string into an empty string has 0 cost. 
+ ***/ + lev_matrix[0][0] = 0u; + + /*** Base case #1: + *** Any source prefixe can be transformed into an empty string by + *** dropping each character. + ***/ + for (unsigned int i = 1u; i <= str1_len; i++) + lev_matrix[i][0] = i; + + /*** Base case #2: + *** Any target prefixes can be transformed into an empty string by + *** inserting each character. + ***/ + for (unsigned int j = 1u; j <= str2_len; j++) + lev_matrix[0][j] = j; + + /** General Case **/ + for (unsigned int i = 1u; i <= str1_len; i++) + { + for (unsigned int j = 1u; j <= str2_len; j++) + { + /** Equal characters need no changes. **/ + if (str1[i - 1] == str2[j - 1]) + lev_matrix[i][j] = lev_matrix[i - 1][j - 1]; + + /*** We need to make a change, so use the opereration with the + *** lowest cost out of delete, insert, replace, or swap. + ***/ + else + { + unsigned int cost_delete = lev_matrix[i - 1][j] + 1u; + unsigned int cost_insert = lev_matrix[i][j - 1] + 1u; + unsigned int cost_replace = lev_matrix[i-1][j-1] + 1u; + + /** If a swap is possible, calculate the cost. **/ + bool can_swap = ( + i > 1 && j > 1 && + str1[i - 1] == str2[j - 2] && + str1[i - 2] == str2[j - 1] + ); + unsigned int cost_swap = (can_swap) ? lev_matrix[i - 2][j - 2] + 1 : UINT_MAX; + + // Find the best operation. + lev_matrix[i][j] = min(min(min(cost_delete, cost_insert), cost_replace), cost_swap); + } + } + } + + /** Store result. **/ + unsigned int result = lev_matrix[str1_len][str2_len]; + + /** Cleanup. **/ + for (unsigned int i = 0u; i < str1_len + 1u; i++) + nmFree(lev_matrix[i], (str2_len + 1) * sizeof(unsigned int)); + + return result; + } + +/*** Compares two strings using their cosie simiarity, returning a value + *** between `0.0` (completely different) and `1.0` (identical). If either + *** OR BOTH strings are NULL, this function returns `0.0`. 
+ *** + *** @attention - This function takes `void*` instead of `pVector` so that it + *** can be used as the similarity function in the ca_search() function + *** family without needing a messy typecast to avoid the compiler warning. + *** + *** @param v1 A `pVector` to the first string to compare. + *** @param v2 A `pVector` to the second string to compare. + *** @returns The cosine similarity between the two strings. + *** + *** @skip + *** LINK ../../centrallix-sysdoc/string_comparison.md#cosine + ***/ +double ca_cos_compare(void* v1, void* v2) + { + /** Input validation checks. **/ + if (v1 == NULL || v2 == NULL) return 0.0; + if (v1 == v2) return 1.0; + + /** Return the sparse similarity. **/ + return sparse_similarity((const pVector)v1, (const pVector)v2); + } + +/*** Compares two strings using their levenstien edit distance to compute a + *** similarity between `0.0` (completely different) and `1.0` (identical). + *** If both strings are empty, this function returns `1.0` (identical). If + *** either OR BOTH strings are NULL, this function returns `0.0`. + *** + *** @attention - This function takes `void*` instead of `char*` so that it + *** can be used as the similarity function in the ca_search() function + *** family without needing a messy typecast to avoid the compiler warning. + *** + *** @param str1 A `char*` to the first string to compare. + *** @param str2 A `char*` to the second string to compare. + *** @returns The levenshtein similarity between the two strings. + *** + *** @skip + *** LINK ../../centrallix-sysdoc/string_comparison.md#levenshtein + ***/ +double ca_lev_compare(void* str1, void* str2) + { + /** Input validation checks. **/ + if (str1 == NULL || str2 == NULL) return 0.0; + if (str1 == str2) return 1.0; + + /** Compute string length. **/ + const size_t len1 = strlen(str1); + const size_t len2 = strlen(str2); + + /** Empty strings are identical, avoiding a divide by zero. 
*/ + if (len1 == 0lu && len2 == 0lu) return 1.0; + + /** Compute levenshtein edit distance. **/ + const unsigned int dist = edit_dist((const char*)str1, (const char*)str2, len1, len2); + + /** Normalize edit distance into a similarity measure. **/ + const double normalized_similarity = 1.0 - (double)dist / (double)max(len1, len2); + + /** Done. **/ + return normalized_similarity; + } + /*** Calculate the average size of all clusters in a set of vectors. *** *** @param vectors The vectors of the dataset (allocated sparsely). @@ -436,8 +580,7 @@ static double get_cluster_size( /** Could be up to around 1KB on the stack, but I think that's fine. **/ double cluster_sums[num_clusters]; unsigned int cluster_counts[num_clusters]; - for (unsigned int i = 0u; i < num_clusters; i++) - cluster_sums[i] = 0.0; + memset(cluster_sums, 0, sizeof(cluster_sums)); memset(cluster_counts, 0, sizeof(cluster_counts)); /** Sum the difference from each vector to its cluster centroid. **/ @@ -499,14 +642,16 @@ unsigned int compute_k(const unsigned int n) *** *** @param vectors The vectors to cluster. *** @param num_vectors The number of vectors to cluster. - *** @param labels Stores the final cluster identities of the vectors after - *** clustering is completed. - *** @param centroids Stores the locations of the centroids used for the clusters - *** of the data. - *** @param iterations The number of iterations that actually executed is stored - *** here. Leave this NULL if you don't care. - *** @param max_iter The max number of iterations. *** @param num_clusters The number of clusters to generate. + *** @param max_iter The max number of iterations. + *** @param min_improvement The minimum amount of improvement that must be met + *** each clustering iteration. If there is less improvement, the algorithm + *** will stop. Pass any value less than -1 to fully disable this feature. + *** @param labels Stores the final cluster identities of the vectors after + *** clustering is completed. 
Each value will be `0 <= n < num_clusters`. + *** @param vector_sims An array of num_vectors elements, allocated by the + *** caller, where index i stores the similarity of vector i to its assigned + *** cluster. Passing NULL skips evaluation of these values. *** *** @attention - Assumes: num_vectors is the length of vectors. *** @attention - Assumes: num_clusters is the length of labels. @@ -528,49 +673,39 @@ unsigned int compute_k(const unsigned int n) *** *** - `O(nk + nd)` ***/ -void ca_kmeans( +int ca_kmeans( pVector* vectors, const unsigned int num_vectors, - unsigned int* labels, const unsigned int num_clusters, const unsigned int max_iter, - const double improvement_threshold) + const double min_improvement, + unsigned int* labels, + double* vector_sims) { - /** Ensure labels is clean. **/ - memset(labels, 0, num_clusters * sizeof(unsigned int)); + /** Setup stuff. **/ + bool successful = false; + unsigned int cluster_counts[num_clusters]; + memset(labels, 0u, num_vectors * sizeof(unsigned int)); /** Allocate space to store centroids and new_centroids. **/ /** Dynamic allocation is required because these densely allocated arrays might be up to 500KB! 
**/ - pCentroid* centroids = (pCentroid*)nmMalloc(num_clusters * sizeof(pCentroid)); - if (centroids == NULL) - { - fprintf(stderr, "ca_kmeans() - nmMalloc(%lu) failed.\n", num_clusters * sizeof(pCentroid)); - assert(false); - } - pCentroid* new_centroids = (pCentroid*)nmMalloc(num_clusters * sizeof(pCentroid)); - if (new_centroids == NULL) - { - fprintf(stderr, "ca_kmeans() - nmMalloc(%lu) failed.\n", num_clusters * sizeof(pCentroid)); - assert(false); - } + const size_t centroids_size = num_clusters * sizeof(pCentroid); + pCentroid* centroids = (pCentroid*)check_ptr(nmMalloc(centroids_size)); + if (centroids == NULL) goto end; + memset(centroids, 0, centroids_size); + pCentroid* new_centroids = (pCentroid*)check_ptr(nmMalloc(centroids_size)); + if (new_centroids == NULL) goto end_free_centroids; + memset(new_centroids, 0, centroids_size); for (unsigned int i = 0u; i < num_clusters; i++) { /** Malloc each centroid. **/ - centroids[i] = (pCentroid)nmMalloc(pCentroidSize); - if (centroids[i] == NULL) - { - fprintf(stderr, "ca_kmeans() - nmMalloc(%lu) failed.\n", pCentroidSize); - assert(false); - } + centroids[i] = (pCentroid)check_ptr(nmMalloc(pCentroidSize)); + if (centroids[i] == NULL) goto end_deep_free_centroids; memset(centroids[i], 0, pCentroidSize); /** Malloc each new centroid. **/ - new_centroids[i] = (pCentroid)nmMalloc(pCentroidSize); - if (new_centroids[i] == NULL) - { - fprintf(stderr, "ca_kmeans() - nmMalloc(%lu) failed.\n", pCentroidSize); - assert(false); - } + new_centroids[i] = (pCentroid)check_ptr(nmMalloc(pCentroidSize)); + if (new_centroids[i] == NULL) goto end_deep_free_centroids; memset(new_centroids[i], 0, pCentroidSize); } @@ -578,10 +713,10 @@ void ca_kmeans( srand(time(NULL)); for (unsigned int i = 0u; i < num_clusters; i++) { - // Pick a random vector. + /** Pick a random vector. **/ const pVector vector = vectors[rand() % num_vectors]; - // Sparse copy the vector to expand it into a densely allocated centroid. 
+ /** Sparse copy the vector to expand it into a densely allocated centroid. **/ pCentroid centroid = centroids[i]; for (unsigned int i = 0u, dim = 0u; dim < CA_NUM_DIMS;) { @@ -591,11 +726,17 @@ void ca_kmeans( } } + /** Setup debug visualizations. **/ + glyph_init(iter, "\n", 1, false); + glyph_init(find, ".", 64, false); + glyph_init(update_label, "!", 16, false); + glyph_init(update_centroid, ":", 8, false); + /** Main kmeans loop. **/ double old_average_cluster_size = 1.0; - unsigned int cluster_counts[num_clusters]; for (unsigned int iter = 0u; iter < max_iter; iter++) { + glyph(iter); bool changed = false; /** Reset new centroids. **/ @@ -609,6 +750,7 @@ void ca_kmeans( /** Assign each point to the nearest centroid. **/ for (unsigned int i = 0u; i < num_vectors; i++) { + glyph(find); const pVector vector = vectors[i]; double min_dist = DBL_MAX; unsigned int best_centroid_label = 0u; @@ -627,6 +769,7 @@ void ca_kmeans( /** Update label to new centroid, if necessary. **/ if (labels[i] != best_centroid_label) { + glyph(update_label); labels[i] = best_centroid_label; changed = true; } @@ -648,6 +791,7 @@ void ca_kmeans( /** Update centroids. **/ for (unsigned int i = 0u; i < num_clusters; i++) { + glyph(update_centroid); if (cluster_counts[i] == 0u) continue; pCentroid centroid = centroids[i]; const pCentroid new_centroid = new_centroids[i]; @@ -657,331 +801,187 @@ void ca_kmeans( } /** Is there enough improvement? **/ + if (min_improvement < -1) continue; /** Skip check if it will always fail. **/ const double average_cluster_size = get_cluster_size(vectors, num_vectors, labels, centroids, num_clusters); const double improvement = old_average_cluster_size - average_cluster_size; - if (improvement < improvement_threshold) break; + if (improvement < min_improvement) break; old_average_cluster_size = average_cluster_size; } - /** Clean up. **/ - for (unsigned int i = 0u; i < num_clusters; i++) + /** Compute vector similarities, if requested. 
**/ + if (vector_sims != NULL) { - nmFree(centroids[i], pCentroidSize); - nmFree(new_centroids[i], pCentroidSize); - } - nmFree(centroids, num_clusters * sizeof(pCentroid)); - nmFree(new_centroids, num_clusters * sizeof(pCentroid)); - } - -pXArray ca_search( - pVector* vectors, - const unsigned int num_vectors, - const unsigned int* labels, - const double dupe_threshold) - { - /** Allocate space for dups. **/ - pXArray dups = xaNew(num_vectors); - if (dups == NULL) - { - fprintf(stderr, "ca_search() - xaNew(%u) failed.\n", num_vectors); - return NULL; + for (unsigned int i = 0u; i < num_vectors; i++) + vector_sims[i] = sparse_similarity_to_centroid(vectors[i], centroids[labels[i]]); } - unsigned int a = 0, b = 0, c = 0, d = 0; - for (unsigned int i = 0u; i < num_vectors; i++) - { - const pVector v1 = vectors[i]; - const unsigned int label = labels[i]; - for (unsigned int j = i + 1u; j < num_vectors; j++) - { - if (b++ % 100 == 0) printf("."); - if (labels[j] != label) continue; - if (c++ % 100 == 0) printf(":"); - const pVector v2 = vectors[j]; - const double similarity = sparse_similarity(v1, v2); - if (similarity > dupe_threshold) /* Dup found! */ - { - Dup* dup = (Dup*)nmMalloc(sizeof(Dup)); - if (dup == NULL) - { - fprintf(stderr, - "ca_search() - nmMalloc(%lu) failed.\n", - sizeof(Dup) - ); - goto err_free_dups; - } - - dup->id1 = i; - dup->id2 = j; - dup->similarity = similarity; - xaAddItem(dups, (void*)dup); - if (d++ % 4 == 0) printf("!"); - } - } - if (a++ % 4 == 0) printf("\n"); - } + glyph_print("\n"); - return dups; + /** Success. **/ + successful = true; - /** Free dups. **/ - err_free_dups:; - const size_t num_dups = dups->nItems; - for (unsigned int i = 0u; i < num_dups; i++) - { - nmFree(dups->Items[i], sizeof(Dup)); - dups->Items[i] = NULL; - } - xaDeInit(dups); - return NULL; - } - -/*** Runs complete search to find duplocates if `num_vectors < MAX_COMPLETE_SEARCH` - *** and runs a search using k-means clustering on larger amounts of data. 
- *** - *** @param vectors Array of precomputed frequency vectors for all dataset strings. - *** @param num_vectors The number of vectors to be scanned. - *** @param dupe_threshold The similarity threshold, below which dups are ignored. - *** @returns The duplicates in pDup structs. - ***/ -pXArray ca_lightning_search(pVector* vectors, const unsigned int num_vectors, const double dupe_threshold) - { - /** Allocate space for dups. **/ - const size_t guess_size = num_vectors * 2u; - pXArray dups = xaNew(guess_size); - if (dups == NULL) + /** Clean up. **/ + end_deep_free_centroids: + for (unsigned int i = 0u; i < num_clusters; i++) { - fprintf(stderr, "ca_lightning_search() - xaNew(%lu) failed.\n", guess_size); - return NULL; + if (centroids[i] != NULL) nmFree(centroids[i], pCentroidSize); + else break; + if (new_centroids[i] != NULL) nmFree(new_centroids[i], pCentroidSize); + else break; } - /** Descide which algorithm to use. **/ - if (num_vectors <= 50 * 1000) - { /** Do a complete search. **/ - for (unsigned int i = 0u; i < num_vectors; i++) - { - const pVector v1 = vectors[i]; - for (unsigned int j = i + 1u; j < num_vectors; j++) - { - const pVector v2 = vectors[j]; - const double similarity = sparse_similarity(v1, v2); - if (similarity > dupe_threshold) // Dup found! - { - Dup* dup = (Dup*)nmMalloc(sizeof(Dup)); - if (dup == NULL) - { - fprintf(stderr, "ca_lightning_search() - nmMalloc(%lu) failed.\n", sizeof(Dup)); - goto err_free_dups; - } - - dup->id1 = i; - dup->id2 = j; - dup->similarity = similarity; - xaAddItem(dups, (void*)dup); - } - } - } - } - else - { /** Do a k-means search. **/ - /** Define constants for the algorithm. **/ - const unsigned int max_iter = 64u; /** Hardcode value because idk. **/ - const unsigned int num_clusters = compute_k(num_vectors); - - /** Allocate static memory for finding clusters. **/ - unsigned int labels[num_vectors]; - memset(labels, 0u, sizeof(labels)); - - /** Execute kmeans clustering. 
**/ - ca_kmeans(vectors, num_vectors, labels, num_clusters, max_iter, 0.0002); - - /** Find duplocates in clusters. **/ - for (unsigned int i = 0u; i < num_vectors; i++) - { - const pVector v1 = vectors[i]; - const unsigned int label = labels[i]; - for (unsigned int j = i + 1u; j < num_vectors; j++) - { - if (labels[j] != label) continue; - const pVector v2 = vectors[j]; - const double similarity = sparse_similarity(v1, v2); - if (similarity > dupe_threshold) /* Dup found! */ - { - Dup* dup = (Dup*)nmMalloc(sizeof(Dup)); - if (dup == NULL) - { - fprintf(stderr, - "ca_lightning_search() - nmMalloc(%lu) failed.\n", - sizeof(Dup) - ); - goto err_free_dups; - } - - dup->id1 = i; - dup->id2 = j; - dup->similarity = similarity; - xaAddItem(dups, (void*)dup); - } - } - } - } + // end_free_new_centroids: + nmFree(new_centroids, num_clusters * sizeof(pCentroid)); - /** Done **/ - return dups; + end_free_centroids: + nmFree(centroids, num_clusters * sizeof(pCentroid)); - /** Free dups. **/ - err_free_dups:; - const size_t num_dups = dups->nItems; - for (unsigned int i = 0u; i < num_dups; i++) - { - nmFree(dups->Items[i], sizeof(Dup)); - dups->Items[i] = NULL; - } - xaDeInit(dups); - return NULL; + end: + return (successful) ? 0 : -1; } -/*** Computes Levenshtein distance between two strings. - *** - *** @param str1 The first string. - *** @param str2 The second string. - *** @param length1 The length of the first string. - *** @param length1 The length of the first string. - *** - *** @attention - Tip: Pass 0 for the length of either string to infer it - *** using the null terminating character. Thus, strings with no null - *** terminator are supported if you pass explicit lengths. - *** - *** Complexity: O(length1 * length2). - *** - *** @see centrallix-sysdoc/string_comparison.md +/*** Finds the data that is the most similar to the target and returns + *** it if the similarity meets the threshold. 
+ *** + *** @param target The target data to compare to the rest of the data. + *** @param data The rest of the data, compared against the target to + *** find the data that is the most similar. + *** @param num_data The number of elements in data. Specify 0 to detect + *** length on a null terminated array of data. + *** @param similarity A function which takes two data items of the type + *** of the data param and returns their similarity. + *** @param threshold The minimum similarity threshold. If the most similar + *** data does not meet this threshold, the funciton returns NULL. + *** @returns A pointer to the most similar piece of data found in the data + *** array, or NULL if the most similar data did not meet the threshold. ***/ -unsigned int ca_edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length) +void* ca_most_similar( + void* target, + void** data, + const unsigned int num_data, + const double (*similarity)(void*, void*), + const double threshold) { - /*** lev_matrix: - *** For all i and j, d[i][j] will hold the Levenshtein distance between - *** the first i characters of s and the first j characters of t. - *** - *** As they say, no dynamic programming algorithm is complete without a - *** matrix that you fill out and it has the answer in the final location. - ***/ - const size_t str1_len = (str1_length == 0u) ? strlen(str1) : str1_length; - const size_t str2_len = (str2_length == 0u) ? strlen(str2) : str2_length; - unsigned int lev_matrix[str1_len + 1][str2_len + 1]; - - /*** Base case #0: - *** Transforming an empty string into an empty string has 0 cost. - ***/ - lev_matrix[0][0] = 0u; - - /*** Base case #1: - *** Any source prefixe can be transformed into an empty string by - *** dropping each character. - ***/ - for (unsigned int i = 1u; i <= str1_len; i++) - lev_matrix[i][0] = i; - - /*** Base case #2: - *** Any target prefixes can be transformed into an empty string by - *** inserting each character. 
- ***/ - for (unsigned int j = 1u; j <= str2_len; j++) - lev_matrix[0][j] = j; - - /** General Case **/ - for (unsigned int i = 1u; i <= str1_len; i++) + void* most_similar = NULL; + double best_sim = -INFINITY; + for (unsigned int i = 0u; (num_data == 0u) ? (data[i] != NULL) : (i < num_data); i++) { - for (unsigned int j = 1u; j <= str2_len; j++) + const double sim = similarity(target, data[i]); + if (sim > best_sim && sim > threshold) { - /** Equal characters need no changes. **/ - if (str1[i - 1] == str2[j - 1]) - lev_matrix[i][j] = lev_matrix[i - 1][j - 1]; - - /*** We need to make a change, so use the opereration with the - *** lowest cost out of delete, insert, replace, or swap. - ***/ - else - { - unsigned int cost_delete = lev_matrix[i - 1][j] + 1u; - unsigned int cost_insert = lev_matrix[i][j - 1] + 1u; - unsigned int cost_replace = lev_matrix[i-1][j-1] + 1u; - - /** If a swap is possible, calculate the cost. **/ - bool can_swap = ( - i > 1 && j > 1 && - str1[i - 1] == str2[j - 2] && - str1[i - 2] == str2[j - 1] - ); - unsigned int cost_swap = (can_swap) ? lev_matrix[i - 2][j - 2] + 1 : UINT_MAX; - - // Find the best operation. - lev_matrix[i][j] = min(min(min(cost_delete, cost_insert), cost_replace), cost_swap); - } + most_similar = data[i]; + best_sim = sim; } } - - return lev_matrix[str1_len][str2_len]; + return most_similar; } -/*** Runs complete search to find duplocates in phone numbers using the - *** levenshtein min edit distance algorithm. - *** - *** @param dataset An array of characters for all dataset strings. - *** @param dataset_size The number of phone numbers to be scanned. - *** @param dupe_threshold The similarity threshold, below which dups are ignored. - *** @returns The duplicates in pDup structs. + +/*** Runs a sliding search over the povided data, comparing each element to + *** the following `window_size` elements, invoking the passed comparison + *** function just under `window_size * num_data` times. 
If any comparison + *** yeilds a similarity greater than the threshold, it is stored in the + *** xArray returned by this function. + *** + *** @param data The data to be searched. + *** @param num_data The number of data items in data. + *** @param window_size The size of the sliding window used for the search. + *** @param similarity A function which takes two data items of the type of + *** the data param and returns their similarity. + *** @param threshold The minimum threshold required for a duplocate to be + *** included in the returned xArray. + *** @param maybe_dups A pointer to an xArray in which dups should be found. + *** Pass NULL to allocate a new one. + *** @returns An xArray holding all of the duplocates found. If maybe_dups is + *** not NULL, this will be that xArray, to allow for chaining. ***/ -pXArray ca_phone_search(char dataset[][10u], const unsigned int dataset_size, const double dupe_threshold) +pXArray ca_sliding_search( + void** data, + const unsigned int num_data, + const unsigned int window_size, + const double (*similarity)(void*, void*), + const double threshold, + pXArray dups) { - /** Allocate space for dups. **/ - const size_t guess_size = dataset_size * 2u; - pXArray dups = xaNew(guess_size); - if (dups == NULL) + /** Allocate space for dups (if necessary). **/ + const bool allocate_dups = (dups == NULL); + if (allocate_dups) { - fprintf(stderr, "ca_phone_search() - xaNew(%lu) failed.\n", guess_size); - return NULL; + /** Guess that we will need space for num_data * 2 dups. **/ + const int guess_size = num_data * 2; + dups = check_ptr(xaNew(guess_size)); + if (dups == NULL) goto err; } + const int num_starting_dups = dups->nItems; - /** Search for dups using edit distance. **/ - for (unsigned int i = 0u; i < dataset_size; i++) - { - const char* v1 = dataset[i]; - for (unsigned int j = i + 1u; j < dataset_size; j++) + /** Setup debug visualizations. 
**/ + glyph_init(outer, " ", 4, true); + glyph_init(inner, ".", 128, false); + glyph_init(find, "!", 32, false); + + /** Search for dups. **/ + for (unsigned int i = 0u; i < num_data; i++) + { + glyph(outer); + const unsigned int window_start = i + 1u; + const unsigned int window_end = min(i + window_size, num_data); + for (unsigned int j = window_start; j < window_end; j++) { - const char* v2 = dataset[j]; - const unsigned int dist = ca_edit_dist(v1, v2, 10u, 10u); - const double similarity = (double)dist / 10.0; - if (similarity > dupe_threshold) /* Dup found! */ + glyph(inner); + const double sim = similarity(data[i], data[j]); + if (sim > threshold) /* Dup found! */ { - Dup* dup = (Dup*)nmMalloc(sizeof(Dup)); - if (dup == NULL) - { - fprintf(stderr, "ca_phone_search() - nmMalloc(%lu) failed.\n", sizeof(Dup)); - - /** Free data before returning. **/ - const size_t num_dups = dups->nItems; - for (unsigned int i = 0u; i < num_dups; i++) - { - void* dup = dups->Items[i]; - nmFree(dup, sizeof(Dup)); - } - xaDeInit(dups); - return NULL; - } - + glyph(find); + Dup* dup = (Dup*)check_ptr(nmMalloc(sizeof(Dup))); + if (dup == NULL) goto err_free_dups; dup->id1 = i; dup->id2 = j; - dup->similarity = similarity; - xaAddItem(dups, (void*)dup); + dup->similarity = sim; + if (!check_neg(xaAddItem(dups, (void*)dup))) goto err_free_dups; } } } + glyph_print("\n"); + /** Success. **/ return dups; + + /** Error cleanup. **/ + + err_free_dups: + /** Free the dups we added to the XArray. */ + while (dups->nItems > num_starting_dups) + nmFree(dups->Items[dups->nItems--], sizeof(Dup)); + if (allocate_dups) check(xaDeInit(dups)); /* Failure ignored. */ + + err: + return NULL; } -void ca_init() +/*** Runs a complete search over the povided data, comparing each element to + *** each other element, invoking the passed comparison function `num_data^2` + *** times. If any comparison yeilds a similarity greater than the threshold, + *** it is stored in the xArray returned by this function. 
+ *** + *** @param data The data to be searched. + *** @param num_data The number of data items in data. + *** @param similarity A function which takes two data items of the type of + *** the data param and returns their similarity. + *** @param threshold The minimum threshold required for a duplocate to be + *** included in the returned xArray. + *** @param maybe_dups A pointer to an xArray in which dups should be found. + *** Pass NULL to allocate a new one. + *** @returns An xArray holding all of the duplocates found. If maybe_dups is + *** not NULL, this will be that xArray, to allow for chaining. + ***/ +pXArray ca_complete_search( + void** data, + const unsigned int num_data, + const double (*similarity)(void*, void*), + const double threshold, + pXArray dups) { - nmRegister(sizeof(Dup), "Dup"); + return ca_sliding_search(data, num_data, num_data, similarity, threshold, dups); } /** Scope cleanup. **/ diff --git a/centrallix-lib/src/util.c b/centrallix-lib/src/util.c index ec1d87bcf..450c16593 100644 --- a/centrallix-lib/src/util.c +++ b/centrallix-lib/src/util.c @@ -110,10 +110,13 @@ int util_detect_num_threads(void) ***/ #define USE_METRIC false #define nUnits 6u -static char* units_cs[nUnits] = {"bytes", "KiB", "MiB", "GiB", "TiB", "PiB"}; -static char* units_metric[nUnits] = {"bytes", "KB", "MB", "GB", "TB", "PB"}; +static char* units_cs[nUnits] = {"bytes", "KiB", "MiB", "GiB"}; +static char* units_metric[nUnits] = {"bytes", "KB", "MB", "GB"}; + /*** Displays a size in bytes using the largest unit where the result would be - *** at least 1.0. + *** at least 1.0. Note that units larger than GB and GiB are not supported + *** because the largest possible unsigned int is 4,294,967,295, which is + *** exactly 4 GiB (or approximately 4.29 GB). *** *** @param buf The buffer to which new text will be written, using snprintf(). *** @param buf_size The amount of space in the buffer, passed to snprintf(). 
@@ -228,16 +231,14 @@ void timer_free(pTimer timer) /*** Function for failing on error, assuming the error came from a library or *** system function call, so that the error buffer is set to a valid value. ***/ -void fail(const char* function_name, int code) +void print_diagnostics(int code, const char* function_name, const char* file_name, const int line_number) { - /** Create the most descriptive error message we can. **/ + /** Create a descriptive error message. **/ char error_buf[BUFSIZ]; - snprintf(error_buf, sizeof(error_buf), "kmeans.c: Fail - %s", function_name); - if (errno != 0) perror(error_buf); - else if (code != 0) fprintf(stderr, "%s (error code %d)\n", error_buf, code); - else fprintf(stderr, "%s", error_buf); + snprintf(error_buf, sizeof(error_buf), "%s:%d: %s failed", file_name, line_number, function_name); - /** Throw error for easier locating in a debugger. **/ - fprintf(stderr, "Program will now crash.\n"); - raise(SIGSEGV); + /** Print it with as much info as we can reasonably find. **/ + if (errno != 0) perror(error_buf); + else if (code != 0) fprintf(stderr, "%s (error code %d).\n", error_buf, code); + else fprintf(stderr, "%s.\n", error_buf); } diff --git a/centrallix-os/cluster-schema.cluster b/centrallix-os/cluster-schema.cluster index a97d7f9ba..9f11c1636 100644 --- a/centrallix-os/cluster-schema.cluster +++ b/centrallix-os/cluster-schema.cluster @@ -10,7 +10,7 @@ file_name "system/cluster" ?style : StyleObj // idk where to find docs for this. } // Access with :parameters:name. Accessing dynamic data (e.g. parameters) - // should be managed within a runserver() call. + // should be done within a runserver() call. ... 
source : DataSourcePath @@ -18,12 +18,14 @@ file_name "system/cluster" cluster_name "cluster/cluster" { - algorithm : "none" | "sliding-window" | "k-means" - | "k-means++" | "k-medoids" |"db-scan" // dbscan not implemented + algorithm : "none" | "sliding-window" | "k-means" // Implemented + | "k-means++" | "k-medoids" | "db-scan" // Not implemented similarity_measure : "cosine" | "levenshtein" // levenshtein not implemented. num_clusters : uint > 1 // (probably a parameter) ?min_improvement : double && 0.0 < x < 1.0 | "none" // default: 0.0001 ?max_iterations : uint // default: 64 + ?window_size : uint > 0 // required for algorithm = sliding_window. + ?overlap_size : double && 0.0 <= x <= 1.0 // default: 0.0, only allowed for algorithm = k-means | k-means++ | k-medoids, not implemented // Not implemented sub_cluster_name "cluster/cluster" @@ -37,26 +39,21 @@ file_name "system/cluster" { source : string ⊂ [cluster_name, ...] threshold : double && 0.0 < x < 1.0 // optimization. - similarity_measure : "cosine" | "levenshtein" // levenshtein not implemented. + similarity_measure : "cosine" | "levenshtein" } ... } // Output schema -- /{arbitrary uint} +- /cluster_name ? /sub_cluster_name - ? /{arbitrary uint} + ? ... + - /{query} + - /items : StringVec // The data points in the cluster. ... - - /average_similarity : double && 0.0 < x < 1.0 - - /size = average_similarity - - /{arbitrary uint} - - /val : string // The value of the data point. - - /label : uint < num_clusters // id of the cluster to which this data point belongs. - - /sim : double && 0.0 < x <= threshold // Similarity to cluster centroid. -... /search_name -- /{arbitrary uint} +- /{query} - /id1 : uint < sizeof(source/attr_name) // The id of the first data point. - /id2 : uint < sizeof(source/attr_name) // The id of the second data point. - /val1 : string // The value of the first data point. @@ -71,7 +68,7 @@ file_name "system/cluster" // thing, because that feels like a higher-level responsibility. 
// Invoke file: -// select * from /file.cl +// select * from /file.cluster // Driver-authoring.md // Comprehend stparse.c (lib vs. centrallix?) diff --git a/centrallix-os/file.cluster b/centrallix-os/file.cluster index 929efdd03..078a39fcc 100644 --- a/centrallix-os/file.cluster +++ b/centrallix-os/file.cluster @@ -23,11 +23,14 @@ file_name "system/cluster" source = "/apps/kardia/data/Kardia_DB/p_partner/rows"; attr_name = p_given_name; // runserver(:parameters:str) + // Multiple data sources when? + // Clustering object specifies properties for clustering. kmeans_cluster "cluster/cluster" { algorithm = "k-means"; similarity_measure = "cosine"; + // window_size = 16; num_clusters = runserver(:parameters:k); min_improvement = 0.0001; max_iterations = 48; diff --git a/centrallix-sysdoc/string_similarity.md b/centrallix-sysdoc/string_similarity.md new file mode 100644 index 000000000..f466a057c --- /dev/null +++ b/centrallix-sysdoc/string_similarity.md @@ -0,0 +1,167 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# String Similarity +The following sections discuss the approaches to calculating similarity between two strings which are implemented in the `clusters.c` library. This library can be incuded using `#include "clusters.h"` in centrallix-lib and `#include "cxlib/clusters.h"` in centrallix. 
+ + +## Table of Contents +- [String Comparison](#string-comparison) + - [Table of Contents](#table-of-contents) + - [Cosine Similarity](#cosine-similarity) + - [Character Sets](#character-sets) + - [Character Pair Hashing](#character-pair-hashing) + - [String Vectors](#string-vectors) + - [Sparse Vectors](#sparse-vectors) + - [Computing Similarity](#computing-similarity) + - [Levenshtein Similarity](#levenshtein-similarity) + - [Clustering](#clustering) + - [K-means Clustering](#k-means-clustering) + - [K-means++ Clustering](#k-means-clustering-1) + - [K-medoids Clustering](#k-medoids-clustering) + - [DBScan Clustering](#db-scan) + - [Sliding Clusters](#sliding-clusters) + - [Future Implementation](#future-implementation) + - [K-means Fuzzy Clustering](#k-means-fuzzy-clusterings) + - [Implement Missing Algorithms](#implement-missing-algorithms) + + +## Cosine Similarity +The [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) function is defined as the dot product of two vectors divided by the product of the magnitude of the two vectors. Conceptually, it's like finding the _angle_ between two vectors. To get these vectors, we use the relative frequency of character pairs within each string. To reduce memory cost and speed up computation, we store them in a special sparcely allocated form, described below. + +### Character Sets +Cosine compare currnetly uses the following character sets. These can be extended or modified later, if necessary. +```c +const char ALLOW_SET[] = " \n\v\f\r!#$%&\"'()*+,-./:;<=>?@[]^_{|}~ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; +const char CHAR_SET[] = "`abcdefghijklmnopqrstuvwxyz0123456789"; +const char SIGNIFICANT_SET[] = "`ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; +const char IGNORE_SET[] = " \n\v\f\r!#$%&\"'()*+,-./:;<=>?@[]^_{|}"; +const char BOUNDARY_CHAR = ('a' - 1); // aka. 
'`' +``` +- `ALLOW_SET` represents all characters which can be passed to a similarity detection algorithm. Passing other characters may cause warnings and errors, undefined or unintended behavior, and even security concerns. +- `CHAR_SET` represents all of the characters that will be uniquely considered during the calculation of similarity. Currently, this is all lowercase letters and numbers. +- `SIGNIFICANT_SET` represents all of the characters that are significant for the purposes of similarity. For example, the upercase letters are significant because they are considered identical to lowercase letters. Thus, they are included in the `SIGNIFICANT_SET`, but not in the `CHAR_SET`. +- `IGNORE_SET` represents characters which, while allowed to be passed to a similarity algorithm, will be ignored. For example, the strings "Ya!!" and "Ya..." will be considered identical. +- The `BOUNDARY_CHAR` is a special character which is conceptually added to the start and end of any string to be checked. + - This allows for pairs that functionally include only the first and last character. + - This character appears to have been selected to be one before the first character in `CHAR_SET` (thus convention dictates that it be written `'a' - 1` to indicate this), although it's unknown if that's the main or only reason. + - If `clusters.h` is included, it can be accessed using the `CA_BOUNDARY_CHAR` macro. + +### Character Pair Hashing +Even with a small set of ASCII characters (say 36), there are still `36^2 = 1296` possible character pairs. If the number of characters in the `CHAR_SET` ever needed to be expanded - for example, to include all UTF-8 characters - this number would quickly explode exponentially to utterly infeasible proportions. Thus, a hashing algorithm is employed to hash each character pair down to a more reasonable number of dimensions (which can be accessed with the `CA_NUM_DIMS` macro). 
+ +### String Vectors +Any string of characters in the `ALLOW_SET` can be represented by a vector. For simplicty, imagine this vector has only `5` dimensions. To find this vector, we hash each character pair in the string. As each character pair is hashed (for example, that the pair "ab" happens to hash to `3`), the corresponding dimension is increased by some amount. This amount varies to based on the characters in the pair, helping to mitigate the impact of collisions where different character pairs hash to identical numbers (a larger number of dimensions also helps to mitigate this). + +Remember that the first and last characters form a pair with the `BOUNDARY_CHAR`, so the string "ab" has three pairs: "a", "ab", and "b". If these each hash to `2`, `3`, and `0`. Thus, the vector generated by the string "ab" might be: `[7, 0, 4, 3, 0]`. Notice that dimensions #1 and #4 are both `0` because no character pairs generated a hash of `1` or `4`. In real usecases, the vast majority of elements are `0`s because the number of dimensions used is much larger than the number of character pairs in a typical string. + +### Sparse Vectors +As noted above, the vast majority of elements in a vector generated by a typical string are `0`s. This would lead to a large waste of memory and computation if every `0` was stored separately, so instead, vectors are stored sparsely. Because all hashes are positive integers, we represent `n` `0`s with a value of ` -n`. Thus, the vector `[0, 1, 0, 0, 0]` (representing an empty string in `5` dimensions) would be represented sparsely as `[-1, 1, -3]`. + +**Note**: A value of `0` in a sparse vector is undefined, so no element should be equal to `0`. + +**Note**: Sparse arrays can vary greatly in length. To find their size, one needs to traverse the array until the total number of values found adds up to `CA_NUM_DIMS`. The `ca_sparse_len()` function can be used to do this. 
Also, the `ca_build_vector()` and `ca_free_vector()` use the `nmSys` functions from `newmalloc.h` to avoid conflicts over the size of the allocated data. + +### Computing Similarity +Finally, to find the cosine similarity between two strings, we can simply take the [dot product](https://en.wikipedia.org/wiki/Dot_product) of their coresponding vectors. Then, we normalize the dot product by dividing by the magnitudes of both vectors multiplied together. Two strings can be compared this way using the `ca_cos_compare()` function. + + +## Levenshtein Similarity +The [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) distance is defined as the number of insertions, deletions, or substitutions required to make one string exactly like another string. The version implemented in `clusters.c` additionally allows a new operation called a "swap" in which two adjacent characters change places. Transpositions of larger pieces of text are, unfortunately, not handled as well, which is a potential downfall of using levenshtein edit distance. + +The levenshtein similarity of two strings can be compared using the `ca_lev_compare()` function. + + +## Clustering +When searching for similar strings in a large amount of data (for example, `1,000,000` strings), comparing every string to every other string can be very computationally expensive. To speed up this process, it is helpful to _cluster_ similar strings together, then only compare strings within similar clusters. This sacrifices some accuracy to allow large amounts of data to be searched and compared in a feasable amount of time. + +### K-means Clustering +When clustering data using the [k-means](https://en.wikipedia.org/wiki/K-means_clustering) algorithm, data is divided into a predefined number of clusters with the goal of maximizing the average similarity of datapoints within any given cluster. To quickly summarize the algorithm: +1. Randomly select `k` datapoints to be the initial centroids of each cluster. +2. 
For each datapoint, find the centroid it is most similar to, and assign it to that cluster. +3. For each cluster, find the new centroid by averaging all datapoints in the cluster. +4. Repeat steps 2 and 3 until the clusters stabilize (i.e. no datapoint changes clusters). + +The implementation used in `clusters.c` also allows the programmer to specify a maximum number of iterations (called `max_iter` in the code) to prevent this process from running forever. Additionally, successive iterations can give diminishing results or even produce clusters that are slightly worse. To improve performance, the programmer can also specify a minimum improvement threshold (called `min_improvement`). Clusters must become more similar by at least this amount each iteration, otherwise the algorithm ends, even if the maximum number of iterations has not yet been reached. + +The `ca_kmeans()` function can be invoked using [the cosine comparison string vectors](#string-vectors) (see above) to cluster them into similar clusters. + +### K-means++ Clustering +**Not yet implemented** +This method is largely identical to k-means, except that [k-means++](https://en.wikipedia.org/wiki/K-means%2B%2B) assigns the initial centroids using an approximate algorithm designed to avoid some of the poor clusterings possible with random assignment. + +### K-medoids Clustering +**Not yet implemented** +This method is also very similar to k-means, except that [k-medoids](https://en.wikipedia.org/wiki/K-medoids) places an additional requirement that all centroids be points in the data. This would theoretically allow for other similarity measures (such as levenshtein edit distance) to be used for clustering instead of only cosine compare.
+ +### DB-Scan +**Proposed, not yet implemented or documented** + +### Sliding Clusters +A far more basic method of "clustering" is to simply sort all data alphabetically, then, instead of comparing each string to all other strings, it can be compared to only the next `n` strings. Of course, differences near the start of a string (for example, "fox" vs. "box") will cause those strings to sort far away from each other, leading them to be completely missed. + +Sorting using a similarity measure, such as `ca_cos_compare()` or `ca_lev_compare()`, would resolve this issue. However, these comparison functions do not meet the transitivity requirement for sorting, which is that `(A < B) & (B < C) -> (A < C)`. For example, "car" is similar to "boxcar", which is also similar to "box". However, "car" and "box" are not similar at all. + +Additionally, sorting by the cosine vectors (similarly to how we cluster by them when using k-means) was proposed, but further investigation showed that this was also not possible. + +For problems where a sorting algorithm exists which can mitigate the above issues, this solution may prove very promising. However, so far we have not found such a problem, so the other clustering algorithms tend to outperform Sliding Clusters. + + +## Future Implementation + +### K-means Fuzzy Clustering +One of the biggest downsides with k-means is that it creates very arbitrary boundaries between clusters. Elements on either side of these boundaries may be highly similar, but if comparisons only occur within a cluster, these similar entries will be missed. The problem becomes more extreme as a higher k value (more clusters) is used, creating more arbitrary boundaries. This drawback is probably the main reason that clustering sacrifices some accuracy over searching every element. + +Running the entire search multiple times may allow some of these to be found because the initial cluster locations are random.
This approach is partially implemented for duplocate searching because the algorithm runs nightly anyway, so a simple up-sert (**UP**date existing entries; in**SERT** new entries) slightly reduces this problem. However, this solution is obviously far from ideal. + +If the clustering could be expanded with an additional step that makes clusters larger, adding elements from other clusters to them, this might effectively mitigate the issue. It may also allow developers to use larger numbers of clusters, improving performance as well as accuracy. Further research is needed to verify the effectiveness of this approach before an implementation is written. + +### Implement Missing Algorithms +Several algorithms (such as [k-means++](#k-means-clustering-1), [k-medoids](#k-medoids-clustering), and [DBScan](#db-scan)) above are proposed but lack an implementation. They may be effective and useful, however, to reduce development time, they have not yet been implemented. diff --git a/centrallix/include/obj.h b/centrallix/include/obj.h index 045d57f85..54d4c988a 100644 --- a/centrallix/include/obj.h +++ b/centrallix/include/obj.h @@ -192,6 +192,7 @@ typedef struct _OSD int (*Commit)(); int (*GetQueryCoverageMask)(); int (*GetQueryIdentityPath)(); + int (*Unregister)(); } ObjDriver, *pObjDriver; diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index f56cca5de..4acfc8579 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -29,7 +29,6 @@ /* Description: Cluster object driver. */ /************************************************************************/ -#include #include #include #include @@ -73,7 +72,7 @@ ***/ /** Pure Laziness **/ -#define ENABLE_TPRINTF +// #define ENABLE_TPRINTF /** Debugging **/ #ifndef ENABLE_TPRINTF @@ -101,8 +100,6 @@ void void_func() {} *** an error cascade which may be useful to the user since a stack trace is *** not readily available. 
*** - *** @todo I think this should be moved to somewhere else. - *** *** @param clr Whether to clear the current error stack. As a rule of thumb, *** if you are the first one to detec the error, clear the stack so that *** other unrelated messages are not shown. If you are detecting an error @@ -119,7 +116,7 @@ void void_func() {} void mssErrorf(int clr, char* module, const char* format, ...) { /** Prevent interlacing with stdout flushing at a weird time. **/ - check(fflush(stdout)); + check(fflush(stdout)); /* Failure ignored. */ /** Insert convenient newline before error stack begins. **/ if (clr == 1) fprintf(stderr, "\n"); @@ -139,7 +136,7 @@ void mssErrorf(int clr, char* module, const char* format, ...) return; } if (num_chars > BUFSIZ) - fprintf(stderr, "WARNING: Error truncated (length %d > buffer size %d).\n", num_chars, BUFSIZ); + fprintf(stderr, "Warning: Error truncated (length %d > buffer size %d).\n", num_chars, BUFSIZ); /** Print the error. **/ const int ret = mssError(clr, module, "%s", buf); @@ -346,13 +343,13 @@ typedef unsigned char TargetType; #define TARGET_SEARCH_ENTRY (TargetType)5u /** Attribute name lists by TargetType. 
**/ -#define nATTR_ROOT 2u -char* const ATTR_ROOT[nATTR_ROOT] = { +#define END_OF_ARRAY NULL +char* const ATTR_ROOT[] = { "source", "attr_name", + END_OF_ARRAY, }; -#define nATTR_CLUSTER 7u -char* const ATTR_CLUSTER[nATTR_CLUSTER] = +char* const ATTR_CLUSTER[] = { "algorithm", "similarity_measure", @@ -361,199 +358,300 @@ char* const ATTR_CLUSTER[nATTR_CLUSTER] = "max_iterations", "date_created", "date_computed", + END_OF_ARRAY, }; -#define nATTR_SEARCH 5u -char* const ATTR_SEARCH[nATTR_SEARCH] = +char* const ATTR_SEARCH[] = { "source", "threshold", "similarity_measure", "date_created", "date_computed", + END_OF_ARRAY, }; -#define nATTR_CLUSTER_ENTRY 2u -char* const ATTR_CLUSTER_ENTRY[nATTR_CLUSTER_ENTRY] = +char* const ATTR_CLUSTER_ENTRY[] = { - "val", - "sim", + "items", + END_OF_ARRAY, }; -#define nATTR_SEARCH_ENTRY 3u -char* const ATTR_SEARCH_ENTRY[nATTR_SEARCH_ENTRY] = +char* const ATTR_SEARCH_ENTRY[] = { "val1", "val2", "sim", + END_OF_ARRAY, }; -#define END_OF_ATTRIBUTES NULL - /** Method name list. **/ -#define nMETHOD_NAME 2u -char* const METHOD_NAME[nMETHOD_NAME] = +char* const METHOD_NAME[] = { "cache", + END_OF_ARRAY, }; -#define END_OF_METHODS END_OF_ATTRIBUTES /** ================ Struct Declarations ================ **/ /** ANCHOR[id=structs] **/ -/** Represents the data source which may have data already fetched. **/ +/*** Represents the data source which may have data already fetched. + *** + *** Memory Stats: + *** - Padding: 4 bytes + *** - Total size: 72 bytes + *** + *** @skip --> Attribute Data. + *** @param Name The source name, specified in the .cluster file. + *** @param Key The key associated with this object in the SourceDataCache. + *** @param SourcePath The path to the data source from which to retrieve data. + *** @param AttrName The name of the attribute to get from the data source. + *** + *** @skip --> Computed data. 
+ *** @param Strings The data strings to be clustered and searched, or NULL if + *** they have not been fetched from the source. + *** @param Vectors The cosine comparison vectors from the fetched data, or + *** NULL if they haven't been computed. Note that vectors are no longer + *** needed once all clusters and searches have been computed, so they are + *** automatically freed in that case to save memory. + *** @param nVectors The number of vectors and data strings. + *** + *** @skip --> Time. + *** @param DateCreated The date and time that this object was created and initialized. + *** @param DateComputed The date and time that the Labels field was computed. + ***/ typedef struct _SOURCE { - /** Top level attributes (specified in the .cluster file). **/ - char* Name; /* The node name, specified in the .cluster file. - * Warning: Some code makes the assumption that this - * is the first field in the struct. - */ - char* Key; /* The key associated with this object in the global SourceCache. */ - char* SourcePath; /* The path to the data source from which to retrieve data. */ - char* AttrName; /* The name of the attribute to get from the data source. */ - - /** Computed data. **/ - char** Data; /* The data strings to be clustered and searched, or NULL if they - * have not been fetched from the source. - */ - pVector* Vectors; /* The cosine comparison vectors from the fetched data, or NULL if - * they haven't been computed. Note that vectors are no longer - * needed once all clusters and searches have been computed, so - * they are automatically freed in that case to save memory. - */ - unsigned int nVectors; /* The number of vectors and data strings. Note: This is not - * set to 0 if the vector array is freed, this case should be - * checked separately. - */ - - /** Time. **/ - DateTime DateCreated; /* The date and time that this object was created and initialized. */ - DateTime DateComputed; /* The date and time that the Data and Vectors fields were computed. 
*/ - } SourceData, *pSourceData; - -/** Data for each cluster. **/ + char* Name; + char* Key; + char* SourcePath; + char* AttrName; + char** Strings; + pVector* Vectors; + unsigned int nVectors; + DateTime DateCreated; + DateTime DateComputed; + } + SourceData, *pSourceData; + + +/*** Computed data for a single cluster. + *** + *** Memory Stats: + *** - Padding: 4 bytes + *** - Total size: 24 bytes + *** + *** @param Size The number of items in the cluster. + *** @param Strings The string values of each item. + *** @param Vectors The cosine vectors for each item. + ***/ +typedef struct + { + unsigned int Size; + char** Strings; + pVector* Vectors; + } + Cluster, *pCluster; + + +/*** Data for each cluster. Only attribute data is checked for caching. + *** + *** Memory Stats: + *** - Padding: 2 bytes + *** - Total size: 96 bytes + *** + *** @skip --> Attribute Data. + *** @param Name The cluster name, specified in the .cluster file. + *** @param Key The key associated with this object in the ClusterDataCache. + *** @param ClusterAlgorithm The clustering algorithm to be used. + *** @param SimilarityMeasure The similarity measure used to compare items. + *** @param nClusters The number of clusters. 1 if algorithm = none. + *** @param MinImprovement The minimum amount of improvement that must be met + *** each clustering iteration. If there is less improvement, the algorithm + *** will stop. The "max" in a .cluster file is represented by -inf. + *** @param MaxIterations The maximum number of iterations that a clustering + *** algorithm can run for. Note: Sliding window uses this field to store + *** the window_size. + *** + *** @skip --> Relationship Data. + *** @param nSubClusters The number of subclusters of this cluster. + *** @param SubClusters A pClusterData array, NULL if nSubClusters == 0. + *** @param Parent This cluster's parent. NULL if it is not a subcluster. + *** @param SourceData Pointer to the source data that this cluster uses. 
+ *** + *** @skip --> Computed data. + *** @param Clusters An array of length num_clusters, NULL if the clusters + *** have not yet been computed. + *** @param Sims An array of num_vectors elements, where index i stores the + *** similarity of vector i to its assigned cluster. This field is NULL + *** if the clusters have not yet been computed. + *** + *** @skip --> Time. + *** @param DateCreated The date and time that this object was created and initialized. + *** @param DateComputed The date and time that the Labels field was computed. + ***/ typedef struct _CLUSTER { - /** Attribute Data. **/ - char* Name; /* The cluster name, specified in the .cluster file. - * Warning: Some code makes the assumption that this - * is the first field in the struct. - */ - char* Key; /* The key associated with this object in the global ClusterCache. */ - ClusterAlgorithm ClusterAlgorithm; /* The clustering algorithm to be used. */ - SimilarityMeasure SimilarityMeasure; /* The similarity measurse to be used when clustering. */ - unsigned int NumClusters; /* The number of clusters. 1 if algorithm = none. */ - double MinImprovement; /* The minimum amount of improvement that must be met each - * clustering iteration. If there is less improvement, the - * algorithm will stop. Specifying "max" in the .cluster - * file should be represented by a value of -inf. - */ - unsigned int MaxIterations; /* The maximum number of iterations to run clustering. */ - - /** Other data (ignored by caching). **/ - unsigned int nSubClusters; /* The number of subclusters of this cluster. */ - struct _CLUSTER** SubClusters; /* A pClusterData array, NULL if nSubClusters == 0. */ - struct _CLUSTER* Parent; /* This cluster's parent. NULL if it is not a subcluster. */ - pSourceData SourceData; /* Pointer to the source data that this cluster uses. */ - - /** Computed data. **/ - unsigned int* Labels; /* An array with one element for each vector in the data - * (aka. DriverData->nVectors). 
For vector i, Labels[i] is - * the ID of the cluster to which that data is assigned. - * NULL if the cluster has not been computed. */ - - /** Time. **/ - DateTime DateCreated; /* The date and time that this object was created and initialized. */ - DateTime DateComputed; /* The date and time that the Labels field was computed. */ + char* Name; + char* Key; + ClusterAlgorithm ClusterAlgorithm; + SimilarityMeasure SimilarityMeasure; + unsigned int nClusters; + double MinImprovement; + unsigned int MaxIterations; + unsigned int nSubClusters; + struct _CLUSTER** SubClusters; + struct _CLUSTER* Parent; + pSourceData SourceData; + Cluster* Clusters; + double* Sims; + DateTime DateCreated; + DateTime DateComputed; } ClusterData, *pClusterData; -/** Data for each search. **/ + +/*** Data for each search. + *** + *** Memory Stats: + *** - Padding: 3 bytes + *** - Total size: 64 bytes + *** + *** @skip --> Attribute Data. + *** @param Name The search name, specified in the .cluster file. + *** @param Key The key associated with this object in the SearchDataCache. + *** @param Source The cluster from which this search is to be derived. + *** @param SimilarityMeasure The similarity measure used to compare items. + *** @param Threshold The minimum similarity threshold for elements to be + *** included in the results of the search. + *** + *** @skip --> Computed data. + *** @param Dups An array holding the dups found by the search, or NULL if the + *** search has not been computed. + *** @param nDups The number of dups found. + *** + *** @skip --> Time. + *** @param DateCreated The date and time that this object was created and initialized. + *** @param DateComputed The date and time that the Dups field was computed. + ***/ typedef struct _SEARCH { - char* Name; /* The search name, specified in the .cluster file. - * Warning: Some code makes the assumption that this - * is the first field in the struct. 
- */ - char* Key; /* The key associated with this object in the global SearchCache. */ - pClusterData Source; /* The cluster from which this search is to be derived. */ - double Threshold; /* The minimum similarity threshold for elements to be - * included in the results of the search. - */ - SimilarityMeasure SimilarityMeasure; /* The similarity measure used to compare items. */ - - /** Computed data. **/ - pDup* Dups; /* An array holding the dups found by the search, or NULL - * if the search has not been computed. - */ - unsigned int nDups; /* The number of dups found. */ - - /** Time. **/ - DateTime DateCreated; /* The date and time that this object was created and initialized. */ - DateTime DateComputed; /* The date and time that the Dups field was computed. */ + char* Name; + char* Key; + pClusterData Source; + double Threshold; + pDup* Dups; + unsigned int nDups; + SimilarityMeasure SimilarityMeasure; + DateTime DateCreated; + DateTime DateComputed; } SearchData, *pSearchData; + /*** Node instance data. - *** When a .cluster file is openned, there will be only one node for that + *** + *** Memory Stats: + *** - Padding: 4 bytes + *** - Total size: 64 bytes + *** + *** @note When a .cluster file is openned, there will be only one node for that *** file. However, in the course of the query, many driver instance structs *** may be created by functions like clusterQueryFetch(), and closed by the *** object system using clusterClose(). + *** + *** @param SourceData Data from the provided source. + *** @param Params A pParam array storing the params in the .cluster file. + *** @param nParams The number of specified params. + *** @param ParamList Functions as a "scope" for resolving values during parsing. + *** @param ClusterDatas A pCluster array storing the clusters in the .cluster file. + *** Will be NULL if nClusters = 0. + *** @param nClusterDatas The number of specified clusters. 
+ *** @param SearchDatas A SearchData array storing the searches in the .cluster file. + *** @param nSearches The number of specified searches. + *** @param nSearchDatas The parent object used to open this NodeData instance. ***/ typedef struct _NODE { - /** Substructures. **/ - pSourceData SourceData; /* Data from the provided source. */ - pParam* Params; /* A pParam array storing the params in the .cluster file. */ - unsigned int nParams; /* The number of specified params. */ - pParamObjects ParamList; /* Functions as a "scope" for resolving values during parsing. */ - pClusterData* Clusters; /* A pCluster array storing the clusters in the .cluster file. - * Will be NULL if nClusters = 0. - */ - unsigned int nClusters; /* The number of specified clusters. */ - pSearchData* Searches; /* A SearchData array storing the searches in the .cluster file. */ - unsigned int nSearches; /* The number of specified searches. */ - - /** Other stuff, idk why it's here. **/ - pSnNode Node; - pObject Obj; + pObject Parent; + pParam* Params; + pParamObjects ParamList; + pSourceData SourceData; + pClusterData* ClusterDatas; + pSearchData* SearchDatas; + unsigned int nParams; + unsigned int nClusterDatas; + unsigned int nSearchDatas; } NodeData, *pNodeData; -/** Driver instance data. **/ -/*** Similar to a pointer to specific, computed data in the pNodeData struct. - *** If target type is the root, a cluster, or a search, no data is guarnteed - *** to be computed yet. These three types can be returned from clusterOpen(). - *** To target a cluster entry or search entry, fetch a driver targetting a - *** cluster or search (respectively). These target types ensure that the data - *** has been computed, so the GetAttr functions do not need to ensure this. +/*** Driver instance data. 
+ *** + *** Memory Stats: + *** - Padding: 1 bytes + *** - Total size: 24 bytes + *** + *** This struct can be thought of like a "pointer" to specific data accessible + *** through the stored pNodeData struct. This struct also communicates whether + *** that data is guaranteed to have been computed. + *** + *** For example, if target type is the root, a cluster, or a search, no data + *** is guaranteed to be computed. These three types can be returned from + *** clusterOpen(), based on the provided path. + *** + *** Alternatively, a cluster entry or search entry can be targetted by calling + *** fetch on a query pointing to a driver instance that targets a cluster or + *** search (respectively). These two entry target types ensure that the data + *** they indicate has been computed, so the GetAttrType() and GetAttrValue() + *** functions do not need to check this repeatedly each time they are called. + *** + *** @param NodeData The associated node data struct. There can be many driver + *** instances pointing to one NodeData at a time, but each driver instance + *** always points to singular NodeData struct. + *** @param TargetType The type of data targetted (see above). + *** @param TargetData If target type is: + *** ```csv + *** Root: A pointer to the SourceData struct. + *** Cluster or ClusterEntry: A pointer to the targetted cluster. + *** Search or SearchEntry: A pointer to the targetted search. + *** ``` + *** @param TargetAttrIndex An index into an attribute list (for GetNextAttr()). + *** @param TargetMethodIndex An index into an method list (for GetNextMethod()). ***/ typedef struct _DRIVER { - pNodeData NodeData; /* The associated node data. */ - TargetType TargetType; /* The type of data targetted by this driver instance. */ - void* TargetData; /* A pointer to the specific targetted cluster or search. */ - unsigned int TargetIndex; /* An index into the cluster or search (entries only). 
*/ - unsigned char TargetAttrIndex; /* An index into an attribute list (for GetNextAttr()). */ - unsigned char TargetMethodIndex; /* An index into an method list (for GetNextMethod()). */ + pNodeData NodeData; + void* TargetData; + unsigned int TargetIndex; + unsigned char TargetAttrIndex; + unsigned char TargetMethodIndex; + TargetType TargetType; } DriverData, *pDriverData; -/** Query instance data. **/ +/*** Query instance data. + *** + *** Memory Stats: + *** - Padding: 4 bytes + *** - Total size: 16 bytes + *** + *** @param DriverData The associated driver instance being queried. + *** @param RowIndex The selected row of the data targetted by the driver. + ***/ typedef struct { - pDriverData DriverData; /* The associated driver instance being queried. */ - unsigned int RowIndex; /* The selected row of the data targetted by the driver. */ + pDriverData DriverData; + unsigned int RowIndex; } ClusterQuery, *pClusterQuery; + /** Global storage for caches. **/ struct { - XHashTable SourceCache; - XHashTable ClusterCache; - XHashTable SearchCache; + XHashTable SourceDataCache; + XHashTable ClusterDataCache; + XHashTable SearchDataCache; } - ClusterCaches; + ClusterDriverCaches; /** ================ Function Declarations ================ **/ @@ -577,6 +675,7 @@ static void ci_FreeSourceData(pSourceData source_data); static void ci_FreeClusterData(pClusterData cluster_data, bool recursive); static void ci_FreeSearchData(pSearchData search_data); static void ci_FreeNodeData(pNodeData node_data); +static void ci_FreeCaches(void); /** Deep Size Computation Functions. 
**/ // LINK #sizing @@ -619,6 +718,7 @@ static void ci_CacheFreeSourceData(pXHashEntry entry, void* path); static void ci_CacheFreeCluster(pXHashEntry entry, void* path); static void ci_CacheFreeSearch(pXHashEntry entry, void* path); int clusterExecuteMethod(void* inf_v, char* methodname, pObjData param, pObjTrxTree oxt); +int clusterUnregister(pObjDriver object_driver, pObjSession session); /** Unimplemented DriverFunctions. **/ // LINK #unimplemented @@ -636,18 +736,44 @@ int clusterCommit(void* inf_v, pObjTrxTree *oxt); /** ANCHOR[id=parsing] **/ // LINK #functions +/** Format a hint to give to the user. **/ +static void ci_GiveHint(const char* hint) + { + fprintf(stderr, " > Hint: Did you mean \"%s\"?\n", hint); + } + +/*** Given the user a hint when they specify an invalid string for a field + *** where we know the list of valid strings. The hint is only displayed if + *** their string is close enough to a valid string. + *** + *** @param value The value the user gave. + *** @param valid_values The valid values that could be what they meant. + *** @param n_valid_values The number of valid values. Specify 0 to detect + *** length on a null terminated array of values. + *** @returns Whether a hint was given. + ***/ +static bool ci_TryHint(char* value, char** valid_values, const unsigned int n_valid_values) + { + char* guess = ca_most_similar(value, (void**)valid_values, n_valid_values, ca_lev_compare, 0.5); + if (guess == NULL) return false; /* No hint. */ + + /** Issue hint. **/ + ci_GiveHint(guess); + return true; + } + + +// LINK #functions /*** Returns 0 for success and -1 on failure. Promises that mssError() will be *** invoked on failure, so the caller need not specify their own error message. *** Returns 1 if attribute is available, printing an error if the attribute was *** marked as required. *** - *** @attention - Promises that mssError() will be invoked on failure, so the - *** caller is not required to specify their own error message. 
+ *** @attention - Promises that a failure invokes mssError() at least once. *** - *** TODO: Greg - *** This function took several hours of debugging before it worked at all, and I - *** still don't know if it works correctly... or really how it works. Please - *** review this code carefully! + *** TODO: Greg - Review Carefully. + *** This function took a lot of debugging to get it to work. Please make sure + *** it works correctly and properly requires runserver() for dynamic attributes. ***/ static int ci_ParseAttribute( pStructInf inf, @@ -659,8 +785,9 @@ static int ci_ParseAttribute( bool print_type_error) { int ret; + tprintf("Invoking ci_ParseAttribute('%s').\n", attr_name); - /** Get attribute name. **/ + /** Get attribute inf. **/ pStructInf attr_info = stLookup(inf, attr_name); if (attr_info == NULL) { @@ -669,14 +796,19 @@ static int ci_ParseAttribute( } ASSERTMAGIC(attr_info, MGK_STRUCTINF); - /** Get the attribute. **/ - tprintf("Invoking ci_ParseAttribute('%s')...\n", attr_name); + /** Allocate expression. **/ pExpression exp = check_ptr(stGetExpression(attr_info, 0)); + if (exp == NULL) goto err; + + /** Bind parameters. **/ + /** TODO: Greg - What does this return? How do I know if it fails? **/ expBindExpression(exp, param_list, EXPR_F_RUNSERVER); + + /** Evaluate expression. 
**/ ret = expEvalTree(exp, param_list); if (ret != 0) { - mssErrorf(0, "Cluster", "Expression evaluation failed."); + mssErrorf(0, "Cluster", "Expression evaluation failed (error code %d).", ret); goto err; } @@ -695,42 +827,13 @@ static int ci_ParseAttribute( if (ret != 0) { mssErrorf(1, "Cluster", - "Failed to get data of type \"%s\" from exp \"%s\" (error code %d).", - ci_TypeToStr(datatype), exp->Name, ret + "Failed to get \"%s\" : %s using expression \"%s\" (error code %d).", + attr_name, ci_TypeToStr(datatype), exp->Name, ret ); goto err; } -// const int ret = stGetAttrValueOSML( -// attr_info, -// datatype, -// data, -// 0, -// param_list->Session, -// param_list -// ); -// if (ret == 1) -// { -// mssErrorf(1, "Cluster", -// "stGetAttrValueOSML('%s') because %s cannot be null.\n" -// " > Hint: You might have used an undefined variable or forgot to add runserver().", -// attr_name, attr_name -// ); -// return 1; -// } -// if (ret != 0) -// { -// if (print_type_error) -// { -// mssErrorf(1, "Cluster", -// "stGetAttrValueOSML('%s') failed (error code %d).\n" -// " > Hint: It might be a type mismatch, or you used an undefined variable.", -// attr_name, ret -// ); -// } -// return ret; -// } - + /** Success. **/ return 0; err: @@ -746,8 +849,7 @@ static int ci_ParseAttribute( /*** Parses a ClusteringAlgorithm from the algorithm field in the pStructInf *** representing some structure with that attribute in a parsed structure file. *** - *** @attention - Promises that mssError() will be invoked on failure, so the - *** caller is not required to specify their own error message. + *** @attention - Promises that a failure invokes mssError() at least once. *** *** @param inf A parsed pStructInf. *** @param param_list The param objects that function as a kind of "scope" for @@ -758,7 +860,7 @@ static ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf inf, pParamObject { /** Get the algorithm attribute. 
**/ char* algorithm; - if (ci_ParseAttribute(inf, "algorithm", DATA_T_STRING, POD(&algorithm), param_list, true, true) != 0) + if (ci_ParseAttribute(inf, "algorithm", DATA_T_STRING, POD(&algorithm), param_list, true, true) != 0) { mssErrorf(0, "Cluster", "Failed to parse attribute 'algorithm' in group \"%s\".", inf->Name); return ALGORITHM_NULL; @@ -771,9 +873,21 @@ static ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf inf, pParamObject if (!strcasecmp(algorithm, "k-means++")) return ALGORITHM_KMEANS_PLUS_PLUS; if (!strcasecmp(algorithm, "k-medoids")) return ALGORITHM_KMEDOIDS; if (!strcasecmp(algorithm, "db-scan")) return ALGORITHM_DB_SCAN; - + /** Unknown value for clustering algorithm. **/ mssErrorf(1, "Cluster", "Unknown \"clustering algorithm\": %s", algorithm); + + /** Attempt to give a hint. **/ + char* all_names[nClusteringAlgorithms] = {NULL}; + for (unsigned int i = 0u; i < nClusteringAlgorithms; i++) + all_names[i] = ci_ClusteringAlgorithmToString(ALL_CLUSTERING_ALGORITHMS[i]); + if (ci_TryHint(algorithm, all_names, nClusteringAlgorithms)); + else if (strcasecmp(algorithm, "sliding") == 0) ci_GiveHint(ci_ClusteringAlgorithmToString(ALGORITHM_SLIDING_WINDOW)); + else if (strcasecmp(algorithm, "window") == 0) ci_GiveHint(ci_ClusteringAlgorithmToString(ALGORITHM_SLIDING_WINDOW)); + else if (strcasecmp(algorithm, "null") == 0) ci_GiveHint(ci_ClusteringAlgorithmToString(ALGORITHM_NONE)); + else if (strcasecmp(algorithm, "nothing") == 0) ci_GiveHint(ci_ClusteringAlgorithmToString(ALGORITHM_NONE)); + + /** Fail. **/ return ALGORITHM_NULL; } @@ -783,8 +897,7 @@ static ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf inf, pParamObject *** pStructInf parameter, which represents some structure with that attribute *** in a parsed structure file. *** - *** @attention - Promises that mssError() will be invoked on failure, so the - *** caller is not required to specify their own error message. 
+ *** @attention - Promises that a failure invokes mssError() at least once. *** *** @param inf A parsed pStructInf. *** @param param_list The param objects that function as a kind of "scope" for @@ -805,7 +918,20 @@ static SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf inf, pParamObjects if (!strcasecmp(measure, "cosine")) return SIMILARITY_COSINE; if (!strcasecmp(measure, "levenshtein")) return SIMILARITY_LEVENSHTEIN; + /** Unknown similarity measure. **/ mssErrorf(1, "Cluster", "Unknown \"similarity measure\": %s", measure); + + /** Attempt to give a hint. **/ + char* all_names[nSimilarityMeasures] = {NULL}; + for (unsigned int i = 0u; i < nSimilarityMeasures; i++) + all_names[i] = ci_SimilarityMeasureToString(ALL_SIMILARITY_MEASURES[i]); + if (ci_TryHint(measure, all_names, nSimilarityMeasures)); + else if (strcasecmp(measure, "cos") == 0) ci_GiveHint(ci_SimilarityMeasureToString(SIMILARITY_COSINE)); + else if (strcasecmp(measure, "lev") == 0) ci_GiveHint(ci_SimilarityMeasureToString(SIMILARITY_LEVENSHTEIN)); + else if (strcasecmp(measure, "edit-dist") == 0) ci_GiveHint(ci_SimilarityMeasureToString(SIMILARITY_LEVENSHTEIN)); + else if (strcasecmp(measure, "edit-distance") == 0) ci_GiveHint(ci_SimilarityMeasureToString(SIMILARITY_LEVENSHTEIN)); + + /** Fail. **/ return SIMILARITY_NULL; } @@ -815,8 +941,7 @@ static SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf inf, pParamObjects *** a .cluster structure file. *** *** @attention - Warning: Caching in use. - *** @attention - Promises that mssError() will be invoked on failure, so the - *** caller is not required to specify their own error message. + *** @attention - Promises that a failure invokes mssError() at least once. *** *** @param inf A parsed pStructInf for a .cluster structure file. *** @param param_list The param objects that function as a kind of "scope" for @@ -832,26 +957,30 @@ static pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, /** Get source. 
**/ if (ci_ParseAttribute(inf, "source", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err; char* source_path = check_ptr(nmSysStrdup(buf)); + if (source_path == NULL) goto err; /** Get attribute name. **/ if (ci_ParseAttribute(inf, "attr_name", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err; char* attr_name = check_ptr(nmSysStrdup(buf)); + if (attr_name == NULL) goto err_free_path; /** Create cache entry key. **/ const size_t len = strlen(path) + strlen(source_path) + strlen(attr_name) + 3lu; char* key = check_ptr(nmSysMalloc(len * sizeof(char))); + if (key == NULL) goto err_free_attr; snprintf(key, len, "%s?%s:%s", path, source_path, attr_name); - pXHashTable source_cache = &ClusterCaches.SourceCache; /** Check for a cached version. **/ - pSourceData source_maybe = (pSourceData)xhLookup(source_cache, key); + pSourceData source_maybe = (pSourceData)xhLookup(&ClusterDriverCaches.SourceDataCache, key); if (source_maybe != NULL) { /** Cache hit. **/ tprintf("# source: \"%s\"\n", key); - tprintf("--> Name: %s\n", source_maybe->Name); /* Cause invalid read if cache was incorrectly freed. */ - /** Free data we don't need. */ + /** Cause an immediate invalid read if cache was incorrectly freed. **/ + tprintf("--> Name: %s\n", source_maybe->Name); + + /** Free data we don't need. **/ nmSysFree(source_path); nmSysFree(attr_name); nmSysFree(key); @@ -862,21 +991,43 @@ static pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, /** Cache miss: Create a new source data object. 
**/ pSourceData source_data = check_ptr(nmMalloc(sizeof(SourceData))); + if (source_data == NULL) goto err_free_key; memset(source_data, 0, sizeof(SourceData)); - source_data->Name = check_ptr(nmSysStrdup(inf->Name)); source_data->Key = key; source_data->SourcePath = source_path; source_data->AttrName = attr_name; - check(objCurrentDate(&source_data->DateCreated)); + source_data->Name = check_ptr(nmSysStrdup(inf->Name)); + if (source_data->Name == NULL) goto err_free_source; + if (!check(objCurrentDate(&source_data->DateCreated))) goto err_free_source; /** Add the new object to the cache for next time. **/ tprintf("+ source: \"%s\"\n", key); - check(xhAdd(source_cache, key, (void*)source_data)); + if (!check(xhAdd(&ClusterDriverCaches.SourceDataCache, key, (void*)source_data))) + goto err_free_source; + /** Success. **/ return source_data; + /** Error handling. **/ + err_free_source: + ci_FreeSourceData(source_data); + nmSysFree(key); + goto err; + + err_free_key: + nmSysFree(key); + + err_free_attr: + nmSysFree(attr_name); + + err_free_path: + nmSysFree(source_path); + err: - mssErrorf(0, "Cluster", "Failed to parse source data from group \"%s\" in file: %s", inf->Name, path); + mssErrorf(0, "Cluster", + "Failed to parse source data from group \"%s\" in file: %s", + inf->Name, path + ); return NULL; } @@ -901,17 +1052,21 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) tprintf("Parsing cluster: %s\n", inf->Name); + /** Extract values. **/ pParamObjects param_list = node_data->ParamList; pSourceData source_data = node_data->SourceData; /** Allocate space for data struct. **/ pClusterData cluster_data = check_ptr(nmMalloc(sizeof(ClusterData))); + if (cluster_data == NULL) goto err; memset(cluster_data, 0, sizeof(ClusterData)); /** Basic Properties. 
**/ cluster_data->Name = check_ptr(nmSysStrdup(inf->Name)); - cluster_data->SourceData = source_data; - check(objCurrentDate(&cluster_data->DateCreated)); + if (cluster_data->Name == NULL) goto err_free_cluster; + cluster_data->SourceData = check_ptr(source_data); + if (cluster_data->SourceData == NULL) goto err_free_cluster; + if (!check(objCurrentDate(&cluster_data->DateCreated))) goto err_free_cluster; /** Get algorithm. **/ cluster_data->ClusterAlgorithm = ci_ParseClusteringAlgorithm(inf, param_list); @@ -920,7 +1075,7 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) /** Handle no clustering case. **/ if (cluster_data->ClusterAlgorithm == ALGORITHM_NONE) { - cluster_data->NumClusters = 1u; + cluster_data->nClusters = 1u; goto parsing_done; } @@ -930,19 +1085,36 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) /** Handle sliding window case. **/ if (cluster_data->ClusterAlgorithm == ALGORITHM_SLIDING_WINDOW) + { + /** Sliding window doesn't allocate any clusters. **/ + cluster_data->nClusters = 0u; + + /** Get window_size. **/ + int window_size; + if (ci_ParseAttribute(inf, "window_size", DATA_T_INTEGER, POD(&window_size), param_list, true, true) != 0) + goto err_free_cluster; + if (window_size < 1) + { + mssErrorf(1, "Cluster", "Invalid value for [window_size : uint > 0]: %d", window_size); + goto err_free_cluster; + } + + /** Store value. **/ + cluster_data->MaxIterations = (unsigned int)window_size; goto parsing_done; + } /** Get num_clusters. 
**/ int num_clusters; - if (ci_ParseAttribute(inf, "num_clusters", DATA_T_INTEGER, POD(&num_clusters), param_list, true, true) != 0) goto err_free_cluster; + if (ci_ParseAttribute(inf, "num_clusters", DATA_T_INTEGER, POD(&num_clusters), param_list, true, true) != 0) + goto err_free_cluster; if (num_clusters < 2) { mssErrorf(1, "Cluster", "Invalid value for [num_clusters : uint > 1]: %d", num_clusters); if (num_clusters == 1) fprintf(stderr, "HINT: Use algorithm=\"none\" to disable clustering.\n"); goto err_free_cluster; } - cluster_data->NumClusters = (unsigned int)num_clusters; - tprintf("Got value for num_clusters: %d\n", num_clusters); + cluster_data->nClusters = (unsigned int)num_clusters; /** Get min_improvement. **/ double improvement; @@ -955,19 +1127,24 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) mssErrorf(1, "Cluster", "Invalid value for [min_improvement : 0.0 < x < 1.0 | \"none\"]: %g", improvement); goto err_free_cluster; } + + /** Successfully got value. **/ cluster_data->MinImprovement = improvement; } else if (result == -1) { char* str; result = ci_ParseAttribute(inf, "min_improvement", DATA_T_STRING, POD(&str), param_list, false, true); - if (result == 0 && !strcasecmp(str, "none")) + if (result != 0) goto err_free_cluster; + if (strcasecmp(str, "none") != 0) { - /** Specify no min improvement. **/ - cluster_data->MinImprovement = -INFINITY; + mssErrorf(1, "Cluster", "Invalid value for [min_improvement : 0.0 < x < 1.0 | \"none\"]: %s", str); + goto err_free_cluster; } + + /** Successfully got none. **/ + cluster_data->MinImprovement = -INFINITY; } - if (result == -1) goto err_free_cluster; /** Get max_iterations. **/ int max_iterations; @@ -986,33 +1163,89 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) /** Search for sub-clusters. 
**/ XArray sub_clusters; - const int ret = xaInit(&sub_clusters, 4u); - if (ret != 0) - { - mssErrorf(1, "Cluster", "FAIL - xaInit(&sub_clusters, %u): %d", 4u, ret); - goto err_free_cluster; - } + if (!check(xaInit(&sub_clusters, 4u))) goto err_free_cluster; for (unsigned int i = 0u; i < inf->nSubInf; i++) { - /** Check that this is a group (not an attribute). **/ - pStructInf group_inf = inf->SubInf[i]; - ASSERTMAGIC(group_inf, MGK_STRUCTINF); - if (stStructType(group_inf) != ST_T_SUBGROUP) continue; - - /** Select array by group type. **/ - if (strcmp(check_ptr(group_inf->UsrType), "cluster/cluster") != 0) continue; + pStructInf sub_inf = check_ptr(inf->SubInf[i]); + ASSERTMAGIC(sub_inf, MGK_STRUCTINF); + char* name = sub_inf->Name; - /** Subcluster found. **/ - pClusterData sub_cluster = ci_ParseClusterData(group_inf, node_data); - if (sub_cluster == NULL) goto err_free_sub_clusters; - sub_cluster->Parent = cluster_data; - xaAddItem(&sub_clusters, sub_cluster); + /** Handle various struct types. **/ + const int struct_type = stStructType(sub_inf); + switch (struct_type) + { + case ST_T_ATTRIB: + { + /** Valid attribute names. **/ + char* attrs[] = { + "algorithm", + "similarity_measure", + "num_clusters", + "min_improvement", + "max_iterations", + "window_size", + }; + const unsigned int nattrs = sizeof(attrs) / sizeof(char*); + + /** Ignore valid attribute names. **/ + bool is_valid = false; + for (unsigned int i = 0u; i < nattrs; i++) + { + if (strcmp(name, attrs[i]) == 0) + { + is_valid = true; + break; + } + } + if (is_valid) continue; /* Next inf. */ + + /** Give the user a warning, and attempt to give them a hint. 
**/ + fprintf(stderr, "Warning: Unknown attribute '%s' in cluster \"%s\".\n", name, inf->Name); + if (ci_TryHint(name, attrs, nattrs)); + else if (strcasecmp(name, "k") == 0) ci_GiveHint("num_clusters"); + else if (strcasecmp(name, "threshold") == 0) ci_GiveHint("min_improvement"); + + break; + } + + case ST_T_SUBGROUP: + { + /** Select array by group type. **/ + char* group_type = check_ptr(sub_inf->UsrType); + if (group_type == NULL) goto err_free_subclusters; + if (strcmp(group_type, "cluster/cluster") != 0) + { + fprintf(stderr, + "Warning: Unknown group \"%s\" : \"%s\" in cluster \"%s\".\n", + name, group_type, inf->Name + ); + continue; + } + + /** Subcluster found. **/ + pClusterData sub_cluster = ci_ParseClusterData(sub_inf, node_data); + if (sub_cluster == NULL) goto err_free_subclusters; + sub_cluster->Parent = cluster_data; + if (!check_neg(xaAddItem(&sub_clusters, sub_cluster))) goto err_free_subclusters; + + break; + } + + default: + { + mssErrorf(1, "Cluster", + "Warning: Unknown struct type %d in cluster \"%s\".", + struct_type, inf->Name + ); + goto err_free_subclusters; + } + } } cluster_data->nSubClusters = sub_clusters.nItems; cluster_data->SubClusters = (cluster_data->nSubClusters > 0u) ? (pClusterData*)ci_xaToTrimmedArray(&sub_clusters) : NULL; /* No sub-clusters. */ - xaDeInit(&sub_clusters); + check(xaDeInit(&sub_clusters)); /* Failure ignored. */ /** Create the cache key. 
**/ parsing_done:; @@ -1021,7 +1254,7 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) { case ALGORITHM_NONE: { - const size_t len = strlen(source_data->Key) + strlen(cluster_data->Name) + 5lu; + const size_t len = strlen(source_data->Key) + strlen(cluster_data->Name) + 8lu; key = nmSysMalloc(len * sizeof(char)); snprintf(key, len, "%s/%s?%u", source_data->Key, @@ -1033,13 +1266,14 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) case ALGORITHM_SLIDING_WINDOW: { - const size_t len = strlen(source_data->Key) + strlen(cluster_data->Name) + 8lu; + const size_t len = strlen(source_data->Key) + strlen(cluster_data->Name) + 16lu; key = nmSysMalloc(len * sizeof(char)); - snprintf(key, len, "%s/%s?%u&%u", + snprintf(key, len, "%s/%s?%u&%u&%u", source_data->Key, cluster_data->Name, ALGORITHM_SLIDING_WINDOW, - cluster_data->SimilarityMeasure + cluster_data->SimilarityMeasure, + cluster_data->MaxIterations ); break; } @@ -1053,23 +1287,24 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) cluster_data->Name, cluster_data->ClusterAlgorithm, cluster_data->SimilarityMeasure, - cluster_data->NumClusters, + cluster_data->nClusters, cluster_data->MinImprovement, cluster_data->MaxIterations ); break; } } - pXHashTable cluster_cache = &ClusterCaches.ClusterCache; cluster_data->Key = key; /** Check for a cached version. **/ - pClusterData cluster_maybe = (pClusterData)xhLookup(cluster_cache, key); + pClusterData cluster_maybe = (pClusterData)xhLookup(&ClusterDriverCaches.ClusterDataCache, key); if (cluster_maybe != NULL) { /** Cache hit. **/ tprintf("# cluster: \"%s\"\n", key); - tprintf("--> Name: %s\n", cluster_maybe->Name); /* Cause invalid read if cache was incorrectly freed. */ + + /** Cause invalid read if cache was incorrectly freed. **/ + tprintf("--> Name: %s\n", cluster_maybe->Name); /** Free the parsed cluster that we no longer need. 
*/ ci_FreeClusterData(cluster_data, false); @@ -1081,19 +1316,22 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) /** Cache miss. **/ tprintf("+ cluster: \"%s\"\n", key); - check(xhAdd(cluster_cache, key, (void*)cluster_data)); + if (!check(xhAdd(&ClusterDriverCaches.ClusterDataCache, key, (void*)cluster_data))) goto err_free_key; return cluster_data; /** Error cleanup. **/ - err_free_sub_clusters: + err_free_key: + nmSysFree(key); + + err_free_subclusters: for (unsigned int i = 0u; i < sub_clusters.nItems; i++) ci_FreeClusterData(sub_clusters.Items[i], true); - xaDeInit(&sub_clusters); + check(xaDeInit(&sub_clusters)); /* Failure ignored. */ err_free_cluster: ci_FreeClusterData(cluster_data, false); - // err: + err: mssErrorf(0, "Cluster", "Failed to parse cluster from group \"%s\".", inf->Name); return NULL; } @@ -1118,20 +1356,21 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) tprintf("Parsing search: %s\n", inf->Name); /** Allocate space for search struct. **/ - pSearchData search_data = nmMalloc(sizeof(SearchData)); - assert(search_data != NULL); + pSearchData search_data = check_ptr(nmMalloc(sizeof(SearchData))); + if (search_data == NULL) goto err; memset(search_data, 0, sizeof(SearchData)); - + /** Get basic information. **/ search_data->Name = check_ptr(nmSysStrdup(inf->Name)); - check(objCurrentDate(&search_data->DateCreated)); + if (search_data->Name == NULL) goto err_free_search; + if (!check(objCurrentDate(&search_data->DateCreated))) goto err_free_search; /** Get source. 
**/ char* source_name; if (ci_ParseAttribute(inf, "source", DATA_T_STRING, POD(&source_name), node_data->ParamList, true, true) != 0) return NULL; - for (unsigned int i = 0; i < node_data->nClusters; i++) + for (unsigned int i = 0; i < node_data->nClusterDatas; i++) { - pClusterData cluster_data = node_data->Clusters[i]; + pClusterData cluster_data = node_data->ClusterDatas[i]; if (strcmp(source_name, cluster_data->Name) == 0) { /** Source found. **/ @@ -1139,11 +1378,22 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) break; } - /** Note: Subclusters not implemented here. **/ + /** Note: Subclusters should probably be parsed here, if they were implemented. **/ } + + /** Did we find the requested source? **/ if (search_data->Source == NULL) { - mssErrorf(1, "Cluster", "Could not find cluster %s for search %s.", source_name, search_data->Name); + /** Print error. **/ + mssErrorf(1, "Cluster", "Could not find cluster \"%s\" for search \"%s\".", source_name, search_data->Name); + + /** Attempt to give a hint. **/ + char* cluster_names[node_data->nClusterDatas]; + for (unsigned int i = 0; i < node_data->nClusterDatas; i++) + cluster_names[i] = node_data->ClusterDatas[i]->Name; + ci_TryHint(source_name, cluster_names, node_data->nClusterDatas); + + /** Fail. **/ goto err_free_search; } @@ -1162,17 +1412,81 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) search_data->SimilarityMeasure = ci_ParseSimilarityMeasure(inf, node_data->ParamList); if (search_data->SimilarityMeasure == SIMILARITY_NULL) goto err_free_search; + /** Check for additional data to warn the user about. **/ + for (unsigned int i = 0u; i < inf->nSubInf; i++) + { + pStructInf sub_inf = check_ptr(inf->SubInf[i]); + ASSERTMAGIC(sub_inf, MGK_STRUCTINF); + char* name = sub_inf->Name; + + /** Handle various struct types. **/ + const int struct_type = stStructType(sub_inf); + switch (struct_type) + { + case ST_T_ATTRIB: + { + /** Valid attribute names. 
**/ + char* attrs[] = { + "source", + "threshold", + "similarity_measure", + }; + const unsigned int nattrs = sizeof(attrs) / sizeof(char*); + + /** Ignore valid attribute names. **/ + bool is_valid = false; + for (unsigned int i = 0u; i < nattrs; i++) + { + if (strcmp(name, attrs[i]) == 0) + { + is_valid = true; + break; + } + } + if (is_valid) continue; /* Next inf. */ + + /** Give the user a warning, and attempt to give them a hint. **/ + fprintf(stderr, "Warning: Unknown attribute '%s' in search \"%s\".\n", name, inf->Name); + ci_TryHint(name, attrs, nattrs); + + break; + } + + case ST_T_SUBGROUP: + { + /** The spec does not specify any valid sub-groups for searches. **/ + char* group_type = check_ptr(sub_inf->UsrType); + if (group_type == NULL) goto err_free_search; + fprintf(stderr, + "Warning: Unknown group \"%s\" : \"%s\" in search \"%s\".\n", + name, group_type, inf->Name + ); + break; + } + + default: + { + mssErrorf(1, "Cluster", + "Warning: Unknown struct type %d in search \"%s\".", + struct_type, inf->Name + ); + goto err_free_search; + } + } + } + /** Create cache entry key. **/ char* source_key = search_data->Source->Key; const size_t len = strlen(source_key) + strlen(search_data->Name) + 16lu; - char* key = nmSysMalloc(len * sizeof(char)); + char* key = check_ptr(nmSysMalloc(len * sizeof(char))); + if (key == NULL) goto err_free_search; snprintf(key, len, "%s/%s?%g&%u", source_key, search_data->Name, search_data->Threshold, search_data->SimilarityMeasure ); - pXHashTable search_cache = &ClusterCaches.SearchCache; + pXHashTable search_cache = &ClusterDriverCaches.SearchDataCache; /** Check for a cached version. **/ pSearchData search_maybe = (pSearchData)xhLookup(search_cache, key); @@ -1182,7 +1496,7 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) tprintf("# search: \"%s\"\n", key); tprintf("--> Name: %s\n", search_maybe->Name); /* Cause invalid read if cache was incorrectly freed. 
*/ - /** Free the parsed search that we no longer need. */ + /** Free the parsed search that we no longer need. **/ ci_FreeSearchData(search_data); nmSysFree(key); @@ -1195,8 +1509,11 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) check(xhAdd(search_cache, key, (void*)search_data)); return search_data; + /** Error cleanup. **/ err_free_search: ci_FreeSearchData(search_data); + + err: mssErrorf(0, "Cluster", "Failed to parse search from group \"%s\".", inf->Name); return NULL; } @@ -1212,25 +1529,27 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) *** *** @param inf A parsed pStructInf for the top level group in a .cluster *** structure file. - *** @param obj The parent object struct. + *** @param parent The parent object struct. *** @returns A new pNodeData struct on success, or NULL on failure. ***/ -static pNodeData ci_ParseNodeData(pStructInf inf, pObject obj) +static pNodeData ci_ParseNodeData(pStructInf inf, pObject parent) { int ret; - - /** Retrieve path so we'll know we have it later. **/ - char* path = ci_file_path(obj); + char* path = check_ptr(ci_file_path(parent)); + if (path == NULL) goto err; /** Allocate node struct data. **/ // pNodeData node_data = NodeData |> sizeof() |> nmMalloc() |> check_ptr(); pNodeData node_data = check_ptr(nmMalloc(sizeof(NodeData))); + if (node_data == NULL) goto err; memset(node_data, 0, sizeof(NodeData)); - node_data->Obj = obj; + node_data->Parent = parent; /** Set up param list. **/ node_data->ParamList = check_ptr(expCreateParamList()); - node_data->ParamList->Session = obj->Session; + if (node_data->ParamList == NULL) goto err; + node_data->ParamList->Session = check_ptr(parent->Session); + if (node_data->ParamList->Session == NULL) goto err; ret = expAddParamToList(node_data->ParamList, "parameters", (void*)node_data, 0); if (ret != 0) { @@ -1254,45 +1573,113 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject obj) /** Detect relevant groups. 
**/ XArray param_infs, cluster_infs, search_infs; - check(xaInit(¶m_infs, 8)); - check(xaInit(&cluster_infs, 8)); - check(xaInit(&search_infs, 8)); + memset(¶m_infs, 0, sizeof(XArray)); + memset(&cluster_infs, 0, sizeof(XArray)); + memset(&search_infs, 0, sizeof(XArray)); + if (!check(xaInit(¶m_infs, 8))) goto err_free_arrs; + if (!check(xaInit(&cluster_infs, 8))) goto err_free_arrs; + if (!check(xaInit(&search_infs, 8))) goto err_free_arrs; for (unsigned int i = 0u; i < inf->nSubInf; i++) { - /** Check that this is a group (not an attribute). **/ - pStructInf group_inf = inf->SubInf[i]; - ASSERTMAGIC(group_inf, MGK_STRUCTINF); - if (stStructType(group_inf) != ST_T_SUBGROUP) continue; - - /** Select array by group type. **/ - const char* group_type = group_inf->UsrType; - if (strcmp(group_type, "cluster/parameter") == 0) check_strict(xaAddItem(¶m_infs, group_inf)); - else if (strcmp(group_type, "cluster/cluster") == 0) check_strict(xaAddItem(&cluster_infs, group_inf)); - else if (strcmp(group_type, "cluster/search") == 0) check_strict(xaAddItem(&search_infs, group_inf)); - else + pStructInf sub_inf = check_ptr(inf->SubInf[i]); + ASSERTMAGIC(sub_inf, MGK_STRUCTINF); + char* name = sub_inf->Name; + + /** Handle various struct types. **/ + const int struct_type = stStructType(sub_inf); + switch (struct_type) { - mssErrorf(1, "Cluster", - "Unkown group type \"%s\" on group \"%s\".", - group_type, group_inf->Name - ); - goto err_free_arrs; + case ST_T_ATTRIB: + { + /** Valid attribute names. **/ + char* attrs[] = { + "source", + "attr_name", + }; + const unsigned int nattrs = sizeof(attrs) / sizeof(char*); + + /** Ignore valid attribute names. **/ + bool is_valid = false; + for (unsigned int i = 0u; i < nattrs; i++) + { + if (strcmp(name, attrs[i]) == 0) + { + is_valid = true; + break; + } + } + if (is_valid) continue; /* Next inf. */ + + /** Give the user a warning, and attempt to give them a hint. 
**/ + fprintf(stderr, "Warning: Unknown attribute '%s' in cluster node \"%s\".\n", name, inf->Name); + ci_TryHint(name, attrs, nattrs); + + break; + } + + case ST_T_SUBGROUP: + { + /** The spec does not specify any valid sub-groups for searches. **/ + char* group_type = check_ptr(sub_inf->UsrType); + if (group_type == NULL) goto err_free_arrs; + if (strcmp(group_type, "cluster/parameter") == 0) + { + if (!check_neg(xaAddItem(¶m_infs, sub_inf))) + goto err_free_arrs; + } + else if (strcmp(group_type, "cluster/cluster") == 0) + { + if (!check_neg(xaAddItem(&cluster_infs, sub_inf))) + goto err_free_arrs; + } + else if (strcmp(group_type, "cluster/search") == 0) + { + if (!check_neg(xaAddItem(&search_infs, sub_inf))) + goto err_free_arrs; + } + else + { + /** Give the user a warning, and attempt to give them a hint. **/ + fprintf(stderr, + "Warning: Unknown group type \"%s\" on group \"%s\".\n", + group_type, sub_inf->Name + ); + ci_TryHint(group_type, (char*[]){ + "cluster/parameter", + "cluster/cluster", + "cluster/search", + NULL, + }, 0u); + } + break; + } + + default: + { + mssErrorf(1, "Cluster", + "Warning: Unknown struct type %d in search \"%s\".", + struct_type, inf->Name + ); + goto err_free_arrs; + } } } /** Extract OpenCtl for use below. **/ - bool has_provided_params = obj != NULL - && obj->Pathname != NULL - && obj->Pathname->OpenCtl != NULL - && obj->Pathname->OpenCtl[obj->SubPtr - 1] != NULL - && obj->Pathname->OpenCtl[obj->SubPtr - 1]->nSubInf > 0 - && obj->Pathname->OpenCtl[obj->SubPtr - 1]->SubInf != NULL; - int num_provided_params = (has_provided_params) ? obj->Pathname->OpenCtl[obj->SubPtr - 1]->nSubInf : 0; - pStruct* provided_params = (has_provided_params) ? 
obj->Pathname->OpenCtl[obj->SubPtr - 1]->SubInf : NULL; + bool has_provided_params = parent != NULL + && parent->Pathname != NULL + && parent->Pathname->OpenCtl != NULL + && parent->Pathname->OpenCtl[parent->SubPtr - 1] != NULL + && parent->Pathname->OpenCtl[parent->SubPtr - 1]->nSubInf > 0 + && parent->Pathname->OpenCtl[parent->SubPtr - 1]->SubInf != NULL; + int num_provided_params = (has_provided_params) ? parent->Pathname->OpenCtl[parent->SubPtr - 1]->nSubInf : 0; + pStruct* provided_params = (has_provided_params) ? parent->Pathname->OpenCtl[parent->SubPtr - 1]->SubInf : NULL; /** Itterate over each param in the structure file. **/ node_data->nParams = param_infs.nItems; const size_t params_size = node_data->nParams * sizeof(pParam); node_data->Params = check_ptr(nmMalloc(params_size)); + if (node_data->Params == NULL) goto err_free_arrs; memset(node_data->Params, 0, params_size); for (unsigned int i = 0u; i < node_data->nParams; i++) { @@ -1316,7 +1703,7 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject obj) mssErrorf(1, "Cluster", "Provided param struct cannot be NULL."); fprintf(stderr, "Debug info: obj->Pathname->OpenCtl[%d]->SubInf[%u] is NULL", - obj->SubPtr - 1, j + parent->SubPtr - 1, j ); goto err_free_arrs; } @@ -1325,7 +1712,7 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject obj) if (strcmp(provided_param->Name, param->Name) != 0) continue; /** Matched! The user is providing a value for this param. **/ - ret = paramSetValueFromInfNe(param, provided_param, 0, node_data->ParamList, obj->Session); + ret = paramSetValueFromInfNe(param, provided_param, 0, node_data->ParamList, node_data->ParamList->Session); if (ret != 0) { mssErrorf(0, "Cluster", @@ -1346,7 +1733,7 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject obj) } /** Invoke param hints parsing. 
**/ - ret = paramEvalHints(param, node_data->ParamList, obj->Session); + ret = paramEvalHints(param, node_data->ParamList, node_data->ParamList->Session); if (ret != 0) { mssErrorf(0, "Cluster", @@ -1355,59 +1742,62 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject obj) ); goto err_free_arrs; } - if (strcmp("k", param->Name) == 0) tprintf("Param k is now %d\n", param->Value->Data.Integer); } - check(xaDeInit(¶m_infs)); + check(xaDeInit(¶m_infs)); /* Failure ignored. */ param_infs.nAlloc = 0; /** Parse source data. **/ node_data->SourceData = ci_ParseSourceData(inf, node_data->ParamList, path); - if (node_data->SourceData == NULL) goto err_free_node; + if (node_data->SourceData == NULL) goto err_free_arrs; /** Parse each cluster. **/ - node_data->nClusters = cluster_infs.nItems; - if (node_data->nClusters > 0) + node_data->nClusterDatas = cluster_infs.nItems; + if (node_data->nClusterDatas > 0) { - const size_t clusters_size = node_data->nClusters * sizeof(pClusterData); - node_data->Clusters = check_ptr(nmMalloc(clusters_size)); - memset(node_data->Clusters, 0, clusters_size); - for (unsigned int i = 0u; i < node_data->nClusters; i++) + const size_t clusters_size = node_data->nClusterDatas * sizeof(pClusterData); + node_data->ClusterDatas = check_ptr(nmMalloc(clusters_size)); + if (node_data->ClusterDatas == NULL) goto err_free_arrs; + memset(node_data->ClusterDatas, 0, clusters_size); + for (unsigned int i = 0u; i < node_data->nClusterDatas; i++) { - node_data->Clusters[i] = ci_ParseClusterData(cluster_infs.Items[i], node_data); - if (node_data->Clusters[i] == NULL) goto err_free_arrs; + node_data->ClusterDatas[i] = ci_ParseClusterData(cluster_infs.Items[i], node_data); + if (node_data->ClusterDatas[i] == NULL) goto err_free_arrs; } } - else node_data->Clusters = NULL; - check(xaDeInit(&cluster_infs)); + else node_data->ClusterDatas = NULL; + check(xaDeInit(&cluster_infs)); /* Failure ignored. */ cluster_infs.nAlloc = 0; /** Parse each search. 
**/ - node_data->nSearches = search_infs.nItems; - if (node_data->nSearches > 0) + node_data->nSearchDatas = search_infs.nItems; + if (node_data->nSearchDatas > 0) { - const size_t searches_size = node_data->nSearches * sizeof(pSearchData); - node_data->Searches = check_ptr(nmMalloc(searches_size)); - memset(node_data->Searches, 0, searches_size); - for (unsigned int i = 0u; i < node_data->nSearches; i++) + const size_t searches_size = node_data->nSearchDatas * sizeof(pSearchData); + node_data->SearchDatas = check_ptr(nmMalloc(searches_size)); + if (node_data->SearchDatas == NULL) goto err_free_arrs; + memset(node_data->SearchDatas, 0, searches_size); + for (unsigned int i = 0u; i < node_data->nSearchDatas; i++) { - node_data->Searches[i] = ci_ParseSearchData(search_infs.Items[i], node_data); - if (node_data->Searches[i] == NULL) goto err_free_node; /* The XArrays are already freed. */ + node_data->SearchDatas[i] = ci_ParseSearchData(search_infs.Items[i], node_data); + if (node_data->SearchDatas[i] == NULL) goto err_free_arrs; } } - else node_data->Searches = NULL; - check(xaDeInit(&search_infs)); + else node_data->SearchDatas = NULL; + check(xaDeInit(&search_infs)); /* Failure ignored. */ search_infs.nAlloc = 0; /** Success. **/ return node_data; err_free_arrs: - if (param_infs.nAlloc != 0) check(xaDeInit(¶m_infs)); - if (cluster_infs.nAlloc != 0) check(xaDeInit(&cluster_infs)); - if (search_infs.nAlloc != 0) check(xaDeInit(&search_infs)); + if (param_infs.nAlloc != 0) check(xaDeInit(¶m_infs)); /* Failure ignored. */ + if (cluster_infs.nAlloc != 0) check(xaDeInit(&cluster_infs)); /* Failure ignored. */ + if (search_infs.nAlloc != 0) check(xaDeInit(&search_infs)); /* Failure ignored. 
*/ err_free_node: ci_FreeNodeData(node_data); + + err: mssErrorf(0, "Cluster", "Failed to parse node from group \"%s\" in file: %s", inf->Name, path); return NULL; } @@ -1426,12 +1816,12 @@ static void ci_FreeSourceData(pSourceData source_data) if (source_data->AttrName != NULL) nmSysFree(source_data->AttrName); /** Free fetched data, if it exists. **/ - if (source_data->Data != NULL) + if (source_data->Strings != NULL) { for (unsigned int i = 0u; i < source_data->nVectors; i++) - nmSysFree(source_data->Data[i]); - nmFree(source_data->Data, source_data->nVectors * sizeof(char*)); - source_data->Data = NULL; + nmSysFree(source_data->Strings[i]); + nmFree(source_data->Strings, source_data->nVectors * sizeof(char*)); + source_data->Strings = NULL; } /** Free computed vectors, if they exist. **/ @@ -1460,11 +1850,19 @@ static void ci_FreeClusterData(pClusterData cluster_data, bool recursive) if (cluster_data->Name != NULL) nmSysFree(cluster_data->Name); /** Free computed data, if it exists. **/ - if (cluster_data->Labels != NULL) + if (cluster_data->Clusters != NULL) { const unsigned int nVectors = cluster_data->SourceData->nVectors; - nmFree(cluster_data->Labels, nVectors * sizeof(unsigned int)); - cluster_data->Labels = NULL; + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + pCluster cluster = &cluster_data->Clusters[i]; + nmFree(cluster->Strings, cluster->Size * sizeof(char*)); + nmFree(cluster->Vectors, cluster->Size * sizeof(pVector)); + } + nmFree(cluster_data->Clusters, nVectors * sizeof(Cluster)); + nmFree(cluster_data->Sims, nVectors * sizeof(double)); + cluster_data->Clusters = NULL; + cluster_data->Sims = NULL; } /** Free subclusters recursively. **/ @@ -1517,23 +1915,23 @@ static void ci_FreeNodeData(pNodeData node_data) if (node_data->ParamList != NULL) expFreeParamList(node_data->ParamList); /** Free parsed clusters, if they exist. 
**/ - if (node_data->Clusters != NULL) + if (node_data->ClusterDatas != NULL) { /*** This data is cached, so we should NOT free it! *** The caching system is responsible for the memory. ***/ - nmFree(node_data->Clusters, node_data->nClusters * sizeof(pClusterData)); - node_data->Clusters = NULL; + nmFree(node_data->ClusterDatas, node_data->nClusterDatas * sizeof(pClusterData)); + node_data->ClusterDatas = NULL; } /** Free parsed searches, if they exist. **/ - if (node_data->Searches != NULL) + if (node_data->SearchDatas != NULL) { /*** This data is cached, so we should NOT free it! *** The caching system is responsible for the memory. ***/ - nmFree(node_data->Searches, node_data->nSearches * sizeof(pSearchData)); - node_data->Searches = NULL; + nmFree(node_data->SearchDatas, node_data->nSearchDatas * sizeof(pSearchData)); + node_data->SearchDatas = NULL; } /** Free data source, if one exists. **/ @@ -1554,6 +1952,18 @@ static void ci_FreeNodeData(pNodeData node_data) nmFree(node_data, sizeof(NodeData)); } +/** Frees all caches for all cluster driver instances. **/ +static void ci_FreeCaches(void) + { + /*** Free caches in reverse of the order they are created in case + *** cached data relies on its source during the freeing process. + ***/ + check(xhClearKeySafe(&ClusterDriverCaches.SearchDataCache, ci_CacheFreeSearch, NULL)); /* Failure ignored. */ + check(xhClearKeySafe(&ClusterDriverCaches.ClusterDataCache, ci_CacheFreeCluster, NULL)); /* Failure ignored. */ + check(xhClearKeySafe(&ClusterDriverCaches.SourceDataCache, ci_CacheFreeSourceData, NULL)); /* Failure ignored. 
*/ + } + + /** ================ Deep Size Computation Functions ================ **/ /** ANCHOR[id=sizing] **/ // LINK #functions @@ -1574,10 +1984,10 @@ static unsigned int ci_SizeOfSourceData(pSourceData source_data) if (source_data->Name != NULL) size += strlen(source_data->Name) * sizeof(char); if (source_data->SourcePath != NULL) size += strlen(source_data->SourcePath) * sizeof(char); if (source_data->AttrName != NULL) size += strlen(source_data->AttrName) * sizeof(char); - if (source_data->Data != NULL) + if (source_data->Strings != NULL) { for (unsigned int i = 0u; i < source_data->nVectors; i++) - size += strlen(source_data->Data[i]) * sizeof(char); + size += strlen(source_data->Strings[i]) * sizeof(char); size += source_data->nVectors * sizeof(char*); } if (source_data->Vectors != NULL) @@ -1607,7 +2017,18 @@ static unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursi { unsigned int size = 0u; if (cluster_data->Name != NULL) size += strlen(cluster_data->Name) * sizeof(char); - if (cluster_data->Labels != NULL) size += cluster_data->SourceData->nVectors * sizeof(unsigned int); + if (cluster_data->Clusters != NULL) + { + const unsigned int nVectors = cluster_data->SourceData->nVectors; + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + const unsigned int cluster_size = cluster_data->Clusters[i].Size; + size += cluster_size * sizeof(char*); + size += cluster_size * sizeof(pVector); + } + size += nVectors * sizeof(Cluster); + size += nVectors * sizeof(double); + } if (cluster_data->SubClusters != NULL) { if (recursive) @@ -1663,43 +2084,13 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) /** If the vectors are already computed, we're done. **/ if (source_data->Vectors != NULL) return 0; - /** Handle error case that happens if memory optimizations break. 
**/ - if (source_data->Data != NULL) - { - /*** We have data, but not vectors, which means that this function ran - *** before, but the vectors were cleared by ci_GCSourceData(). This - *** should only happen if the vectors will not be needed again. Thus, - *** clearly something has gone wrong. - ***/ - fprintf(stderr, "ERROR:" - "\tci_computeSourceData() invoked on source data \"%s\" where\n" - "\tvectors were previously freed. There is likely a bug in\n" - "\tci_GCSourceData() which caused it to free vectors when we\n" - "\tstill needed them.\n", - source_data->Name - ); - fprintf(stderr, "Resolution:\n" - "\tThe original data will be dropped and refetched, and the\n" - "\tthe vectors will be recomputed, avoiding possible issues\n" - "\tfrom stale data.\n" - ); - - /** Drop source_data->Data. **/ - for (unsigned int i = 0u; i < source_data->nVectors; i++) - nmSysFree(source_data->Data[i]); - nmFree(source_data->Data, source_data->nVectors * sizeof(char*)); - source_data->Data = NULL; - source_data->nVectors = 0; - } - - /** Record the date and time. **/ - /** Even if this computation fails, we may want this information. **/ - check(objCurrentDate(&source_data->DateComputed)); - /** Time to play shoots-and-ladders in an error-handling jungle of gotos. **/ bool successful = false; int ret; + /** Record the date and time. **/ + if (!check(objCurrentDate(&source_data->DateComputed))) goto end; + /** Open the source path specified by the .cluster file. 
**/ tprintf("Openning...\n"); pObject obj = objOpen(session, source_data->SourcePath, OBJ_O_RDONLY, 0600, "system/directory"); @@ -1712,7 +2103,6 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) source_data->AttrName, source_data->SourcePath ); - successful = false; goto end; } @@ -1730,14 +2120,15 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) obj->Driver->Name, source_data->SourcePath ); - successful = false; goto end_close; } /** Initialize an xarray to store the retrieved data. **/ XArray data_xarray, vector_xarray; - check(xaInit(&data_xarray, 64)); - check(xaInit(&vector_xarray, 64)); + memset(&data_xarray, 0, sizeof(XArray)); + memset(&vector_xarray, 0, sizeof(XArray)); + if (!check(xaInit(&data_xarray, 64))) goto end_close_query; + if (!check(xaInit(&vector_xarray, 64))) goto end_free_data; /** Fetch data and build vectors. **/ tprintf("Skips: "); @@ -1804,7 +2195,7 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) if (strlen(val) == 0) { tprintf("_"); - check(fflush(stdout)); + check(fflush(stdout)); /* Failure ignored. */ continue; } @@ -1826,34 +2217,41 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) { /** Skip pVector with no pairs. **/ tprintf("."); - check(fflush(stdout)); + check(fflush(stdout)); /* Failure ignored. */ ca_free_vector(vector); continue; } /** Store value. **/ char* dup_val = check_ptr(nmSysStrdup(val)); - check_strict(xaAddItem(&data_xarray, (void*)dup_val)); - check_strict(xaAddItem(&vector_xarray, (void*)vector)); + if (dup_val == NULL) goto end_free_data; + if (!check_neg(xaAddItem(&data_xarray, (void*)dup_val))) goto end_free_data; + if (!check_neg(xaAddItem(&vector_xarray, (void*)vector))) goto end_free_data; /** Clean up. 
**/ - check(objClose(entry)); + ret = objClose(entry); + if (ret != 0) + { + mssErrorf(0, "Cluster", "Failed to close object entry (error code %d).", ret); + // ret = ret; // Fall-through: Failure ignored. + } } tprintf("\nData aquired.\n"); source_data->nVectors = vector_xarray.nItems; /** Trim data and store data. **/ const size_t data_size = source_data->nVectors * sizeof(char*); - source_data->Data = check_ptr(nmMalloc(data_size)); - memcpy(source_data->Data, data_xarray.Items, data_size); - check(xaDeInit(&data_xarray)); + source_data->Strings = check_ptr(nmMalloc(data_size)); + if (source_data->Strings == NULL) goto end_free_data; + memcpy(source_data->Strings, data_xarray.Items, data_size); + check(xaDeInit(&data_xarray)); /* Failure ignored. */ data_xarray.nAlloc = 0; /** Trim data and store vectors. **/ const size_t vectors_size = source_data->nVectors * sizeof(pVector); source_data->Vectors = check_ptr(nmMalloc(vectors_size)); memcpy(source_data->Vectors, vector_xarray.Items, vectors_size); - check(xaDeInit(&vector_xarray)); + check(xaDeInit(&vector_xarray)); /* Failure ignored. */ vector_xarray.nAlloc = 0; /** Success. **/ @@ -1864,21 +2262,21 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) { for (unsigned int i = 0u; i < data_xarray.nItems; i++) nmSysFree(data_xarray.Items[i]); - check(xaDeInit(&data_xarray)); + check(xaDeInit(&data_xarray)); /* Failure ignored. */ } if (vector_xarray.nAlloc != 0) { for (unsigned int i = 0u; i < vector_xarray.nItems; i++) ca_free_vector(vector_xarray.Items[i]); - check(xaDeInit(&vector_xarray)); + check(xaDeInit(&vector_xarray)); /* Failure ignored. */ } - // end_close_query: + end_close_query: ret = objQueryClose(query); if (ret != 0) { mssErrorf(0, "Cluster", "Failed to close query (error code %d).", ret); - // ret = ret; // Fall-through: Continue through failure. + // ret = ret; // Fall-through: Failure ignored. 
} end_close: @@ -1886,11 +2284,11 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) if (ret != 0) { mssErrorf(0, "Cluster", "Failed to close object driver (error code %d).", ret); - // ret = ret; // Fall-through: Continue through failure. + // ret = ret; // Fall-through: Failure ignored. } end: - if (!successful) mssErrorf(0, "Cluster", "Vector computation failed."); + if (!successful) mssErrorf(0, "Cluster", "SourceData computation failed."); return (successful) ? 0 : -1; } @@ -1910,36 +2308,58 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) { /** If the clusters are alreadyd computed, we're done. **/ - if (cluster_data->Labels != NULL) return 0; + if (cluster_data->Clusters != NULL) return 0; /** Make source data available. **/ pSourceData source_data = node_data->SourceData; - /** We need the vectors to compute clusters. **/ + /** We need the SourceData vectors to compute clusters. **/ if (ci_ComputeSourceData(source_data, node_data->ParamList->Session) != 0) { - mssErrorf(0, "Cluster", "Vectors not found."); + mssErrorf(0, "Cluster", "Failed to compute SourceData."); goto err; } /** Record the date and time. **/ - /** Even if this computation fails, we may want this information. **/ - check(objCurrentDate(&cluster_data->DateComputed)); + if (!check(objCurrentDate(&cluster_data->DateComputed))) goto err; /** Allocate static memory for finding clusters. 
**/ - const size_t labels_size = source_data->nVectors * sizeof(unsigned int); - cluster_data->Labels = check_ptr(nmMalloc(labels_size)); + const size_t clusters_size = cluster_data->nClusters * sizeof(Cluster); + cluster_data->Clusters = check_ptr(nmMalloc(clusters_size)); + if (cluster_data->Clusters == NULL) goto err; + memset(cluster_data->Clusters, 0, clusters_size); + const size_t sims_size = source_data->nVectors * sizeof(double); + cluster_data->Sims = check_ptr(nmMalloc(sims_size)); + if (cluster_data->Sims == NULL) goto err_free_clusters; + memset(cluster_data->Sims, 0, sims_size); /** Execute clustering. **/ switch (cluster_data->ClusterAlgorithm) { case ALGORITHM_NONE: - case ALGORITHM_SLIDING_WINDOW: /* Clusters are not computed separately for performance reasons. */ + { tprintf("Applying no clustering...\n"); - memset(cluster_data->Labels, 0u, labels_size); + /** Put all the data into one cluster. **/ + pCluster first_cluster = &cluster_data->Clusters[0]; + first_cluster->Size = source_data->nVectors; + first_cluster->Strings = check_ptr(nmMalloc(source_data->nVectors * sizeof(char*))); + if (first_cluster->Strings == NULL) goto err_free_sims; + first_cluster->Vectors = check_ptr(nmMalloc(source_data->nVectors * sizeof(pVector))); + if (first_cluster->Vectors == NULL) goto err_free_sims; + memcpy(first_cluster->Strings, source_data->Strings, source_data->nVectors * sizeof(char*)); + memcpy(first_cluster->Vectors, source_data->Vectors, source_data->nVectors * sizeof(pVector)); + break; + } + + case ALGORITHM_SLIDING_WINDOW: + /** Computed in each search for efficiency. **/ + tprintf("Skipping sliding window clustering...\n"); + memset(cluster_data->Clusters, 0, clusters_size); break; case ALGORITHM_KMEANS: + { + tprintf("Applying kmeans clustering...\n"); /** Check for unimplemented similarity measures. 
**/ if (cluster_data->SimilarityMeasure != SIMILARITY_COSINE) { @@ -1947,25 +2367,64 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) "The similarity meausre \"%s\" is not implemented.", ci_SimilarityMeasureToString(cluster_data->SimilarityMeasure) ); - goto err; + goto err_free_sims; } - /** kmeans expects clusters to be initialized. **/ - memset(cluster_data->Labels, 0u, labels_size); + /** Allocate lables. Note: kmeans does not require us to initialize them. **/ + const size_t lables_size = source_data->nVectors * sizeof(unsigned int); + unsigned int* labels = check_ptr(nmMalloc(lables_size)); + if (labels == NULL) goto err_free_sims; + /** Run kmeans. **/ tprintf("Running kmeans\n"); Timer timer_i, *timer = timer_start(timer_init(&timer_i)); - ca_kmeans( + const bool successful = check(ca_kmeans( source_data->Vectors, source_data->nVectors, - cluster_data->Labels, - cluster_data->NumClusters, + cluster_data->nClusters, cluster_data->MaxIterations, - cluster_data->MinImprovement - ); + cluster_data->MinImprovement, + labels, + cluster_data->Sims + )); timer_stop(timer); - tprintf("Done after %.4lf.\n", timer_get(timer)); + tprintf("Clustering done after %.4lf.\n", timer_get(timer)); + if (!successful) goto err_free_sims; + + /** Convert the labels into clusters. **/ + + /** Allocate space for clusters. **/ + XArray indexes_in_cluster[cluster_data->nClusters]; + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + if (!check(xaInit(&indexes_in_cluster[i], 8))) goto err_free_sims; + + /** Iterate through each label and add the index of the specified cluster to the xArray. **/ + for (unsigned long long i = 0llu; i < source_data->nVectors; i++) + if (!check_neg(xaAddItem(&indexes_in_cluster[labels[i]], (void*)i))) goto err_free_sims; + nmFree(labels, lables_size); /* Free unused data. */ + + /** Iterate through each cluster, store it, and free the xArray. 
**/ + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + pXArray indexes_in_this_cluster = &indexes_in_cluster[i]; + pCluster cluster = &cluster_data->Clusters[i]; + cluster->Size = indexes_in_this_cluster->nItems; + cluster->Strings = check_ptr(nmMalloc(cluster->Size * sizeof(char*))); + if (cluster->Strings == NULL) goto err_free_sims; + cluster->Vectors = check_ptr(nmMalloc(cluster->Size * sizeof(pVector))); + if (cluster->Vectors == NULL) goto err_free_sims; + for (unsigned int j = 0u; j < cluster->Size; j++) + { + const unsigned long long index = (unsigned long long)indexes_in_this_cluster->Items[j]; + cluster->Strings[j] = source_data->Strings[index]; + cluster->Vectors[j] = source_data->Vectors[index]; + } + check(xaDeInit(indexes_in_this_cluster)); /* Failure ignored. */ + } + + /** k-means is done. **/ break; + } default: mssErrorf(1, "Cluster", @@ -1975,9 +2434,26 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) goto err; } + /** Success. **/ tprintf("Clustering done.\n"); return 0; + err_free_sims: + nmFree(cluster_data->Sims, sims_size); + cluster_data->Sims = NULL; + + err_free_clusters: + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + pCluster cluster = &cluster_data->Clusters[i]; + if (cluster->Strings != NULL) nmFree(cluster->Strings, cluster->Size * sizeof(char*)); + else break; + if (cluster->Vectors != NULL) nmFree(cluster->Vectors, cluster->Size * sizeof(pVector)); + else break; + } + nmFree(cluster_data->Clusters, clusters_size); + cluster_data->Clusters = NULL; + err: mssErrorf(0, "Cluster", "Cluster computation failed for \"%s\".", cluster_data->Name); return -1; @@ -2003,11 +2479,8 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) /** If the clusters are already computed, we're done. **/ if (search_data->Dups != NULL) return 0; - /** Extract structs. **/ + /** We need the cluster data to be computed before we search it. 
**/ pClusterData cluster_data = search_data->Source; - pSourceData source_data = node_data->SourceData; - - /** We need the clusters to be able to search them. **/ ret = ci_ComputeClusterData(cluster_data, node_data); if (ret != 0) { @@ -2026,35 +2499,112 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) } /** Record the date and time. **/ - /** Even if this computation fails, we may want this information. **/ - check(objCurrentDate(&search_data->DateComputed)); + if (!check(objCurrentDate(&search_data->DateComputed))) goto err; - /** Execute the search. **/ - tprintf("Invoking ca_search.\n"); + tprintf("Invoking search.\n"); Timer timer_i, *timer = timer_start(timer_init(&timer_i)); - pXArray dups_temp = ca_search( - source_data->Vectors, - source_data->nVectors, - cluster_data->Labels, - search_data->Threshold - ); + /** Execute the search using the specified source and comparison function. **/ + pXArray dups = NULL, dups_temp = NULL; + switch (search_data->SimilarityMeasure) + { + case SIMILARITY_COSINE: + { + if (cluster_data->ClusterAlgorithm == ALGORITHM_SLIDING_WINDOW) + { + dups_temp = check_ptr(ca_sliding_search( + (void**)cluster_data->SourceData->Vectors, + cluster_data->SourceData->nVectors, + cluster_data->MaxIterations, /* Window size. */ + ca_cos_compare, + search_data->Threshold, + dups + )); + } + else + { + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + dups_temp = check_ptr(ca_complete_search( + (void**)cluster_data->Clusters[i].Vectors, + cluster_data->Clusters[i].Size, + ca_cos_compare, + search_data->Threshold, + dups + )); + if (dups_temp == NULL) goto err; + else dups = dups_temp; + } + } + break; + } + + case SIMILARITY_LEVENSHTEIN: + { + if (cluster_data->ClusterAlgorithm == ALGORITHM_SLIDING_WINDOW) + { + dups_temp = check_ptr(ca_sliding_search( + (void**)cluster_data->SourceData->Vectors, + cluster_data->SourceData->nVectors, + cluster_data->MaxIterations, /* Window size. 
*/ + ca_lev_compare, + search_data->Threshold, + dups + )); + } + else + { + for (unsigned int i = 0u; i < cluster_data->nClusters; i++) + { + dups_temp = check_ptr(ca_complete_search( + (void**)cluster_data->Clusters[i].Strings, + cluster_data->Clusters[i].Size, + ca_lev_compare, + search_data->Threshold, + dups + )); + if (dups_temp == NULL) goto err; + else dups = dups_temp; + } + } + break; + } + + default: + mssErrorf(1, "Cluster", + "Unknown similarity meansure \"%s\".", + ci_SimilarityMeasureToString(search_data->SimilarityMeasure) + ); + goto err; + } timer_stop(timer); if (dups_temp == NULL) goto err; - tprintf("ca_search done after %.4lf.\n", timer_get(timer)); + else dups = dups_temp; + tprintf("Search done after %.4lf.\n", timer_get(timer)); /** Store dups. **/ - search_data->nDups = dups_temp->nItems; - search_data->Dups = (dups_temp->nItems == 0) + search_data->nDups = dups->nItems; + search_data->Dups = (dups->nItems == 0) ? check_ptr(nmMalloc(0)) - : ci_xaToTrimmedArray(dups_temp); + : ci_xaToTrimmedArray(dups); /** Free unused data. **/ tprintf("Cleanup.\n"); - check(xaFree(dups_temp)); + check(xaFree(dups)); /* Failure ignored. */ + /** Success. **/ return 0; err: + if (dups != NULL) + { + for (unsigned int i = 0u; i < dups->nItems; i++) + { + if (dups->Items[i] != NULL) nmFree(dups->Items[i], sizeof(Dup)); + else break; + } + check(xaFree(dups)); /* Failure ignored. */ + } + mssErrorf(0, "Cluster", "Search computation failed for \"%s\".", search_data->Name); return -1; } @@ -2126,9 +2676,7 @@ static int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData { pParam param = (pParam)node_data->Params[i]; if (strcmp(param->Name, attr_name) != 0) continue; - - tprintf("Param found: Parsing...\n"); - + /** Parameter found. 
**/ if (param->Value == NULL) return 1; if (param->Value->Flags & DATA_TF_NULL) return 1; @@ -2138,14 +2686,16 @@ static int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData return -1; } - tprintf("Param found: Copying...\n"); /** Return param value. **/ - objCopyData(&(param->Value->Data), val, datatype); + if (!check(objCopyData(&(param->Value->Data), val, datatype))) goto err; return 0; } - /** Param not found. **/ - tprintf("Param not found.\n"); + err: + mssErrorf(1, "Cluster", + "Failed to get parameter %s : %s", + attr_name, ci_TypeToStr(datatype) + ); return -1; } @@ -2164,10 +2714,10 @@ static int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData // LINK #functions /*** Opens a new cluster driver instance by parsing a `.cluster` file found - *** at the path provided in obj. + *** at the path provided in parent. *** - *** @param obj The object being opened, including the path, session, and - *** other necessary information. + *** @param parent The parent of the object to be openned, including useful + *** information such as the pathname, session, etc. *** @param mask Driver permission mask (unused). *** @param sys_type ? (unused) *** @param usr_type The object system file type being openned. Should always @@ -2178,30 +2728,30 @@ static int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData *** @returns A pDriverData struct representing a driver instance, or *** NULL if an error occures. ***/ -void* clusterOpen(pObject obj, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt) +void* clusterOpen(pObject parent, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt) { - tprintf("Warning: clusterOpen(\"%s\") is under active development.\n", ci_file_name(obj)); + tprintf("Warning: clusterOpen(\"%s\") is under active development.\n", ci_file_name(parent)); - /** If CREAT and EXCL are specified, create it and fail if it already exists. 
**/ + /** If CREAT and EXCL are specified, exclusively create it, failing if the file already exists. **/ pSnNode node_struct = NULL; - bool can_create = (obj->Mode & O_CREAT) && (obj->SubPtr == obj->Pathname->nElements); - if (can_create && (obj->Mode & O_EXCL)) + bool can_create = (parent->Mode & O_CREAT) && (parent->SubPtr == parent->Pathname->nElements); + if (can_create && (parent->Mode & O_EXCL)) { - node_struct = snNewNode(obj->Prev, usr_type); + node_struct = snNewNode(parent->Prev, usr_type); if (node_struct == NULL) { - mssErrorf(0, "Cluster", "Failed to EXCL create new node struct."); + mssErrorf(0, "Cluster", "Failed to exclusively create new node struct."); goto err; } } /** Read the node if it exists. **/ if (node_struct == NULL) - node_struct = snReadNode(obj->Prev); + node_struct = snReadNode(parent->Prev); - /** If we can't read, create it (if allowed). **/ + /** If we can't read it, create it (if allowed). **/ if (node_struct == NULL && can_create) - node_struct = snNewNode(obj->Prev, usr_type); + node_struct = snNewNode(parent->Prev, usr_type); /** If there still isn't a node, fail early. **/ if (node_struct == NULL) @@ -2210,37 +2760,40 @@ void* clusterOpen(pObject obj, int mask, pContentType sys_type, char* usr_type, goto err; } - /** Parse node data. **/ - pNodeData node_data = ci_ParseNodeData(node_struct->Data, obj); + /** Magic. **/ + ASSERTMAGIC(node_struct, MGK_STNODE); + ASSERTMAGIC(node_struct->Data, MGK_STRUCTINF); + + /** Parse node data from the node_struct. **/ + pNodeData node_data = ci_ParseNodeData(node_struct->Data, parent); if (node_data == NULL) { - mssErrorf(0, "Cluster", "Failed to parse structure file \"%s\".", ci_file_name(obj)); + mssErrorf(0, "Cluster", "Failed to parse structure file \"%s\".", ci_file_name(parent)); goto err; } - node_data->Node = node_struct; - node_data->Node->OpenCnt++; /** Allocate driver instance data. 
**/ pDriverData driver_data = check_ptr(nmMalloc(sizeof(DriverData))); + if (driver_data == NULL) goto err_free_node; memset(driver_data, 0, sizeof(DriverData)); driver_data->NodeData = node_data; /** Detect target from path. **/ - tprintf("Parsing node path: %d %d\n", obj->SubPtr, obj->SubCnt); obj->SubCnt = 0; - char* target_name = obj_internal_PathPart(obj->Pathname, obj->SubPtr + obj->SubCnt++, 1); + tprintf("Parsing node path: %d %d\n", parent->SubPtr, parent->SubCnt); parent->SubCnt = 0; + char* target_name = obj_internal_PathPart(parent->Pathname, parent->SubPtr + parent->SubCnt++, 1); if (target_name == NULL) { /** Target found: Root **/ tprintf("Found target: Root.\n"); driver_data->TargetType = TARGET_ROOT; driver_data->TargetData = (void*)driver_data->NodeData->SourceData; - return (void*)driver_data; /* Sucess. */ + return (void*)driver_data; /* Success. */ } /** Search clusters. **/ - for (unsigned int i = 0u; i < node_data->nClusters; i++) + for (unsigned int i = 0u; i < node_data->nClusterDatas; i++) { - pClusterData cluster = node_data->Clusters[i]; + pClusterData cluster = node_data->ClusterDatas[i]; if (strcmp(cluster->Name, target_name) != 0) continue; /** Target found: Cluster **/ @@ -2251,7 +2804,7 @@ void* clusterOpen(pObject obj, int mask, pContentType sys_type, char* usr_type, while (true) { /** Decend one path part deeper into the path. **/ - const char* path_part = obj_internal_PathPart(obj->Pathname, obj->SubPtr + obj->SubCnt++, 1); + const char* path_part = obj_internal_PathPart(parent->Pathname, parent->SubPtr + parent->SubCnt++, 1); /** If the path does not go any deeper, we're done. **/ if (path_part == NULL) @@ -2278,13 +2831,13 @@ void* clusterOpen(pObject obj, int mask, pContentType sys_type, char* usr_type, continue_descent:; } - return (void*)driver_data; /* Sucess. */ + return (void*)driver_data; /* Success. */ } /** Search searches. 
**/ - for (unsigned int i = 0u; i < node_data->nSearches; i++) + for (unsigned int i = 0u; i < node_data->nSearchDatas; i++) { - pSearchData search = node_data->Searches[i]; + pSearchData search = node_data->SearchDatas[i]; if (strcmp(search->Name, target_name) != 0) continue; /** Target found: Search **/ @@ -2292,25 +2845,40 @@ void* clusterOpen(pObject obj, int mask, pContentType sys_type, char* usr_type, driver_data->TargetData = (void*)search; /** Check for extra, invalid path parts. **/ - char* extra_data = obj_internal_PathPart(obj->Pathname, obj->SubPtr + obj->SubCnt++, 1); + char* extra_data = obj_internal_PathPart(parent->Pathname, parent->SubPtr + parent->SubCnt++, 1); if (extra_data != NULL) { mssErrorf(1, "Cluster", "Unknown path part %s.", extra_data); goto err_free_node; } - tprintf("Found target search: %s %d %d\n", search->Name, obj->SubPtr, obj->SubCnt); - return (void*)driver_data; /* Sucess. */ + tprintf("Found target search: %s %d %d\n", search->Name, parent->SubPtr, parent->SubCnt); + return (void*)driver_data; /* Success. */ } /** We were unable to find the requested cluster or search. **/ - mssErrorf(1, "Cluster", "\"%s\" is not the name of a declaired cluster or search.", target_name); + mssErrorf(1, "Cluster", "\"%s\" is not the name of a declared cluster or search.", target_name); + + /** Attempt to give a hint. **/ + { + const unsigned int n_targets = node_data->nClusterDatas + node_data->nSearchDatas; + char* target_names[n_targets]; + for (unsigned int i = 0u; i < node_data->nClusterDatas; i++) + target_names[i] = node_data->ClusterDatas[i]->Name; + for (unsigned int i = 0u; i < node_data->nSearchDatas; i++) + target_names[i + node_data->nClusterDatas] = node_data->SearchDatas[i]->Name; + ci_TryHint(target_name, target_names, n_targets); + } /** Error cleanup. 
**/ err_free_node: - ci_FreeNodeData(node_data); - nmFree(driver_data, sizeof(DriverData)); + if (node_data != NULL) ci_FreeNodeData(node_data); + if (driver_data != NULL) nmFree(driver_data, sizeof(DriverData)); err: + mssErrorf(0, "Cluster", + "Failed to open cluster file \"%s\" at: %s", + ci_file_name(parent), ci_file_path(parent) + ); return NULL; } @@ -2362,6 +2930,7 @@ void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt) { tprintf("Warning: clusterOpenQuery() is under active development.\n"); pClusterQuery cluster_query = check_ptr(nmMalloc(sizeof(ClusterQuery))); + if (cluster_query == NULL) return NULL; cluster_query->DriverData = (pDriverData)inf_v; cluster_query->RowIndex = 0u; return cluster_query; @@ -2404,9 +2973,9 @@ void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) if (ret != 0) { mssErrorf(0, "Cluster", "Internal cluster computation failed."); - return NULL; + return NULL; } - data_amount = cluster_query->DriverData->NodeData->SourceData->nVectors; + data_amount = target->nClusters; break; } @@ -2418,7 +2987,7 @@ void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) if (ret != 0) { mssErrorf(0, "Cluster", "Internal search computation failed."); - return NULL; + return NULL; } data_amount = target->nDups; break; @@ -2442,12 +3011,13 @@ void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) if (cluster_query->RowIndex >= data_amount) return NULL; /** Create the result struct. **/ - pDriverData driver_data = nmMalloc(sizeof(DriverData)); - assert(driver_data != NULL); + pDriverData driver_data = check_ptr(nmMalloc(sizeof(DriverData))); + if (driver_data == NULL) return NULL; memcpy(driver_data, cluster_query->DriverData, sizeof(DriverData)); driver_data->TargetType = new_target_type; driver_data->TargetIndex = cluster_query->RowIndex++; + /** Success. 
**/ return driver_data; } @@ -2531,9 +3101,7 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) if (strcmp(attr_name, "num_clusters") == 0 || strcmp(attr_name, "max_iterations") == 0) return DATA_T_INTEGER; - if (strcmp(attr_name, "min_improvement") == 0 - || strcmp(attr_name, "average_similarity") == 0 - || strcmp(attr_name, "size") == 0) + if (strcmp(attr_name, "min_improvement") == 0) return DATA_T_DOUBLE; break; @@ -2546,12 +3114,8 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) break; case TARGET_CLUSTER_ENTRY: - if (strcmp(attr_name, "id") == 0) - return DATA_T_INTEGER; - if (strcmp(attr_name, "val") == 0) - return DATA_T_STRING; - if (strcmp(attr_name, "sim") == 0) - return DATA_T_DOUBLE; + if (strcmp(attr_name, "items") == 0) + return DATA_T_STRINGVEC; break; case TARGET_SEARCH_ENTRY: @@ -2604,9 +3168,9 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val return DATA_T_UNAVAILABLE; } - /** Performance shortcut for frequently requested attributes: val, val1, val2, and sim. **/ + /** Performance shortcut for frequently requested attributes: val1, val2, and sim. 
**/ if ( - (attr_name[0] == 'v' && datatype == DATA_T_STRING) /* val, val1, val2 : String */ + (attr_name[0] == 'v' && datatype == DATA_T_STRING) /* val1, val2 : String */ || (attr_name[0] == 's' && datatype == DATA_T_DOUBLE) /* sim : double */ ) goto handle_targets; @@ -2771,15 +3335,15 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val } if (strcmp(attr_name, "num_clusters") == 0) { - if (target->NumClusters > INT_MAX) - fprintf(stderr, "Warning: num_clusters value of %u exceeds INT_MAX (%d).\n", target->NumClusters, INT_MAX); - val->Integer = (int)target->NumClusters; + if (target->nClusters > INT_MAX) + fprintf(stderr, "Warning: 'num_clusters' value of %u exceeds INT_MAX (%d).\n", target->nClusters, INT_MAX); + val->Integer = (int)target->nClusters; return 0; } if (strcmp(attr_name, "max_iterations") == 0) { if (target->MaxIterations > INT_MAX) - fprintf(stderr, "Warning: max_iterations value of %u exceeds INT_MAX (%d).\n", target->MaxIterations, INT_MAX); + fprintf(stderr, "Warning: 'max_iterations' value of %u exceeds INT_MAX (%d).\n", target->MaxIterations, INT_MAX); val->Integer = (int)target->MaxIterations; return 0; } @@ -2788,12 +3352,6 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val val->Double = target->MinImprovement; return 0; } - if (strcmp(attr_name, "average_similarity") == 0 - || strcmp(attr_name, "size") == 0) - { - mssErrorf(1, "Cluster", "average_similarity is not implemented."); - return -1; - } break; } @@ -2822,21 +3380,22 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val { pClusterData target = (pClusterData)driver_data->TargetData; - if (strcmp(attr_name, "id") == 0) - { - val->Integer = (int)target->Labels[driver_data->TargetIndex]; - return 0; - } - if (strcmp(attr_name, "val") == 0) + if (strcmp(attr_name, "items") == 0) { - val->String = driver_data->NodeData->SourceData->Data[driver_data->TargetIndex]; + /** Static variable to 
prevent leaking StringVec from previous calls. **/ + static StringVec* vec = NULL; + if (vec != NULL) nmFree(vec, sizeof(StringVec)); + + /** Allocate and initiallize the requested data. **/ + pCluster target_cluster = &target->Clusters[driver_data->TargetIndex]; + val->StringVec = vec = check_ptr(nmMalloc(sizeof(StringVec))); + if (val->StringVec == NULL) return -1; + val->StringVec->nStrings = target_cluster->Size; + val->StringVec->Strings = target_cluster->Strings; + + /** Success. **/ return 0; } - if (strcmp(attr_name, "sim") == 0) - { - mssErrorf(1, "Cluster", "Cluster entry similarity is not supported."); - return -1; - } break; } @@ -2847,23 +3406,29 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val if (strcmp(attr_name, "id1") == 0) { - val->Integer = (int)target_dup->id1; + unsigned int value = target_dup->id1; + if (value > INT_MAX) + fprintf(stderr, "Warning: id1 value of %u exceeds INT_MAX (%d).\n", value, INT_MAX); + val->Integer = (int)value; return 0; } if (strcmp(attr_name, "id2") == 0) { - val->Integer = (int)target_dup->id2; + unsigned int value = target_dup->id2; + if (value > INT_MAX) + fprintf(stderr, "Warning: id2 value of %u exceeds INT_MAX (%d).\n", value, INT_MAX); + val->Integer = (int)value; return 0; } if (strcmp(attr_name, "val1") == 0) { - val->String = driver_data->NodeData->SourceData->Data[target_dup->id1]; + val->String = driver_data->NodeData->SourceData->Strings[target_dup->id1]; // val->Integer = (int)target_dup->id1; return 0; } if (strcmp(attr_name, "val2") == 0) { - val->String = driver_data->NodeData->SourceData->Data[target_dup->id2]; + val->String = driver_data->NodeData->SourceData->Strings[target_dup->id2]; // val->Integer = (int)target_dup->id2; return 0; } @@ -2914,9 +3479,10 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Malloc presentation hints struct. 
**/ pObjPresentationHints hints = check_ptr(nmMalloc(sizeof(ObjPresentationHints))); + if (hints == NULL) goto err; memset(hints, 0, sizeof(ObjPresentationHints)); - /** Hints that are the same for all fields */ + /** Hints that are the same for all fields **/ hints->GroupID = -1; hints->VisualLength2 = 1; hints->Style |= OBJ_PH_STYLE_READONLY | OBJ_PH_STYLE_CREATEONLY | OBJ_PH_STYLE_NOTNULL; @@ -2924,18 +3490,20 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Temporary param list for compiling expressions. **/ pParamObjects tmp_list = check_ptr(expCreateParamList()); + if (hints == NULL) goto err; + /** Search for the requested attribute through attributes common to all instances. **/ if (strcmp(attr_name, "name") == 0) { hints->Length = 32; hints->VisualLength = 16; - goto end; + goto success; } if (strcmp(attr_name, "annotation") == 0) { hints->Length = 36; hints->VisualLength = 36; - goto end; + goto success; } if (strcmp(attr_name, "inner_type") == 0 || strcmp(attr_name, "inner_type") == 0 @@ -2944,18 +3512,24 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb || strcmp(attr_name, "last_modification") == 0) { hints->VisualLength = 30; - goto end; + goto success; } + /** Handle date created and date computed. */ if (strcmp(attr_name, "date_created") == 0 || strcmp(attr_name, "date_computed") == 0) { - hints->Length = 24; - hints->VisualLength = 20; - hints->Format = nmSysStrdup("datetime"); - goto end; + if (driver_data->TargetType == TARGET_CLUSTER || driver_data->TargetType == TARGET_SEARCH) + { + hints->Length = 24; + hints->VisualLength = 20; + hints->Format = nmSysStrdup("datetime"); + goto success; + } + else goto unknown_attribute; } + /** Search by target type. 
**/ switch (driver_data->TargetType) { case TARGET_ROOT: @@ -2964,14 +3538,14 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb hints->Length = _PC_PATH_MAX; hints->VisualLength = 64; hints->FriendlyName = "Source Path"; - goto end; + goto success; } if (strcmp(attr_name, "attr_name") == 0) { hints->Length = 255; hints->VisualLength = 32; hints->FriendlyName = "Attribute Name"; - goto end; + goto success; } break; @@ -2986,7 +3560,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb hints->Length = 8; hints->VisualLength = 4; hints->FriendlyName = nmSysStrdup("Number of Clusters"); - goto end; + goto success; } if (strcmp(attr_name, "min_improvement") == 0) { @@ -2999,7 +3573,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb hints->Length = 16; hints->VisualLength = 8; hints->FriendlyName = nmSysStrdup("Minimum Improvement Threshold"); - goto end; + goto success; } if (strcmp(attr_name, "max_iterations") == 0) { @@ -3012,31 +3586,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb hints->Length = 8; hints->VisualLength = 4; hints->FriendlyName = nmSysStrdup("Maximum Number of Clustering Iterations"); - goto end; - } - if (strcmp(attr_name, "average_similarity") == 0) - { - /** Min and max values. **/ - hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); - hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); - - /** Other hints. **/ - hints->Length = 16; - hints->VisualLength = 8; - hints->FriendlyName = nmSysStrdup("Average Similarity"); - goto end; - } - if (strcmp(attr_name, "size") == 0) - { - /** Min and max values. **/ - hints->MinValue = expCompileExpression("0.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); - hints->MaxValue = expCompileExpression("1.0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); - - /** Other hints. 
**/ - hints->Length = 16; - hints->VisualLength = 8; - hints->FriendlyName = nmSysStrdup("Average Cluster Size"); - goto end; + goto success; } if (strcmp(attr_name, "algorithm") == 0) { @@ -3059,7 +3609,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb hints->Length = 24; hints->VisualLength = 20; hints->FriendlyName = nmSysStrdup("Clustering Algorithm"); - goto end; + goto success; } /** Fall-through: Start of overlapping region. **/ @@ -3085,7 +3635,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb hints->Length = 32; hints->VisualLength = 20; hints->FriendlyName = nmSysStrdup("Similarity Measure"); - goto end; + goto success; } /** End of overlapping region. **/ @@ -3096,7 +3646,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb hints->Length = 64; hints->VisualLength = 32; hints->FriendlyName = nmSysStrdup("Source Cluster Name"); - goto end; + goto success; } if (strcmp(attr_name, "threshold") == 0) { @@ -3108,7 +3658,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb hints->Length = 16; hints->VisualLength = 8; hints->FriendlyName = nmSysStrdup("Similarity Threshold"); - goto end; + goto success; } break; @@ -3132,7 +3682,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Other hints. 
**/ hints->Length = 8; hints->VisualLength = 4; - goto end; + goto success; } if (strcmp(attr_name, "val") == 0) { @@ -3140,7 +3690,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb hints->Length = 255; hints->VisualLength = 32; hints->FriendlyName = nmSysStrdup("Value"); - goto end; + goto success; } if (strcmp(attr_name, "sim") == 0) { @@ -3152,7 +3702,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb hints->Length = 16; hints->VisualLength = 8; hints->FriendlyName = nmSysStrdup("Similarity"); - goto end; + goto success; } break; } @@ -3177,7 +3727,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Other hints. **/ hints->Length = 8; hints->VisualLength = 4; - goto end; + goto success; } if (strcmp(attr_name, "val1") == 0 || strcmp(attr_name, "val2") == 0) { @@ -3185,7 +3735,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb hints->Length = 255; hints->VisualLength = 32; hints->FriendlyName = nmSysStrdup("Value"); - goto end; + goto success; } if (strcmp(attr_name, "sim") == 0) { @@ -3197,7 +3747,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb hints->Length = 16; hints->VisualLength = 8; hints->FriendlyName = nmSysStrdup("Similarity"); - goto end; + goto success; } break; } @@ -3207,14 +3757,26 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb goto err; } + /** Unknown attribute. **/ + unknown_attribute:; + char* name; + clusterGetAttrValue(inf_v, "name", DATA_T_STRING, POD(&name), NULL); + mssErrorf(1, "Cluster", + "Unknown attribute '%s' for cluster object %s (target type: %u, \"%s\").", + attr_name, driver_data->NodeData->SourceData->Name, driver_data->TargetType, name + ); - end: - check(expFreeParamList(tmp_list)); - return hints; - + /** Error cleanup. **/ err: + if (tmp_list != NULL) check(expFreeParamList(tmp_list)); /* Failure ignored. 
*/ + if (hints != NULL) nmFree(hints, sizeof(ObjPresentationHints)); mssErrorf(0, "Cluster", "Failed execute generate presentation hints."); return NULL; + + /** Success. **/ + success: + check(expFreeParamList(tmp_list)); /* Failure ignored. */ + return hints; } @@ -3255,11 +3817,11 @@ char* clusterGetNextAttr(void* inf_v, pObjTrxTree oxt) tprintf("%u) is under active development.\n", i); switch (driver_data->TargetType) { - case TARGET_ROOT: return (i < nATTR_ROOT) ? ATTR_ROOT[i] : END_OF_ATTRIBUTES; - case TARGET_CLUSTER: return (i < nATTR_CLUSTER) ? ATTR_CLUSTER[i] : END_OF_ATTRIBUTES; - case TARGET_SEARCH: return (i < nATTR_SEARCH) ? ATTR_SEARCH[i] : END_OF_ATTRIBUTES; - case TARGET_CLUSTER_ENTRY: return (i < nATTR_CLUSTER_ENTRY) ? ATTR_CLUSTER_ENTRY[i] : END_OF_ATTRIBUTES; - case TARGET_SEARCH_ENTRY: return (i < nATTR_SEARCH_ENTRY) ? ATTR_SEARCH_ENTRY[i] : END_OF_ATTRIBUTES; + case TARGET_ROOT: return ATTR_ROOT[i]; + case TARGET_CLUSTER: return ATTR_CLUSTER[i]; + case TARGET_SEARCH: return ATTR_SEARCH[i]; + case TARGET_CLUSTER_ENTRY: return ATTR_CLUSTER_ENTRY[i]; + case TARGET_SEARCH_ENTRY: return ATTR_SEARCH_ENTRY[i]; default: mssErrorf(1, "Cluster", "Unknown target type %u.", driver_data->TargetType); return NULL; @@ -3292,7 +3854,7 @@ int clusterInfo(void* inf_v, pObjectInfo info) switch (driver_data->TargetType) { case TARGET_ROOT: - info->nSubobjects = node_data->nClusters + node_data->nSearches; + info->nSubobjects = node_data->nClusterDatas + node_data->nSearchDatas; info->Flags |= OBJ_INFO_F_CAN_HAVE_SUBOBJ; info->Flags |= OBJ_INFO_F_SUBOBJ_CNT_KNOWN; info->Flags |= (info->nSubobjects > 0) ? 
OBJ_INFO_F_HAS_SUBOBJ : OBJ_INFO_F_NO_SUBOBJ; @@ -3384,11 +3946,9 @@ char* clusterGetFirstMethod(void* inf_v, pObjTrxTree oxt) ***/ char* clusterGetNextMethod(void* inf_v, pObjTrxTree oxt) { - tprintf("Warning: clusterGetNextMethod("); + tprintf("Warning: clusterGetNextMethod() is under active development."); pDriverData driver_data = (pDriverData)inf_v; - const unsigned int i = driver_data->TargetMethodIndex++; - tprintf("%u) is under active development.\n", i); - return (i < nMETHOD_NAME) ? METHOD_NAME[i] : END_OF_METHODS; + return METHOD_NAME[driver_data->TargetMethodIndex++]; } @@ -3439,7 +3999,9 @@ static int ci_PrintEntry(pXHashEntry entry, void* arg) bytes = ci_SizeOfSearchData(search_data); break; } - default: assert(false); + default: + mssErrorf(0, "Cluster", "Unknown type_id %u.", *type_id_ptr); + return -1; } /** Increment total bytes. **/ @@ -3539,62 +4101,70 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx if (strcmp(param->String, "show") == 0) { show = true; - path = ci_file_path(driver_data->NodeData->Obj); + path = ci_file_path(driver_data->NodeData->Parent); } if (strcmp(param->String, "show_all") == 0) show = true; if (show) { /** Print cache info table. 
**/ + int ret = 0; unsigned int i = 1u, source_bytes = 0u, cluster_bytes = 0u, search_bytes = 0u; + bool failed = false; printf("\nShowing cache for "); if (path != NULL) printf("\"%s\":\n", path); else printf("all files:\n"); printf("%-8s %-16s %-12s %s\n", "Type", "Name", "Size", "Cache Entry Key"); - xhForEach(&ClusterCaches.SourceCache, ci_PrintEntry, (void*[]){&i, &source_bytes, path}); i++; - xhForEach(&ClusterCaches.ClusterCache, ci_PrintEntry, (void*[]){&i, &cluster_bytes, path}); i++; - xhForEach(&ClusterCaches.SearchCache, ci_PrintEntry, (void*[]){&i, &search_bytes, path}); i++; + failed |= !check(xhForEach( + &ClusterDriverCaches.SourceDataCache, + ci_PrintEntry, + (void*[]){&i, &source_bytes, path} + )); + i++; + failed |= !check(xhForEach( + &ClusterDriverCaches.ClusterDataCache, + ci_PrintEntry, + (void*[]){&i, &cluster_bytes, path} + )); + i++; + failed |= !check(xhForEach( + &ClusterDriverCaches.SearchDataCache, + ci_PrintEntry, + (void*[]){&i, &search_bytes, path} + )); + if (failed) + { + mssErrorf(0, "Cluster", "Unexpected error occured while showhing caches."); + ret = -1; + } /** Print stats. **/ char buf[16]; printf("\nCache Stats:\n"); printf("%-8s %-4s %-12s\n", "", "#", "Total Size"); - const int n_sources = ClusterCaches.SourceCache.nItems; - snprint_bytes(buf, sizeof(buf), source_bytes); - printf("%-8s %-4d %-12s\n", "Source", n_sources, buf); - const int n_clusters = ClusterCaches.ClusterCache.nItems; - snprint_bytes(buf, sizeof(buf), cluster_bytes); - printf("%-8s %-4d %-12s\n", "Cluster", n_clusters, buf); - const int n_searches = ClusterCaches.SearchCache.nItems; - snprint_bytes(buf, sizeof(buf), search_bytes); - printf("%-8s %-4d %-12s\n", "Search", n_searches, buf); - snprint_bytes(buf, sizeof(buf), source_bytes + cluster_bytes + search_bytes); - printf("%-8s %-4d %-12s\n\n", "Total", n_sources + n_clusters + n_searches, buf); - return 0; - } - - /** drop and drop_all. 
**/ - bool drop = false; - if (strcmp(param->String, "drop") == 0) - { - show = true; - path = ci_file_path(driver_data->NodeData->Obj); + printf("%-8s %-4d %-12s\n", "Source", ClusterDriverCaches.SourceDataCache.nItems, snprint_bytes(buf, sizeof(buf), source_bytes)); + printf("%-8s %-4d %-12s\n", "Cluster", ClusterDriverCaches.ClusterDataCache.nItems, snprint_bytes(buf, sizeof(buf), cluster_bytes)); + printf("%-8s %-4d %-12s\n", "Search", ClusterDriverCaches.SearchDataCache.nItems, snprint_bytes(buf, sizeof(buf), search_bytes)); + printf("%-8s %-4d %-12s\n\n", "Total", + ClusterDriverCaches.SourceDataCache.nItems + ClusterDriverCaches.ClusterDataCache.nItems + ClusterDriverCaches.SearchDataCache.nItems, + snprint_bytes(buf, sizeof(buf), source_bytes + cluster_bytes + search_bytes) + ); + + return ret; } - if (strcmp(param->String, "drop_all") == 0) drop = true; - if (drop) + /** drop_all. **/ + if (strcmp(param->String, "drop_all") == 0) { + /** Print info. **/ printf("\nDropping cache for "); if (path != NULL) printf("\"%s\":\n", path); else printf("all files:\n"); - /*** Free caches in reverse of the order they are created in case - *** cached data relies on its source during the freeing process. - ***/ - xhClearKeySafe(&ClusterCaches.SearchCache, ci_CacheFreeSearch, path); - xhClearKeySafe(&ClusterCaches.ClusterCache, ci_CacheFreeCluster, path); - xhClearKeySafe(&ClusterCaches.SourceCache, ci_CacheFreeSourceData, path); - printf("Cache dropped.\n"); + /** Free caches. **/ + ci_FreeCaches(); + + tprintf("Cache dropped.\n"); return 0; } @@ -3613,7 +4183,23 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx mssErrorf(0, "Cluster", "Failed execute command."); return -1; } - + + +/*** Frees caches when the driver is unregistered. + *** + *** This function does not free either of the given parameters. + *** + *** @param object_driver The driver instance which was registered being unregistered. 
(unused) + *** @param session The session being closed. (unused) + *** Returns + ***/ +int clusterUnregister(pObjDriver object_driver, pObjSession session) + { + ci_FreeCaches(); + return 0; + } + + /** ================ Unimplemented Functions ================ **/ /** ANCHOR[id=unimplemented] **/ // LINK #functions @@ -3682,38 +4268,33 @@ int clusterCommit(void* inf_v, pObjTrxTree *oxt) *** - Initializing global data needed for the driver. *** *** @returns 0 if successful, or - *** a negative value if an error occured. + *** -1 if an error occured. ***/ int clusterInitialize(void) { - int ret; /** Initialize library. **/ ca_init(); /** Allocate the driver. **/ - pObjDriver drv = (pObjDriver)nmMalloc(sizeof(ObjDriver)); - if (drv == NULL) return -1; + pObjDriver drv = (pObjDriver)check_ptr(nmMalloc(sizeof(ObjDriver))); + if (drv == NULL) goto err; memset(drv, 0, sizeof(ObjDriver)); /** Initialize globals. **/ - memset(&ClusterCaches, 0, sizeof(ClusterCaches)); - ret = xhInit(&ClusterCaches.SourceCache, 251, 0); - if (ret < 0) return ret; - ret = xhInit(&ClusterCaches.ClusterCache, 251, 0); - if (ret < 0) return ret; - ret = xhInit(&ClusterCaches.SearchCache, 251, 0); - if (ret < 0) return ret; + memset(&ClusterDriverCaches, 0, sizeof(ClusterDriverCaches)); + if (!check(xhInit(&ClusterDriverCaches.SourceDataCache, 251, 0))) goto err; + if (!check(xhInit(&ClusterDriverCaches.ClusterDataCache, 251, 0))) goto err; + if (!check(xhInit(&ClusterDriverCaches.SearchDataCache, 251, 0))) goto err; /** Setup the structure. 
**/ - strcpy(drv->Name, "clu - Clustering Driver"); - drv->Capabilities = OBJDRV_C_TRANS | OBJDRV_C_FULLQUERY; // OBJDRV_C_TRANS | OBJDRV_C_FULLQUERY; - ret = xaInit(&(drv->RootContentTypes), 1); - if (ret < 0) return ret; - ret = xaAddItem(&(drv->RootContentTypes), "system/cluster"); - if (ret < 0) return ret; + if (check_ptr(strcpy(drv->Name, "clu - Clustering Driver")) == NULL) goto err; + if (!check(xaInit(&(drv->RootContentTypes), 1))) goto err; + if (!check_neg(xaAddItem(&(drv->RootContentTypes), "system/cluster"))) goto err; + drv->Capabilities = OBJDRV_C_TRANS | OBJDRV_C_FULLQUERY; /* TODO: Greg, double check these are correct. */ /** Setup the function references. **/ drv->Open = clusterOpen; + drv->OpenChild = NULL; drv->Close = clusterClose; drv->Create = clusterCreate; drv->Delete = clusterDelete; @@ -3734,9 +4315,12 @@ int clusterInitialize(void) drv->GetFirstMethod = clusterGetFirstMethod; drv->GetNextMethod = clusterGetNextMethod; drv->ExecuteMethod = clusterExecuteMethod; - drv->Commit = clusterCommit; - drv->Info = clusterInfo; drv->PresentationHints = clusterPresentationHints; + drv->Info = clusterInfo; + drv->Commit = clusterCommit; + drv->GetQueryCoverageMask = NULL; + drv->GetQueryIdentityPath = NULL; + drv->Unregister = clusterUnregister; /** Register some structures. **/ nmRegister(sizeof(ClusterData), "ClusterData"); @@ -3745,37 +4329,37 @@ int clusterInitialize(void) nmRegister(sizeof(NodeData), "ClusterNodeData"); nmRegister(sizeof(DriverData), "ClusterDriverData"); nmRegister(sizeof(ClusterQuery), "ClusterQuery"); - nmRegister(sizeof(ClusterCaches), "ClusterCaches"); + nmRegister(sizeof(ClusterDriverCaches), "ClusterDriverCaches"); /** Print debug size info. 
**/ - char cluster_size_buf[16]; - char search_size_buf[16]; - char source_size_buf[16]; - char node_size_buf[16]; - char driver_size_buf[16]; - char query_size_buf[16]; - char caches_size_buf[16]; + char buf1[16], buf2[16], buf3[16], buf4[16], buf5[16], buf6[16], buf7[16]; tprintf( "Cluster driver struct sizes:\n" + " > sizeof(SourceData): %s\n" " > sizeof(ClusterData): %s\n" " > sizeof(SearchData): %s\n" - " > sizeof(SourceData): %s\n" " > sizeof(NodeData): %s\n" " > sizeof(DriverData): %s\n" " > sizeof(ClusterQuery): %s\n" - " > sizeof(ClusterCaches): %s\n", - snprint_bytes(cluster_size_buf, sizeof(cluster_size_buf), sizeof(ClusterData)), - snprint_bytes(search_size_buf, sizeof(search_size_buf), sizeof(SearchData)), - snprint_bytes(source_size_buf, sizeof(source_size_buf), sizeof(SourceData)), - snprint_bytes(node_size_buf, sizeof(node_size_buf), sizeof(NodeData)), - snprint_bytes(driver_size_buf, sizeof(driver_size_buf), sizeof(DriverData)), - snprint_bytes(query_size_buf, sizeof(query_size_buf), sizeof(ClusterQuery)), - snprint_bytes(caches_size_buf, sizeof(caches_size_buf), sizeof(ClusterCaches)) + " > sizeof(ClusterDriverCaches): %s\n", + snprint_bytes(buf1, sizeof(buf1), sizeof(SourceData)), + snprint_bytes(buf2, sizeof(buf2), sizeof(ClusterData)), + snprint_bytes(buf3, sizeof(buf3), sizeof(SearchData)), + snprint_bytes(buf4, sizeof(buf4), sizeof(NodeData)), + snprint_bytes(buf5, sizeof(buf5), sizeof(DriverData)), + snprint_bytes(buf6, sizeof(buf6), sizeof(ClusterQuery)), + snprint_bytes(buf7, sizeof(buf7), sizeof(ClusterDriverCaches)) ); /** Register the driver. **/ - ret = objRegisterDriver(drv); - if (ret < 0) return ret; + if (!check(objRegisterDriver(drv))) goto err; + /** Success. **/ return 0; + + /** Error cleanup. 
**/ + err: + if (drv != NULL) nmFree(drv, sizeof(ObjDriver)); + fprintf(stderr, "Error: Failed to initialize cluster driver.\n"); + return -1; } diff --git a/centrallix/test_obj.c b/centrallix/test_obj.c index 5ef492de3..6b09a8586 100644 --- a/centrallix/test_obj.c +++ b/centrallix/test_obj.c @@ -1271,6 +1271,13 @@ testobj_do_cmd(pObjSession s, char* cmd, int batch_mode, pLxSession inp_lx) } else if (!strcmp(cmdname,"quit")) { + /** Loop through each driver and call their unregister handler, if they have one. **/ + for (unsigned int i = 0u; i < OSYS.Drivers.nItems; i++) + { + pObjDriver cur = (pObjDriver)OSYS.Drivers.Items[i]; + if (cur->Unregister != NULL) cur->Unregister(cur, s); + } + mlxCloseSession(ls); return 1; } From a861fb4a2f55241e5bfa1c2ac15c888ff2eccab8 Mon Sep 17 00:00:00 2001 From: Israel Date: Tue, 28 Oct 2025 10:58:21 -0600 Subject: [PATCH 05/30] Upgrade memory handling in the cluster driver. Improve edge case logic in comparison functions. Remove unregister driver function. Clean up exp_functions.c. --- centrallix-lib/include/clusters.h | 8 + centrallix-lib/src/clusters.c | 18 +- centrallix/expression/exp_functions.c | 2151 ++----------------------- centrallix/include/obj.h | 1 - centrallix/osdrivers/objdrv_cluster.c | 246 ++- centrallix/test_obj.c | 7 - 6 files changed, 326 insertions(+), 2105 deletions(-) diff --git a/centrallix-lib/include/clusters.h b/centrallix-lib/include/clusters.h index d8b7f97c6..bddd0800c 100644 --- a/centrallix-lib/include/clusters.h +++ b/centrallix-lib/include/clusters.h @@ -79,6 +79,14 @@ int ca_kmeans( unsigned int* labels, double* vector_sims); +/** Vector helper macros. **/ +#define ca_is_empty(vector) (vector[0] == -CA_NUM_DIMS) +#define ca_has_no_pairs(vector) \ + ({ \ + __typeof__ (vector) _v = (vector); \ + _v[0] == -172 && _v[1] == 11 && _v[2] == -78; \ + }) + /** Comparison functions, for ca_search(). 
**/ double ca_cos_compare(void* v1, void* v2); double ca_lev_compare(void* str1, void* str2); diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index 90599269c..864ff36eb 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -514,12 +514,18 @@ static unsigned int edit_dist(const char* str1, const char* str2, const size_t s ***/ double ca_cos_compare(void* v1, void* v2) { - /** Input validation checks. **/ - if (v1 == NULL || v2 == NULL) return 0.0; if (v1 == v2) return 1.0; + /** Input validation checks. **/ + const pVector vec1 = v1, vec2 = v2; + const bool v1_empty = (vec1 == NULL || ca_is_empty(vec1) || ca_has_no_pairs(vec1)); + const bool v2_empty = (vec2 == NULL || ca_is_empty(vec2) || ca_has_no_pairs(vec2)); + if (v1_empty && v2_empty) return 1.0; + if (v1_empty && !v2_empty) return 0.0; + if (!v1_empty && v2_empty) return 0.0; + /** Return the sparse similarity. **/ - return sparse_similarity((const pVector)v1, (const pVector)v2); + return sparse_similarity(vec1, vec2); } /*** Compares two strings using their levenstien edit distance to compute a @@ -544,12 +550,12 @@ double ca_lev_compare(void* str1, void* str2) if (str1 == NULL || str2 == NULL) return 0.0; if (str1 == str2) return 1.0; - /** Compute string length. **/ + /** Handle string length. **/ const size_t len1 = strlen(str1); const size_t len2 = strlen(str2); - - /** Empty strings are identical, avoiding a divide by zero. */ if (len1 == 0lu && len2 == 0lu) return 1.0; + if (len1 != 0lu && len2 == 0lu) return 0.0; + if (len1 != 0lu && len2 != 0lu) return 0.0; /** Compute levenshtein edit distance. 
**/ const unsigned int dist = edit_dist((const char*)str1, (const char*)str2, len1, len2); diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index a8e16ecc7..4f9ffa563 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -67,6 +67,7 @@ #include "cxlib/mtlexer.h" #include "cxlib/mtsession.h" #include "cxlib/newmalloc.h" +#include "cxlib/util.h" #include "cxlib/xarray.h" #include "cxlib/xhash.h" #include "cxss/cxss.h" @@ -3996,2021 +3997,177 @@ int exp_fn_nth(pExpression tree, pParamObjects objlist, pExpression i0, pExpress return 0; } -/* See centrallix-sysdoc/string_comparison.md for more information. */ -int exp_fn_levenshtein(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) - { - if (!i0 || !i1) +/*** Computes cosine or levenshtien similarity between two strings. These two + *** tasks have a large amount of overlapping logic (mostly error checking), + *** so doing them with one function greatly reduces code duplocation. + *** + *** @param tree The tree resulting from this function. + *** @param objlist The evaluation "scope", including available variables. + *** @param maybe_str1 Possibly the first string. + *** @param maybe_str2 Possibly the second string. + *** @param u1 Unused parameter. + *** @param is_cos Whether to compute cosine or levenshtien. + ***/ +static int exp_fn_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1, bool is_cos) + { + const char fn_name[] = "cos_cmp"; + + /** Check number of arguments. **/ + const int num_params = tree->Children.nItems; + if (num_params != 2) { - mssError(1,"EXP","levenshtein() requires two parameters"); - return -1; + mssErrorf(1, "EXP", "%s(?) 
expects 2 parameters, got %d parameters.", fn_name, num_params); + return -1; } - - if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL)) + if (maybe_str1 == NULL || maybe_str2 == NULL || u1 != NULL) { - tree->DataType = DATA_T_INTEGER; - tree->Flags |= EXPR_F_NULL; - return 0; + mssErrorf(1, "EXP", "%s(?) expects 2 parameters.", fn_name); + return -1; } - - if ((i0->DataType != DATA_T_STRING) || (i1->DataType != DATA_T_STRING)) + + /** Magic checks. **/ + ASSERTMAGIC(tree, MGK_EXPRESSION); + ASSERTMAGIC(maybe_str1, MGK_EXPRESSION); + ASSERTMAGIC(maybe_str2, MGK_EXPRESSION); + + /** Check object list. **/ + if (objlist == NULL) { - mssError(1,"EXP","levenshtein() requires two string parameters"); - return -1; + mssErrorf(1, "EXP", "%s(\?\?\?) no object list?", fn_name); + return -1; } - - // for all i and j, d[i,j] will hold the Levenshtein distance between - // the first i characters of s and the first j characters of t - int length1 = strlen(i0->String); - int length2 = strlen(i1->String); - //int levMatrix[length1+1][length2+1]; - int (*levMatrix)[length1+1][length2+1] = nmSysMalloc(sizeof(*levMatrix)); - int i; - int j; - //set each element in d to zero - for (i = 0; i < length1; i++) - { - for (j = 0; j < length2; j++) - { - (*levMatrix)[i][j] = 0; - } - } - - // source prefixes can be transformed into empty string by - // dropping all characters - for (i = 0; i <= length1; i++) - { - (*levMatrix)[i][0] = i; - } - - // target prefixes can be reached from empty source prefix - // by inserting every character - for (j = 0; j <= length2; j++) - { - (*levMatrix)[0][j] = j; - } + ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); - for (i = 1; i <= length1; i++) - { - for (j = 1; j <= length2; j++) - { - if (i0->String[i-1] == i1->String[j-1]) - { - (*levMatrix)[i][j] = (*levMatrix)[i-1][j-1]; - } - else - { - int value1 = (*levMatrix)[i - 1][j] + 1; - int value2 = (*levMatrix)[i][j-1] + 1; - int value3 = (*levMatrix)[i-1][j-1] + 1; - (*levMatrix)[i][j] = (value1 
< value2) ? - ((value1 < value3) ? value1 : value3) : - (value2 < value3) ? value2 : value3; - } - } - } - tree->DataType = DATA_T_INTEGER; - tree->Integer = (*levMatrix)[length1][length2]; - nmSysFree(levMatrix); - return 0; - } - -/* See centrallix-sysdoc/string_comparison.md for more information. */ -int exp_fn_lev_compare(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) - { - - if (!i0 || !i1) + /** Extract str1. **/ + if (maybe_str1->Flags & EXPR_F_NULL) { - mssError(1,"EXP","lev_compare() requires two or three parameters"); - return -1; + mssErrorf(1, "EXP", "%s(NULL, ...) str1 cannot be NULL.", fn_name); + return -1; } - - if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL) || (i2 && (i2->Flags & EXPR_F_NULL))) + if (maybe_str1->DataType != DATA_T_STRING) { - tree->DataType = DATA_T_DOUBLE; - tree->Flags |= EXPR_F_NULL; - return 0; + mssErrorf(1, "EXP", "%s(\?\?\?, ..) str1 should be a string.", fn_name); + return -1; } - - if ((i0->DataType != DATA_T_STRING) || (i1->DataType != DATA_T_STRING) || (i2 && i2->DataType != DATA_T_INTEGER)) + char* str1 = maybe_str1->String; + if (str1 == NULL) { - mssError(1,"EXP","lev_compare() requires two string and one optional integer parameters"); - return -1; + mssErrorf(1, "EXP", + "%s(nothing?, ...) expected string from str1 (of datatype DataType = " + "DATA_T_STRING), but the String was NULL or did not exist!", + fn_name + ); + return -1; } - exp_fn_levenshtein(tree, objlist, i0, i1, i2); - //!!! I am not checking for errors here, because IN THEORY we have two strings... if we don't, big uh-oh. - int lev_dist = tree->Integer; - - int length1 = strlen(i0->String); - int length2 = strlen(i1->String); - - double clamped_dist = 1.0; - - if (length1 == 0 || length2 == 0) //empty string + /** Extract str2. **/ + if (maybe_str2->Flags & EXPR_F_NULL) { - clamped_dist = 0.5; - } - else //normal case - { - int max_len = (length1 > length2) ? 
length1 : length2; - clamped_dist = ((double) lev_dist) / max_len; - - if (abs(length1-length2) == lev_dist) //only inserts. Maybe substring. - { - clamped_dist /= 2; - } - - //use max_field_width if it was provided as a sensible value. If not, don't use it. - double max_field_width = i2?(i2->Integer):0; - if (max_field_width && max_field_width >= max_len) { - double mod = (lev_dist + max_field_width * 3/4) / max_field_width; - if (mod < 1) { //don't make clamped_dist bigger - clamped_dist *= mod; - } - } + mssErrorf(1, "EXP", "%s(\"%s\", NULL) str2 cannot be NULL.", fn_name, str1); + return -1; } - - - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = 1.0 - clamped_dist; - return 0; -} - -/* - * hash_char_pair - * This method creates an vector table index based a given character pair. The characters are represented - * as their ASCII code points. - * - * Parameters: - * num1 : first ASCII code point (double) - * num2 : second ASCII code point (double) - * - * Returns: - * vector table index (integer) - */ -int exp_fn_i_hash_char_pair(double num1, double num2) - { - int func_result = round(((num1 * num1 * num1) + (num2 * num2 * num2)) * ((num1+1)/(num2+1))) -1; - return func_result % EXP_VECTOR_TABLE_SIZE; - } - - -/* - * exp_fn_i_frequency_table - * This method creates a vector frequency table based on a string of characters. - * - * Parameters: - * table : integer pointer to vector frequency table (unsigned short) - * term : the string of characters (char*) - * - * Returns: - * 0 - * - * LINK ../../centrallix-sysdoc/string_comparison.md#exp_fn_i_frequency_table - */ -int exp_fn_i_frequency_table(unsigned short *table, char *term) - { - int i; - // Initialize hash table with 0 values - for (i = 0; i < EXP_VECTOR_TABLE_SIZE; i++) + if (maybe_str2->DataType != DATA_T_STRING) { - table[i] = 0; + mssErrorf(1, "EXP", "%s(\"%s\", \?\?\?) 
str2 should be a string.", fn_name, str1); + return -1; } - - int j = -1; - for(i = 0; i < strlen(term) + 1; i++) + char* str2 = maybe_str2->String; + if (str2 == NULL) { - // If latter character is punctuation or whitespace, skip it - if (ispunct(term[i]) || isspace(term[i])) - { - continue; - } - - double temp1 = 0.0; - double temp2 = 0.0; - - // If previous character is null - if (j == -1) - { - temp1 = 96; - } - - // Else character is not null - else - { - temp1 = (int)tolower(term[j]); - } - - // If latter character is null - if (i == strlen(term)) + mssErrorf(1, "EXP", + "%s(\"%s\", nothing?) expected string from str2 (of datatype DataType = " + "DATA_T_STRING), but the String was NULL or did not exist!", + fn_name, str1 + ); + return -1; + } + + /** Handle either cos_cmp or lev_cmp. **/ + if (is_cos) + { /* cos_cmp */ + int ret; + + /** Build vectors. **/ + const pVector v1 = check_ptr(ca_build_vector(str1)); + const pVector v2 = check_ptr(ca_build_vector(str2)); + if (v1 == NULL || v2 == NULL) { - temp2 = 96; + mssErrorf(1, "EXP", + "%s(\"%s\", \"%s\") - Failed to build vectors.", + fn_name, str1, str2 + ); + ret = -1; } - - // Else character is not null else { - temp2 = (int)tolower(term[i]); - } - - // Else character is not null // If either character is a number, reassign the code point - if (temp1 >= 48 && temp1 <= 57) - { - temp1 += 75; - } - - if (temp2 >= 48 && temp2 <= 57) - { - temp2 += 75; + tree->Types.Double = ca_cos_compare(v1, v2); + tree->DataType = DATA_T_DOUBLE; + ret = 0; } - - // Hash the character pair into an index - int index = exp_fn_i_hash_char_pair(temp1, temp2); - - // Increment Frequency Table value by number from 0 to 13 - table[index] += ((unsigned short)temp1 + (unsigned short)temp2) % 13 + 1; - - // Move j up to latter character before incrementing i - j = i; - + + if (v1 != NULL) ca_free_vector(v1); + if (v2 != NULL) ca_free_vector(v2); + return ret; } - - return 0; - - } - -/* - * exp_fn_i_dot_product - * This method 
calculautes the dot product of two vectors. - * - * Parameters: - * dot_product : the place where the result is stored (double) - * r_freq_table1 : the first vector (unsigned short) - * r_freq_table2 : the second vector (unsigned short) - * - * Returns: - * 0 - * - * LINK ../../centrallix-sysdoc/string_comparison.md#exp_fn_i_dot_product - */ -int exp_fn_i_dot_product(double *dot_product, unsigned short *r_freq_table1, unsigned short *r_freq_table2) - { - int i; - for (i = 0; i < EXP_VECTOR_TABLE_SIZE; i++) - { - *dot_product = *dot_product + ((double)r_freq_table1[i] * (double)r_freq_table2[i]); + else + { /* lev_cmp */ + tree->Types.Double = ca_lev_compare(str1, str2); + tree->DataType = DATA_T_DOUBLE; + return 0; } - return 0; + return -1; } -/* - * exp_fn_i_magnitude - * This method calculates the magnitude of a vector - * - * Parameters: - * magnitude : the place where the result is stored (double) - * r_freq_table : the vector (unsigned short) - * - * LINK ../../centrallix-sysdoc/string_comparison.md#exp_fn_i_magnitude - */ -int exp_fn_i_magnitude(double *magnitude, unsigned short *r_freq_table) +/*** Computes cosine similarity between two strings. + *** + *** @param tree The tree resulting from this function. + *** @param objlist The evaluation "scope", including available variables. + *** @param maybe_str1 Possibly the first string. + *** @param maybe_str2 Possibly the second string. + *** @param u1 Unused parameter. 
+ ***/ +int exp_fn_cos_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) { - int i; - for (i = 0; i < EXP_VECTOR_TABLE_SIZE; i++) - { - *magnitude = *magnitude + ((double)r_freq_table[i] * (double)r_freq_table[i]); - } - *magnitude = sqrt(*magnitude); - return 0; + return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, true); } -/* - * exp_fn_cos_compare - * This method calculates the cosine similarity of two vector frequency tables - * See centrallix-sysdoc/string_comparison.md for more information. - * - * Parameters: - * tree : structure where output is stored - * objlist : unused - * i0 : first data entry (pExpression) - * i1 : second data entry (pExpression) - * i2 : unused - * - * Returns: - * 0 - * - * LINK ../../centrallix-sysdoc/string_comparison.md#exp_fn_similarity - */ -int exp_fn_cos_compare(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) +/*** Computes levenshtein similarity by normalizing the levenshtein edit + *** distance between two strings with the length of the longer string. + *** + *** @param tree The tree resulting from this function. + *** @param objlist The evaluation "scope", including available variables. + *** @param maybe_str1 Possibly the first string. + *** @param maybe_str2 Possibly the second string. + *** @param u1 Unused parameter. 
+ ***/ +int exp_fn_lev_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) { - // Ensure function receives two non-null parameters - if (!i0 || !i1) - { - mssError(1,"EXP","cos_compare() requires two parameter."); - return -1; - } - - // Ensure value passed in both parameters is not null - if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL)) - { - tree->DataType = DATA_T_DOUBLE; - tree->Flags |= EXPR_F_NULL; - return 0; - } - - // Ensure both parameters contain string values - if ((i0->DataType != DATA_T_STRING) || (i1->DataType != DATA_T_STRING)) - { - mssError(1,"EXP","cos_compare() requires two string parameters."); - return -1; - } - - //If the two strings are identical, don't bother running cosine compare - if (strcmp(i0->String, i1->String) == 0) - { - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = 1.0; - return 0; - } - - // Allocate frequency tables (arrays of integers) for each term - unsigned short *table1 = nmMalloc(EXP_VECTOR_TABLE_SIZE * sizeof(unsigned short)); - unsigned short *table2 = nmMalloc(EXP_VECTOR_TABLE_SIZE * sizeof(unsigned short)); - - if (table1 == NULL || table2 == NULL) - { - mssError(1,"EXP","Memory allocation failed."); - return -1; - } - - // Calculate frequency tables for each term - exp_fn_i_frequency_table(table1, i0->String); - exp_fn_i_frequency_table(table2, i1->String); - - // Calculate dot product - double dot_product = 0; - exp_fn_i_dot_product(&dot_product, table1, table2); - - // Calculate magnitudes of each relative frequency vector - double magnitude1 = 0; - double magnitude2 = 0; - exp_fn_i_magnitude(&magnitude1, table1); - exp_fn_i_magnitude(&magnitude2, table2); - - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = dot_product / (magnitude1 * magnitude2); - nmFree(table1, EXP_VECTOR_TABLE_SIZE * sizeof(unsigned short)); - nmFree(table2, EXP_VECTOR_TABLE_SIZE * sizeof(unsigned short)); - - return 0; + return exp_fn_cmp(tree, objlist, 
maybe_str1, maybe_str2, u1, false); } -// /*** ========================= -// *** DUPE SECTION -// *** By: Israel Fuller -// *** Last Updated: September, 2025 -// *** -// *** This section of the file deals with finding duplocates. -// ***/ - -// /*** @brief Returns the smaller of two values. -// *** -// *** @param a The first value. -// *** @param b The second value. -// *** @return The smaller of the two values. -// *** -// *** @note This macro uses GNU C extensions and is type-safe. -// ***/ -// #define min(a, b) ({ \ -// __typeof__ (a) _a = (a); \ -// __typeof__ (b) _b = (b); \ -// (_a < _b) ? _a : _b; \ -// }) - -// /*** @brief Returns the larger of two values. -// *** -// *** @param a The first value. -// *** @param b The second value. -// *** @return The larger of the two values. -// *** -// *** @note This macro uses GNU C extensions and is type-safe. -// ***/ -// #define max(a, b) ({ \ -// __typeof__ (a) _a = (a); \ -// __typeof__ (b) _b = (b); \ -// (_a > _b) ? _a : _b; \ -// }) - -// /** The character used to create a pair with the first and last characters of a string. **/ -// #define EXP_BOUNDARY_CHAR ('a' - 1) - -// /*** Helpful error handling function. **/ -// void mssErrorf(int clr, char* module, const char* format, ...); - -// /*** Gets the hash, representing a pair of ASCII characters, represented by unsigned ints. -// *** -// *** @param num1 The first character in the pair. -// *** @param num1 The second character in the pair. -// *** @returns The resulting hash. 
-// ***/ -// unsigned int exp_fn_get_char_pair_hash(const unsigned int num1, const unsigned int num2) -// { -// if (num1 == EXP_BOUNDARY_CHAR && num2 == EXP_BOUNDARY_CHAR) -// { -// mssErrorf(1, "EXP", -// "exp_fn_get_char_pair_hash(%u, %u) - Warning: Pair of boundary characters.", -// num1, num2 -// ); -// } -// const double sum = (num1 * num1 * num1) + (num2 * num2 * num2); -// const double scale = ((double)num1 + 1.0) / ((double)num2 + 1.0); -// const unsigned int hash = (unsigned int)round(sum * scale) - 1u; -// return hash % EXP_NUM_DIMS; -// } - -// /*** Builds a vector using a string. -// *** -// *** Vectors are based on the frequencies of character pairs in the string. -// *** Space characters and punctuation characters (see code for list) are ignored, -// *** and all characters are converted to lowercase. Character 96, which is just -// *** before 'a' in the ASCII table (and maps to '`') is used to make pairs on the -// *** start and end of strings. The only supported characters for the passed char* -// *** are spaces, punctuation, uppercase and lowercase letters, and numbers. -// *** -// *** This results in the following modified ASCII table: -// *** ```csv -// *** #, char, #, char, #, char -// *** 97, a, 109, m, 121, y -// *** 98, b, 110, n, 122, z -// *** 99, c, 111, o, 123, 0 -// *** 100, d, 112, p, 124, 1 -// *** 101, e, 113, q, 125, 2 -// *** 102, f, 114, r, 126, 3 -// *** 103, g, 115, s, 127, 4 -// *** 104, h, 116, t, 128, 5 -// *** 105, i, 117, u, 129, 6 -// *** 106, j, 118, v, 130, 7 -// *** 107, k, 119, w, 131, 8 -// *** 108, l, 120, x, 132, 9 -// *** ``` -// *** Thus, any number from 96 (the start/end character) to 132 ('9') is a valid -// *** input to get_char_pair_hash(). -// *** -// *** After hashing each character pair, we add some number from 1 to 13 to the -// *** coresponding dimention. However, for most names, this results in a lot of -// *** zeros and a FEW positive numbers. 
Thus, after creating the dense vector, -// *** we convert it to a sparse vector in which a negative number replaces a run -// *** of that many zeros. Consider the following example: -// *** -// *** Dense Vector: `[1,0,0,0,3,0]` -// *** -// *** Sparse Vector: `[1,-3,3,-1]` -// *** -// *** Using these sparse vectors greatly reduces the required memory and gives -// *** aproximately an x5 boost to performance when traversing vectors, at the -// *** cost of more algorithmically complex code. -// *** -// *** @param str The string to be divided into pairs and hashed to make the vector. -// *** @returns The sparse vector built using the hashed character pairs. -// ***/ -// int* build_vector(char* str) { -// /** Allocate space for a dense vector. **/ -// unsigned int dense_vector[EXP_NUM_DIMS] = {0u}; - -// /** j is the former character, i is the latter. **/ -// const unsigned int num_chars = (unsigned int)strlen(str); -// for (unsigned int j = 65535u, i = 0u; i <= num_chars; i++) -// { -// /** isspace: space, \n, \v, \f, \r **/ -// if (isspace(str[i])) continue; - -// /** ispunct: !"#$%&'()*+,-./:;<=>?@[\]^_{|}~ **/ -// if (ispunct(str[i]) && str[i] != EXP_BOUNDARY_CHAR) continue; - -// /*** iscntrl (0-8): SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS -// *** (14-31): SO, SI, DLE, DC1-4, NAK, SYN, ETB, CAN -// *** EM, SUB, ESC, FS, GS, RS, US -// ***/ -// if (iscntrl(str[i]) && i != num_chars) { -// mssErrorf(1, "EXP", -// "build_vector(%s) - Warning: Skipping unknown character #%u.\n", -// str, (unsigned int)str[i] -// ); -// continue; -// } - -// /** First and last character should fall one before 'a' in the ASCII table. **/ -// unsigned int temp1 = (j == 65535u) ? EXP_BOUNDARY_CHAR : (unsigned int)tolower(str[j]); -// unsigned int temp2 = (i == num_chars) ? EXP_BOUNDARY_CHAR : (unsigned int)tolower(str[i]); - -// /** Shift numbers to the end of the lowercase letters. 
**/ -// if ('0' <= temp1 && temp1 <= '9') temp1 += 75u; -// if ('0' <= temp2 && temp2 <= '9') temp2 += 75u; - -// /** Hash the character pair into an index (dimension). **/ -// /** Note that temp will be between 97 ('a') and 132 ('9'). **/ -// unsigned int dim = exp_fn_get_char_pair_hash(temp1, temp2); - -// /** Increment the dimension of the dense vector by a number from 1 to 13. **/ -// dense_vector[dim] += (temp1 + temp2) % 13u + 1u; - -// j = i; -// } - -// /** Count how much space is needed for a sparse vector. **/ -// bool zero_prev = false; -// size_t size = 0u; -// for (unsigned int dim = 0u; dim < EXP_NUM_DIMS; dim++) -// { -// if (dense_vector[dim] == 0u) -// { -// size += (zero_prev) ? 0u : 1u; -// zero_prev = true; -// } -// else -// { -// size++; -// zero_prev = false; -// } -// } - -// /*** Check compression size. -// *** If this check fails, I doubt anything will break. However, the longest -// *** word I know (supercalifragilisticexpialidocious) has only 35 character -// *** pairs, so it shouldn't reach half this size (and it'd be even shorter -// *** if the hash generates at least one collision). -// *** -// *** Bad vector compression will result in degraded performace and increased -// *** memory usage, and likely also indicates a bug or modified assumption -// *** elsewhere in the code. -// *** -// *** If this warning is ever generated, it's definitely worth investigating. -// ***/ -// const size_t expected_max_size = 64u; -// if (size > expected_max_size) -// { -// mssErrorf(1, "EXP" -// "build_vector(%s) - Warning: Sparse vector larger than expected.\n" -// " > Size: %lu\n" -// " > #Dims: %u\n", -// str, -// size, -// EXP_NUM_DIMS -// ); -// } - -// /** Allocate space for sparse vector. 
**/ -// const size_t sparse_vector_size = size * sizeof(int); -// int* sparse_vector = (int*)nmSysMalloc(sparse_vector_size); -// if (sparse_vector == NULL) { -// mssErrorf(1, "EXP", -// "build_vector(%s) - nmSysMalloc(%lu) failed.", -// str, sparse_vector_size -// ); -// return NULL; -// } - -// /** Convert the dense vector above to a sparse vector. **/ -// unsigned int j = 0u, sparse_idx = 0u; -// while (j < EXP_NUM_DIMS) -// { -// if (dense_vector[j] == 0u) -// { -// /*** Count and store consecutive zeros, except the first one, -// *** which we already know is zero. -// ***/ -// unsigned int zero_count = 1u; -// j++; -// while (j < EXP_NUM_DIMS && dense_vector[j] == 0u) -// { -// zero_count++; -// j++; -// } -// sparse_vector[sparse_idx++] = (int)-zero_count; -// } -// else -// { -// /** Store the value. **/ -// sparse_vector[sparse_idx++] = (int)dense_vector[j++]; -// } -// } - -// return sparse_vector; -// } - -// /*** Compute the magnitude of a sparsely allocated vector. -// *** -// *** @param vector The vector. -// *** @returns The computed magnitude. -// ***/ -// double exp_fn_magnitude_sparse(const int* vector) -// { -// unsigned int magnitude = 0u; -// for (unsigned int i = 0u, dim = 0u; dim < EXP_NUM_DIMS;) -// { -// const int val = vector[i++]; - -// /** Negative val represents -val 0s in the array, so skip that many values. **/ -// if (val < 0) dim += (unsigned)(-val); - -// /** We have a param_value, so square it and add it to the magnitude. **/ -// else { magnitude += (unsigned)(val * val); dim++; } -// } -// return sqrt((double)magnitude); -// } - -// /*** Compute the magnitude of a densely allocated centroid. -// *** -// *** @param centroid The centroid. -// *** @returns The computed magnitude. 
-// ***/ -// double exp_fn_magnitude_dense(const double* centroid) -// { -// double magnitude = 0.0; -// for (int i = 0; i < EXP_NUM_DIMS; i++) -// magnitude += centroid[i] * centroid[i]; -// return sqrt(magnitude); -// } - -// /*** Parse a token from a sparsely allocated vector and write the param_value and -// *** number of remaining values to the passed locations. -// *** -// *** @param token The sparse vector token being parsed. -// *** @param remaining The location to save the remaining number of characters. -// *** @param param_value The location to save the param_value of the token. -// ***/ -// void exp_fn_parse_token(const int token, unsigned int* remaining, unsigned int* param_value) { -// if (token < 0) -// { -// /** This run contains -token zeros. **/ -// *remaining = (unsigned)(-token); -// *param_value = 0u; -// } -// else -// { -// /** This run contains one param_value. **/ -// *remaining = 1u; -// *param_value = (unsigned)(token); -// } -// } - -// /*** Calculate the similarity on sparcely allocated vectors. Comparing -// *** any string to an empty string should always return 0.5 (untested). -// *** -// *** @param v1 Sparse vector #1. -// *** @param v2 Sparse vector #2. -// *** @returns Similarity between 0 and 1 where -// *** 1 indicates identical and -// *** 0 indicates completely different. -// ***/ -// double exp_fn_sparse_similarity(const int* v1, const int* v2) -// { -// /** Calculate dot product. **/ -// unsigned int vec1_remaining = 0u, vec2_remaining = 0u; -// unsigned int dim = 0u, i1 = 0u, i2 = 0u, dot_product = 0u; -// while (dim < EXP_NUM_DIMS) -// { -// unsigned int val1 = 0u, val2 = 0u; -// if (vec1_remaining == 0u) exp_fn_parse_token(v1[i1++], &vec1_remaining, &val1); -// if (vec2_remaining == 0u) exp_fn_parse_token(v2[i2++], &vec2_remaining, &val2); - -// /*** Accumulate the dot_product. If either vector is 0 here, -// *** the total is 0 and this statement does nothing. 
-// ***/ -// dot_product += val1 * val2; - -// /** Consume overlap from both runs. **/ -// unsigned int overlap = min(vec1_remaining, vec2_remaining); -// vec1_remaining -= overlap; -// vec2_remaining -= overlap; -// dim += overlap; -// } - -// /** Optional optimization to speed up nonsimilar vectors. **/ -// if (dot_product == 0u) return 0.0; - -// /** Return the difference score. **/ -// return (double)dot_product / (exp_fn_magnitude_sparse(v1) * exp_fn_magnitude_sparse(v2)); -// } - -// /*** Calculate the difference on sparcely allocated vectors. Comparing -// *** any string to an empty string should always return 0.5 (untested). -// *** -// *** @param v1 Sparse vector #1. -// *** @param v2 Sparse vector #2. -// *** @returns Similarity between 0 and 1 where -// *** 1 indicates completely different and -// *** 0 indicates identical. -// ***/ -// #define exp_fn_sparse_dif(v1, v2) (1.0 - exp_fn_sparse_similarity(v1, v2)) - -// /*** Calculate the similarity between a sparsely allocated vector -// *** and a densely allocated centroid. Comparing any string to an -// *** empty string should always return 0.5 (untested). -// *** -// *** @param v1 Sparse vector #1. -// *** @param c1 Dense centroid #2. -// *** @returns Similarity between 0 and 1 where -// *** 1 indicates identical and -// *** 0 indicates completely different. -// ***/ -// double exp_fn_sparse_similarity_c(const int* v1, const double* c2) -// { -// /** Calculate dot product. **/ -// double dot_product = 0.0; -// for (unsigned int i = 0u, dim = 0u; dim < EXP_NUM_DIMS;) -// { -// const int val = v1[i++]; - -// /** Negative val represents -val 0s in the array, so skip that many values. **/ -// if (val < 0) dim += (unsigned)(-val); - -// /** We have a param_value, so square it and add it to the magnitude. **/ -// else dot_product += (double)val * c2[dim++]; -// } - -// /** Return the difference score. 
**/ -// return dot_product / (exp_fn_magnitude_sparse(v1) * exp_fn_magnitude_dense(c2)); -// } - -// /*** Calculate the difference between a sparsely allocated vector -// *** and a densely allocated centroid. Comparing any string to an -// *** empty string should always return 0.5 (untested). -// *** -// *** @param v1 Sparse vector #1. -// *** @param c1 Dense centroid #2. -// *** @returns Difference between 0 and 1 where -// *** 1 indicates completely different and -// *** 0 indicates identical. -// ***/ -// #define exp_fn_sparse_dif_c(v1, c2) (1.0 - exp_fn_sparse_similarity_c(v1, c2)) - -// /*** Calculate the average size of all clusters in a set of vectors. -// *** -// *** @param vectors The vectors of the dataset (allocated sparsely). -// *** @param num_vectors The number of vectors in the dataset. -// *** @param labels The clusters to which vectors are assigned. -// *** @param centroids The locations of the centroids (allocated densely). -// *** @param num_clusters The number of centroids (k). -// *** @returns The average cluster size. -// ***/ -// double exp_fn_get_cluster_size( -// int** vectors, -// const unsigned int num_vectors, -// unsigned int* labels, -// double centroids[][EXP_NUM_DIMS], -// const unsigned int num_clusters -// ) -// { -// double cluster_sums[num_clusters]; -// unsigned int cluster_counts[num_clusters]; -// for (unsigned int i = 0u; i < num_clusters; i++) -// cluster_sums[i] = 0.0; -// memset(cluster_counts, 0, sizeof(cluster_counts)); - -// /** Sum the difference from each vector to its cluster centroid. **/ -// for (unsigned int i = 0u; i < num_vectors; i++) -// { -// const unsigned int label = labels[i]; -// cluster_sums[label] += exp_fn_sparse_dif_c(vectors[i], centroids[label]); -// cluster_counts[label]++; -// } - -// /** Add up the average cluster size. 
**/ -// double cluster_total = 0.0; -// unsigned int num_valid_clusters = 0u; -// for (unsigned int label = 0u; label < num_clusters; label++) -// { -// const unsigned int cluster_count = cluster_counts[label]; -// if (cluster_count == 0u) continue; - -// cluster_total += cluster_sums[label] / cluster_count; -// num_valid_clusters++; -// } - -// /** Return average sizes. **/ -// return cluster_total / num_valid_clusters; -// } - -// /*** Compute the param_value for `k` (number of clusters), given a dataset of with -// *** a size of `n`. -// *** -// *** The following table shows data sizes vs.selected cluster size. In testing, -// *** these numbers tended to givea good balance of accuracy and dulocates detected. -// *** -// *** ```csv -// *** Data Size, Actual -// *** 10k, 12 -// *** 100k, 33 -// *** 1M, 67 -// *** 4M, 93 -// *** ``` -// *** -// *** This function is not intended for datasets smaller than (`n < ~2000`). -// *** These should be handled using complete search. -// *** -// *** LaTeX Notation: \log_{36}\left(n\right)^{3.1}-8 -// *** -// *** @param n The size of the dataset. -// *** @returns k, the number of clusters to use. -// *** -// *** Complexity: `O(1)` -// ***/ -// unsigned int exp_fn_compute_k(const unsigned int n) -// { -// return (unsigned)max(2, pow(log(n) / log(36), 3.2) - 8); -// } - -// /*** Executes the k-means clustering algorithm. Selects NUM_CLUSTERS random -// *** vectors as initial centroids. Then points are assigned to the nearest -// *** centroid, after which centroids are moved to the center of their points. -// *** -// *** @param vectors The vectors to cluster. -// *** @param num_vectors The number of vectors to cluster. -// *** @param labels Stores the final cluster identities of the vectors after -// *** clustering is completed. -// *** @param centroids Stores the locations of the centroids used for the clusters -// *** of the data. -// *** @param iterations The number of iterations that actually executed is stored -// *** here. 
Leave this NULL if you don't care. -// *** @param max_iter The max number of iterations. -// *** @param num_clusters The number of clusters to generate. -// *** -// *** @attention - Assumes: num_vectors is the length of vectors. -// *** @attention - Assumes: num_clusters is the length of labels. -// *** -// *** @attention - Issue: At larger numbers of clustering iterations, some -// *** clusters have a size of negative infinity. In this implementation, -// *** the bug is mitigated by setting a small number of max iterations, -// *** such as 16 instead of 100. -// *** @attention - Issue: Clusters do not apear to improve much after the first -// *** iteration, which puts the efficacy of the algorithm into question. This -// *** may be due to the uneven density of a typical dataset. However, the -// *** clusters still offer useful information. -// *** -// *** Complexity: -// *** -// *** - `O(kd + k + i*(k + n*(k+d) + kd))` -// *** -// *** - `O(kd + k + ik + ink + ind + ikd)` -// *** -// *** - `O(nk + nd)` -// ***/ -// void exp_fn_kmeans( -// int** vectors, -// const unsigned int num_vectors, -// unsigned int* labels, -// const unsigned int num_clusters, -// const unsigned int max_iter -// ) -// { -// // const size_t centroids_size = num_clusters * sizeof(double*); -// // const size_t centroid_size = EXP_NUM_DIMS * sizeof(double); -// // double** centroids = (double**)nmMalloc(centroids_size); -// // if (centroids == NULL) -// // { -// // fprintf(stderr, "exp_fn_kmeans() - nmMalloc(%u) failed.\n", centroids_size); -// // return; -// // } -// // for (int i = 0; i < num_clusters; i++) -// // { -// // double* centroid = centroids[i] = (double*)nmMalloc(centroid_size); -// // if (centroid == NULL) -// // { -// // fprintf(stderr, "exp_fn_kmeans() - nmMalloc(%u) failed.\n", centroid_size); -// // return; -// // } -// // memset(centroids[i], 0, centroid_size); -// // } -// double centroids[num_clusters][EXP_NUM_DIMS]; -// memset(centroids, 0, sizeof(centroids)); - -// /** 
Select random vectors to use as the initial centroids. **/ -// srand(time(NULL)); -// for (unsigned int i = 0u; i < num_clusters; i++) -// { -// // Pick a random vector. -// const unsigned int random_index = (unsigned int)rand() % num_vectors; - -// // Sparse copy the vector into a densely allocated centroid. -// double* centroid = centroids[i]; -// const int* vector = vectors[random_index]; -// for (unsigned int i = 0u, dim = 0u; dim < EXP_NUM_DIMS;) -// { -// const int token = vector[i++]; -// if (token > 0) centroid[dim++] = (double)token; -// else for (unsigned int j = 0u; j < -token; j++) centroid[dim++] = 0.0; -// } -// } - -// /** Allocate memory for new centroids. **/ -// double new_centroids[num_clusters][EXP_NUM_DIMS]; - -// /** Main exp_fn_kmeans loop. **/ -// double old_average_cluster_size = 1.0; -// unsigned int cluster_counts[num_clusters]; -// for (unsigned int iter = 0u; iter < max_iter; iter++) -// { -// bool changed = false; - -// /** Reset new centroids. **/ -// for (unsigned int i = 0u; i < num_clusters; i++) -// { -// cluster_counts[i] = 0u; -// for (unsigned int dim = 0; dim < EXP_NUM_DIMS; dim++) -// new_centroids[i][dim] = 0.0; -// } - -// /** Assign each point to the nearest centroid. **/ -// for (unsigned int i = 0u; i < num_vectors; i++) -// { -// const int* vector = vectors[i]; -// double min_dist = DBL_MAX; -// unsigned int best_centroid_label = 0u; - -// // Find nearest centroid. -// for (unsigned int j = 0u; j < num_clusters; j++) -// { -// const double dist = exp_fn_sparse_dif_c(vector, centroids[j]); -// if (dist < min_dist) -// { -// min_dist = dist; -// best_centroid_label = j; -// } -// } - -// /** Update label to new centroid, if necessary. **/ -// if (labels[i] != best_centroid_label) -// { -// labels[i] = best_centroid_label; -// changed = true; -// } - -// /** Accumulate values for new centroid calculation. 
**/ -// double* best_centroid = new_centroids[best_centroid_label]; -// for (unsigned int i = 0u, dim = 0u; dim < EXP_NUM_DIMS;) -// { -// const int val = vector[i++]; -// if (val < 0) dim += (unsigned)(-val); -// else best_centroid[dim++] += (double)val; -// } -// cluster_counts[best_centroid_label]++; -// } - -// /** Stop if centroids didn't change. **/ -// if (!changed) break; - -// /** Update centroids. **/ -// for (unsigned int i = 0u; i < num_clusters; i++) -// { -// if (cluster_counts[i] == 0u) continue; -// double* centroid = centroids[i]; -// const double* new_centroid = new_centroids[i]; -// const unsigned int cluster_count = cluster_counts[i]; -// for (unsigned int dim = 0u; dim < EXP_NUM_DIMS; dim++) -// centroid[dim] = new_centroid[dim] / cluster_count; -// } - -// /** Print cluster size for debugging. **/ -// const double average_cluster_size = exp_fn_get_cluster_size(vectors, num_vectors, labels, centroids, num_clusters); - -// /** Is there enough improvement? **/ -// const double improvement = old_average_cluster_size - average_cluster_size; -// if (improvement < KMEANS_IMPROVEMENT_THRESHOLD) break; -// old_average_cluster_size = average_cluster_size; -// } - -// // Free unused memory. -// // for (int i = 0; i < num_clusters; i++) { -// // nmFree(centroids[i], centroid_size); -// // } -// // nmFree(centroids, centroids_size); -// } - -// /** Duplocate information. **/ -// typedef struct -// { -// unsigned int id1; -// unsigned int id2; -// double similarity; -// } -// Dup, *pDup; - -// /*** Runs complete search to find duplocates if `num_vectors < MAX_COMPLETE_SEARCH` -// *** and runs a search using k-means clustering on larger amounts of data. -// *** -// *** @param vectors Array of precomputed frequency vectors for all dataset strings. -// *** @param num_vectors The number of vectors to be scanned. -// *** @param dupe_threshold The similarity threshold, below which dups are ignored. -// *** @returns The duplicates in pDup structs. 
-// ***/ -// pXArray lightning_search(int** vectors, const unsigned int num_vectors, const double dupe_threshold) -// { -// /** Allocate space for dups. **/ -// const size_t guess_size = num_vectors * 2u; -// pXArray dups = xaNew(guess_size); -// if (dups == NULL) -// { -// mssErrorf(1, "EXP", "lightning_search() - xaNew(%lu) failed.", guess_size); -// return NULL; -// } - -// /** Descide which algorithm to use. **/ -// if (num_vectors <= MAX_COMPLETE_SEARCH) -// { /** Do a complete search. **/ -// for (unsigned int i = 0u; i < num_vectors; i++) -// { -// const int* v1 = vectors[i]; -// for (unsigned int j = i + 1u; j < num_vectors; j++) -// { -// const int* v2 = vectors[j]; -// const double similarity = exp_fn_sparse_similarity(v1, v2); -// if (similarity > dupe_threshold) // Dup found! -// { -// Dup* dup = (Dup*)nmMalloc(sizeof(Dup)); -// if (dup == NULL) -// { -// mssErrorf(1, "EXP", "lightning_search() - nmMalloc(%lu) failed.", sizeof(Dup)); -// goto err_free_dups; -// } - -// dup->id1 = i; -// dup->id2 = j; -// dup->similarity = similarity; -// xaAddItem(dups, (void*)dup); -// } -// } -// } -// } -// else -// { /** Do a k-means search. **/ -// /** Define constants for the algorithm. **/ -// const unsigned int max_iter = 64u; /** Hardcode value because idk. **/ -// const unsigned int num_clusters = exp_fn_compute_k(num_vectors); - -// /** Allocate static memory for finding clusters. **/ -// unsigned int labels[num_vectors]; -// memset(labels, 0u, sizeof(labels)); - -// /** Execute kmeans clustering. **/ -// exp_fn_kmeans(vectors, num_vectors, labels, num_clusters, max_iter); - -// /** Find duplocates in clusters. 
**/ -// for (unsigned int i = 0u; i < num_vectors; i++) -// { -// const int* v1 = vectors[i]; -// const unsigned int label = labels[i]; -// for (unsigned int j = i + 1u; j < num_vectors; j++) -// { -// if (labels[j] != label) continue; -// const int* v2 = vectors[j]; -// const double similarity = exp_fn_sparse_similarity(v1, v2); -// if (similarity > dupe_threshold) /* Dup found! */ -// { -// Dup* dup = (Dup*)nmMalloc(sizeof(Dup)); -// if (dup == NULL) -// { -// mssErrorf(1, "EXP", -// "lightning_search() - nmMalloc(%lu) failed.", -// sizeof(Dup) -// ); -// goto err_free_dups; -// } - -// dup->id1 = i; -// dup->id2 = j; -// dup->similarity = similarity; -// xaAddItem(dups, (void*)dup); -// } -// } -// } -// } - -// /** Done **/ -// return dups; - -// /** Free dups. **/ -// err_free_dups:; -// const size_t num_dups = dups->nItems; -// for (unsigned int i = 0u; i < num_dups; i++) -// { -// nmFree(dups->Items[i], sizeof(Dup)); -// dups->Items[i] = NULL; -// } -// xaDeInit(dups); -// return NULL; -// } - -// /*** Computes Levenshtein distance between two strings. -// *** -// *** @param str1 The first string. -// *** @param str2 The second string. -// *** @param length1 The length of the first string. -// *** @param length1 The length of the first string. -// *** -// *** @attention - Tip: Pass 0 for the length of either string to infer it -// *** using the null terminating character. Thus, strings with no null -// *** terminator are supported if you pass explicit lengths. -// *** -// *** Complexity: O(length1 * length2). -// *** -// *** @see centrallix-sysdoc/string_comparison.md -// ***/ -// unsigned int exp_fn_edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length) -// { -// /*** lev_matrix: -// *** For all i and j, d[i][j] will hold the Levenshtein distance between -// *** the first i characters of s and the first j characters of t. 
-// *** -// *** As they say, no dynamic programming algorithm is complete without a -// *** matrix that you fill out and it has the answer in the final location. -// ***/ -// const size_t str1_len = (str1_length == 0u) ? strlen(str1) : str1_length; -// const size_t str2_len = (str2_length == 0u) ? strlen(str2) : str2_length; -// unsigned int lev_matrix[str1_len + 1][str2_len + 1]; - -// /*** Base case #0: -// *** Transforming an empty string into an empty string has 0 cost. -// ***/ -// lev_matrix[0][0] = 0u; - -// /*** Base case #1: -// *** Any source prefixe can be transformed into an empty string by -// *** dropping each character. -// ***/ -// for (unsigned int i = 1u; i <= str1_len; i++) -// lev_matrix[i][0] = i; - -// /*** Base case #2: -// *** Any target prefixes can be transformed into an empty string by -// *** inserting each character. -// ***/ -// for (unsigned int j = 1u; j <= str2_len; j++) -// lev_matrix[0][j] = j; - -// /** General Case **/ -// for (unsigned int i = 1u; i <= str1_len; i++) -// { -// for (unsigned int j = 1u; j <= str2_len; j++) -// { -// /** Equal characters need no changes. **/ -// if (str1[i - 1] == str2[j - 1]) -// lev_matrix[i][j] = lev_matrix[i - 1][j - 1]; - -// /*** We need to make a change, so use the opereration with the -// *** lowest cost out of delete, insert, replace, or swap. -// ***/ -// else -// { -// unsigned int cost_delete = lev_matrix[i - 1][j] + 1u; -// unsigned int cost_insert = lev_matrix[i][j - 1] + 1u; -// unsigned int cost_replace = lev_matrix[i-1][j-1] + 1u; - -// /** If a swap is possible, calculate the cost. **/ -// bool can_swap = ( -// i > 1 && j > 1 && -// str1[i - 1] == str2[j - 2] && -// str1[i - 2] == str2[j - 1] -// ); -// unsigned int cost_swap = (can_swap) ? lev_matrix[i - 2][j - 2] + 1 : UINT_MAX; - -// // Find the best operation. 
-// lev_matrix[i][j] = min(min(min(cost_delete, cost_insert), cost_replace), cost_swap); -// } -// } -// } - -// return lev_matrix[str1_len][str2_len]; -// } - -// /*** Runs complete search to find duplocates in phone numbers using the -// *** levenshtein min edit distance algorithm. -// *** -// *** @param dataset An array of characters for all dataset strings. -// *** @param dataset_size The number of phone numbers to be scanned. -// *** @param dupe_threshold The similarity threshold, below which dups are ignored. -// *** @returns The duplicates in pDup structs. -// ***/ -// pXArray phone_search(char dataset[][10u], const unsigned int dataset_size, const double dupe_threshold) -// { -// /** Allocate space for dups. **/ -// const size_t guess_size = dataset_size * 2u; -// pXArray dups = xaNew(guess_size); -// if (dups == NULL) -// { -// mssErrorf(1, "EXP", "phone_search() - xaNew(%lu) failed.", guess_size); -// return NULL; -// } - -// /** Search for dups using edit distance. **/ -// for (unsigned int i = 0u; i < dataset_size; i++) -// { -// const char* v1 = dataset[i]; -// for (unsigned int j = i + 1u; j < dataset_size; j++) -// { -// const char* v2 = dataset[j]; -// const unsigned int dist = exp_fn_edit_dist(v1, v2, 10u, 10u); -// const double similarity = (double)dist / 10.0; -// if (similarity > dupe_threshold) /* Dup found! */ -// { -// Dup* dup = (Dup*)nmMalloc(sizeof(Dup)); -// if (dup == NULL) -// { -// mssErrorf(1, "EXP", "phone_search() - nmMalloc(%lu) failed.", sizeof(Dup)); - -// /** Free data before returning. 
**/ -// const size_t num_dups = dups->nItems; -// for (unsigned int i = 0u; i < num_dups; i++) -// { -// void* dup = dups->Items[i]; -// nmFree(dup, sizeof(Dup)); -// } -// xaDeInit(dups); -// return NULL; -// } - -// dup->id1 = i; -// dup->id2 = j; -// dup->similarity = similarity; -// xaAddItem(dups, (void*)dup); -// } -// } -// } - -// return dups; -// } - -// /*** Usage: get_dups(, , ) -// *** data is assumed to contain only the following characters: -// *** (Data containing ` or control characters is undefined.) -// *** \n\v\f\r 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij -// *** klmnopqrstuvwxyz!"#$%&'()*+,-./:;<=>?@[\]^_{|}~ -// ***/ -// int exp_fn_get_dups_general(pExpression tree, pParamObjects objlist, pExpression maybe_dup_threshold, pExpression maybe_out_file_path, pExpression maybe_data, const char* fn_name, bool is_phone_numbers) -// { -// /** Check number of arguments. **/ -// if (!maybe_dup_threshold || !maybe_out_file_path || !maybe_data) -// { -// mssErrorf(1, "EXP", "%s(?) expects 3 parameters.", fn_name); -// return -1; -// } -// const int num_params = tree->Children.nItems; -// if (num_params != 3) -// { -// mssErrorf(1, "EXP", "%s(?) expects 3 parameter, got %d.", fn_name, num_params); -// return -1; -// } - -// /** Magic checks. **/ -// ASSERTMAGIC(tree, MGK_EXPRESSION); -// ASSERTMAGIC(maybe_dup_threshold, MGK_EXPRESSION); -// ASSERTMAGIC(maybe_out_file_path, MGK_EXPRESSION); -// ASSERTMAGIC(maybe_data, MGK_EXPRESSION); - -// /** Check object list. **/ -// if (!objlist) -// { -// mssErrorf(1, "EXP", "%s(\?\?\?) no object list?", fn_name); -// return -1; -// } -// ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); - -// /** Extract dup_threshold. **/ -// if (maybe_dup_threshold->Flags & EXPR_F_NULL) -// { -// mssErrorf(1, "EXP", "%s(NULL, ...) dup_threshold cannot be NULL.", fn_name); -// return -1; -// } -// if (maybe_dup_threshold->DataType != DATA_T_DOUBLE) -// { -// mssErrorf(1, "EXP", "%s(?, ...) 
dup_threshold must be a doube.", fn_name); -// return -1; -// } -// double dup_threshold = maybe_dup_threshold->Types.Double; -// if (isnan(dup_threshold)) -// { -// mssErrorf(1, "EXP", "%s(NAN, ...) dup_threshold cannot be NAN.", fn_name); -// return -1; -// } -// if (dup_threshold <= 0 || 1 <= dup_threshold) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, ...) dup_threshold must be between 0 and 1 (exclusive).", -// fn_name, dup_threshold -// ); -// return -1; -// } - -// /** Extract output file path. **/ -// if (maybe_out_file_path->Flags & EXPR_F_NULL) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, NULL, ...) out_file_path cannot be NULL.", -// fn_name, dup_threshold -// ); -// return -1; -// } -// if (maybe_out_file_path->DataType != DATA_T_STRING) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \?\?\?, ...) out_file_path should be a string.", -// fn_name, dup_threshold -// ); -// return -1; -// } -// char* out_file_path = maybe_out_file_path->String; -// if (out_file_path == NULL) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, nothing?, ...) expected string from out_file_path " -// "(of type DataType = DATA_T_STRING), but the String was NULL " -// "or did not exist!", -// fn_name, dup_threshold -// ); -// return -1; -// } -// size_t out_path_len = strlen(out_file_path); -// if (out_path_len == 0u) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", ...) out_file_path cannot be an empty string.", -// fn_name, dup_threshold, out_file_path -// ); -// return -1; -// } -// const size_t max_len = BUFSIZ - 48u; -// if (out_path_len >= max_len) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", ...) out_file_path length (%lu) > max length (%lu).", -// fn_name, dup_threshold, out_file_path, out_path_len, max_len -// ); -// return -1; -// } -// if (strncmp(out_file_path + (out_path_len - 4u), ".csv", 4u) != 0) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", ...) 
out_file_path must end in .csv, " -// "because the output file is a csv.", -// fn_name, dup_threshold, out_file_path -// ); -// return -1; -// } - -// /** Extract dataset string. **/ -// if (maybe_data->Flags & EXPR_F_NULL) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", NULL) data cannot be NULL.", -// fn_name, dup_threshold, out_file_path -// ); -// return -1; -// } -// if (maybe_data->DataType != DATA_T_STRING) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \?\?\?) data must be a string.", -// fn_name, dup_threshold, out_file_path -// ); -// return -1; -// } -// char* data = maybe_data->String; -// if (data == NULL) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \?\?\?) expected string from data " -// "(of type DataType = DATA_T_STRING), but the String " -// "was NULL or did not exist!", -// fn_name, dup_threshold, out_file_path -// ); -// return -1; -// } -// if (strlen(data) == 0u) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \"%s\") data cannot be an empty string.", -// fn_name, dup_threshold, out_file_path, data -// ); -// return -1; -// } - -// /** Check number of entries in the dataset. **/ -// size_t dataset_size = 1; -// for (char* buf = data; *buf != '\0'; buf++) -// if (*buf == SEPARATOR_CHAR) dataset_size++; - -// /** Verify dataset is reasonable size. **/ -// if (dataset_size == 1) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \"\?\?\?\") Expected data to contain multiple " -// "values separated by \""SEPARATOR"\", but data was: \"%s\"", -// fn_name, dup_threshold, out_file_path, data -// ); -// return -1; -// } - -// /** Parse strs out of the data into the dataset. 
**/ -// size_t count = 0u; -// char* token = strtok(data, SEPARATOR); -// char* dataset[dataset_size]; -// memset(dataset, 0, sizeof(dataset)); -// while (token && count < dataset_size) -// { -// char* new_token = strdup(token); -// if (new_token == NULL) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \"...\") Failed to copy token \"%s\" from data.", -// fn_name, dup_threshold, out_file_path, token -// ); -// goto err_free_dataset; -// } -// dataset[count++] = new_token; -// token = strtok(NULL, SEPARATOR); -// } - -// /** Allocate memory to store dups. **/ -// pXArray dups; - -// /** Handle phone numbers. **/ -// if (is_phone_numbers) -// { -// /*** Phone number strings are always 10 characters long. Thus, they -// *** are NOT NULL TERMINATED because we can assume the length. -// ***/ -// unsigned int num_phone_numbers = 0u; -// char phone_numbers[dataset_size][10u]; - -// /** Parse the dataset. **/ -// for (unsigned int i = 0u; i < dataset_size; i++) -// { -// char* maybe_phone_number = dataset[i]; - -// /** Verify length can be a valid phone number. **/ -// const size_t len = strlen(maybe_phone_number); -// if (len < 10u) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \"...\") \"Phone number\" (\"%s\") is too short. (skipped)", -// fn_name, dup_threshold, out_file_path, maybe_phone_number -// ); -// continue; -// } -// if (len > 18u) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \"...\") \"Phone number\" (\"%s\") is too long. (skipped)", -// fn_name, dup_threshold, out_file_path, maybe_phone_number -// ); -// continue; -// } - -// /** Parse phone number. **/ -// char buf[11u], cur_char = maybe_phone_number[0]; -// unsigned int j = ((cur_char == '+') ? 2u : -// ((cur_char == '1') ? 
1u : 0u)); -// unsigned int number_len = 0u; -// while (cur_char != '\0' && number_len <= 10u) -// { -// cur_char = maybe_phone_number[j]; - -// if ( -// cur_char == '-' || -// cur_char == ' ' || -// cur_char == '(' || -// cur_char == ')' -// ) continue; -// else if (!isdigit(cur_char)) -// { -// /** Unknown character. **/ -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \"...\") \"Phone number\" (\"%s\") contains unexpected character '%c'. (skipped)", -// fn_name, dup_threshold, out_file_path, maybe_phone_number, cur_char -// ); -// goto next_phone_number; -// } - -// /** Add the character to the phone number. */ -// buf[number_len] = cur_char; -// number_len++; - -// /** Advance to next number. **/ -// j++; -// } - -// /** Check number of digits. **/ -// if (number_len < 10u) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \"...\") \"Phone number\" (\"%s\") has less than 10 digits. (skipped)", -// fn_name, dup_threshold, out_file_path, maybe_phone_number -// ); -// continue; -// } -// if (number_len > 10u) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \"...\") \"Phone number\" (\"%s\") has more than 10 digits. (skipped)", -// fn_name, dup_threshold, out_file_path, maybe_phone_number -// ); -// continue; -// } - -// /** Copy valid phone number (with no null-terminator). **/ -// memcpy(phone_numbers[num_phone_numbers++], buf, 10u); - -// next_phone_number:; -// } - -// /** Invoke phone number search to find dups in the processed data. **/ -// dups = phone_search(phone_numbers, num_phone_numbers, dup_threshold); -// } - -// /** Handle text. **/ -// else -// { -// /** Build vectors from the strs in the dataset. 
**/ -// const size_t vectors_size = dataset_size * sizeof(int*); -// int** vectors = (int**)nmMalloc(vectors_size); -// if (vectors == NULL) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \"...\") - nmMalloc(%lu) failed.", -// fn_name, dup_threshold, out_file_path, vectors_size -// ); -// goto err_free_dataset; -// } -// for (size_t i = 0; i < dataset_size; i++) -// { -// const int* vector = vectors[i] = build_vector(dataset[i]); -// if (vector == NULL) -// { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \"...\") - build_vector(%s) failed.", -// fn_name, dup_threshold, out_file_path, dataset[i] -// ); -// goto err_free_vectors; -// } -// if (vector[0] == -EXP_NUM_DIMS) { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \"...\") - build_vector(%s) produced no character pairs.", -// fn_name, dup_threshold, out_file_path, dataset[i] -// ); -// goto err_free_vectors; -// } -// } - -// /** Invoke lightning search to find dups using the vectors. **/ -// dups = lightning_search(vectors, dataset_size, dup_threshold); -// if (dups == NULL) { -// mssErrorf(1, "EXP", -// "%s(%lg, \"%s\", \"...\") - lightning_search() failed.", -// fn_name, dup_threshold, out_file_path -// ); -// goto err_free_vectors; -// } - -// /** Free unused memory. **/ -// for (size_t i = 0; i < dataset_size; i++) -// { -// nmSysFree(vectors[i]); -// vectors[i] = NULL; -// } -// nmFree(vectors, vectors_size); -// vectors = NULL; -// goto search_done; - -// /** Free vectors, if needed. **/ -// err_free_vectors: -// if (vectors != NULL) -// { -// for (size_t i = 0; i < dataset_size; i++) -// { -// if (vectors[i] == NULL) break; -// nmSysFree(vectors[i]); -// vectors[i] = NULL; -// } -// nmFree(vectors, vectors_size); -// vectors = NULL; -// } -// goto err_free_dataset; - -// search_done:; -// } - -// /** Check number of dups found. **/ -// const int num_dups = dups->nItems; - -// // Hack where we hardcode the path to the root directory because trying to -// // track it down is way too hard. 
-// const char root_path[] = "/usr/local/src/cx-git/centrallix-os"; - -// /** Create output file path. **/ -// char out_path[BUFSIZ]; -// snprintf(memset(out_path, 0, sizeof(out_path)), sizeof(out_path), "%s/%s", root_path, out_file_path); - -// /** Write output file. **/ -// FILE* file = fopen(out_path, "w"); -// if (file == NULL) -// { -// perror("Failed to open file."); -// mssErrorf(1, "EXP", -// "%s(%lg, \"...\", ...) failed to open file: %s", -// fn_name, dup_threshold, out_path -// ); -// goto err_free_dups; -// } -// const int setvbuf_ret = setvbuf(file, NULL, _IOFBF, (1000 * 1000)); -// if (setvbuf_ret != 0) -// { -// perror("Failed to set buffering on file."); -// mssErrorf(1, "EXP", -// "%s(%lg, \"...\", ...) failed to set buffering on file: %d, %s", -// fn_name, dup_threshold, setvbuf_ret, out_path -// ); -// goto err_close_file; -// } - -// /** Write CSV header. **/ -// fprintf(file, "id1,id2,sim\n"); - -// /*** If no data was written, make sure there is at least one row in the -// *** output file since assuming this file has data makes the sql faster. -// ***/ -// if (num_dups == 0u) -// fprintf(file, "error,undefined,0.0\n"); - -// /** Write CSV data rows. **/ -// else -// { -// for (unsigned int i = 0u; i < num_dups; i++) -// { -// Dup* data = (Dup*)dups->Items[i]; -// fprintf(file, "%s,%s,%.8lf\n", dataset[data->id1], dataset[data->id2], data->similarity); -// nmFree(data, sizeof(Dup)); /* Free unused data. */ -// dups->Items[i] = NULL; -// } -// } - -// /** Free unused data. **/ -// for (unsigned int i = 0u; i < dataset_size; i++) -// { -// free(dataset[i]); -// dataset[i] = NULL; -// } -// xaDeInit(dups); -// dups = NULL; - -// /** Close file. 
**/ -// const int fclose_ret = fclose(file); -// if (fclose_ret != 0) -// { -// perror("Failed to close file."); -// mssErrorf(1, "EXP", -// "%s(%lg, \"...\") failed to close file: %d, %s", -// fn_name, dup_threshold, fclose_ret, out_path -// ); -// goto err_free_dataset; -// } -// file = NULL; - -// /** Success. **/ -// tree->DataType = DATA_T_INTEGER; -// tree->Integer = (int)num_dups; -// return 0; - -// /** Error cases. **/ - -// /** Close file, if needed. **/ -// err_close_file: -// if (file != NULL) -// { -// const int fclose_ret = fclose(file); -// if (fclose_ret != 0) -// { -// char dbl_buf[DBL_BUF_SIZE]; -// snprintf(dbl_buf, sizeof(dbl_buf), "%lg", dup_threshold); -// perror("Failed to close file."); -// mssErrorf(1, "EXP", -// "%s(%s, \"...\") failed to close file: %d, %s", -// fn_name, dbl_buf, fclose_ret, out_path -// ); -// } -// } - -// /** Free dups, if needed. **/ -// err_free_dups: -// if (dups != NULL) -// { -// for (unsigned int i = 0u; i < num_dups; i++) -// { -// nmFree(dups->Items[i], sizeof(Dup)); -// dups->Items[i] = NULL; -// } -// xaDeInit(dups); -// dups = NULL; -// } - -// /** Free dataset, if needed. **/ -// err_free_dataset: -// for (unsigned int i = 0u; i < dataset_size; i++) -// { -// if (dataset[i] == NULL) break; -// free(dataset[i]); -// dataset[i] = NULL; -// } - -// return -1; -// } - -// int exp_fn_get_dups(pExpression tree, pParamObjects objlist, pExpression p1, pExpression p2, pExpression p3) -// { -// return exp_fn_get_dups_general(tree, objlist, p1, p2, p3, "get_dups", false); -// } - -// int exp_fn_get_dups_phone(pExpression tree, pParamObjects objlist, pExpression p1, pExpression p2, pExpression p3) -// { -// return exp_fn_get_dups_general(tree, objlist, p1, p2, p3, "get_dups_phone", true); -// } - -// /** Magic values. 
**/ -// #define EXP_NUM_FIELDS 7 -// #define EXP_INDEX_FIRST_NAME 0 -// #define EXP_INDEX_FIRST_NAME_METAPHONE 1 -// #define EXP_INDEX_LAST_NAME 2 -// #define EXP_INDEX_LAST_NAME_METAPHONE 3 -// #define EXP_INDEX_EMAIL 4 -// #define EXP_INDEX_PHONE 5 -// #define EXP_INDEX_ADDRESS 6 - -// /** No-op function. **/ -// int exp_fn_do_nothing() { return 0; } - -// /*** Function to add parameters to private storage so that more than 3 parameters can be passed. -// *** Currently, doubles are the only supported param type. -// *** -// *** Usage: param(, , ) : R, -// *** where: V : Double -// *** -// *** @param tree Return param_value. -// *** @param objlist Function scope. -// *** @param maybe_array The 1st param, should be NULL or another call to param(). -// *** @param maybe_param_name The 2nd param, should be a string for the name of the param. -// *** @param maybe_param_value The 3rd param, should be the param_value of the param being set. -// ***/ -// int exp_fn_param(pExpression tree, pParamObjects objlist, pExpression maybe_param_name, pExpression maybe_param_value, pExpression maybe_array) { -// // Verify arg number. -// if (!maybe_param_name || !maybe_param_value) -// { -// mssErrorf(1, "EXP", "param(?) expects two or three parameters."); -// return -1; -// } - -// // Magic checks. -// ASSERTMAGIC(tree, MGK_EXPRESSION); -// ASSERTMAGIC(maybe_param_name, MGK_EXPRESSION); -// ASSERTMAGIC(maybe_param_value, MGK_EXPRESSION); -// ASSERTMAGIC(maybe_array, MGK_EXPRESSION); - -// // Check object list. -// if (!objlist) -// { -// mssErrorf(1, "EXP", "param(\?\?\?) no object list?"); -// return -1; -// } -// ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); - -// // Extract param name. -// if (maybe_param_name->Flags & EXPR_F_NULL) -// { -// mssErrorf(1, "EXP", "param(NULL, ...) param_name cannot be null."); -// return -1; -// } -// if (maybe_param_name->DataType != DATA_T_STRING) -// { -// mssErrorf(1, "EXP", "param(?, ...) 
param_name must be a string."); -// return -1; -// } -// const char* param_name = maybe_param_name->String; - -// // Extract param value. -// if (maybe_param_value->Flags & EXPR_F_NULL) -// { -// mssErrorf(1, "EXP", "param(\"%s\", NULL, ...) param_value cannot be null.", param_name); -// return -1; -// } -// if (maybe_param_value->DataType != DATA_T_DOUBLE) -// { -// mssErrorf(1, "EXP", "param(\"%s\", ?, ...) param_value must be a doube.", param_name); -// return -1; -// } -// double param_value = maybe_param_value->Types.Double; - -// // Verify the value being set. -// // TODO: Replace with hashmap. -// signed int index = -1; -// if (strcmp(param_name, "first_name") == 0) index = EXP_INDEX_FIRST_NAME; -// else if (strcmp(param_name, "first_name_metaphone") == 0) index = EXP_INDEX_FIRST_NAME_METAPHONE; -// else if (strcmp(param_name, "last_name") == 0) index = EXP_INDEX_LAST_NAME; -// else if (strcmp(param_name, "last_name_metaphone") == 0) index = EXP_INDEX_LAST_NAME_METAPHONE; -// else if (strcmp(param_name, "email") == 0) index = EXP_INDEX_EMAIL; -// else if (strcmp(param_name, "phone") == 0) index = EXP_INDEX_PHONE; -// else if (strcmp(param_name, "address") == 0) index = EXP_INDEX_ADDRESS; -// if (index == -1) -// { -// mssErrorf(1, "EXP", -// "param(\"%s\", %lf, ...) invalid field name %s.", -// param_name, param_value, param_name -// ); -// return -1; -// } - -// // Extract array. -// double* array; -// if (!maybe_array || maybe_array->Flags & EXPR_F_NULL) -// { -// const size_t size = EXP_NUM_FIELDS * sizeof(double); -// void* PrivateData = tree->PrivateData = memset(nmSysMalloc(size), 0, size); -// tree->PrivateDataFinalize = exp_fn_do_nothing; // DON'T FREE MY DATA UNTIL I'M READY. 
- -// array = (double*)PrivateData; -// for (unsigned int i = 0u; i < EXP_NUM_FIELDS; i++) array[i] = NAN; -// } -// else if ( -// maybe_array->DataType == DATA_T_ARRAY && -// maybe_array->PrivateData != NULL && -// !strcmp(maybe_array->Name, "param") -// ) -// { -// tree->PrivateData = maybe_array->PrivateData; -// tree->PrivateDataFinalize = exp_fn_do_nothing; // DON'T FREE MY DATA UNTIL I'M READY. -// array = (double*)maybe_array->PrivateData; -// } -// else -// { -// mssErrorf(1, "EXP", "param(\"%s\", %lf, ...) if provided, array must be from a call to param().", param_name, param_value); -// return -1; -// } - -// // Warn on previous data. -// double old_value = array[index]; -// if (!isnan(old_value)) -// { -// fprintf(stderr, -// "Warning: Overwriting field '%s'(@ index %d) with %lf (was %lf).\n", -// param_name, index, param_value, old_value -// ); -// } - -// // Set param_value. -// array[index] = param_value; - -// // Done -// tree->DataType = DATA_T_ARRAY; -// tree->Integer = 0; -// tree->Types.Double = 0.0; -// return 0; -// } - -// int exp_fn_get_sim(pExpression tree, pParamObjects objlist, pExpression maybe_fields, pExpression unused1, pExpression unused2) -// { -// if (!maybe_fields || unused1 || unused2) -// { -// mssErrorf(1, "EXP", "get_sim(param(...)) expects one parameter, from param()."); -// return -1; -// } - -// // Magic checks. -// ASSERTMAGIC(tree, MGK_EXPRESSION); -// ASSERTMAGIC(maybe_fields, MGK_EXPRESSION); - -// // Check object list. -// if (!objlist) -// { -// mssErrorf(1, "EXP", "get_sim(\?\?\?) no object list?"); -// return -1; -// } -// ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); - -// // Verify arg. -// if (maybe_fields->Flags & EXPR_F_NULL) -// { -// mssErrorf(1, "EXP", "get_sim(NULL) fields from param() cannot be NULL."); -// return -1; -// } -// if (maybe_fields->DataType != DATA_T_ARRAY || maybe_fields->PrivateData == NULL) -// { -// mssErrorf(1, "EXP", "get_sim(\?\?\?) 
expects arg 0 to be fields from a call to param()."); -// return -1; -// } - -// // Extract arg(s?). -// double* fields = (double*)maybe_fields->PrivateData; - -// const double first_name = fields[EXP_INDEX_FIRST_NAME]; -// if (isnan(first_name)) -// { -// mssErrorf(1, "EXP", "get_sim(...) first_name similarity not set."); -// return -1; -// } - -// const double first_name_metaphone = fields[EXP_INDEX_FIRST_NAME_METAPHONE]; -// if (isnan(first_name_metaphone)) -// { -// mssErrorf(1, "EXP", "get_sim(...) first_name_metaphone similarity not set."); -// return -1; -// } - -// const double last_name = fields[EXP_INDEX_LAST_NAME]; -// if (isnan(last_name)) -// { -// mssErrorf(1, "EXP", "get_sim(...) last_name similarity not set."); -// return -1; -// } - -// const double last_name_metaphone = fields[EXP_INDEX_LAST_NAME_METAPHONE]; -// if (isnan(last_name_metaphone)) -// { -// mssErrorf(1, "EXP", "get_sim(...) last_name_metaphone similarity not set."); -// return -1; -// } - -// const double email = fields[EXP_INDEX_EMAIL]; -// if (isnan(email)) -// { -// mssErrorf(1, "EXP", "get_sim(...) email similarity not set."); -// return -1; -// } - -// const double phone = fields[EXP_INDEX_PHONE]; -// if (isnan(phone)) -// { -// mssErrorf(1, "EXP", "get_sim(...) phone similarity not set."); -// return -1; -// } - -// const double address = fields[EXP_INDEX_ADDRESS]; -// if (isnan(address)) -// { -// mssErrorf(1, "EXP", "get_sim(...) address similarity not set."); -// return -1; -// } - -// char* primary; -// char* secondary; -// meta_double_metaphone("text", &primary, &secondary); -// printf("Primary: %s, secondary: %s\n", primary, secondary); - -// // Print args. 
-// printf( -// "Sims:\n" -// "\tfirst_name: %lf\n" -// "\tfirst_name_metaphone: %lf\n" -// "\tlast_name: %lf\n" -// "\tlast_name_metaphone: %lf\n" -// "\temail: %lf\n" -// "\tphone: %lf\n" -// "\taddress: %lf\n", -// first_name, -// first_name_metaphone, -// last_name, -// last_name_metaphone, -// email, -// phone, -// address -// ); - -// // Compute total. -// const double first_name_total = max(first_name * 1.0, first_name_metaphone * 0.9); -// const double last_name_total = max(last_name * 1.0, last_name_metaphone * 0.9); -// double total = (first_name_total * last_name_total) * 0.6 + email * 0.2 + address * 0.2; - -// // Clean up. -// nmSysFree(fields); - -// // Return total. -// tree->DataType = DATA_T_DOUBLE; -// tree->Types.Double = total; -// return 0; -// } - - +/*** Computes double metaphone. + *** + *** @param tree The tree resulting from this function. + *** @param objlist The evaluation "scope", including available variables. + *** @param maybe_str Possibly the string passed to double metaphone. + *** @param u1 Unused parameter. + *** @param u2 Unused parameter. + ***/ int exp_fn_double_metaphone(pExpression tree, pParamObjects objlist, pExpression maybe_str, pExpression u1, pExpression u2) { const char fn_name[] = "double_metaphone"; /** Check number of arguments. **/ - if (!maybe_str || u1 || u2) + const int num_params = tree->Children.nItems; + if (num_params != 1) { - mssErrorf(1, "EXP", "%s(?) expects 1 parameter.", fn_name); + mssErrorf(1, "EXP", "%s(?) expects 1 parameter, got %d parameters.", fn_name, num_params); return -1; } - const int num_params = tree->Children.nItems; - if (num_params != 1) + if (maybe_str == NULL || u1 != NULL || u2 != NULL) { - mssErrorf(1, "EXP", "%s(?) expects 1 parameter, got %d.", fn_name, num_params); + mssErrorf(1, "EXP", "%s(?) 
expects 1 parameter.", fn_name); return -1; } @@ -6019,7 +4176,7 @@ int exp_fn_double_metaphone(pExpression tree, pParamObjects objlist, pExpression ASSERTMAGIC(maybe_str, MGK_EXPRESSION); /** Check object list. **/ - if (!objlist) + if (objlist == NULL) { mssErrorf(1, "EXP", "%s(\?\?\?) no object list?", fn_name); return -1; @@ -6041,9 +4198,8 @@ int exp_fn_double_metaphone(pExpression tree, pParamObjects objlist, pExpression if (str == NULL) { mssErrorf(1, "EXP", - "%s(nothing?) expected string from str " - "(of type DataType = DATA_T_STRING), but the String " - "was NULL or did not exist!", + "%s(nothing?) expected string from str (of datatype DataType = " + "DATA_T_STRING), but the String was NULL or did not exist!", fn_name ); return -1; @@ -6056,18 +4212,15 @@ int exp_fn_double_metaphone(pExpression tree, pParamObjects objlist, pExpression } /** Compute DoubleMetaphone. **/ - char* primary; - char* secondary; - meta_double_metaphone( - str, - memset(&primary, 0, sizeof(primary)), - memset(&secondary, 0, sizeof(secondary)) - ); + char* primary = NULL; + char* secondary = NULL; + meta_double_metaphone(str, &primary, &secondary); /** Process result. **/ const size_t primary_length = strlen(primary); const size_t secondary_length = strlen(secondary); - char* result = nmSysMalloc(primary_length + 1u + secondary_length + 1u); + char* result = check_ptr(nmSysMalloc(primary_length + 1u + secondary_length + 1u)); + if (result == NULL) return -1; sprintf(result, "%s%c%s", primary, CA_BOUNDARY_CHAR, secondary); /** Return the result. **/ @@ -6076,13 +4229,6 @@ int exp_fn_double_metaphone(pExpression tree, pParamObjects objlist, pExpression return 0; } -// // Clean up. 
-// #undef min -// #undef max - -// // END OF DUPE SECTION -// // =================== - /* * exp_fn_argon2id * This method hashes a given password using the Argon2 algorithm (ID variant) @@ -6205,7 +4351,9 @@ int exp_fn_argon2id(pExpression tree, pParamObjects objlist, pExpression passwor int exp_internal_DefineFunctions() { - + /** Initialize library **/ + ca_init(); + /** Function list for EXPR_N_FUNCTION nodes **/ xhAdd(&EXP.Functions, "getdate", (char*)exp_fn_getdate); xhAdd(&EXP.Functions, "user_name", (char*)exp_fn_user_name); @@ -6260,9 +4408,6 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "log10", (char*)exp_fn_log10); xhAdd(&EXP.Functions, "power", (char*)exp_fn_power); xhAdd(&EXP.Functions, "pbkdf2", (char*)exp_fn_pbkdf2); - xhAdd(&EXP.Functions, "levenshtein", (char*)exp_fn_levenshtein); /* Only used in its own tests. */ - xhAdd(&EXP.Functions, "lev_compare", (char*)exp_fn_lev_compare); /* Only used in its own tests. */ - xhAdd(&EXP.Functions, "cos_compare", (char*)exp_fn_cos_compare); xhAdd(&EXP.Functions, "to_base64", (char*)exp_fn_to_base64); xhAdd(&EXP.Functions, "from_base64", (char*)exp_fn_from_base64); xhAdd(&EXP.Functions, "to_hex", (char*)exp_fn_to_hex); @@ -6271,19 +4416,19 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "argon2id",(char*)exp_fn_argon2id); /** Duplicate Detection **/ - // xhAdd(&EXP.Functions, "get_dups", (char*)exp_fn_get_dups); - // xhAdd(&EXP.Functions, "get_dups_phone", (char*)exp_fn_get_dups_phone); - // xhAdd(&EXP.Functions, "no_op", (char*)exp_fn_do_nothing); - // xhAdd(&EXP.Functions, "do_nothing", (char*)exp_fn_do_nothing); - // xhAdd(&EXP.Functions, "param", (char*)exp_fn_param); - // xhAdd(&EXP.Functions, "total_sim", (char*)exp_fn_get_sim); + xhAdd(&EXP.Functions, "cos_cmp", (char*)exp_fn_cos_cmp); + xhAdd(&EXP.Functions, "cos_compare", (char*)exp_fn_cos_cmp); + xhAdd(&EXP.Functions, "cosine_compare", (char*)exp_fn_cos_cmp); + xhAdd(&EXP.Functions, "lev_cmp", (char*)exp_fn_lev_cmp); + 
xhAdd(&EXP.Functions, "lev_compare", (char*)exp_fn_lev_cmp); + xhAdd(&EXP.Functions, "levenshtein_compare", (char*)exp_fn_lev_cmp); xhAdd(&EXP.Functions, "double_metaphone", (char*)exp_fn_double_metaphone); /** Windowing **/ xhAdd(&EXP.Functions, "row_number", (char*)exp_fn_row_number); xhAdd(&EXP.Functions, "dense_rank", (char*)exp_fn_dense_rank); xhAdd(&EXP.Functions, "lag", (char*)exp_fn_lag); - + /** Aggregate **/ xhAdd(&EXP.Functions, "count", (char*)exp_fn_count); xhAdd(&EXP.Functions, "avg", (char*)exp_fn_avg); @@ -6293,9 +4438,9 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "first", (char*)exp_fn_first); xhAdd(&EXP.Functions, "last", (char*)exp_fn_last); xhAdd(&EXP.Functions, "nth", (char*)exp_fn_nth); - + /** Reverse functions **/ xhAdd(&EXP.ReverseFunctions, "isnull", (char*)exp_fn_reverse_isnull); - + return 0; } diff --git a/centrallix/include/obj.h b/centrallix/include/obj.h index 54d4c988a..045d57f85 100644 --- a/centrallix/include/obj.h +++ b/centrallix/include/obj.h @@ -192,7 +192,6 @@ typedef struct _OSD int (*Commit)(); int (*GetQueryCoverageMask)(); int (*GetQueryIdentityPath)(); - int (*Unregister)(); } ObjDriver, *pObjDriver; diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index 4acfc8579..c10c6fca6 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -72,7 +72,7 @@ ***/ /** Pure Laziness **/ -// #define ENABLE_TPRINTF +#define ENABLE_TPRINTF /** Debugging **/ #ifndef ENABLE_TPRINTF @@ -243,7 +243,7 @@ void** ci_xaToTrimmedArray(pXArray arr) } const size_t arr_size = arr->nItems * sizeof(void*); - void** result = check_ptr(nmMalloc(arr_size)); + void** result = check_ptr(nmSysMalloc(arr_size)); memcpy(result, arr->Items, arr_size); return result; } @@ -675,7 +675,7 @@ static void ci_FreeSourceData(pSourceData source_data); static void ci_FreeClusterData(pClusterData cluster_data, bool recursive); static void ci_FreeSearchData(pSearchData 
search_data); static void ci_FreeNodeData(pNodeData node_data); -static void ci_FreeCaches(void); +static void ci_ClearCaches(void); /** Deep Size Computation Functions. **/ // LINK #sizing @@ -718,7 +718,6 @@ static void ci_CacheFreeSourceData(pXHashEntry entry, void* path); static void ci_CacheFreeCluster(pXHashEntry entry, void* path); static void ci_CacheFreeSearch(pXHashEntry entry, void* path); int clusterExecuteMethod(void* inf_v, char* methodname, pObjData param, pObjTrxTree oxt); -int clusterUnregister(pObjDriver object_driver, pObjSession session); /** Unimplemented DriverFunctions. **/ // LINK #unimplemented @@ -1539,7 +1538,6 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject parent) if (path == NULL) goto err; /** Allocate node struct data. **/ - // pNodeData node_data = NodeData |> sizeof() |> nmMalloc() |> check_ptr(); pNodeData node_data = check_ptr(nmMalloc(sizeof(NodeData))); if (node_data == NULL) goto err; memset(node_data, 0, sizeof(NodeData)); @@ -1678,7 +1676,7 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject parent) /** Itterate over each param in the structure file. 
**/ node_data->nParams = param_infs.nItems; const size_t params_size = node_data->nParams * sizeof(pParam); - node_data->Params = check_ptr(nmMalloc(params_size)); + node_data->Params = check_ptr(nmSysMalloc(params_size)); if (node_data->Params == NULL) goto err_free_arrs; memset(node_data->Params, 0, params_size); for (unsigned int i = 0u; i < node_data->nParams; i++) @@ -1755,7 +1753,7 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject parent) if (node_data->nClusterDatas > 0) { const size_t clusters_size = node_data->nClusterDatas * sizeof(pClusterData); - node_data->ClusterDatas = check_ptr(nmMalloc(clusters_size)); + node_data->ClusterDatas = check_ptr(nmSysMalloc(clusters_size)); if (node_data->ClusterDatas == NULL) goto err_free_arrs; memset(node_data->ClusterDatas, 0, clusters_size); for (unsigned int i = 0u; i < node_data->nClusterDatas; i++) @@ -1773,7 +1771,7 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject parent) if (node_data->nSearchDatas > 0) { const size_t searches_size = node_data->nSearchDatas * sizeof(pSearchData); - node_data->SearchDatas = check_ptr(nmMalloc(searches_size)); + node_data->SearchDatas = check_ptr(nmSysMalloc(searches_size)); if (node_data->SearchDatas == NULL) goto err_free_arrs; memset(node_data->SearchDatas, 0, searches_size); for (unsigned int i = 0u; i < node_data->nSearchDatas; i++) @@ -1810,17 +1808,39 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject parent) /** @param source_data A pSourceData struct, freed by this function. **/ static void ci_FreeSourceData(pSourceData source_data) { + /** Guard segfault. **/ + if (source_data == NULL) + { + fprintf(stderr, "Call to ci_FreeSourceData(NULL);\n"); + return; + } + /** Free top level attributes, if they exist. 
**/ - if (source_data->Name != NULL) nmSysFree(source_data->Name); - if (source_data->SourcePath != NULL) nmSysFree(source_data->SourcePath); - if (source_data->AttrName != NULL) nmSysFree(source_data->AttrName); + if (source_data->Name != NULL) + { + nmSysFree(source_data->Name); + source_data->Name = NULL; + } + if (source_data->SourcePath != NULL) + { + nmSysFree(source_data->SourcePath); + source_data->SourcePath = NULL; + } + if (source_data->AttrName != NULL) + { + nmSysFree(source_data->AttrName); + source_data->AttrName = NULL; + } /** Free fetched data, if it exists. **/ if (source_data->Strings != NULL) { for (unsigned int i = 0u; i < source_data->nVectors; i++) + { nmSysFree(source_data->Strings[i]); - nmFree(source_data->Strings, source_data->nVectors * sizeof(char*)); + source_data->Strings[i] = NULL; + } + nmSysFree(source_data->Strings); source_data->Strings = NULL; } @@ -1828,13 +1848,17 @@ static void ci_FreeSourceData(pSourceData source_data) if (source_data->Vectors != NULL) { for (unsigned int i = 0u; i < source_data->nVectors; i++) + { ca_free_vector(source_data->Vectors[i]); - nmFree(source_data->Vectors, source_data->nVectors * sizeof(pVector)); + source_data->Vectors[i] = NULL; + } + nmSysFree(source_data->Vectors); source_data->Vectors = NULL; } - /** Free the source_data struct. **/ + /** Free the source data struct. **/ nmFree(source_data, sizeof(SourceData)); + source_data = NULL; } @@ -1846,21 +1870,33 @@ static void ci_FreeSourceData(pSourceData source_data) ***/ static void ci_FreeClusterData(pClusterData cluster_data, bool recursive) { - /** Free top level cluster data. **/ - if (cluster_data->Name != NULL) nmSysFree(cluster_data->Name); + /** Guard segfault. **/ + if (cluster_data == NULL) + { + fprintf(stderr, "Call to ci_FreeClusterData(NULL, %s);\n", (recursive) ? "true" : "false"); + return; + } + + /** Free attribute data. 
**/ + if (cluster_data->Name != NULL) + { + nmSysFree(cluster_data->Name); + cluster_data->Name = NULL; + } /** Free computed data, if it exists. **/ if (cluster_data->Clusters != NULL) { - const unsigned int nVectors = cluster_data->SourceData->nVectors; for (unsigned int i = 0u; i < cluster_data->nClusters; i++) { pCluster cluster = &cluster_data->Clusters[i]; - nmFree(cluster->Strings, cluster->Size * sizeof(char*)); - nmFree(cluster->Vectors, cluster->Size * sizeof(pVector)); + nmSysFree(cluster->Strings); + nmSysFree(cluster->Vectors); + cluster->Strings = NULL; + cluster->Vectors = NULL; } - nmFree(cluster_data->Clusters, nVectors * sizeof(Cluster)); - nmFree(cluster_data->Sims, nVectors * sizeof(double)); + nmSysFree(cluster_data->Clusters); + nmSysFree(cluster_data->Sims); cluster_data->Clusters = NULL; cluster_data->Sims = NULL; } @@ -1871,14 +1907,18 @@ static void ci_FreeClusterData(pClusterData cluster_data, bool recursive) if (recursive) { for (unsigned int i = 0u; i < cluster_data->nSubClusters; i++) + { ci_FreeClusterData(cluster_data->SubClusters[i], recursive); + cluster_data->SubClusters[i] = NULL; + } } - nmFree(cluster_data->SubClusters, cluster_data->nSubClusters * sizeof(void*)); + nmSysFree(cluster_data->SubClusters); cluster_data->SubClusters = NULL; } - /** Free the cluster struct. **/ + /** Free the cluster data struct. **/ nmFree(cluster_data, sizeof(ClusterData)); + cluster_data = NULL; } @@ -1886,15 +1926,35 @@ static void ci_FreeClusterData(pClusterData cluster_data, bool recursive) /** @param search_data A pSearchData struct, freed by this function. **/ static void ci_FreeSearchData(pSearchData search_data) { - if (search_data->Name != NULL) nmSysFree(search_data->Name); + /** Guard segfault. **/ + if (search_data == NULL) + { + fprintf(stderr, "Call to ci_FreeSearchData(NULL);\n"); + return; + } + + /** Free attribute data. 
**/ + if (search_data->Name != NULL) + { + nmSysFree(search_data->Name); + search_data->Name = NULL; + } + + /** Free computed data. **/ if (search_data->Dups != NULL) { for (unsigned int i = 0; i < search_data->nDups; i++) + { nmFree(search_data->Dups[i], sizeof(Dup)); - nmFree(search_data->Dups, search_data->nDups * sizeof(void*)); + search_data->Dups[i] = NULL; + } + nmSysFree(search_data->Dups); search_data->Dups = NULL; } + + /** Free the search data struct. **/ nmFree(search_data, sizeof(SearchData)); + search_data = NULL; } @@ -1902,6 +1962,13 @@ static void ci_FreeSearchData(pSearchData search_data) /** @param node_data A pNodeData struct, freed by this function. **/ static void ci_FreeNodeData(pNodeData node_data) { + /** Guard segfault. **/ + if (node_data == NULL) + { + fprintf(stderr, "Call to ci_FreeNodeData(NULL);\n"); + return; + } + /** Free parsed params, if they exist. **/ if (node_data->Params != NULL) { @@ -1909,28 +1976,36 @@ static void ci_FreeNodeData(pNodeData node_data) { if (node_data->Params[i] == NULL) break; paramFree(node_data->Params[i]); + node_data->Params[i] = NULL; } - nmFree(node_data->Params, node_data->nParams * sizeof(pParam)); + nmSysFree(node_data->Params); + node_data->Params = NULL; } - if (node_data->ParamList != NULL) expFreeParamList(node_data->ParamList); + if (node_data->ParamList != NULL) + { + expFreeParamList(node_data->ParamList); + node_data->ParamList = NULL; + } /** Free parsed clusters, if they exist. **/ if (node_data->ClusterDatas != NULL) { - /*** This data is cached, so we should NOT free it! - *** The caching system is responsible for the memory. + /*** This data is cached, so we should NOT free it! The caching system + *** is responsible for the memory. We only need to free the array + *** holding our pointers to said cached memory. 
***/ - nmFree(node_data->ClusterDatas, node_data->nClusterDatas * sizeof(pClusterData)); + nmSysFree(node_data->ClusterDatas); node_data->ClusterDatas = NULL; } /** Free parsed searches, if they exist. **/ if (node_data->SearchDatas != NULL) { - /*** This data is cached, so we should NOT free it! - *** The caching system is responsible for the memory. + /*** This data is cached, so we should NOT free it! The caching system + *** is responsible for the memory. We only need to free the array + *** holding our pointers to said cached memory. ***/ - nmFree(node_data->SearchDatas, node_data->nSearchDatas * sizeof(pSearchData)); + nmSysFree(node_data->SearchDatas); node_data->SearchDatas = NULL; } @@ -1942,18 +2017,20 @@ static void ci_FreeNodeData(pNodeData node_data) ***/ if (node_data->SourceData != NULL) { - /*** This data is cached, so we should NOT free it! - *** The caching system is responsible for the memory. + /*** This data is cached, so we should NOT free it! The caching system + *** is responsible for the memory. We only need to free the array + *** holding our pointers to said cached memory. ***/ node_data->SourceData = NULL; } /** Free the node data. **/ nmFree(node_data, sizeof(NodeData)); + node_data = NULL; } -/** Frees all caches for all cluster driver instances. **/ -static void ci_FreeCaches(void) +/** Frees all data in caches for all cluster driver instances. **/ +static void ci_ClearCaches(void) { /*** Free caches in reverse of the order they are created in case *** cached data relies on its source during the freeing process. 
@@ -2207,13 +2284,13 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) successful = false; goto end_free_data; } - if (vector[0] == -CA_NUM_DIMS) + if (ca_is_empty(vector)) { mssErrorf(1, "Cluster", "Vector building for string \"%s\" produced no character pairs.", val); successful = false; goto end_free_data; } - if (vector[0] == -172 && vector[1] == 11 && vector[2] == -78) + if (ca_has_no_pairs(vector)) { /** Skip pVector with no pairs. **/ tprintf("."); @@ -2241,7 +2318,7 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) /** Trim data and store data. **/ const size_t data_size = source_data->nVectors * sizeof(char*); - source_data->Strings = check_ptr(nmMalloc(data_size)); + source_data->Strings = check_ptr(nmSysMalloc(data_size)); if (source_data->Strings == NULL) goto end_free_data; memcpy(source_data->Strings, data_xarray.Items, data_size); check(xaDeInit(&data_xarray)); /* Failure ignored. */ @@ -2249,7 +2326,7 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) /** Trim data and store vectors. **/ const size_t vectors_size = source_data->nVectors * sizeof(pVector); - source_data->Vectors = check_ptr(nmMalloc(vectors_size)); + source_data->Vectors = check_ptr(nmSysMalloc(vectors_size)); memcpy(source_data->Vectors, vector_xarray.Items, vectors_size); check(xaDeInit(&vector_xarray)); /* Failure ignored. */ vector_xarray.nAlloc = 0; @@ -2325,11 +2402,11 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) /** Allocate static memory for finding clusters. 
**/ const size_t clusters_size = cluster_data->nClusters * sizeof(Cluster); - cluster_data->Clusters = check_ptr(nmMalloc(clusters_size)); + cluster_data->Clusters = check_ptr(nmSysMalloc(clusters_size)); if (cluster_data->Clusters == NULL) goto err; memset(cluster_data->Clusters, 0, clusters_size); const size_t sims_size = source_data->nVectors * sizeof(double); - cluster_data->Sims = check_ptr(nmMalloc(sims_size)); + cluster_data->Sims = check_ptr(nmSysMalloc(sims_size)); if (cluster_data->Sims == NULL) goto err_free_clusters; memset(cluster_data->Sims, 0, sims_size); @@ -2342,9 +2419,9 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) /** Put all the data into one cluster. **/ pCluster first_cluster = &cluster_data->Clusters[0]; first_cluster->Size = source_data->nVectors; - first_cluster->Strings = check_ptr(nmMalloc(source_data->nVectors * sizeof(char*))); + first_cluster->Strings = check_ptr(nmSysMalloc(source_data->nVectors * sizeof(char*))); if (first_cluster->Strings == NULL) goto err_free_sims; - first_cluster->Vectors = check_ptr(nmMalloc(source_data->nVectors * sizeof(pVector))); + first_cluster->Vectors = check_ptr(nmSysMalloc(source_data->nVectors * sizeof(pVector))); if (first_cluster->Vectors == NULL) goto err_free_sims; memcpy(first_cluster->Strings, source_data->Strings, source_data->nVectors * sizeof(char*)); memcpy(first_cluster->Vectors, source_data->Vectors, source_data->nVectors * sizeof(pVector)); @@ -2372,7 +2449,7 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) /** Allocate lables. Note: kmeans does not require us to initialize them. **/ const size_t lables_size = source_data->nVectors * sizeof(unsigned int); - unsigned int* labels = check_ptr(nmMalloc(lables_size)); + unsigned int* labels = check_ptr(nmSysMalloc(lables_size)); if (labels == NULL) goto err_free_sims; /** Run kmeans. 
**/ @@ -2409,9 +2486,9 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) pXArray indexes_in_this_cluster = &indexes_in_cluster[i]; pCluster cluster = &cluster_data->Clusters[i]; cluster->Size = indexes_in_this_cluster->nItems; - cluster->Strings = check_ptr(nmMalloc(cluster->Size * sizeof(char*))); + cluster->Strings = check_ptr(nmSysMalloc(cluster->Size * sizeof(char*))); if (cluster->Strings == NULL) goto err_free_sims; - cluster->Vectors = check_ptr(nmMalloc(cluster->Size * sizeof(pVector))); + cluster->Vectors = check_ptr(nmSysMalloc(cluster->Size * sizeof(pVector))); if (cluster->Vectors == NULL) goto err_free_sims; for (unsigned int j = 0u; j < cluster->Size; j++) { @@ -2584,7 +2661,7 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) /** Store dups. **/ search_data->nDups = dups->nItems; search_data->Dups = (dups->nItems == 0) - ? check_ptr(nmMalloc(0)) + ? check_ptr(nmSysMalloc(0)) : ci_xaToTrimmedArray(dups); /** Free unused data. **/ @@ -4162,7 +4239,7 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx else printf("all files:\n"); /** Free caches. **/ - ci_FreeCaches(); + ci_ClearCaches(); tprintf("Cache dropped.\n"); return 0; @@ -4185,21 +4262,6 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx } -/*** Frees caches when the driver is unregistered. - *** - *** This function does not free either of the given parameters. - *** - *** @param object_driver The driver instance which was registered being unregistered. (unused) - *** @param session The session being closed. 
(unused) - *** Returns - ***/ -int clusterUnregister(pObjDriver object_driver, pObjSession session) - { - ci_FreeCaches(); - return 0; - } - - /** ================ Unimplemented Functions ================ **/ /** ANCHOR[id=unimplemented] **/ // LINK #functions @@ -4272,9 +4334,6 @@ int clusterCommit(void* inf_v, pObjTrxTree *oxt) ***/ int clusterInitialize(void) { - /** Initialize library. **/ - ca_init(); - /** Allocate the driver. **/ pObjDriver drv = (pObjDriver)check_ptr(nmMalloc(sizeof(ObjDriver))); if (drv == NULL) goto err; @@ -4320,36 +4379,47 @@ int clusterInitialize(void) drv->Commit = clusterCommit; drv->GetQueryCoverageMask = NULL; drv->GetQueryIdentityPath = NULL; - drv->Unregister = clusterUnregister; /** Register some structures. **/ + nmRegister(sizeof(SourceData), "ClusterSourceData"); + nmRegister(sizeof(Cluster), "Cluster"); nmRegister(sizeof(ClusterData), "ClusterData"); nmRegister(sizeof(SearchData), "ClusterSearch"); - nmRegister(sizeof(SourceData), "ClusterSourceData"); nmRegister(sizeof(NodeData), "ClusterNodeData"); nmRegister(sizeof(DriverData), "ClusterDriverData"); nmRegister(sizeof(ClusterQuery), "ClusterQuery"); nmRegister(sizeof(ClusterDriverCaches), "ClusterDriverCaches"); /** Print debug size info. 
**/ - char buf1[16], buf2[16], buf3[16], buf4[16], buf5[16], buf6[16], buf7[16]; - tprintf( - "Cluster driver struct sizes:\n" - " > sizeof(SourceData): %s\n" - " > sizeof(ClusterData): %s\n" - " > sizeof(SearchData): %s\n" - " > sizeof(NodeData): %s\n" - " > sizeof(DriverData): %s\n" - " > sizeof(ClusterQuery): %s\n" - " > sizeof(ClusterDriverCaches): %s\n", - snprint_bytes(buf1, sizeof(buf1), sizeof(SourceData)), - snprint_bytes(buf2, sizeof(buf2), sizeof(ClusterData)), - snprint_bytes(buf3, sizeof(buf3), sizeof(SearchData)), - snprint_bytes(buf4, sizeof(buf4), sizeof(NodeData)), - snprint_bytes(buf5, sizeof(buf5), sizeof(DriverData)), - snprint_bytes(buf6, sizeof(buf6), sizeof(ClusterQuery)), - snprint_bytes(buf7, sizeof(buf7), sizeof(ClusterDriverCaches)) - ); +// char buf1[16], buf2[16], buf3[16], buf4[16], buf5[16], buf6[16], buf7[16], buf8[16]; +// tprintf( +// "Cluster driver struct sizes:\n" +// " > sizeof(SourceData): %s\n" +// " > sizeof(Cluster): %s\n" +// " > sizeof(ClusterData): %s\n" +// " > sizeof(SearchData): %s\n" +// " > sizeof(NodeData): %s\n" +// " > sizeof(DriverData): %s\n" +// " > sizeof(ClusterQuery): %s\n" +// " > sizeof(ClusterDriverCaches): %s\n", +// snprint_bytes(buf1, sizeof(buf1), sizeof(SourceData)), +// snprint_bytes(buf2, sizeof(buf2), sizeof(Cluster)), +// snprint_bytes(buf3, sizeof(buf3), sizeof(ClusterData)), +// snprint_bytes(buf4, sizeof(buf4), sizeof(SearchData)), +// snprint_bytes(buf5, sizeof(buf5), sizeof(NodeData)), +// snprint_bytes(buf6, sizeof(buf6), sizeof(DriverData)), +// snprint_bytes(buf7, sizeof(buf7), sizeof(ClusterQuery)), +// snprint_bytes(buf8, sizeof(buf8), sizeof(ClusterDriverCaches)) +// ); + +// pVector v = ca_build_vector(""); +// const unsigned int len = ca_sparse_len(v); +// fprintf(stderr, "Vector (x%d): [%d", len, v[0]); +// for (unsigned int i = 1u; i < len; i++) +// { +// fprintf(stderr, ", %d", v[i]); +// } +// fprintf(stderr, "]\n"); /** Register the driver. 
**/ if (!check(objRegisterDriver(drv))) goto err; diff --git a/centrallix/test_obj.c b/centrallix/test_obj.c index 6b09a8586..5ef492de3 100644 --- a/centrallix/test_obj.c +++ b/centrallix/test_obj.c @@ -1271,13 +1271,6 @@ testobj_do_cmd(pObjSession s, char* cmd, int batch_mode, pLxSession inp_lx) } else if (!strcmp(cmdname,"quit")) { - /** Loop through each driver and call their unregister handler, if they have one. **/ - for (unsigned int i = 0u; i < OSYS.Drivers.nItems; i++) - { - pObjDriver cur = (pObjDriver)OSYS.Drivers.Items[i]; - if (cur->Unregister != NULL) cur->Unregister(cur, s); - } - mlxCloseSession(ls); return 1; } From b4634f3cbd1410c147e8b7cb6a6f3ac1fb89ad96 Mon Sep 17 00:00:00 2001 From: Israel Date: Thu, 30 Oct 2025 14:48:10 -0600 Subject: [PATCH 06/30] Begin adding query files to search for duplicates. Simplify dataqa_duplicates component in preparation for making it the boundary into our new duplicate system. Add exp functions: sparse_eql(), ln(), and logn(). Fix bugs in comparison functions. Make minor tweaks to objdrv_cluster.c. --- centrallix-lib/include/clusters.h | 2 + centrallix-lib/src/clusters.c | 19 ++- centrallix-os/cluster-schema.cluster | 2 +- centrallix-os/file.cluster | 4 +- centrallix/expression/exp_functions.c | 209 ++++++++++++++++++++++++++ centrallix/osdrivers/objdrv_cluster.c | 100 ++++++------ 6 files changed, 283 insertions(+), 53 deletions(-) diff --git a/centrallix-lib/include/clusters.h b/centrallix-lib/include/clusters.h index bddd0800c..288f81714 100644 --- a/centrallix-lib/include/clusters.h +++ b/centrallix-lib/include/clusters.h @@ -33,6 +33,7 @@ /************************************************************************/ #include +#include #ifdef CXLIB_INTERNAL #include "xarray.h" @@ -90,6 +91,7 @@ int ca_kmeans( /** Comparison functions, for ca_search(). 
**/ double ca_cos_compare(void* v1, void* v2); double ca_lev_compare(void* str1, void* str2); +bool ca_eql(pVector v1, pVector v2); void* ca_most_similar( void* target, diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index 864ff36eb..ef1222873 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -184,7 +184,7 @@ pVector ca_build_vector(const char* str) if (size > expected_max_size) { fprintf(stderr, - "cli_build_vector(%s) - Warning: Sparse vector larger than expected.\n" + "cli_build_vector(\"%s\") - Warning: Sparse vector larger than expected.\n" " > Size: %lu\n" " > #Dims: %u\n", str, @@ -555,7 +555,7 @@ double ca_lev_compare(void* str1, void* str2) const size_t len2 = strlen(str2); if (len1 == 0lu && len2 == 0lu) return 1.0; if (len1 != 0lu && len2 == 0lu) return 0.0; - if (len1 != 0lu && len2 != 0lu) return 0.0; + if (len1 == 0lu && len2 != 0lu) return 0.0; /** Compute levenshtein edit distance. **/ const unsigned int dist = edit_dist((const char*)str1, (const char*)str2, len1, len2); @@ -567,6 +567,21 @@ double ca_lev_compare(void* str1, void* str2) return normalized_similarity; } +/*** Check if two sparse vectors are identical. + *** + *** @param v1 The first vector. + *** @param v2 The second vector. + *** @returns true if they are equal, + *** false if any element is different. + ***/ +bool ca_eql(pVector v1, pVector v2) + { + const unsigned int len = ca_sparse_len(v1); + for (unsigned int i = 0u; i < len; i++) + if (v1[i] != v2[i]) return false; + return true; + } + /*** Calculate the average size of all clusters in a set of vectors. *** *** @param vectors The vectors of the dataset (allocated sparsely). 
diff --git a/centrallix-os/cluster-schema.cluster b/centrallix-os/cluster-schema.cluster index 9f11c1636..e87ae6b5f 100644 --- a/centrallix-os/cluster-schema.cluster +++ b/centrallix-os/cluster-schema.cluster @@ -38,8 +38,8 @@ file_name "system/cluster" search_name "system/search" { source : string ⊂ [cluster_name, ...] - threshold : double && 0.0 < x < 1.0 // optimization. similarity_measure : "cosine" | "levenshtein" + threshold : double && 0.0 < x < 1.0 // optimization. } ... } diff --git a/centrallix-os/file.cluster b/centrallix-os/file.cluster index 078a39fcc..95eacfee0 100644 --- a/centrallix-os/file.cluster +++ b/centrallix-os/file.cluster @@ -54,14 +54,14 @@ file_name "system/cluster" dups "cluster/search" { source = kmeans_cluster; - threshold = 0.75; similarity_measure = "cosine"; + threshold = 0.75; } dups2 "cluster/search" { source = no_clustering; - threshold = 0.75; similarity_measure = "cosine"; + threshold = 0.75; } } diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index 4f9ffa563..751e07297 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -3288,6 +3288,101 @@ int exp_fn_log10(pExpression tree, pParamObjects objlist, pExpression i0, pExpre } +int exp_fn_log_natural(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) + { + double n; + + if (!i0) + { + mssError(1, "EXP", "ln() requires a number as its first parameter"); + goto error; + } + if (i0->Flags & EXPR_F_NULL) + { + tree->DataType = DATA_T_DOUBLE; + tree->Flags |= EXPR_F_NULL; + return 0; + } + switch(i0->DataType) + { + case DATA_T_INTEGER: + n = i0->Integer; + break; + case DATA_T_DOUBLE: + n = i0->Types.Double; + break; + case DATA_T_MONEY: + n = objDataToDouble(DATA_T_MONEY, &(i0->Types.Money)); + break; + default: + mssError(1, "EXP", "ln() requires a number as its first parameter"); + goto error; + } + if (n < 0) + { + mssError(1, "EXP", "ln(): cannot compute 
the logarithm of a negative number"); + goto error; + } + tree->DataType = DATA_T_DOUBLE; + tree->Types.Double = log(n); + return 0; + + error: + return -1; + } + + +int exp_fn_log_base_n(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) + { + double n, p; + + if (!i0 || !i1) + { + mssError(1, "EXP", "logn() requires numbers as its first and second parameters"); + goto error; + } + if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL)) + { + tree->DataType = DATA_T_DOUBLE; + tree->Flags |= EXPR_F_NULL; + return 0; + } + switch(i0->DataType) + { + case DATA_T_INTEGER: + n = i0->Integer; + break; + case DATA_T_DOUBLE: + n = i0->Types.Double; + break; + case DATA_T_MONEY: + n = objDataToDouble(DATA_T_MONEY, &(i0->Types.Money)); + break; + default: + mssError(1, "EXP", "logn() requires a number as its first parameter"); + goto error; + } + switch(i1->DataType) + { + case DATA_T_INTEGER: + p = i1->Integer; + break; + case DATA_T_DOUBLE: + p = i1->Types.Double; + break; + default: + mssError(1, "EXP", "logn() requires an integer or double as its second parameter"); + goto error; + } + tree->DataType = DATA_T_DOUBLE; + tree->Types.Double = log(n) / log(p); + return 0; + + error: + return -1; + } + + int exp_fn_power(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) { double n, p; @@ -4008,6 +4103,7 @@ int exp_fn_nth(pExpression tree, pParamObjects objlist, pExpression i0, pExpress *** @param maybe_str2 Possibly the second string. *** @param u1 Unused parameter. *** @param is_cos Whether to compute cosine or levenshtien. + *** @returns 0 for success, -1 for failure. ***/ static int exp_fn_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1, bool is_cos) { @@ -4126,6 +4222,7 @@ static int exp_fn_cmp(pExpression tree, pParamObjects objlist, pExpression maybe *** @param maybe_str1 Possibly the first string. 
*** @param maybe_str2 Possibly the second string. *** @param u1 Unused parameter. + *** @returns 0 for success, -1 for failure. ***/ int exp_fn_cos_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) { @@ -4140,12 +4237,121 @@ int exp_fn_cos_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_st *** @param maybe_str1 Possibly the first string. *** @param maybe_str2 Possibly the second string. *** @param u1 Unused parameter. + *** @returns 0 for success, -1 for failure. ***/ int exp_fn_lev_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) { return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, false); } + +/*** Comparse two strings to see if their sparse vectors are equal. + *** + *** @param tree The tree resulting from this function. + *** @param objlist The evaluation "scope", including available variables. + *** @param maybe_str1 Possibly the first string. + *** @param maybe_str2 Possibly the second string. + *** @param u1 Unused parameter. + *** @returns 0 for success, -1 for failure. + ***/ +static int exp_fn_sparse_eql(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1, bool is_cos) + { + const char fn_name[] = "sparse_compare"; + + /** Check number of arguments. **/ + const int num_params = tree->Children.nItems; + if (num_params != 2) + { + mssErrorf(1, "EXP", "%s(?) expects 2 parameters, got %d parameters.", fn_name, num_params); + return -1; + } + if (maybe_str1 == NULL || maybe_str2 == NULL || u1 != NULL) + { + mssErrorf(1, "EXP", "%s(?) expects 2 parameters.", fn_name); + return -1; + } + + /** Magic checks. **/ + ASSERTMAGIC(tree, MGK_EXPRESSION); + ASSERTMAGIC(maybe_str1, MGK_EXPRESSION); + ASSERTMAGIC(maybe_str2, MGK_EXPRESSION); + + /** Check object list. **/ + if (objlist == NULL) + { + mssErrorf(1, "EXP", "%s(\?\?\?) 
no object list?", fn_name); + return -1; + } + ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); + + /** Extract str1. **/ + if (maybe_str1->Flags & EXPR_F_NULL) + { + mssErrorf(1, "EXP", "%s(NULL, ...) str1 cannot be NULL.", fn_name); + return -1; + } + if (maybe_str1->DataType != DATA_T_STRING) + { + mssErrorf(1, "EXP", "%s(\?\?\?, ..) str1 should be a string.", fn_name); + return -1; + } + char* str1 = maybe_str1->String; + if (str1 == NULL) + { + mssErrorf(1, "EXP", + "%s(nothing?, ...) expected string from str1 (of datatype DataType = " + "DATA_T_STRING), but the String was NULL or did not exist!", + fn_name + ); + return -1; + } + + /** Extract str2. **/ + if (maybe_str2->Flags & EXPR_F_NULL) + { + mssErrorf(1, "EXP", "%s(\"%s\", NULL) str2 cannot be NULL.", fn_name, str1); + return -1; + } + if (maybe_str2->DataType != DATA_T_STRING) + { + mssErrorf(1, "EXP", "%s(\"%s\", \?\?\?) str2 should be a string.", fn_name, str1); + return -1; + } + char* str2 = maybe_str2->String; + if (str2 == NULL) + { + mssErrorf(1, "EXP", + "%s(\"%s\", nothing?) expected string from str2 (of datatype DataType = " + "DATA_T_STRING), but the String was NULL or did not exist!", + fn_name, str1 + ); + return -1; + } + + /** Build vectors. **/ + int ret; + const pVector v1 = check_ptr(ca_build_vector(str1)); + const pVector v2 = check_ptr(ca_build_vector(str2)); + if (v1 == NULL || v2 == NULL) + { + mssErrorf(1, "EXP", + "%s(\"%s\", \"%s\") - Failed to build vectors.", + fn_name, str1, str2 + ); + ret = -1; + } + else + { + tree->Integer = (ca_eql(v1, v2)) ? 1 : 0; + tree->DataType = DATA_T_INTEGER; + ret = 0; + } + + if (v1 != NULL) ca_free_vector(v1); + if (v2 != NULL) ca_free_vector(v2); + return ret; + } + /*** Computes double metaphone. *** *** @param tree The tree resulting from this function. 
@@ -4406,6 +4612,8 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "hash", (char*)exp_fn_hash); xhAdd(&EXP.Functions, "hmac", (char*)exp_fn_hmac); xhAdd(&EXP.Functions, "log10", (char*)exp_fn_log10); + xhAdd(&EXP.Functions, "ln", (char*)exp_fn_log_natural); + xhAdd(&EXP.Functions, "logn", (char*)exp_fn_log_base_n); xhAdd(&EXP.Functions, "power", (char*)exp_fn_power); xhAdd(&EXP.Functions, "pbkdf2", (char*)exp_fn_pbkdf2); xhAdd(&EXP.Functions, "to_base64", (char*)exp_fn_to_base64); @@ -4422,6 +4630,7 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "lev_cmp", (char*)exp_fn_lev_cmp); xhAdd(&EXP.Functions, "lev_compare", (char*)exp_fn_lev_cmp); xhAdd(&EXP.Functions, "levenshtein_compare", (char*)exp_fn_lev_cmp); + xhAdd(&EXP.Functions, "sparse_eql", (char*)exp_fn_sparse_eql); xhAdd(&EXP.Functions, "double_metaphone", (char*)exp_fn_double_metaphone); /** Windowing **/ diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index c10c6fca6..6d1aaf313 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -71,8 +71,8 @@ *** https://marketplace.visualstudio.com/items?itemName=ExodiusStudios.comment-anchors ***/ -/** Pure Laziness **/ -#define ENABLE_TPRINTF +/** Pure Laziness. 
**/ +// #define ENABLE_TPRINTF /** Debugging **/ #ifndef ENABLE_TPRINTF @@ -815,7 +815,7 @@ static int ci_ParseAttribute( if (datatype != exp->DataType) { mssErrorf(1, "Cluster", - "Expected \"%s\" : %s, but got type %s.", + "Expected ['%s' : %s], but got type %s.", attr_name, ci_TypeToStr(datatype), ci_TypeToStr(exp->DataType) ); goto err; @@ -826,7 +826,7 @@ static int ci_ParseAttribute( if (ret != 0) { mssErrorf(1, "Cluster", - "Failed to get \"%s\" : %s using expression \"%s\" (error code %d).", + "Failed to get ['%s' : %s] using expression \"%s\" (error code %d).", attr_name, ci_TypeToStr(datatype), exp->Name, ret ); goto err; @@ -1215,7 +1215,7 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) if (strcmp(group_type, "cluster/cluster") != 0) { fprintf(stderr, - "Warning: Unknown group \"%s\" : \"%s\" in cluster \"%s\".\n", + "Warning: Unknown group [\"%s\" : \"%s\"] in cluster \"%s\".\n", name, group_type, inf->Name ); continue; @@ -1364,13 +1364,13 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) if (search_data->Name == NULL) goto err_free_search; if (!check(objCurrentDate(&search_data->DateCreated))) goto err_free_search; - /** Get source. **/ - char* source_name; - if (ci_ParseAttribute(inf, "source", DATA_T_STRING, POD(&source_name), node_data->ParamList, true, true) != 0) return NULL; + /** Get source cluster. **/ + char* source_cluster_name; + if (ci_ParseAttribute(inf, "source", DATA_T_STRING, POD(&source_cluster_name), node_data->ParamList, true, true) != 0) return NULL; for (unsigned int i = 0; i < node_data->nClusterDatas; i++) { pClusterData cluster_data = node_data->ClusterDatas[i]; - if (strcmp(source_name, cluster_data->Name) == 0) + if (strcmp(source_cluster_name, cluster_data->Name) == 0) { /** Source found. 
**/ search_data->Source = cluster_data; @@ -1384,13 +1384,13 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) if (search_data->Source == NULL) { /** Print error. **/ - mssErrorf(1, "Cluster", "Could not find cluster \"%s\" for search \"%s\".", source_name, search_data->Name); + mssErrorf(1, "Cluster", "Could not find cluster \"%s\" for search \"%s\".", source_cluster_name, search_data->Name); /** Attempt to give a hint. **/ char* cluster_names[node_data->nClusterDatas]; for (unsigned int i = 0; i < node_data->nClusterDatas; i++) cluster_names[i] = node_data->ClusterDatas[i]->Name; - ci_TryHint(source_name, cluster_names, node_data->nClusterDatas); + ci_TryHint(source_cluster_name, cluster_names, node_data->nClusterDatas); /** Fail. **/ goto err_free_search; @@ -1457,7 +1457,7 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) char* group_type = check_ptr(sub_inf->UsrType); if (group_type == NULL) goto err_free_search; fprintf(stderr, - "Warning: Unknown group \"%s\" : \"%s\" in search \"%s\".\n", + "Warning: Unknown group [\"%s\" : \"%s\"] in search \"%s\".\n", name, group_type, inf->Name ); break; @@ -1513,7 +1513,7 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) ci_FreeSearchData(search_data); err: - mssErrorf(0, "Cluster", "Failed to parse search from group \"%s\".", inf->Name); + mssErrorf(0, "Cluster", "Failed to parse SearchData from group \"%s\".", inf->Name); return NULL; } @@ -2174,9 +2174,9 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) if (obj == NULL) { mssErrorf(0, "Cluster", - "Failed to open object driver:" - " > Attribute: \"%s\" : String\n" - " > Source Path: %s", + "Failed to open object driver:\n" + " > Attribute: ['%s' : String]\n" + " > Source Path: \"%s\"", source_data->AttrName, source_data->SourcePath ); @@ -2190,9 +2190,9 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) { mssErrorf(0, 
"Cluster", "Failed to open query:\n" - " > Attribute: \"%s\" : String\n" + " > Attribute: ['%s' : String]\n" " > Driver Used: %s\n" - " > Source Path: %s", + " > Source Path: \"%s\"", source_data->AttrName, obj->Driver->Name, source_data->SourcePath @@ -2221,9 +2221,9 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) { mssErrorf(0, "Cluster", "Failed to get type for %uth entry:\n" - " > Attribute: \"%s\" : String\n" + " > Attribute: '%s' : String\n" " > Driver Used: %s\n" - " > Source Path: %s", + " > Source Path: \"%s\"", i, source_data->AttrName, obj->Driver->Name, @@ -2235,9 +2235,9 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) { mssErrorf(1, "Cluster", "Type for %uth entry was not a string:\n" - " > Attribute: \"%s\" : %s!!\n" + " > Attribute: ['%s' : %s]\n" " > Driver Used: %s\n" - " > Source Path: %s", + " > Source Path: \"%s\"", i, source_data->AttrName, ci_TypeToStr(datatype), obj->Driver->Name, @@ -2254,9 +2254,9 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) tprintf("\n"); mssErrorf(0, "Cluster", "Failed to value for %uth entry:\n" - " > Attribute: \"%s\" : String\n" + " > Attribute: ['%s' : String]\n" " > Driver Used: %s\n" - " > Source Path: %s\n" + " > Source Path: \"%s\"\n" " > Error code: %d", i, source_data->AttrName, @@ -2264,7 +2264,6 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) source_data->SourcePath, ret ); - successful = false; goto end_free_data; } @@ -2393,7 +2392,7 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) /** We need the SourceData vectors to compute clusters. 
**/ if (ci_ComputeSourceData(source_data, node_data->ParamList->Session) != 0) { - mssErrorf(0, "Cluster", "Failed to compute SourceData."); + mssErrorf(0, "Cluster", "ClusterData computation failed due to missing SourceData."); goto err; } @@ -2532,7 +2531,7 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) cluster_data->Clusters = NULL; err: - mssErrorf(0, "Cluster", "Cluster computation failed for \"%s\".", cluster_data->Name); + mssErrorf(0, "Cluster", "ClusterData computation failed for \"%s\".", cluster_data->Name); return -1; } @@ -2561,7 +2560,7 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) ret = ci_ComputeClusterData(cluster_data, node_data); if (ret != 0) { - mssErrorf(0, "Cluster", "Search computation failed due to missing clusters."); + mssErrorf(0, "Cluster", "SearchData computation failed due to missing clusters."); goto err; } @@ -2608,7 +2607,7 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) search_data->Threshold, dups )); - if (dups_temp == NULL) goto err; + if (dups_temp == NULL) goto err_free; else dups = dups_temp; } } @@ -2639,7 +2638,7 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) search_data->Threshold, dups )); - if (dups_temp == NULL) goto err; + if (dups_temp == NULL) goto err_free; else dups = dups_temp; } } @@ -2651,10 +2650,10 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) "Unknown similarity meansure \"%s\".", ci_SimilarityMeasureToString(search_data->SimilarityMeasure) ); - goto err; + goto err_free; } timer_stop(timer); - if (dups_temp == NULL) goto err; + if (dups_temp == NULL) goto err_free; else dups = dups_temp; tprintf("Search done after %.4lf.\n", timer_get(timer)); @@ -2671,7 +2670,7 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) /** Success. 
**/ return 0; - err: + err_free: if (dups != NULL) { for (unsigned int i = 0u; i < dups->nItems; i++) @@ -2682,7 +2681,8 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) check(xaFree(dups)); /* Failure ignored. */ } - mssErrorf(0, "Cluster", "Search computation failed for \"%s\".", search_data->Name); + err: + mssErrorf(0, "Cluster", "SearchData computation failed for \"%s\".", search_data->Name); return -1; } @@ -2770,7 +2770,7 @@ static int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData err: mssErrorf(1, "Cluster", - "Failed to get parameter %s : %s", + "Failed to get parameter ['%s' : %s]", attr_name, ci_TypeToStr(datatype) ); return -1; @@ -3040,7 +3040,7 @@ void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) case TARGET_ROOT: mssErrorf(1, "Cluster", "Querying the root node of a cluster file is not allowed."); fprintf(stderr, " > Hint: Try / or /\n"); - return NULL; + goto err; case TARGET_CLUSTER: { @@ -3049,8 +3049,8 @@ void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) ret = ci_ComputeClusterData(target, cluster_query->DriverData->NodeData); if (ret != 0) { - mssErrorf(0, "Cluster", "Internal cluster computation failed."); - return NULL; + mssErrorf(0, "Cluster", "Failed to compute ClusterData for query."); + goto err; } data_amount = target->nClusters; break; @@ -3063,8 +3063,8 @@ void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) ret = ci_ComputeSearchData(target, cluster_query->DriverData->NodeData); if (ret != 0) { - mssErrorf(0, "Cluster", "Internal search computation failed."); - return NULL; + mssErrorf(0, "Cluster", "Failed to compute SearchData for query."); + goto err; } data_amount = target->nDups; break; @@ -3073,29 +3073,33 @@ void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) case TARGET_CLUSTER_ENTRY: case TARGET_SEARCH_ENTRY: mssErrorf(1, "Cluster", "Querying a query result is not 
allowed."); - return NULL; + goto err; default: mssErrorf(1, "Cluster", "Unknown target type %u.", target_type); - return NULL; + goto err; } tprintf("Fetch Index: %u/16 (total: %u)\n", cluster_query->RowIndex, data_amount); - /** Cap results to 16 for faster debugging. TODO: Remove. **/ - data_amount = min(data_amount, 16); + /** Cap results to 16 for faster debugging. TODO: Israel - Remove. **/ +// data_amount = min(data_amount, 16); /** Check that the requested data exists, returning null if we've reached the end of the data. **/ if (cluster_query->RowIndex >= data_amount) return NULL; /** Create the result struct. **/ pDriverData driver_data = check_ptr(nmMalloc(sizeof(DriverData))); - if (driver_data == NULL) return NULL; + if (driver_data == NULL) goto err; memcpy(driver_data, cluster_query->DriverData, sizeof(DriverData)); driver_data->TargetType = new_target_type; driver_data->TargetIndex = cluster_query->RowIndex++; /** Success. **/ return driver_data; + + err: + mssErrorf(0, "Cluster", "Failed to fetch query result."); + return NULL; } @@ -3259,7 +3263,7 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val if (datatype != expected_datatype) { mssErrorf(1, "Cluster", - "Type mismatch: Accessing attribute '%s' : %s as type %s.", + "Type mismatch: Accessing attribute ['%s' : %s] as type %s.", attr_name, ci_TypeToStr(expected_datatype), ci_TypeToStr(datatype) ); return -1; @@ -4168,7 +4172,7 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx if (param->String == NULL) { mssErrorf(1, "Cluster", - "param : \"show\" | \"show_all\" | \"drop_all\" is required for the cache method." + "[param : \"show\" | \"show_all\" | \"drop_all\"] is required for the cache method." ); goto err; } @@ -4247,7 +4251,7 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx /** Unknown parameter. 
**/ mssErrorf(1, "Cluster", - "Expected param : \"show\" | \"show_all\" | \"drop_all\" the cache method, but got: \"%s\"", + "Expected [param : \"show\" | \"show_all\" | \"drop_all\"] for the cache method, but got: \"%s\"", param->String ); goto err; From 63a4dc224ae126d38c6902d118d2e92872d2490a Mon Sep 17 00:00:00 2001 From: Israel Date: Fri, 7 Nov 2025 11:10:31 -0700 Subject: [PATCH 07/30] Add warning for providing an invalid parameter. Modify cluster files to use string keys. Build vectors fully sparsely. Add ca_fprint_vector(). Add snprint_llu(). Add exp_fn_trim(). Update exp_fn_cmp(). Organize exp function definitions by group. Add statistics tracking to cluster driver. Reduce minimum hint threshold. Add array handling to ci_xaToTrimmedArray(). Update timer to handle multiple starts and stops properly. --- centrallix-lib/include/clusters.h | 13 +- centrallix-lib/include/util.h | 4 +- centrallix-lib/src/clusters.c | 321 ++++++++---- centrallix-lib/src/util.c | 37 +- centrallix-os/cluster-schema.cluster | 9 +- centrallix/expression/exp_functions.c | 325 ++++++++---- centrallix/osdrivers/objdrv_cluster.c | 698 +++++++++++++++----------- 7 files changed, 896 insertions(+), 511 deletions(-) diff --git a/centrallix-lib/include/clusters.h b/centrallix-lib/include/clusters.h index 288f81714..8338cd5e0 100644 --- a/centrallix-lib/include/clusters.h +++ b/centrallix-lib/include/clusters.h @@ -41,7 +41,11 @@ #include "cxlib/xarray.h" #endif -#define CA_NUM_DIMS 251 /* aka. The vector table size. */ +/*** 2147483629 is the signed int max, and is also a prime number. + *** Using this value ensures that the longest run of 0s will not + *** cause an int underflow with the current encoding scheme. + ***/ +#define CA_NUM_DIMS 251 //2147483629 /* aka. The vector table size. */ /// LINK ../../centrallix-sysdoc/string_comparison.md#cosine_charsets /** The character used to create a pair with the first and last characters of a string. 
**/ @@ -55,8 +59,8 @@ typedef double* pCentroid; /* Dense centroid. */ /** Duplocate information. **/ typedef struct { - unsigned int id1; - unsigned int id2; + void* key1; + void* key2; double similarity; } Dup, *pDup; @@ -70,6 +74,7 @@ typedef struct pVector ca_build_vector(const char* str); unsigned int ca_sparse_len(const pVector vector); +void ca_print_vector(const pVector vector); void ca_free_vector(pVector sparse_vector); int ca_kmeans( pVector* vectors, @@ -105,12 +110,14 @@ pXArray ca_sliding_search( const unsigned int window_size, const double (*similarity)(void*, void*), const double dupe_threshold, + void** maybe_keys, pXArray dups); pXArray ca_complete_search( void** data, const unsigned int num_data, const double (*similarity)(void*, void*), const double dupe_threshold, + void** maybe_keys, pXArray dups); #endif /* End of .h file. */ diff --git a/centrallix-lib/include/util.h b/centrallix-lib/include/util.h index dd821767f..1f286cc26 100644 --- a/centrallix-lib/include/util.h +++ b/centrallix-lib/include/util.h @@ -25,11 +25,12 @@ extern "C" { unsigned int strtoui(const char *nptr, char **endptr, int base); char* snprint_bytes(char* buf, const size_t buf_size, unsigned int bytes); + char* snprint_llu(char* buf, size_t buflen, unsigned long long value); void fprint_mem(FILE* out); typedef struct { - double start, end; + double start, total; } Timer, *pTimer; @@ -38,6 +39,7 @@ extern "C" { pTimer timer_start(pTimer timer); pTimer timer_stop(pTimer timer); double timer_get(pTimer timer); + pTimer timer_reset(pTimer timer); void timer_de_init(pTimer timer); void timer_free(pTimer timer); diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index ef1222873..d61a558c7 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -50,18 +50,31 @@ /*** Gets the hash, representing a pair of ASCII characters, represented by unsigned ints. *** Thank you to professor John Delano for this hashing algorithm. 
*** - *** @param num1 The first character in the pair. - *** @param num1 The second character in the pair. + *** @param c1 The first character in the pair. + *** @param c2 The second character in the pair. *** @returns The resulting hash. ***/ -static unsigned int hash_char_pair(const unsigned int num1, const unsigned int num2) +static unsigned int hash_char_pair(const char c1, const char c2) { - const double sum = (num1 * num1 * num1) + (num2 * num2 * num2); - const double scale = ((double)num1 + 1.0) / ((double)num2 + 1.0); + const double sum = (c1 * c1 * c1) + (c2 * c2 * c2); + const double scale = ((double)c1 + 1.0) / ((double)c2 + 1.0); const unsigned int hash = (unsigned int)round(sum * scale) - 1u; return hash % CA_NUM_DIMS; } +typedef struct + { + unsigned char c1, c2; + unsigned int hash; + } + CharPair, *pCharPair; + +static int charpair_cmp(const void *p1, const void *p2) + { + const CharPair *a = p1, *b = p2; + return a->hash - b->hash; + } + /*** Builds a vector using a string. *** *** Vectors are based on the frequencies of character pairs in the string. @@ -109,123 +122,185 @@ static unsigned int hash_char_pair(const unsigned int num1, const unsigned int n ***/ pVector ca_build_vector(const char* str) { - /** Allocate space for a dense vector. **/ - unsigned int dense_vector[CA_NUM_DIMS] = {0u}; - - /** j is the former character, i is the latter. **/ - const unsigned int num_chars = (unsigned int)strlen(str); - for (unsigned int j = 65535u, i = 0u; i <= num_chars; i++) + char chars[strlen(str) + 2u]; + unsigned int num_chars = 0u; + chars[num_chars++] = CA_BOUNDARY_CHAR; /* Starting boundary character. 
*/ + for (const char* char_ptr = str; *char_ptr != '\0'; char_ptr++) { - /** isspace: space, \n, \v, \f, \r **/ - if (isspace(str[i])) continue; - - /** ispunct: !"#$%&'()*+,-./:;<=>?@[\]^_{|}~ **/ - if (ispunct(str[i]) && str[i] != CA_BOUNDARY_CHAR) continue; + unsigned char c = *char_ptr; - /*** iscntrl (0-8): NULL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS - *** (14-31): SO, SI, DLE, DC1-4, NAK, SYN, ETB, CAN EM, - *** SUB, ESC, FS, GS, RS, US - ***/ - if (iscntrl(str[i]) && i != num_chars) - { - fprintf(stderr, - "ca_build_vector(%s) - Warning: Skipping unknown character #%u.\n", - str, (unsigned int)str[i] - ); - continue; - } + /** Always consider boundary character in string. **/ + if (c == CA_BOUNDARY_CHAR) goto skip_checks; - /** First and last character should fall one before 'a' in the ASCII table. **/ - unsigned int temp1 = (j == 65535u) ? CA_BOUNDARY_CHAR : (unsigned int)tolower(str[j]); - unsigned int temp2 = (i == num_chars) ? CA_BOUNDARY_CHAR : (unsigned int)tolower(str[i]); + /** Ignore insignificant characters: spaces and punctuation. **/ + if (isspace(c)) continue; /* space, \n, \v, \f, \r */ + if (ispunct(c)) continue; /* !"#$%&'()*+,-./:;<=>?@[\]^_{|}~ */ + skip_checks: /** Shift numbers to the end of the lowercase letters. **/ - if ('0' <= temp1 && temp1 <= '9') temp1 += 75u; - if ('0' <= temp2 && temp2 <= '9') temp2 += 75u; + if ('0' <= c && c <= '9') c += 75u; - /** Hash the character pair into an index (dimension). **/ - /** Note that temp will be between 97 ('a') and 132 ('9'). **/ - unsigned int dim = hash_char_pair(temp1, temp2); - - /** Increment the dimension of the dense vector by a number from 1 to 13. **/ - dense_vector[dim] += (temp1 + temp2) % 13u + 1u; - - j = i; + /** Store the character. **/ + chars[num_chars++] = tolower(c); } + chars[num_chars++] = CA_BOUNDARY_CHAR; /* Ending boundary character. */ - /** Count how much space is needed for a sparse vector. 
**/ - bool zero_prev = false; - size_t size = 0u; - for (unsigned int dim = 0u; dim < CA_NUM_DIMS; dim++) + /** Compute char pairs. **/ + CharPair char_pairs[num_chars]; + const unsigned int num_pairs = num_chars - 1u; + for (unsigned int i = 0u; i < num_pairs; i++) { - if (dense_vector[dim] == 0u) - { - size += (zero_prev) ? 0u : 1u; - zero_prev = true; - } - else - { - size++; - zero_prev = false; - } + /** Store characters. **/ + char_pairs[i].c1 = chars[i]; + char_pairs[i].c2 = chars[i + 1]; + + /** Hash the character pair into an index (dimension). **/ + /** Note that the passed value should always be between 97 ('a') and 132 ('9'). **/ + char_pairs[i].hash = hash_char_pair(chars[i], chars[i + 1]); } - /*** Check compression size. - *** If this check fails, I doubt anything will break. However, the longest - *** word I know (supercalifragilisticexpialidocious) has only 35 character - *** pairs, so it shouldn't reach half this size (and it'd be even shorter - *** if the hash generates at least one collision). - *** - *** Bad vector compression will result in degraded performace and increased - *** memory usage. This indicates a likely bug in the code. Thus, if this - *** warning is ever generated, it is definitely worth investigating. - ***/ - const size_t expected_max_size = 64u; - if (size > expected_max_size) - { - fprintf(stderr, - "cli_build_vector(\"%s\") - Warning: Sparse vector larger than expected.\n" - " > Size: %lu\n" - " > #Dims: %u\n", - str, - size, - CA_NUM_DIMS - ); - } + /** Sort char_pairs by hash value. **/ + qsort(char_pairs, num_pairs, sizeof(CharPair), charpair_cmp); - /** Allocate space for sparse vector. **/ - const size_t sparse_vector_size = size * sizeof(int); - pVector sparse_vector = (pVector)check_ptr(nmSysMalloc(sparse_vector_size)); + /** Allocate space for the sparce vector. 
**/ + pVector sparse_vector = (pVector)check_ptr(nmSysMalloc((num_pairs * 2u + 1u) * sizeof(int))); if (sparse_vector == NULL) return NULL; - /** Convert the dense vector above to a sparse vector. **/ - unsigned int j = 0u, sparse_idx = 0u; - while (j < CA_NUM_DIMS) - { - if (dense_vector[j] == 0u) - { - /*** Count and store consecutive zeros, except the first one, - *** which we already know is zero. - ***/ - unsigned int zero_count = 1u; - j++; - while (j < CA_NUM_DIMS && dense_vector[j] == 0u) - { - zero_count++; - j++; - } - sparse_vector[sparse_idx++] = (int)-zero_count; - } - else + /** Build the sparse vector. **/ + unsigned int cur = 0u, dim = 0u; + for (unsigned int i = 0u; i < num_pairs;) + { + unsigned int hash = char_pairs[i].hash; + + /** Proceed through the pairs until we find a unique hash. **/ + /** Dividing value by 2 each time reduces the impact of repeated pairs. **/ + int value = 0; + for (; i < num_pairs && char_pairs[i].hash == hash; i++) + value = (value / 2) + ((unsigned int)char_pairs[i].c1 + (unsigned int)char_pairs[i].c2) % 13u + 1u; + + /** Skip zeros to reach the dimension index specified by the hash. **/ + unsigned int num_zeros = hash - dim; + if (num_zeros > 0u) { - /** Store the value. **/ - sparse_vector[sparse_idx++] = (int)dense_vector[j++]; + sparse_vector[cur++] = (int)-num_zeros; + dim = hash; } + + /** Add the value to the sparse vector. **/ + sparse_vector[cur++] = value; + dim++; } + if (dim != CA_NUM_DIMS) sparse_vector[cur++] = -(CA_NUM_DIMS - dim); - return sparse_vector; + /** Trim extra space wasted by identical hashes. **/ + pVector trimmed_sparse_vector = (pVector)check_ptr(nmSysRealloc(sparse_vector, cur * sizeof(int))); + if (trimmed_sparse_vector == NULL) return NULL; + + return trimmed_sparse_vector; } +// Build vector by converting a dense vector to a sparse one. +//pVector ca_build_vector_old(const char* str) +// { +// /** Allocate space for a dense vector. 
**/ +// unsigned int dense_vector[CA_NUM_DIMS] = {0u}; +// +// /** j is the former character, i is the latter. **/ +// const unsigned int num_chars = (unsigned int)strlen(str); +// for (unsigned int j = 65535u, i = 0u; i <= num_chars; i++) +// { +// if (isspace(str[i])) continue; +// if (ispunct(str[i]) && str[i] != CA_BOUNDARY_CHAR) continue; +// +// /** First and last character should fall one before 'a' in the ASCII table. **/ +// unsigned int temp1 = (j == 65535u) ? CA_BOUNDARY_CHAR : (unsigned int)tolower(str[j]); +// unsigned int temp2 = (i == num_chars) ? CA_BOUNDARY_CHAR : (unsigned int)tolower(str[i]); +// +// /** Shift numbers to the end of the lowercase letters. **/ +// if ('0' <= temp1 && temp1 <= '9') temp1 += 75u; +// if ('0' <= temp2 && temp2 <= '9') temp2 += 75u; +// +// /** Hash the character pair into an index (dimension). **/ +// /** Note that temp will be between 97 ('a') and 132 ('9'). **/ +// unsigned int dim = hash_char_pair(temp1, temp2); +// +// /** Increment the dimension of the dense vector by a number from 1 to 13. **/ +// dense_vector[dim] += (temp1 + temp2) % 13u + 1u; +// +// j = i; +// } +// +// /** Count how much space is needed for a sparse vector. **/ +// bool zero_prev = false; +// size_t size = 0u; +// for (unsigned int dim = 0u; dim < CA_NUM_DIMS; dim++) +// { +// if (dense_vector[dim] == 0u) +// { +// size += (zero_prev) ? 0u : 1u; +// zero_prev = true; +// } +// else +// { +// size++; +// zero_prev = false; +// } +// } +// +// /*** Check compression size. +// *** If this check fails, I doubt anything will break. However, the longest +// *** word I know (supercalifragilisticexpialidocious) has only 35 character +// *** pairs, so it shouldn't reach half this size (and it'd be even shorter +// *** if the hash generates at least one collision). +// *** +// *** Bad vector compression will result in degraded performace and increased +// *** memory usage. This indicates a likely bug in the code. 
Thus, if this +// *** warning is ever generated, it is definitely worth investigating. +// ***/ +// const size_t expected_max_size = 256u; +// if (size > expected_max_size) +// { +// fprintf(stderr, +// "cli_build_vector(\"%s\") - Warning: Sparse vector larger than expected.\n" +// " > Size: %lu\n" +// " > #Dims: %u\n", +// str, +// size, +// CA_NUM_DIMS +// ); +// } +// +// /** Allocate space for sparse vector. **/ +// const size_t sparse_vector_size = size * sizeof(int); +// pVector sparse_vector = (pVector)check_ptr(nmSysMalloc(sparse_vector_size)); +// if (sparse_vector == NULL) return NULL; +// +// /** Convert the dense vector above to a sparse vector. **/ +// unsigned int dim = 0u, sparse_idx = 0u; +// while (dim < CA_NUM_DIMS) +// { +// if (dense_vector[dim] == 0u) +// { +// /** Count and store consecutive zeros, skipping the first one. **/ +// unsigned int zero_count = 1u; +// dim++; +// while (dim < CA_NUM_DIMS && dense_vector[dim] == 0u) +// { +// zero_count++; +// dim++; +// } +// sparse_vector[sparse_idx++] = (int)-zero_count; +// } +// else +// { +// /** Store the value. **/ +// sparse_vector[sparse_idx++] = (int)dense_vector[dim++]; +// } +// } +// +// return sparse_vector; +// } + /*** Free memory allocated to store a sparse vector. *** *** @param sparse_vector The sparse vector being freed. @@ -256,6 +331,21 @@ unsigned int ca_sparse_len(const pVector vector) return i; } +/*** Print the underlying implementation values sparsely allocated + *** vector (intended for debugging). + *** + *** @param out File to print to. + *** @param vector The vector. + ***/ +void ca_fprint_vector(FILE* out, const pVector vector) + { + const unsigned int len = ca_sparse_len(vector); + fprintf(out, "Vector: [%d", vector[0]); + for (unsigned int i = 1u; i < len; i++) + fprintf(out, ", %d", vector[i]); + fprintf(out, "]"); + } + /*** Compute the magnitude of a sparsely allocated vector. *** *** @param vector The vector. 
@@ -911,6 +1001,9 @@ void* ca_most_similar( *** the data param and returns their similarity. *** @param threshold The minimum threshold required for a duplocate to be *** included in the returned xArray. + *** @param maybe_keys A pointer to an array of keys, with one key per data. + *** These will be used to fill in the key1 and key2 attributes for each + *** struct. If this variable is null, these values are also left null. *** @param maybe_dups A pointer to an xArray in which dups should be found. *** Pass NULL to allocate a new one. *** @returns An xArray holding all of the duplocates found. If maybe_dups is @@ -922,11 +1015,12 @@ pXArray ca_sliding_search( const unsigned int window_size, const double (*similarity)(void*, void*), const double threshold, - pXArray dups) + void** maybe_keys, + pXArray maybe_dups) { /** Allocate space for dups (if necessary). **/ - const bool allocate_dups = (dups == NULL); - if (allocate_dups) + pXArray dups = maybe_dups; + if (dups == NULL) { /** Guess that we will need space for num_data * 2 dups. **/ const int guess_size = num_data * 2; @@ -955,8 +1049,11 @@ pXArray ca_sliding_search( glyph(find); Dup* dup = (Dup*)check_ptr(nmMalloc(sizeof(Dup))); if (dup == NULL) goto err_free_dups; - dup->id1 = i; - dup->id2 = j; + if (maybe_keys != NULL) + { + dup->key1 = maybe_keys[i]; + dup->key2 = maybe_keys[j]; + } dup->similarity = sim; if (!check_neg(xaAddItem(dups, (void*)dup))) goto err_free_dups; } @@ -973,7 +1070,7 @@ pXArray ca_sliding_search( /** Free the dups we added to the XArray. */ while (dups->nItems > num_starting_dups) nmFree(dups->Items[dups->nItems--], sizeof(Dup)); - if (allocate_dups) check(xaDeInit(dups)); /* Failure ignored. */ + if (maybe_dups == NULL) check(xaDeInit(dups)); /* Failure ignored. */ err: return NULL; @@ -990,6 +1087,9 @@ pXArray ca_sliding_search( *** the data param and returns their similarity. 
*** @param threshold The minimum threshold required for a duplocate to be *** included in the returned xArray. + *** @param maybe_keys A pointer to an array of keys, with one key per data. + *** These will be used to fill in the key1 and key2 attributes for each + *** struct. If this variable is null, these values are also left null. *** @param maybe_dups A pointer to an xArray in which dups should be found. *** Pass NULL to allocate a new one. *** @returns An xArray holding all of the duplocates found. If maybe_dups is @@ -1000,9 +1100,10 @@ pXArray ca_complete_search( const unsigned int num_data, const double (*similarity)(void*, void*), const double threshold, - pXArray dups) + void** maybe_keys, + pXArray maybe_dups) { - return ca_sliding_search(data, num_data, num_data, similarity, threshold, dups); + return ca_sliding_search(data, num_data, num_data, similarity, threshold, maybe_keys, maybe_dups); } /** Scope cleanup. **/ diff --git a/centrallix-lib/src/util.c b/centrallix-lib/src/util.c index 450c16593..b18361280 100644 --- a/centrallix-lib/src/util.c +++ b/centrallix-lib/src/util.c @@ -155,6 +155,32 @@ char* snprint_bytes(char* buf, const size_t buf_size, unsigned int bytes) } #undef nUints +char* snprint_llu(char* buf, size_t buflen, unsigned long long value) + { + if (buflen == 0) return NULL; + if (value == 0) + { + if (buflen > 1) { buf[0] = '0'; buf[1] = '\0'; } + else buf[0] = '\0'; + return buf; + } + + char tmp[32]; + unsigned int ti = 0; + while (value > 0 && ti < sizeof(tmp) - 1) + { + if (ti % 4 == 3) tmp[ti++] = ','; + tmp[ti++] = '0' + (value % 10); + value /= 10; + } + tmp[ti] = '\0'; + + unsigned int outlen = min(ti, buflen - 1u); + for (unsigned int i = 0u; i < outlen; i++) buf[i] = tmp[ti - i - 1]; + buf[outlen] = '\0'; + return buf; + } + void fprint_mem(FILE* out) { FILE* fp = fopen("/proc/self/statm", "r"); @@ -192,7 +218,7 @@ pTimer timer_init(pTimer timer) { if (timer == NULL) return NULL; timer->start = NAN; - timer->end = NAN; + 
timer->total = 0.0; return timer; } @@ -211,13 +237,18 @@ pTimer timer_start(pTimer timer) pTimer timer_stop(pTimer timer) { if (!timer) return timer; - timer->end = get_time(); + timer->total += get_time() - timer->start; return timer; } double timer_get(pTimer timer) { - return (timer) ? timer->end - timer->start : NAN; + return (timer) ? timer->total : NAN; + } + +pTimer timer_reset(pTimer timer) + { + return timer_init(timer); } void timer_de_init(pTimer timer) {} diff --git a/centrallix-os/cluster-schema.cluster b/centrallix-os/cluster-schema.cluster index e87ae6b5f..277e2bb12 100644 --- a/centrallix-os/cluster-schema.cluster +++ b/centrallix-os/cluster-schema.cluster @@ -14,7 +14,8 @@ file_name "system/cluster" ... source : DataSourcePath - attr_name : string ⊂ DataSourcePath/columns + key_attr : string ⊂ DataSourcePath/columns + data_attr : string ⊂ DataSourcePath/columns cluster_name "cluster/cluster" { @@ -54,10 +55,8 @@ file_name "system/cluster" ... /search_name - /{query} - - /id1 : uint < sizeof(source/attr_name) // The id of the first data point. - - /id2 : uint < sizeof(source/attr_name) // The id of the second data point. - - /val1 : string // The value of the first data point. - - /val2 : string // The value of the second data point. + - /key1 : string // The key of the first data point. + - /key2 : string // The key of the second data point. - /sim : double && 0.0 < x <= threshold // The similarity of the two data points. ... diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index 751e07297..71f906e3d 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -74,16 +74,30 @@ #include "expression.h" #include "obj.h" -/** Duplocate detection settings. 
**/ -// #define SEPARATOR "|" -// #define SEPARATOR_CHAR '|' -// #define DBL_BUF_SIZE 16u -// #define USE_PARALLEL_COMPLETE_SEARCH true -// #define MIN_PARALLEL_COMPLETE_SEARCH 1000 -// #define MAX_COMPLETE_SEARCH 50 * 1000 // Default: 100 * 1000 -// #define KMEANS_IMPROVEMENT_THRESHOLD 0.0002 -#define EXP_NUM_DIMS 251 /* aka. The size of the vector table. */ -const int EXP_VECTOR_TABLE_SIZE = EXP_NUM_DIMS; /* Should probably be removed. */ + +/** TODO: I think this should be moved to datatypes. **/ +/** Should maybe replace duplocate functionality elsewhere. **/ +static char* ci_TypeToStr(const int type) + { + switch (type) + { + case DATA_T_UNAVAILABLE: return "Unknown"; + case DATA_T_INTEGER: return "Integer"; + case DATA_T_STRING: return "String"; + case DATA_T_DOUBLE: return "Double"; + case DATA_T_DATETIME: return "DateTime"; + case DATA_T_INTVEC: return "IntVecor"; + case DATA_T_STRINGVEC: return "StringVector"; + case DATA_T_MONEY: return "Money"; + case DATA_T_ARRAY: return "Array"; + case DATA_T_CODE: return "Code"; + case DATA_T_BINARY: return "Binary"; + } + + /** Invalid type. **/ + mssErrorf(1, "Cluster", "Invalid type %d.\n", type); + return "Invalid"; /* Shall not parse to a valid type in ci_TypeFromStr(). */ + } /****** Evaluator functions follow for expEvalFunction ******/ @@ -1239,6 +1253,31 @@ int exp_fn_rtrim(pExpression tree, pParamObjects objlist, pExpression i0, pExpre } +int exp_fn_trim(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) + { + int ret; + + /** Invoke left trim. **/ + ret = exp_fn_ltrim(tree, objlist, i0, i1, i2); + if (ret != 0) + { + mssErrorf(0, "EXP", "Failed to left trim (error code: %d).", ret); + return ret; + } + + /** Invoke right trim. **/ + ret = exp_fn_rtrim(tree, objlist, i0, i1, i2); + if (ret != 0) + { + mssErrorf(0, "EXP", "Failed to right trim (error code: %d).", ret); + return ret; + } + + /** Success. 
**/ + return 0; + } + + int exp_fn_right(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) { int n,i; @@ -4105,10 +4144,8 @@ int exp_fn_nth(pExpression tree, pParamObjects objlist, pExpression i0, pExpress *** @param is_cos Whether to compute cosine or levenshtien. *** @returns 0 for success, -1 for failure. ***/ -static int exp_fn_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1, bool is_cos) +static int exp_fn_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1, const char* fn_name) { - const char fn_name[] = "cos_cmp"; - /** Check number of arguments. **/ const int num_params = tree->Children.nItems; if (num_params != 2) @@ -4138,49 +4175,33 @@ static int exp_fn_cmp(pExpression tree, pParamObjects objlist, pExpression maybe /** Extract str1. **/ if (maybe_str1->Flags & EXPR_F_NULL) { - mssErrorf(1, "EXP", "%s(NULL, ...) str1 cannot be NULL.", fn_name); - return -1; + tree->Flags |= EXPR_F_NULL; + tree->DataType = DATA_T_DOUBLE; + return 0; } if (maybe_str1->DataType != DATA_T_STRING) { mssErrorf(1, "EXP", "%s(\?\?\?, ..) str1 should be a string.", fn_name); return -1; } - char* str1 = maybe_str1->String; - if (str1 == NULL) - { - mssErrorf(1, "EXP", - "%s(nothing?, ...) expected string from str1 (of datatype DataType = " - "DATA_T_STRING), but the String was NULL or did not exist!", - fn_name - ); - return -1; - } + char* str1 = check_ptr(maybe_str1->String); /** Extract str2. **/ if (maybe_str2->Flags & EXPR_F_NULL) { - mssErrorf(1, "EXP", "%s(\"%s\", NULL) str2 cannot be NULL.", fn_name, str1); - return -1; + tree->Flags |= EXPR_F_NULL; + tree->DataType = DATA_T_DOUBLE; + return 0; } if (maybe_str2->DataType != DATA_T_STRING) { mssErrorf(1, "EXP", "%s(\"%s\", \?\?\?) 
str2 should be a string.", fn_name, str1); return -1; } - char* str2 = maybe_str2->String; - if (str2 == NULL) - { - mssErrorf(1, "EXP", - "%s(\"%s\", nothing?) expected string from str2 (of datatype DataType = " - "DATA_T_STRING), but the String was NULL or did not exist!", - fn_name, str1 - ); - return -1; - } + char* str2 = check_ptr(maybe_str2->String); /** Handle either cos_cmp or lev_cmp. **/ - if (is_cos) + if (fn_name[0] == 'c') { /* cos_cmp */ int ret; @@ -4215,35 +4236,20 @@ static int exp_fn_cmp(pExpression tree, pParamObjects objlist, pExpression maybe return -1; } -/*** Computes cosine similarity between two strings. - *** - *** @param tree The tree resulting from this function. - *** @param objlist The evaluation "scope", including available variables. - *** @param maybe_str1 Possibly the first string. - *** @param maybe_str2 Possibly the second string. - *** @param u1 Unused parameter. - *** @returns 0 for success, -1 for failure. - ***/ -int exp_fn_cos_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) - { - return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, true); - } -/*** Computes levenshtein similarity by normalizing the levenshtein edit - *** distance between two strings with the length of the longer string. - *** - *** @param tree The tree resulting from this function. - *** @param objlist The evaluation "scope", including available variables. - *** @param maybe_str1 Possibly the first string. - *** @param maybe_str2 Possibly the second string. - *** @param u1 Unused parameter. - *** @returns 0 for success, -1 for failure. 
- ***/ +int exp_fn_cos_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) + { return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, "cos_cmp"); } +int exp_fn_cos_compare(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) + { return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, "cos_compare"); } +int exp_fn_cosine_compare(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) + { return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, "cosine_compare"); } int exp_fn_lev_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) - { - return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, false); - } - + { return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, "lev_cmp"); } +int exp_fn_lev_compare(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) + { return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, "lev_compare"); } +int exp_fn_levenshtein_compare(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) + { return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, "levenshtein_compare"); } + /*** Comparse two strings to see if their sparse vectors are equal. *** @@ -4435,6 +4441,110 @@ int exp_fn_double_metaphone(pExpression tree, pParamObjects objlist, pExpression return 0; } + +int exp_fn_aggregate_similarities(pExpression tree, pParamObjects objlist) + { + const char fn_name[] = "aggregate_similarities"; + + /** Check number of arguments. **/ + const int num_params = tree->Children.nItems; + if (num_params != 6) + { + mssErrorf(1, "EXP", "%s(?) expects 6 parameters, got %d parameters.", fn_name, num_params); + return -1; + } + + /** Magic checks. 
**/ + ASSERTMAGIC(tree, MGK_EXPRESSION); + ASSERTMAGIC(tree->Children.Items[0], MGK_EXPRESSION); + ASSERTMAGIC(tree->Children.Items[1], MGK_EXPRESSION); + ASSERTMAGIC(tree->Children.Items[2], MGK_EXPRESSION); + ASSERTMAGIC(tree->Children.Items[3], MGK_EXPRESSION); + + /** Check object list. **/ + if (objlist == NULL) + { + mssErrorf(1, "EXP", "%s(\?\?\?) no object list?", fn_name); + return -1; + } + ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); + + /** Extract parameters. **/ + double params[4] = {NAN}; + const char names[4][8] = {"name", "email", "phone", "address"}; + for (unsigned int i = 0; i < 4u; i++) + { + pExpression param = (pExpression)tree->Children.Items[i]; + + /** Ignore null values. **/ + if (param->Flags & EXPR_F_NULL) continue; + + /** Only accept doubles. **/ + if (param->DataType != DATA_T_DOUBLE) + { + mssErrorf(1, "EXP", + "%s() param%u (%s) expected type %s but got %s.", + fn_name, i, names[i], ci_TypeToStr(DATA_T_DOUBLE), ci_TypeToStr(param->DataType) + ); + if (param->DataType == DATA_T_INTEGER) fprintf(stderr, "Value: %d\n", param->Integer); + return -1; + } + + /** Do not accept NaN. **/ + params[i] = param->Types.Double; + if (isnan(params[i])) + { + mssErrorf(1, "EXP", "%s() param%u (%s) cannot be NaN", fn_name, names[i], i); + return -1; + } + } + + char* dup_names[2] = {NULL}; + for (unsigned int i = 0; i < 2u; i++) + { + pExpression param = (pExpression)tree->Children.Items[i + 4u]; + + /** Ignore null values. **/ + if (param->Flags & EXPR_F_NULL) continue; + + /** Only accept doubles. 
**/ + if (param->DataType != DATA_T_STRING) + { + mssErrorf(1, "EXP", + "%s() param%u expected type %s but got %s.", + fn_name, i, ci_TypeToStr(DATA_T_STRING), ci_TypeToStr(param->DataType) + ); + return -1; + } + + dup_names[i] = param->String; + } + + FILE *f = check_ptr(fopen("/home/israel/exp_log.swift", "a")); + check_neg(fprintf(f, "aggregate_similarities(%g, %g, %g, %g, \"%s\", \"%s\")", params[0], params[1], params[2], params[3], dup_names[0], dup_names[1])); + + /** Compute aggregated similarity. **/ + double name_sim = params[0]; + double email_sim = params[1]; + double phone_sim = params[2]; + double address_sim = params[3]; + + double mean = 0.0, n = 0.0; + if (name_sim > 0.0) { mean += name_sim; n++; } + if (email_sim > 0.0) { mean += email_sim; n++; } + if (phone_sim > 0.0) { mean += phone_sim; n++; } + if (address_sim > 0.0) { mean += address_sim; n++; } + mean /= n; + + /** Success. **/ + tree->DataType = DATA_T_DOUBLE; + tree->Types.Double = mean; + fprintf(f, " = %g\n", tree->Types.Double); + check(fclose(f)); + return 0; + } + + /* * exp_fn_argon2id * This method hashes a given password using the Argon2 algorithm (ID variant) @@ -4557,27 +4667,42 @@ int exp_fn_argon2id(pExpression tree, pParamObjects objlist, pExpression passwor int exp_internal_DefineFunctions() { - /** Initialize library **/ + /** Initialize library. **/ ca_init(); - /** Function list for EXPR_N_FUNCTION nodes **/ - xhAdd(&EXP.Functions, "getdate", (char*)exp_fn_getdate); + /** Function list for EXPR_N_FUNCTION nodes. **/ + + /** General. 
**/ xhAdd(&EXP.Functions, "user_name", (char*)exp_fn_user_name); xhAdd(&EXP.Functions, "convert", (char*)exp_fn_convert); xhAdd(&EXP.Functions, "wordify", (char*)exp_fn_wordify); xhAdd(&EXP.Functions, "abs", (char*)exp_fn_abs); xhAdd(&EXP.Functions, "ascii", (char*)exp_fn_ascii); xhAdd(&EXP.Functions, "condition", (char*)exp_fn_condition); - xhAdd(&EXP.Functions, "charindex", (char*)exp_fn_charindex); - xhAdd(&EXP.Functions, "upper", (char*)exp_fn_upper); - xhAdd(&EXP.Functions, "lower", (char*)exp_fn_lower); - xhAdd(&EXP.Functions, "mixed", (char*)exp_fn_mixed); - xhAdd(&EXP.Functions, "char_length", (char*)exp_fn_char_length); - xhAdd(&EXP.Functions, "datepart", (char*)exp_fn_datepart); xhAdd(&EXP.Functions, "isnull", (char*)exp_fn_isnull); + xhAdd(&EXP.Functions, "eval", (char*)exp_fn_eval); + xhAdd(&EXP.Functions, "truncate", (char*)exp_fn_truncate); + xhAdd(&EXP.Functions, "constrain", (char*)exp_fn_constrain); + xhAdd(&EXP.Functions, "has_endorsement", (char*)exp_fn_has_endorsement); + xhAdd(&EXP.Functions, "rand", (char*)exp_fn_rand); + xhAdd(&EXP.Functions, "nullif", (char*)exp_fn_nullif); + xhAdd(&EXP.Functions, "hash", (char*)exp_fn_hash); + xhAdd(&EXP.Functions, "hmac", (char*)exp_fn_hmac); + xhAdd(&EXP.Functions, "pbkdf2", (char*)exp_fn_pbkdf2); + xhAdd(&EXP.Functions, "octet_length", (char*)exp_fn_octet_length); + xhAdd(&EXP.Functions, "argon2id",(char*)exp_fn_argon2id); + + /** Dates. **/ + xhAdd(&EXP.Functions, "getdate", (char*)exp_fn_getdate); + xhAdd(&EXP.Functions, "datepart", (char*)exp_fn_datepart); + xhAdd(&EXP.Functions, "dateadd", (char*)exp_fn_dateadd); + xhAdd(&EXP.Functions, "datediff", (char*)exp_fn_datediff); + + /** Strings. 
**/ xhAdd(&EXP.Functions, "ltrim", (char*)exp_fn_ltrim); xhAdd(&EXP.Functions, "lztrim", (char*)exp_fn_lztrim); xhAdd(&EXP.Functions, "rtrim", (char*)exp_fn_rtrim); + xhAdd(&EXP.Functions, "trim", (char*)exp_fn_trim); xhAdd(&EXP.Functions, "substring", (char*)exp_fn_substring); xhAdd(&EXP.Functions, "right", (char*)exp_fn_right); xhAdd(&EXP.Functions, "ralign", (char*)exp_fn_ralign); @@ -4587,12 +4712,22 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "escape", (char*)exp_fn_escape); xhAdd(&EXP.Functions, "quote", (char*)exp_fn_quote); xhAdd(&EXP.Functions, "substitute", (char*)exp_fn_substitute); - xhAdd(&EXP.Functions, "eval", (char*)exp_fn_eval); + xhAdd(&EXP.Functions, "upper", (char*)exp_fn_upper); + xhAdd(&EXP.Functions, "lower", (char*)exp_fn_lower); + xhAdd(&EXP.Functions, "mixed", (char*)exp_fn_mixed); + xhAdd(&EXP.Functions, "char_length", (char*)exp_fn_char_length); + xhAdd(&EXP.Functions, "charindex", (char*)exp_fn_charindex); + xhAdd(&EXP.Functions, "dateformat", (char*)exp_fn_dateformat); + xhAdd(&EXP.Functions, "moneyformat", (char*)exp_fn_moneyformat); + + /** Numbering systems (e.g. base 16 aka. hex, base 8 aka. octal, etc.). **/ + xhAdd(&EXP.Functions, "to_base64", (char*)exp_fn_to_base64); + xhAdd(&EXP.Functions, "from_base64", (char*)exp_fn_from_base64); + xhAdd(&EXP.Functions, "to_hex", (char*)exp_fn_to_hex); + xhAdd(&EXP.Functions, "from_hex", (char*)exp_fn_from_hex); + + /** Math. 
**/ xhAdd(&EXP.Functions, "round", (char*)exp_fn_round); - xhAdd(&EXP.Functions, "dateadd", (char*)exp_fn_dateadd); - xhAdd(&EXP.Functions, "datediff", (char*)exp_fn_datediff); - xhAdd(&EXP.Functions, "truncate", (char*)exp_fn_truncate); - xhAdd(&EXP.Functions, "constrain", (char*)exp_fn_constrain); xhAdd(&EXP.Functions, "sin", (char*)exp_fn_sin); xhAdd(&EXP.Functions, "cos", (char*)exp_fn_cos); xhAdd(&EXP.Functions, "tan", (char*)exp_fn_tan); @@ -4604,41 +4739,28 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "square", (char*)exp_fn_square); xhAdd(&EXP.Functions, "degrees", (char*)exp_fn_degrees); xhAdd(&EXP.Functions, "radians", (char*)exp_fn_radians); - xhAdd(&EXP.Functions, "has_endorsement", (char*)exp_fn_has_endorsement); - xhAdd(&EXP.Functions, "rand", (char*)exp_fn_rand); - xhAdd(&EXP.Functions, "nullif", (char*)exp_fn_nullif); - xhAdd(&EXP.Functions, "dateformat", (char*)exp_fn_dateformat); - xhAdd(&EXP.Functions, "moneyformat", (char*)exp_fn_moneyformat); - xhAdd(&EXP.Functions, "hash", (char*)exp_fn_hash); - xhAdd(&EXP.Functions, "hmac", (char*)exp_fn_hmac); xhAdd(&EXP.Functions, "log10", (char*)exp_fn_log10); xhAdd(&EXP.Functions, "ln", (char*)exp_fn_log_natural); xhAdd(&EXP.Functions, "logn", (char*)exp_fn_log_base_n); xhAdd(&EXP.Functions, "power", (char*)exp_fn_power); - xhAdd(&EXP.Functions, "pbkdf2", (char*)exp_fn_pbkdf2); - xhAdd(&EXP.Functions, "to_base64", (char*)exp_fn_to_base64); - xhAdd(&EXP.Functions, "from_base64", (char*)exp_fn_from_base64); - xhAdd(&EXP.Functions, "to_hex", (char*)exp_fn_to_hex); - xhAdd(&EXP.Functions, "from_hex", (char*)exp_fn_from_hex); - xhAdd(&EXP.Functions, "octet_length", (char*)exp_fn_octet_length); - xhAdd(&EXP.Functions, "argon2id",(char*)exp_fn_argon2id); - /** Duplicate Detection **/ + /** Duplicate detection. 
**/ xhAdd(&EXP.Functions, "cos_cmp", (char*)exp_fn_cos_cmp); - xhAdd(&EXP.Functions, "cos_compare", (char*)exp_fn_cos_cmp); - xhAdd(&EXP.Functions, "cosine_compare", (char*)exp_fn_cos_cmp); + xhAdd(&EXP.Functions, "cos_compare", (char*)exp_fn_cos_compare); + xhAdd(&EXP.Functions, "cosine_compare", (char*)exp_fn_cosine_compare); xhAdd(&EXP.Functions, "lev_cmp", (char*)exp_fn_lev_cmp); - xhAdd(&EXP.Functions, "lev_compare", (char*)exp_fn_lev_cmp); - xhAdd(&EXP.Functions, "levenshtein_compare", (char*)exp_fn_lev_cmp); + xhAdd(&EXP.Functions, "lev_compare", (char*)exp_fn_lev_compare); + xhAdd(&EXP.Functions, "levenshtein_compare", (char*)exp_fn_levenshtein_compare); xhAdd(&EXP.Functions, "sparse_eql", (char*)exp_fn_sparse_eql); + xhAdd(&EXP.Functions, "aggregate_similarities", (char*)exp_fn_aggregate_similarities); xhAdd(&EXP.Functions, "double_metaphone", (char*)exp_fn_double_metaphone); - /** Windowing **/ + /** Windowing. **/ xhAdd(&EXP.Functions, "row_number", (char*)exp_fn_row_number); xhAdd(&EXP.Functions, "dense_rank", (char*)exp_fn_dense_rank); xhAdd(&EXP.Functions, "lag", (char*)exp_fn_lag); - /** Aggregate **/ + /** Aggregate. **/ xhAdd(&EXP.Functions, "count", (char*)exp_fn_count); xhAdd(&EXP.Functions, "avg", (char*)exp_fn_avg); xhAdd(&EXP.Functions, "sum", (char*)exp_fn_sum); @@ -4648,7 +4770,8 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "last", (char*)exp_fn_last); xhAdd(&EXP.Functions, "nth", (char*)exp_fn_nth); - /** Reverse functions **/ + + /** Reverse functions. **/ xhAdd(&EXP.ReverseFunctions, "isnull", (char*)exp_fn_reverse_isnull); return 0; diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index 6d1aaf313..b3d416668 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -159,7 +160,7 @@ void mssErrorf(int clr, char* module, const char* format, ...) 
*** *** LINK ../../centrallix-lib/include/datatypes.h:72 ***/ -int ci_TypeFromStr(const char* str) +static int ci_TypeFromStr(const char* str) { /** All valid types are non-null strings, at least 2 characters long. **/ if (str == NULL || str[0] == '\0' || str[1] == '\0') return -1; @@ -211,7 +212,7 @@ int ci_TypeFromStr(const char* str) /** TODO: I think this should be moved to datatypes. **/ /** Should maybe replace duplocate functionality elsewhere. **/ -char* ci_TypeToStr(const int type) +static char* ci_TypeToStr(const int type) { switch (type) { @@ -234,17 +235,37 @@ char* ci_TypeToStr(const int type) } /** TODO: I think this should be moved to xarray. **/ -/** Contract: Return value is null iff pXArray has 0 items. **/ -void** ci_xaToTrimmedArray(pXArray arr) +/*** Trims an xArray, returning a new array (with nmSysMalloc). + *** + *** @param arr The array to be trimmed. + *** @param cleanup 0: No clean up. + *** 1: DeInit arr. + *** 2: Free arr. + *** *: Any other value prints a warning and does nothing. + *** @returns The new array, or null if and only if the passed pXArray has 0 items. + ***/ +static void** ci_xaToTrimmedArray(pXArray arr, int array_handling) { - if (arr->nItems == 0) { - mssErrorf(1, "Cluster", "Failed to trim XArray of length 0."); - return NULL; - } - const size_t arr_size = arr->nItems * sizeof(void*); void** result = check_ptr(nmSysMalloc(arr_size)); + if (result == NULL) return NULL; memcpy(result, arr->Items, arr_size); + + /** Handle the array. **/ + switch (array_handling) + { + case 0: break; + case 1: check(xaDeInit(arr)); arr->nAlloc = 0; break; /* Failure ignored. */ + case 2: check(xaFree(arr)); break; /* Failure ignored. */ + default: + /** Uh oh, there might be a memory leak... 
**/ + fprintf(stderr, + "Warning: ci_xaToTrimmedArray(%p, %d) - Unknown value (%d) for array_handling.\n", + arr, array_handling, array_handling + ); + break; + } + return result; } @@ -376,8 +397,8 @@ char* const ATTR_CLUSTER_ENTRY[] = }; char* const ATTR_SEARCH_ENTRY[] = { - "val1", - "val2", + "key1", + "key2", "sim", END_OF_ARRAY, }; @@ -397,15 +418,20 @@ char* const METHOD_NAME[] = *** *** Memory Stats: *** - Padding: 4 bytes - *** - Total size: 72 bytes + *** - Total size: 80 bytes *** *** @skip --> Attribute Data. *** @param Name The source name, specified in the .cluster file. *** @param Key The key associated with this object in the SourceDataCache. *** @param SourcePath The path to the data source from which to retrieve data. - *** @param AttrName The name of the attribute to get from the data source. + *** @param KeyAttr The name of the attribute to use when getting keys from + *** the SourcePath. + *** @param NameAttr The name of the attribute to use when getting data from + *** the SourcePath. *** *** @skip --> Computed data. + *** @param Strings The keys for each data string strings recieved from the + *** database, allowing them to be lined up again when queried. *** @param Strings The data strings to be clustered and searched, or NULL if *** they have not been fetched from the source. *** @param Vectors The cosine comparison vectors from the fetched data, or @@ -416,14 +442,16 @@ char* const METHOD_NAME[] = *** *** @skip --> Time. *** @param DateCreated The date and time that this object was created and initialized. - *** @param DateComputed The date and time that the Labels field was computed. + *** @param DateComputed The date and time that the computed attributes were computed. ***/ typedef struct _SOURCE { char* Name; char* Key; char* SourcePath; - char* AttrName; + char* KeyAttr; + char* NameAttr; + char** Keys; char** Strings; pVector* Vectors; unsigned int nVectors; @@ -468,7 +496,7 @@ typedef struct *** each clustering iteration. 
If there is less improvement, the algorithm *** will stop. The "max" in a .cluster file is represented by -inf. *** @param MaxIterations The maximum number of iterations that a clustering - *** algorithm can run for. Note: Sliding window uses this field to store + *** algorithm can run for. Note: Sliding window uses this attribute to store *** the window_size. *** *** @skip --> Relationship Data. @@ -481,12 +509,12 @@ typedef struct *** @param Clusters An array of length num_clusters, NULL if the clusters *** have not yet been computed. *** @param Sims An array of num_vectors elements, where index i stores the - *** similarity of vector i to its assigned cluster. This field is NULL + *** similarity of vector i to its assigned cluster. This attribute is NULL *** if the clusters have not yet been computed. *** *** @skip --> Time. *** @param DateCreated The date and time that this object was created and initialized. - *** @param DateComputed The date and time that the Labels field was computed. + *** @param DateComputed The date and time that the computed attributes were computed. ***/ typedef struct _CLUSTER { @@ -530,7 +558,7 @@ typedef struct _CLUSTER *** *** @skip --> Time. *** @param DateCreated The date and time that this object was created and initialized. - *** @param DateComputed The date and time that the Dups field was computed. + *** @param DateComputed The date and time that the computed attributes were computed. 
***/ typedef struct _SEARCH { @@ -653,6 +681,20 @@ struct } ClusterDriverCaches; +struct + { + unsigned long long OpenCalls; + unsigned long long OpenQueryCalls; + unsigned long long FetchCalls; + unsigned long long CloseCalls; + unsigned long long GetTypeCalls; + unsigned long long GetValCalls; + unsigned long long GetValCalls_name; + unsigned long long GetValCalls_key1; + unsigned long long GetValCalls_key2; + unsigned long long GetValCalls_sim; + } ClusterStatistics; + /** ================ Function Declarations ================ **/ /** ANCHOR[id=functions] **/ @@ -661,6 +703,8 @@ struct /** Parsing Functions. **/ // LINK #parsing +static void ci_GiveHint(const char* hint); +static bool ci_TryHint(char* value, char** valid_values, const unsigned int n_valid_values); static int ci_ParseAttribute(pStructInf inf, char* attr_name, int datatype, pObjData data, pParamObjects param_list, bool required, bool print_type_error); static ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf cluster_inf, pParamObjects param_list); static SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf cluster_inf, pParamObjects param_list); @@ -741,7 +785,8 @@ static void ci_GiveHint(const char* hint) fprintf(stderr, " > Hint: Did you mean \"%s\"?\n", hint); } -/*** Given the user a hint when they specify an invalid string for a field + +/*** Given the user a hint when they specify an invalid string for an attribute *** where we know the list of valid strings. The hint is only displayed if *** their string is close enough to a valid string. *** @@ -753,11 +798,12 @@ static void ci_GiveHint(const char* hint) ***/ static bool ci_TryHint(char* value, char** valid_values, const unsigned int n_valid_values) { - char* guess = ca_most_similar(value, (void**)valid_values, n_valid_values, ca_lev_compare, 0.5); + char* guess = ca_most_similar(value, (void**)valid_values, n_valid_values, ca_lev_compare, 0.25); if (guess == NULL) return false; /* No hint. */ /** Issue hint. 
**/ ci_GiveHint(guess); + tprintf(" > Similarity: %.4g\n", ca_lev_compare(value, guess)); return true; } @@ -845,7 +891,7 @@ static int ci_ParseAttribute( // LINK #functions -/*** Parses a ClusteringAlgorithm from the algorithm field in the pStructInf +/*** Parses a ClusteringAlgorithm from the algorithm attribute in the pStructInf *** representing some structure with that attribute in a parsed structure file. *** *** @attention - Promises that a failure invokes mssError() at least once. @@ -892,7 +938,7 @@ static ClusterAlgorithm ci_ParseClusteringAlgorithm(pStructInf inf, pParamObject // LINK #functions -/*** Parses a SimilarityMeasure from the similarity_measure field in the given +/*** Parses a SimilarityMeasure from the similarity_measure attribute in the given *** pStructInf parameter, which represents some structure with that attribute *** in a parsed structure file. *** @@ -951,78 +997,70 @@ static SimilarityMeasure ci_ParseSimilarityMeasure(pStructInf inf, pParamObjects ***/ static pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, char* path) { - char* buf; + char* buf = NULL; + + /** Allocate SourceData. **/ + pSourceData source_data = check_ptr(nmMalloc(sizeof(SourceData))); + if (source_data == NULL) goto err_free; + memset(source_data, 0, sizeof(SourceData)); + + /** Initialize obvious values for SourceData. **/ + source_data->Name = check_ptr(nmSysStrdup(inf->Name)); + if (source_data->Name == NULL) goto err_free; + if (!check(objCurrentDate(&source_data->DateCreated))) goto err_free; /** Get source. 
**/ - if (ci_ParseAttribute(inf, "source", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err; - char* source_path = check_ptr(nmSysStrdup(buf)); - if (source_path == NULL) goto err; + if (ci_ParseAttribute(inf, "source", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err_free; + source_data->SourcePath = check_ptr(nmSysStrdup(buf)); + if (source_data->SourcePath == NULL) goto err_free; - /** Get attribute name. **/ - if (ci_ParseAttribute(inf, "attr_name", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err; - char* attr_name = check_ptr(nmSysStrdup(buf)); - if (attr_name == NULL) goto err_free_path; + /** Get the attribute name to use when querying keys from the source. **/ + if (ci_ParseAttribute(inf, "key_attr", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err_free; + source_data->KeyAttr = check_ptr(nmSysStrdup(buf)); + if (source_data->KeyAttr == NULL) goto err_free; + + /** Get the attribute name to use for querying data from the source. **/ + if (ci_ParseAttribute(inf, "data_attr", DATA_T_STRING, POD(&buf), param_list, true, true) != 0) goto err_free; + source_data->NameAttr = check_ptr(nmSysStrdup(buf)); + if (source_data->NameAttr == NULL) goto err_free; /** Create cache entry key. **/ - const size_t len = strlen(path) + strlen(source_path) + strlen(attr_name) + 3lu; - char* key = check_ptr(nmSysMalloc(len * sizeof(char))); - if (key == NULL) goto err_free_attr; - snprintf(key, len, "%s?%s:%s", path, source_path, attr_name); + const size_t len = strlen(path) + strlen(source_data->SourcePath) + strlen(source_data->KeyAttr) + strlen(source_data->NameAttr) + 5lu; + source_data->Key = check_ptr(nmSysMalloc(len * sizeof(char))); + if (source_data->Key == NULL) goto err_free; + snprintf(source_data->Key, len, "%s?%s->%s:%s", path, source_data->SourcePath, source_data->KeyAttr, source_data->NameAttr); /** Check for a cached version. 
**/ - pSourceData source_maybe = (pSourceData)xhLookup(&ClusterDriverCaches.SourceDataCache, key); + pSourceData source_maybe = (pSourceData)xhLookup(&ClusterDriverCaches.SourceDataCache, source_data->Key); if (source_maybe != NULL) { /** Cache hit. **/ - tprintf("# source: \"%s\"\n", key); + tprintf("# source: \"%s\"\n", source_data->Key); /** Cause an imediate invalid read if cache was incorrectly freed. **/ tprintf("--> Name: %s\n", source_maybe->Name); /** Free data we don't need. **/ - nmSysFree(source_path); - nmSysFree(attr_name); - nmSysFree(key); + nmSysFree(source_data->Key); + ci_FreeSourceData(source_data); /** Return the cached source data. **/ return source_maybe; } - /** Cache miss: Create a new source data object. **/ - pSourceData source_data = check_ptr(nmMalloc(sizeof(SourceData))); - if (source_data == NULL) goto err_free_key; - memset(source_data, 0, sizeof(SourceData)); - source_data->Key = key; - source_data->SourcePath = source_path; - source_data->AttrName = attr_name; - source_data->Name = check_ptr(nmSysStrdup(inf->Name)); - if (source_data->Name == NULL) goto err_free_source; - if (!check(objCurrentDate(&source_data->DateCreated))) goto err_free_source; - - /** Add the new object to the cache for next time. **/ - tprintf("+ source: \"%s\"\n", key); - if (!check(xhAdd(&ClusterDriverCaches.SourceDataCache, key, (void*)source_data))) - goto err_free_source; + /** Cache miss: Add the new object to the cache for next time. **/ + tprintf("+ source: \"%s\"\n", source_data->Key); + if (!check(xhAdd(&ClusterDriverCaches.SourceDataCache, source_data->Key, (void*)source_data))) + goto err_free; /** Success. **/ return source_data; /** Error handling. 
**/ - err_free_source: - ci_FreeSourceData(source_data); - nmSysFree(key); - goto err; - - err_free_key: - nmSysFree(key); - - err_free_attr: - nmSysFree(attr_name); - - err_free_path: - nmSysFree(source_path); + err_free: + if (source_data->Key != NULL) nmSysFree(source_data->Key); + if (source_data != NULL) ci_FreeSourceData(source_data); - err: mssErrorf(0, "Cluster", "Failed to parse source data from group \"%s\" in file: %s", inf->Name, path @@ -1241,10 +1279,7 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) } } cluster_data->nSubClusters = sub_clusters.nItems; - cluster_data->SubClusters = (cluster_data->nSubClusters > 0u) ? - (pClusterData*)ci_xaToTrimmedArray(&sub_clusters) - : NULL; /* No sub-clusters. */ - check(xaDeInit(&sub_clusters)); /* Failure ignored. */ + cluster_data->SubClusters = (pClusterData*)ci_xaToTrimmedArray(&sub_clusters, 1); /** Create the cache key. **/ parsing_done:; @@ -1592,7 +1627,8 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject parent) /** Valid attribute names. **/ char* attrs[] = { "source", - "attr_name", + "key_attr", + "data_attr", }; const unsigned int nattrs = sizeof(attrs) / sizeof(char*); @@ -1695,16 +1731,7 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject parent) /** Check each provided param to see if the user provided value. **/ for (unsigned int j = 0u; j < num_provided_params; j++) { - pStruct provided_param = provided_params[j]; - if (provided_param == NULL) - { - mssErrorf(1, "Cluster", "Provided param struct cannot be NULL."); - fprintf(stderr, - "Debug info: obj->Pathname->OpenCtl[%d]->SubInf[%u] is NULL", - parent->SubPtr - 1, j - ); - goto err_free_arrs; - } + pStruct provided_param = check_ptr(provided_params[j]); /* Failure ignored. */ /** If this provided param value isn't for the param, ignore it. 
**/ if (strcmp(provided_param->Name, param->Name) != 0) continue; @@ -1724,7 +1751,7 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject parent) ); goto err_free_arrs; } - tprintf("Found provided value for %s, which is now %d\n", param->Name, param->Value->Data.Integer); + tprintf("Found provided value for %s of type %s\n", param->Name, ci_TypeToStr(param->Type)); /** Provided value successfully handled, we're done. **/ break; @@ -1744,6 +1771,28 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject parent) check(xaDeInit(¶m_infs)); /* Failure ignored. */ param_infs.nAlloc = 0; + /** Iterate over provided parameters and warn the user if they specified a parameter that does not exist. **/ + for (unsigned int i = 0u; i < num_provided_params; i++) + { + pStruct provided_param = check_ptr(provided_params[i]); /* Failure ignored. */ + char* provided_name = provided_param->Name; + + /** Look to see if this provided param actually exists for this driver instance. **/ + for (unsigned int j = 0u; j < node_data->nParams; j++) + if (strcmp(provided_name, node_data->Params[j]->Name) == 0) + goto next_provided_param; + + /** This param doesn't exist, warn the user and attempt to give them a hint. **/ + fprintf(stderr, "Warning: Unknown provided parameter '%s' for cluster file: %s.\n", provided_name, ci_file_name(parent)); + char** param_names = check_ptr(nmSysMalloc(node_data->nParams * sizeof(char*))); + for (unsigned int j = 0u; j < node_data->nParams; j++) + param_names[j] = node_data->Params[j]->Name; + ci_TryHint(provided_name, param_names, node_data->nParams); + nmSysFree(param_names); + + next_provided_param:; + } + /** Parse source data. 
**/ node_data->SourceData = ci_ParseSourceData(inf, node_data->ParamList, path); if (node_data->SourceData == NULL) goto err_free_arrs; @@ -1826,10 +1875,15 @@ static void ci_FreeSourceData(pSourceData source_data) nmSysFree(source_data->SourcePath); source_data->SourcePath = NULL; } - if (source_data->AttrName != NULL) + if (source_data->KeyAttr != NULL) + { + nmSysFree(source_data->KeyAttr); + source_data->KeyAttr = NULL; + } + if (source_data->NameAttr != NULL) { - nmSysFree(source_data->AttrName); - source_data->AttrName = NULL; + nmSysFree(source_data->NameAttr); + source_data->NameAttr = NULL; } /** Free fetched data, if it exists. **/ @@ -2060,7 +2114,8 @@ static unsigned int ci_SizeOfSourceData(pSourceData source_data) unsigned int size = 0u; if (source_data->Name != NULL) size += strlen(source_data->Name) * sizeof(char); if (source_data->SourcePath != NULL) size += strlen(source_data->SourcePath) * sizeof(char); - if (source_data->AttrName != NULL) size += strlen(source_data->AttrName) * sizeof(char); + if (source_data->KeyAttr != NULL) size += strlen(source_data->KeyAttr) * sizeof(char); + if (source_data->NameAttr != NULL) size += strlen(source_data->NameAttr) * sizeof(char); if (source_data->Strings != NULL) { for (unsigned int i = 0u; i < source_data->nVectors; i++) @@ -2175,9 +2230,9 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) { mssErrorf(0, "Cluster", "Failed to open object driver:\n" - " > Attribute: ['%s' : String]\n" - " > Source Path: \"%s\"", - source_data->AttrName, + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n", + source_data->KeyAttr, source_data->NameAttr, source_data->SourcePath ); goto end; @@ -2190,85 +2245,86 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) { mssErrorf(0, "Cluster", "Failed to open query:\n" - " > Attribute: ['%s' : String]\n" - " > Driver Used: %s\n" - " > Source Path: \"%s\"", - source_data->AttrName, - obj->Driver->Name, - 
source_data->SourcePath + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name ); goto end_close; } /** Initialize an xarray to store the retrieved data. **/ - XArray data_xarray, vector_xarray; + XArray key_xarray, data_xarray, vector_xarray; + memset(&key_xarray, 0, sizeof(XArray)); memset(&data_xarray, 0, sizeof(XArray)); memset(&vector_xarray, 0, sizeof(XArray)); - if (!check(xaInit(&data_xarray, 64))) goto end_close_query; + if (!check(xaInit(&key_xarray, 64))) goto end_close_query; + if (!check(xaInit(&data_xarray, 64))) goto end_free_data; if (!check(xaInit(&vector_xarray, 64))) goto end_free_data; /** Fetch data and build vectors. **/ tprintf("Skips: "); - unsigned int i = 0u; while (true) { pObject entry = objQueryFetch(query, O_RDONLY); if (entry == NULL) break; /* Done. */ - /** Type checking. **/ - const int datatype = objGetAttrType(entry, source_data->AttrName); - if (datatype == -1) + /** Data value: Type checking. 
**/ + const int data_datatype = objGetAttrType(entry, source_data->NameAttr); + if (data_datatype == -1) { mssErrorf(0, "Cluster", "Failed to get type for %uth entry:\n" - " > Attribute: '%s' : String\n" - " > Driver Used: %s\n" - " > Source Path: \"%s\"", - i, - source_data->AttrName, - obj->Driver->Name, - source_data->SourcePath + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name ); goto end_free_data; } - if (datatype != DATA_T_STRING) + if (data_datatype != DATA_T_STRING) { mssErrorf(1, "Cluster", "Type for %uth entry was not a string:\n" - " > Attribute: ['%s' : %s]\n" - " > Driver Used: %s\n" - " > Source Path: \"%s\"", - i, - source_data->AttrName, ci_TypeToStr(datatype), - obj->Driver->Name, - source_data->SourcePath + " > Attribute: ['%s':'%s' : %s]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, ci_TypeToStr(data_datatype), + source_data->SourcePath, + obj->Driver->Name ); goto end_free_data; } - /** Get value from database. **/ - char* val; - ret = objGetAttrValue(entry, source_data->AttrName, DATA_T_STRING, POD(&val)); + /** Data value: Get value from database. **/ + char* data; + ret = objGetAttrValue(entry, source_data->NameAttr, DATA_T_STRING, POD(&data)); if (ret != 0) { tprintf("\n"); mssErrorf(0, "Cluster", "Failed to value for %uth entry:\n" - " > Attribute: ['%s' : String]\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" " > Driver Used: %s\n" - " > Source Path: \"%s\"\n" - " > Error code: %d", - i, - source_data->AttrName, - obj->Driver->Name, + " > Error code: %d\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, source_data->SourcePath, + obj->Driver->Name, ret ); goto end_free_data; } /** Skip empty strings. 
**/ - if (strlen(val) == 0) + if (strlen(data) == 0) { tprintf("_"); check(fflush(stdout)); /* Failure ignored. */ @@ -2276,16 +2332,16 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) } /** Convert the string to a vector. **/ - pVector vector = ca_build_vector(val); + pVector vector = ca_build_vector(data); if (vector == NULL) { - mssErrorf(1, "Cluster", "Failed to build vectors for string \"%s\".", val); + mssErrorf(1, "Cluster", "Failed to build vectors for string \"%s\".", data); successful = false; goto end_free_data; } if (ca_is_empty(vector)) { - mssErrorf(1, "Cluster", "Vector building for string \"%s\" produced no character pairs.", val); + mssErrorf(1, "Cluster", "Vector building for string \"%s\" produced no character pairs.", data); successful = false; goto end_free_data; } @@ -2298,10 +2354,66 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) continue; } - /** Store value. **/ - char* dup_val = check_ptr(nmSysStrdup(val)); - if (dup_val == NULL) goto end_free_data; - if (!check_neg(xaAddItem(&data_xarray, (void*)dup_val))) goto end_free_data; + + /** Key value: Type checking. 
**/ + const int key_datatype = objGetAttrType(entry, source_data->KeyAttr); + if (key_datatype == -1) + { + mssErrorf(0, "Cluster", + "Failed to get type for key on %uth entry:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name + ); + goto end_free_data; + } + if (key_datatype != DATA_T_STRING) + { + mssErrorf(1, "Cluster", + "Type for key on %uth entry was not a string:\n" + " > Attribute: ['%s':'%s' : %s]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, ci_TypeToStr(key_datatype), + source_data->SourcePath, + obj->Driver->Name + ); + goto end_free_data; + } + + /** key value: Get value from database. **/ + char* key; + ret = objGetAttrValue(entry, source_data->KeyAttr, DATA_T_STRING, POD(&key)); + if (ret != 0) + { + tprintf("\n"); + mssErrorf(0, "Cluster", + "Failed to value for key on %uth entry:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n" + " > Error code: %d\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name, + ret + ); + goto end_free_data; + } + + /** Store values. **/ + char* key_dup = check_ptr(nmSysStrdup(key)); + if (key_dup == NULL) goto end_free_data; + char* data_dup = check_ptr(nmSysStrdup(data)); + if (data_dup == NULL) goto end_free_data; + if (!check_neg(xaAddItem(&key_xarray, (void*)key_dup))) goto end_free_data; + if (!check_neg(xaAddItem(&data_xarray, (void*)data_dup))) goto end_free_data; if (!check_neg(xaAddItem(&vector_xarray, (void*)vector))) goto end_free_data; /** Clean up. 
**/ @@ -2314,26 +2426,39 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) } tprintf("\nData aquired.\n"); source_data->nVectors = vector_xarray.nItems; + if (source_data->nVectors == 0) + { + mssErrorf(0, "Cluster", + "Data source path did not contain any valid data:\n" + " > Attribute: ['%s':'%s' : String]\n" + " > Source Path: %s\n" + " > Driver Used: %s\n", + vector_xarray.nItems, + source_data->KeyAttr, source_data->NameAttr, + source_data->SourcePath, + obj->Driver->Name + ); + } - /** Trim data and store data. **/ - const size_t data_size = source_data->nVectors * sizeof(char*); - source_data->Strings = check_ptr(nmSysMalloc(data_size)); + /** Trim and store: keys, data, and vectors. **/ + source_data->Keys = (char**)check_ptr(ci_xaToTrimmedArray(&key_xarray, 1)); + source_data->Strings = (char**)check_ptr(ci_xaToTrimmedArray(&data_xarray, 1)); + source_data->Vectors = (int**)check_ptr(ci_xaToTrimmedArray(&vector_xarray, 1)); + if (source_data->Keys == NULL) goto end_free_data; if (source_data->Strings == NULL) goto end_free_data; - memcpy(source_data->Strings, data_xarray.Items, data_size); - check(xaDeInit(&data_xarray)); /* Failure ignored. */ - data_xarray.nAlloc = 0; - - /** Trim data and store vectors. **/ - const size_t vectors_size = source_data->nVectors * sizeof(pVector); - source_data->Vectors = check_ptr(nmSysMalloc(vectors_size)); - memcpy(source_data->Vectors, vector_xarray.Items, vectors_size); - check(xaDeInit(&vector_xarray)); /* Failure ignored. */ - vector_xarray.nAlloc = 0; + if (source_data->Vectors == NULL) goto end_free_data; /** Success. **/ + fprintf(stderr, "[SourceData: %s] Compute done.\n", source_data->Name); successful = true; end_free_data: + if (key_xarray.nAlloc != 0) + { + for (unsigned int i = 0u; i < vector_xarray.nItems; i++) + nmSysFree(key_xarray.Items[i]); + check(xaDeInit(&key_xarray)); /* Failure ignored. 
*/ + } if (data_xarray.nAlloc != 0) { for (unsigned int i = 0u; i < data_xarray.nItems; i++) @@ -2464,7 +2589,7 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) cluster_data->Sims )); timer_stop(timer); - tprintf("Clustering done after %.4lf.\n", timer_get(timer)); + tprintf("Clustering done after %.4lfs.\n", timer_get(timer)); if (!successful) goto err_free_sims; /** Convert the labels into clusters. **/ @@ -2511,7 +2636,7 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) } /** Success. **/ - tprintf("Clustering done.\n"); + fprintf(stderr, "[ClusterData: %s] Compute done.\n", cluster_data->Name); return 0; err_free_sims: @@ -2564,16 +2689,6 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) goto err; } - /** Check for unimplemented similarity measures. **/ - if (search_data->SimilarityMeasure != SIMILARITY_COSINE) - { - mssErrorf(1, "Cluster", - "The similarity meausre \"%s\" is not implemented.", - ci_SimilarityMeasureToString(search_data->SimilarityMeasure) - ); - goto err; - } - /** Record the date and time. **/ if (!check(objCurrentDate(&search_data->DateComputed))) goto err; @@ -2593,6 +2708,7 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) cluster_data->MaxIterations, /* Window size. */ ca_cos_compare, search_data->Threshold, + (void**)cluster_data->SourceData->Keys, dups )); } @@ -2605,6 +2721,7 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) cluster_data->Clusters[i].Size, ca_cos_compare, search_data->Threshold, + (void**)cluster_data->SourceData->Keys, dups )); if (dups_temp == NULL) goto err_free; @@ -2624,6 +2741,7 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) cluster_data->MaxIterations, /* Window size. 
*/ ca_lev_compare, search_data->Threshold, + (void**)cluster_data->SourceData->Keys, dups )); } @@ -2636,6 +2754,7 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) cluster_data->Clusters[i].Size, ca_lev_compare, search_data->Threshold, + (void**)cluster_data->SourceData->Keys, dups )); if (dups_temp == NULL) goto err_free; @@ -2655,19 +2774,16 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) timer_stop(timer); if (dups_temp == NULL) goto err_free; else dups = dups_temp; - tprintf("Search done after %.4lf.\n", timer_get(timer)); + tprintf("Search done after %.4lfs.\n", timer_get(timer)); /** Store dups. **/ search_data->nDups = dups->nItems; search_data->Dups = (dups->nItems == 0) ? check_ptr(nmSysMalloc(0)) - : ci_xaToTrimmedArray(dups); - - /** Free unused data. **/ - tprintf("Cleanup.\n"); - check(xaFree(dups)); /* Failure ignored. */ + : ci_xaToTrimmedArray(dups, 2); /** Success. **/ + fprintf(stderr, "[SearchData: %s] Compute done.\n", search_data->Name); return 0; err_free: @@ -2808,6 +2924,7 @@ static int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData void* clusterOpen(pObject parent, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt) { tprintf("Warning: clusterOpen(\"%s\") is under active development.\n", ci_file_name(parent)); + ClusterStatistics.OpenCalls++; /** If CREAT and EXCL are specified, exclusively create it, failing if the file already exists. **/ pSnNode node_struct = NULL; @@ -2973,6 +3090,7 @@ int clusterClose(void* inf_v, pObjTrxTree* oxt) { tprintf("Warning: clusterClose() is under active development.\n"); pDriverData driver_data = (pDriverData)inf_v; + ClusterStatistics.CloseCalls++; /** Entries are shallow copies so we shouldn't do a deep free. 
**/ if (driver_data->TargetType == TARGET_CLUSTER_ENTRY @@ -3005,6 +3123,7 @@ int clusterClose(void* inf_v, pObjTrxTree* oxt) ***/ void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt) { + ClusterStatistics.OpenQueryCalls++; tprintf("Warning: clusterOpenQuery() is under active development.\n"); pClusterQuery cluster_query = check_ptr(nmMalloc(sizeof(ClusterQuery))); if (cluster_query == NULL) return NULL; @@ -3029,7 +3148,8 @@ void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt) void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) { int ret; - tprintf("Warning: clusterQueryFetch() is under active development.\n"); + ClusterStatistics.FetchCalls++; +// tprintf("Warning: clusterQueryFetch() is under active development.\n"); pClusterQuery cluster_query = (pClusterQuery)qy_v; /** Ensure that the data being fetched exists and is computed. **/ @@ -3114,7 +3234,7 @@ void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) ***/ int clusterQueryClose(void* qy_v, pObjTrxTree* oxt) { - tprintf("Warning: clusterQueryClose() is under active development.\n"); +// tprintf("Warning: clusterQueryClose() is under active development.\n"); nmFree(qy_v, sizeof(ClusterQuery)); return 0; @@ -3134,6 +3254,7 @@ int clusterQueryClose(void* qy_v, pObjTrxTree* oxt) int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) { pDriverData driver_data = (pDriverData)inf_v; + ClusterStatistics.GetTypeCalls++; /** Guard possible segfault. **/ if (attr_name == NULL) @@ -3142,8 +3263,8 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) return DATA_T_UNAVAILABLE; } - /** Performance shortcut for frequently requested attributes: val, val1, val2, and sim. **/ - if (attr_name[0] == 'v' || attr_name[0] == 's') goto handle_targets; + /** Performance shortcut for frequently requested attributes: key1, key2, and sim. 
**/ + if (attr_name[0] == 'k' || attr_name[0] == 's') goto handle_targets; /** Debug info. **/ if (oxt == NULL) tprintf(" > "); @@ -3171,7 +3292,8 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) { case TARGET_ROOT: if (strcmp(attr_name, "source") == 0 - || strcmp(attr_name, "attr_name") == 0) + || strcmp(attr_name, "data_attr") == 0 + || strcmp(attr_name, "key_attr") == 0) return DATA_T_STRING; break; @@ -3200,11 +3322,8 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) break; case TARGET_SEARCH_ENTRY: - if (strcmp(attr_name, "id1") == 0 - || strcmp(attr_name, "id2") == 0) - return DATA_T_INTEGER; - if (strcmp(attr_name, "val1") == 0 - || strcmp(attr_name, "val2") == 0) + if (strcmp(attr_name, "key1") == 0 + || strcmp(attr_name, "key2") == 0) return DATA_T_STRING; if (strcmp(attr_name, "sim") == 0) return DATA_T_DOUBLE; @@ -3241,6 +3360,7 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt) { pDriverData driver_data = (pDriverData)inf_v; + ClusterStatistics.GetValCalls++; /** Guard possible segfault. **/ if (attr_name == NULL) @@ -3249,9 +3369,8 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val return DATA_T_UNAVAILABLE; } - /** Performance shortcut for frequently requested attributes: val1, val2, and sim. **/ - if ( - (attr_name[0] == 'v' && datatype == DATA_T_STRING) /* val1, val2 : String */ + /** Performance shortcut for frequently requested attributes: key1, key2, and sim. **/ + if ((attr_name[0] == 'k' && datatype == DATA_T_STRING) /* key1, key2 : string */ || (attr_name[0] == 's' && datatype == DATA_T_DOUBLE) /* sim : double */ ) goto handle_targets; @@ -3272,6 +3391,7 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val /** Handle name and annotation. 
**/ if (strcmp(attr_name, "name") == 0) { + ClusterStatistics.GetValCalls_name++; switch (driver_data->TargetType) { case TARGET_ROOT: @@ -3336,7 +3456,7 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val case TARGET_ROOT: case TARGET_CLUSTER_ENTRY: case TARGET_SEARCH_ENTRY: - /** Field is not defined for this target type. **/ + /** Attribute is not defined for this target type. **/ return -1; case TARGET_CLUSTER: @@ -3356,7 +3476,7 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val case TARGET_ROOT: case TARGET_CLUSTER_ENTRY: case TARGET_SEARCH_ENTRY: - /** Field is not defined for this target type. **/ + /** Attribute is not defined for this target type. **/ return -1; case TARGET_CLUSTER: @@ -3393,9 +3513,14 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val val->String = ((pSourceData)driver_data->TargetData)->SourcePath; return 0; } - if (strcmp(attr_name, "attr_name") == 0) + if (strcmp(attr_name, "key_attr") == 0) + { + val->String = ((pSourceData)driver_data->TargetData)->KeyAttr; + return 0; + } + if (strcmp(attr_name, "name_attr") == 0) { - val->String = ((pSourceData)driver_data->TargetData)->AttrName; + val->String = ((pSourceData)driver_data->TargetData)->NameAttr; return 0; } break; @@ -3460,6 +3585,7 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val case TARGET_CLUSTER_ENTRY: { pClusterData target = (pClusterData)driver_data->TargetData; + pCluster target_cluster = &target->Clusters[driver_data->TargetIndex]; if (strcmp(attr_name, "items") == 0) { @@ -3468,7 +3594,6 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val if (vec != NULL) nmFree(vec, sizeof(StringVec)); /** Allocate and initiallize the requested data. 
**/ - pCluster target_cluster = &target->Clusters[driver_data->TargetIndex]; val->StringVec = vec = check_ptr(nmMalloc(sizeof(StringVec))); if (val->StringVec == NULL) return -1; val->StringVec->nStrings = target_cluster->Size; @@ -3485,37 +3610,22 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val pSearchData target = (pSearchData)driver_data->TargetData; pDup target_dup = target->Dups[driver_data->TargetIndex]; - if (strcmp(attr_name, "id1") == 0) - { - unsigned int value = target_dup->id1; - if (value > INT_MAX) - fprintf(stderr, "Warning: id1 value of %u exceeds INT_MAX (%d).\n", value, INT_MAX); - val->Integer = (int)value; - return 0; - } - if (strcmp(attr_name, "id2") == 0) - { - unsigned int value = target_dup->id2; - if (value > INT_MAX) - fprintf(stderr, "Warning: id2 value of %u exceeds INT_MAX (%d).\n", value, INT_MAX); - val->Integer = (int)value; - return 0; - } - if (strcmp(attr_name, "val1") == 0) + if (strcmp(attr_name, "sim") == 0) { - val->String = driver_data->NodeData->SourceData->Strings[target_dup->id1]; - // val->Integer = (int)target_dup->id1; + ClusterStatistics.GetValCalls_sim++; + val->Double = target_dup->similarity; return 0; } - if (strcmp(attr_name, "val2") == 0) + if (strcmp(attr_name, "key1") == 0) { - val->String = driver_data->NodeData->SourceData->Strings[target_dup->id2]; - // val->Integer = (int)target_dup->id2; + ClusterStatistics.GetValCalls_key1++; + val->String = target_dup->key1; return 0; } - if (strcmp(attr_name, "sim") == 0) + if (strcmp(attr_name, "key2") == 0) { - val->Double = target_dup->similarity; + ClusterStatistics.GetValCalls_key2++; + val->String = target_dup->key2; return 0; } break; @@ -3542,10 +3652,10 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val /*** Create a new presentation hints object, describing this attribute on the *** provided cluster driver instance. 
*** - *** Note: expCompileExpression() and nmSysStrdup() are run unchecked because - *** the worst case senario is that the fields are set to null and ignored, - *** which I consider to be better than ending the script because one of - *** them failed. + *** Note: Failures from nmSysStrdup() and several others are ignored because + *** the worst case senario is that the attributes are set to null, which + *** will cause them to be ignored. I consider that to be better than than + *** throwing an error that could unnecessarily disrupt normal usage. *** *** @param inf_v The driver instance to be read. *** @param attr_name The name of the requested attribute. @@ -3563,7 +3673,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb if (hints == NULL) goto err; memset(hints, 0, sizeof(ObjPresentationHints)); - /** Hints that are the same for all fields **/ + /** Hints that are the same for all attributes. **/ hints->GroupID = -1; hints->VisualLength2 = 1; hints->Style |= OBJ_PH_STYLE_READONLY | OBJ_PH_STYLE_CREATEONLY | OBJ_PH_STYLE_NOTNULL; @@ -3604,7 +3714,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb { hints->Length = 24; hints->VisualLength = 20; - hints->Format = nmSysStrdup("datetime"); + hints->Format = check_ptr(nmSysStrdup("datetime")); /* Failure ignored. */ goto success; } else goto unknown_attribute; @@ -3618,14 +3728,21 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb { hints->Length = _PC_PATH_MAX; hints->VisualLength = 64; - hints->FriendlyName = "Source Path"; + hints->FriendlyName = check_ptr(nmSysStrdup("Source Path")); /* Failure ignored. */ goto success; } - if (strcmp(attr_name, "attr_name") == 0) + if (strcmp(attr_name, "key_attr") == 0) { hints->Length = 255; hints->VisualLength = 32; - hints->FriendlyName = "Attribute Name"; + hints->FriendlyName = check_ptr(nmSysStrdup("Key Attribute Name")); /* Failure ignored. 
*/ + goto success; + } + if (strcmp(attr_name, "data_attr") == 0) + { + hints->Length = 255; + hints->VisualLength = 32; + hints->FriendlyName = check_ptr(nmSysStrdup("Data Attribute Name")); /* Failure ignored. */ goto success; } break; @@ -3640,7 +3757,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Other hints. **/ hints->Length = 8; hints->VisualLength = 4; - hints->FriendlyName = nmSysStrdup("Number of Clusters"); + hints->FriendlyName = check_ptr(nmSysStrdup("Number of Clusters")); /* Failure ignored. */ goto success; } if (strcmp(attr_name, "min_improvement") == 0) @@ -3653,7 +3770,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Other hints. **/ hints->Length = 16; hints->VisualLength = 8; - hints->FriendlyName = nmSysStrdup("Minimum Improvement Threshold"); + hints->FriendlyName = check_ptr(nmSysStrdup("Minimum Improvement Threshold")); /* Failure ignored. */ goto success; } if (strcmp(attr_name, "max_iterations") == 0) @@ -3666,15 +3783,15 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Other hints. **/ hints->Length = 8; hints->VisualLength = 4; - hints->FriendlyName = nmSysStrdup("Maximum Number of Clustering Iterations"); + hints->FriendlyName = check_ptr(nmSysStrdup("Maximum Iterations")); /* Failure ignored. */ goto success; } if (strcmp(attr_name, "algorithm") == 0) { /** Enum values. **/ - check(xaInit(&(hints->EnumList), nClusteringAlgorithms)); + check(xaInit(&(hints->EnumList), nClusteringAlgorithms)); /* Failure ignored. */ for (unsigned int i = 0u; i < nClusteringAlgorithms; i++) - check_neg(xaAddItem(&(hints->EnumList), &ALL_CLUSTERING_ALGORITHMS[i])); + check_neg(xaAddItem(&(hints->EnumList), &ALL_CLUSTERING_ALGORITHMS[i])); /* Failure ignored. */ /** Min and max values. 
**/ hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); @@ -3689,7 +3806,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Other hints. **/ hints->Length = 24; hints->VisualLength = 20; - hints->FriendlyName = nmSysStrdup("Clustering Algorithm"); + hints->FriendlyName = check_ptr(nmSysStrdup("Clustering Algorithm")); /* Failure ignored. */ goto success; } /** Fall-through: Start of overlapping region. **/ @@ -3698,9 +3815,9 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb if (strcmp(attr_name, "similarity_measure") == 0) { /** Enum values. **/ - check(xaInit(&(hints->EnumList), nSimilarityMeasures)); + check(xaInit(&(hints->EnumList), nSimilarityMeasures)); /* Failure ignored. */ for (unsigned int i = 0u; i < nSimilarityMeasures; i++) - check_neg(xaAddItem(&(hints->EnumList), &ALL_SIMILARITY_MEASURES[i])); + check_neg(xaAddItem(&(hints->EnumList), &ALL_SIMILARITY_MEASURES[i])); /* Failure ignored. */ /** Display flags. **/ hints->Style |= OBJ_PH_STYLE_BUTTONS; @@ -3715,7 +3832,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Other hints. **/ hints->Length = 32; hints->VisualLength = 20; - hints->FriendlyName = nmSysStrdup("Similarity Measure"); + hints->FriendlyName = check_ptr(nmSysStrdup("Similarity Measure")); /* Failure ignored. */ goto success; } @@ -3726,7 +3843,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb { hints->Length = 64; hints->VisualLength = 32; - hints->FriendlyName = nmSysStrdup("Source Cluster Name"); + hints->FriendlyName = check_ptr(nmSysStrdup("Source Cluster Name")); /* Failure ignored. */ goto success; } if (strcmp(attr_name, "threshold") == 0) @@ -3738,39 +3855,22 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Other hints. 
**/ hints->Length = 16; hints->VisualLength = 8; - hints->FriendlyName = nmSysStrdup("Similarity Threshold"); + hints->FriendlyName = check_ptr(nmSysStrdup("Similarity Threshold")); /* Failure ignored. */ goto success; } break; case TARGET_CLUSTER_ENTRY: { - pClusterData target = (pClusterData)driver_data->TargetData; + pClusterData target = (pClusterData)check_ptr(driver_data->TargetData); + if (target == NULL) goto err; - if (strcmp(attr_name, "id") == 0) - { - pSourceData source_data = (pSourceData)target->SourceData; - - /** Min and max values. **/ - hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); - if (source_data->Vectors != NULL) - { - char buf[16u]; - snprintf(buf, sizeof(buf), "%u", source_data->nVectors); - hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); - } - - /** Other hints. **/ - hints->Length = 8; - hints->VisualLength = 4; - goto success; - } - if (strcmp(attr_name, "val") == 0) + if (strcmp(attr_name, "items") == 0) { /** Other hints. **/ - hints->Length = 255; - hints->VisualLength = 32; - hints->FriendlyName = nmSysStrdup("Value"); + hints->Length = 65536; + hints->VisualLength = 256; + hints->FriendlyName = check_ptr(nmSysStrdup("Cluster Data")); /* Failure ignored. */ goto success; } if (strcmp(attr_name, "sim") == 0) @@ -3782,7 +3882,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Other hints. **/ hints->Length = 16; hints->VisualLength = 8; - hints->FriendlyName = nmSysStrdup("Similarity"); + hints->FriendlyName = check_ptr(nmSysStrdup("Similarity")); /* Failure ignored. 
*/ goto success; } break; @@ -3790,32 +3890,21 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb case TARGET_SEARCH_ENTRY: { - pSearchData target = (pSearchData)driver_data->TargetData; + pSearchData target = (pSearchData)check_ptr(driver_data->TargetData); + if (target == NULL) goto err; - if (strcmp(attr_name, "id1") == 0 || strcmp(attr_name, "id2") == 0) + if (strcmp(attr_name, "key1") == 0) { - pSourceData source_data = (pSourceData)target->Source->SourceData; - - /** Min and max values. **/ - hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); - if (source_data->Vectors != NULL) - { - char buf[16u]; - snprintf(buf, sizeof(buf), "%u", source_data->nVectors); - hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); - } - - /** Other hints. **/ - hints->Length = 8; - hints->VisualLength = 4; + hints->Length = 255; + hints->VisualLength = 32; + hints->FriendlyName = check_ptr(nmSysStrdup("Key 1")); /* Failure ignored. */ goto success; } - if (strcmp(attr_name, "val1") == 0 || strcmp(attr_name, "val2") == 0) + if (strcmp(attr_name, "key2") == 0) { - /** Other hints. **/ hints->Length = 255; hints->VisualLength = 32; - hints->FriendlyName = nmSysStrdup("Value"); + hints->FriendlyName = check_ptr(nmSysStrdup("Key 2")); /* Failure ignored. */ goto success; } if (strcmp(attr_name, "sim") == 0) @@ -3827,7 +3916,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Other hints. **/ hints->Length = 16; hints->VisualLength = 8; - hints->FriendlyName = nmSysStrdup("Similarity"); + hints->FriendlyName = check_ptr(nmSysStrdup("Similarity")); /* Failure ignored. */ goto success; } break; @@ -3841,7 +3930,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Unknown attribute. 
**/ unknown_attribute:; char* name; - clusterGetAttrValue(inf_v, "name", DATA_T_STRING, POD(&name), NULL); + check(clusterGetAttrValue(inf_v, "name", DATA_T_STRING, POD(&name), NULL)); /* Failure ignored. */ mssErrorf(1, "Cluster", "Unknown attribute '%s' for cluster object %s (target type: %u, \"%s\").", attr_name, driver_data->NodeData->SourceData->Name, driver_data->TargetType, name @@ -4174,7 +4263,7 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx mssErrorf(1, "Cluster", "[param : \"show\" | \"show_all\" | \"drop_all\"] is required for the cache method." ); - goto err; + goto err; } /** show and show_all. **/ @@ -4256,6 +4345,34 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx ); goto err; } + + if (strcmp(method_name, "stat") == 0) + { + unsigned long long ExpectedOpenCalls = 10666; + unsigned long long ExpectedOpenQueryCalls = 10665; + unsigned long long ExpectedFetchCalls = 3368007; + unsigned long long ExpectedCloseCalls = 3368007; + unsigned long long ExpectedGetTypeCalls = 26664164; + unsigned long long ExpectedGetValCalls = 15021419; + unsigned long long ExpectedGetValCalls_name = 3368008; + unsigned long long ExpectedGetValCalls_key1 = 3357342; + unsigned long long ExpectedGetValCalls_key2 = 1574; + unsigned long long ExpectedGetValCalls_sim = 8283829; + char buf[12]; + printf("Cluster Driver Statistics:\n"); + printf(" Stat Name Value\n"); + printf(" OpenCalls %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.OpenCalls), snprint_llu(buf, sizeof(buf), ExpectedOpenCalls), ClusterStatistics.OpenCalls / ExpectedOpenCalls * 100.0); + printf(" OpenQueryCalls %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.OpenQueryCalls), snprint_llu(buf, sizeof(buf), ExpectedOpenQueryCalls), ClusterStatistics.OpenQueryCalls / ExpectedOpenQueryCalls * 100.0); + printf(" FetchCalls %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), 
ClusterStatistics.FetchCalls), snprint_llu(buf, sizeof(buf), ExpectedFetchCalls), ClusterStatistics.FetchCalls / ExpectedFetchCalls * 100.0); + printf(" CloseCalls %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.CloseCalls), snprint_llu(buf, sizeof(buf), ExpectedCloseCalls), ClusterStatistics.CloseCalls / ExpectedCloseCalls * 100.0); + printf(" GetTypeCalls %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetTypeCalls), snprint_llu(buf, sizeof(buf), ExpectedGetTypeCalls), ClusterStatistics.GetTypeCalls / ExpectedGetTypeCalls * 100.0); + printf(" GetValCalls %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls), snprint_llu(buf, sizeof(buf), ExpectedGetValCalls), ClusterStatistics.GetValCalls / ExpectedGetValCalls * 100.0); + printf(" GetValCalls_name %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_name), snprint_llu(buf, sizeof(buf), ExpectedGetValCalls_name), ClusterStatistics.GetValCalls_name / ExpectedGetValCalls_name * 100.0); + printf(" GetValCalls_key1 %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_key1), snprint_llu(buf, sizeof(buf), ExpectedGetValCalls_key1), ClusterStatistics.GetValCalls_key1 / ExpectedGetValCalls_key1 * 100.0); + printf(" GetValCalls_key2 %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_key2), snprint_llu(buf, sizeof(buf), ExpectedGetValCalls_key2), ClusterStatistics.GetValCalls_key2 / ExpectedGetValCalls_key2 * 100.0); + printf(" GetValCalls_sim %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_sim), snprint_llu(buf, sizeof(buf), ExpectedGetValCalls_sim), ClusterStatistics.GetValCalls_sim / ExpectedGetValCalls_sim * 100.0); + return 0; + } /** Unknown parameter. 
**/ mssErrorf(1, "Cluster", "Unknown command: \"%s\"", method_name); @@ -4343,17 +4460,20 @@ int clusterInitialize(void) if (drv == NULL) goto err; memset(drv, 0, sizeof(ObjDriver)); - /** Initialize globals. **/ + /** Initialize caches. **/ memset(&ClusterDriverCaches, 0, sizeof(ClusterDriverCaches)); if (!check(xhInit(&ClusterDriverCaches.SourceDataCache, 251, 0))) goto err; if (!check(xhInit(&ClusterDriverCaches.ClusterDataCache, 251, 0))) goto err; if (!check(xhInit(&ClusterDriverCaches.SearchDataCache, 251, 0))) goto err; + /** Initialize statistics. **/ + memset(&ClusterStatistics, 0, sizeof(ClusterStatistics)); + /** Setup the structure. **/ if (check_ptr(strcpy(drv->Name, "clu - Clustering Driver")) == NULL) goto err; - if (!check(xaInit(&(drv->RootContentTypes), 1))) goto err; - if (!check_neg(xaAddItem(&(drv->RootContentTypes), "system/cluster"))) goto err; - drv->Capabilities = OBJDRV_C_TRANS | OBJDRV_C_FULLQUERY; /* TODO: Greg, double check these are correct. */ + if (!check(xaInit(&drv->RootContentTypes, 1))) goto err; + if (!check_neg(xaAddItem(&drv->RootContentTypes, "system/cluster"))) goto err; + drv->Capabilities = OBJDRV_C_TRANS | OBJDRV_C_FULLQUERY; /* TODO: Greg, are these correct? Should I add any others? */ /** Setup the function references. 
**/ drv->Open = clusterOpen; @@ -4415,15 +4535,17 @@ int clusterInitialize(void) // snprint_bytes(buf7, sizeof(buf7), sizeof(ClusterQuery)), // snprint_bytes(buf8, sizeof(buf8), sizeof(ClusterDriverCaches)) // ); - -// pVector v = ca_build_vector(""); -// const unsigned int len = ca_sparse_len(v); -// fprintf(stderr, "Vector (x%d): [%d", len, v[0]); -// for (unsigned int i = 1u; i < len; i++) -// { -// fprintf(stderr, ", %d", v[i]); -// } -// fprintf(stderr, "]\n"); +// + // 'st' (7: 13) collides with 'an' (7: 11) +// char* str1 = "This is a very long string of text"; +// char* str2 = "This is a very long string of textttttttttttt"; +// pVector v1 = ca_build_vector(str1); +// pVector v2 = ca_build_vector(str2); +// ca_fprint_vector(stdout, v1); printf("\n"); +// ca_fprint_vector(stdout, v2); printf("\n"); +// fprintf(stderr, "'%s' ?= '%s' -> %g\n", str1, str2, ca_cos_compare(v1, v2)); +// ca_free_vector(v1); +// ca_free_vector(v2); /** Register the driver. **/ if (!check(objRegisterDriver(drv))) goto err; From 4b656a4a407468296395b25ac11433eebdcb928f Mon Sep 17 00:00:00 2001 From: Israel Date: Thu, 13 Nov 2025 16:50:35 -0700 Subject: [PATCH 08/30] Improve exp_functions() to use central schema verification. Re-add Levenshtein to exp_functions. Publish edit_dist() in the cluster library. Fix mistakes in cluster driver function signatures. Fix spelling mistakes. Add detail to an error message in the lexer. Remove unused .cluster files. Clean up cluster-schema.cluster. Clean up other unused junk. 
--- centrallix-doc/Widgets/widgets.xml | 2 +- centrallix-lib/include/clusters.h | 7 + centrallix-lib/include/glyph.h | 6 +- centrallix-lib/include/util.h | 56 +- centrallix-lib/src/clusters.c | 32 +- centrallix-lib/src/mtlexer.c | 6 +- centrallix-lib/src/util.c | 4 +- centrallix-lib/src/xhash.c | 6 +- centrallix-os/cluster-schema.cluster | 111 -- centrallix-os/file.cluster | 67 - centrallix-os/file2.cluster | 42 - centrallix-sysdoc/OSDriver_Authoring.md | 1222 +++++++++++++----- centrallix-sysdoc/string_comparison.md | 101 -- centrallix-sysdoc/string_similarity.md | 63 +- centrallix/expression/exp_double_metaphone.c | 30 +- centrallix/expression/exp_functions.c | 513 +++----- centrallix/osdrivers/objdrv_cluster.c | 167 +-- 17 files changed, 1200 insertions(+), 1235 deletions(-) delete mode 100644 centrallix-os/file.cluster delete mode 100644 centrallix-os/file2.cluster delete mode 100644 centrallix-sysdoc/string_comparison.md diff --git a/centrallix-doc/Widgets/widgets.xml b/centrallix-doc/Widgets/widgets.xml index b6b50afde..f38f178d0 100644 --- a/centrallix-doc/Widgets/widgets.xml +++ b/centrallix-doc/Widgets/widgets.xml @@ -3731,7 +3731,7 @@ myTabControl "widget/tab" The title of the column to be displayed in the header row. - The type of the column: "text", "check", or "image". "text" is a normal column, and displays the textual value of the data element. "check" displays a checkmark if the data is non-zero (integers) or for strings if the value is non-empty and not "N" or "No". "image" displays the image referred to by the pathname contained in the data value. + The type of the column: "text", "check", "image", or "progress". "text" is a normal column, and displays the textual value of the data element. "check" displays a checkmark if the data is non-zero (integers) or for strings if the value is non-empty and not "N" or "No". "image" displays the image referred to by the pathname contained in the data value. 
"progress" displays a progress bar, with additional fields such as bar_color, bar_textcollor, and bar_padding. width of the column. diff --git a/centrallix-lib/include/clusters.h b/centrallix-lib/include/clusters.h index 8338cd5e0..05480e742 100644 --- a/centrallix-lib/include/clusters.h +++ b/centrallix-lib/include/clusters.h @@ -71,11 +71,17 @@ typedef struct nmRegister(sizeof(pCentroid), "pCentroid"); \ nmRegister(pCentroidSize, "Centroid"); \ nmRegister(sizeof(Dup), "Dup") + +/** Edit distance function. **/ +unsigned int edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length); +/** Vector functions. **/ pVector ca_build_vector(const char* str); unsigned int ca_sparse_len(const pVector vector); void ca_print_vector(const pVector vector); void ca_free_vector(pVector sparse_vector); + +/** Kmeans function. **/ int ca_kmeans( pVector* vectors, const unsigned int num_vectors, @@ -98,6 +104,7 @@ double ca_cos_compare(void* v1, void* v2); double ca_lev_compare(void* str1, void* str2); bool ca_eql(pVector v1, pVector v2); +/** Similarity search functions. **/ void* ca_most_similar( void* target, void** data, diff --git a/centrallix-lib/include/glyph.h b/centrallix-lib/include/glyph.h index 5f78eab5d..cfafd3946 100644 --- a/centrallix-lib/include/glyph.h +++ b/centrallix-lib/include/glyph.h @@ -35,8 +35,8 @@ #include -/** Uncomment to use glyphs. **/ -/** TODO: Israel - Comment this out. **/ +/** Uncomment to activate glyphs. **/ +/** Should not be enabled in production code on the master branch. */ // #define ENABLE_GLYPHS #ifdef ENABLE_GLYPHS @@ -50,7 +50,7 @@ *** *** @param name The symbol name of the visualizer. *** @param str The string printed for the visualization. - *** @param interval The number of invokations of glyph() required to print. + *** @param interval The number of invocations of glyph() required to print. *** @param flush Whether to flush on output. 
***/ #define glyph_init(name, str, interval, flush) \ diff --git a/centrallix-lib/include/util.h b/centrallix-lib/include/util.h index 1f286cc26..0f2685039 100644 --- a/centrallix-lib/include/util.h +++ b/centrallix-lib/include/util.h @@ -50,7 +50,10 @@ extern "C" { #ifndef __cplusplus #include -/** TODO: Greg, is the __typeof__ syntax from GCC a portability concern? **/ +/*** TODO: Greg - Can we assume this code will always be compiled with GCC? + *** If not, then the __typeof__, __LINE__, and __FILE__ syntaxes might be a + *** portability concern. + ***/ /*** @brief Returns the smaller of two values. *** @@ -58,7 +61,7 @@ extern "C" { *** @param b The second value. *** @return The smaller of the two values. *** - *** @note This macro uses GCC extensions to enusre type safety. + *** @note This macro uses GCC extensions to ensure type safety. ***/ #define min(a, b) \ ({ \ @@ -73,7 +76,7 @@ extern "C" { *** @param b The second value. *** @return The larger of the two values. *** - *** @note This macro uses GCC extensions to enusre type safety. + *** @note This macro uses GCC extensions to ensure type safety. ***/ #define max(a, b) \ ({ \ @@ -151,53 +154,6 @@ void print_diagnostics(int code, const char* function_name, const char* file_nam _r; \ }) -/** Pattern for printing a binary int using printf(). **/ -#define INT_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c" - -/*** Converts an int to the values that should be passed to printf() for the - *** INT_TO_BINARY_PATTERN pattern. - *** - *** @attention - Double evaluation is NOT HANDLED so int_val will be evaluted - *** 32 times when this macro is used. Ensure that evaluation of the value - *** passed for int_val does not have important side effects! - *** - *** @param int_val The int to be printed. - *** @returns Values for printf(). - ***/ -#define INT_TO_BINARY(int_val) \ - ((int_val) & 0b10000000000000000000000000000000 ? 
'1' : '0'), \ - ((int_val) & 0b01000000000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00100000000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00010000000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00001000000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000100000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000010000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000001000000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000100000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000010000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000001000000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000100000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000010000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000001000000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000100000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000010000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000001000000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000100000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000010000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000001000000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000100000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000010000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000001000000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000100000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000010000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000001000000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000000100000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000000010000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000000001000 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000000000100 ? '1' : '0'), \ - ((int_val) & 0b00000000000000000000000000000010 ? 
'1' : '0'), \ - ((int_val) & 0b00000000000000000000000000000001 ? '1' : '0') - #endif /* __cplusplus */ #endif /* UTILITY_H */ diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index d61a558c7..4a96b6ca1 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -104,7 +104,7 @@ static int charpair_cmp(const void *p1, const void *p2) *** input to get_char_pair_hash(). *** *** After hashing each character pair, we add some number from 1 to 13 to the - *** coresponding dimention. However, for most names, this results in a lot of + *** coresponding dimension. However, for most names, this results in a lot of *** zeros and a FEW positive numbers. Thus, after creating the dense vector, *** we convert it to a sparse vector in which a negative number replaces a run *** of that many zeros. Consider the following example: @@ -114,7 +114,7 @@ static int charpair_cmp(const void *p1, const void *p2) *** Sparse pVector: `[1,-3,3,-1]` *** *** Using these sparse vectors greatly reduces the required memory and gives - *** aproximately an x5 boost to performance when traversing vectors, at the + *** approximately an x5 boost to performance when traversing vectors, at the *** cost of more algorithmically complex code. *** *** @param str The string to be divided into pairs and hashed to make the vector. @@ -162,7 +162,7 @@ pVector ca_build_vector(const char* str) /** Sort char_pairs by hash value. **/ qsort(char_pairs, num_pairs, sizeof(CharPair), charpair_cmp); - /** Allocate space for the sparce vector. **/ + /** Allocate space for the sparse vector. **/ pVector sparse_vector = (pVector)check_ptr(nmSysMalloc((num_pairs * 2u + 1u) * sizeof(int))); if (sparse_vector == NULL) return NULL; @@ -403,7 +403,7 @@ static void parse_vector_token(const int token, unsigned int* remaining, unsigne } } -/*** Calculate the similarity on sparcely allocated vectors. Comparing +/*** Calculate the similarity on sparsely allocated vectors. 
Comparing *** any string to an empty string should always return 0.5 (untested). *** *** @param v1 Sparse vector #1. @@ -442,7 +442,7 @@ return (double)dot_product / (magnitude_sparse(v1) * magnitude_sparse(v2)); } -/*** Calculate the difference on sparcely allocated vectors. Comparing +/*** Calculate the difference on sparsely allocated vectors. Comparing *** any string to an empty string should always return 0.5 (untested). *** *** @param v1 Sparse vector #1. @@ -511,7 +511,7 @@ *** @skip *** LINK ../../centrallix-sysdoc/string_comparison.md#levenshtein ***/ -static unsigned int edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length) +unsigned int edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length) { /*** lev_matrix: *** For all i and j, d[i][j] will hold the Levenshtein distance between @@ -554,7 +554,7 @@ if (str1[i - 1] == str2[j - 1]) lev_matrix[i][j] = lev_matrix[i - 1][j - 1]; - /*** We need to make a change, so use the opereration with the + /*** We need to make a change, so use the operation with the *** lowest cost out of delete, insert, replace, or swap. ***/ else @@ -587,7 +587,7 @@ return result; } -/*** Compares two strings using their cosie simiarity, returning a value +/*** Compares two strings using their cosine similarity, returning a value *** between `0.0` (completely different) and `1.0` (identical). If either *** OR BOTH strings are NULL, this function returns `0.0`. 
*** @@ -618,7 +618,7 @@ double ca_cos_compare(void* v1, void* v2) return sparse_similarity(vec1, vec2); } -/*** Compares two strings using their levenstien edit distance to compute a +/*** Compares two strings using their Levenshtein edit distance to compute a *** similarity between `0.0` (completely different) and `1.0` (identical). *** If both strings are empty, this function returns `1.0` (identical). If *** either OR BOTH strings are NULL, this function returns `0.0`. @@ -722,7 +722,7 @@ static double get_cluster_size( *** a size of `n`. *** *** The following table shows data sizes vs.selected cluster size. In testing, - *** these numbers tended to givea good balance of accuracy and dulocates detected. + *** these numbers tended to give a good balance of accuracy and duplicates detected. *** *** ```csv *** Data Size, Actual @@ -771,7 +771,7 @@ unsigned int compute_k(const unsigned int n) *** clusters have a size of negative infinity. In this implementation, *** the bug is mitigated by setting a small number of max iterations, *** such as 16 instead of 100. - *** @attention - Issue: Clusters do not apear to improve much after the first + *** @attention - Issue: Clusters do not appear to improve much after the first *** iteration, which puts the efficacy of the algorithm into question. This *** may be due to the uneven density of a typical dataset. However, the *** clusters still offer useful information. @@ -962,7 +962,7 @@ int ca_kmeans( *** @param similarity A function which takes two data items of the type *** of the data param and returns their similarity. *** @param threshold The minimum similarity threshold. If the most similar - *** data does not meet this threshold, the funciton returns NULL. + *** data does not meet this threshold, the function returns NULL. *** @returns A pointer to the most similar piece of data found in the data *** array, or NULL if the most similar data did not meet the threshold. 
***/ @@ -988,10 +988,10 @@ void* ca_most_similar( } -/*** Runs a sliding search over the povided data, comparing each element to +/*** Runs a sliding search over the provided data, comparing each element to *** the following `window_size` elements, invoking the passed comparison *** function just under `window_size * num_data` times. If any comparison - *** yeilds a similarity greater than the threshold, it is stored in the + *** yields a similarity greater than the threshold, it is stored in the *** xArray returned by this function. *** *** @param data The data to be searched. @@ -1076,9 +1076,9 @@ pXArray ca_sliding_search( return NULL; } -/*** Runs a complete search over the povided data, comparing each element to +/*** Runs a complete search over the provided data, comparing each element to *** each other element, invoking the passed comparison function `num_data^2` - *** times. If any comparison yeilds a similarity greater than the threshold, + *** times. If any comparison yields a similarity greater than the threshold, *** it is stored in the xArray returned by this function. *** *** @param data The data to be searched. diff --git a/centrallix-lib/src/mtlexer.c b/centrallix-lib/src/mtlexer.c index e92ea49ff..39a69cc15 100644 --- a/centrallix-lib/src/mtlexer.c +++ b/centrallix-lib/src/mtlexer.c @@ -7,6 +7,7 @@ #include #include #include + #include "newmalloc.h" #include "mtask.h" #include "mtlexer.h" @@ -907,7 +908,9 @@ mlxNextToken(pLxSession this) } else { - mssError(1,"MLX","Unexpected character encountered"); + char buf[4]; + snprintf(buf, sizeof(buf), "%c", ch); // mssError() does not support %c. 
+ mssError(1, "MLX", "Unexpected character encountered: '%s'", buf); this->TokType = MLX_TOK_ERROR; break; } @@ -1305,4 +1308,3 @@ mlxSetOffset(pLxSession this, unsigned long new_offset) return 0; } - diff --git a/centrallix-lib/src/util.c b/centrallix-lib/src/util.c index b18361280..f60349a74 100644 --- a/centrallix-lib/src/util.c +++ b/centrallix-lib/src/util.c @@ -85,7 +85,7 @@ unsigned int strtoui(const char *nptr, char **endptr, int base){ } /*** Detects the optimal number of threads to use on this system. - *** Note: Multithreading is not currently supported, so this funciton + *** Note: Multithreading is not currently supported, so this function *** will always return 1, for now. *** *** @returns The number of threads that should be used on this system. @@ -120,7 +120,7 @@ static char* units_metric[nUnits] = {"bytes", "KB", "MB", "GB"}; *** *** @param buf The buffer to which new text will be written, using snprintf(). *** @param buf_size The amount of space in the buffer, passed to snprintf(). - *** It is recomended to have at least 12 characters available. + *** It is recommended to have at least 12 characters available. *** @param bytes The number of bytes, which will be formatted and written *** to the buffer.. *** @returns buf, for chaining. diff --git a/centrallix-lib/src/xhash.c b/centrallix-lib/src/xhash.c index 32a4a35eb..46ef3a6fb 100644 --- a/centrallix-lib/src/xhash.c +++ b/centrallix-lib/src/xhash.c @@ -295,11 +295,11 @@ xhClear(pXHashTable this, int (*free_fn)(), void* free_arg) *** @param this The affected hash table. *** @param callback_fn A callback function to be called on each hash table *** entry. It takes 2 parameters: the current hash table entry and a void* - *** argument specified using each_arg. If any invokation of the callback + *** argument specified using each_arg. If any invocation of the callback *** function returns a value other than 0, xhForEach() will immediately *** fail, returning that value as the error code. 
- *** @param each_arg An aditional argument which will be passed to each - *** invokation of the callback function. + *** @param each_arg An additional argument which will be passed to each + *** invocation of the callback function. *** @returns 0 if the function executes successfully. *** 1 if the callback function is NULL. *** n (where n != 0) if the callback function returns n. diff --git a/centrallix-os/cluster-schema.cluster b/centrallix-os/cluster-schema.cluster index 277e2bb12..4113a339a 100644 --- a/centrallix-os/cluster-schema.cluster +++ b/centrallix-os/cluster-schema.cluster @@ -59,114 +59,3 @@ file_name "system/cluster" - /key2 : string // The key of the second data point. - /sim : double && 0.0 < x <= threshold // The similarity of the two data points. ... - -// Other notes - -// This means centrallix scripts will have to chose when to switch -// from complete search to clustered search. I think this is a good -// thing, because that feels like a higher-level responsibility. - -// Invoke file: -// select * from /file.cluster - -// Driver-authoring.md -// Comprehend stparse.c (lib vs. centrallix?) -// Design what a .cluster file looks like. -// -// Figure out how to invoke the object system. 
- -// Random queries - -// Names -SELECT CONCAT(p_given_name, ' ', p_surname) AS full_name, - COUNT(*) AS num_dups -FROM p_partner -WHERE p_given_name is not null -AND p_surname is not null -AND p_given_name != "" -AND p_surname != "" -AND p_given_name != " " -AND p_surname != " " -GROUP BY full_name -ORDER BY num_dups DESC -LIMIT 1; -// Result: Ine Bradley with 4 dups - -// Phone Numbers -SELECT CONCAT(ci.p_phone_country, ci.p_phone_area_city, ci.p_contact_data) AS phone_number, - COUNT(*) AS num_dups -FROM p_partner AS p -JOIN p_contact_info AS ci - ON p.p_partner_key = ci.p_partner_key -WHERE ci.p_contact_data != ' ' -AND ci.p_contact_data != '' -AND (ci.p_contact_type = 'P' OR ci.p_contact_type = 'C') -GROUP BY phone_number -ORDER BY num_dups DESC -LIMIT 1; -// Result: 1813762-2274 with 2 dups - -// Emails and Addresses -SELECT CONCAT(ci.p_contact_data, ' ', - l.p_in_care_of, ' ', - l.p_address_1, ' ', - l.p_address_2, ' ', - l.p_address_3, ' ', - l.p_city, ' ', - l.p_state_province, ' ', - l.p_country_code, ' ', - l.p_postal_code) AS email_and_address, - COUNT(*) AS duplicate_count -FROM p_partner AS p -JOIN p_contact_info AS ci - ON p.p_partner_key = ci.p_partner_key -JOIN p_location AS l - ON p.p_partner_key = l.p_partner_key -WHERE ci.p_contact_type = 'E' -GROUP BY email_and_address -ORDER BY duplicate_count DESC -LIMIT 1; -// Result: richard.aypofblcsg@iipr.yeen with 2 dups - -// Email -SELECT ci.p_contact_data AS email, - COUNT(*) AS duplicate_count -FROM p_partner AS p -JOIN p_contact_info AS ci - ON p.p_partner_key = ci.p_partner_key -WHERE ci.p_contact_type = 'E' -GROUP BY email -ORDER BY duplicate_count DESC -LIMIT 1; - -// Result: uoehtbtjvqh20@ltirs.zese with 2 dups - -// Address -SELECT CONCAT(l.p_in_care_of, ' ', - l.p_address_1, ' ', - l.p_address_2, ' ', - l.p_address_3, ' ', - l.p_city, ' ', - l.p_state_province, ' ', - l.p_country_code, ' ', - l.p_postal_code) AS address, - COUNT(*) AS duplicate_count -FROM p_partner AS p -JOIN p_location AS l 
- ON p.p_partner_key = l.p_partner_key -WHERE l.p_address_1 != ' ' -GROUP BY address -ORDER BY duplicate_count DESC -LIMIT 1; -// Result: "742 1ben Sc E Adams FL US 49152" with 4 - - -// Output to dataset -INTO OUTFILE '/var/lib/mysql/db_output.csv' -LINES TERMINATED BY '|' - -// Output to CSV -INTO OUTFILE '/var/lib/mysql/db_output.csv' -FIELDS TERMINATED BY ',' -ENCLOSED BY '"' -LINES TERMINATED BY '\n'; diff --git a/centrallix-os/file.cluster b/centrallix-os/file.cluster deleted file mode 100644 index 95eacfee0..000000000 --- a/centrallix-os/file.cluster +++ /dev/null @@ -1,67 +0,0 @@ -$Version=2$ -file_name "system/cluster" - { - // Developer can specify parameters to improve file reuseability. - // TIP: Improve performance by declairing frequently used parameters first. - k "cluster/parameter" { type = integer; style=notnull; } - str "cluster/parameter" { type = string; } - int "cluster/parameter" { type = integer; default = runserver(:parameters:k); } - dbl "cluster/parameter" { type = double; default=4.2; } - // conversion "cluster/parameter" { type=double; default=4; } - - null_str "cluster/parameter" { type = string; default = null; } - null_int "cluster/parameter" { type = integer; default = null; } - null_dbl "cluster/parameter" { type = double; default = null; } - - // We calculate k in a centrallix script using: - // k = max(2, pow(log(n) / log(36), 3.2) - 8) - // where n is the number of records passed. - - // Specify the data source at the top of the file. - // How do we pass distinct data? Should the driver - // handle that for us? - source = "/apps/kardia/data/Kardia_DB/p_partner/rows"; - attr_name = p_given_name; // runserver(:parameters:str) - - // Multiple data sources when? - - // Clustering object specifies properties for clustering. 
- kmeans_cluster "cluster/cluster" - { - algorithm = "k-means"; - similarity_measure = "cosine"; - // window_size = 16; - num_clusters = runserver(:parameters:k); - min_improvement = 0.0001; - max_iterations = 48; - - // Create subclusters. (Not implemented) - sub_cluster "cluster/cluster" - { - algorithm = "none"; - similarity_measure = "cosine"; - num_clusters = 7; - min_improvement = "max"; - } - } - - // Complete search. - no_clustering "cluster/cluster" - { - algorithm = "none"; - } - - dups "cluster/search" - { - source = kmeans_cluster; - similarity_measure = "cosine"; - threshold = 0.75; - } - - dups2 "cluster/search" - { - source = no_clustering; - similarity_measure = "cosine"; - threshold = 0.75; - } - } diff --git a/centrallix-os/file2.cluster b/centrallix-os/file2.cluster deleted file mode 100644 index a55c37f85..000000000 --- a/centrallix-os/file2.cluster +++ /dev/null @@ -1,42 +0,0 @@ -$Version=2$ -file_name "system/cluster" - { - // Developer can specify parameters to improve file reuseability. - // TIP: Improve performance by declairing frequently used parameters first. - k "cluster/parameter" { type = integer; style=notnull; } - str "cluster/parameter" { type = string; default="k-means"; } - int "cluster/parameter" { type=integer; default=:parameters:k; } - dbl "cluster/parameter" { type=double; default=4.2; } - // conversion "cluster/parameter" { type=double; default=4; } - - null_str "cluster/parameter" { type = string; default = null; } - null_int "cluster/parameter" { type = integer; default = null; } - null_dbl "cluster/parameter" { type = double; default = null; } - - // We calculate k in a centrallix script using: - // k = max(2, pow(log(n) / log(36), 3.2) - 8) - // where n is the number of records passed. - - // Specify the data source at the top of the file. - // How do we pass distinct data? Should the driver - // handle that for us? 
- source = "/apps/kardia/data/Kardia_DB/p_partner/rows"; - attr_name = "p_given_name"; - - // Clustering object specifies properties for clustering. - kmeans_cluster "cluster/cluster" - { - algorithm = "k-means"; - similarity_measure = "cosine"; - num_clusters = :parameters:k; - min_improvement = 0.0001; - max_iterations = 48; - } - - dups "cluster/search" - { - source = kmeans_cluster; - threshold = 0.75; - similarity_measure = "cosine"; - } - } diff --git a/centrallix-sysdoc/OSDriver_Authoring.md b/centrallix-sysdoc/OSDriver_Authoring.md index 5755d15c5..d00c192f6 100644 --- a/centrallix-sysdoc/OSDriver_Authoring.md +++ b/centrallix-sysdoc/OSDriver_Authoring.md @@ -1,52 +1,76 @@ # ObjectSystem Driver Interface -Author: Greg Beeley -Date: January 13, 1999 +**Author**: Greg Beeley -Updated: March 9, 2011 +**Date**: January 13, 1999 -License: Copyright (C) 2001-2011 LightSys Technology Services. See LICENSE.txt for more information. +**Updated**: November 27, 2025 + +**License**: Copyright (C) 2001-2011 LightSys Technology Services. See LICENSE.txt for more information. ## Table of Contents - [ObjectSystem Driver Interface](#objectsystem-driver-interface) - [Table of Contents](#table-of-contents) - [I Introduction](#i-introduction) - [II Interface](#ii-interface) - - [A. Initialization](#a--initialization) - - [B. Opening And Closing Objects](#b--opening-and-closing-objects) - - [C. Creating and Deleting Objects.](#c--creating-and-deleting-objects) - - [D. Reading and Writing Object Content.](#d--reading-and-writing-object-content) - - [E. Querying for Child Objects.](#e--querying-for-child-objects) - - [F. Managing Object Attributes](#f--managing-object-attributes) - - [G. 
Managing Object Methods](#g--managing-object-methods) + - [Function: Open](#function-open) + - [Function: OpenChild()](#function-openchild) + - [Function: Close()](#function-close) + - [Function: Create()](#function-create) + - [Function: Delete()](#function-delete) + - [Function: DeleteObj()](#function-deleteobj) + - [Function: Read()](#function-read) + - [Function: Write()](#function-write) + - [Function: OpenQuery()](#function-openquery) + - [Function: QueryDelete()](#function-querydelete) + - [Function: QueryFetch()](#function-queryfetch) + - [Function: QueryCreate()](#function-querycreate) + - [Function: QueryClose()](#function-queryclose) + - [Function: GetAttrType()](#function-getattrtype) + - [Function: GetAttrValue()](#function-getattrvalue) + - [Function: GetFirstAttr()](#function-getfirstattr--getnextattr) + - [Function: GetNextAttr()](#function-getfirstattr--getnextattr) + - [Function: SetAttrValue()](#function-setattrvalue) + - [Function: AddAttr()](#function-addattr) + - [Function: OpenAttr()](#function-openattr) + - [Function: GetFirstMethod()](#function-getfirstmethod--getnextmethod) + - [Function: GetNextMethod()](#function-getfirstmethod--getnextmethod) + - [Function: ExecuteMethod()](#function-executemethod) + - [Function: PresentationHints()](#function-presentationhints) + - [Function: Info()](#function-info) + - [Function: Commit()](#function-commit) + - [Function: GetQueryCoverageMask()](#function-getquerycoveragemask) + - [Function: GetQueryIdentityPath()](#function-getqueryidentitypath) - [III Reading the Node Object](#iii-reading-the-node-object) - - [pSnNode snReadNode(pObject obj)](#psnnode-snreadnodepobject-obj) - - [pSnNode snNewNode(pObject obj, char* content_type)](#psnnode-snnewnodepobject-obj-char-content_type) - - [int snWriteNode(pSnNode node)](#int-snwritenodepsnnode-node) - - [int snDeleteNode(pSnNode node)](#int-sndeletenodepsnnode-node) - - [int snGetSerial(pSnNode node)](#int-sngetserialpsnnode-node) - - [pStructInf 
stParseMsg(pFile inp_fd, int flags)](#pstructinf-stparsemsgpfile-inp_fd-int-flags) - - [pStructInf stParseMsgGeneric(void* src, int (*read_fn)(), int flags)](#pstructinf-stparsemsggenericvoid-src-int-read_fn-int-flags) - - [int stGenerateMsg(pFile out_fd, pStructInf info, int flags)](#int-stgeneratemsgpfile-out_fd-pstructinf-info-int-flags) - - [int stGenerateMsgGeneric(void* dst, int (*write_fn)(), pStructInf info, int flags)](#int-stgeneratemsggenericvoid-dst-int-write_fn-pstructinf-info-int-flags) - - [pStructInf stCreateStruct(char* name, char* type)](#pstructinf-stcreatestructchar-name-char-type) - - [pStructInf stAddAttr(pStructInf inf, char* name)](#pstructinf-staddattrpstructinf-inf-char-name) - - [pStructInf stAddGroup(pStructInf inf, char* name, char* type)](#pstructinf-staddgrouppstructinf-inf-char-name-char-type) - - [int stAddValue(pStructInf inf, char* strval, int intval)](#int-staddvaluepstructinf-inf-char-strval-int-intval) - - [pStructInf stLookup(pStructInf inf, char* name)](#pstructinf-stlookuppstructinf-inf-char-name) - - [int stAttrValue(pStructInf inf, int* intval, char** strval, int nval)](#int-stattrvaluepstructinf-inf-int-intval-char-strval-int-nval) - - [int stFreeInf(pStructInf this)](#int-stfreeinfpstructinf-this) + - [Module: st_node](#module-st_node) + - [st_node: snReadNode()](#st_node-snreadnode) + - [st_node: snNewNode()](#st_node-snnewnode) + - [st_node: snWriteNode()](#st_node-snwritenode) + - [st_node: snDeleteNode()](#st_node-sndeletenode) + - [st_node: snGetSerial()](#st_node-sngetserial) + - [st_node: snGetLastModification()](#st_node-sngetlastmodification) + - [Module: stparse](#module-stparse) + - [stparse: stStructType()](#stparse-ststructtype) + - [stparse: stLookup()](#stparse-stlookup) + - [stparse: stAttrValue()](#stparse-stattrvalue) + - [stparse: stGetExpression()](#stparse-stgetexpression) + - [stparse: stCreateStruct()](#stparse-stcreatestruct) + - [stparse: stAddAttr()](#stparse-staddattr) + - [stparse: 
stAddGroup()](#stparse-staddgroup) + - [stparse: stAddValue()](#stparse-staddvalue) + - [stparse: stFreeInf()](#stparse-stfreeinf) + - [stparse: Using Fields Directly](#stparse-using-fields-directly) - [IV Memory Management in Centrallix](#iv-memory-management-in-centrallix) - - [void* nmMalloc(int size)](#void-nmmallocint-size) - - [void nmFree(void* ptr, int size)](#void-nmfreevoid-ptr-int-size) - - [void nmStats()](#void-nmstats) - - [void nmRegister(int size, char* name)](#void-nmregisterint-size-char-name) - - [void nmDebug()](#void-nmdebug) - - [void nmDeltas()](#void-nmdeltas) - - [void* nmSysMalloc(int size)](#void-nmsysmallocint-size) - - [void nmSysFree(void* ptr)](#void-nmsysfreevoid-ptr) - - [void* nmSysRealloc(void* ptr, int newsize)](#void-nmsysreallocvoid-ptr-int-newsize) - - [char* nmSysStrdup(const char* str)](#char-nmsysstrdupconst-char-str) + - [nmMalloc()](#nmmalloc) + - [nmFree()](#nmfree) + - [nmStats()](#nmstats) + - [nmRegister()](#nmregister) + - [nmDebug()](#nmdebug) + - [nmDeltas()](#nmdeltas) + - [nmSysMalloc()](#nmsysmalloc) + - [nmSysRealloc()](#nmsysrealloc) + - [nmSysStrdup()](#nmsysstrdup) + - [nmSysFree()](#nmsysfree) - [V Other Utility Modules](#v-other-utility-modules) - [A. XArray (XA) - Arrays](#axarray-xa---arrays) - [xaInit(pXArray this, int init_size)](#xainitpxarray-this-int-init_size) @@ -115,520 +139,1028 @@ License: Copyright (C) 2001-2011 LightSys Technology Services. See LICENSE.txt - [B. Object attribute enumeration, getting, and setting.](#bobject-attribute-enumeration-getting-and-setting) - [C. Object querying (for subobjects)](#cobject-querying-for-subobjects) + + ## I Introduction -An objectsystem driver's purpose is to provide access to a particular type of local or network data/resource, and to organize that data in a tree- structured heirarchy that can be integrated into the Centrallix's ObjectSystem. 
This tree structure will vary based on the data being presented, but will fit the basic ObjectSystem model of a heirarchy of objects, each having attributes, perhaps some methods, and possibly content. +An objectsystem driver's purpose is to provide access to a particular type of local or network data/resource. Specific information about the resource to be accessed (such as credentials for a database, queries for selecting data, the auth token for an API, etc.) is stored in a file that is opened by the relevant driver. For example, the query driver (defined in `objdrv_query.c`) opens `.qy` files, which store one or more ObjectSQL queries used to fetch data. -Each objectsystem driver will implement this subtree structure rooted at what is called the "node" object. The node has a specifically recognizable object type which the ObjectSystem Management Layer uses to determine which OS Driver to pass control to. Normally, the 'node' object is a UNIX file either with a particular extension registered with the OSML, or a UNIX file residing in a directory containing a '.type' file, which contains the explicit object type for all objects in that directory without recognizable extensions. +When the object system starts up, each driver registers one or more type names that it supports (e.g. `"system/query"` for the query driver). When a file is opened, the object system uses the file's type name to select which driver to use. It finds this type name with one of two strategies. If the file has an extension (e.g. `example.qy`), that extension can be mapped to a type name using `types.cfg` (e.g. `.qy` maps to `"system/query"`). Alternatively, the file may reside in a directory containing a `.type` file which explicitly specifies the type name for all files in that directory without recognizable extensions. -Normally, objectsystem drivers will be able to manage any number of 'node' objects and the subtrees rooted at them. 
Each 'node' object will normally relate to a particular instance of a network resource, or in some cases, a group of resources that are easily enumerated. For example, a POP3 server would be a network resource that an OS driver could be written for. If the network had multiple POP3 servers, then that one OS driver would be able to access each of them using different node objects. However, if somehow the OS driver were able to easily enumerate the various POP3 servers on the network (i.e., they responded to some kind of hypothetical broadcast query), then the OS driver author could optionally design the driver to list the POP3 servers under a single node for the whole network. +Once a file is opened, the driver should organize provided data into a tree-structured hierarchy, which becomes part of the path used by Centrallix's ObjectSystem. For example, when opening `example.qy` in the ObjectSystem, the driver makes `/rows` and `/columns` available, allowing for paths such as `/apps/data/example.qy/rows`. The root of a driver's tree (`example.qy`) is called the driver's "node" object, and most paths traverse the root nodes of multiple drivers. A driver author is free to define any manner of tree structures for representing data available within their driver. However, the structure should fit the basic ObjectSystem model of a hierarchy of objects, each having attributes, and optionally some methods and/or content. -The structure of the subtree beneath the node object is entirely up to the drivers' author to determine; the OSML does not impose any structural restrictions on such subtrees. +A driver can be opened multiple times, leading one driver to have multiple "node" objects, also called instances. Typically, each "node" object relates to a particular instance of a network resource. For example, an instance of a POP3 driver might represent a POP3 server on the network. 
If the network had multiple POP3 servers, this driver could be used to access each of them through different node objects (e.g. `dev.pop3`, `prod.pop3`, etc.). However, if somehow the OS driver were able to easily enumerate the various POP3 servers on the network (i.e., they responded to some kind of hypothetical broadcast query), then the OS driver author could also design the driver to list the POP3 servers under a single node for the whole network. -Here is one example of an OS Driver's node object and subtree (this is for the Sybase OS Driver, objdrv_sybase.c): +The structure of the subtree beneath the node object is entirely up to the drivers' author to determine; the OSML does not impose any structural restrictions on such subtrees. Each object within this structure (e.g. `/example.qy`) can have three types of readable data: +- Child objects (e.g. `/rows`) which can have their own data. +- Content, which can be read similarly to reading a file. +- Query data, allowing the object to be queried for information. -``` -OMSS_DB (type = application/sybase) +Thus, parent objects with child objects behave similarly to a directory, although they can still have separate readable data _and_ queryable data. This may seem foreign in the standard file system paradigm; however, it is common for web servers, where opening a directory often returns the `index.html` file in that directory, or some other form of information to allow further navigation. Querying an object was originally intended as a way to quickly traverse its child objects, although queries are not required to be implemented this way. 
+ +Below is an example of the Sybase driver's node object and its subtrees of child objects (defined in `objdrv_sybase.c`): + +```sh +OMSS_DB (type = "application/sybase") | - +--- JNetHelp (type = system/table) - | | - | +--- columns (type = system/table-columns) - | | | - | | +--- document_id (type = system/column) - | | | - | | +--- parent_id (type = system/column) - | | | - | | +--- title (type = system/column) - | | | - | | +--- content (type = system/column) - | | - | +--- rows (type = system/table-rows) - | | - | +--- 1 (type = system/row) - | | - | +--- 2 (type = system/row) + +----- JNetHelp (type = "system/table") + | | + | +----- columns (type = "system/table-columns") + | | | + | | +----- document_id (type = "system/column") + | | | + | | +----- parent_id (type = "system/column") + | | | + | | +----- title (type = "system/column") + | | | + | | +----- content (type = "system/column") + | | + | +----- rows (type = "system/table-rows") + | | + | +----- 1 (type = "system/row") + | | + | +----- 2 (type = "system/row") | - +--- Partner (type = system/table) + +----- Partner (type = "system/table") ``` (... and so forth) -In this case the node object would contain the information necessary to access the database, such as server name, database name, max connections to pool, and so forth. More about the node object and managing its parameters will be discussed later in this document. +In this case, the `OMSS_DB` file becomes the driver's node object. This file would contain the information necessary to access the database, such as server name, database name, max connections to pool, and so forth. -OS Drivers support several primary areas of functionality: opening and closing objects, reading and writing object content (if the object has content), setting and viewing object attributes, executing object methods, and querying an object's child objects based on name and/or attribute values. 
Drivers will also support the creation and deletion of objects and/or a set of child objects. +OS Drivers support several primary areas of functionality: +- Opening and closing objects. +- Creating and deleting node objects (optional). +- Reading and writing object content (optional). +- Getting and (optionally) setting object attributes. +- Executing object methods (optional). +- Querying data attributes (optional). -## II Interface -This section describes the standard interface between the OSML and the ObjectSystem driver itself. +Using the example above, we can query from the database using a statement like `select :title from /OMSS_DB/JNetHelp/rows`, which will open a sybase driver instance, then open a query and repeatedly fetch rows, getting the `title` attribute from each row. -### A. Initialization -Each OS Driver will have an initialization function, normally named xxxInitialize() where 'xxx' is the driver's abbreviative prefix. This prefix should be attached to each and every function within the OS driver for consistency and project management. Normally 'xxx' is two to four characters, all lowercase. This initialization function is called when the Centrallix starts up, and at least at the present time, this initial call to the OS driver must be manually added to the appropriate startup code, currently found in 'centrallix.c'. - -Within the initialization function, the driver should initialize all necessary global variables and register itself with the OSML. Global variables should all be placed inside a single global 'struct', which is normally named similarly to the driver's prefix, except normally in all uppercase. Under no circumstances should global variables be accessed outside of the module, except via the module's functions. - -To register with the OSML, the driver must first allocate an ObjDriver structure and fill in its contents. +## II Interface +This section describes the standard interface between the OSML and the ObjectSystem driver itself. 
Every driver should implement certain required functions. (**Note**: Many drivers "implement" some required functions to simply fail with a not implemented or not supported error. For example, most database drivers "implement" `Read()` and `Write()` this way because database content should be queried, not read). Various optional functions are also available, which a driver is not required to implement. + + +The driver should implement an `Initialize()` function, as well as the following (* indicates required functions): +| Function Name | Description +| --------------------------------------------------------- | ------------ +| [Open](#function-open)* | Opens a new driver instance object on a given root node. +| [OpenChild](#function-openchild) | ??? +| [Close](#function-close)* | Close an open object created by either `Open()` or `QueryFetch()`. +| [Create](#function-create) | Create a new driver root node object. +| [Delete](#function-delete) | Delete an existing driver root node object. +| [DeleteObj](#function-deleteobj)* | ??? +| [OpenQuery](#function-openquery)** | Start a new query for child objects of a given object. +| [QueryDelete](#function-querydelete) | Delete specific objects from a query's result set. +| [QueryFetch](#function-queryfetch)** | Open the next child object in the query's result set. +| [QueryCreate](#function-querycreate) | ??? +| [QueryClose](#function-queryclose)** | Close an open query. +| [Read](#function-read)* | Read content from the object. +| [Write](#function-write)* | Write content to the object. +| [GetAttrType](#function-getattrtype)* | Get the type of a given object's attribute. +| [GetAttrValue](#function-getattrvalue)* | Get the value of a given object's attribute. +| [GetFirstAttr](#function-getfirstattr--getnextattr)* | Get the name of the object's first attribute. +| [GetNextAttr](#function-getfirstattr--getnextattr)* | Get the name of the object's next attribute. 
+| [SetAttrValue](#function-setattrvalue) | Set the value of an object's attribute. +| [AddAttr](#function-addattr) | Add a new attribute to an object. +| [OpenAttr](#function-openattr) | Open an attribute as if it were an object with content. +| [GetFirstMethod](#function-getfirstmethod--getnextmethod) | Get the name of an object's first method. +| [GetNextMethod](#function-getfirstmethod--getnextmethod) | Get the name of an object's next method. +| [ExecuteMethod](#function-executemethod) | Execute a method with a given name and optional parameter string. +| [PresentationHints](#function-presentationhints) | Get info about an object's attributes. +| [Info](#function-info)* | Get info about an object instance. +| [Commit](#function-commit) | Commit changes made to an object. +| [GetQueryCoverageMask](#function-getquerycoveragemask) | ??? +| [GetQueryIdentityPath](#function-getqueryidentitypath) | ??? + +_*Function is always required._ + +_**Function is required to support queries._ + + +--- +### Abbreviative Prefix +Each OS Driver will have an abbreviation prefix, such as `qy` for the query driver or `sydb` for the sybase database driver. This prefix should be prepended to the start of every public function name within the OS driver for consistency and scope management (e.g. `qyInitialize()`, `sydbQueryFetch()`, etc.). Normally, a driver's abbreviation prefix is two to four characters, all lowercase and may be the same as a file extension the driver supports. However, this is not an absolute requirement (see the cluster driver in `objdrv_cluster.c` which supports `.cluster` files using an abbreviation prefix of `cluster`). + +This document uses `xxx` to refer to an unspecified abbreviative prefix. + +--- +### Internal Functions +It is highly likely that driver authors will find shared functionality in the following functions, or wish to abstract out functionality from any of them for a variety of reasons. 
When creating additional internal functions in this way, they should be named using the convention of `xxx_internal_FunctionName()`, or possibly `xxxi_FunctionName()` for short. + +--- +### Function: Initialize ```c - pObjDriver drv; - - drv = (pObjDriver)nmMalloc(sizeof(ObjDriver)); +/*** @returns 0 if successful, or + *** -1 if an error occurred. + ***/ +int xxxInitialize(void) ``` +- ⚠️ **Warning**: Currently, the success/failure of this function is ignored by the caller. +- 📖 **Note**: Unlike other functions defined in the driver, each driver author must manually add this call to the start up code, found in the `cxDriverInit()` function in `centrallix.c`. -This involves setting a large number of fields to the appropriate entry points within the OS Driver, as well as telling the OSML what object type(s) are handled by the driver and giving the OSML a description of the driver. A list of the required entry point functions / fields follows: - -| Function/Field | Description -| -------------------- | ------------ -| Open | Function that the OSML calls when the user opens an object managed by this driver. -| Close | Close an open object. -| Create | Create a new object. -| Delete | Delete an existing object. -| OpenQuery | Start a query for child objects. -| QueryDelete | Delete all objects in the query result set. -| QueryFetch | Open the next child object in the query's result set. -| QueryClose | Close an open query. -| Read | Read content from the object. -| Write | Write content to the object. -| GetAttrType | Get the type of an object's attribute. -| GetAttrValue | Get the value of an object's attribute. -| GetFirstAttr | Get the first attribute associated with the object. -| GetNextAttr | Get the next attribute associated with the object. -| SetAttrValue | Set the value of an attribute. -| AddAttr | Add a new attribute to an object. -| OpenAttr | Open an attribute as if it were an object with content. -| GetFirstMethod | Get the first method of the object. 
-| GetNextMethod | Get the next method of an object. -| ExecuteMethod | Execute a method with an optional string parameter. - -The only method that can be set to NULL is the QueryDelete method, in which case the OSML will call QueryFetch() and Delete() in succession. However, if the underlying network resource has the capability of intelligently deleting objects matching the query's criteria, this method should be implemented (as with a database server). - -Another field in the driver structure is the Capabilities field. This field is a bitmask, and can currently contain zero or more of the following options: +The initialization function is called when the Centrallix starts up, and should register the driver with the OSML and initialize necessary global variables. It is recommended to place global variables in a single global 'struct' that is named with the driver's prefix in all uppercase. Global variables should **NOT** be accessed from outside the driver. Instead, the driver should define functions to access them, allowing it to abstract details away from other drivers. -- OBJDRV_C_FULLQUERY: Indicates that this objectsystem driver will intelligently process the query's expression tree specified in the OpenQuery call, and will only return objects that match that expression. If this flag is missing, the OSML will filter objects returned by QueryFetch so that the calling user does not get objects that do not match the query. Typically this is set by database server drivers. +To register itself with the OSML, the driver should first allocate an ObjDriver structure and initialize its contents: - THE ABOVE IS OUT-OF-DATE. From now on, a driver can determine whether to handle the Where and OrderBy on a per-query basis, by setting values in the ObjQuery structure used when opening a new query. This is because a driver may be able to handle Where and OrderBy for some object listings but not for others. 
+```c +pObjDriver drv = (pObjDriver)nmMalloc(sizeof(ObjDriver)); +if (drv == NULL) goto error_handling; +memset(drv, 0, sizeof(ObjDriver)); +... +``` -- OBJDRV_C_TRANS: Indicates that this objectsystem driver requires transaction management by the OSML's transaction layer (the OXT layer). OS drivers that require this normally are those that for some reason cannot complete operations in independence from one another. For example, with a database driver, the creation of a new row object and the setting of its attributes must be done as one operation, although the operation requires several calls from the end user's process. The OXT allows for the grouping of objectsystem calls so that the os driver does not have to complete them independently, but instead can wait until several calls have been made before actually completing the operation. +To initialize this struct, the driver must: +- Provide a name (in `drv->Name`). +- Provide an array of supported root node types (in `drv->RootContentTypes`). +- Provide capability flags (in `drv->Capabilities`). +- Provide function pointers to implemented functions (see [II Interface](#ii-interface) for a list). -The 'Name' field should be filled in with a description of the OS driver, with a maximum length of 63 characters (plus the string null terminator). Normally, the 2-4 letter prefix of the driver is included at the beginning of 'Name', such as "UXD - UNIX filesystem driver". +#### Name +The `name` field is a 64 character buffer (allowing names up to 63 characters, with a null terminator). It usually follows the format of the driver abbreviation prefix (in all uppercase), followed by a dash, followed by a descriptive name for the driver. -Finally, the 'RootContentTypes' field is an XArray containing a list of strings, each of which specifies the node object types that the driver will handle. 
Such types are added to this XArray using the normal XArray utility functions, such as: +For example: ```c - xaInit(&drv->RootContentTypes, 16); - xaAddItem(&drv->RootContentTypes, "system/file"); - xaAddItem(&drv->RootContentTypes, "system/directory"); +strtcpy(drv->Name, "SYBD - Sybase Database Driver", sizeof(drv->Name)); ``` -When the structure has been filled out, the os driver should call the OSML to register itself, using the objRegisterDriver function: +#### RootContentTypes +The `RootContentTypes` field is an XArray containing a list of strings, representing the type names that the driver can open. This should only include types the driver will handle as root nodes, not other objects created by the driver. Thus, the Sybase driver would include `"application/sybase"`, but not `"system/table"`. +For example: ```c - objRegisterDriver(drv); +if (xaInit(&(drv->RootContentTypes), 2) != 0) goto error_handling; +if (xaAddItem(&(drv->RootContentTypes), "application/sybase") < 0) goto error_handling; +if (xaAddItem(&(drv->RootContentTypes), "system/query") < 0) goto error_handling; ``` -The initialization function should return 0 to indicate success, or -1 on failure. Currently, initialization success/failure is not verified by lsmain.c. +- 📖 **Note**: To make a specific file extension (like `.qy`) open in a driver, edit `types.cfg` to map that file extension to an available root content type supported by the driver (such as `"system/query"`). -The driver should NOT nmFree() the allocated driver structure unless the objRegisterDriver() routine fails (returns -1). +#### Capabilities +The capabilities field is a bitmask which can contain zero or more of the following flags: -Note that the RootContentTypes handled by the driver should only include the types of the objects this driver will handle as node objects. For instance, the Sybase database access driver uses "application/sybase" as its top level type.
It won't register such things as "system/table". +- `OBJDRV_C_FULLQUERY`: Indicates that this objectsystem driver will intelligently process the query's expression tree specified in the OpenQuery call, and will only return objects that match that expression. If this flag is missing, the OSML will filter objects returned by QueryFetch so that the calling user does not get objects that do not match the query. Typically this is set by database server drivers. + - > **THE ABOVE IS OUT-OF-DATE** (May 16th, 2022): A driver can now determine whether to handle the Where and OrderBy on a per-query basis, by setting values in the ObjQuery structure used when opening a new query. This allows a driver to handle Where and OrderBy for some object listings but not others. -### B. Opening And Closing Objects -As an overview, the normal procedure for the open routine to follow is this: +- `OBJDRV_C_TRANS`: Indicates that this objectsystem driver requires transaction management by the OSML's transaction layer (the OXT layer). OS drivers that require this normally are those that for some reason cannot complete operations in independence from one another. For example, with a database driver, the creation of a new row object and the setting of its attributes must be done as one operation, although the operation requires several calls from the end user's process. The OXT allows for the grouping of objectsystem calls so that the os driver does not have to complete them independently, but instead can wait until several calls have been made before actually completing the operation. -1. Access the node object, or create it, depending on whether the object already exists as well as the open mode flags indicated by the end-user. -2. Upon successful node object access, determine what additional components of the pathname are to be handled by this driver, and verify that they can be opened, depending on the object's open mode (CREAT, EXCL, etc.) -3.
If it hasn't been already, allocate a structure that will represent this open object and contain information about it and how we're to handle it. It should include a pointer to the node object. -4. Perform any operations inherent in the open process that have not already been performed (such as reading database table information, etc., when a db table's row is being accessed). -5. Return a pointer to the structure allocated in (3) as a void pointer. The OSML will pass this pointer back to the driver on subsequent calls that involve this object. +#### Registering the Driver Struct +When all values within the structure have been initialized, the driver should call the OSML to register itself, using the `objRegisterDriver()` function: -The first basic part of the OS driver consists of the Open and Close routines, normally named 'xxxOpen' and 'xxxClose' within the driver, where 'xxx' is the driver's prefix. The Close routine is normally fairly simple, but the Open routine is one of the most complicated routines in a typical OS driver, for the Open routine must parse the subtree pathname beneath the node object. For example, if the node object had a pathname like: - -```sh - /datasources/OMSS_DB +```c +if (objRegisterDriver(drv) != 0) goto error_handling; ``` -and the user opened an object called: -```sh - /datasources/OMSS_DB/JNetHelp/rows/1 +--- +### Function: Open() +```c +void* xxxOpen(pObject parent, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt); ``` -the OS driver would have to determine what the subtree pathname 'JNetHelp/rows/1' means, since this path will mean different things to different os drivers. - -The Open routine also must determine whether the object already exists or not, and if not, whether to create a new object. 
This logic is largely dependent on the `obj->Mode` flags, as if `O_CREAT` is included, the driver must attempt to create the object if it does not already exist, and if `O_EXCL` is included, the driver must refuse to open the object if it already exists, as with the UNIX `open()` system call semantics. +The `Open()` function opens a given file to create a new driver instance. This procedure normally includes the following steps: -Finally, if the os driver specified a capability of `OBJDRV_C_TRANS`, it must pay attention to the current state of the end-user's transaction. If a new object is being created, an object is being deleted, or other modifications/additions are being performed, and if the OXT layer indicates a transaction is in process, the driver must either complete the current transaction and then complete the current call, or else add the current delete/create/modify call to the transaction tree (in which case the tree item is preallocated; all the driver needs to do is fill it in). The transaction layer will be discussed in depth later in this document. +1. Access or create the node object, depending on specified flags and whether or not it already exists. +2. Parse additional contents of the path after the root node. +3. Allocate a structure that will represent the open object, including a pointer to the node object. +4. Perform other opening operations (such as reading database table information, etc., when a db table's row is being accessed). +5. Return a pointer to the node instance as a void pointer. This pointer will be passed as `void* inf_v` to the driver in subsequent calls involving this object (except the Query functions, discussed below). -As a part of the Open process, the OS driver will normally allocate an internal structure to represent the current open object, and will return that structure as a `void*` data type in the return value. 
This pointer will be then passed to each of the other driver entry point functions, with the exception of QueryFetch, QueryDelete, and Query- Close, which will be discussed later. +- 📖 **Note - Transactions**: If the os driver specified the `OBJDRV_C_TRANS` capability, it must respect the current state of the user's transaction. If a new object is being created, an object is being deleted, or other modifications/additions are being performed, and if the OXT layer indicates a transaction is in process, the driver must either complete the current transaction and then complete the current call, or else add the current delete/create/modify call to the transaction tree (in which case the tree item is preallocated; all the driver needs to do is fill it in). This is handled using the transaction tree parameter (`oxt : pObjTrxTree*`). The transaction layer is discussed in depth in the ??? section. + -The Open() routine is called with five parameters: +#### Accessing the Node Object +If `O_CREAT` and `O_EXCL` are both specified in `parent->Mode`, the driver should **only** create a new file and fail if the file already exists (refusing to open and read it). Otherwise, the driver should read an existing file, or create one if it does not exist and `O_CREAT` is specified, failing if no file can be read or created. -- `obj` (pObject) - This is a pointer to the Object sturcture maintained by the OSML. This structure will contain some important fields for processing the open() request. +#### Parsing Path Contents +The task of parsing the provided path into the subtree beneath its root node is one of the more complex operations for a driver. For example, the path to a driver's root node might be `/datasources/OMSS_DB` and the user opens an object called `/datasources/OMSS_DB/JNetHelp/rows/1`.
In this case, the OS driver must parse the meaning of the subtree path `JNetHelp/rows/1`, storing the data targeted by the user into the driver instance to allow later method calls to access the correct data. - - `obj->Mode` is a bitmask of the O_* flags, which include `O_RDONLY`, `O_WRONLY`, `O_RDWR`, `O_CREAT`, `O_TRUNC`, and `O_EXCL`. +#### Parameters +The `Open()` routine is called with five parameters: - - `obj->Pathname` is a Pathname structure which contains the complete parsed pathname for the object. This structure is defined in the file `include/obj.h`, and has a buffer for the pathname as well as an array of pointers to the pathname's components. The function `obj_internal_PathPart()` can be used to obtain at will any component or series of components of the pathname. +- `obj : pObject`: A pointer to the Object structure maintained by the OSML. This structure includes some useful fields: + + - `obj->Mode : int`: A bitmask of the O_* flags, which include: `O_RDONLY` (read only), `O_WRONLY` (write only), `O_RDWR` (read/write), `O_CREAT` (create), `O_TRUNC` (truncate), and `O_EXCL` (exclusive, see above). + + - `obj->Pathname : pPathname`: A pointer to a Pathname struct (defined in `include/obj.h`) which contains the complete parsed pathname for the object. This provides a buffer for the pathname as well as an array of pointers to the pathname's components. The function `obj_internal_PathPart()` can be used to obtain at will any component or series of components of the pathname. - - `obj->Pathname->OpenCtl[]` contains parameters to the open() operation. Frequently these params provide additional information on how to open the object. The use of these parameters is determined by the author of the objectsystem driver. The parameters are those passed in normal URL fasion (?param=value, etc.). Typically, the only OpenCtl of interest is going to be `obj->Pathname->OpenCtl[obj->SubPtr]` (see below for SubPtr meaning).
+ - `obj->Pathname->OpenCtl : pStruct[]`: Parameters for the open() operation, as defined by the driver author. These are specified in the path in a similar way to URLs (`example.qy?param1=value&param2=other_value`). Drivers typically only use `obj->Pathname->OpenCtl[obj->SubPtr]` (see SubPtr below) to retrieve their own parameters, ignoring parameters passed to other drivers in the path. - - `obj->SubPtr` is the number of components in the path that are a part of the node object's path. For example, in the above path of '/datasources/OMSS_DB', the path would be internally represented as './datasources/ OMSS_DB', and the SubPtr would be 3. + - `obj->SubPtr : short`: The number of components in the path that are a part of the path to the root node object, including the `.` for the top level directory. For example, given the path `/data/file.csv`, the path would be internally represented as `./ data/ file.csv`, so SubPtr is 3. - - `obj->SubCnt` reflects the number of components of the path which are under the control of the current driver. This includes the node object, so SubCnt will always be at least 1. For example, when opening '/data/file.csv/rows/1', and the driver in question is the CSV driver, SubPtr would be 3 (includes an "invisible" first component), from '/data/file.csv', and SubCnt would be 3, from 'file.csv/rows/1'. The driver will need to SET THE SUBCNT value in its Open function. SubPtr is already set. + - `obj->SubCnt : short`: _The driver should set this value_ to show the number of components it controls. This includes the root node object, so `SubCnt` will always be at least 1. For example, when opening `/data/file.csv/rows/1`, the CSV driver will read the `SubPtr` of 3 (see above), representing `./ data/ file.csv`. It will then set a `SubCnt` of 3, representing that it will control `file.csv /rows /1`. (The driver only sets `SubCnt`, `SubPtr` is provided.) - - `obj->Prev` is the underlying object as opened by the next-lower-level driver.
It is the duty of this driver to parse the content of that object and do something meaningful with it. + - `obj->Prev : pObject`: The underlying object as opened by the next-lower-level driver. The file can be accessed and parsed by calling functions and passing this pointer to them (such as the st_parse functions, see below). **DO NOT attempt to open the file directly with a call like `fopen()`,** as this would require hard coding the path to the root directory of the object system, which *will* break if the code runs on another machine. - - `obj->Prev->Flags` contains some critical information about the underlying object. If it contains the flag `OBJ_F_CREATED`, then the underlying object was just created by this open() operation. In that case, this driver is expected to create the node with snNewNode() (see later in this document) as long as obj->Mode contains O_CREAT. + - `obj->Prev->Flags : short`: Contains some useful flags about the underlying object, such as: + - `OBJ_F_CREATED`: The underlying object was just created by this open() operation. In that case, this driver is expected to create the node with `snNewNode()` (see later in this document) as long as `obj->Mode` contains `O_CREAT`. + -- `mask` (int) - Indicates the security mask to be given to the object if it is being created. Typically, this will only apply to files and directories. The values are the same as UNIX chmod() type values. +- `mask : int`: The permission mask to be given to the object, if it is being created. Typically, this will only apply to files and directories, so most drivers can ignore it. The values are the same as the UNIX [octal digit permissions](https://en.wikipedia.org/wiki/Chmod#:~:text=Octal%20digit%20permission) used for the `chmod()` command. -- `systype` (pContentType) - This param indicates the content type of the node object as determined by the OSML. The ContentType structure is defined in include/ obj.h, and includes among other things the name of the content type. 
For example, for the reporting driver, this type would be "system/report". +- `sys_type : pContentType`: Indicates the content type of the node object as determined by the OSML. The ContentType structure is defined in `include/obj.h`. `sys_type->Name` lists the name of the content type (e.g. `"system/query"` for the query driver). + -- `usrtype` (char*) - This param is the requested object type by the user and is normally used when creating a new object, though under some circumstances it may change the way the open operates on an existing object. For example, the reporting driver can change whether it generates HTML report text or plaintext reports based on usrtype being either "text/html" or "text/plain". +- `usr_type : char*`: The object type requested by the user. This is normally used when creating a new object, though some drivers also use it when opening an existing object. For example, the reporting driver generates HTML report text or plaintext reports if `usr_type` is `"text/html"` or `"text/plain"` (respectively). -- `oxt` (pObjTrxTree*) - This param is only used by object drivers that specified a capability of OBJDRV_C_TRANS. More on this field later. For non-transaction-aware drivers, this field can be safely ignored. +- `oxt : pObjTrxTree*`: The transaction tree, used when the driver specifies the `OBJDRV_C_TRANS` capability. More on this field later. Non-transaction-aware drivers can safely ignore this field. + + 📖 **Note**: Yes, this param *is* a pointer to a pointer. Essentially, a pointer passed by reference. - Yes, this param *is* a pointer to a pointer. Essentially, a pointer passed by reference. -The Open routine should return its internal structure pointer on success, or `NULL` on failure. It is normal to allocate one such structure per Open call, and for the structure to point, among other things, to shared data describing the node object. Accessing the node object is described later in this document. 
+The `Open()` routine should return a pointer to an internal driver structure on success, or `NULL` on failure. It is normal to allocate one such structure per `Open()` call, and for one of the structure fields to point to shared data describing the node object. Accessing the node object is described later in this document. -It is important to know what kinds of fields normally are placed in the allocated data structure returned by Open. These fields are all determined by the driver author, but here are a few typical ones that are helpful to have ("inf" is the pointer to the structure here): +While driver instance structures may vary, some fields are common in most drivers (`inf` is the pointer to the structure here): | Field | Type | Description | ---------- | --------- | ------------ -| inf->Obj | pObject | This is a copy of the 'obj' pointer passed to the Open routine. -| inf->Mask | int | The 'mask' argument passed to Open. -| inf->Node | pSnNode | A pointer to the node object, as returned from snNewNode() or snReadNode(), or if structure files aren't being used as the node content type, a pointer to whatever structure contains information about the node object. +| inf->Obj | pObject | A copy of the `obj` pointer passed to `Open()`. +| inf->Mask | int | The `mask` argument passed to `Open()`. +| inf->Node | pSnNode | A pointer to the node object. This can come from `snNewNode()` or `snReadNode()` (for structure files), or other node struct information. + + +--- +### Function: OpenChild() +*(Optional)* +```c +void* xxxOpenChild(void* inf_v, pObject obj, char* child_name, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt); +``` +**No documentation provided.** + +--- +### Function: Close() +```c +int xxxClose(void* inf_v, pObjTrxTree* oxt); +``` +The close function closes a driver instance, freeing all allocated data and releasing all shared memory such as open connections, files, or other driver instances. 
The driver must ensure that all memory allocated by originally opening the object (or allocated by other functions that may be called on an open object) is properly deallocated. This includes the internal structure returned by `Open()`, or by `QueryFetch()`, which is passed in as `inf_v`. The driver may also need to decrement the Open Count (`node->OpenCnt--`) if it had to increment this value during `Open()`. Before doing so, it should also perform a `snWriteNode()` to write any modified node information to the node object. -The Close() routine is called with two parameters: +- 📖 **Note**: Remember that the passed driver instance may originally be from a call to `Open()` or a call to `QueryFetch()`. + +- 📖 **Note**: Even if close fails, the object should still be closed in whatever way is possible. The end-user should deal with the resulting situation by reviewing the `mssError()` messages left by the driver. + +- 📖 **Note**: Information may be left unfreed if it is stored in a cache for later use. + +The `Close()` routine is called with two parameters: | Param | Type | Description | ------ | ------------ | ------------ -| inf_v | void* | This param is the pointer that the Open routine returned. Normally the driver will cast the void* parameter to some other structure pointer to access the object's information. -| oxt | pObjTrxTree* | The transaction tree pointer. - -The Close routine should return 0 on success or -1 on failure. The os driver must make sure it properly deallocates the memory used by originally opening the object, such as the internal structure returned by open and passed in as inf_v. +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. -Note the semantics of a Close failure - the object should still be closed in whatever way is still meaningful. 
The end-user must deal with the situation by reviewing the returned mssError messages. +The Close routine should return 0 on success or -1 on failure. -Before exiting, the Close routine should make sure it decrements the Open Count (node->OpenCnt--). Before doing so, it should also perform a snWriteNode() to write any modified node information back to the node object. -### C. Creating and Deleting Objects. -The Create and Delete functions are used for creating and deleting objects. Normally, the os driver will process the Pathname in the same manner for Create and Delete as for Open, thus such functionality could be placed in another function. +### Function: Create() +```c +int xxxCreate(pObject obj, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt); +``` +The `Create()` function is used to create a new object, and uses the same parameters and return value as `Open()` (documented in detail above). This often means adding a new file to the file system to represent the object. Many drivers do not implement this and recommend that driver end-users create files using a standard text editor or programatically using more general means, such as general structure file generation. If implemented, this function frequently requires very similar path parsing functionality to `Open()`. -As a side note, within Centrallix, the standard function naming convention is to use `xxx_internal_FunctionName()` for functions that are more or less internal to the module and not a part of any standard interface. +- 📖 **Note**: For many drivers, the `Create()` function calls the driver's `Open()` function with `O_CREAT`, then calls its `Close()` function, although some drivers may manage this differently. -The Create routine has parameters identical to the Open routine. It should return 0 on success and -1 on error. 
-The Delete routine is passed the following parameters: +### Function: Delete() +```c +int xxxDelete(pObject obj, pObjTrxTree* oxt); +``` +The `Delete()` function is used to delete an object, which often means removing a file from the file system. The Delete routine is passed the following parameters: | Param | Type | Description | ------ | ------------- | ------------ | obj | pObject | The Object structure pointer, used in the same way as in Open and Delete. -| oxt | pObjTrxTree* | The transaction tree pointer. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. Delete should return 0 on success and -1 on failure. -For many objectsystem drivers, the Create function simply calls the driver's internal Open() with O_CREAT and then its internal Close, although some drivers could manage Create differently from Open. -### D. Reading and Writing Object Content. -Some, but not all, objects will have content. If the object does or can have content, the driver should handle these functions as is appropriate. Otherwise, the driver should return a failure code (-1) from these functions. +### Function: DeleteObj() +```c +int xxxDeleteObj(void* inf_v, pObjTrxTree* oxt); +``` +**No documentation provided.** + + +### Function: Read() +```c +int xxxRead(void* inf_v, char* buffer, int max_cnt, int offset, int flags, pObjTrxTree* oxt); +``` + + +The `Read()` function reads content from objects that have content, similar to reading content from a file. If the object does or can have content, the driver should handle these functions as is appropriate. Otherwise, the driver should return a failure code (-1) and call `mssError()` in these functions. -The Read routine reads content from the object, as if reading from a file. 
The parameters passed are almost identical to those used in the fdRead command in MTASK: +The parameters passed are intentionally similar to the `fdRead()` function in `mtask.c`: | Parameter | Type | Description | --------- | ------------- | ------------ -| inf_v | void* | The generic pointer to the structure returned from Open(). -| buffer | char* | The destination buffer for the data being read in. -| maxcnt | int | The maximum number of bytes to read into the buffer. -| flags | int | Either 0 or FD_U_SEEK, in which case the user is specifying the seek offset for the read in the 5th argument. Of course, not all objects will be seekable, and furthermore, some of the objects handled by the driver may have full or limited seek functionality, even though others may not. -| arg | int | Extra argument, currently only used to specify an optional seek offset. -| oxt | pObjTrxTree* | The transaction tree pointer. +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| buffer | char* | The buffer where read data should be stored. +| max_cnt | int | The maximum number of bytes to read into the buffer. +| flags | int | Either `0` or `FD_U_SEEK`. If `FD_U_SEEK` is specified, the caller should specify a seek offset in the 5th argument (`arg`). +| arg | int | Extra argument, currently only used to specify the optional seek offset. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. -The Write routine is very similar, except that instead of 'maxcnt', the third argument is 'cnt', and specifies how much data is in the buffer waiting to be written. +- 📖 **Note**: Not all objects can be seekable and some of the objects handled by the driver may have limited seek functionality, even if others do not. Each of these routines should return -1 on failure and return the number of bytes read/written on success. At end of file or on device hangup, 0 should be returned once, and then subsequent calls should return -1. 
-### E. Querying for Child Objects. -Many objects will have the capability of having sub-objects beneath them, called child objects. In such a case, the parent object becomes a directory of sorts, even though the parent object may also have content, something which is somewhat foreign in the standard filesystem world, but is common for web servers, where opening a directory returns the file 'index.html' on many occasions. -To enumerate a parent object's child objects, the query functions are used. A query may have a specific criteria so that only objects having certain attributes will be listed. As mentioned earlier in this document, a driver may or may not choose to intelligently handle those criteria. The driver has the option of always enumerating all child objects via its query functions, and allowing the OSML filter them and only return to the user the objects that match the criteria. But it also can do the filtering itself or, more typically, pass the filtering on to the source of the data the driver manages, as with a database server. +### Function: Write() +```c +int xxxWrite(void* inf_v, char* buffer, int cnt, int offset, int flags, pObjTrxTree* oxt); +``` + +The `Write()` function is very similar to the `Read()` function above, allowing the caller to write data to objects of supporting drivers with content. However, the third argument (`max_cnt`) is replaced with `cnt`, specifying the number of bytes of data in the buffer that should be written. -The query mechanism can also be used to delete a set of child objects, optionally matching a certain criteria. The QueryDelete method may be left NULL in the ObjDriver structure if the driver does not implement full query support, in which case the OSML will iterate through the query results and delete the objects one by one. -The first main function for handling queries is OpenQuery. 
This function is passed three arguments: +### Function: OpenQuery() +```c +void* xxxOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt); +``` +The `OpenQuery()` function opens a new query instance struct for fetching query results from a specific driver instance. Queries are often used to enumerate an object's child objects, although this is not a requirement. Queries may include specific criteria, and the driver may decide to intelligently handle them (either manually or, more often, by passing them on to a lower level driver or database) or simply to enumerate all results with its query functions. In the latter case, the OSML layer will filter results and only return objects that match the criteria to the user. -- `inf_v` (void*) The value returned from Open for this object. +`OpenQuery()` is passed three parameters: +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| query | pObjQuery | A query structure created by the object system. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. -- `query` (pObjQuery) The query structure setup by the OSML. It will contain several key fields: +The `query : pObjQuery` parameter contains several useful fields: +| Parameter | Type | Description +| --------------- | ----------------------- | ------------ +| query->QyText | char* | The text specifying the criteria (i.e., the WHERE clause, in Centrallix SQL syntax). +| query->Tree | void* (pExpression) | The compiled expression tree. This expression evaluates to a nonzero value for `true` if the where clause is satisfied, or zero for `false` if it is not. +| query->SortBy[] | void*[] (pExpression[]) | An array of expressions giving the various components of the sorting criteria. +| query->Flags | int | The driver should set and/or clear the `OBJ_QY_F_FULLQUERY` and `OBJ_QY_F_FULLSORT` flags, if needed. 
- - `query->QyText`: the text of the criteria (i.e., the WHERE clause, in Centrallix SQL syntax) +The `OBJ_QY_F_FULLQUERY` flag indicates that the driver will handle the full WHERE clause specified in `query->Tree`. - - `query->Tree`: the compiled expression tree, which evaluates to nonzero for true or zero for false as the WHERE clause condition. +The `OBJ_QY_F_FULLSORT` flag indicates that the driver will handle all sorting for the data specified in `query->SortBy[]`. - - `query->SortBy[]`: an array of expressions giving the various components of the sorting criteria. +If the driver can easily handle sorting/selection (as when querying a database), it should set these flags. Otherwise, it should let the OSML handle the ORDER BY and WHERE conditions to avoid unnecessary work for the driver author. - - `query->Flags`: the driver should set and/or clear the flags `OBJ_QY_F_FULLQUERY` and `OBJ_QY_F_FULLSORT` if need be. The former indicates that the driver is willing to handle the full WHERE clause (the query->Tree). The latter indicates that the driver is willing to handle the sorting of the data as well (in query->SortBy[]). If the driver can easily have the sorting/selection done (as when querying an RDBMS), it should set these flags. Otherwise, it should let the OSML take care of the ORDER BY and WHERE conditions. +The `OpenQuery()` function returns a `void*` for the query instance struct, which will be passed to the other query functions (`QueryDelete()`, `QueryFetch()`, and `QueryClose()`). This structure normally points to the driver instance struct to allow easy access to queried data. `OpenQuery()` returns `NULL` if the object does not support queries or if an error occurs, in which case `mssError()` should be called before returning. -- `oxt` (pObjTrxTree*) The transaction tree pointer. -The OpenQuery function should return a void* value, which will within the driver point to a structure used for managing the query. 
This structure will normally have a pointer to the inf_v value returned by Open as well, since inf_v is never passed to QueryFetch, QueryDelete or QueryClose. OpenQuery should return NULL if the object does not support queries or if some other error condition occurs that will prevent the execution of the query. +### Function: QueryDelete() +*(Optional)* +```c +int xxxQueryDelete(void* qy_v, pObjTrxTree* oxt); +``` + +Deletes results in the query result set, optionally matching a certain criteria. `QueryDelete()` is passed two parameters: -Once the query is underway with OpenQuery, the user will either start fetching the results with QueryFetch, or will issue a delete operation with QueryDelete. +| Parameter | Type | Description +| --------- | ------------- | ------------ +| qy_v | void* | A query instance pointer (returned from `OpenQuery()`). +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. -The QueryFetch routine should return an inf_v pointer to the child object, or NULL if no more child objects are to be returned by the query. Some drivers may be able to use their internal Open function to generate the newly opened object, although others will directly allocate the inf_v structure and fill it in based on the current queried child object. QueryFetch will be passed these parameters: +`QueryDelete()` returns 0 to indicate a successful deletion, or -1 to indicate failure, in which case `mssError()` should be called before returning. -| Parameter | Type | Description -| ---------- | -------------- | ------------ -| qy_v | void* | The value returned by OpenQuery. -| obj | pObject | The newly-created object structure that the OSML is using to track the newly queried child object. -| mode | int | The open mode for the new object, as with obj->Mode in Open(). -| oxt | pObjTrxTree* | The transaction tree pointer. 
+If a delete is needed and this method is not implemented, the OSML will iterate through the query results and delete the objects one by one. -All object drivers will need to add an element to the obj->Pathname structure to indicate the path to the child object being returned. This will involve a process somewhat like this: (given that new_name is the new object's name, qy is the current query structure, which contains a field 'Parent' that points to the inf_v originally returned by Open, and where the inf_v contains a field Obj that points to the Object structure containing a Pathname structure) +### Function: QueryFetch() ```c - int cnt; +void* xxxQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt); +``` +The `QueryFetch()` function fetches a driver instance pointer (aka. an `inf_v` pointer) to a child object, or `NULL` if there are no more child objects. It may be helpful to think of `QueryFetch()` as similar to an alternate form of `Open()`, even if your driver does not implement the functionality to `Open()` every object that can be found with `QueryFetch()`. In fact, some drivers may use an internal `Open()` function to generate the opened objects. + +`QueryFetch()` takes four parameters: + +| Parameter | Type | Description +| ---------- | ------------- | ------------ +| qy_v | void* | A query instance struct (returned by `OpenQuery()`). +| obj | pObject | An object structure that the OSML uses to track the newly queried child object. +| mode | int | The open mode for the new object, the same as `obj->Mode` in `Open()`. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. + + +The driver should add an element to the `obj->Pathname` structure to indicate the path of the returned child object. This will involve a process somewhat like this, where: +- `new_name : char*` is the new object's name. +- `qy : pMyDriversQueryInf` is the current query structure. 
+- `qy->Parent->Obj->Pathname : pPathname` points to the affected Pathname struct. + +```c + int count; pObject obj; char* new_name; pMyDriversQueryInf qy; - /** Build the filename. **/ - cnt = snprintf(obj->Pathname->Pathbuf, 256, "%s/%s", - qy->Parent->Obj->Pathname->Pathbuf,new_name); - if (cnt < 0 || cnt >= 256) return NULL; - obj->Pathname->Elements[obj->Pathname->nElements++] = - strrchr(obj->Pathname->Pathbuf,'/')+1; + /** Build the new filename. **/ + count = snprintf(obj->Pathname->Pathbuf, 256, "%s/%s", qy->Parent->Obj->Pathname->Pathbuf, new_name); + if (count < 0 || 256 <= count) return NULL; + obj->Pathname->Elements[obj->Pathname->nElements++] = strrchr(obj->Pathname->Pathbuf, '/') + 1; +``` + +### Function: QueryCreate() +```c +void* xxxQueryCreate(void* qy_v, pObject new_obj, char* name, int mode, int permission_mask, pObjTrxTree *oxt); ``` + +**No documentation provided.** + -QueryDelete is passed the qy_v void* parameter, and an oxt parameter. It should return 0 on successful deletion, and -1 on failure. +### Function: QueryClose() +```c +int xxxQueryClose(void* qy_v, pObjTrxTree* oxt); +``` +The close function closes a query instance, freeing all allocated data and releasing all shared memory such as open connections, files, or other driver instances. This function operates very similarly to `Close()`, documented in detail above. The query should be closed, whether or not `QueryFetch()` has been called enough times to enumerate all of the query results. -QueryClose is also passed qy_v and oxt. It should close the query, whether or not QueryFetch has been called enough times to enumerate all of the query results. -### F. Managing Object Attributes -All objects will have at least some attributes. Five attributes are mandatory: 'name', 'content_type', 'inner_type', 'outer_type', and 'annotation'. All compliant drivers must implement these five attributes, all of which have a data type of DATA_T_STRING. 
+### Object Attributes +All objects can have attributes, and there are five required attributes that all drivers must implement (explained below). Currently, the OS specification includes support for the following data types: -- DATA_T_INTEGER - 32-bit signed integer. -- DATA_T_STRING - Zero-terminated ASCII string. -- DATA_T_DOUBLE - Double-precision floating point. -- DATA_T_DATETIME - date/time structure. -- DATA_T_MONEY - money data type. +| Name | Description +| ----------------- | ------------ +| `DATA_T_INTEGER` | 32-bit signed integer. +| `DATA_T_STRING` | Null-terminated ASCII string. +| `DATA_T_DOUBLE` | Double-precision floating point number. +| `DATA_T_DATETIME` | Date/time structure. +| `DATA_T_MONEY` | Money structure. + +See `datatypes.h` for more information. + +For `true`/`false` or `on`/`off` attributes, use `DATA_T_INTEGER` where 0 indicates `false` and 1 indicates `true`. + +The following five attributes are required (all are of type `DATA_T_STRING`): + +| Attribute | Description +| ------------ | ------------ +| name | The name of the object, just as it appears in any directory listing. The name of the object must always be unique for its directory. +| annotation | A short description of the object. While users may not assign annotations to all objects, each object should be able to have an annotation. For example, in the Sybase driver, annotations for rows are created by assigning an 'expression' to the table in question, such as `first_name + last_name` for a people table. +| content_type | The type of the object's content, given as a MIME-type. Specify `"system/void"` if the object does not have content. +| inner_type | An alias for 'content_type'. Both should be supported. +| outer_type | This is the type of the object itself (the container). Specify `"system/row"` for objects that can be queried. + +The `last_modification : DATA_T_DATETIME` attribute is a sixth, optional attribute that may be useful in some situations. 
This attribute should indicate the last time that the object's content was modified or updated. + + + + +### Function: GetAttrType() +```c +int xxxGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt); +``` +The `GetAttrType()` function returns the DATA_T_xxx value for the datatype of the requested attribute. It takes three parameters: + +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| attr_name | char* | The name of the attribute to be queried. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. + +This function should return `DATA_T_UNAVAILABLE` if the requested attribute does not exist on the driver instance. It should return -1 to indicate an error, in which case `mssError()` should be called before returning. + +For example, calling the following on any driver should return `DATA_T_STRING`. +```c +int datatype = driver->GetAttrType(inf_v, "name", oxt); +``` + + +### Function: GetAttrValue() +```c +int xxxGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); +``` +The `GetAttrValue()` function takes five parameters: + +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| attr_name | char* | The name of the attribute to be queried. +| datatype | int | The DATA_T_xxx type in which the attribute's value is expected. +| val | pObjData | A pointer to a location where the value of the attribute should be stored. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. -True/false or on/off attributes should be treated as DATA_T_INTEGER for the time being with values of 0 and 1. +The value pointer should be handled in different ways, depending on the type: +- For `DATA_T_INTEGER` types, it is assumed to point to a 32-bit integer where the value should be written. 
+- For `DATA_T_STRING` types, it is assumed to point to an empty `char*` location where a pointer to a string should be written. +- For `DATA_T_DOUBLE` types, it is assumed to point to a double value where the double should be written. +- For `DATA_T_DATETIME` types, it is assumed to point to an empty `pDateTime` where a pointer to a date time struct (see `obj.h`) should be written. -Here is a description of the functionality of the five mandatory attributes: +In this way, integer and double values are returned by value, and string or datetime values are returned by reference. Items returned by reference are guaranteed to be valid until either the object is closed, or another call to `GetAttrValue()` or `SetAttrValue()` is made on the same driver (whichever happens first). -| Attribute | Description -| -------------- | ------------ -| 'name' | This attribute indicates the name of the object, just as it should appear in any directory listing. The name of the object must be unique for the directory it is in. -| 'content_type' | This is the type of the object's content, given as a MIME-type. -| 'annotation' | This is an annotation for the object. While users may not assign annotations to all objects, each object should be able to have an annotation. Normally the annotation is a short description of what the object is. For the Sybase driver, annotations for rows are created by assigning an 'expression' to the table in question, such as 'first_name + last_name' for a people table. -| 'inner_type' | An alias for 'content_type'. Both should be supported. -| 'outer_type' | This is the type of the object itself (the container). +This function should return -1 on a non-existent attribute, 0 on success, and 1 if the value is `NULL` or undefined / unset. -A sixth attribute is not mandatory, but is useful if the object might have content that could in turn be a node object (be interpreted by another driver). 
This attribute is 'last_modification', of type DATA_T_DATETIME, and should indicate when the object's content was last updated or modified. +- 📖 **Note**: The caller of this function can use the POD(x) macro to typecast appropriate pointers to the pObjData pointer passed to this function. The ObjData structure is a UNION type of structure, allowing easy manipulation of data of various types. See `datatypes.h` for more information. -The first function to be aware of is the GetAttrType function. This routine takes the inf_v pointer, the name of the attribute in question, and the oxt* pointer. It should return the DATA_T_xxx value for the data type of the attribute. +- 📖 **Note**: In legacy code, a typecasted void* was used instead of a pObjData pointer used today. This method was binary compatible with the current solution because the pObjData is a pointer to a struct union. See `datatypes.h` for more information. -Next is the GetAttrValue function, which takes four parameters: the inf_v pointer, the name of the attribute, a void pointer pointing to where the attribute's value will be put, and the oxt* pointer. The way the value pointer is handled depends on the data type. For DATA_T_INTEGER types, the value pointer is assumed to be pointing to a 32-bit integer where the integer value can be written. For DATA_T_ STRING types, the value pointer is assumed to be pointing to an empty pointer location where a pointer to the string can be stored. For DATA_T_DATETIME types, the value pointer is assumed to be pointing to an empty pointer where a pointer to a date time structure (from obj.h) can be stored. And for double values, the value pointer points to a double value where the double will be stored. In this way, integer and double values are returned from GetAttrValue by value, and string or datetime values are returned from GetAttrValue by reference. 
Items returned by reference must be guaranteed to be valid until the object is closed, or another GetAttrValue or SetAttrValue call is made. This function should return -1 on a non-existent attribute, 0 on success, and 1 if the value is NULL or unset. -UPDATE ON GETATTR/SETATTR: These functions now, instead of taking a void* pointer for the value, take a pObjData pointer, which points to an ObjData structure. The POD(x) macro can be used to typecast appropriate pointers to a pObjData pointer. The ObjData structure is a UNION type of structure, allowing easy manipulation of data of various types. See 'datatypes.h'. Note that this is binary compatible with the old way of using a typecasted void pointer. +### Function: SetAttrValue() +```c +int xxxSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); +``` +The `SetAttrValue()` function is the same as `GetAttrValue()`, however it sets the value by reading it from the `val` parameter instead of getting the value by writing it to the `val` parameter. The return value is also identical, and `mssError()` should be invoked on failure, or if setting attributes programatically is not implemented. -The SetAttrValue function works much the same way as GetAttrValue, just with the information moving in the opposite direction. The third parameter, void* value, is treated in the same manner. -The GetFirstAttr and GetNextAttr functions each take two parameters, the inf_v pointer and the oxt* pointer, and are used to iterate through the non-mandatory attributes for the object. GetFirstAttr should return a string naming the first attribute, and GetNextAttr should iterate through subsequent attributes. When the attributes are exhausted, these functions should return NULL. The attributes 'name', 'annotation', and 'content_type' should not be returned. If the object has no other attributes, GetFirstAttr should return NULL. 
+### Function: GetFirstAttr() & GetNextAttr() +```c +char* xxxGetFirstAttr(void* inf_v, pObjTrxTree* oxt); +char* xxxGetNextAttr(void* inf_v, pObjTrxTree* oxt); +``` +These functions return the names of attributes that can be queried on an object. They both take the same two parameters. -AddAttr is used to add a new attribute to an existing object. Not all objects support this, and many will refuse the operation. The parameters are as follows: void* inf_v, char* attrname, int type, void* value, and pObjTrxTree* oxt. +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. + +These functions should only return the names of significant values, so `name`, `annotation`, etc. should not be returned from these functions, even though they are required to be valid values for any object. Typically, this is implemented by `GetFirstAttr()` resetting some internal value in the driver `inf_v`, then returning the result of `GetNextAttr()`. `GetNextAttr()` extracts a string from an array or other list of valid attribute names for the object and increments the internal counter. Once the attributes are exhausted, `GetNextAttr()` returns `NULL` and `GetFirstAttr()` can be used to restart and begin querying elements from the start of the list again. If an object has no significant attributes, `GetFirstAttr()` and `GetNextAttr()` both return NULL. -OpenAttr is used to open an attribute for objRead/objWrite as if it were an object with content. Not all object drivers will support this; this routine should return an inf_v pointer for the new descriptor, and takes four parameters: void* inf_v, char* attrname, int mode, and pObjTrxTree* oxt. The mode is used in the same manner as the Open function. -### G. Managing Object Methods -Objects may optionally have methods associated with them. 
Each method is given a unique name within the object, and can take a single string parameter. Three functions exist for managing methods. +### Function: AddAttr() +```c +int xxxAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree* oxt); +``` +The `AddAttr()` function adds a new attribute to an existing object. Not all objects support this, and many will refuse the operation. The parameters are the same as those of `GetAttrValue()` and `SetAttrValue()`, documented in detail above. + + +### Function: OpenAttr() +```c +void* xxxOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree* oxt); +``` +The `OpenAttr()` function is used to open an attribute for `objRead()`/`objWrite()` as if it were an object with content. Not all object drivers will support this, and many will refuse the operation. + +This function takes 4 parameters. `inf_v`, `attr_name`, and `oxt` are the same as they are for `GetAttrValue()` and `SetAttrValue()`. `mode` is the same as it is for `Open()`. This function should return an `inf_v` pointer for the new descriptor (similar to `Open()` and `QueryFetch()` above). + + +### Function: ExecuteMethod() +```c +int xxxExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrxTree* oxt); +``` +The `ExecuteMethod()` function is used to execute a method on an object. This feature is rarely used, but some drivers have created methods for actions like dropping their cache or printing debug information. Each method has a unique name within that object, and can take a single string parameter. + +The `ExecuteMethod()` function takes four parameters: + +| Parameter | Type | Description +| ----------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| method_name | char* | The name of the method to be executed. +| param | pObjData | A pointer to a location where the string value of the param is stored. 
+| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. + +- 📖 **Note**: The `pObjData` type of the `param` parameter makes it possible that other types of parameters could be supported in the future, however, this is not currently implemented. + +The function returns 0 on success, and -1 to indicate an error, in which case `mssError()` should be called before returning. + + +### Function: GetFirstMethod() & GetNextMethod() +```c +char* xxxGetFirstMethod(void* inf_v, pObjTrxTree* oxt); +char* xxxGetNextMethod(void* inf_v, pObjTrxTree* oxt); +``` +These functions work the same as `GetFirstAttr()` and `GetNextAttr()` (respectively), except that they return the method names instead of the attribute names. + + +### Function: PresentationHints() +```c +pObjPresentationHints xxxPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt); +``` +The `PresentationHints()` function allows the caller to request extra information about a specific attribute on a specific driver instance object. Most of this information is intended to be used for displaying the attribute in a user interface, although it can also be useful for general data validation. As such, many drivers may not implement this function. + +The `PresentationHints()` function takes three parameters: + +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| attr_name | char* | The name of the requested attribute. +| oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. + +This function returns a new pObjPresentationHints struct on success, or NULL to indicate an error, in which case `mssError()` should be called before returning. 
+This struct should be allocated using `nmMalloc()`, and memset to zero, like this:
+```c
+pObjPresentationHints hints = nmMalloc(sizeof(ObjPresentationHints));
+if (hints == NULL) goto error_handling;
+memset(hints, 0, sizeof(ObjPresentationHints));
+```
+
+The return value, `hints : ObjPresentationHints`, contains the following useful fields which the function should set to give various useful information about the attribute.
+- `hints->Constraint : void*`: An expression for determining if a value is valid.
+- `hints->DefaultExpr : void*`: An expression defining the default value.
+- `hints->MinValue : void*`: An expression defining the minimum valid value.
+- `hints->MaxValue : void*`: An expression defining the maximum valid value.
+- `hints->EnumList : XArray`: If the attribute is a string enum, this XArray lists the valid string values.
+- `hints->EnumQuery : char*`: A query string which enumerates the valid values of a string enum attribute.
+- `hints->Format : char*`: The presentation format - datetime or money.
+- `hints->AllowChars : char*`: An array of all valid characters for a string attribute, NULL to allow all characters.
+- `hints->BadChars : char*`: An array of all invalid characters for a string attribute.
+- `hints->Length : int`: The maximum length of data that can be included in a string attribute.
+- `hints->VisualLength : int`: The length that the attribute should be displayed if it is shown to the user.
+- `hints->VisualLength2 : int`: The number of lines to use in a multi-line edit box for the attribute.
+- `hints->BitmaskRO : unsigned int`: Which bits, if any, in a bitmask are read-only.
+- `hints->Style : int`: Style flags, documented below.
+- `hints->StyleMask : int`: A mask for which style flags were set and which were left unset / undefined.
+- `hints->GroupID : int`: Used to assign attributes to groups. Use -1 if the attribute is not in a group.
+- `hints->GroupName : char*`: The name of the group to which this attribute belongs, or NULL if it is ungrouped or if the group is named elsewhere.
+- `hints->OrderID : int`: Used to specify an attribute order.
+- `hints->FriendlyName : char*`: Used to specify a "display name" for an attribute (e.g. `n_rows` might have a friendly name of `"Number of Rows"`). Should be `nmSysMalloc()`ed, often using `nmSysStrdup()`.
+
+- ⚠️ **Warning**: Behavior is undefined if:
+  - A character is included in both `hints->AllowChars` and `hints->BadChars`.
+  - The data is longer than `hints->Length`.
+
+The `hints->Style` field can be set with several useful flags. To specify that a flag is not set (e.g. to specify explicitly that a field does allow `NULL`s), set the corresponding bit in the `hints->StyleMask` field while leaving the bit in the `hints->Style` field set to 0.
+
+The following macros are provided for setting style flags:
+- `OBJ_PH_STYLE_BITMASK`: The items in `hints->EnumList` or `hints->EnumQuery` are bitmasked.
+- `OBJ_PH_STYLE_LIST`: List-style presentation should be used for the values of an enum attribute.
+- `OBJ_PH_STYLE_BUTTONS`: Radio buttons or check boxes should be used for the presentation of enum attribute values.
+- `OBJ_PH_STYLE_NOTNULL`: The attribute does not allow `NULL` values.
+- `OBJ_PH_STYLE_STRNULL`: An empty string (`""`) should be treated as a `NULL` value.
+- `OBJ_PH_STYLE_GROUPED`: The GroupID should be checked so that fields can be grouped together.
+- `OBJ_PH_STYLE_READONLY`: The user is not allowed to modify this attribute.
+- `OBJ_PH_STYLE_HIDDEN`: This attribute should be hidden and not presented to the user.
+- `OBJ_PH_STYLE_PASSWORD`: Values in this attribute should be hidden, such as for passwords.
+- `OBJ_PH_STYLE_MULTILINE`: String values should allow multiline editing.
+- `OBJ_PH_STYLE_HIGHLIGHT`: This attribute should be highlighted when presented to the user.
+- `OBJ_PH_STYLE_LOWERCASE`: This attribute only allows lowercase characters. +- `OBJ_PH_STYLE_UPPERCASE`: This attribute only allows uppercase characters. +- `OBJ_PH_STYLE_TABPAGE`: Prefer the tab-page layout for grouped fields. +- `OBJ_PH_STYLE_SEPWINDOW`: Prefer separate windows for grouped fields. +- `OBJ_PH_STYLE_ALWAYSDEF`: Always reset the default value when this attribute is modified. +- `OBJ_PH_STYLE_CREATEONLY`: This attribute is writeable only when created, after that it is read only. +- `OBJ_PH_STYLE_MULTISEL`: Multiple select +- `OBJ_PH_STYLE_KEY`: This attribute is a primary key. +- `OBJ_PH_STYLE_APPLYCHG`: Presentation hints should be applied on DataChange instead of on DataModify. + + +### Function: Info() +```c +int xxxInfo(void* inf_v, pObjectInfo info); +``` +The `Info()` function allows the caller to request extra information about a specific driver instance object. It takes two parameters: + +| Parameter | Type | Description +| --------- | ------------- | ------------ +| inf_v | void* | A driver instance pointer (returned from `Open()` or `QueryFetch()`). +| info | pObjectInfo | A driver info struct allocated by the caller which the driver sets with information. + +The `pObjectInfo` struct has two fields: `Flags` and `nSubobjects`. This function should set `info->Flags` to 0 (to ensure no uninitialized noise gets into the data), then & it with all of the following flags that apply to that object. +- `OBJ_INFO_F_CAN_HAVE_SUBOBJ` / `OBJ_INFO_F_CANT_HAVE_SUBOBJ`: Indicates that the object can or cannot have subobjects. +- `OBJ_INFO_F_HAS_SUBOBJ` / `OBJ_INFO_F_NO_SUBOBJ`: Indicates that the object has or does not have subobjects. +- `OBJ_INFO_F_SUBOBJ_CNT_KNOWN`: Indicates that we know the number of subobjects. If set, the count should be stored in `info->nSubobjects`. +- `OBJ_INFO_F_CAN_HAVE_CONTENT` / `OBJ_INFO_F_CANT_HAVE_CONTENT`: Indicates that the object can or cannot have content (see `Read()` / `Write()`). 
+- `OBJ_INFO_F_HAS_CONTENT` / `OBJ_INFO_F_NO_CONTENT`: Indicates that this object does or does not have content (see `Read()` / `Write()`).
+- `OBJ_INFO_F_CAN_SEEK_FULL`: Seeking is fully supported (both forwards and backwards) on the object.
+- `OBJ_INFO_F_CAN_SEEK_REWIND`: Seeking is only supported with an offset of `0`.
+- `OBJ_INFO_F_CANT_SEEK`: Seeking is not supported at all.
+- `OBJ_INFO_F_CAN_ADD_ATTR` / `OBJ_INFO_F_CANT_ADD_ATTR`: Indicates that the object does or does not allow attributes to be added with the [AddAttr()](#function-addattr) function.
+- `OBJ_INFO_F_SUPPORTS_INHERITANCE`: Indicates that the object supports inheritance through attributes such as `cx__inherit`. See ??? for more information about object inheritance.
+
+- `OBJ_INFO_F_FORCED_LEAF`: Indicates that the object is forced to be a 'leaf' unless ls__type is used.
+- `OBJ_INFO_F_TEMPORARY`: Indicates that this is a temporary object without a valid pathname.
+
+
+The function returns 0 on success, and -1 to indicate an error, in which case `mssError()` should be called before returning.
+
+
+### Function: Commit()
+```c
+int xxxCommit(void* inf_v, pObjTrxTree *oxt);
+```
+**No documentation provided.**
+
+
+### Function: GetQueryCoverageMask()
+```c
+int xxxGetQueryCoverageMask(pObjQuery this);
+```
+**No documentation provided.**
+
+
+### Function: GetQueryIdentityPath()
+```c
+int xxxGetQueryIdentityPath(pObjQuery this, char* pathbuf, int maxlen);
+```
+**No documentation provided.**
+
-The first two functions, GetFirstMethod and GetNextMethod, work identically to their counterparts dealing with attributes. The third function, ExecuteMethod, starts a method executing. This function takes four parameters: the inf_v pointer, the name of the method, the optional string parameter, and the oxt* pointer.
## III Reading the Node Object -The Node object has content which controls what resource(s) this driver will actually access, so it is important for the driver to access the node object's content. If the driver's node objects are structure files (which is normally the case when dealing with a remote network resource), then the SN module can make opening the node object much more painless. It also performs caching automatically to improve performance. +A driver will commonly configure itself by reading text content from its node object file, at the root of its object subtree. This content may define what resource(s) a driver should provide, how it should access or compute them, and other similar information. Most drivers use the structure file format for their node objects because SN module makes parsing, reading, and writing these files easier. It also performs caching automatically to improve performance. -Note that the Node object will technically ALREADY BE OPEN as an object in the objectsystem. The OSML does that for you. If your driver will not use the SN/ST modules, then it should read the node object via the normal objRead() function, and write it via objWrite(). Your driver should NEVER objClose() the node object! The OSML does that for you. +- 📖 **Note**: The node object will **already be open** as an object in the ObjectSystem: The OSML does this for each driver. If a driver does not use the SN/ST modules, then it should read and write the node object directly with `objRead()` and `objWrite()`. A driver should **NEVER** `objClose()` the node object! The OSML handles that. -An objectsystem driver will commonly configure itself by reading a text file at the root of its object subtree. There are two main modules available for making this easier. +Although using the structure file format may be complex, it allows significant flexibility. Data is structured in hierarchies where each sub-object can have named attributes as well as sub-objects. 
Centrallix is filled with examples of this, including any `.qy`, `.app`, `.cmp`, or `.cluster` file. -The normal way to manage object parameters is to use a structure file. Structure files are a little more complicated, but allow for arrays of values for a given attribute name, as well as allowing for tree- structured hierarchies of attributes and values. Structure files are accessed via the stparse and st_node modules. The stparse module provides access to the individual attributes and groups of attributes, and the st_node module loads and saves the structure file heirarchies as a whole. The st_node module also provides node caching to reduce disk activity and eliminate repeated parsing of one file. +Structure files are accessed via the st_node (SN) and stparse (SP) modules. The st_node module loads and saves the structure file heirarchies as a whole. It also manages caching to reduce disk activity and eliminate repeated parsing of the same file. The stparse module provides access to the individual attributes and groups of attributes within a node structure file. -For example, if two sessions open two files, '/test1.rpt' and '/test2.rpt' the st_node (SN) module will cache the internal representations of these node object files, and for successive uses of these node objects, the physical file will not be re-parsed. The file will be re-parsed if its timestamp changes. +For example, if two sessions open two files, `/test1.rpt` and `/test2.rpt` the st_node module will cache the internal representations of these node object files, and for successive uses of these node objects, the physical file will not be re-parsed. The file will be re-parsed if its timestamp changes. + -If the underlying object does not support the attribute "last_modification" (assumed to be the timestamp), then SN prints a warning. In essence, this warning indicates that changes to the underlying object will not trigger the SN module to re-read the structure file defining the node object. 
Otherwise, the SN module keeps track of the timestamp, and if it changes, the node object is re-read and re-parsed. +If the underlying object does not support the attribute "last_modification" (assumed to be the timestamp), then st_node prints a warning. In essence, this warning indicates that changes to the underlying object will not trigger the st_node module to re-read the structure file defining the node object. Otherwise, the st_node module keeps track of the timestamp, and if it changes, the node object is re-read and re-parsed. -The driver's first course of action to obtain node object data is to open the node object with the SN module. The SN module's functions are listed below: +### Module: st_node +To obtain node object data, the driver should first open the node object with the st_node module. To use this module, include the file `st_node.h`, which provides the following functions (read `st_node.c` for more functions and additional information): -### pSnNode snReadNode(pObject obj) -This function reads a Structure File from the already-open node object which is passed in the "obj" parameter in the xxxOpen() routine. The "obj" parameter has an element, obj->Prev, which is a link to the node object as opened by the previous driver in the OSML's chain of drivers for handling this open(). All you need to know to get the parsed node object is the following: +### st_node: snReadNode() ```c - pSnNode node; +pSnNode snReadNode(pObject obj); +``` +The `snReadNode()` function reads a Structure File from the `obj` parameter, which should be a previously openned object. In a driver's `Open()` function, this is `obj->Prev` (the node object as opened by the previous driver in the OSML's chain of drivers). - node = snReadNode(obj->Prev); +**Usage:** +```c +pSnNode node = snReadNode(obj->Prev); +if (node == NULL) goto error_handling; ``` -The returned node structure is managed by the SN module and need not be nmFree()ed. 
The only thing that must be done is that the driver should increment the node structure's link count like this: +The returned node structure is managed by the SN module and does not need to be `nmFree()`ed. Instead, the driver should increment the node structure's link count for as long as it intends to use this structure, using `node->OpenCnt++;`. When the structure is no longer needed (e.g. when the driver instance is closed), the driver should decrement the link count. + + +### st_node: snNewNode() +```c +pSnNode snNewNode(pObject obj, char* content_type); +``` +The `snNewNode()` function creates a new node object of the given content type. The open link count should be incremented and decremented when appropriate, as with `snReadNode()`. +**Usage:** ```c - node->OpenCnt++; +pSnNode node = snNewNode(obj->Prev, "system/structure"); +if (node == NULL) goto error_handling; ``` -When closing an object (and thus releasing a reference to the Node structure), the driver should decrement the link count. +In this case, the new structure file will have the type: `"system/structure"`. + +- 📖 **Note**: This function only creates node object content, so the underlying object file must already exist. The OSML should do this for you because the previous driver (`obj->Prev`) creates the underlying object. -### pSnNode snNewNode(pObject obj, char* content_type) -This function creates a new node object with a given content type. The open link count should be incremented as appropriate, as before with snReadNode(). +### st_node: snWriteNode() ```c - pSnNode node; +int snWriteNode(pSnNode node); +``` +The `snWriteNode()` function writes a node's internal data back out to the node file, if the node's status (`node->Status`) is set to `SN_NS_DIRTY`. Otherwise, `snWriteNode()` does nothing. 
+ - node = snNewNode(obj->Prev, "system/structure"); +### st_node: snDelete() +```c +int snDelete(pSnNode node); ``` +The `snDelete()` function deletes a node by removing the node's data from the internal node cache. -The "system/structure" argument is the type that will be assigned to the newly created node object. Note that the underlying object must already exist in order for this to create a node object as that object's content. Normally the OSML does this for you by commanding the previous driver (handling obj->Prev) to create the underlying object in question. +- 📖 **Note**: This does not actually delete the node file. -### int snWriteNode(pSnNode node) -This function writes a node's internal representation back out to the node file. The node's status (node->Status) should be set to SN_NS_DIRTY in order for the write to actually occur. Otherwise, snWriteNode() does nothing. -### int snDeleteNode(pSnNode node) -This function deletes a node file. At this point, does not actually delete the file but instead just removes the node's data structures from the internal node cache. +### st_node: snGetSerial() +```c +int snGetSerial(pSnNode node); +``` +The `snGetSerial()` function returns the serial number of the node. -### int snGetSerial(pSnNode node) -This function returns the serial number of the node. Each time the node is re-read because of modifications to the file or is written via snWriteNode because of modifications to the internal structure, the serial number is increased. This is a good way for a driver to refresh internal information that it caches should it determine a node object has changed. +Each time the node is re-read because of modifications to the node file or is written with because `snWriteNode()` was called after modifications to the internal structure, the serial number is increased. This is a good way for a driver to determine if the node file has changed so it can refresh internal cached data. 
-The stparse module is used to examine the parsed contents of the node file. A node file using the stparse module (and thus st_node module) has a structure file format; see StructureFile.txt. The file format is a tree structure with objects, subobjects, and attributes. The internal parsed representation is a tree, with each tree node being an object in the structure file, and each node having attributes, each of which is also a tree node. Thus, there are three different node types in the tree representation: the top-level ST_T_STRUCT element, which can contain subgroups and attributes; a mid-level ST_T_SUBGROUP tree node, which has a content type, name, and can contain attributes and other subgroups, and lastly a ST_T_ATTRIB node which contains an attribute name and attribute values, either integer or string, and optional lists of such up to 64 items in length. To use this module, include the file stparse.h. -The following functions are used to manage a parsed structure file: +### st_node: snGetLastModification() +```c +pDateTime snGetLastModification(pSnNode node); +``` +The `snGetLastModification()` function returns the date and time that a file was last modified. This pointer will remain valid as long as the passed `pSnNode` struct remains valid. It is managed by the `st_node` module, so the caller should not free the returned pointer. This function promises not to fail and return `NULL`. -### pStructInf stParseMsg(pFile inp_fd, int flags) -This function is internal-use-only and is used by the st_node module to parse a structure file. -### pStructInf stParseMsgGeneric(void* src, int (*read_fn)(), int flags) -This function is also internal-use-only (unless you want to parse the file manually without st_node's help) and is used to parse the structure file when the structure file isn't being read from an MTASK pFile descriptor. This is always the case, as the structure file data is being read from a pObject pointer. 
In such a case, src is the pObject pointer and read_fn is objRead(). +### Module: stparse +The stparse module is used to examine the parsed contents of the node file using the structure file format; see [StructureFile.txt](../centrallix-doc/StructureFile.txt). This format is a tree structure with node objects that can each have sub-objects and named attributes. Thus, stparse uses three distinct node types: +- `ST_T_STRUCT`: The top-level node, containing the subtrees and attributes in the file. +- `ST_T_SUBGROUP`: A mid-level type for subobjects within the top-level node. Each subgroup has a content type, name, and may contain attributes and other subgroups. +- `ST_T_ATTRIB`: A bottom-level type for each named attribute. Each attribute has a name and values, either of type integer or string, and optional lists of such up to 64 items in length. -### int stGenerateMsg(pFile out_fd, pStructInf info, int flags) -This function, also internal-use only, is used by the st_node module to write a structure file whose internal representation is given in the 'info' parameter. +To use this module, include the file `stparse.h`, which includes the following functions (read `stparse.c` for more functions and additional information): -### int stGenerateMsgGeneric(void* dst, int (*write_fn)(), pStructInf info, int flags) -This function is stParseMsgGeneric's converse. -### pStructInf stCreateStruct(char* name, char* type) -This function creates a new top-level tree item of type ST_T_STRUCT, with a given name and content-type. +### stparse: stStructType() +```c +int stStructType(pStructInf this); +``` +The `stStructType()` function returns the struct type of the past `pStructInf` parameter, which is either `ST_T_ATTRIB` or `ST_T_SUBGROUP` (see above). -### pStructInf stAddAttr(pStructInf inf, char* name) -This function adds a node of type ST_T_ATTRIB to either a ST_T_STRUCT or ST_T_SUBGROUP type of node, with a given name and no values associated with that name (see AddValue, below). 
The new attribute tree node is linked under the 'inf' node passed, and is returned. +- ⚠️ **Warning**: The root node of type `ST_T_STRUCT` will return `ST_T_SUBGROUP` from this function. If you wish to avoid this, read `inf->Type` (see [stparse: Using Fields Directly](#stparse-using-fields-directly) for more info). It is unclear whether this behavior is a bug or a feature. I've decided to call it a feature! ;) -### pStructInf stAddGroup(pStructInf inf, char* name, char* type) -This function adds a node of type ST_T_SUBGROUP to either a ST_T_SUBGROUP or ST_T_STRUCT tree node, with a given name and content type (content type such as 'report/query'). -### int stAddValue(pStructInf inf, char* strval, int intval) -This function adds a value to an attribute, and can be called multiple times on an attribute to add a list of values. If 'strval' is not null, a string value is added, otherwise an integer value is added. The string is NOT copied, but is simply pointed-to. If the string is non-static, and has a lifetime less than the ST_T_ATTRIB tree node, then the following procedure must be used: +### stparse: stLookup() +```c +pStructInf stLookup(pStructInf inf, char* name); +``` +The `stLookup()` function searches all sub-tree nodes for a group or attribute of the given name and returns a pointer to it or returns `NULL` if no group or attribute was found. + +### stparse: stAttrValue() ```c - char* ptr; - char* nptr; - pStructInf attr_inf; +int stAttrValue(pStructInf inf, int* intval, char** strval, int nval); +``` +This function gets the value of the given attribute in an `ST_T_ATTRIB` node. If the value is an integer, the caller should pass a pointer to an integer where it can be stored. If the value is a string, the caller should pass a pointer to string (aka. a `char*`) where char* for the string can be stored. The unused alternate pointer must be left `NULL`. 
`nval` can normally be 0, but if the attribute has several values, setting nval to 1, 2, 3, etc., returns the 2nd, 3rd, 4th item, respectively. + +This function returns -1 if the attribute value did not exist, if the wrong type was requested, or if 'inf' was `NULL`. + +It is common practice to use `stLookup()` and `stAttrValue()` or `stGetExpression()` (see below) together to retrieve values, for example (where `inf` is a `pStructInfo` variable from somewhere): - attr_inf = stAddAttr(my_parent_inf, "myattr"); - nptr = (char*)malloc(strlen(ptr)+1); - if (!nptr) go_report_the_error_and_return; - strcpy(nptr, ptr); - stAddValue(attr_inf, nptr, 0); - attr_inf->StrAlloc[0] = 1; +```c +char* ptr; +if (stAttrValue(stLookup(inf, "my_attr"), NULL, &ptr, 0) != 0) + goto error_handling; +printf("The value is: %s\n", ptr); ``` -By following this method (making a copy of the string and then setting the StrAlloc value for that string), when the StructInf tree node is freed by the stparse module, the string will auto- matically be freed as well. -### pStructInf stLookup(pStructInf inf, char* name) -This routine examines all sub-tree-nodes, both group and attribute nodes, for a group or attribute with the given name. If it finds one, it returns a pointer to the sub-node, otherwise NULL. +### stparse: stGetExpression() +```c +pExpression stGetExpression(pStructInf this, int nval); +``` +Returns a pointer to an expression that represents the value of the nval-th element of the given struct. -### int stAttrValue(pStructInf inf, int* intval, char** strval, int nval) -This function returns the value of the given attribute in an ST_T_ATTRIB tree node. If a string value is being returned, pass a pointer to the string pointer. If an integer value is being returned, pass a pointer to an integer. The pointer not being used must be left NULL. 'nval' can normally be 0, but if the attribute has several values, setting nval to 1,2,3, etc., returns the 2nd, 3rd, 4th item, respectively. 
This routing returns -1 if the attribute value did not exist or if the wrong type was requested. It also returns -1 if 'inf' was NULL. -It is common practice to use the stLookup and stAttrValue functions together to retrieve values, and search for an attribute StructInf and retrieve its value in one operation: +### stparse: stCreateStruct() +```c +pStructInf stCreateStruct(char* name, char* type); +``` +This function creates a new top-level tree item of type `ST_T_STRUCT`, with a given name and content-type. + +### stparse: stAddAttr() ```c - pStructInf inf; - char* ptr; +pStructInf stAddAttr(pStructInf inf, char* name); +``` +This function adds a node of type `ST_T_ATTRIB` to either an `ST_T_STRUCT` or an `ST_T_SUBGROUP` type of node, with a given name and no values (see AddValue, below). The new attribute tree node is linked under the `inf` node passed, and is returned. - if (stAttrValue(stLookup(inf, "myattr"),NULL,&ptr,0) == 0) - { - printf("%s is the value\n", ptr); - } + +### stparse: stAddGroup() +```c +pStructInf stAddGroup(pStructInf inf, char* name, char* type); ``` +This function adds a node of type `ST_T_SUBGROUP` to either an `ST_T_SUBGROUP` or an `ST_T_STRUCT` tree node, with a given name and content type (content type such as `"report/query"`). -### int stFreeInf(pStructInf this) -This function is used to free a StructInf tree node. It will free any sub-nodes first, so if that is not desired, be sure to disconnect them by removing them from the SubInf array and appropriately adjusting the nSubInf counter, and setting the SubInf array position to NULL. This function also disconnects the tree node from its parent, if any, so if the parent is already free()'d, be sure to set the node's Parent pointer to NULL. Any strings marked allocated with the StrAlloc flags will be free()'d. -It is also common practice to bypass the stXxx() functions entirely and access the elements of the StructInf structures themselves. This is not forbidden, and may be done. 
See the file stparse.h for a description of the structure. For example, +### stparse: stAddValue() +```c +int stAddValue(pStructInf inf, char* strval, int intval); +``` +This function adds a value to an attribute, and can be called multiple times on an attribute to add a list of values. If `strval` is not null, a string value is added, otherwise an integer value is added. The string is NOT copied, but is simply pointed-to. If the string is non-static, and has a lifetime less than the `ST_T_ATTRIB` tree node, then the following procedure should be used, where `str` is the string pointer to the string: ```c - pStructInf inf; - int i; +pStructInf attr_inf = stAddAttr(my_parent_inf, "my_attr"); +if (attr_inf == NULL) goto error_handling; + +char* new_str = (char*)malloc(strlen(str) + 1lu); +if (new_str == NULL) goto error_handling; +strcpy(new_str, str); +stAddValue(attr_inf, new_str, 0); +attr_inf->StrAlloc[0] = 1; +``` - for(i=0;inSubInf;i++) - { - if (inf->SubInf[i]->Type == ST_T_ATTRIB) - { - /** do stuff with attribute... **/ - } - } +With this method (making a copy of the string and then setting the StrAlloc value for that string), the string is automatically freed when the StructInf tree node is freed by the stparse module. + + +### stparse: stFreeInf() +```c +int stFreeInf(pStructInf this); ``` +This function is used to free a `StructInf` tree node. This also recursively frees sub-tree nodes, so these should be disconnected before calling if they are still needed. To do this, remove them from the SubInf array by appropriately adjusting the nSubInf counter and setting the SubInf array position to `NULL`. This function also disconnects the tree node from its parent, if any, so if the parent is already `free()`'d, prevent this behavior by setting the node's Parent pointer to `NULL` before calling this function. Any strings marked allocated with the StrAlloc flags will also be `free()`'d by this function, so update that flag if necessary. 
+ + +### stparse: Using Fields Directly +It is also common practice to bypass the stparse functions entirely and access the elements of the `StructInf` struct directly, which is allowed. (See `stparse.h` for more information about this structure.) + +For example (assuming `inf` is a `pStructInfo` variable in scope): +```c +for (unsigned int i = 0u; i < inf->nSubInf; i++) + { + switch (inf->SubInf[i]->Type) + { + case ST_T_ATTRIB: + /** Do stuff with attribute... **/ + break; + + case ST_T_SUBGROUP: + /** Do stuff with group... **/ + break; + + ... + } + } +``` + + ## IV Memory Management in Centrallix -Centrallix has its own memory manager that caches freshly-deallocated blocks of memory in lists according to size so that they can be quickly reallocated. This memory manager also catches double-freeing of blocks, making debugging of memory problems a little easier. + +Centrallix has its own memory management wrapper that caches deallocated blocks of memory by size to allow for faster reuse. This wrapper also detects double-freeing of blocks (sometimes), making debugging of memory problems just a little bit easier. + +In addition, the memory manager provides statistics on the hit ratio of allocated blocks coming from the lists vs. `malloc()`, and on how many blocks of each size/type are `malloc()`ed and cached. This information can be helpful for tracking down memory leaks. Empirical testing has shown an increase of performance of around 50% or more in programs with the newmalloc module in use. -In addition the memory manager provides statistics on the hit ratio of allocated blocks coming from the lists vs. malloc(), and information on how many blocks of each size/type are allocated out and cached. This information can be invaluable in tracking down memory leaks. +One caveat is that this memory manager does not provide `nmRealloc()` function, only `nmMalloc()` and `nmFree()`. 
Thus, either `malloc()`, `free()`, and `realloc()` or `nmSysMalloc()`, `nmSysFree()`, and `nmSysRealloc()` should be used for blocks of memory that might vary in size.
-One caveat is that this memory manager does not provide a realloc() function, so the standard malloc(), free(), and realloc() must be used for blocks of memory that might grow in size. This memory manager is also perhaps not the best to use for blocks of memory of arbitrary sizes, but rather is best for allocating structures quickly that are of a specific size and belong to specific objects, such as the StructInf structure or the SnNode structure, and others. In short, use it for structures, but not for strings.
+- 📖 **Note**: This memory manager is usually the wrong choice for blocks of memory of arbitrary sizes. It is intended for allocating structures quickly that are of a specific size. For example, allocating space for a struct that is always the same size.
+
+- 🥱 **tl;dr**: Use `nmMalloc()` for structs, not for strings.
+
+- ⚠️ **Warning**: Calling `free()` on a block obtained from `nmMalloc()` or calling `nmFree()` on a block obtained from `malloc()` might not crash the program immediately. Instead, it will result in either inefficient use of the memory manager, or a significant memory leak, respectively. These practices will also lead to incorrect results from the statistics and block count mechanisms.
-Empirical testing has shown an increase of performance of around 50% or more in programs with the newmalloc module in use. The following are the functions for the newmalloc module:
-### void* nmMalloc(int size)
-This function allocates a block of the given 'size'. It returns NULL if the memory could not be allocated.
+### nmMalloc()
+```c
+void* nmMalloc(int size);
+```
+This function allocates a block of the given `size`. It returns `NULL` if the memory could not be allocated.
+
+
+### nmFree()
+```c
+void nmFree(void* ptr, int size);
+```
+This function frees the block of memory.
+ +- ⚠️ **Warning**: The caller **must know the size of the block.** Getting this wrong is very bad!! For structures, this is trivial, simply use `sizeof()`, exactly the same as with `nmMalloc()`. + -### void nmFree(void* ptr, int size) -This function frees the block of memory. NOTE THAT THE CALLING FUNCTION MUST KNOW THE SIZE OF THE BLOCK. Getting this wrong is very bad. For structures, this is trivial, just use sizeof() just like with nmMalloc(). +### nmStats() +```c +void nmStats(void); +``` +Prints statistics about the memory manager, for debugging and optimizing. -### void nmStats() -Prints out statistics on how well the memory manager is doing. +For example: +``` +NewMalloc subsystem statistics: + nmMalloc: 0 calls, 0 hits (-nan%) + nmFree: 0 calls + bigblks: 0 too big, 0 largest size +``` -### void nmRegister(int size, char* name) -Registers a name with a block size. This allows the memory manager to be intelligent when reporting block allocation counts. The first argument is the size of the block, the second, an intelligent name for that size of block. A size can have more than one name. This function is optional and need not be used except when tracking down memory leaks, but can be used freely. + -Typically this function is called in a module's Initialize() function on each of the structures the module uses internally. -### void nmDebug() -Prints out a listing of block allocation counts, giving (by size): 1) number of blocks allocated but not yet freed, 2) number of blocks in the cache, 3) total allocations for this block size, and a list of names (from nmRegister()) for that block size. +### nmRegister() +```c +void nmRegister(int size, char* name); +``` +Registers an intelligent name with a block size. This allows the memory manager to be intelligent when reporting block allocation counts. A given size can have more than one name.
This function is optional and not required for any production code to work, but using it can make tracking down memory leaks easier. -### void nmDeltas() -Prints a listing of all blocks whose allocation count has changed, and by how much, since the last nmDeltas() call. This function is VERY USEFUL FOR MEMORY LEAK DETECTIVE WORK. +This function is usually called in a module's `Initialize()` function on each of the structures the module uses internally. -### void* nmSysMalloc(int size) -Allocates memory without using the block-caching algorithm. This is roughly equivalent to malloc(), but pointers returned by malloc and this function are not compatible with each other - i.e., you cannot free() something that was nmSysMalloc'ed, nor can you nmSysFree() something that was malloc'ed. -This function is much better to use on variable-sized blocks of memory. nmMalloc is better for fixed-size blocks, such as for data structures. +### nmDebug() +```c +void nmDebug(void); +``` +Prints a listing of block allocation counts, giving (by size): +- The number of blocks allocated but not yet freed. +- The number of blocks in the cache. +- The total allocations for this block size. +- A list of names (from `nmRegister()`) for that block size. -### void nmSysFree(void* ptr) -Frees a block of memory allocated by nmSysMalloc, nmSysStrdup, or nmSysRealloc. -### void* nmSysRealloc(void* ptr, int newsize) -Changes the size of an allocated block of memory that was obtained via nmSysMalloc or nmSysRealloc or nmSysStrdup. The new pointer may be different if the block had to be moved. This is the rough equivalent of realloc(). Usage Note: If you are realloc'ing a block of memory, and need to store pointers to data somewhere inside the block, it is often better to store the offset rather than a full pointer, as a pointer would become invalid if a nmSysRealloc caused the block to move. 
+### nmDeltas() +```c +void nmDeltas(void); +``` +Prints a listing of all blocks whose allocation count has changed, and by how much, since the last `nmDeltas()` call. This function is VERY USEFUL FOR MEMORY LEAK DETECTIVE WORK. + + +### nmSysMalloc() +```c +void* nmSysMalloc(int size); +``` +Allocates memory without using the block-caching algorithm. This is roughly equivalent to `malloc()`, but pointers returned by malloc and this function are not compatible with each other - i.e., you cannot `free()` something that was `nmSysMalloc()`'ed, nor can you `nmSysFree()` something that was `malloc()`'ed. + +- 📖 **Note**: This function is much better to use on variable-sized blocks of memory. `nmMalloc()` is better for fixed-size blocks, such as for data structures. + + +### nmSysRealloc() +```c +void* nmSysRealloc(void* ptr, int newsize); +``` +Changes the size of an allocated block of memory that was obtained from `nmSysMalloc()`, `nmSysRealloc()`, or `nmSysStrdup()`. The new pointer may be different if the block has to be moved. This is the rough equivalent of `realloc()`. + +- 📖 **Note**: If you are `realloc()`'ing a block of memory and need to store pointers to data somewhere inside the block, it is often better to store an offset rather than a full pointer. This is because a full pointer becomes invalid if a `nmSysRealloc()` causes the block to move. + + +### nmSysStrdup() +```c +char* nmSysStrdup(const char* str); +``` +Allocates memory using `nmSysMalloc()` function and copies the string `str` into this memory. It is a rough equivalent of `strdup()`. The resulting pointer can be free'd using `nmSysFree()`. + + +### nmSysFree() +```c +void nmSysFree(void* ptr); +``` +Frees a block of memory allocated by `nmSysMalloc()`, `nmSysRealloc()`, or `nmSysStrdup()`. -### char* nmSysStrdup(const char* str) -Allocates memory for a copy of the string str by using the nmSysMalloc function, and then makes a copy of the string str. It is a rough equivalent of strdup(). 
The resulting pointer can be free'd using nmSysFree(). -Calling free() on a block obtained from nmMalloc() or calling nmFree() on a block obtained from malloc() will not crash the program. Instead, it will result in either inefficient use of the memory manager, or a huge memory leak, respectively. These practices will also render the statistics and block count mechanisms useless. ## V Other Utility Modules -There are many other utility modules useful in Centrallix. These include the xarray module, used for managing growable arrays; the xhash module, used for managing hash tables with no overflow problems and variable-length keys, the xstring module used for managing growable strings; the expression module used for compiling and evaluating expressions; and the mtsession module, used for managing session-level variables and reporting errors. + + +The Centrallix library (`centralllix-lib`) has a host of useful utility modules. These include `xarray`, used for managing growable arrays; `xstring`, used for managing growable strings; `xhash`, used for managing hash tables with no overflow problems and variable-length keys; `expression`, used for compiling and evaluating expressions; and `mtsession`, used for managing session-level variables and reporting errors. + ### A. XArray (XA) - Arrays The first is the xarray (XA) module. diff --git a/centrallix-sysdoc/string_comparison.md b/centrallix-sysdoc/string_comparison.md deleted file mode 100644 index dac13d544..000000000 --- a/centrallix-sysdoc/string_comparison.md +++ /dev/null @@ -1,101 +0,0 @@ -# String Comparison -The following sections discuss the two approaches to calculating similarity between two strings. Both approaches use a SQL function to calculate a similarity metric (on a scale of 0 to 1) for two string parameters. 
- -## Table of Contents -- [String Comparison](#string-comparison) - - [Table of Contents](#table-of-contents) - - [Levenshtein Similarity](#levenshtein-similarity) - - [Levenshtein](#levenshtein) - - [Cosine Similarity](#cosine-similarity) - - [CHAR_SET](#char_set) - - [Frequency Table](#frequency-table) - - [Relative Frequency Table](#relative-frequency-table) - - [TF-IDF](#tf-idf) - - [Dot Product](#dot-product) - - [Magnitude](#magnitude) - - [Similarity](#similarity) - - [Future Implementation](#future-implementation) - - [Inverse Document Frequency (IDF)](#inverse-document-frequency-idf) - -## Levenshtein Similarity -The [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) distance is defined as the number of insertions, deletions, or substitutions required to make one string exactly like another string. - -### Levenshtein -```c -int exp_fn_levenshtein(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) -``` -Returns the levenshtein edit distance between two strings. - -```c -int exp_fn_fuzzy_compare(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) -``` -Returns a value between 0.0 (complete match) and 1.0 (complete difference) between strings a and b, based on the (levenshtein distance) / (max len of input strings). -Some alterations to the calculation are as follows: -- Matching an empty string against anything returns 0.5. -- A string that only required insertions to become the other string has its `(lev_dist)/(strlen)` value halved before returning. -- The parameter `max_field_width` is required, but not used. - -## Cosine Similarity - -The [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) function is defined as the dot product of two vectors divided by the product of the magnitude of the two vectors. We use the relative frequency of the individual characters within each term as the vectors in the calculation. 
The following functions are used to calculate cosine similarity. - -### CHAR_SET -```c -const char *CHAR_SET ... -``` -`CHAR_SET` represents all of the characters that should be considered during the calculation of similarity. `CHAR_SET` can be extended to include additional characters, as necessary. - -### Frequency Table - -```c -int exp_fn_i_frequency_table(double *table, char *term) -``` -Helper function for similarity(). Creates a frequency table containing indices corresponding to all characters in `CHAR_SET` (all other characters are ignored). The values in the frequency table will contain the number of times each character appers in `term`. - -The `table` parameter must be allocated prior to calling the function with `nmMalloc()` using `sizeof(x * sizeof(double))`, where `x` is the length of `CHAR_SET`. The function will initialize all `table` values to 0, before calculating the frequency values. - -### Relative Frequency Table -```c -int exp_fn_i_relative_frequency_table(double *frequency_table) -``` -Helper function for similarity(). Converts a frequency table into a relative frequency table, where each value in the `frequency_table` is converted to the percent of occurrence (i.e., frequency divided by the sum of total occurrences). - -The `frequency_table` parameter must have been created using the `exp_fn_i_frequency_table` function above. - -### TF-IDF -```c -int exp_fn_i_tf_idf_table(double *frequency_table) -``` -Helper function for similarity(). Creates a TF x IDF vector from a frequency table, where each value in the resulting table is created by multiplying the relative frequency of each letter by the corresponding coefficient in the IDF array. - -The `frequency_table` parameter must have been created using the `exp_fn_i_frequency_table` function above. - -### Dot Product - -```c -int exp_fn_i_dot_product(double *dot_product, double *r_freq_table1, double *r_freq_table2) -``` -Helper function for similarity(). 
Calculates the dot product of two relative frequency tables (sum of the squared values from each relative frequency table). - -The `dot_product` parameter should be initialized to 0 before calling the function. The table parameters must contain relative frequency tables that are generated from the `exp_fn_i_relative_frequency_table` function. The lengths of both tables must equal the length of `CHAR_SET`. - -### Magnitude - -```c -int exp_fn_i_magnitude(double *magnitude, double *r_freq_table) -``` -Helper function for similarity(). Calculates the magnitude of a relative frequency table (square root of the sum of the squared relative frequencies). - -The `magnitude` parameter should be initialized to 0 before calling the function. The table parameter must contain a relative frequency table that was generated from the `exp_fn_i_relative_frequency_table` function. The length of the frequency table must equal the length of `CHAR_SET`. - -### Similarity - -```c -int exp_fn_similarity(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) -``` -Returns a value between 0.0 (completely different) and 1.0 (complete match) reflecting the similarity between the value passed in to i0 and the value passed in to i1. The first two parameters should contain strings that need to be compared. If the value 1 is passed in the third parameter, then the similarity function will rely on TF x IDF scores to determine similarity. If no third parameter is passed, then the function will rely only on relative frequency scores. - -## Future Implementation - -### Inverse Document Frequency (IDF) -In text mining, the most common metric to use in the cosine similarity function is the [TF x IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) metric. Our approach uses only TF (term frequency). Inverse document frequency calculates a weighting factor for each character. 
This could increase precision a small amount by weighting characters that appear on many records as less important in distinguishing matches, and weighting characters that appear on only certain records as more important. IDF could be calculated by iterating through the entire partner dataset each time. The current approach uses the relative frequency of each letter used in the English language on [Wikipedia](https://en.wikipedia.org/wiki/Letter_frequency), which may not be consistent with the data in the partner database. diff --git a/centrallix-sysdoc/string_similarity.md b/centrallix-sysdoc/string_similarity.md index f466a057c..b9a3a28b6 100644 --- a/centrallix-sysdoc/string_similarity.md +++ b/centrallix-sysdoc/string_similarity.md @@ -49,7 +49,7 @@ ---> # String Similarity -The following sections discuss the approaches to calculating similarity between two strings which are implemented in the `clusters.c` library. This library can be incuded using `#include "clusters.h"` in centrallix-lib and `#include "cxlib/clusters.h"` in centrallix. +The following sections discuss the approaches to calculating similarity between two strings which are implemented in the `clusters.c` library. This library can be included using `#include "clusters.h"` in centrallix-lib and `#include "cxlib/clusters.h"` in centrallix. ## Table of Contents @@ -74,10 +74,10 @@ The following sections discuss the approaches to calculating similarity between ## Cosine Similarity -The [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) function is defined as the dot product of two vectors divided by the product of the magnitude of the two vectors. Conceptually, it's like finding the _angle_ between two vectors. To get these vectors, we use the relative frequency of character pairs within each string. To reduce memory cost and speed up computation, we store them in a special sparcely allocated form, described below. 
+The [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) function is defined as the dot product of two vectors divided by the product of the magnitude of the two vectors. Conceptually, it's like finding the _angle_ between two vectors. To get these vectors, we use the relative frequency of character pairs within each string. To reduce memory cost and speed up computation, we store them in a special sparsely allocated form, described below. ### Character Sets -Cosine compare currnetly uses the following character sets. These can be extended or modified later, if necessary. +Cosine compare currently uses the following character sets. These can be extended or modified later, if necessary. ```c const char ALLOW_SET[] = " \n\v\f\r!#$%&\"'()*+,-./:;<=>?@[]^_{|}~ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; const char CHAR_SET[] = "`abcdefghijklmnopqrstuvwxyz0123456789"; @@ -85,83 +85,86 @@ const char SIGNIFICANT_SET[] = "`ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstu const char IGNORE_SET[] = " \n\v\f\r!#$%&\"'()*+,-./:;<=>?@[]^_{|}"; const char BOUNDARY_CHAR = ('a' - 1); // aka. '`' ``` -- `ALLOW_SET` represents all characters which can be passed to a similarity detection algorithm. Passing other characters may cause warnings and errors, undefined or unintended behavior, and even security concerns. -- `CHAR_SET` represents all of the characters that will be uniquely considered during the calculation of similarity. Currently, this is all lowercase letters and numbers. -- `SIGNIFICANT_SET` represents all of the characters that are significant for the purposes of similarity. For example, the upercase letters are significant because they are considered identical to lowercase letters. Thus, they are included in the `SIGNIFICANT_SET`, but not in the `CHAR_SET`. -- `IGNORE_SET` represents characters which, while allowed to be passed to a similarity algorithm, will be ignored. For example, the strings "Ya!!" and "Ya..." 
will be considered identical. +- `ALLOW_SET` represents all characters which can be passed to a similarity detection algorithm. Passing other characters may cause warnings and errors, undefined or unintended behavior, and even security concerns. +- `CHAR_SET` represents all of the characters that will be uniquely considered during the calculation of similarity. Currently, this is all lowercase letters and numbers. +- `SIGNIFICANT_SET` represents all of the characters that are significant for the purposes of similarity. For example, the uppercase letters are significant because they are considered identical to lowercase letters. Thus, they are included in the `SIGNIFICANT_SET`, but not in the `CHAR_SET`. +- `IGNORE_SET` represents characters which, while allowed to be passed to a similarity algorithm, will be ignored. For example, the strings "Ya!!" and "Ya..." will be considered identical. - The `BOUNDARY_CHAR` is a special character which is conceptually added to the start and end of any string to be checked. - This allows for pairs that functionally include only the first and last character. - This character appears to have been selected to be one before the first character in `CHAR_SET` (thus convention dictates that it be written `'a' - 1` to indicate this), although it's unknown if that's the main or only reason. - If `clusters.h` is included, it can be accessed using the `CA_BOUNDARY_CHAR` macro. ### Character Pair Hashing -Even with a small set of ASCII characters (say 36), there are still `36^2 = 1296` possible character pairs. If the number of characters in the `CHAR_SET` ever needed to be expanded - for example, to include all UTF-8 characters - this number would quickly explode exponentially to utterly infeasible proportions. Thus, a hashing algorithm is employed to hash each character pair down to a more reasonable number of dimensions (which can be accessed with the `CA_NUM_DIMS` macro). 
+Even with a small set of ASCII characters (say 36), there are still `36^2 = 1296` possible character pairs. If the number of characters in the `CHAR_SET` ever needed to be expanded - for example, to include all UTF-8 characters - this number would quickly explode exponentially to utterly infeasible proportions. Thus, a hashing algorithm is employed to hash each character pair down to a more reasonable number of dimensions (which can be accessed with the `CA_NUM_DIMS` macro). ### String Vectors -Any string of characters in the `ALLOW_SET` can be represented by a vector. For simplicty, imagine this vector has only `5` dimensions. To find this vector, we hash each character pair in the string. As each character pair is hashed (for example, that the pair "ab" happens to hash to `3`), the corresponding dimension is increased by some amount. This amount varies to based on the characters in the pair, helping to mitigate the impact of collisions where different character pairs hash to identical numbers (a larger number of dimensions also helps to mitigate this). +Any string of characters in the `ALLOW_SET` can be represented by a vector. For simplicity, imagine this vector has only `5` dimensions. To find this vector, we hash each character pair in the string. As each character pair is hashed (for example, that the pair "ab" happens to hash to `3`), the corresponding dimension is increased by some amount. This amount varies based on the characters in the pair, helping to mitigate the impact of collisions where different character pairs hash to identical numbers (a larger number of dimensions also helps to mitigate this). -Remember that the first and last characters form a pair with the `BOUNDARY_CHAR`, so the string "ab" has three pairs: "a", "ab", and "b". If these each hash to `2`, `3`, and `0`. Thus, the vector generated by the string "ab" might be: `[7, 0, 4, 3, 0]`.
Notice that dimensions #1 and #4 are both `0` because no character pairs generated a hash of `1` or `4`. In real usecases, the vast majority of elements are `0`s because the number of dimensions used is much larger than the number of character pairs in a typical string. +Remember that the first and last characters form a pair with the `BOUNDARY_CHAR`, so the string "ab" has three pairs: "a", "ab", and "b". If these each hash to `2`, `3`, and `0`. Thus, the vector generated by the string "ab" might be: `[7, 0, 4, 3, 0]`. Notice that dimensions #1 and #4 are both `0` because no character pairs generated a hash of `1` or `4`. In real usecases, the vast majority of elements are `0`s because the number of dimensions used is much larger than the number of character pairs in a typical string. ### Sparse Vectors -As noted above, the vast majority of elements in a vector generated by a typical string are `0`s. This would lead to a large waste of memory and computation if every `0` was stored separately, so instead, vectors are stored sparsely. Because all hashes are positive integers, we represent `n` `0`s with a value of ` -n`. Thus, the vector `[0, 1, 0, 0, 0]` (representing an empty string in `5` dimensions) would be represented sparsely as `[-1, 1, -3]`. +As noted above, the vast majority of elements in a vector generated by a typical string are `0`s. This would lead to a large waste of memory and computation if every `0` was stored separately, so instead, vectors are stored sparsely. Because all hashes are positive integers, we represent `n` `0`s with a value of ` -n`. Thus, the vector `[0, 1, 0, 0, 0]` (representing an empty string in `5` dimensions) would be represented sparsely as `[-1, 1, -3]`. **Note**: A value of `0` in a sparse vector is undefined, so no element should be equal to `0`. -**Note**: Sparse arrays can vary greatly in length. To find their size, one needs to traverse the array until the total number of values found adds up to `CA_NUM_DIMS`. 
The `ca_sparse_len()` function can be used to do this. Also, the `ca_build_vector()` and `ca_free_vector()` use the `nmSys` functions from `newmalloc.h` to avoid conflicts over the size of the allocated data. +**Note**: Sparse arrays can vary greatly in length. To find their size, one needs to traverse the array until the total number of values found adds up to `CA_NUM_DIMS`. The `ca_sparse_len()` function can be used to do this. Also, the `ca_build_vector()` and `ca_free_vector()` use the `nmSys` functions from `newmalloc.h` to avoid conflicts over the size of the allocated data. ### Computing Similarity -Finally, to find the cosine similarity between two strings, we can simply take the [dot product](https://en.wikipedia.org/wiki/Dot_product) of their coresponding vectors. Then, we normalize the dot product by dividing by the magnitudes of both vectors multiplied together. Two strings can be compared this way using the `ca_cos_compare()` function. +Finally, to find the cosine similarity between two strings, we can simply take the [dot product](https://en.wikipedia.org/wiki/Dot_product) of their corresponding vectors. Then, we normalize the dot product by dividing by the magnitudes of both vectors multiplied together. Two strings can be compared this way using the `ca_cos_compare()` function. ## Levenshtein Similarity -The [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) distance is defined as the number of insertions, deletions, or substitutions required to make one string exactly like another string. The version implemented in `clusters.c` additionally allows a new operation called a "swap" in which two adjacent characters change places. Transpositions of larger pieces of text are, unfortunately, not handled as well, which is a potential downfall of using levenshtein edit distance.
+The [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) distance is defined as the number of insertions, deletions, or substitutions required to make one string exactly like another string. The version implemented in `clusters.c` additionally allows a new operation called a "swap" in which two adjacent characters change places. Transpositions of larger pieces of text are, unfortunately, not handled as well, which is a potential downfall of using levenshtein edit distance. The levenshtein similarity of two strings can be compared using the `ca_lev_compare()` function. ## Clustering -When searching for similar strings in a large amount of data (for example, `1,000,000` strings), comparing every string to every other string can be very computationally expensive. To speed up this process, it is helpful to _cluster_ similar strings together, then only compare strings within similar clusters. This sacrifices some accuracy to allow large amounts of data to be searched and compared in a feasable amount of time. +When searching for similar strings in a large amount of data (for example, `1,000,000` strings), comparing every string to every other string can be very computationally expensive. To speed up this process, it is helpful to _cluster_ similar strings together, then only compare strings within similar clusters. This sacrifices some accuracy to allow large amounts of data to be searched and compared in a feasible amount of time. ### K-means Clustering -When clustering data using the [k-means](https://en.wikipedia.org/wiki/K-means_clustering) algorithm, data is divided into a predefined number of clusters with the goal of maximizing the average similarity of datapoints within any given cluster. To quickly summarize the algorithm: -1. Randomly select `k` datapoints to be the initial centroids of each cluster. -2. For each datapoint, find the centroid it is most similar to, and assign it to that clustser. -3. 
For each cluster, find the new centroid by averaging all datapoints in the cluster. -4. Repeat steps 2 and 3 until the clusters stabilize (i.e. no datapoint changes clusters). +When clustering data using the [k-means](https://en.wikipedia.org/wiki/K-means_clustering) algorithm, data is divided into a predefined number of clusters with the goal of maximizing the average similarity of data points within any given cluster. To quickly summarize the algorithm: +1. Randomly select `k` data points to be the initial centroids of each cluster. +2. For each data point, find the centroid it is most similar to, and assign it to that cluster. +3. For each cluster, find the new centroid by averaging all data points in the cluster. +4. Repeat steps 2 and 3 until the clusters stabilize (i.e. no data point changes clusters). -The implementation used in `clusters.c` also allows the programmer to specify a maximum number of iterations (called `max_iter` in the code) to prevent this process from running forever. Additionally, successive iterations can give diminishing results or even produce clusters that are slightly worse. To improve performance, the programmer can also specify a minimum improvement threshold (called `min_improvement`). Clusters must become more similar by at least this amount each iteration, otherwise the algorithm ends, even if the maximum number of iterations has not yet been reached. +The implementation used in `clusters.c` also allows the programmer to specify a maximum number of iterations (called `max_iter` in the code) to prevent this process from running forever. Additionally, successive iterations can give diminishing results or even produce clusters that are slightly worse. To improve performance, the programmer can also specify a minimum improvement threshold (called `min_improvement`). Clusters must become more similar by at least this amount each iteration, otherwise the algorithm ends, even if the maximum number of iterations has not yet been reached. 
The `ca_kmeans()` function can be invoked using [the cosine comparison string vectors](#string-vectors) (see above) to cluster them into similar clusters. ### K-means++ Clustering **Not yet implemented** -This method is largely identical to k-means, except that [k-means++](https://en.wikipedia.org/wiki/K-means%2B%2B) assignes the initial centroids using an aproximate algorithm designed to avoid some of the poor clusterings possible with random assignment. +This method is largely identical to k-means, except that [k-means++](https://en.wikipedia.org/wiki/K-means%2B%2B) assigns the initial centroids using an approximate algorithm designed to avoid some of the poor clustering possible with random assignment. ### K-medoids Clustering **Not yet implemented** -This method is also very similar to k-means, except that [k-medoids](https://en.wikipedia.org/wiki/K-medoids) places an aditional requirement that all centroids be points in the data. This would theoretically allow for other similarity measures (such as levenshtein edit distance) to be used for clustering instead of only cosine compare. +This method is also very similar to k-means, except that [k-medoids](https://en.wikipedia.org/wiki/K-medoids) places an additional requirement that all centroids be points in the data. This would theoretically allow for other similarity measures (such as Levenshtein edit distance) to be used for clustering instead of only cosine compare. ### DB-Scan **Proposed, not yet implemented or documented** ### Sliding Clusters -A far more basic method of "clustering" is to simply sort all data alphabetically, then, instead of comparing each string to all other strings, it can be compared to only the next `n` strings. Of course, differences near the start of a string (for example, "fox" vs. "box") will cause those strings to sort far away from each other, leading them to be completely missed. 
+A far more basic method of "clustering" is to simply sort all data alphabetically, then, instead of comparing each string to all other strings, it can be compared to only the next `n` strings. Of course, differences near the start of a string (for example, "fox" vs. "box") will cause those strings to sort far away from each other, leading them to be completely missed. -Sorting using a similarity measure, such as `ca_cos_compare()` or `ca_lev_compare()` would resolve this issue. However, these comparison functions do not meet the transitivity requirement for sorting, which is that `(A < B) & (B < C) -> (A < C)`. For example, "car" is similar to "boxcar", which is also similar to "box". However, "car" and "box" are not similar at all. +Sorting using a similarity measure, such as `ca_cos_compare()` or `ca_lev_compare()` would resolve this issue. However, these comparison functions do not meet the transitivity requirement for sorting, which is that `(A < B) & (B < C) -> (A < C)`. For example, "car" is similar to "boxcar", which is also similar to "box". However, "car" and "box" are not similar at all. Additionally, sorting by the cosine vectors (similarly to how we cluster by them when using k-means) was proposed, but further investigation showed that this was also not possible. -For problems where a sorting algorithm exists which can mitigate the above issues, this solution may prove very promissing. However, so far we have not found such a problem, so the other clustering algorithms tend to out perform Sliding Clusters. +For problems where a sorting algorithm exists which can mitigate the above issues, this solution may prove very promising. However, so far we have not found such a problem, so the other clustering algorithms tend to outperform Sliding Clusters. ## Future Implementation ### K-means Fuzzy Clustering -One of the biggest downsides with k-means is that it creates very arbitrary boundaries between clusters. 
Elements on either side of these boundaries may be highly similar, but if comparisons only occur within a cluster, these similar entries will be missed. The problem becomes more extreme as a higher k value (more clusters) is used, creating more arbitrary boundaries. This drawback is probably the main reason that clustering sacrifices some accuracy over searching every element. +One of the biggest downsides with k-means is that it creates very arbitrary boundaries between clusters. Elements on either side of these boundaries may be highly similar, but if comparisons only occur within a cluster, these similar entries will be missed. The problem becomes more extreme as a higher k value (more clusters) is used, creating more arbitrary boundaries. This drawback is probably the main reason that clustering sacrifices some accuracy over searching every element. -Running the entire search multiple types may allow some of these to be found because the initial cluster locations are random. This approach is partially implemented for duplocate searching because the algorithm runs nightly anyway, so a simple up-sert (**UP**date existing entries; in**SERT** new entries) slightly reduces this problem. However, this solution is obviously far from ideal. +Running the entire search multiple times may allow some of these to be found because the initial cluster locations are random. This approach is partially implemented for duplicate searching because the algorithm runs nightly anyway, so a simple upsert (**UP**date existing entries; in**SERT** new entries) slightly reduces this problem. However, this solution is obviously far from ideal. -If the clustering could be expanded with an additional step that makes clusters larger, adding elements from other clusters to them, this might effectively mitigate the issue. It may also allow developers to use larger numbers of clusters, improving performance as well as accuracy. 
Further research is needed to verify the effectiveness of this approach before an implementation is written. +If the clustering could be expanded with an additional step that makes clusters larger, adding elements from other clusters to them, this might effectively mitigate the issue. It may also allow developers to use larger numbers of clusters, improving performance as well as accuracy. Further research is needed to verify the effectiveness of this approach before an implementation is written. ### Implement Missing Algorithms -Several algorithms (such as [k-means++](#k-means-clustering-1), [k-medoids](#k-medoids-clustering), and [DBScan](#db-scan)) above are proposed but lack an implementation. They may be effective and useful, however, to reduce development time, they have not yet been implemented. +Several algorithms (such as [k-means++](#k-means-clustering-1), [k-medoids](#k-medoids-clustering), and [DBScan](#db-scan)) above are proposed but lack an implementation. They may be effective and useful, however, to reduce development time, they have not yet been implemented. + +### Upgrade Other Duplicate Detection Systems +When a new record is entered, a quick scan is run to check if it might be a duplicate. There is also a button in the UI for a record that lets you run a duplicate check. These systems could also be upgraded using the new algorithms and strategies developed for general duplicate detection. \ No newline at end of file diff --git a/centrallix/expression/exp_double_metaphone.c b/centrallix/expression/exp_double_metaphone.c index f3d76c49b..8b7c4cd6f 100644 --- a/centrallix/expression/exp_double_metaphone.c +++ b/centrallix/expression/exp_double_metaphone.c @@ -18,9 +18,9 @@ /* */ /* A summary of the relevant content from https://dev.perl.org/licenses */ /* has been included below for the convenience of the reader. This */ -/* was collected and saved on September 5th, 2025 and may not reflect */ -/* current information. 
For the most up to date information, please use */ -/* the link above. */ +/* information was collected and saved on September 5th, 2025 and may */ +/* differ from current information. For the most up to date copy of */ +/* this information, please use the link provided above. */ /* */ /* Perl5 is Copyright © 1993 and later, by Larry Wall and others. */ /* */ @@ -64,11 +64,15 @@ /* */ /* Module: exp_double_metaphone.c */ /* Author: Maurice Aubrey */ -/* Description: This module implements a "sounds like" algorithm */ -/* developed by Lawrence Philips which he published */ -/* in the June, 2000 issue of C/C++ Users Journal. */ -/* Double Metaphone is an improved version of Philips' */ -/* original Metaphone algorithm. */ +/* Description: This module implements a "sounds like" algorithm by */ +/* Lawrence Philips which he published in the June, 2000 */ +/* issue of C/C++ Users Journal. Double Metaphone is an */ +/* improved version of the original Metaphone algorithm */ +/* written by Philips. This implementation was written by */ +/* Maurice Aubrey for C/C++ with bug fixes provided by */ +/* Kevin Atkinson. It was revised by Israel Fuller to */ +/* better align with the Centrallix coding style and */ +/* standards so that it could be included here. */ /************************************************************************/ /*** Note to future programmers reading this file (by Israel Fuller): @@ -83,7 +87,7 @@ *** might not line up with the original author. *** *** To be honest, though, trying to make this code as readable as possible - *** was very challanging due to all the messy boolean algebra. If there is + *** was very challenging due to all the messy boolean algebra. If there is *** ever a professional linguist reading this, please factor out some of the *** logic into local variables with descriptive names so that the rest of us *** can read this code without our eyes glazing over. 
@@ -205,7 +209,7 @@ void meta_destroy_string(MetaString* s) /*** Increases a MetaString's buffer size. *** *** @param s The MetaString* being modified. - *** @param chars_needed Minimumn number of characters to increase buffer size. + *** @param chars_needed Minimum number of characters to increase buffer size. ***/ void meta_increase_buffer(MetaString* s, const size_t chars_needed) { @@ -347,7 +351,7 @@ void meta_double_metaphone(const char* str, char** primary_code, char** secondar if (str == NULL || (length = strlen(str)) == 0u) { fprintf(stderr, "Warning: Call to meta_double_metaphone() with invalid string.\n"); - /** Double Metaphone on an invalid string yeilds two empty strings. **/ + /** Double Metaphone on an invalid string yields two empty strings. **/ *primary_code = (char*)SAFE_MALLOC(sizeof(char)); *secondary_code = (char*)SAFE_MALLOC(sizeof(char)); return; @@ -1066,7 +1070,7 @@ void meta_double_metaphone(const char* str, char** primary_code, char** secondar } /** german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider' **/ - /** also, -sz- in slavic language altho in hungarian it is pronounced 's' **/ + /** also, -sz- in slavic language although in hungarian it is pronounced 's' **/ if (current == 0 && meta_is_str_at(original, (current + 1), "M", "N", "L", "W", "")) { meta_add_str(primary, "S"); @@ -1269,7 +1273,7 @@ /*** Built in test cases. *** *** These tests have been integrated into the Centrallix testing environment, - *** where they can be run using `export TONLY=expfn_double_metaphone_00`, + *** where they can be run using `export TONLY=expfn_double_metaphone_00`, *** followed by make test, in the Centrallix directory. 
*** *** The can also be run here by executing the following commands in the diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index 71f906e3d..159c292b0 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -86,7 +86,7 @@ static char* ci_TypeToStr(const int type) case DATA_T_STRING: return "String"; case DATA_T_DOUBLE: return "Double"; case DATA_T_DATETIME: return "DateTime"; - case DATA_T_INTVEC: return "IntVecor"; + case DATA_T_INTVEC: return "IntVector"; case DATA_T_STRINGVEC: return "StringVector"; case DATA_T_MONEY: return "Money"; case DATA_T_ARRAY: return "Array"; @@ -2414,17 +2414,37 @@ int exp_fn_truncate(pExpression tree, pParamObjects objlist, pExpression i0, pEx /*** constrain(value, min, max) ***/ int exp_fn_constrain(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) { - if (!i0 || !i1 || !i2 || (i0->DataType != i1->DataType) || i0->DataType != i2->DataType || !(i0->DataType == DATA_T_INTEGER || i0->DataType == DATA_T_MONEY || i0->DataType == DATA_T_DOUBLE)) - { - mssError(1,"EXP","constrain() requires three numeric parameters of the same data type"); - return -1; - } + /** Skip null value. **/ tree->DataType = i0->DataType; if ((i0->Flags & EXPR_F_NULL)) { tree->Flags |= EXPR_F_NULL; return 0; } + + /** Verify parameters. 
**/ + if (i0 == NULL || i1 == NULL || i2 == NULL) + { + mssError(1, "EXP", "constrain() expects three parameters."); + return -1; + } + if (i0->DataType != DATA_T_INTEGER && i0->DataType != DATA_T_DOUBLE && i0->DataType != DATA_T_MONEY) + { + mssError(1, "EXP", + "constrain() expects three numeric parameters: %s is not numeric.", + ci_TypeToStr(i0->DataType) + ); + if (i0->DataType == DATA_T_STRING) printf("Value: '%s'\n", i0->String); + return -1; + } + if (i0->DataType != i1->DataType || i1->DataType != i2->DataType) + { + mssError(1, "EXP", + "constrain() expects three numeric parameters of the same data type but got types %s, %s, and %s.", + ci_TypeToStr(i0->DataType), ci_TypeToStr(i1->DataType), ci_TypeToStr(i2->DataType) + ); + return -1; + } /* check min */ if (!(i1->Flags & EXPR_F_NULL)) @@ -4131,78 +4151,143 @@ int exp_fn_nth(pExpression tree, pParamObjects objlist, pExpression i0, pExpress return 0; } - -/*** Computes cosine or levenshtien similarity between two strings. These two - *** tasks have a large amount of overlapping logic (mostly error checking), - *** so doing them with one function greatly reduces code duplocation. - *** - *** @param tree The tree resulting from this function. - *** @param objlist The evaluation "scope", including available variables. - *** @param maybe_str1 Possibly the first string. - *** @param maybe_str2 Possibly the second string. - *** @param u1 Unused parameter. - *** @param is_cos Whether to compute cosine or levenshtien. - *** @returns 0 for success, -1 for failure. - ***/ -static int exp_fn_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1, const char* fn_name) +static int exp_fn_verify_schema( + const char* fn_name, + const int* param_types, + const int num_params, + pExpression tree, + pParamObjects obj_list) { - /** Check number of arguments. **/ - const int num_params = tree->Children.nItems; - if (num_params != 2) + /** Verify object list and session. 
**/ + if (obj_list == NULL) { - mssErrorf(1, "EXP", "%s(?) expects 2 parameters, got %d parameters.", fn_name, num_params); + mssErrorf(1, "EXP", "%s(\?\?\?) no object list?", fn_name); return -1; } - if (maybe_str1 == NULL || maybe_str2 == NULL || u1 != NULL) + ASSERTMAGIC(obj_list->Session, MGK_OBJSESSION); + + /** Verify expression tree. **/ + ASSERTMAGIC(tree, MGK_EXPRESSION); + + /** Verify parameter number. **/ + const int num_params_actual = tree->Children.nItems; + if (num_params != num_params_actual) { - mssErrorf(1, "EXP", "%s(?) expects 2 parameters.", fn_name); + mssErrorf(1, "EXP", + "%s(?) expects %u param%s, got %d param%s.", + fn_name, num_params, (num_params > 1) ? "s" : "", num_params_actual, (num_params_actual > 1) ? "s" : "" + ); return -1; } + + /** Verify parameter datatypes. **/ + for (int i = 0; i < num_params; i++) + { + const pExpression arg = tree->Children.Items[i]; + ASSERTMAGIC(arg, MGK_EXPRESSION); + + /** Skip null values. **/ + if (arg->Flags & EXPR_F_NULL) continue; + + /** Extract datatypes. **/ + const int expected_datatype = param_types[i]; + const int actual_datatype = arg->DataType; + + /** Verify datatypes. **/ + if (expected_datatype != actual_datatype) + { + mssErrorf(1, "EXP", + "%s(...) param #%d/%d expects type %s (%d) but got type %s (%d).", + fn_name, i + 1, num_params, ci_TypeToStr(expected_datatype), expected_datatype, ci_TypeToStr(actual_datatype), actual_datatype + ); + return -1; + } + } - /** Magic checks. **/ - ASSERTMAGIC(tree, MGK_EXPRESSION); - ASSERTMAGIC(maybe_str1, MGK_EXPRESSION); - ASSERTMAGIC(maybe_str2, MGK_EXPRESSION); + /** Pass. **/ + return 0; + } + + +int exp_fn_metaphone(pExpression tree, pParamObjects obj_list) + { + const char fn_name[] = "metaphone"; - /** Check object list. **/ - if (objlist == NULL) + /** Verify function schema. **/ + if (exp_fn_verify_schema(fn_name, (int[]){ DATA_T_STRING }, 1, tree, obj_list) != 0) { - mssErrorf(1, "EXP", "%s(\?\?\?) 
no object list?", fn_name); + mssErrorf(0, "EXP", "%s(?) Call does not match function schema.", fn_name); return -1; } - ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); - /** Extract str1. **/ - if (maybe_str1->Flags & EXPR_F_NULL) + /** Extract string param. **/ + pExpression maybe_str = check_ptr(tree->Children.Items[0]); + if (maybe_str->Flags & EXPR_F_NULL) { tree->Flags |= EXPR_F_NULL; - tree->DataType = DATA_T_DOUBLE; + tree->DataType = DATA_T_STRING; + return 0; + } + const char* str = check_ptr(maybe_str->String); + const size_t str_len = strlen(str); + if (str_len == 0u) + { + tree->String = ""; + tree->DataType = DATA_T_STRING; return 0; } - if (maybe_str1->DataType != DATA_T_STRING) + + /** Compute DoubleMetaphone. **/ + char* primary = NULL; + char* secondary = NULL; + meta_double_metaphone(str, &primary, &secondary); + + /** Process result. **/ + const size_t result_length = strlen(primary) + 1u + strlen(secondary) + 1u; + char* result = check_ptr(nmSysMalloc(result_length * sizeof(char*))); + if (result == NULL) return -1; + sprintf(result, "%s%c%s", primary, CA_BOUNDARY_CHAR, secondary); + + /** Return the result. **/ + tree->String = result; + tree->DataType = DATA_T_STRING; + return 0; + } + + +/*** Computes cosine or Levenshtein similarity between two strings. These two + *** tasks have a large amount of overlapping logic (mostly error checking), + *** so doing them with one function greatly reduces code duplocation. + *** + *** @param tree The tree resulting from this function. + *** @param obj_list The evaluation "scope", including available variables. + *** @param fn_name Either `cos_compare()` or `lev_compare()`. + *** @returns 0 for success, -1 for failure. + ***/ +static int exp_fn_compare(pExpression tree, pParamObjects obj_list, const char* fn_name) + { + /** Verify function schema. **/ + if (exp_fn_verify_schema(fn_name, (int[]){ DATA_T_STRING, DATA_T_STRING }, 2, tree, obj_list) != 0) { - mssErrorf(1, "EXP", "%s(\?\?\?, ..) 
str1 should be a string.", fn_name); + mssErrorf(0, "EXP", "%s(?) Call does not match function schema.", fn_name); return -1; } - char* str1 = check_ptr(maybe_str1->String); - - /** Extract str2. **/ - if (maybe_str2->Flags & EXPR_F_NULL) + + /** Extract strings. **/ + pExpression maybe_str1 = check_ptr(tree->Children.Items[0]); + pExpression maybe_str2 = check_ptr(tree->Children.Items[1]); + if (maybe_str1->Flags & EXPR_F_NULL || maybe_str2->Flags & EXPR_F_NULL) { tree->Flags |= EXPR_F_NULL; tree->DataType = DATA_T_DOUBLE; return 0; } - if (maybe_str2->DataType != DATA_T_STRING) - { - mssErrorf(1, "EXP", "%s(\"%s\", \?\?\?) str2 should be a string.", fn_name, str1); - return -1; - } + char* str1 = check_ptr(maybe_str1->String); char* str2 = check_ptr(maybe_str2->String); - /** Handle either cos_cmp or lev_cmp. **/ + /** Handle either cos_compare() or lev_compare(). **/ if (fn_name[0] == 'c') - { /* cos_cmp */ + { /* cos_compare() */ int ret; /** Build vectors. **/ @@ -4218,17 +4303,19 @@ static int exp_fn_cmp(pExpression tree, pParamObjects objlist, pExpression maybe } else { + /** Compute the similarity. **/ tree->Types.Double = ca_cos_compare(v1, v2); tree->DataType = DATA_T_DOUBLE; ret = 0; } + /** Clean up. 
**/ if (v1 != NULL) ca_free_vector(v1); if (v2 != NULL) ca_free_vector(v2); return ret; } else - { /* lev_cmp */ + { /* lev_compare() */ tree->Types.Double = ca_lev_compare(str1, str2); tree->DataType = DATA_T_DOUBLE; return 0; @@ -4237,310 +4324,43 @@ static int exp_fn_cmp(pExpression tree, pParamObjects objlist, pExpression maybe } -int exp_fn_cos_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) - { return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, "cos_cmp"); } -int exp_fn_cos_compare(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) - { return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, "cos_compare"); } -int exp_fn_cosine_compare(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) - { return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, "cosine_compare"); } -int exp_fn_lev_cmp(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) - { return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, "lev_cmp"); } -int exp_fn_lev_compare(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) - { return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, "lev_compare"); } -int exp_fn_levenshtein_compare(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1) - { return exp_fn_cmp(tree, objlist, maybe_str1, maybe_str2, u1, "levenshtein_compare"); } - - -/*** Comparse two strings to see if their sparse vectors are equal. - *** - *** @param tree The tree resulting from this function. - *** @param objlist The evaluation "scope", including available variables. - *** @param maybe_str1 Possibly the first string. - *** @param maybe_str2 Possibly the second string. - *** @param u1 Unused parameter. 
- *** @returns 0 for success, -1 for failure. - ***/ -static int exp_fn_sparse_eql(pExpression tree, pParamObjects objlist, pExpression maybe_str1, pExpression maybe_str2, pExpression u1, bool is_cos) +int exp_fn_cos_compare(pExpression tree, pParamObjects obj_list) { - const char fn_name[] = "sparse_compare"; - - /** Check number of arguments. **/ - const int num_params = tree->Children.nItems; - if (num_params != 2) - { - mssErrorf(1, "EXP", "%s(?) expects 2 parameters, got %d parameters.", fn_name, num_params); - return -1; - } - if (maybe_str1 == NULL || maybe_str2 == NULL || u1 != NULL) - { - mssErrorf(1, "EXP", "%s(?) expects 2 parameters.", fn_name); - return -1; - } - - /** Magic checks. **/ - ASSERTMAGIC(tree, MGK_EXPRESSION); - ASSERTMAGIC(maybe_str1, MGK_EXPRESSION); - ASSERTMAGIC(maybe_str2, MGK_EXPRESSION); - - /** Check object list. **/ - if (objlist == NULL) - { - mssErrorf(1, "EXP", "%s(\?\?\?) no object list?", fn_name); - return -1; - } - ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); - - /** Extract str1. **/ - if (maybe_str1->Flags & EXPR_F_NULL) - { - mssErrorf(1, "EXP", "%s(NULL, ...) str1 cannot be NULL.", fn_name); - return -1; - } - if (maybe_str1->DataType != DATA_T_STRING) - { - mssErrorf(1, "EXP", "%s(\?\?\?, ..) str1 should be a string.", fn_name); - return -1; - } - char* str1 = maybe_str1->String; - if (str1 == NULL) - { - mssErrorf(1, "EXP", - "%s(nothing?, ...) expected string from str1 (of datatype DataType = " - "DATA_T_STRING), but the String was NULL or did not exist!", - fn_name - ); - return -1; - } - - /** Extract str2. **/ - if (maybe_str2->Flags & EXPR_F_NULL) - { - mssErrorf(1, "EXP", "%s(\"%s\", NULL) str2 cannot be NULL.", fn_name, str1); - return -1; - } - if (maybe_str2->DataType != DATA_T_STRING) - { - mssErrorf(1, "EXP", "%s(\"%s\", \?\?\?) str2 should be a string.", fn_name, str1); - return -1; - } - char* str2 = maybe_str2->String; - if (str2 == NULL) - { - mssErrorf(1, "EXP", - "%s(\"%s\", nothing?) 
expected string from str2 (of datatype DataType = " - "DATA_T_STRING), but the String was NULL or did not exist!", - fn_name, str1 - ); - return -1; - } - - /** Build vectors. **/ - int ret; - const pVector v1 = check_ptr(ca_build_vector(str1)); - const pVector v2 = check_ptr(ca_build_vector(str2)); - if (v1 == NULL || v2 == NULL) - { - mssErrorf(1, "EXP", - "%s(\"%s\", \"%s\") - Failed to build vectors.", - fn_name, str1, str2 - ); - ret = -1; - } - else - { - tree->Integer = (ca_eql(v1, v2)) ? 1 : 0; - tree->DataType = DATA_T_INTEGER; - ret = 0; - } - - if (v1 != NULL) ca_free_vector(v1); - if (v2 != NULL) ca_free_vector(v2); - return ret; + return exp_fn_compare(tree, obj_list, "cos_compare"); } - -/*** Computes double metaphone. - *** - *** @param tree The tree resulting from this function. - *** @param objlist The evaluation "scope", including available variables. - *** @param maybe_str Possibly the string passed to double metaphone. - *** @param u1 Unused parameter. - *** @param u2 Unused parameter. - ***/ -int exp_fn_double_metaphone(pExpression tree, pParamObjects objlist, pExpression maybe_str, pExpression u1, pExpression u2) +int exp_fn_lev_compare(pExpression tree, pParamObjects obj_list) { - const char fn_name[] = "double_metaphone"; - - /** Check number of arguments. **/ - const int num_params = tree->Children.nItems; - if (num_params != 1) - { - mssErrorf(1, "EXP", "%s(?) expects 1 parameter, got %d parameters.", fn_name, num_params); - return -1; - } - if (maybe_str == NULL || u1 != NULL || u2 != NULL) - { - mssErrorf(1, "EXP", "%s(?) expects 1 parameter.", fn_name); - return -1; - } - - /** Magic checks. **/ - ASSERTMAGIC(tree, MGK_EXPRESSION); - ASSERTMAGIC(maybe_str, MGK_EXPRESSION); - - /** Check object list. **/ - if (objlist == NULL) - { - mssErrorf(1, "EXP", "%s(\?\?\?) no object list?", fn_name); - return -1; - } - ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); - - /** Extract str. 
**/ - if (maybe_str->Flags & EXPR_F_NULL) - { - mssErrorf(1, "EXP", "%s(NULL) str cannot be NULL.", fn_name); - return -1; - } - if (maybe_str->DataType != DATA_T_STRING) - { - mssErrorf(1, "EXP", "%s(\?\?\?) str should be a string.", fn_name); - return -1; - } - const char* str = maybe_str->String; - if (str == NULL) - { - mssErrorf(1, "EXP", - "%s(nothing?) expected string from str (of datatype DataType = " - "DATA_T_STRING), but the String was NULL or did not exist!", - fn_name - ); - return -1; - } - const size_t str_len = strlen(str); - if (str_len == 0u) - { - mssErrorf(1, "EXP", "%s(\"\") str cannot be an empty string.", fn_name); - return -1; - } - - /** Compute DoubleMetaphone. **/ - char* primary = NULL; - char* secondary = NULL; - meta_double_metaphone(str, &primary, &secondary); - - /** Process result. **/ - const size_t primary_length = strlen(primary); - const size_t secondary_length = strlen(secondary); - char* result = check_ptr(nmSysMalloc(primary_length + 1u + secondary_length + 1u)); - if (result == NULL) return -1; - sprintf(result, "%s%c%s", primary, CA_BOUNDARY_CHAR, secondary); - - /** Return the result. **/ - tree->String = result; - tree->DataType = DATA_T_STRING; - return 0; + return exp_fn_compare(tree, obj_list, "lev_compare"); } - -int exp_fn_aggregate_similarities(pExpression tree, pParamObjects objlist) - { - const char fn_name[] = "aggregate_similarities"; - - /** Check number of arguments. **/ - const int num_params = tree->Children.nItems; - if (num_params != 6) - { - mssErrorf(1, "EXP", "%s(?) expects 6 parameters, got %d parameters.", fn_name, num_params); - return -1; - } - /** Magic checks. 
**/ - ASSERTMAGIC(tree, MGK_EXPRESSION); - ASSERTMAGIC(tree->Children.Items[0], MGK_EXPRESSION); - ASSERTMAGIC(tree->Children.Items[1], MGK_EXPRESSION); - ASSERTMAGIC(tree->Children.Items[2], MGK_EXPRESSION); - ASSERTMAGIC(tree->Children.Items[3], MGK_EXPRESSION); +int exp_fn_levenshtein(pExpression tree, pParamObjects obj_list) + { + const char fn_name[] = "levenshtein"; - /** Check object list. **/ - if (objlist == NULL) + /** Verify function schema. **/ + if (exp_fn_verify_schema(fn_name, (int[]){ DATA_T_STRING, DATA_T_STRING }, 2, tree, obj_list) != 0) { - mssErrorf(1, "EXP", "%s(\?\?\?) no object list?", fn_name); + mssErrorf(0, "EXP", "%s(?) Call does not match function schema.", fn_name); return -1; } - ASSERTMAGIC(objlist->Session, MGK_OBJSESSION); - - /** Extract parameters. **/ - double params[4] = {NAN}; - const char names[4][8] = {"name", "email", "phone", "address"}; - for (unsigned int i = 0; i < 4u; i++) - { - pExpression param = (pExpression)tree->Children.Items[i]; - - /** Ignore null values. **/ - if (param->Flags & EXPR_F_NULL) continue; - - /** Only accept doubles. **/ - if (param->DataType != DATA_T_DOUBLE) - { - mssErrorf(1, "EXP", - "%s() param%u (%s) expected type %s but got %s.", - fn_name, i, names[i], ci_TypeToStr(DATA_T_DOUBLE), ci_TypeToStr(param->DataType) - ); - if (param->DataType == DATA_T_INTEGER) fprintf(stderr, "Value: %d\n", param->Integer); - return -1; - } - - /** Do not accept NaN. **/ - params[i] = param->Types.Double; - if (isnan(params[i])) - { - mssErrorf(1, "EXP", "%s() param%u (%s) cannot be NaN", fn_name, names[i], i); - return -1; - } - } - char* dup_names[2] = {NULL}; - for (unsigned int i = 0; i < 2u; i++) + /** Extract strings. 
**/ + pExpression maybe_str1 = check_ptr(tree->Children.Items[0]); + pExpression maybe_str2 = check_ptr(tree->Children.Items[1]); + if (maybe_str1->Flags & EXPR_F_NULL || maybe_str2->Flags & EXPR_F_NULL) { - pExpression param = (pExpression)tree->Children.Items[i + 4u]; - - /** Ignore null values. **/ - if (param->Flags & EXPR_F_NULL) continue; - - /** Only accept doubles. **/ - if (param->DataType != DATA_T_STRING) - { - mssErrorf(1, "EXP", - "%s() param%u expected type %s but got %s.", - fn_name, i, ci_TypeToStr(DATA_T_STRING), ci_TypeToStr(param->DataType) - ); - return -1; - } - - dup_names[i] = param->String; + tree->Flags |= EXPR_F_NULL; + tree->DataType = DATA_T_INTEGER; + return 0; } + char* str1 = check_ptr(maybe_str1->String); + char* str2 = check_ptr(maybe_str2->String); - FILE *f = check_ptr(fopen("/home/israel/exp_log.swift", "a")); - check_neg(fprintf(f, "aggregate_similarities(%g, %g, %g, %g, \"%s\", \"%s\")", params[0], params[1], params[2], params[3], dup_names[0], dup_names[1])); - - /** Compute aggregated similarity. **/ - double name_sim = params[0]; - double email_sim = params[1]; - double phone_sim = params[2]; - double address_sim = params[3]; - - double mean = 0.0, n = 0.0; - if (name_sim > 0.0) { mean += name_sim; n++; } - if (email_sim > 0.0) { mean += email_sim; n++; } - if (phone_sim > 0.0) { mean += phone_sim; n++; } - if (address_sim > 0.0) { mean += address_sim; n++; } - mean /= n; - - /** Success. **/ - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = mean; - fprintf(f, " = %g\n", tree->Types.Double); - check(fclose(f)); + /** Compute edit distance. **/ + /** Length 0 is provided for both strings so that the function will compute it for us. 
**/ + tree->Integer = edit_dist(str1, str2, 0lu, 0lu); + tree->DataType = DATA_T_INTEGER; return 0; } @@ -4552,7 +4372,7 @@ int exp_fn_aggregate_similarities(pExpression tree, pParamObjects objlist) * Parameters: * pExpression tree: * pParamObjects: - * pExpression passowrd: The password, passed as a pExpression + * pExpression password: The password, passed as a pExpression * pExpression salt: The salt, passed as a pExpression * * returns: @@ -4745,15 +4565,10 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "power", (char*)exp_fn_power); /** Duplicate detection. **/ - xhAdd(&EXP.Functions, "cos_cmp", (char*)exp_fn_cos_cmp); + xhAdd(&EXP.Functions, "metaphone", (char*)exp_fn_metaphone); xhAdd(&EXP.Functions, "cos_compare", (char*)exp_fn_cos_compare); - xhAdd(&EXP.Functions, "cosine_compare", (char*)exp_fn_cosine_compare); - xhAdd(&EXP.Functions, "lev_cmp", (char*)exp_fn_lev_cmp); xhAdd(&EXP.Functions, "lev_compare", (char*)exp_fn_lev_compare); - xhAdd(&EXP.Functions, "levenshtein_compare", (char*)exp_fn_levenshtein_compare); - xhAdd(&EXP.Functions, "sparse_eql", (char*)exp_fn_sparse_eql); - xhAdd(&EXP.Functions, "aggregate_similarities", (char*)exp_fn_aggregate_similarities); - xhAdd(&EXP.Functions, "double_metaphone", (char*)exp_fn_double_metaphone); + xhAdd(&EXP.Functions, "levenshtein", (char*)exp_fn_levenshtein); /** Windowing. **/ xhAdd(&EXP.Functions, "row_number", (char*)exp_fn_row_number); diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index b3d416668..4bf94ebe9 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -102,7 +102,7 @@ void void_func() {} *** not readily available. *** *** @param clr Whether to clear the current error stack. As a rule of thumb, - *** if you are the first one to detec the error, clear the stack so that + *** if you are the first one to detect the error, clear the stack so that *** other unrelated messages are not shown. 
If you are detecting an error *** from another function that may also call an mssError() function, do *** not clear the stack. @@ -188,7 +188,7 @@ static int ci_TypeFromStr(const char* str) case 'I': case 'i': if (strcmp(str+1, "Integer"+1) == 0) return DATA_T_INTEGER; - if (strcmp(str+1, "IntVecor"+1) == 0) return DATA_T_INTVEC; + if (strcmp(str+1, "IntVector"+1) == 0) return DATA_T_INTVEC; break; case 'M': case 'm': @@ -221,7 +221,7 @@ static char* ci_TypeToStr(const int type) case DATA_T_STRING: return "String"; case DATA_T_DOUBLE: return "Double"; case DATA_T_DATETIME: return "DateTime"; - case DATA_T_INTVEC: return "IntVecor"; + case DATA_T_INTVEC: return "IntVector"; case DATA_T_STRINGVEC: return "StringVector"; case DATA_T_MONEY: return "Money"; case DATA_T_ARRAY: return "Array"; @@ -282,7 +282,7 @@ static void** ci_xaToTrimmedArray(pXArray arr, int array_handling) }) -/** ================ Enum Declairations ================ **/ +/** ================ Enum Declarations ================ **/ /** ANCHOR[id=enums] **/ /** Enum representing a clustering algorithm. **/ @@ -430,7 +430,7 @@ char* const METHOD_NAME[] = *** the SourcePath. *** *** @skip --> Computed data. - *** @param Strings The keys for each data string strings recieved from the + *** @param Strings The keys for each data string strings received from the *** database, allowing them to be lined up again when queried. *** @param Strings The data strings to be clustered and searched, or NULL if *** they have not been fetched from the source. @@ -741,7 +741,7 @@ static int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData /** Driver Functions. 
**/ // LINK #driver -void* clusterOpen(pObject obj, int mask, pContentType systype, char* usr_type, pObjTrxTree* oxt); +void* clusterOpen(pObject parent, int mask, pContentType systype, char* usr_type, pObjTrxTree* oxt); int clusterClose(void* inf_v, pObjTrxTree* oxt); void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt); void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt); @@ -749,30 +749,30 @@ int clusterQueryClose(void* qy_v, pObjTrxTree* oxt); int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt); int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt); -char* clusterGetFirstAttr(void* inf_v, pObjTrxTree oxt); -char* clusterGetNextAttr(void* inf_v, pObjTrxTree oxt); +char* clusterGetFirstAttr(void* inf_v, pObjTrxTree* oxt); +char* clusterGetNextAttr(void* inf_v, pObjTrxTree* oxt); int clusterInfo(void* inf_v, pObjectInfo info); /** Method Execution Functions. **/ // LINK #method -char* clusterGetFirstMethod(void* inf_v, pObjTrxTree oxt); -char* clusterGetNextMethod(void* inf_v, pObjTrxTree oxt); +char* clusterGetFirstMethod(void* inf_v, pObjTrxTree* oxt); +char* clusterGetNextMethod(void* inf_v, pObjTrxTree* oxt); static int ci_PrintEntry(pXHashEntry entry, void* arg); static void ci_CacheFreeSourceData(pXHashEntry entry, void* path); static void ci_CacheFreeCluster(pXHashEntry entry, void* path); static void ci_CacheFreeSearch(pXHashEntry entry, void* path); -int clusterExecuteMethod(void* inf_v, char* methodname, pObjData param, pObjTrxTree oxt); +int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrxTree* oxt); /** Unimplemented DriverFunctions. 
**/ // LINK #unimplemented int clusterCreate(pObject obj, int mask, pContentType systype, char* usrtype, pObjTrxTree* oxt); -int clusterDeleteObj(void* inf_v, pObjTrxTree* oxt); int clusterDelete(pObject obj, pObjTrxTree* oxt); -int clusterRead(void* inf_v, char* buffer, int maxcnt, int offset, int flags, pObjTrxTree* oxt); +int clusterDeleteObj(void* inf_v, pObjTrxTree* oxt); +int clusterRead(void* inf_v, char* buffer, int max_cnt, int offset, int flags, pObjTrxTree* oxt); int clusterWrite(void* inf_v, char* buffer, int cnt, int offset, int flags, pObjTrxTree* oxt); -int clusterSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree oxt); -int clusterAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree oxt); -void* clusterOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree oxt); +int clusterSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt); +int clusterAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree* oxt); +void* clusterOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree* oxt); int clusterCommit(void* inf_v, pObjTrxTree *oxt); /** ================ Parsing Functions ================ **/ @@ -1037,7 +1037,7 @@ static pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, /** Cache hit. **/ tprintf("# source: \"%s\"\n", source_data->Key); - /** Cause an imediate invalid read if cache was incorrectly freed. **/ + /** Cause an immediate invalid read if cache was incorrectly freed. **/ tprintf("--> Name: %s\n", source_maybe->Name); /** Free data we don't need. **/ @@ -1709,7 +1709,7 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject parent) int num_provided_params = (has_provided_params) ? parent->Pathname->OpenCtl[parent->SubPtr - 1]->nSubInf : 0; pStruct* provided_params = (has_provided_params) ? parent->Pathname->OpenCtl[parent->SubPtr - 1]->SubInf : NULL; - /** Itterate over each param in the structure file. 
**/ + /** Iterate over each param in the structure file. **/ node_data->nParams = param_infs.nItems; const size_t params_size = node_data->nParams * sizeof(pParam); node_data->Params = check_ptr(nmSysMalloc(params_size)); @@ -1920,7 +1920,7 @@ static void ci_FreeSourceData(pSourceData source_data) /*** Free pClusterData struct with an option to recursively free subclusters. *** *** @param cluster_data The cluster data struct to free. - *** @param recrusive Whether to recursively free subclusters. + *** @param recursive Whether to recursively free subclusters. ***/ static void ci_FreeClusterData(pClusterData cluster_data, bool recursive) { @@ -2142,7 +2142,7 @@ static unsigned int ci_SizeOfSourceData(pSourceData source_data) *** caching systems, so it is not technically part of the struct. *** *** @param cluster_data The cluster data struct to be queried. - *** @param recrusive Whether to recursively free subclusters. + *** @param recursive Whether to recursively free subclusters. *** @returns The size in bytes of the struct and all internal allocated data. ***/ static unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursive) @@ -2224,7 +2224,7 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) if (!check(objCurrentDate(&source_data->DateComputed))) goto end; /** Open the source path specified by the .cluster file. **/ - tprintf("Openning...\n"); + tprintf("Opening...\n"); pObject obj = objOpen(session, source_data->SourcePath, OBJ_O_RDONLY, 0600, "system/directory"); if (obj == NULL) { @@ -2239,7 +2239,7 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) } /** Generate a "query" for retrieving data. 
**/ - tprintf("Openning query...\n"); + tprintf("Opening query...\n"); pObjQuery query = objOpenQuery(obj, NULL, NULL, NULL, NULL, 0); if (query == NULL) { @@ -2424,7 +2424,7 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) // ret = ret; // Fall-through: Failure ignored. } } - tprintf("\nData aquired.\n"); + tprintf("\nData acquired.\n"); source_data->nVectors = vector_xarray.nItems; if (source_data->nVectors == 0) { @@ -2508,7 +2508,7 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) ***/ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) { - /** If the clusters are alreadyd computed, we're done. **/ + /** If the clusters are already computed, we're done. **/ if (cluster_data->Clusters != NULL) return 0; /** Make source data available. **/ @@ -2565,7 +2565,7 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) if (cluster_data->SimilarityMeasure != SIMILARITY_COSINE) { mssErrorf(1, "Cluster", - "The similarity meausre \"%s\" is not implemented.", + "The similarity measure \"%s\" is not implemented.", ci_SimilarityMeasureToString(cluster_data->SimilarityMeasure) ); goto err_free_sims; @@ -2840,8 +2840,8 @@ static int ci_GetParamType(void* inf_v, const char* attr_name) *** *** @attention - Warning: If the retrieved value is `NULL`, the pObjectData *** val is not updated, and the function returns 1, indicating `NULL`. - *** This is intended behavior, for consistancy with other Centrallix - *** functions, so keep it in mind so you're not surpised. + *** This is intended behavior, for consistency with other Centrallix + *** functions, so keep it in mind so you're not surprised. *** *** @param inf_v Node data containing the list of paramenters. *** @param attr_name The name of the requested paramenter. @@ -2853,9 +2853,9 @@ static int ci_GetParamType(void* inf_v, const char* attr_name) *** so that they will have a pointer to the data. 
*** This buffer will not be modified unless the data is successfully *** found. If a value other than 0 is returned, the buffer is not updated. - *** @returns 0 if successsful, + *** @returns 0 if successful, *** 1 if the variable is null, - *** -1 if an error occures. + *** -1 if an error occurs. *** *** LINK ../../centrallix-lib/include/datatypes.h:72 ***/ @@ -2919,7 +2919,7 @@ static int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData *** @param oxt The object system tree, similar to a kind of "scope" (unused). *** *** @returns A pDriverData struct representing a driver instance, or - *** NULL if an error occures. + *** NULL if an error occurs. ***/ void* clusterOpen(pObject parent, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt) { @@ -3352,8 +3352,8 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) *** so that they will have a pointer to the data. *** This buffer will not be modified unless the data is successfully *** found. If a value other than 0 is returned, the buffer is not updated. - *** @returns 0 if successsful, - *** -1 if an error occures. + *** @returns 0 if successful, + *** -1 if an error occurs. *** *** LINK ../../centrallix-lib/include/datatypes.h:72 ***/ @@ -3593,7 +3593,7 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val static StringVec* vec = NULL; if (vec != NULL) nmFree(vec, sizeof(StringVec)); - /** Allocate and initiallize the requested data. **/ + /** Allocate and initialize the requested data. **/ val->StringVec = vec = check_ptr(nmMalloc(sizeof(StringVec))); if (val->StringVec == NULL) return -1; val->StringVec->nStrings = target_cluster->Size; @@ -3653,15 +3653,15 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val *** provided cluster driver instance. 
*** *** Note: Failures from nmSysStrdup() and several others are ignored because - *** the worst case senario is that the attributes are set to null, which + *** the worst case scenario is that the attributes are set to null, which *** will cause them to be ignored. I consider that to be better than than *** throwing an error that could unnecessarily disrupt normal usage. *** *** @param inf_v The driver instance to be read. *** @param attr_name The name of the requested attribute. *** @param oxt The object system tree, similar to a kind of "scope" (unused). - *** @returns A presentation hints object, if successsful, - *** NULL if an error occures. + *** @returns A presentation hints object, if successful, + *** NULL if an error occurs. ***/ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt) { @@ -3954,13 +3954,13 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /*** Returns the name of the first attribute that one can get from *** this driver instance (using GetAttrType() and GetAttrValue()). *** Resets the internal variable (TargetAttrIndex) used to maintain - *** itteration state for clusterGetNextAttr(). + *** iteration state for clusterGetNextAttr(). *** *** @param inf_v The driver instance to be read. *** @param oxt Unused. *** @returns The name of the first attribute. ***/ -char* clusterGetFirstAttr(void* inf_v, pObjTrxTree oxt) +char* clusterGetFirstAttr(void* inf_v, pObjTrxTree* oxt) { tprintf("Warning: clusterGetFirstAttr() is under active development.\n"); pDriverData driver_data = (pDriverData)inf_v; @@ -3973,13 +3973,13 @@ char* clusterGetFirstAttr(void* inf_v, pObjTrxTree oxt) /*** Returns the name of the next attribute that one can get from *** this driver instance (using GetAttrType() and GetAttrValue()). *** Uses an internal variable (TargetAttrIndex) used to maintain - *** the state of this itteration over repeated calls. 
+ *** the state of this iteration over repeated calls. *** *** @param inf_v The driver instance to be read. *** @param oxt Unused. *** @returns The name of the next attribute. ***/ -char* clusterGetNextAttr(void* inf_v, pObjTrxTree oxt) +char* clusterGetNextAttr(void* inf_v, pObjTrxTree* oxt) { tprintf("Warning: clusterGetNextAttr("); pDriverData driver_data = (pDriverData)inf_v; @@ -4004,7 +4004,7 @@ char* clusterGetNextAttr(void* inf_v, pObjTrxTree oxt) *** *** @param inf_v The driver instance to be checked. *** @param info The struct to be populated with driver flags. - *** @returns 0 if succesful, + *** @returns 0 if successful, *** -1 if the driver is an unimplemented type (should never happen). ***/ int clusterInfo(void* inf_v, pObjectInfo info) @@ -4088,14 +4088,14 @@ int clusterInfo(void* inf_v, pObjectInfo info) /*** Returns the name of the first method that one can execute from *** this driver instance (using clusterExecuteMethod()). Resets the - *** internal variable (TargetMethodIndex) used to maintain itteration + *** internal variable (TargetMethodIndex) used to maintain iteration *** state for clusterGetNextMethod(). *** *** @param inf_v The driver instance to be read. *** @param oxt Unused. - *** @returns The name of the first methd. + *** @returns The name of the first method. ***/ -char* clusterGetFirstMethod(void* inf_v, pObjTrxTree oxt) +char* clusterGetFirstMethod(void* inf_v, pObjTrxTree* oxt) { tprintf("Warning: clusterGetFirstMethod() is under active development.\n"); pDriverData driver_data = (pDriverData)inf_v; @@ -4108,13 +4108,13 @@ char* clusterGetFirstMethod(void* inf_v, pObjTrxTree oxt) /*** Returns the name of the next method that one can get from *** this driver instance (using GetAttrType() and GetAttrValue()). *** Uses an internal variable (TargetMethodIndex) used to maintain - *** the state of this itteration over repeated calls. + *** the state of this iteration over repeated calls. 
*** *** @param inf_v The driver instance to be read. *** @param oxt Unused. *** @returns The name of the next method. ***/ -char* clusterGetNextMethod(void* inf_v, pObjTrxTree oxt) +char* clusterGetNextMethod(void* inf_v, pObjTrxTree* oxt) { tprintf("Warning: clusterGetNextMethod() is under active development."); pDriverData driver_data = (pDriverData)inf_v; @@ -4247,7 +4247,7 @@ static void ci_CacheFreeSearch(pXHashEntry entry, void* path) *** @param param A possibly optional param passed to the method. *** @param oxt The object system tree, similar to a kind of "scope" (unused). ***/ -int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrxTree oxt) +int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrxTree* oxt) { tprintf("Warning: clusterExecuteMethod(\"%s\") is under active development.\n", method_name); pDriverData driver_data = (pDriverData)inf_v; @@ -4304,7 +4304,7 @@ )); if (failed) { - mssErrorf(0, "Cluster", "Unexpected error occured while showhing caches."); + mssErrorf(0, "Cluster", "Unexpected error occurred while showing caches."); ret = -1; } @@ -4345,7 +4345,7 @@ ); goto err; } - + if (strcmp(method_name, "stat") == 0) { unsigned long long ExpectedOpenCalls = 10666; @@ -4388,25 +4388,25 @@ // LINK #functions /** Not implemented. **/ -int clusterCreate(pObject obj, int mask, pContentType systype, char* usrtype, pObjTrxTree* oxt) +int clusterCreate(pObject obj, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt) { mssErrorf(1, "Cluster", "clusterCreate() is not implemented."); return -ENOSYS; } /** Not implemented.
**/ -int clusterDeleteObj(void* inf_v, pObjTrxTree* oxt) +int clusterDelete(pObject obj, pObjTrxTree* oxt) { - mssErrorf(1, "Cluster", "clusterDeleteObj() is not implemented."); + mssErrorf(1, "Cluster", "clusterDelete() is not implemented."); return -1; } /** Not implemented. **/ -int clusterDelete(pObject obj, pObjTrxTree* oxt) +int clusterDeleteObj(void* inf_v, pObjTrxTree* oxt) { - mssErrorf(1, "Cluster", "clusterDelete() is not implemented."); + mssErrorf(1, "Cluster", "clusterDeleteObj() is not implemented."); return -1; } /** Not implemented. **/ -int clusterRead(void* inf_v, char* buffer, int maxcnt, int offset, int flags, pObjTrxTree* oxt) +int clusterRead(void* inf_v, char* buffer, int max_cnt, int offset, int flags, pObjTrxTree* oxt) { mssErrorf(1, "Cluster", "clusterRead() not implemented."); fprintf(stderr, "HINT: Use queries instead, (e.g. clusterOpenQuery()).\n"); @@ -4419,25 +4419,25 @@ int clusterWrite(void* inf_v, char* buffer, int cnt, int offset, int flags, pObj return -1; } /** Not implemented. **/ -int clusterSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree oxt) +int clusterSetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* oxt) { mssErrorf(1, "Cluster", "clusterSetAttrValue() not implemented because clusters are imutable."); return -1; } /** Not implemented. **/ -int clusterAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree oxt) +int clusterAddAttr(void* inf_v, char* attr_name, int type, pObjData val, pObjTrxTree* oxt) { mssErrorf(1, "Cluster", "clusterAddAttr() not implemented because clusters are imutable."); return -1; } /** Not implemented. **/ -void* clusterOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree oxt) +void* clusterOpenAttr(void* inf_v, char* attr_name, int mode, pObjTrxTree* oxt) { mssErrorf(1, "Cluster", "clusterOpenAttr() not implemented."); return NULL; } /** Not implemented. 
**/ -int clusterCommit(void* inf_v, pObjTrxTree *oxt) +int clusterCommit(void* inf_v, pObjTrxTree* oxt) { mssErrorf(1, "Cluster", "clusterCommit() not implemented because clusters are imutable."); return 0; @@ -4451,7 +4451,7 @@ int clusterCommit(void* inf_v, pObjTrxTree *oxt) *** - Initializing global data needed for the driver. *** *** @returns 0 if successful, or - *** -1 if an error occured. + *** -1 if an error occurs. ***/ int clusterInitialize(void) { @@ -4470,10 +4470,10 @@ int clusterInitialize(void) memset(&ClusterStatistics, 0, sizeof(ClusterStatistics)); /** Setup the structure. **/ - if (check_ptr(strcpy(drv->Name, "clu - Clustering Driver")) == NULL) goto err; + if (check_ptr(strcpy(drv->Name, "cluster - Clustering Driver")) == NULL) goto err; if (!check(xaInit(&drv->RootContentTypes, 1))) goto err; if (!check_neg(xaAddItem(&drv->RootContentTypes, "system/cluster"))) goto err; - drv->Capabilities = OBJDRV_C_TRANS | OBJDRV_C_FULLQUERY; /* TODO: Greg, are these correct? Should I add any others? */ + drv->Capabilities = 0; /* TODO: Greg - Should I add any of these? */ /** Setup the function references. **/ drv->Open = clusterOpen; @@ -4503,8 +4503,11 @@ int clusterInitialize(void) drv->Commit = clusterCommit; drv->GetQueryCoverageMask = NULL; drv->GetQueryIdentityPath = NULL; - - /** Register some structures. **/ + + /** Register the driver. **/ + if (!check(objRegisterDriver(drv))) goto err; + + /** Register structs used in this project with the newmalloc memory management system. **/ nmRegister(sizeof(SourceData), "ClusterSourceData"); nmRegister(sizeof(Cluster), "Cluster"); nmRegister(sizeof(ClusterData), "ClusterData"); @@ -4514,42 +4517,6 @@ int clusterInitialize(void) nmRegister(sizeof(ClusterQuery), "ClusterQuery"); nmRegister(sizeof(ClusterDriverCaches), "ClusterDriverCaches"); - /** Print debug size info. 
**/ -// char buf1[16], buf2[16], buf3[16], buf4[16], buf5[16], buf6[16], buf7[16], buf8[16]; -// tprintf( -// "Cluster driver struct sizes:\n" -// " > sizeof(SourceData): %s\n" -// " > sizeof(Cluster): %s\n" -// " > sizeof(ClusterData): %s\n" -// " > sizeof(SearchData): %s\n" -// " > sizeof(NodeData): %s\n" -// " > sizeof(DriverData): %s\n" -// " > sizeof(ClusterQuery): %s\n" -// " > sizeof(ClusterDriverCaches): %s\n", -// snprint_bytes(buf1, sizeof(buf1), sizeof(SourceData)), -// snprint_bytes(buf2, sizeof(buf2), sizeof(Cluster)), -// snprint_bytes(buf3, sizeof(buf3), sizeof(ClusterData)), -// snprint_bytes(buf4, sizeof(buf4), sizeof(SearchData)), -// snprint_bytes(buf5, sizeof(buf5), sizeof(NodeData)), -// snprint_bytes(buf6, sizeof(buf6), sizeof(DriverData)), -// snprint_bytes(buf7, sizeof(buf7), sizeof(ClusterQuery)), -// snprint_bytes(buf8, sizeof(buf8), sizeof(ClusterDriverCaches)) -// ); -// - // 'st' (7: 13) collides with 'an' (7: 11) -// char* str1 = "This is a very long string of text"; -// char* str2 = "This is a very long string of textttttttttttt"; -// pVector v1 = ca_build_vector(str1); -// pVector v2 = ca_build_vector(str2); -// ca_fprint_vector(stdout, v1); printf("\n"); -// ca_fprint_vector(stdout, v2); printf("\n"); -// fprintf(stderr, "'%s' ?= '%s' -> %g\n", str1, str2, ca_cos_compare(v1, v2)); -// ca_free_vector(v1); -// ca_free_vector(v2); - - /** Register the driver. **/ - if (!check(objRegisterDriver(drv))) goto err; - /** Success. **/ return 0; From fa28afa4b0c282b45fe46a87a7d11db904578925 Mon Sep 17 00:00:00 2001 From: Israel Date: Fri, 14 Nov 2025 11:48:53 -0700 Subject: [PATCH 09/30] Add ClusterDriverRequirements (forgot to commit them before). Add known issues to string similarity documentation. Clean up and organize todos. Clean up testing code in several files. 
--- .../ClusterDriverRequirements-old.md | 186 +++++++++++++++ centrallix-sysdoc/OSDriver_Authoring.md | 22 +- centrallix-sysdoc/string_similarity.md | 6 +- centrallix/expression/exp_functions.c | 2 +- centrallix/osdrivers/objdrv_cluster.c | 217 ++++++------------ 5 files changed, 271 insertions(+), 162 deletions(-) create mode 100644 centrallix-sysdoc/ClusterDriverRequirements-old.md diff --git a/centrallix-sysdoc/ClusterDriverRequirements-old.md b/centrallix-sysdoc/ClusterDriverRequirements-old.md new file mode 100644 index 000000000..601f41703 --- /dev/null +++ b/centrallix-sysdoc/ClusterDriverRequirements-old.md @@ -0,0 +1,186 @@ + +## Cluster Driver Specifications +### Cluster Open +```c +void* clusterOpen(pObject obj, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt); +``` +`clusterOpen()` shall... +- Create or read a node, as indicated by passed flags. + - Read flags from `obj->Mode`. + - If `O_EXCL` is specified, `O_CREAT` is specified, and there are no other elements in the path, create a new node. + - Otherwise attempt to read the previous object (in `obj->Prev`). + - If this fails and `O_CREAT` is specified, create a new node. + - If there is still no node, fail. +- Parse the provided path. + - Use `obj_internal_PathPart()` with the pathname in `obj->Pathname`. + - Not parse previous parts of the path already parsed by other drivers. + - Start at the `obj->SubPtr`-th path element (skipping `obj->SubPtr - 1` elements). + - Consume elements in the path until `obj_internal_PathPart()` returns `NULL`. + - Store the number of elements consumed in `obj->SubCnt`. +- Determine what data is being targeted from the parsed path. + - If the relevant part of the path contains only the name of the file, the driver shall set the target to root. + - If it contains the name of a valid (sub)cluster or search, the driver shall set the target to that (sub)cluster or search. + - Otherwise, the driver shall produce a descriptive error. 
+- Parse the provided structure file. + - Follow the spec given in `cluster-schema.cluster`. + - Produce descriptive errors when issues are detected. +- Return a new struct containing necessary information, including: + - The name, source path, and attribute name. + - All parameters (and a param list for scope), clusters, and searches. + - Each parameter shall be represented by a `pParam` object (see `params.h`). + - Each cluster shall be represented by a struct with information including: + - Its name, clustering algorithm, and similarity measure. + - The number of clusters to generate. + - If a k-means algorithm is specified, the improvement threshold. + - The maximum number of iterations to run. + - A list of subclusters with at least this information for each. + - Each search shall be represented by a struct with information including: + - Its name, threshold, and similarity measure. + - Its source, which is a valid cluster name of a cluster in the clusters list. + - Information about targets, derived from the path. + +### Cluster Close +```c +int clusterClose(void* inf_v, pObjTrxTree* oxt); +``` +`clusterClose()` shall... +- Free all allocated data in the driver struct. +- Close any open files or the like in the driver struct. +- Return 0. + +### Cluster Open Query +```c +void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt); +``` +`clusterOpenQuery()` shall... +- Return a query struct that can be passed to `clusterQueryFetch()`. + - This struct shall contain an index to the last row accessed (starting at 0). + - This struct shall contain a pointer to the driver data. + +### Cluster Query Fetch +```c +void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) +``` +`clusterQueryFetch()` shall... +- If the driver struct targets the root node, this function shall produce an error. +- If the driver struct targets an entry, this function shall produce a different error. 
+- If the driver targets a cluster or search, this function shall return a driver struct targeting the cluster or search *entry* (respectively) indicated by the query struct's row pointer, and increment the pointer. + - Exception: If no data remains, this function shall return `NULL` instead. + - This request shall cause clustering / searching to execute, if it has not executed already. + +### Cluster Query Close +```c +int clusterQueryClose(void* qy_v, pObjTrxTree* oxt); +``` +`clusterQueryClose()` shall... +- Free all allocated data in the query struct. +- Close any open files or the like in the query struct. +- Return 0. + +### Cluster Get Attribute Type +```c +int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt); +``` +`clusterGetAttrType()` shall... +- Return the `DATA_T_...` type of the requested attribute, or `DATA_T_UNAVAILABLE` if the attribute does not exist. +- The name, content_type, inner_type, and outer_type attributes shall be of type `DATA_T_STRING`. +- The last_modification attribute shall be of type `DATA_T_DATETIME`. +- If the target is root... + - The source and attr_name attributes shall be of type `DATA_T_STRING`. +- If the target is a cluster... + - The algorithm and similarity_measure attributes shall be of type `DATA_T_STRING`. + - The num_clusters and max_iterations attributes shall be of type `DATA_T_INTEGER`. + - The improvement_threshold and average_similarity attributes shall be of type `DATA_T_DOUBLE`. +- If the target is a search... + - The source and similarity_measure attribute shall be of type `DATA_T_STRING`. + - The threshold attribute shall be of type `DATA_T_DOUBLE`. +- If the target is a cluster entry... + - The val attribute shall be of type `DATA_T_INTEGER`. + - The sim attribute shall be of type `DATA_T_DOUBLE`. +- If the target is a search entry... + - The val1 and val2 attribute shall be of type `DATA_T_INTEGER`. + - The sim attribute shall be of type `DATA_T_DOUBLE`.
+ +### Cluster Get Attribute Value +```c +int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val, pObjTrxTree* _); +``` +`clusterGetAttrValue()` shall... +- If the given datatype does not match that returned from `clusterGetAttrType()`, the function shall produce an error. +- Requesting the name attribute shall produce the following values, depending on the target: + - If the target is root, the name in the driver struct (aka. the one specified in the .cluster file) shall be produced. + - If the target is a cluster or cluster entry, the name of the cluster shall be produced. + - If the target is a search or search entry, the name of the search shall be produced. +- Requesting the annotation shall produce some string describing the driver. +- Requesting the outer_type shall produce "system/row". +- Requesting the inner_type or content_type shall produce "system/void". (All path elements are consumed.) +- If the target is root... + - Requesting source shall produce the source path. + - Requesting attr_name shall produce the attribute name. +- If the target is a cluster... + - Requesting algorithm shall produce the name of the clustering algorithm. + - Requesting similarity_measure shall produce the name of the similarity measure. + - Requesting num_clusters shall produce the number of clusters. + - Requesting max_iterations shall produce the maximum number of iterations. + - Requesting improvement_threshold shall produce the minimum improvement threshold. + - Requesting average_similarity shall produce the average size of clusters, running clustering / searching algorithms, if necessary. +- If the target is a search... + - Requesting source shall produce the name of the source cluster for the search. + - Requesting similarity_measure shall produce the name of the similarity measure. + - Requesting threshold shall produce the filtering threshold. +- If the target is a cluster entry... 
+ - Requesting val shall produce the value of the data point in this cluster. + - Requesting sim shall produce the similarity of the data point to the center of the cluster. +- If the target is a search entry... + - Requesting val1 or val2 shall produce the first and second value (respectively) detected in this search. + - Requesting sim shall produce the similarity of these two data points. + + +### Cluster Get First Attribute +```c +char* clusterGetFirstAttr(void* inf_v, pObjTrxTree oxt); +``` +`clusterGetFirstAttr()` shall... +- Reset the current attribute index on the driver struct to 0. +- Return the value of invoking `clusterGetNextAttr()`. + +### Cluster Get Next Attribute +```c +char* clusterGetNextAttr(void* inf_v, pObjTrxTree oxt); +``` +`clusterGetNextAttr()` shall... +- Return the attribute name at the attribute index given by the driver struct in the list of attributes based on the target type. +- Return `NULL` if the end of the list has been reached. +- Increase the attribute index on the driver struct by 1. + +- The attribute name list when targeting root shall include "source" and "attr_name". +- The attribute name list when targeting a cluster shall include "algorithm", "similarity_measure", "num_clusters", "improvement_threshold", and "max_iterations". +- The attribute name list when targeting a search shall include "source", "threshold", and "similarity_measure". +- The attribute name list when targeting a cluster entry shall include "val" and "sim". +- The attribute name list when targeting a search entry shall include "val1", "val2", and "sim". + +### Cluster Info +```c +int clusterInfo(void* inf_v, pObjectInfo info); +``` +`clusterInfo()` shall... +- Provide the OBJ_INFO_F_CANT_ADD_ATTR flag. +- Provide the OBJ_INFO_F_CANT_HAVE_CONTENT flag. +- Provide the OBJ_INFO_F_NO_CONTENT flag. +- If the target is a root... + - Provide the OBJ_INFO_F_CAN_HAVE_SUBOBJ flag. + - Provide the OBJ_INFO_F_SUBOBJ_CNT_KNOWN flag.
+ - Provide the OBJ_INFO_F_HAS_SUBOBJ flag if there is at least one cluster or search. + - Provide the OBJ_INFO_F_NO_SUBOBJ flag otherwise. + - Provide the total number of clusters and searches as the number of subobjects. +- If the target is a cluster... + - Provide the OBJ_INFO_F_CAN_HAVE_SUBOBJ flag. + - Provide the OBJ_INFO_F_HAS_SUBOBJ flag. + - If the algorithm has been run, provide OBJ_INFO_F_SUBOBJ_CNT_KNOWN flag and the number of data points clustered as the number of subobjects. +- If the target is a search... + - Provide the OBJ_INFO_F_CAN_HAVE_SUBOBJ flag. + - If the algorithm has been run... + - Provide OBJ_INFO_F_SUBOBJ_CNT_KNOWN flag and the number of elements found by the search as the number of subobjects. + - Provide the OBJ_INFO_F_HAS_SUBOBJ flag if at least one element was found. +- If the target is a cluster entry or a search entry... + - Provide the OBJ_INFO_F_CANT_HAVE_SUBOBJ flag. \ No newline at end of file diff --git a/centrallix-sysdoc/OSDriver_Authoring.md b/centrallix-sysdoc/OSDriver_Authoring.md index d00c192f6..f679dac32 100644 --- a/centrallix-sysdoc/OSDriver_Authoring.md +++ b/centrallix-sysdoc/OSDriver_Authoring.md @@ -202,9 +202,12 @@ Using the example above, we can query from the database using a statement like ` This section describes the standard interface between the OSML and the ObjectSystem driver itself. Every driver should implement certain required functions. (**Note**: Many drivers "implement" some required functions to simply fail with a not implemented or not supported error. For example, most database drivers "implement" `Read()` and `Write()` this way because database content should be queried, not read). Various optional functions are also available, which a driver is not required to implement. 
The driver should implement an `Initialize()` function, as well as the following (* indicates required functions): | Function Name | Description @@ -332,7 +335,8 @@ The `Open()` function opens a given file to create a new driver instance. This p 5. Return a pointer to the node instance as a void pointer. This pointer will be passed as `void* inf_v` to the driver in subsequent calls involving this object (except the Query functions, discussed below). - 📖 **Note - Transactions**: If the os driver specified the `OBJDRV_C_TRANS` capability, it must respect the current state of the user's transaction. If a new object is being created, an object is being deleted, or other modifications/additions are being performed, and if the OXT layer indicates a transaction is in process, the driver must either complete the current transaction and then complete the current call, or else add the current delete/create/modify call to the transaction tree (in which case the tree item is preallocated; all the driver needs to do is fill it in). This is handled using the transaction tree parameter (`oxt : pObjTrxTree*`). The transaction later is discussed in depth in the ??? section. - + + #### Accessing the Node Object If `O_CREAT` and `O_EXCL` are both specified in `parent->Mode`, the driver should **only** create a new file and fail if the file already exists (refusing to open and read it). Otherwise, the driver should read an existing file, or create one if it does not exist and `O_CREAT` is specified, failing if no file can be read or created. @@ -540,7 +544,6 @@ The `QueryFetch()` function fetches a driver instance pointer (aka. an `inf_v` p | mode | int | The open mode for the new object, the same as `obj->Mode` in `Open()`. | oxt | pObjTrxTree* | The transaction tree pointer for the `OBJDRV_C_TRANS` capability. - The driver should add an element to the `obj->Pathname` structure to indicate the path of the returned child object. 
This will involve a process somewhat like this, where: - `new_name : char*` is the new object's name. - `qy : pMyDriversQueryInf` is the current query structure. @@ -747,7 +750,7 @@ The return value, `hints : ObjPresentationHints`, contains the following useful - `hints->MaxValue : void*`: An expression defining the maximum valid value. - `hints->EnumList : XArray`: If the attribute is a string enum, this XArray lists the valid string values. - `hints->EnumQuery : char*`: A query string which enumerates the valid values a string enum attribute. -- `hints->Format : char*`: presentation format - datetime or money +- `hints->Format : char*`: presentation format - datetime or money - `hints->AllowChars : char*`: An array of all valid characters for a string attribute, NULL to allow all characters. - `hints->BadChars : char*`: An array of all invalid characters for a string attribute. - `hints->Length : int`: The maximum length of data that can be included in a string attribute. @@ -785,7 +788,7 @@ The following macros are provided for setting style flags: - `OBJ_PH_STYLE_SEPWINDOW`: Prefer separate windows for grouped fields. - `OBJ_PH_STYLE_ALWAYSDEF`: Always reset the default value when this attribute is modified. - `OBJ_PH_STYLE_CREATEONLY`: This attribute is writeable only when created, after that it is read only. -- `OBJ_PH_STYLE_MULTISEL`: Multiple select +- `OBJ_PH_STYLE_MULTISEL`: Multiple select - `OBJ_PH_STYLE_KEY`: This attribute is a primary key. - `OBJ_PH_STYLE_APPLYCHG`: Presentation hints should be applied on DataChange instead of on DataModify. @@ -812,10 +815,12 @@ The `pObjectInfo` struct has two fields: `Flags` and `nSubobjects`. This functi - `OBJ_INFO_F_CANT_SEEK`: Seeking is not supported at all. - `OBJ_INFO_F_CAN_ADD_ATTR` / `OBJ_INFO_F_CANT_ADD_ATTR`: Indicates that the object does or does not allow attributes to be added with the [AddAttr()](#function-addattr) function. 
- `OBJ_INFO_F_SUPPORTS_INHERITANCE`: Indicates that the object supports inheritance through attributes such as `cx__inherit`. See ??? for more information about object inheritance. - + + - `OBJ_INFO_F_FORCED_LEAF`: Indicates that the object is forced to be a 'leaf' unless ls__type used. - `OBJ_INFO_F_TEMPORARY`: Indicates that this is a temporary object without a vaoid pathname. + The function returns 0 on success, and -1 to indicate an error, in which case `mssError()` should be called before returning. @@ -852,7 +857,6 @@ Although using the structure file format may be complex, it allows significant f Structure files are accessed via the st_node (SN) and stparse (SP) modules. The st_node module loads and saves the structure file heirarchies as a whole. It also manages caching to reduce disk activity and eliminate repeated parsing of the same file. The stparse module provides access to the individual attributes and groups of attributes within a node structure file. For example, if two sessions open two files, `/test1.rpt` and `/test2.rpt` the st_node module will cache the internal representations of these node object files, and for successive uses of these node objects, the physical file will not be re-parsed. The file will be re-parsed if its timestamp changes. - If the underlying object does not support the attribute "last_modification" (assumed to be the timestamp), then st_node prints a warning. In essence, this warning indicates that changes to the underlying object will not trigger the st_node module to re-read the structure file defining the node object. Otherwise, the st_node module keeps track of the timestamp, and if it changes, the node object is re-read and re-parsed. @@ -1157,7 +1161,7 @@ Frees a block of memory allocated by `nmSysMalloc()`, `nmSysRealloc()`, or `nmSy ## V Other Utility Modules - + The Centrallix library (`centralllix-lib`) has a host of useful utility modules. 
These include `xarray`, used for managing growable arrays; `xstring`, used for managing growable strings; `xhash`, used for managing hash tables with no overflow problems and variable-length keys; `expression`, used for compiling and evaluating expressions; and `mtsession`, used for managing session-level variables and reporting errors. diff --git a/centrallix-sysdoc/string_similarity.md b/centrallix-sysdoc/string_similarity.md index b9a3a28b6..33667a05c 100644 --- a/centrallix-sysdoc/string_similarity.md +++ b/centrallix-sysdoc/string_similarity.md @@ -167,4 +167,8 @@ If the clustering could be expanded with an additional step that makes clusters Several algorithms (such as [k-means++](#k-means-clustering-1), [k-medoids](#k-medoids-clustering), and [DBScan](#db-scan)) above are proposed but lack an implementation. They may be effective and useful, however, to reduce development time, they have not yet been implemented. ### Upgrade Other Duplicate Detection Systems -When a new record is entered, a quick scan is run to check if it might be a duplicate. There is also a button in the UI for a record that lets you run a duplicate check. These systems could also be upgraded using the new algorithms and strategies developed for general duplicate detection. \ No newline at end of file +When a new record is entered, a quick scan is run to check if it might be a duplicate. There is also a button in the UI for a record that lets you run a duplicate check. These systems could also be upgraded using the new algorithms and strategies developed for general duplicate detection. + +### Known Issues +- The cluster driver often fails to open the structure file if it was modified since the last time the path was opened. Opening a different path (including the root path, even though it does not support queries) fixes this issue. This is either a bug in the st_node caching or in the cluster driver's usage of stparse.
+- The cluster does not invalidate caches if the underlying data source changes. This bug exists because I wasn't sure how to do this, but I'm pretty sure it's possible. Workaround: Developers should use `exec "cache" "drop_all"` to invalidate caches when data is changed, or use a fresh object system instance. diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index 159c292b0..dd65a0c52 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -75,7 +75,7 @@ #include "obj.h" -/** TODO: I think this should be moved to datatypes. **/ +/** TODO: Greg - I think this should be moved to datatypes. **/ /** Should maybe replace duplocate functionality elsewhere. **/ static char* ci_TypeToStr(const int type) { diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index 4bf94ebe9..462d0625f 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -72,18 +72,6 @@ *** https://marketplace.visualstudio.com/items?itemName=ExodiusStudios.comment-anchors ***/ -/** Pure Laziness. **/ -// #define ENABLE_TPRINTF - -/** Debugging **/ -#ifndef ENABLE_TPRINTF -void void_func() {} -#define tprintf void_func -#endif -#ifdef ENABLE_TPRINTF -#define tprintf printf -#endif - /** Defaults for unspecified optional attributes. **/ #define DEFAULT_MIN_IMPROVEMENT 0.0001 #define DEFAULT_MAX_ITERATIONS 64u @@ -91,7 +79,7 @@ void void_func() {} /** ================ Stuff That Should Be Somewhere Else ================ **/ /** ANCHOR[id=temp] **/ -/** TODO: I think this should be moved to mtsession. **/ +/** TODO: Greg - I think this should be moved to mtsession. **/ /*** I caused at least 10 bugs so far trying to pass format specifiers to *** mssError without realizing that it didn't support them. Eventually, I *** got fed up enough with the whole thing to write the following function. 
@@ -147,7 +135,7 @@ void mssErrorf(int clr, char* module, const char* format, ...) } -/** TODO: I think this should be moved to datatypes. **/ +/** TODO: Greg - I think this should be moved to datatypes. **/ /** Should maybe replace current type parsing in the presentation hints. **/ /*** Parse the given string into a datatype. The case of the first character *** is ignored, but all other characters must be capitalized correctly. @@ -210,8 +198,8 @@ static int ci_TypeFromStr(const char* str) return -1; } -/** TODO: I think this should be moved to datatypes. **/ -/** Should maybe replace duplocate functionality elsewhere. **/ +/** TODO: Greg - I think this should be moved to datatypes. **/ +/** Should maybe replace this functionality where it appears elsewhere. **/ static char* ci_TypeToStr(const int type) { switch (type) @@ -234,7 +222,7 @@ static char* ci_TypeToStr(const int type) return "Invalid"; /* Shall not parse to a valid type in ci_TypeFromStr(). */ } -/** TODO: I think this should be moved to xarray. **/ +/** TODO: Greg - I think this should be moved to xarray. **/ /*** Trims an xArray, returning a new array (with nmSysMalloc). *** *** @param arr The array to be trimmed. @@ -803,7 +791,6 @@ static bool ci_TryHint(char* value, char** valid_values, const unsigned int n_va /** Issue hint. **/ ci_GiveHint(guess); - tprintf(" > Similarity: %.4g\n", ca_lev_compare(value, guess)); return true; } @@ -816,9 +803,9 @@ static bool ci_TryHint(char* value, char** valid_values, const unsigned int n_va *** *** @attention - Promises that a failure invokes mssError() at least once. *** - *** TODO: Greg - Review Carefully. - *** This function took a lot of debugging to get it to work. Please make sure - *** it works correctly and properly requires runserver() for dynamic attributes. + *** TODO: Greg - Review carefully. I think this code is the reason that runserver() + *** is NOT REQUIRED for dynamic attributes in the cluster driver. 
I had to debug + *** and rewrite this for ages and it uses several functions I don't 100% understand. ***/ static int ci_ParseAttribute( pStructInf inf, @@ -830,7 +817,6 @@ static int ci_ParseAttribute( bool print_type_error) { int ret; - tprintf("Invoking ci_ParseAttribute('%s').\n", attr_name); /** Get attribute inf. **/ pStructInf attr_info = stLookup(inf, attr_name); @@ -1035,10 +1021,8 @@ static pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, if (source_maybe != NULL) { /** Cache hit. **/ - tprintf("# source: \"%s\"\n", source_data->Key); /** Cause an immediate invalid read if cache was incorrectly freed. **/ - tprintf("--> Name: %s\n", source_maybe->Name); /** Free data we don't need. **/ nmSysFree(source_data->Key); @@ -1049,7 +1033,6 @@ static pSourceData ci_ParseSourceData(pStructInf inf, pParamObjects param_list, } /** Cache miss: Add the new object to the cache for next time. **/ - tprintf("+ source: \"%s\"\n", source_data->Key); if (!check(xhAdd(&ClusterDriverCaches.SourceDataCache, source_data->Key, (void*)source_data))) goto err_free; @@ -1087,8 +1070,6 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) { int result; - tprintf("Parsing cluster: %s\n", inf->Name); - /** Extract values. **/ pParamObjects param_list = node_data->ParamList; pSourceData source_data = node_data->SourceData; @@ -1252,7 +1233,7 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) if (group_type == NULL) goto err_free_subclusters; if (strcmp(group_type, "cluster/cluster") != 0) { - fprintf(stderr, + mssErrorf(1, "Cluster", "Warning: Unknown group [\"%s\" : \"%s\"] in cluster \"%s\".\n", name, group_type, inf->Name ); @@ -1333,13 +1314,7 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) /** Check for a cached version. **/ pClusterData cluster_maybe = (pClusterData)xhLookup(&ClusterDriverCaches.ClusterDataCache, key); if (cluster_maybe != NULL) - { - /** Cache hit. 
**/ - tprintf("# cluster: \"%s\"\n", key); - - /** Cause invalid read if cache was incorrectly freed. **/ - tprintf("--> Name: %s\n", cluster_maybe->Name); - + { /* Cache hit. */ /** Free the parsed cluster that we no longer need. */ ci_FreeClusterData(cluster_data, false); nmSysFree(key); @@ -1349,7 +1324,6 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) } /** Cache miss. **/ - tprintf("+ cluster: \"%s\"\n", key); if (!check(xhAdd(&ClusterDriverCaches.ClusterDataCache, key, (void*)cluster_data))) goto err_free_key; return cluster_data; @@ -1386,9 +1360,7 @@ static pClusterData ci_ParseClusterData(pStructInf inf, pNodeData node_data) *** @returns A new pSearchData struct on success, or NULL on failure. ***/ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) - { - tprintf("Parsing search: %s\n", inf->Name); - + { /** Allocate space for search struct. **/ pSearchData search_data = check_ptr(nmMalloc(sizeof(SearchData))); if (search_data == NULL) goto err; @@ -1525,10 +1497,7 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) /** Check for a cached version. **/ pSearchData search_maybe = (pSearchData)xhLookup(search_cache, key); if (search_maybe != NULL) - { - /** Cache hit. **/ - tprintf("# search: \"%s\"\n", key); - tprintf("--> Name: %s\n", search_maybe->Name); /* Cause invalid read if cache was incorrectly freed. */ + { /* Cache hit. */ /** Free the parsed search that we no longer need. **/ ci_FreeSearchData(search_data); @@ -1539,7 +1508,6 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) } /** Cache miss. 
**/ - tprintf("+ search: \"%s\"\n", key); check(xhAdd(search_cache, key, (void*)search_data)); return search_data; @@ -1751,7 +1719,6 @@ static pNodeData ci_ParseNodeData(pStructInf inf, pObject parent) ); goto err_free_arrs; } - tprintf("Found provided value for %s of type %s\n", param->Name, ci_TypeToStr(param->Type)); /** Provided value successfully handled, we're done. **/ break; @@ -1860,7 +1827,7 @@ static void ci_FreeSourceData(pSourceData source_data) /** Guard segfault. **/ if (source_data == NULL) { - fprintf(stderr, "Call to ci_FreeSourceData(NULL);\n"); + fprintf(stderr, "Warning: Call to ci_FreeSourceData(NULL);\n"); return; } @@ -1927,7 +1894,7 @@ static void ci_FreeClusterData(pClusterData cluster_data, bool recursive) /** Guard segfault. **/ if (cluster_data == NULL) { - fprintf(stderr, "Call to ci_FreeClusterData(NULL, %s);\n", (recursive) ? "true" : "false"); + fprintf(stderr, "Warning: Call to ci_FreeClusterData(NULL, %s);\n", (recursive) ? "true" : "false"); return; } @@ -1983,7 +1950,7 @@ static void ci_FreeSearchData(pSearchData search_data) /** Guard segfault. **/ if (search_data == NULL) { - fprintf(stderr, "Call to ci_FreeSearchData(NULL);\n"); + fprintf(stderr, "Warning: Call to ci_FreeSearchData(NULL);\n"); return; } @@ -2019,7 +1986,7 @@ static void ci_FreeNodeData(pNodeData node_data) /** Guard segfault. **/ if (node_data == NULL) { - fprintf(stderr, "Call to ci_FreeNodeData(NULL);\n"); + fprintf(stderr, "Warning: Call to ci_FreeNodeData(NULL);\n"); return; } @@ -2111,6 +2078,13 @@ static void ci_ClearCaches(void) ***/ static unsigned int ci_SizeOfSourceData(pSourceData source_data) { + /** Guard segfault. 
**/ + if (source_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_SizeOfSourceData(NULL);\n"); + return 0u; + } + unsigned int size = 0u; if (source_data->Name != NULL) size += strlen(source_data->Name) * sizeof(char); if (source_data->SourcePath != NULL) size += strlen(source_data->SourcePath) * sizeof(char); @@ -2146,20 +2120,22 @@ static unsigned int ci_SizeOfSourceData(pSourceData source_data) *** @returns The size in bytes of the struct and all internal allocated data. ***/ static unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursive) - { + { + /** Guard segfault. **/ + if (cluster_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_SizeOfClusterData(NULL, %s);\n", (recursive) ? "true" : "false"); + return 0u; + } + unsigned int size = 0u; if (cluster_data->Name != NULL) size += strlen(cluster_data->Name) * sizeof(char); if (cluster_data->Clusters != NULL) { const unsigned int nVectors = cluster_data->SourceData->nVectors; for (unsigned int i = 0u; i < cluster_data->nClusters; i++) - { - const unsigned int cluster_size = cluster_data->Clusters[i].Size; - size += cluster_size * sizeof(char*); - size += cluster_size * sizeof(pVector); - } - size += nVectors * sizeof(Cluster); - size += nVectors * sizeof(double); + size += cluster_data->Clusters[i].Size * (sizeof(char*) + sizeof(pVector)); + size += nVectors * (sizeof(Cluster) + sizeof(double)); } if (cluster_data->SubClusters != NULL) { @@ -2188,6 +2164,13 @@ static unsigned int ci_SizeOfClusterData(pClusterData cluster_data, bool recursi ***/ static unsigned int ci_SizeOfSearchData(pSearchData search_data) { + /** Guard segfault. 
**/ + if (search_data == NULL) + { + fprintf(stderr, "Warning: Call to ci_SizeOfSearchData(NULL);\n"); + return 0u; + } + unsigned int size = 0u; if (search_data->Name != NULL) size += strlen(search_data->Name) * sizeof(char); if (search_data->Dups != NULL) size += search_data->nDups * (sizeof(void*) + sizeof(Dup)); @@ -2224,7 +2207,6 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) if (!check(objCurrentDate(&source_data->DateComputed))) goto end; /** Open the source path specified by the .cluster file. **/ - tprintf("Opening...\n"); pObject obj = objOpen(session, source_data->SourcePath, OBJ_O_RDONLY, 0600, "system/directory"); if (obj == NULL) { @@ -2239,7 +2221,6 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) } /** Generate a "query" for retrieving data. **/ - tprintf("Opening query...\n"); pObjQuery query = objOpenQuery(obj, NULL, NULL, NULL, NULL, 0); if (query == NULL) { @@ -2265,7 +2246,6 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) if (!check(xaInit(&vector_xarray, 64))) goto end_free_data; /** Fetch data and build vectors. **/ - tprintf("Skips: "); while (true) { pObject entry = objQueryFetch(query, O_RDONLY); @@ -2307,7 +2287,6 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) ret = objGetAttrValue(entry, source_data->NameAttr, DATA_T_STRING, POD(&data)); if (ret != 0) { - tprintf("\n"); mssErrorf(0, "Cluster", "Failed to value for %uth entry:\n" " > Attribute: ['%s':'%s' : String]\n" @@ -2326,7 +2305,6 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) /** Skip empty strings. **/ if (strlen(data) == 0) { - tprintf("_"); check(fflush(stdout)); /* Failure ignored. */ continue; } @@ -2348,7 +2326,6 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) if (ca_has_no_pairs(vector)) { /** Skip pVector with no pairs. 
**/ - tprintf("."); check(fflush(stdout)); /* Failure ignored. */ ca_free_vector(vector); continue; @@ -2391,7 +2368,6 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) ret = objGetAttrValue(entry, source_data->KeyAttr, DATA_T_STRING, POD(&key)); if (ret != 0) { - tprintf("\n"); mssErrorf(0, "Cluster", "Failed to value for key on %uth entry:\n" " > Attribute: ['%s':'%s' : String]\n" @@ -2424,7 +2400,7 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) // ret = ret; // Fall-through: Failure ignored. } } - tprintf("\nData acquired.\n"); + source_data->nVectors = vector_xarray.nItems; if (source_data->nVectors == 0) { @@ -2449,7 +2425,6 @@ static int ci_ComputeSourceData(pSourceData source_data, pObjSession session) if (source_data->Vectors == NULL) goto end_free_data; /** Success. **/ - fprintf(stderr, "[SourceData: %s] Compute done.\n", source_data->Name); successful = true; end_free_data: @@ -2539,7 +2514,6 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) { case ALGORITHM_NONE: { - tprintf("Applying no clustering...\n"); /** Put all the data into one cluster. **/ pCluster first_cluster = &cluster_data->Clusters[0]; first_cluster->Size = source_data->nVectors; @@ -2554,13 +2528,11 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) case ALGORITHM_SLIDING_WINDOW: /** Computed in each search for efficiency. **/ - tprintf("Skipping sliding window clustering...\n"); memset(cluster_data->Clusters, 0, clusters_size); break; case ALGORITHM_KMEANS: { - tprintf("Applying kmeans clustering...\n"); /** Check for unimplemented similarity measures. **/ if (cluster_data->SimilarityMeasure != SIMILARITY_COSINE) { @@ -2577,7 +2549,6 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) if (labels == NULL) goto err_free_sims; /** Run kmeans. 
**/ - tprintf("Running kmeans\n"); Timer timer_i, *timer = timer_start(timer_init(&timer_i)); const bool successful = check(ca_kmeans( source_data->Vectors, @@ -2589,7 +2560,6 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) cluster_data->Sims )); timer_stop(timer); - tprintf("Clustering done after %.4lfs.\n", timer_get(timer)); if (!successful) goto err_free_sims; /** Convert the labels into clusters. **/ @@ -2636,7 +2606,6 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) } /** Success. **/ - fprintf(stderr, "[ClusterData: %s] Compute done.\n", cluster_data->Name); return 0; err_free_sims: @@ -2692,8 +2661,6 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) /** Record the date and time. **/ if (!check(objCurrentDate(&search_data->DateComputed))) goto err; - tprintf("Invoking search.\n"); - Timer timer_i, *timer = timer_start(timer_init(&timer_i)); /** Execute the search using the specified source and comparison function. **/ pXArray dups = NULL, dups_temp = NULL; switch (search_data->SimilarityMeasure) @@ -2771,10 +2738,8 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) ); goto err_free; } - timer_stop(timer); if (dups_temp == NULL) goto err_free; else dups = dups_temp; - tprintf("Search done after %.4lfs.\n", timer_get(timer)); /** Store dups. **/ search_data->nDups = dups->nItems; @@ -2783,7 +2748,6 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) : ci_xaToTrimmedArray(dups, 2); /** Success. **/ - fprintf(stderr, "[SearchData: %s] Compute done.\n", search_data->Name); return 0; err_free: @@ -2817,7 +2781,6 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) ***/ static int ci_GetParamType(void* inf_v, const char* attr_name) { - tprintf("Call to ci_GetParamType(\"%s\")\n", attr_name); pNodeData node_data = (pNodeData)inf_v; /** Find the parameter. 
**/ @@ -2861,7 +2824,6 @@ static int ci_GetParamType(void* inf_v, const char* attr_name) ***/ static int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) { - tprintf("Call to ci_GetParamValue(\"%s\", %s)\n", attr_name, ci_TypeToStr(datatype)); pNodeData node_data = (pNodeData)inf_v; /** Find the parameter. **/ @@ -2896,7 +2858,6 @@ static int ci_GetParamValue(void* inf_v, char* attr_name, int datatype, pObjData /** Not implemented. **/ static int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData val) { - tprintf("Call to ci_SetParamValue(%s, %s)\n", attr_name, ci_TypeToStr(datatype)); mssErrorf(1, "Cluster", "SetParamValue() is not implemented because clusters are imutable."); return -1; } @@ -2923,7 +2884,7 @@ static int ci_SetParamValue(void* inf_v, char* attr_name, int datatype, pObjData ***/ void* clusterOpen(pObject parent, int mask, pContentType sys_type, char* usr_type, pObjTrxTree* oxt) { - tprintf("Warning: clusterOpen(\"%s\") is under active development.\n", ci_file_name(parent)); + /** Update statistics. **/ ClusterStatistics.OpenCalls++; /** If CREAT and EXCL are specified, exclusively create it, failing if the file already exists. **/ @@ -2973,12 +2934,10 @@ void* clusterOpen(pObject parent, int mask, pContentType sys_type, char* usr_typ driver_data->NodeData = node_data; /** Detect target from path. **/ - tprintf("Parsing node path: %d %d\n", parent->SubPtr, parent->SubCnt); parent->SubCnt = 0; char* target_name = obj_internal_PathPart(parent->Pathname, parent->SubPtr + parent->SubCnt++, 1); if (target_name == NULL) { /** Target found: Root **/ - tprintf("Found target: Root.\n"); driver_data->TargetType = TARGET_ROOT; driver_data->TargetData = (void*)driver_data->NodeData->SourceData; return (void*)driver_data; /* Success. 
*/ @@ -2992,7 +2951,6 @@ void* clusterOpen(pObject parent, int mask, pContentType sys_type, char* usr_typ /** Target found: Cluster **/ driver_data->TargetType = TARGET_CLUSTER; - tprintf("Found target cluster: %s\n", cluster->Name); /** Check for sub-clusters in the path. **/ while (true) @@ -3014,7 +2972,6 @@ void* clusterOpen(pObject parent, int mask, pContentType sys_type, char* usr_typ if (strcmp(sub_cluster->Name, path_part) != 0) continue; /** Target found: Sub-cluster **/ - tprintf("Found target sub-cluster: %s\n", sub_cluster->Name); cluster = sub_cluster; goto continue_descent; } @@ -3045,7 +3002,6 @@ void* clusterOpen(pObject parent, int mask, pContentType sys_type, char* usr_typ mssErrorf(1, "Cluster", "Unknown path part %s.", extra_data); goto err_free_node; } - tprintf("Found target search: %s %d %d\n", search->Name, parent->SubPtr, parent->SubCnt); return (void*)driver_data; /* Success. */ } @@ -3088,7 +3044,6 @@ void* clusterOpen(pObject parent, int mask, pContentType sys_type, char* usr_typ ***/ int clusterClose(void* inf_v, pObjTrxTree* oxt) { - tprintf("Warning: clusterClose() is under active development.\n"); pDriverData driver_data = (pDriverData)inf_v; ClusterStatistics.CloseCalls++; @@ -3124,7 +3079,6 @@ int clusterClose(void* inf_v, pObjTrxTree* oxt) void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt) { ClusterStatistics.OpenQueryCalls++; - tprintf("Warning: clusterOpenQuery() is under active development.\n"); pClusterQuery cluster_query = check_ptr(nmMalloc(sizeof(ClusterQuery))); if (cluster_query == NULL) return NULL; cluster_query->DriverData = (pDriverData)inf_v; @@ -3148,10 +3102,11 @@ void* clusterOpenQuery(void* inf_v, pObjQuery query, pObjTrxTree* oxt) void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) { int ret; - ClusterStatistics.FetchCalls++; -// tprintf("Warning: clusterQueryFetch() is under active development.\n"); pClusterQuery cluster_query = (pClusterQuery)qy_v; + /** Update 
statistics. **/ + ClusterStatistics.FetchCalls++; + /** Ensure that the data being fetched exists and is computed. **/ TargetType target_type = cluster_query->DriverData->TargetType, new_target_type; unsigned int data_amount = 0u; @@ -3199,10 +3154,6 @@ void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) mssErrorf(1, "Cluster", "Unknown target type %u.", target_type); goto err; } - tprintf("Fetch Index: %u/16 (total: %u)\n", cluster_query->RowIndex, data_amount); - - /** Cap results to 16 for faster debugging. TODO: Israel - Remove. **/ -// data_amount = min(data_amount, 16); /** Check that the requested data exists, returning null if we've reached the end of the data. **/ if (cluster_query->RowIndex >= data_amount) return NULL; @@ -3233,9 +3184,7 @@ void* clusterQueryFetch(void* qy_v, pObject obj, int mode, pObjTrxTree* oxt) *** @returns 0, success. ***/ int clusterQueryClose(void* qy_v, pObjTrxTree* oxt) - { -// tprintf("Warning: clusterQueryClose() is under active development.\n"); - + { nmFree(qy_v, sizeof(ClusterQuery)); return 0; } @@ -3254,6 +3203,8 @@ int clusterQueryClose(void* qy_v, pObjTrxTree* oxt) int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) { pDriverData driver_data = (pDriverData)inf_v; + + /** Update statistics. **/ ClusterStatistics.GetTypeCalls++; /** Guard possible segfault. **/ @@ -3266,10 +3217,6 @@ int clusterGetAttrType(void* inf_v, char* attr_name, pObjTrxTree* oxt) /** Performance shortcut for frequently requested attributes: key1, key2, and sim. **/ if (attr_name[0] == 'k' || attr_name[0] == 's') goto handle_targets; - /** Debug info. **/ - if (oxt == NULL) tprintf(" > "); - tprintf("Call to clusterGetAttrType(%s)\n", attr_name); - /** Types for general attributes. 
**/ if (strcmp(attr_name, "name") == 0 || strcmp(attr_name, "annotation") == 0 @@ -3374,9 +3321,6 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val || (attr_name[0] == 's' && datatype == DATA_T_DOUBLE) /* sim : double */ ) goto handle_targets; - /** Debug info. **/ - tprintf("Call to clusterGetAttrValue(%s)\n", attr_name); - /** Type check. **/ const int expected_datatype = clusterGetAttrType(inf_v, attr_name, NULL); if (datatype != expected_datatype) @@ -3665,7 +3609,6 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val ***/ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pObjTrxTree* oxt) { - tprintf("Warning: clusterPresentationHints(\"%s\") is under active development.", attr_name); pDriverData driver_data = (pDriverData)inf_v; /** Malloc presentation hints struct. **/ @@ -3795,7 +3738,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Min and max values. **/ hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); - char buf[4u]; + char buf[8]; snprintf(buf, sizeof(buf), "%d", nClusteringAlgorithms); hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); @@ -3825,7 +3768,7 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb /** Min and max values. 
**/ hints->MinValue = expCompileExpression("0", tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); - char buf[4u]; + char buf[8]; snprintf(buf, sizeof(buf), "%d", nSimilarityMeasures); hints->MaxValue = expCompileExpression(buf, tmp_list, MLX_F_ICASE | MLX_F_FILENAMES, 0); @@ -3962,7 +3905,6 @@ pObjPresentationHints clusterPresentationHints(void* inf_v, char* attr_name, pOb ***/ char* clusterGetFirstAttr(void* inf_v, pObjTrxTree* oxt) { - tprintf("Warning: clusterGetFirstAttr() is under active development.\n"); pDriverData driver_data = (pDriverData)inf_v; driver_data->TargetAttrIndex = 0u; return clusterGetNextAttr(inf_v, oxt); @@ -3981,10 +3923,8 @@ char* clusterGetFirstAttr(void* inf_v, pObjTrxTree* oxt) ***/ char* clusterGetNextAttr(void* inf_v, pObjTrxTree* oxt) { - tprintf("Warning: clusterGetNextAttr("); pDriverData driver_data = (pDriverData)inf_v; const unsigned int i = driver_data->TargetAttrIndex++; - tprintf("%u) is under active development.\n", i); switch (driver_data->TargetType) { case TARGET_ROOT: return ATTR_ROOT[i]; @@ -4009,7 +3949,6 @@ char* clusterGetNextAttr(void* inf_v, pObjTrxTree* oxt) ***/ int clusterInfo(void* inf_v, pObjectInfo info) { - tprintf("Warning: clusterInfo() is under active development.\n"); pDriverData driver_data = (pDriverData)inf_v; pNodeData node_data = (pNodeData)driver_data->NodeData; @@ -4073,7 +4012,6 @@ int clusterInfo(void* inf_v, pObjectInfo info) goto err; } - tprintf("Info result: "INT_TO_BINARY_PATTERN"\n", INT_TO_BINARY(info->Flags)); return 0; err: @@ -4097,7 +4035,6 @@ int clusterInfo(void* inf_v, pObjectInfo info) ***/ char* clusterGetFirstMethod(void* inf_v, pObjTrxTree* oxt) { - tprintf("Warning: clusterGetFirstMethod() is under active development.\n"); pDriverData driver_data = (pDriverData)inf_v; driver_data->TargetMethodIndex = 0u; return clusterGetNextMethod(inf_v, oxt); @@ -4116,7 +4053,6 @@ char* clusterGetFirstMethod(void* inf_v, pObjTrxTree* oxt) ***/ char* clusterGetNextMethod(void* inf_v, pObjTrxTree* 
oxt) { - tprintf("Warning: clusterGetNextMethod() is under active development."); pDriverData driver_data = (pDriverData)inf_v; return METHOD_NAME[driver_data->TargetMethodIndex++]; } @@ -4197,7 +4133,6 @@ static void ci_CacheFreeSourceData(pXHashEntry entry, void* path) if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return; /** Free data. **/ - tprintf("- source: \"%s\"\n", key); ci_FreeSourceData(source_data); nmSysFree(key); } @@ -4215,7 +4150,6 @@ static void ci_CacheFreeCluster(pXHashEntry entry, void* path) if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return; /** Free data. **/ - tprintf("- cluster: \"%s\"\n", key); ci_FreeClusterData(cluster_data, false); nmSysFree(key); } @@ -4233,7 +4167,6 @@ static void ci_CacheFreeSearch(pXHashEntry entry, void* path) if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return; /** Free data. **/ - tprintf("- search: \"%s\"\n", key); ci_FreeSearchData(search_data); nmSysFree(key); } @@ -4249,7 +4182,6 @@ static void ci_CacheFreeSearch(pXHashEntry entry, void* path) ***/ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrxTree* oxt) { - tprintf("Warning: clusterExecuteMethod(\"%s\") is under active development.\n", method_name); pDriverData driver_data = (pDriverData)inf_v; /** Cache management method. **/ @@ -4266,7 +4198,7 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx goto err; } - /** show and show_all. **/ + /** 'show' and 'show_all'. **/ bool show = false; if (strcmp(param->String, "show") == 0) { @@ -4323,18 +4255,11 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx return ret; } - /** drop_all. **/ + /** 'drop_all'. **/ if (strcmp(param->String, "drop_all") == 0) { - /** Print info. **/ - printf("\nDropping cache for "); - if (path != NULL) printf("\"%s\":\n", path); - else printf("all files:\n"); - - /** Free caches. 
**/ + printf("\nDropping cache for all files:\n"); ci_ClearCaches(); - - tprintf("Cache dropped.\n"); return 0; } @@ -4348,29 +4273,19 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx if (strcmp(method_name, "stat") == 0) { - unsigned long long ExpectedOpenCalls = 10666; - unsigned long long ExpectedOpenQueryCalls = 10665; - unsigned long long ExpectedFetchCalls = 3368007; - unsigned long long ExpectedCloseCalls = 3368007; - unsigned long long ExpectedGetTypeCalls = 26664164; - unsigned long long ExpectedGetValCalls = 15021419; - unsigned long long ExpectedGetValCalls_name = 3368008; - unsigned long long ExpectedGetValCalls_key1 = 3357342; - unsigned long long ExpectedGetValCalls_key2 = 1574; - unsigned long long ExpectedGetValCalls_sim = 8283829; char buf[12]; printf("Cluster Driver Statistics:\n"); - printf(" Stat Name Value\n"); - printf(" OpenCalls %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.OpenCalls), snprint_llu(buf, sizeof(buf), ExpectedOpenCalls), ClusterStatistics.OpenCalls / ExpectedOpenCalls * 100.0); - printf(" OpenQueryCalls %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.OpenQueryCalls), snprint_llu(buf, sizeof(buf), ExpectedOpenQueryCalls), ClusterStatistics.OpenQueryCalls / ExpectedOpenQueryCalls * 100.0); - printf(" FetchCalls %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.FetchCalls), snprint_llu(buf, sizeof(buf), ExpectedFetchCalls), ClusterStatistics.FetchCalls / ExpectedFetchCalls * 100.0); - printf(" CloseCalls %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.CloseCalls), snprint_llu(buf, sizeof(buf), ExpectedCloseCalls), ClusterStatistics.CloseCalls / ExpectedCloseCalls * 100.0); - printf(" GetTypeCalls %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetTypeCalls), snprint_llu(buf, sizeof(buf), ExpectedGetTypeCalls), ClusterStatistics.GetTypeCalls / ExpectedGetTypeCalls * 100.0); 
- printf(" GetValCalls %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls), snprint_llu(buf, sizeof(buf), ExpectedGetValCalls), ClusterStatistics.GetValCalls / ExpectedGetValCalls * 100.0); - printf(" GetValCalls_name %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_name), snprint_llu(buf, sizeof(buf), ExpectedGetValCalls_name), ClusterStatistics.GetValCalls_name / ExpectedGetValCalls_name * 100.0); - printf(" GetValCalls_key1 %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_key1), snprint_llu(buf, sizeof(buf), ExpectedGetValCalls_key1), ClusterStatistics.GetValCalls_key1 / ExpectedGetValCalls_key1 * 100.0); - printf(" GetValCalls_key2 %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_key2), snprint_llu(buf, sizeof(buf), ExpectedGetValCalls_key2), ClusterStatistics.GetValCalls_key2 / ExpectedGetValCalls_key2 * 100.0); - printf(" GetValCalls_sim %10s / %10s (%.4g%%)\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_sim), snprint_llu(buf, sizeof(buf), ExpectedGetValCalls_sim), ClusterStatistics.GetValCalls_sim / ExpectedGetValCalls_sim * 100.0); + printf(" Stat Name Value\n"); + printf(" OpenCalls %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.OpenCalls)); + printf(" OpenQueryCalls %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.OpenQueryCalls)); + printf(" FetchCalls %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.FetchCalls)); + printf(" CloseCalls %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.CloseCalls)); + printf(" GetTypeCalls %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetTypeCalls)); + printf(" GetValCalls %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls)); + printf(" GetValCalls_name %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_name)); + printf(" GetValCalls_key1 %8s\n", snprint_llu(buf, sizeof(buf), 
ClusterStatistics.GetValCalls_key1)); + printf(" GetValCalls_key2 %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_key2)); + printf(" GetValCalls_sim %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_sim)); return 0; } @@ -4473,7 +4388,7 @@ int clusterInitialize(void) if (check_ptr(strcpy(drv->Name, "cluster - Clustering Driver")) == NULL) goto err; if (!check(xaInit(&drv->RootContentTypes, 1))) goto err; if (!check_neg(xaAddItem(&drv->RootContentTypes, "system/cluster"))) goto err; - drv->Capabilities = 0; /* TODO: Greg - Should I add any of these? */ + drv->Capabilities = 0; /* TODO: Greg - Should I indicate any capabilities? */ /** Setup the function references. **/ drv->Open = clusterOpen; @@ -4523,6 +4438,6 @@ int clusterInitialize(void) /** Error cleanup. **/ err: if (drv != NULL) nmFree(drv, sizeof(ObjDriver)); - fprintf(stderr, "Error: Failed to initialize cluster driver.\n"); + mssErrorf(1, "Cluster", "Failed to initialize cluster driver.\n"); return -1; } From 81a1d2fd3c46edd7b511f877afb60071f73497fe Mon Sep 17 00:00:00 2001 From: Israel Date: Fri, 14 Nov 2025 12:05:39 -0700 Subject: [PATCH 10/30] Clean up unintended usage of glyph.h --- centrallix-lib/src/clusters.c | 124 ---------------------------------- 1 file changed, 124 deletions(-) diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index 4a96b6ca1..ef2336269 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -42,7 +42,6 @@ #include #include "clusters.h" -#include "glyph.h" #include "newmalloc.h" #include "util.h" #include "xarray.h" @@ -199,108 +198,6 @@ pVector ca_build_vector(const char* str) return trimmed_sparse_vector; } -// Build vector by converting a dense vector to a sparse one. -//pVector ca_build_vector_old(const char* str) -// { -// /** Allocate space for a dense vector. **/ -// unsigned int dense_vector[CA_NUM_DIMS] = {0u}; -// -// /** j is the former character, i is the latter. 
**/ -// const unsigned int num_chars = (unsigned int)strlen(str); -// for (unsigned int j = 65535u, i = 0u; i <= num_chars; i++) -// { -// if (isspace(str[i])) continue; -// if (ispunct(str[i]) && str[i] != CA_BOUNDARY_CHAR) continue; -// -// /** First and last character should fall one before 'a' in the ASCII table. **/ -// unsigned int temp1 = (j == 65535u) ? CA_BOUNDARY_CHAR : (unsigned int)tolower(str[j]); -// unsigned int temp2 = (i == num_chars) ? CA_BOUNDARY_CHAR : (unsigned int)tolower(str[i]); -// -// /** Shift numbers to the end of the lowercase letters. **/ -// if ('0' <= temp1 && temp1 <= '9') temp1 += 75u; -// if ('0' <= temp2 && temp2 <= '9') temp2 += 75u; -// -// /** Hash the character pair into an index (dimension). **/ -// /** Note that temp will be between 97 ('a') and 132 ('9'). **/ -// unsigned int dim = hash_char_pair(temp1, temp2); -// -// /** Increment the dimension of the dense vector by a number from 1 to 13. **/ -// dense_vector[dim] += (temp1 + temp2) % 13u + 1u; -// -// j = i; -// } -// -// /** Count how much space is needed for a sparse vector. **/ -// bool zero_prev = false; -// size_t size = 0u; -// for (unsigned int dim = 0u; dim < CA_NUM_DIMS; dim++) -// { -// if (dense_vector[dim] == 0u) -// { -// size += (zero_prev) ? 0u : 1u; -// zero_prev = true; -// } -// else -// { -// size++; -// zero_prev = false; -// } -// } -// -// /*** Check compression size. -// *** If this check fails, I doubt anything will break. However, the longest -// *** word I know (supercalifragilisticexpialidocious) has only 35 character -// *** pairs, so it shouldn't reach half this size (and it'd be even shorter -// *** if the hash generates at least one collision). -// *** -// *** Bad vector compression will result in degraded performace and increased -// *** memory usage. This indicates a likely bug in the code. Thus, if this -// *** warning is ever generated, it is definitely worth investigating. 
-// ***/ -// const size_t expected_max_size = 256u; -// if (size > expected_max_size) -// { -// fprintf(stderr, -// "cli_build_vector(\"%s\") - Warning: Sparse vector larger than expected.\n" -// " > Size: %lu\n" -// " > #Dims: %u\n", -// str, -// size, -// CA_NUM_DIMS -// ); -// } -// -// /** Allocate space for sparse vector. **/ -// const size_t sparse_vector_size = size * sizeof(int); -// pVector sparse_vector = (pVector)check_ptr(nmSysMalloc(sparse_vector_size)); -// if (sparse_vector == NULL) return NULL; -// -// /** Convert the dense vector above to a sparse vector. **/ -// unsigned int dim = 0u, sparse_idx = 0u; -// while (dim < CA_NUM_DIMS) -// { -// if (dense_vector[dim] == 0u) -// { -// /** Count and store consecutive zeros, skipping the first one. **/ -// unsigned int zero_count = 1u; -// dim++; -// while (dim < CA_NUM_DIMS && dense_vector[dim] == 0u) -// { -// zero_count++; -// dim++; -// } -// sparse_vector[sparse_idx++] = (int)-zero_count; -// } -// else -// { -// /** Store the value. **/ -// sparse_vector[sparse_idx++] = (int)dense_vector[dim++]; -// } -// } -// -// return sparse_vector; -// } - /*** Free memory allocated to store a sparse vector. *** *** @param sparse_vector The sparse vector being freed. @@ -837,17 +734,10 @@ int ca_kmeans( } } - /** Setup debug visualizations. **/ - glyph_init(iter, "\n", 1, false); - glyph_init(find, ".", 64, false); - glyph_init(update_label, "!", 16, false); - glyph_init(update_centroid, ":", 8, false); - /** Main kmeans loop. **/ double old_average_cluster_size = 1.0; for (unsigned int iter = 0u; iter < max_iter; iter++) { - glyph(iter); bool changed = false; /** Reset new centroids. **/ @@ -861,7 +751,6 @@ int ca_kmeans( /** Assign each point to the nearest centroid. 
**/ for (unsigned int i = 0u; i < num_vectors; i++) { - glyph(find); const pVector vector = vectors[i]; double min_dist = DBL_MAX; unsigned int best_centroid_label = 0u; @@ -880,7 +769,6 @@ int ca_kmeans( /** Update label to new centroid, if necessary. **/ if (labels[i] != best_centroid_label) { - glyph(update_label); labels[i] = best_centroid_label; changed = true; } @@ -902,7 +790,6 @@ int ca_kmeans( /** Update centroids. **/ for (unsigned int i = 0u; i < num_clusters; i++) { - glyph(update_centroid); if (cluster_counts[i] == 0u) continue; pCentroid centroid = centroids[i]; const pCentroid new_centroid = new_centroids[i]; @@ -926,8 +813,6 @@ int ca_kmeans( vector_sims[i] = sparse_similarity_to_centroid(vectors[i], centroids[labels[i]]); } - glyph_print("\n"); - /** Success. **/ successful = true; @@ -1028,25 +913,17 @@ pXArray ca_sliding_search( if (dups == NULL) goto err; } const int num_starting_dups = dups->nItems; - - /** Setup debug visualizations. **/ - glyph_init(outer, " ", 4, true); - glyph_init(inner, ".", 128, false); - glyph_init(find, "!", 32, false); /** Search for dups. **/ for (unsigned int i = 0u; i < num_data; i++) { - glyph(outer); const unsigned int window_start = i + 1u; const unsigned int window_end = min(i + window_size, num_data); for (unsigned int j = window_start; j < window_end; j++) { - glyph(inner); const double sim = similarity(data[i], data[j]); if (sim > threshold) /* Dup found! */ { - glyph(find); Dup* dup = (Dup*)check_ptr(nmMalloc(sizeof(Dup))); if (dup == NULL) goto err_free_dups; if (maybe_keys != NULL) @@ -1059,7 +936,6 @@ pXArray ca_sliding_search( } } } - glyph_print("\n"); /** Success. **/ return dups; From e624d40b242d8cd5644c05f243688eb6da13f53e Mon Sep 17 00:00:00 2001 From: Israel Date: Fri, 14 Nov 2025 16:10:00 -0700 Subject: [PATCH 11/30] Attempt to reduce issues from ambiguously signed chars. 
--- centrallix-lib/include/clusters.h | 2 +- centrallix-lib/src/clusters.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/centrallix-lib/include/clusters.h b/centrallix-lib/include/clusters.h index 05480e742..ffa1223fb 100644 --- a/centrallix-lib/include/clusters.h +++ b/centrallix-lib/include/clusters.h @@ -49,7 +49,7 @@ /// LINK ../../centrallix-sysdoc/string_comparison.md#cosine_charsets /** The character used to create a pair with the first and last characters of a string. **/ -#define CA_BOUNDARY_CHAR ('a' - 1) +#define CA_BOUNDARY_CHAR (unsigned char)('a' - 1) /** Types. **/ typedef int* pVector; /* Sparse vector. */ diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index ef2336269..ba126e5f1 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -53,7 +53,7 @@ *** @param c2 The second character in the pair. *** @returns The resulting hash. ***/ -static unsigned int hash_char_pair(const char c1, const char c2) +static unsigned int hash_char_pair(const unsigned char c1, const unsigned char c2) { const double sum = (c1 * c1 * c1) + (c2 * c2 * c2); const double scale = ((double)c1 + 1.0) / ((double)c2 + 1.0); @@ -121,12 +121,17 @@ static int charpair_cmp(const void *p1, const void *p2) ***/ pVector ca_build_vector(const char* str) { - char chars[strlen(str) + 2u]; + unsigned char chars[strlen(str) + 2u]; unsigned int num_chars = 0u; chars[num_chars++] = CA_BOUNDARY_CHAR; /* Starting boundary character. */ for (const char* char_ptr = str; *char_ptr != '\0'; char_ptr++) { - unsigned char c = *char_ptr; + char maybe_char = *char_ptr; + if (maybe_char < 0) + { + fprintf(stderr, "Warning: Unexpected negative char '%c' in string: \"%s\"\n", maybe_char, str); + } + unsigned char c = (unsigned char)maybe_char; /** Always consider boundary character in string. 
**/ if (c == CA_BOUNDARY_CHAR) goto skip_checks; @@ -175,7 +180,10 @@ pVector ca_build_vector(const char* str) /** Dividing value by 2 each time reduces the impact of repeated pairs. **/ int value = 0; for (; i < num_pairs && char_pairs[i].hash == hash; i++) - value = (value / 2) + ((unsigned int)char_pairs[i].c1 + (unsigned int)char_pairs[i].c2) % 13u + 1u; + { + value /= 2; /* Reduce impact of repeated pairs. */ + value += ((unsigned int)char_pairs[i].c1 + (unsigned int)char_pairs[i].c2) % 13u + 1u; + } /** Skip zeros to reach the dimension index specified by the hash. **/ unsigned int num_zeros = hash - dim; From b0e000bfa58535b3a73b79e4f2dcec0e60f3553b Mon Sep 17 00:00:00 2001 From: Israel Date: Mon, 17 Nov 2025 10:43:37 -0700 Subject: [PATCH 12/30] All tests now pass. --- centrallix-lib/src/clusters.c | 7 +- centrallix/tests/test_cos_compare_00.cmp | 18 +- centrallix/tests/test_cos_compare_00.to | 23 ++- .../tests/test_expfn_double_metaphone_00.to | 161 ------------------ ...one_00.cmp => test_expfn_metaphone_00.cmp} | 0 centrallix/tests/test_expfn_metaphone_00.to | 161 ++++++++++++++++++ centrallix/tests/test_fuzzycompare_00.cmp | 13 -- centrallix/tests/test_fuzzycompare_00.to | 15 -- centrallix/tests/test_lev_compare_00.cmp | 23 +++ centrallix/tests/test_lev_compare_00.to | 28 +++ centrallix/tests/test_levenshtein_00.cmp | 24 ++- centrallix/tests/test_levenshtein_00.to | 29 +++- centrallix/tests/test_similarity_00.cmp | 5 - centrallix/tests/test_similarity_00.to | 7 - 14 files changed, 281 insertions(+), 233 deletions(-) delete mode 100644 centrallix/tests/test_expfn_double_metaphone_00.to rename centrallix/tests/{test_expfn_double_metaphone_00.cmp => test_expfn_metaphone_00.cmp} (100%) create mode 100644 centrallix/tests/test_expfn_metaphone_00.to delete mode 100644 centrallix/tests/test_fuzzycompare_00.cmp delete mode 100644 centrallix/tests/test_fuzzycompare_00.to create mode 100644 centrallix/tests/test_lev_compare_00.cmp create mode 100644 
centrallix/tests/test_lev_compare_00.to delete mode 100644 centrallix/tests/test_similarity_00.cmp delete mode 100644 centrallix/tests/test_similarity_00.to diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index ba126e5f1..84f01c535 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -127,10 +127,7 @@ pVector ca_build_vector(const char* str) for (const char* char_ptr = str; *char_ptr != '\0'; char_ptr++) { char maybe_char = *char_ptr; - if (maybe_char < 0) - { - fprintf(stderr, "Warning: Unexpected negative char '%c' in string: \"%s\"\n", maybe_char, str); - } + if (maybe_char < 0) fprintf(stderr, "Warning: Unexpected negative char '%c' in string: \"%s\"\n", maybe_char, str); unsigned char c = (unsigned char)maybe_char; /** Always consider boundary character in string. **/ @@ -181,7 +178,7 @@ pVector ca_build_vector(const char* str) int value = 0; for (; i < num_pairs && char_pairs[i].hash == hash; i++) { - value /= 2; /* Reduce impact of repeated pairs. */ + // value /= 2; /* Reduce impact of repeated pairs. 
*/ value += ((unsigned int)char_pairs[i].c1 + (unsigned int)char_pairs[i].c2) % 13u + 1u; } diff --git a/centrallix/tests/test_cos_compare_00.cmp b/centrallix/tests/test_cos_compare_00.cmp index d586365f7..2061443ac 100644 --- a/centrallix/tests/test_cos_compare_00.cmp +++ b/centrallix/tests/test_cos_compare_00.cmp @@ -1,7 +1,11 @@ -Attribute [case1]: integer 1 -Attribute [case2]: integer 1 -Attribute [case3]: integer 1 -Attribute [case4]: integer 1 -Attribute [case5]: integer 1 -Attribute [case6]: integer 1 -Attribute [case7]: integer 1 +Attribute [case1]: string "pass" +Attribute [case2]: string "pass" +Attribute [case3]: string "pass" +Attribute [case4]: string "pass" +Attribute [cynthia]: string "pass" +Attribute [timothy]: string "pass" +Attribute [lance]: string "pass" +Attribute [gregory]: string "pass" +Attribute [nathan]: string "pass" +Attribute [identical]: string "pass" +Attribute [name]: string "pass" diff --git a/centrallix/tests/test_cos_compare_00.to b/centrallix/tests/test_cos_compare_00.to index 5bf950514..f45dec13a 100644 --- a/centrallix/tests/test_cos_compare_00.to +++ b/centrallix/tests/test_cos_compare_00.to @@ -1,17 +1,24 @@ ##NAME Text Mining String Similarity with Cosine Compare -# All email addresses and phone numbers are imaginary and were fabricated for the purposes of this test +# Basic tests of cosine similarity. 
+query select case1 = condition((cos_compare('hello', 'hello') >= 0.999) and (cos_compare('hello', 'hello') <= 1.0), "pass", "fail") +query select case2 = condition((cos_compare('hello', 'zephora') <= 0.001) and (cos_compare('hello', 'zephora') >= 0.0), "pass", "fail") +query select case3 = condition((cos_compare('hello', 'hello world') <= 0.7) and (cos_compare('hello', 'hello world') >= 0.6), "pass", "fail") +query select case4 = condition((cos_compare('hello there', 'hellow there') >= 0.9) and (cos_compare('hello', 'hellow') <= 1.0), "pass", "fail") + -query select case1 = (cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") >= 0.49) and (cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") <= 0.54) +# Tests on fabricated contact information. +# All email addresses and phone numbers are imaginary and were fabricated for the purposes of this test +query select cynthia = condition((cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") >= 0.49) and (cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") <= 0.54), "pass", "fail") -query select case2 = (cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") >= 0.425) and (cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") <= 0.475) +query select timothy = condition((cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") >= 0.425) and (cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") <= 0.475), "pass", "fail") -query select case3 = (cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", 
"Gregory Freson; greatgregory@gmail.com; 720-198-5791") >= 0.35) and (cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") <= 0.40) +query select lance = condition((cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") >= 0.35) and (cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") <= 0.40), "pass", "fail") -query select case4 = (cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") >= 0.94) and (cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") <= 0.99) +query select gregory = condition((cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") >= 0.94) and (cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") <= 0.99), "pass", "fail") -query select case5 = (cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") >=0.66) and (cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") <= 0.71) +query select nathan = condition((cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") >= 0.66) and (cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") <= 0.71), "pass", "fail") -query select case6 = (cos_compare("This is an identical case", "This is an identical case") >=0.975) and (cos_compare("This is an identical case", "This is an identical case") <=1.00) +query select identical = condition((cos_compare("This is an identical case", "This is an identical case") >= 
0.975) and (cos_compare("This is an identical case", "This is an identical case") <= 1.00), "pass", "fail") -query select case7 = (cos_compare("Samuel", "Alex") >= 0.00) and (cos_compare("Samuel", "Alex") <= 0.025) +query select name = condition((cos_compare("Samuel", "Alex") >= 0.00) and (cos_compare("Samuel", "Alex") <= 0.025), "pass", "fail") diff --git a/centrallix/tests/test_expfn_double_metaphone_00.to b/centrallix/tests/test_expfn_double_metaphone_00.to deleted file mode 100644 index efd7548cc..000000000 --- a/centrallix/tests/test_expfn_double_metaphone_00.to +++ /dev/null @@ -1,161 +0,0 @@ -##NAME double_metaphone() function - -# Special thanks to the following websites for double checking the correct results: -# 1: https://words.github.io/double-metaphone -# 2: https://mainegenealogy.net/metaphone_converter.asp -# 3: https://en.toolpage.org/tool/metaphone - -# These tests were collected from the following sources: -# - Example comments in the source code of exp_double_metaphone.c -# - Maurice Aubrey's Tests* -# - Tests manually written by Israel Fuller -# - Tests written by prompting ChatGPT-5 (preview)** -# -# *Source: https://github.com/gitpan/Text-DoubleMetaphone/blob/master/t/words.txt -# **GPT-5 mini (Preview) was run in GitHub Copilot to suggest the words -# for some tests after analizing a generated coverage report. I (Israel) -# used the suggestions to write some "AI generated" test cases. -# -# For more information, see the manual test suite implementation at the -# end of the exp_double_metaphone.c file. 
- -query select result = double_metaphone("Test") -query select result = double_metaphone("Basic") -query select result = double_metaphone("Centrallix") -query select result = double_metaphone("Lawrence") -query select result = double_metaphone("Philips") -query select result = double_metaphone("Acceptingness") -query select result = double_metaphone("Supercalifragilisticexpialidocious") -query select result = double_metaphone("Suoicodilaipxecitsiligarfilacrepus") -query select result = double_metaphone("Smith") -query select result = double_metaphone("Schmidt") -query select result = double_metaphone("Snider") -query select result = double_metaphone("Schneider") -query select result = double_metaphone("Arnow") -query select result = double_metaphone("Arnoff") -query select result = double_metaphone("Accede") -query select result = double_metaphone("Accident") -query select result = double_metaphone("Actually") -query select result = double_metaphone("Arch") -query select result = double_metaphone("Artois") -query select result = double_metaphone("Bacchus") -query select result = double_metaphone("Bacci") -query select result = double_metaphone("Bajador") -query select result = double_metaphone("Bellocchio") -query select result = double_metaphone("Bertucci") -query select result = double_metaphone("Biaggi") -query select result = double_metaphone("Bough") -query select result = double_metaphone("Breaux") -query select result = double_metaphone("Broughton") -query select result = double_metaphone("Cabrillo") -query select result = double_metaphone("Caesar") -query select result = double_metaphone("Cagney") -query select result = double_metaphone("Campbell") -query select result = double_metaphone("Carlisle") -query select result = double_metaphone("Carlysle") -query select result = double_metaphone("Chemistry") -query select result = double_metaphone("Chianti") -query select result = double_metaphone("Chorus") -query select result = double_metaphone("Cough") -query 
select result = double_metaphone("Czerny") -query select result = double_metaphone("Dumb") -query select result = double_metaphone("Edgar") -query select result = double_metaphone("Edge") -query select result = double_metaphone("Filipowicz") -query select result = double_metaphone("Focaccia") -query select result = double_metaphone("Gallegos") -query select result = double_metaphone("Germanic") -query select result = double_metaphone("Ghiradelli") -query select result = double_metaphone("Ghislane") -query select result = double_metaphone("Gospel") -query select result = double_metaphone("Gough") -query select result = double_metaphone("Greek") -query select result = double_metaphone("Hochmeier") -query select result = double_metaphone("Hugh") -query select result = double_metaphone("Island") -query select result = double_metaphone("Isle") -query select result = double_metaphone("Italian") -query select result = double_metaphone("Jankelowicz") -query select result = double_metaphone("Jose") -query select result = double_metaphone("Laugh") -query select result = double_metaphone("Mac Caffrey") -query select result = double_metaphone("Mac Gregor") -query select result = double_metaphone("Manager") -query select result = double_metaphone("McHugh") -query select result = double_metaphone("McLaughlin") -query select result = double_metaphone("Michael") -query select result = double_metaphone("Middle") -query select result = double_metaphone("Orchestra") -query select result = double_metaphone("Orchid") -query select result = double_metaphone("Pinyin") -query select result = double_metaphone("Raspberry") -query select result = double_metaphone("Resnais") -query select result = double_metaphone("Rogier") -query select result = double_metaphone("Rough") -query select result = double_metaphone("Salvador") -query select result = double_metaphone("San jacinto") -query select result = double_metaphone("Schenker") -query select result = double_metaphone("Schermerhorn") -query 
select result = double_metaphone("Schlesinger") -query select result = double_metaphone("School") -query select result = double_metaphone("Schooner") -query select result = double_metaphone("Succeed") -query select result = double_metaphone("Sugar") -query select result = double_metaphone("Sugary") -query select result = double_metaphone("Tagliaro") -query select result = double_metaphone("Thames") -query select result = double_metaphone("Thomas") -query select result = double_metaphone("Thumb") -query select result = double_metaphone("Tichner") -query select result = double_metaphone("Tough") -query select result = double_metaphone("Vghee") -query select result = double_metaphone("Wachtler") -query select result = double_metaphone("Wechsler") -query select result = double_metaphone("Word") -query select result = double_metaphone("Xavier") -query select result = double_metaphone("Yankelovich") -query select result = double_metaphone("Zhao") -query select result = double_metaphone("McClellan") -query select result = double_metaphone("maurice") -query select result = double_metaphone("aubrey") -query select result = double_metaphone("cambrillo") -query select result = double_metaphone("heidi") -query select result = double_metaphone("katherine") -query select result = double_metaphone("catherine") -query select result = double_metaphone("richard") -query select result = double_metaphone("bob") -query select result = double_metaphone("eric") -query select result = double_metaphone("geoff") -query select result = double_metaphone("dave") -query select result = double_metaphone("ray") -query select result = double_metaphone("steven") -query select result = double_metaphone("bryce") -query select result = double_metaphone("randy") -query select result = double_metaphone("bryan") -query select result = double_metaphone("brian") -query select result = double_metaphone("otto") -query select result = double_metaphone("auto") -query select result = double_metaphone("Abbott") 
-query select result = double_metaphone("Back") -query select result = double_metaphone("Bacher") -query select result = double_metaphone("Charles") -query select result = double_metaphone("Ghana") -query select result = double_metaphone("Gnome") -query select result = double_metaphone("Raj") -query select result = double_metaphone("Quentin") -query select result = double_metaphone("Who") -query select result = double_metaphone("Shoemaker") -query select result = double_metaphone("Sian") -query select result = double_metaphone("Scold") -query select result = double_metaphone("Station") -query select result = double_metaphone("Match") -query select result = double_metaphone("Pizza") -query select result = double_metaphone("Agnes") -query select result = double_metaphone("Science") -query select result = double_metaphone("Van Gogh") -query select result = double_metaphone("Josef") -query select result = double_metaphone("Object") -query select result = double_metaphone("Sholz") -query select result = double_metaphone("Scharf") -query select result = double_metaphone("Kasia") -query select result = double_metaphone("Van Geller") diff --git a/centrallix/tests/test_expfn_double_metaphone_00.cmp b/centrallix/tests/test_expfn_metaphone_00.cmp similarity index 100% rename from centrallix/tests/test_expfn_double_metaphone_00.cmp rename to centrallix/tests/test_expfn_metaphone_00.cmp diff --git a/centrallix/tests/test_expfn_metaphone_00.to b/centrallix/tests/test_expfn_metaphone_00.to new file mode 100644 index 000000000..de1897c3e --- /dev/null +++ b/centrallix/tests/test_expfn_metaphone_00.to @@ -0,0 +1,161 @@ +##NAME metaphone() function + +# Special thanks to the following websites for double checking the correct results: +# 1: https://words.github.io/double-metaphone +# 2: https://mainegenealogy.net/metaphone_converter.asp +# 3: https://en.toolpage.org/tool/metaphone + +# These tests were collected from the following sources: +# - Example comments in the source code of 
exp_double_metaphone.c +# - Maurice Aubrey's Tests* +# - Tests manually written by Israel Fuller +# - Tests written by prompting ChatGPT-5 (preview)** +# +# *Source: https://github.com/gitpan/Text-DoubleMetaphone/blob/master/t/words.txt +# **GPT-5 mini (Preview) was run in GitHub Copilot to suggest the words +# for some tests after analizing a generated coverage report. I (Israel) +# used the suggestions to write some "AI generated" test cases. +# +# For more information, see the manual test suite implementation at the +# end of the exp_double_metaphone.c file. + +query select result = metaphone("Test") +query select result = metaphone("Basic") +query select result = metaphone("Centrallix") +query select result = metaphone("Lawrence") +query select result = metaphone("Philips") +query select result = metaphone("Acceptingness") +query select result = metaphone("Supercalifragilisticexpialidocious") +query select result = metaphone("Suoicodilaipxecitsiligarfilacrepus") +query select result = metaphone("Smith") +query select result = metaphone("Schmidt") +query select result = metaphone("Snider") +query select result = metaphone("Schneider") +query select result = metaphone("Arnow") +query select result = metaphone("Arnoff") +query select result = metaphone("Accede") +query select result = metaphone("Accident") +query select result = metaphone("Actually") +query select result = metaphone("Arch") +query select result = metaphone("Artois") +query select result = metaphone("Bacchus") +query select result = metaphone("Bacci") +query select result = metaphone("Bajador") +query select result = metaphone("Bellocchio") +query select result = metaphone("Bertucci") +query select result = metaphone("Biaggi") +query select result = metaphone("Bough") +query select result = metaphone("Breaux") +query select result = metaphone("Broughton") +query select result = metaphone("Cabrillo") +query select result = metaphone("Caesar") +query select result = metaphone("Cagney") +query select 
result = metaphone("Campbell") +query select result = metaphone("Carlisle") +query select result = metaphone("Carlysle") +query select result = metaphone("Chemistry") +query select result = metaphone("Chianti") +query select result = metaphone("Chorus") +query select result = metaphone("Cough") +query select result = metaphone("Czerny") +query select result = metaphone("Dumb") +query select result = metaphone("Edgar") +query select result = metaphone("Edge") +query select result = metaphone("Filipowicz") +query select result = metaphone("Focaccia") +query select result = metaphone("Gallegos") +query select result = metaphone("Germanic") +query select result = metaphone("Ghiradelli") +query select result = metaphone("Ghislane") +query select result = metaphone("Gospel") +query select result = metaphone("Gough") +query select result = metaphone("Greek") +query select result = metaphone("Hochmeier") +query select result = metaphone("Hugh") +query select result = metaphone("Island") +query select result = metaphone("Isle") +query select result = metaphone("Italian") +query select result = metaphone("Jankelowicz") +query select result = metaphone("Jose") +query select result = metaphone("Laugh") +query select result = metaphone("Mac Caffrey") +query select result = metaphone("Mac Gregor") +query select result = metaphone("Manager") +query select result = metaphone("McHugh") +query select result = metaphone("McLaughlin") +query select result = metaphone("Michael") +query select result = metaphone("Middle") +query select result = metaphone("Orchestra") +query select result = metaphone("Orchid") +query select result = metaphone("Pinyin") +query select result = metaphone("Raspberry") +query select result = metaphone("Resnais") +query select result = metaphone("Rogier") +query select result = metaphone("Rough") +query select result = metaphone("Salvador") +query select result = metaphone("San jacinto") +query select result = metaphone("Schenker") +query select result = 
metaphone("Schermerhorn") +query select result = metaphone("Schlesinger") +query select result = metaphone("School") +query select result = metaphone("Schooner") +query select result = metaphone("Succeed") +query select result = metaphone("Sugar") +query select result = metaphone("Sugary") +query select result = metaphone("Tagliaro") +query select result = metaphone("Thames") +query select result = metaphone("Thomas") +query select result = metaphone("Thumb") +query select result = metaphone("Tichner") +query select result = metaphone("Tough") +query select result = metaphone("Vghee") +query select result = metaphone("Wachtler") +query select result = metaphone("Wechsler") +query select result = metaphone("Word") +query select result = metaphone("Xavier") +query select result = metaphone("Yankelovich") +query select result = metaphone("Zhao") +query select result = metaphone("McClellan") +query select result = metaphone("maurice") +query select result = metaphone("aubrey") +query select result = metaphone("cambrillo") +query select result = metaphone("heidi") +query select result = metaphone("katherine") +query select result = metaphone("catherine") +query select result = metaphone("richard") +query select result = metaphone("bob") +query select result = metaphone("eric") +query select result = metaphone("geoff") +query select result = metaphone("dave") +query select result = metaphone("ray") +query select result = metaphone("steven") +query select result = metaphone("bryce") +query select result = metaphone("randy") +query select result = metaphone("bryan") +query select result = metaphone("brian") +query select result = metaphone("otto") +query select result = metaphone("auto") +query select result = metaphone("Abbott") +query select result = metaphone("Back") +query select result = metaphone("Bacher") +query select result = metaphone("Charles") +query select result = metaphone("Ghana") +query select result = metaphone("Gnome") +query select result = 
metaphone("Raj") +query select result = metaphone("Quentin") +query select result = metaphone("Who") +query select result = metaphone("Shoemaker") +query select result = metaphone("Sian") +query select result = metaphone("Scold") +query select result = metaphone("Station") +query select result = metaphone("Match") +query select result = metaphone("Pizza") +query select result = metaphone("Agnes") +query select result = metaphone("Science") +query select result = metaphone("Van Gogh") +query select result = metaphone("Josef") +query select result = metaphone("Object") +query select result = metaphone("Sholz") +query select result = metaphone("Scharf") +query select result = metaphone("Kasia") +query select result = metaphone("Van Geller") diff --git a/centrallix/tests/test_fuzzycompare_00.cmp b/centrallix/tests/test_fuzzycompare_00.cmp deleted file mode 100644 index baa6db1e9..000000000 --- a/centrallix/tests/test_fuzzycompare_00.cmp +++ /dev/null @@ -1,13 +0,0 @@ -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 diff --git a/centrallix/tests/test_fuzzycompare_00.to b/centrallix/tests/test_fuzzycompare_00.to deleted file mode 100644 index 78141a473..000000000 --- a/centrallix/tests/test_fuzzycompare_00.to +++ /dev/null @@ -1,15 +0,0 @@ -##NAME Levenshtein String Comparison - -query select sw1 = 1 where fuzzy_compare('hello', 'hello!', 20) >= 0 and fuzzy_compare("hello","hello!", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', 'asdfkh', 20) >= 0 and fuzzy_compare("hello","asdfkh", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', 'aaaaaaaaaaaaaaaaa', 20) >= 0 and fuzzy_compare("hello","aaaaaaaaaaaaaaaaa", 20) <= 1 -query select sw1 = 1 
where fuzzy_compare('hello', 'nope', 20) >= 0 and fuzzy_compare("hello","nope", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('below', 'hello!', 20) >= 0 and fuzzy_compare("below","hello!", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('kitten', 'smitten', 20) >= 0 and fuzzy_compare("kitten","smitten", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', 'bobbobbobbob', 20) >= 0 and fuzzy_compare("hello","bobbobbobbob", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hello', '', 20) >= 0 and fuzzy_compare("hello","", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('', '', 20) >= 0 and fuzzy_compare("","", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('blooooop', 'blob', 20) >= 0 and fuzzy_compare("blooooop","blob", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('', '!', 20) >= 0 and fuzzy_compare("","!", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('h', 'h', 20) >= 0 and fuzzy_compare("h","h", 20) <= 1 -query select sw1 = 1 where fuzzy_compare('hi', 'hi', 20) >= 0 and fuzzy_compare("hi","hi", 20) <= 1 diff --git a/centrallix/tests/test_lev_compare_00.cmp b/centrallix/tests/test_lev_compare_00.cmp new file mode 100644 index 000000000..1c295a360 --- /dev/null +++ b/centrallix/tests/test_lev_compare_00.cmp @@ -0,0 +1,23 @@ +Attribute [case1]: string "pass" +Attribute [case2]: string "pass" +Attribute [case3]: string "pass" +Attribute [case4]: string "pass" +Attribute [case5]: string "pass" +Attribute [case6]: string "pass" +Attribute [case7]: string "pass" +Attribute [case8]: string "pass" +Attribute [case9]: string "pass" +Attribute [case10]: string "pass" +Attribute [case11]: string "pass" +Attribute [case12]: string "pass" +Attribute [case13]: string "pass" +Attribute [case14]: string "pass" +Attribute [case15]: string "pass" +Attribute [case16]: string "pass" +Attribute [case17]: string "pass" +Attribute [case18]: string "pass" +Attribute [case19]: string "pass" +Attribute [case20]: string "pass" +Attribute [case21]: string 
"pass" +Attribute [case22]: string "pass" +Attribute [case23]: string "pass" diff --git a/centrallix/tests/test_lev_compare_00.to b/centrallix/tests/test_lev_compare_00.to new file mode 100644 index 000000000..5d9cec0f7 --- /dev/null +++ b/centrallix/tests/test_lev_compare_00.to @@ -0,0 +1,28 @@ +##NAME Levenshtein String Comparison + +# Legacy tests. +query select case1 = condition(lev_compare('hello', 'hello!') >= 0 and lev_compare('hello','hello!') <= 1, 'pass', 'fail') +query select case2 = condition(lev_compare('hello', 'asdfkh') >= 0 and lev_compare('hello','asdfkh') <= 1, 'pass', 'fail') +query select case3 = condition(lev_compare('hello', 'aaaaaaaaaaaaaaaaa') >= 0 and lev_compare('hello','aaaaaaaaaaaaaaaaa') <= 1, 'pass', 'fail') +query select case4 = condition(lev_compare('hello', 'nope') >= 0 and lev_compare('hello', 'nope') <= 1, 'pass', 'fail') +query select case5 = condition(lev_compare('below', 'hello!') >= 0 and lev_compare('below', 'hello!') <= 1, 'pass', 'fail') +query select case6 = condition(lev_compare('kitten', 'smitten') >= 0 and lev_compare('kitten', 'smitten') <= 1, 'pass', 'fail') +query select case7 = condition(lev_compare('hello', 'bobbobbobbob') >= 0 and lev_compare('hello', 'bobbobbobbob') <= 1, 'pass', 'fail') +query select case8 = condition(lev_compare('hello', '') >= 0 and lev_compare('hello', '') <= 1, 'pass', 'fail') +query select case9 = condition(lev_compare('', '') >= 0 and lev_compare('', '') <= 1, 'pass', 'fail') +query select case10 = condition(lev_compare('blooooop', 'blob') >= 0 and lev_compare('blooooop', 'blob') <= 1, 'pass', 'fail') +query select case11 = condition(lev_compare('', '!') >= 0 and lev_compare('','!') <= 1, 'pass', 'fail') +query select case12 = condition(lev_compare('h', 'h') >= 0 and lev_compare('h','h') <= 1, 'pass', 'fail') +query select case13 = condition(lev_compare('hi', 'hi') >= 0 and lev_compare('hi','hi') <= 1, 'pass', 'fail') + +# Kitten tests. 
+query select case14 = condition(lev_compare('kitten', 'kitten') >= 0.99 and lev_compare('kitten', 'kitten') <= 1.0, 'pass', 'fail') -- 0 edits +query select case15 = condition(lev_compare('kitten', 'skitten') >= 0.8 and lev_compare('kitten', 'skitten') <= 0.9, 'pass', 'fail') -- 1 insert +query select case16 = condition(lev_compare('kitten', 'itten') >= 0.8 and lev_compare('kitten', 'itten') <= 0.9, 'pass', 'fail') -- 1 delete +query select case17 = condition(lev_compare('kitten', 'mitten') >= 0.8 and lev_compare('kitten', 'mitten') <= 0.9, 'pass', 'fail') -- 1 replace +query select case18 = condition(lev_compare('kitten', 'smitten') >= 0.7 and lev_compare('kitten', 'smitten') <= 0.8, 'pass', 'fail') -- 1 insert and one replace +query select case19 = condition(lev_compare('kitten', 'iktten') >= 0.8 and lev_compare('kitten', 'iktten') <= 0.9, 'pass', 'fail') -- 1 transpose +query select case20 = condition(lev_compare('kitten', 'kittens') >= 0.8 and lev_compare('kitten', 'kittens') <= 0.9, 'pass', 'fail') -- 1 insert (end) +query select case21 = condition(lev_compare('kitten', 'kitte') >= 0.8 and lev_compare('kitten', 'kitte') <= 0.9, 'pass', 'fail') -- 1 delete (end) +query select case22 = condition(lev_compare('kitten', 'kittem') >= 0.8 and lev_compare('kitten', 'kittem') <= 0.9, 'pass', 'fail') -- 1 replace (end) +query select case23 = condition(lev_compare('kitten', 'kittne') >= 0.8 and lev_compare('kitten', 'kittne') <= 0.9, 'pass', 'fail') -- 1 transpose (end) diff --git a/centrallix/tests/test_levenshtein_00.cmp b/centrallix/tests/test_levenshtein_00.cmp index 0bc319c9d..2a084162d 100644 --- a/centrallix/tests/test_levenshtein_00.cmp +++ b/centrallix/tests/test_levenshtein_00.cmp @@ -1,6 +1,18 @@ -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 2 -Attribute [sw1]: integer 2 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 +Attribute [case1]: string "pass" +Attribute [case2]: string "pass" +Attribute [case3]: string 
"pass" +Attribute [case4]: string "pass" +Attribute [case5]: string "pass" +Attribute [case6]: string "pass" +Attribute [case7]: string "pass" +Attribute [case8]: string "pass" +Attribute [case9]: string "pass" +Attribute [case10]: string "pass" +Attribute [case11]: string "pass" +Attribute [case12]: string "pass" +Attribute [case13]: string "pass" +Attribute [case14]: string "pass" +Attribute [case15]: string "pass" +Attribute [case16]: string "pass" +Attribute [case17]: string "pass" +Attribute [case18]: string "pass" diff --git a/centrallix/tests/test_levenshtein_00.to b/centrallix/tests/test_levenshtein_00.to index a666c3a4b..33f78e5f8 100644 --- a/centrallix/tests/test_levenshtein_00.to +++ b/centrallix/tests/test_levenshtein_00.to @@ -1,8 +1,25 @@ ##NAME Levenshtein String Comparison -query select sw1 = levenshtein('hello', 'hello!') -query select sw1 = levenshtein('kitten', 'mitten') -query select sw1 = levenshtein('kitten', 'smitten') -query select sw1 = levenshtein('lawn', 'flown') -query select sw1 = levenshtein('kitten', 'itten') -query select sw1 = levenshtein('kitten', 'skitten') +# Kitten tests. 
+query select case1 = condition(levenshtein('kitten', 'kitten') == 0, 'pass', 'fail') -- 0 edits +query select case2 = condition(levenshtein('kitten', 'skitten') == 1, 'pass', 'fail') -- 1 insert +query select case3 = condition(levenshtein('kitten', 'itten') == 1, 'pass', 'fail') -- 1 delete +query select case4 = condition(levenshtein('kitten', 'mitten') == 1, 'pass', 'fail') -- 1 replace +query select case5 = condition(levenshtein('kitten', 'smitten') == 2, 'pass', 'fail') -- 1 insert and one replace +query select case6 = condition(levenshtein('kitten', 'iktten') == 1, 'pass', 'fail') -- 1 transpose +query select case7 = condition(levenshtein('kitten', 'kittens') == 1, 'pass', 'fail') -- 1 insert (end) +query select case8 = condition(levenshtein('kitten', 'kitte') == 1, 'pass', 'fail') -- 1 delete (end) +query select case9 = condition(levenshtein('kitten', 'kittem') == 1, 'pass', 'fail') -- 1 replace (end) +query select case10 = condition(levenshtein('kitten', 'kittne') == 1, 'pass', 'fail') -- 1 transpose (end) + +# Alternate words. +query select case11 = condition(levenshtein('lawn', 'flown') == 2, 'pass', 'fail') -- 1 insert and one replace +query select case12 = condition(levenshtein('hello', 'hello!') == 1, 'pass', 'fail') -- 1 insert (end) +query select case13 = condition(levenshtein('zert', 'zerf') == 1, 'pass', 'fail') -- 1 replace (end) +query select case14 = condition(levenshtein('llearr', 'lear') == 2, 'pass', 'fail') -- 2 deletes (start & end) + +# Edge cases. +query select case15 = condition(levenshtein('', '') == 0, 'pass', 'fail') -- 0 edits +query select case16 = condition(levenshtein('This is a very long string!! I do not expect this function to need to process a string longer than this, because this string is a full 254 characters. That is pretty long. The object system limits strings to this size so we cannot make a longer string...', 'This is a very long string!! 
I do not expect this function to need to process a string longer than this, because this string is a full 254 characters. That is pretty long. The object system limits strings to this size so we cannot make a longer string...') == 0, 'pass', 'fail') -- 0 edits. +query select case17 = condition(levenshtein('This is a very long string!! I do not expect this function to need to process a string longer than this, because this string is a full 254 characters. That is pretty long. The object system limits strings to this size so we cannot make a longer string...', 'This is quite a lengthy string. I do not expect the function to compute any longer string since this one is a full 254 characters. That is plenty, even if someone adds many contact details to their record!! Thus, this test should cover most cases we see.') == 133, 'pass', 'fail') -- 133 edits. +query select case18 = condition(levenshtein('AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', 'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB') == 254, 'pass', 'fail') -- 254 replaces. 
diff --git a/centrallix/tests/test_similarity_00.cmp b/centrallix/tests/test_similarity_00.cmp deleted file mode 100644 index a0d292206..000000000 --- a/centrallix/tests/test_similarity_00.cmp +++ /dev/null @@ -1,5 +0,0 @@ -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 -Attribute [sw1]: integer 1 diff --git a/centrallix/tests/test_similarity_00.to b/centrallix/tests/test_similarity_00.to deleted file mode 100644 index a0942ab76..000000000 --- a/centrallix/tests/test_similarity_00.to +++ /dev/null @@ -1,7 +0,0 @@ -##NAME Text Mining String Similarity - -query select sw1 = (cos_compare('hello', 'hello') >= 0.999) and (cos_compare('hello', 'hello') <= 1) -query select sw1 = (cos_compare('hello', 'nancy') <= 0.001) and (cos_compare('hello', 'nancy') >= 0) -query select sw1 = (cos_compare('hello', 'hello world') <= 0.891) and (cos_compare('hello', 'hello world') >= 0.890) -query select sw1 = (cos_compare('hello', 'hellow') >= 0.935) and (cos_compare('hello', 'hellow') <= 0.936) -query select sw1 = (cos_compare('hello', 'hellow', 1) >= 0.935) and (cos_compare('hello', 'hellow', 1) <= 0.936) From 08743657d5d4aa922f85ac5c2f7f0e1ff117da79 Mon Sep 17 00:00:00 2001 From: Israel Date: Mon, 17 Nov 2025 10:50:42 -0700 Subject: [PATCH 13/30] Re-apply reduced weight for duplicate pairs (temporarily turned off last commit). Update tests to pass with this modification. --- centrallix-lib/src/clusters.c | 2 +- centrallix/tests/test_cos_compare_00.to | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index 84f01c535..7d8a225ca 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -178,7 +178,7 @@ pVector ca_build_vector(const char* str) int value = 0; for (; i < num_pairs && char_pairs[i].hash == hash; i++) { - // value /= 2; /* Reduce impact of repeated pairs. 
*/ + value /= 2; /* Reduce impact of repeated pairs. */ value += ((unsigned int)char_pairs[i].c1 + (unsigned int)char_pairs[i].c2) % 13u + 1u; } diff --git a/centrallix/tests/test_cos_compare_00.to b/centrallix/tests/test_cos_compare_00.to index f45dec13a..c5b0b1a5b 100644 --- a/centrallix/tests/test_cos_compare_00.to +++ b/centrallix/tests/test_cos_compare_00.to @@ -11,13 +11,13 @@ query select case4 = condition((cos_compare('hello there', 'hellow there') >= 0. # All email addresses and phone numbers are imaginary and were fabricated for the purposes of this test query select cynthia = condition((cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") >= 0.49) and (cos_compare("Cynthia Adams; cynthiaadams@gmail.com; 720-769-1293", "Timothy Adams; thetbear@gmail.com; 720-891-1470") <= 0.54), "pass", "fail") -query select timothy = condition((cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") >= 0.425) and (cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") <= 0.475), "pass", "fail") +query select timothy = condition((cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") >= 0.45) and (cos_compare("Timothy Adams; thetbear@gmail.com; 720-891-1470", "Lance Freson; lancetheturtle@gmail.com; 720-111-8189") <= 0.50), "pass", "fail") -query select lance = condition((cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") >= 0.35) and (cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") <= 0.40), "pass", "fail") +query select lance = condition((cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") >= 0.425) 
and (cos_compare("Lance Freson; lancetheturtle@gmail.com; 720-111-8189", "Gregory Freson; greatgregory@gmail.com; 720-198-5791") <= 0.475), "pass", "fail") query select gregory = condition((cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") >= 0.94) and (cos_compare("Gregory Freson; greatgregory@gmail.com; 720-198-5791", "Gregory Freson; greatgregory@gmail.co; 720-198-5791") <= 0.99), "pass", "fail") -query select nathan = condition((cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") >= 0.66) and (cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") <= 0.71), "pass", "fail") +query select nathan = condition((cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") >= 0.575) and (cos_compare("Nathan Mayor; nmmayor@yahoo.com; +1-800-192-9128", "Mindy Mayor; nmmayor@yahoo.com; 720-981-9149") <= 0.625), "pass", "fail") query select identical = condition((cos_compare("This is an identical case", "This is an identical case") >= 0.975) and (cos_compare("This is an identical case", "This is an identical case") <= 1.00), "pass", "fail") From 01d918aa2d4f3f3cb03fbf95d682c6da5388fca2 Mon Sep 17 00:00:00 2001 From: Israel Date: Mon, 17 Nov 2025 11:09:19 -0700 Subject: [PATCH 14/30] Clean up. 
--- centrallix-lib/src/clusters.c | 20 +++------ centrallix-os/cluster-schema.cluster | 6 +-- centrallix-os/testdir/file.cluster | 64 ---------------------------- 3 files changed, 8 insertions(+), 82 deletions(-) delete mode 100644 centrallix-os/testdir/file.cluster diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index 7d8a225ca..e0b71efaa 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -234,18 +234,17 @@ unsigned int ca_sparse_len(const pVector vector) } /*** Print the underlying implementation values sparsely allocated - *** vector (intended for debugging). + *** vector (for debugging). *** - *** @param out File to print to. *** @param vector The vector. ***/ -void ca_fprint_vector(FILE* out, const pVector vector) +void ca_print_vector(const pVector vector) { const unsigned int len = ca_sparse_len(vector); - fprintf(out, "Vector: [%d", vector[0]); + printf("Vector: [%d", vector[0]); for (unsigned int i = 1u; i < len; i++) - fprintf(out, ", %d", vector[i]); - fprintf(out, "]"); + printf(", %d", vector[i]); + printf("]"); } /*** Compute the magnitude of a sparsely allocated vector. @@ -409,9 +408,6 @@ static double sparse_similarity_to_centroid(const pVector v1, const pCentroid c2 *** *** @attention - `Complexity`: O(nm), where n and m are the lengths of str1 *** and str2 (respectively). - *** - *** @skip - *** LINK ../../centrallix-sysdoc/string_comparison.md#levenshtein ***/ unsigned int edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length) { @@ -500,9 +496,6 @@ unsigned int edit_dist(const char* str1, const char* str2, const size_t str1_len *** @param v1 A `pVector` to the first string to compare. *** @param v2 A `pVector` to the second string to compare. *** @returns The cosine similarity between the two strings. 
- *** - *** @skip - *** LINK ../../centrallix-sysdoc/string_comparison.md#cosine ***/ double ca_cos_compare(void* v1, void* v2) { @@ -532,9 +525,6 @@ double ca_cos_compare(void* v1, void* v2) *** @param str1 A `char*` to the first string to compare. *** @param str2 A `char*` to the second string to compare. *** @returns The levenshtein similarity between the two strings. - *** - *** @skip - *** LINK ../../centrallix-sysdoc/string_comparison.md#levenshtein ***/ double ca_lev_compare(void* str1, void* str2) { diff --git a/centrallix-os/cluster-schema.cluster b/centrallix-os/cluster-schema.cluster index 4113a339a..5e11cd7c2 100644 --- a/centrallix-os/cluster-schema.cluster +++ b/centrallix-os/cluster-schema.cluster @@ -4,9 +4,9 @@ file_name "system/cluster" { name "cluster/parameter" { - type : DATA_T // See datatypes.h - ?default : type - ?name : String // Overrides the name above. + type : DATA_T // See datatypes.h + ?default : type // Default value for the variable. + ?name : String // Overrides the name above. ?style : StyleObj // idk where to find docs for this. } // Access with :parameters:name. Accessing dynamic data (e.g. parameters) diff --git a/centrallix-os/testdir/file.cluster b/centrallix-os/testdir/file.cluster deleted file mode 100644 index 929efdd03..000000000 --- a/centrallix-os/testdir/file.cluster +++ /dev/null @@ -1,64 +0,0 @@ -$Version=2$ -file_name "system/cluster" - { - // Developer can specify parameters to improve file reuseability. - // TIP: Improve performance by declairing frequently used parameters first. 
- k "cluster/parameter" { type = integer; style=notnull; } - str "cluster/parameter" { type = string; } - int "cluster/parameter" { type = integer; default = runserver(:parameters:k); } - dbl "cluster/parameter" { type = double; default=4.2; } - // conversion "cluster/parameter" { type=double; default=4; } - - null_str "cluster/parameter" { type = string; default = null; } - null_int "cluster/parameter" { type = integer; default = null; } - null_dbl "cluster/parameter" { type = double; default = null; } - - // We calculate k in a centrallix script using: - // k = max(2, pow(log(n) / log(36), 3.2) - 8) - // where n is the number of records passed. - - // Specify the data source at the top of the file. - // How do we pass distinct data? Should the driver - // handle that for us? - source = "/apps/kardia/data/Kardia_DB/p_partner/rows"; - attr_name = p_given_name; // runserver(:parameters:str) - - // Clustering object specifies properties for clustering. - kmeans_cluster "cluster/cluster" - { - algorithm = "k-means"; - similarity_measure = "cosine"; - num_clusters = runserver(:parameters:k); - min_improvement = 0.0001; - max_iterations = 48; - - // Create subclusters. (Not implemented) - sub_cluster "cluster/cluster" - { - algorithm = "none"; - similarity_measure = "cosine"; - num_clusters = 7; - min_improvement = "max"; - } - } - - // Complete search. - no_clustering "cluster/cluster" - { - algorithm = "none"; - } - - dups "cluster/search" - { - source = kmeans_cluster; - threshold = 0.75; - similarity_measure = "cosine"; - } - - dups2 "cluster/search" - { - source = no_clustering; - threshold = 0.75; - similarity_measure = "cosine"; - } - } From 42a65f17a15c96fe2792268e7af6638cd285d4c3 Mon Sep 17 00:00:00 2001 From: Israel Date: Mon, 17 Nov 2025 13:34:41 -0700 Subject: [PATCH 15/30] Update licences. 
--- centrallix-lib/include/clusters.h | 7 ++++-- centrallix-lib/include/util.h | 33 +++++++++++++++---------- centrallix-lib/src/clusters.c | 7 ++++-- centrallix-lib/src/util.c | 33 +++++++++++++++---------- centrallix-sysdoc/OSDriver_Authoring.md | 31 +++++++++++++++++++++++ centrallix-sysdoc/string_similarity.md | 5 +++- 6 files changed, 85 insertions(+), 31 deletions(-) diff --git a/centrallix-lib/include/clusters.h b/centrallix-lib/include/clusters.h index ffa1223fb..c0718cea9 100644 --- a/centrallix-lib/include/clusters.h +++ b/centrallix-lib/include/clusters.h @@ -28,8 +28,11 @@ /* Module: lib_cluster.h */ /* Author: Israel Fuller */ /* Creation: September 29, 2025 */ -/* Description: Internal algorithms for the cluster object driver. */ -/* See centrallix-sysdoc/EAV_Pivot.md for more information. */ +/* Description: Clustering library used to cluster and search data with */ +/* cosine similarity and Levenshtein similarity (aka. edit */ +/* distance). Used by the "clustering driver". */ +/* For more information on how to use this library, see */ +/* string_similarity.md in the centrallix-sysdoc folder. */ /************************************************************************/ #include diff --git a/centrallix-lib/include/util.h b/centrallix-lib/include/util.h index 0f2685039..853954409 100644 --- a/centrallix-lib/include/util.h +++ b/centrallix-lib/include/util.h @@ -2,19 +2,26 @@ #define UTILITY_H /************************************************************************/ -/* Centrallix Application Server System */ -/* Centrallix Base Library */ -/* */ -/* Copyright (C) 1998-2011 LightSys Technology Services, Inc. */ -/* */ -/* You may use these files and this library under the terms of the */ -/* GNU Lesser General Public License, Version 2.1, contained in the */ -/* included file "COPYING". 
*/ -/* Module: (util.c,.h) */ -/* Author: Micah Shennum */ -/* Date: May 26, 2011 */ -/* Description: Collection of utilities */ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 1998-2011 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: util.c, util.h */ +/* Author: Micah Shennum and Israel Fuller */ +/* Date: May 26, 2011 */ +/* Description: Collection of utilities including: */ +/* - Utilities for parsing numbers. */ +/* - The timer utility for benchmarking code. */ +/* - snprint_bytes() for formatting a byte count. */ +/* - snprint_llu() for formatting large numbers. */ +/* - fprint_mem() for printing memory stats. */ +/* - min() and max() for handling numbers. */ +/* - The check functions for reliably printing debug data. */ /************************************************************************/ #ifdef __cplusplus diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index e0b71efaa..4bfce8ee6 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -1,4 +1,3 @@ - /************************************************************************/ /* Centrallix Application Server System */ /* Centrallix Core */ @@ -26,7 +25,11 @@ /* Module: lib_cluster.c */ /* Author: Israel Fuller */ /* Creation: September 29, 2025 */ -/* Description: Internal algorithms for the cluster object driver. */ +/* Description: Clustering library used to cluster and search data with */ +/* cosine similarity and Levenshtein similarity (aka. edit */ +/* distance). Used by the "clustering driver". */ +/* For more information on how to use this library, see */ +/* string_similarity.md in the centrallix-sysdoc folder. 
*/ /************************************************************************/ /** This file has additional documentation in string_similarity.md. **/ diff --git a/centrallix-lib/src/util.c b/centrallix-lib/src/util.c index f60349a74..7c234a341 100644 --- a/centrallix-lib/src/util.c +++ b/centrallix-lib/src/util.c @@ -1,17 +1,24 @@ /************************************************************************/ -/* Centrallix Application Server System */ -/* Centrallix Base Library */ -/* */ -/* Copyright (C) 1998-2011 LightSys Technology Services, Inc. */ -/* */ -/* You may use these files and this library under the terms of the */ -/* GNU Lesser General Public License, Version 2.1, contained in the */ -/* included file "COPYING". */ -/* */ -/* Module: (util.c,.h) */ -/* Author: Micah Shennum */ -/* Date: May 26, 2011 */ -/* Description: Collection of utilities */ +/* Centrallix Application Server System */ +/* Centrallix Base Library */ +/* */ +/* Copyright (C) 1998-2011 LightSys Technology Services, Inc. */ +/* */ +/* You may use these files and this library under the terms of the */ +/* GNU Lesser General Public License, Version 2.1, contained in the */ +/* included file "COPYING". */ +/* */ +/* Module: util.c, util.h */ +/* Author: Micah Shennum and Israel Fuller */ +/* Date: May 26, 2011 */ +/* Description: Collection of utilities including: */ +/* - Utilities for parsing numbers. */ +/* - The timer utility for benchmarking code. */ +/* - snprint_bytes() for formatting a byte count. */ +/* - snprint_llu() for formatting large numbers. */ +/* - fprint_mem() for printing memory stats. */ +/* - min() and max() for handling numbers. */ +/* - The check functions for reliably printing debug data. 
*/ /************************************************************************/ #include diff --git a/centrallix-sysdoc/OSDriver_Authoring.md b/centrallix-sysdoc/OSDriver_Authoring.md index f679dac32..8e58f7cee 100644 --- a/centrallix-sysdoc/OSDriver_Authoring.md +++ b/centrallix-sysdoc/OSDriver_Authoring.md @@ -1,3 +1,34 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + # ObjectSystem Driver Interface **Author**: Greg Beeley diff --git a/centrallix-sysdoc/string_similarity.md b/centrallix-sysdoc/string_similarity.md index 33667a05c..526cbe307 100644 --- a/centrallix-sysdoc/string_similarity.md +++ b/centrallix-sysdoc/string_similarity.md @@ -25,7 +25,10 @@ - + + + + - - # String Similarity -The following sections discuss the approaches to calculating similarity between two strings which are implemented in the `clusters.c` library. This library can be included using `#include "clusters.h"` in centrallix-lib and `#include "cxlib/clusters.h"` in centrallix. +The following sections discuss the approaches to calculating similarity between two strings using the `clusters.c` library. This library can be included using `#include "cxlib/clusters.h"` in the centrallix codebase (use `#include "clusters.h"` in other libaries in centrallix-lib). -## Table of Contents +## Table of Contents - [String Comparison](#string-comparison) - [Table of Contents](#table-of-contents) - [Cosine Similarity](#cosine-similarity) @@ -76,10 +56,10 @@ The following sections discuss the approaches to calculating similarity between - [Implement Missing Algorithms](#implement-missing-algorithms) -## Cosine Similarity +## Cosine Similarity The [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) function is defined as the dot product of two vectors divided by the product of the magnitude of the two vectors. Conceptually, it's like finding the _angle_ between two vectors. To get these vectors, we use the relative frequency of character pairs within each string. 
To reduce memory cost and speed up computation, we store them in a special sparsely allocated form, described below. -### Character Sets +### Character Sets Cosine compare currently uses the following character sets. These can be extended or modified later, if necessary. ```c const char ALLOW_SET[] = " \n\v\f\r!#$%&\"'()*+,-./:;<=>?@[]^_{|}~ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; @@ -97,26 +77,26 @@ const char BOUNDARY_CHAR = ('a' - 1); // aka. '`' - This character appears to have been selected to be one before the first character in `CHAR_SET` (thus convention dictates that it be written `'a' - 1` to indicate this), although it's unknown if that's the main or only reason. - If `clusters.h` is included, it can be accessed using the `CA_BOUNDARY_CHAR` macro. -### Character Pair Hashing +### Character Pair Hashing Even with a small set of ASCII characters (say 36), there are still `36^2 = 1296` possible character pairs. If the number of characters in the `CHAR_SET` ever needed to be expanded - for example, to include all UTF-8 characters - this number would quickly explode exponentially to utterly infeasible proportions. Thus, a hashing algorithm is employed to hash each character pair down to a more reasonable number of dimensions (which can be accessed with the `CA_NUM_DIMS` macro). -### String Vectors +### String Vectors Any string of characters in the `ALLOW_SET` can be represented by a vector. For simplicity, imagine this vector has only `5` dimensions. To find this vector, we hash each character pair in the string. As each character pair is hashed (for example, that the pair "ab" happens to hash to `3`), the corresponding dimension is increased by some amount. This amount varies to based on the characters in the pair, helping to mitigate the impact of collisions where different character pairs hash to identical numbers (a larger number of dimensions also helps to mitigate this). 
Remember that the first and last characters form a pair with the `BOUNDARY_CHAR`, so the string "ab" has three pairs: "a", "ab", and "b". If these each hash to `2`, `3`, and `0`. Thus, the vector generated by the string "ab" might be: `[7, 0, 4, 3, 0]`. Notice that dimensions #1 and #4 are both `0` because no character pairs generated a hash of `1` or `4`. In real usecases, the vast majority of elements are `0`s because the number of dimensions used is much larger than the number of character pairs in a typical string. -### Sparse Vectors +### Sparse Vectors As noted above, the vast majority of elements in a vector generated by a typical string are `0`s. This would lead to a large waste of memory and computation if every `0` was stored separately, so instead, vectors are stored sparsely. Because all hashes are positive integers, we represent `n` `0`s with a value of ` -n`. Thus, the vector `[0, 1, 0, 0, 0]` (representing an empty string in `5` dimensions) would be represented sparsely as `[-1, 1, -3]`. **Note**: A value of `0` in a sparse vector is undefined, so no element should be equal to `0`. **Note**: Sparse arrays can vary greatly in length. To find their size, one needs to traverse the array until the total number of values found adds up to `CA_NUM_DIMS`. The `ca_sparse_len()` function can be used to do this. Also, the `ca_build_vector()` and `ca_free_vector()` use the `nmSys` functions from `newmalloc.h` to avoid conflicts over the size of the allocated data. -### Computing Similarity +### Computing Similarity Finally, to find the cosine similarity between two strings, we can simply take the [dot product](https://en.wikipedia.org/wiki/Dot_product) of their coresponding vectors. Then, we normalize the dot product by dividing by the magnitudes of both vectors multiplied together. Two strings can be compared this way using the `ca_cos_compare()` function. 
-## Levenshtein Similarity +## Levenshtein Similarity The [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) distance is defined as the number of insertions, deletions, or substitutions required to make one string exactly like another string. The version implemented in `clusters.c` additionally allows a new operation called a "swap" in which two adjacent characters change places. Transpositions of larger pieces of text are, unfortunately, not handled as well, which is a potential downfall of using levenshtein edit distance. The levenshtein similarity of two strings can be compared using the `ca_lev_compare()` function. diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index 462d0625f..7ee0bdbba 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -1,4 +1,3 @@ - /************************************************************************/ /* Centrallix Application Server System */ /* Centrallix Core */ @@ -42,14 +41,12 @@ #include #include "cxlib/clusters.h" -#include "cxlib/mtask.h" #include "cxlib/mtsession.h" #include "cxlib/newmalloc.h" #include "cxlib/util.h" #include "cxlib/xarray.h" #include "cxlib/xhash.h" #include "expression.h" -#include "hints.h" #include "obj.h" #include "param.h" #include "st_node.h" From ee0bca7351115d3ea5c6cdbd46db7b1225ad4424 Mon Sep 17 00:00:00 2001 From: Israel Date: Wed, 19 Nov 2025 11:58:18 -0700 Subject: [PATCH 17/30] Add "show_less" option to the cache method (skips printing uncomputed caches). Fix a formatting issue with the stat method. Fix a missing include in the util.c library. 
--- centrallix-lib/src/util.c | 1 + centrallix/osdrivers/objdrv_cluster.c | 93 +++++++++++++++++++-------- 2 files changed, 67 insertions(+), 27 deletions(-) diff --git a/centrallix-lib/src/util.c b/centrallix-lib/src/util.c index cd8a3b49a..e39572f95 100644 --- a/centrallix-lib/src/util.c +++ b/centrallix-lib/src/util.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index 7ee0bdbba..8fb97b184 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -4067,7 +4067,8 @@ static int ci_PrintEntry(pXHashEntry entry, void* arg) void** args = (void**)arg; unsigned int* type_id_ptr = (unsigned int*)args[0]; unsigned int* total_bytes_ptr = (unsigned int*)args[1]; - char* path = (char*)args[2]; + unsigned long long* less_ptr = (unsigned long long*)args[2]; + char* path = (char*)args[3]; /** If a path is provided, check that it matches the start of the key. **/ if (path != NULL && strncmp(key, (char*)path, strlen((char*)path)) != 0) return 0; @@ -4081,25 +4082,46 @@ static int ci_PrintEntry(pXHashEntry entry, void* arg) case 1u: { pSourceData source_data = (pSourceData)data; + + /** Compute size. **/ + bytes = ci_SizeOfSourceData(source_data); + + /** If less is specified, skip uncomputed source. **/ + if (*less_ptr > 0llu && source_data->Vectors == NULL) goto no_print; + + /** Compute printing information. **/ type = "Source"; name = source_data->Name; - bytes = ci_SizeOfSourceData(source_data); break; } case 2u: { pClusterData cluster_data = (pClusterData)data; + + /** Compute size. **/ + bytes = ci_SizeOfClusterData(cluster_data, false); + + /** If less is specified, skip uncomputed source. **/ + if (*less_ptr > 0llu && cluster_data->Clusters == NULL) goto no_print; + + /** Compute printing information. 
**/ type = "Cluster"; name = cluster_data->Name; - bytes = ci_SizeOfClusterData(cluster_data, false); break; } case 3u: { pSearchData search_data = (pSearchData)data; + + /** Compute size. **/ + bytes = ci_SizeOfSearchData(search_data); + + /** If less is specified, skip uncomputed source. **/ + if (*less_ptr > 0llu && search_data->Dups == NULL) goto no_print; + + /** Compute printing information. **/ type = "Search"; name = search_data->Name; - bytes = ci_SizeOfSearchData(search_data); break; } default: @@ -4107,14 +4129,20 @@ static int ci_PrintEntry(pXHashEntry entry, void* arg) return -1; } - /** Increment total bytes. **/ - *total_bytes_ptr += bytes; + /** Print the cache entry data. **/ char buf[12]; snprint_bytes(buf, sizeof(buf), bytes); printf("%-8s %-16s %-12s \"%s\"\n", type, name, buf, key); + increment_total: + *total_bytes_ptr += bytes; + return 0; + + no_print: + (*less_ptr)++; + goto increment_total; } @@ -4190,14 +4218,18 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx if (param->String == NULL) { mssErrorf(1, "Cluster", - "[param : \"show\" | \"show_all\" | \"drop_all\"] is required for the cache method." + "[param : \"show\" | \"show_less\" | \"show_all\" | \"drop_all\"] is required for the cache method." ); goto err; } /** 'show' and 'show_all'. **/ bool show = false; - if (strcmp(param->String, "show") == 0) + unsigned long long skip_uncomputed = 0llu; + if (strcmp(param->String, "show_less") == 0) + /** Specify show_less to skip uncomputed caches. 
**/ + skip_uncomputed = 1ull; + if (skip_uncomputed == 1ull || strcmp(param->String, "show") == 0) { show = true; path = ci_file_path(driver_data->NodeData->Parent); @@ -4217,25 +4249,32 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx failed |= !check(xhForEach( &ClusterDriverCaches.SourceDataCache, ci_PrintEntry, - (void*[]){&i, &source_bytes, path} + (void*[]){&i, &source_bytes, (void*)&skip_uncomputed, path} )); i++; failed |= !check(xhForEach( &ClusterDriverCaches.ClusterDataCache, ci_PrintEntry, - (void*[]){&i, &cluster_bytes, path} + (void*[]){&i, &cluster_bytes, (void*)&skip_uncomputed, path} )); i++; failed |= !check(xhForEach( &ClusterDriverCaches.SearchDataCache, ci_PrintEntry, - (void*[]){&i, &search_bytes, path} + (void*[]){&i, &search_bytes, (void*)&skip_uncomputed, path} )); if (failed) { mssErrorf(0, "Cluster", "Unexpected error occurred while showhing caches."); ret = -1; } + + /** Precomputations. **/ + unsigned int total_caches = 0u + + (unsigned int)ClusterDriverCaches.SourceDataCache.nItems + + (unsigned int)ClusterDriverCaches.ClusterDataCache.nItems + + (unsigned int)ClusterDriverCaches.SearchDataCache.nItems; + if (total_caches <= skip_uncomputed) printf("All caches skipped, nothing to show...\n"); /** Print stats. 
**/ char buf[16]; @@ -4244,10 +4283,10 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx printf("%-8s %-4d %-12s\n", "Source", ClusterDriverCaches.SourceDataCache.nItems, snprint_bytes(buf, sizeof(buf), source_bytes)); printf("%-8s %-4d %-12s\n", "Cluster", ClusterDriverCaches.ClusterDataCache.nItems, snprint_bytes(buf, sizeof(buf), cluster_bytes)); printf("%-8s %-4d %-12s\n", "Search", ClusterDriverCaches.SearchDataCache.nItems, snprint_bytes(buf, sizeof(buf), search_bytes)); - printf("%-8s %-4d %-12s\n\n", "Total", - ClusterDriverCaches.SourceDataCache.nItems + ClusterDriverCaches.ClusterDataCache.nItems + ClusterDriverCaches.SearchDataCache.nItems, - snprint_bytes(buf, sizeof(buf), source_bytes + cluster_bytes + search_bytes) - ); + printf("%-8s %-4d %-12s\n\n", "Total", total_caches, snprint_bytes(buf, sizeof(buf), source_bytes + cluster_bytes + search_bytes)); + + /** Print skip stats (if anything was skipped.) **/ + if (skip_uncomputed > 0llu) printf("Skipped %llu uncomputed caches.\n\n", skip_uncomputed - 1llu); return ret; } @@ -4262,7 +4301,7 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx /** Unknown parameter. 
**/ mssErrorf(1, "Cluster", - "Expected [param : \"show\" | \"show_all\" | \"drop_all\"] for the cache method, but got: \"%s\"", + "Expected [param : \"show\" | \"show_less\" | \"show_all\" | \"drop_all\"] for the cache method, but got: \"%s\"", param->String ); goto err; @@ -4272,17 +4311,17 @@ int clusterExecuteMethod(void* inf_v, char* method_name, pObjData param, pObjTrx { char buf[12]; printf("Cluster Driver Statistics:\n"); - printf(" Stat Name Value\n"); - printf(" OpenCalls %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.OpenCalls)); - printf(" OpenQueryCalls %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.OpenQueryCalls)); - printf(" FetchCalls %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.FetchCalls)); - printf(" CloseCalls %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.CloseCalls)); - printf(" GetTypeCalls %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetTypeCalls)); - printf(" GetValCalls %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls)); - printf(" GetValCalls_name %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_name)); - printf(" GetValCalls_key1 %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_key1)); - printf(" GetValCalls_key2 %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_key2)); - printf(" GetValCalls_sim %8s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_sim)); + printf(" Stat Name %12s\n", "Value"); + printf(" OpenCalls %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.OpenCalls)); + printf(" OpenQueryCalls %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.OpenQueryCalls)); + printf(" FetchCalls %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.FetchCalls)); + printf(" CloseCalls %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.CloseCalls)); + printf(" GetTypeCalls %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetTypeCalls)); + printf(" GetValCalls %12s\n", 
snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls)); + printf(" GetValCalls_name %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_name)); + printf(" GetValCalls_key1 %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_key1)); + printf(" GetValCalls_key2 %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_key2)); + printf(" GetValCalls_sim %12s\n", snprint_llu(buf, sizeof(buf), ClusterStatistics.GetValCalls_sim)); return 0; } From 0c9eb2cf957b109b981b8f318652e0cff8effa0d Mon Sep 17 00:00:00 2001 From: Israel Date: Wed, 19 Nov 2025 15:30:35 -0700 Subject: [PATCH 18/30] Update cluster library to use dynamic memory for any data over a couple hundred bytes. Add check_double() to handle functions that return NAN on failure. Clean up. --- centrallix-lib/include/clusters.h | 4 +- centrallix-lib/include/util.h | 14 ++ centrallix-lib/src/clusters.c | 191 +++++++++++++++----------- centrallix/expression/exp_functions.c | 16 ++- centrallix/osdrivers/objdrv_cluster.c | 24 +++- 5 files changed, 160 insertions(+), 89 deletions(-) diff --git a/centrallix-lib/include/clusters.h b/centrallix-lib/include/clusters.h index c0718cea9..218422253 100644 --- a/centrallix-lib/include/clusters.h +++ b/centrallix-lib/include/clusters.h @@ -76,7 +76,7 @@ typedef struct nmRegister(sizeof(Dup), "Dup") /** Edit distance function. **/ -unsigned int edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length); +int edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length); /** Vector functions. **/ pVector ca_build_vector(const char* str); @@ -102,7 +102,7 @@ int ca_kmeans( _v[0] == -172 && _v[1] == 11 && _v[2] == -78; \ }) -/** Comparison functions, for ca_search(). **/ +/** Comparison functions (see ca_search()). 
**/ double ca_cos_compare(void* v1, void* v2); double ca_lev_compare(void* str1, void* str2); bool ca_eql(pVector v1, pVector v2); diff --git a/centrallix-lib/include/util.h b/centrallix-lib/include/util.h index 853954409..2c8537327 100644 --- a/centrallix-lib/include/util.h +++ b/centrallix-lib/include/util.h @@ -147,6 +147,20 @@ void print_diagnostics(int code, const char* function_name, const char* file_nam success; \ }) +/*** Ensures that developer diagnostics are printed if the result of the + *** passed function call is a NAN double. Not intended for user errors. + *** + *** @param result The result of the function we're checking. + *** @returns result + ***/ +#define check_double(result) \ + ({ \ + errno = 0; /* Reset errno to prevent confusion. */ \ + __typeof__ (result) _r = (result); \ + if (isnan(_r)) print_diagnostics(0, #result, __FILE__, __LINE__); \ + _r; \ + }) + /*** Ensures that developer diagnostics are printed if the result of the *** passed function call is a NULL pointer. Not intended for user errors. *** diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index 4bfce8ee6..6487e28e7 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -124,8 +124,11 @@ static int charpair_cmp(const void *p1, const void *p2) ***/ pVector ca_build_vector(const char* str) { - unsigned char chars[strlen(str) + 2u]; unsigned int num_chars = 0u; + unsigned char* chars = check_ptr(nmSysMalloc((strlen(str) + 2u) * sizeof(unsigned char))); + if (chars == NULL) goto err; + + /** Begin adding char pairs (in order). **/ chars[num_chars++] = CA_BOUNDARY_CHAR; /* Starting boundary character. */ for (const char* char_ptr = str; *char_ptr != '\0'; char_ptr++) { @@ -149,8 +152,9 @@ pVector ca_build_vector(const char* str) } chars[num_chars++] = CA_BOUNDARY_CHAR; /* Ending boundary character. */ - /** Compute char pairs. **/ - CharPair char_pairs[num_chars]; + /** Compute hash values for char pairs. 
**/ + CharPair* char_pairs = check_ptr(nmSysMalloc(num_chars * sizeof(CharPair))); + if (char_pairs == NULL) goto err; const unsigned int num_pairs = num_chars - 1u; for (unsigned int i = 0u; i < num_pairs; i++) { @@ -163,12 +167,16 @@ pVector ca_build_vector(const char* str) char_pairs[i].hash = hash_char_pair(chars[i], chars[i + 1]); } + /** Free unused memory. **/ + nmSysFree(chars); + chars = NULL; + /** Sort char_pairs by hash value. **/ qsort(char_pairs, num_pairs, sizeof(CharPair), charpair_cmp); /** Allocate space for the sparse vector. **/ - pVector sparse_vector = (pVector)check_ptr(nmSysMalloc((num_pairs * 2u + 1u) * sizeof(int))); - if (sparse_vector == NULL) return NULL; + pVector sparse_vector = check_ptr(nmSysMalloc((num_pairs * 2u + 1u) * sizeof(int))); + if (sparse_vector == NULL) goto err; /** Build the sparse vector. **/ unsigned int cur = 0u, dim = 0u; @@ -199,11 +207,23 @@ pVector ca_build_vector(const char* str) } if (dim != CA_NUM_DIMS) sparse_vector[cur++] = -(CA_NUM_DIMS - dim); + /** Free unused memory. **/ + nmSysFree(char_pairs); + char_pairs = NULL; + /** Trim extra space wasted by identical hashes. **/ - pVector trimmed_sparse_vector = (pVector)check_ptr(nmSysRealloc(sparse_vector, cur * sizeof(int))); - if (trimmed_sparse_vector == NULL) return NULL; + pVector trimmed_sparse_vector = check_ptr(nmSysRealloc(sparse_vector, cur * sizeof(int))); + if (trimmed_sparse_vector == NULL) goto err; + sparse_vector = NULL; /* Mark memory freed by nmSysRealloc() no longer valid. */ + /** Return the result. **/ return trimmed_sparse_vector; + + err: + if (sparse_vector != NULL) nmSysFree(sparse_vector); + if (char_pairs != NULL) nmSysFree(char_pairs); + if (chars != NULL) nmSysFree(chars); + return NULL; } /*** Free memory allocated to store a sparse vector. @@ -404,6 +424,7 @@ static double sparse_similarity_to_centroid(const pVector v1, const pCentroid c2 *** @param str2 The second string. *** @param length1 The length of the first string. 
*** @param length1 The length of the first string. + *** @returns The edit distance between the two strings, or a negative value on error. *** *** @attention - `Tip`: Pass 0 for the length of either string to infer it *** using the null terminating character. Conversely, character arrays @@ -412,8 +433,10 @@ static double sparse_similarity_to_centroid(const pVector v1, const pCentroid c2 *** @attention - `Complexity`: O(nm), where n and m are the lengths of str1 *** and str2 (respectively). ***/ -unsigned int edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length) +int edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length) { + int result = -1; + /*** lev_matrix: *** For all i and j, d[i][j] will hold the Levenshtein distance between *** the first i characters of s and the first j characters of t. @@ -423,9 +446,13 @@ unsigned int edit_dist(const char* str1, const char* str2, const size_t str1_len ***/ const size_t str1_len = (str1_length == 0u) ? strlen(str1) : str1_length; const size_t str2_len = (str2_length == 0u) ? strlen(str2) : str2_length; - unsigned int* lev_matrix[str1_len + 1]; + unsigned int** lev_matrix = check_ptr(nmSysMalloc((str1_len + 1) * sizeof(unsigned int*))); + if (lev_matrix == NULL) goto end; for (unsigned int i = 0u; i < str1_len + 1u; i++) - lev_matrix[i] = nmMalloc((str2_len + 1) * sizeof(unsigned int)); + { + lev_matrix[i] = check_ptr(nmSysMalloc((str2_len + 1) * sizeof(unsigned int))); + if (lev_matrix[i] == NULL) goto end; + } /*** Base case #0: *** Transforming an empty string into an empty string has 0 cost. @@ -472,19 +499,36 @@ unsigned int edit_dist(const char* str1, const char* str2, const size_t str1_len ); unsigned int cost_swap = (can_swap) ? lev_matrix[i - 2][j - 2] + 1 : UINT_MAX; - // Find the best operation. + /** Assign the best operation. 
**/ lev_matrix[i][j] = min(min(min(cost_delete, cost_insert), cost_replace), cost_swap); } } } /** Store result. **/ - unsigned int result = lev_matrix[str1_len][str2_len]; + unsigned int unsigned_result = lev_matrix[str1_len][str2_len]; + if (unsigned_result > INT_MAX) + { + fprintf(stderr, + "Warning: Integer overflow detected in edit_dist(\"%s\", \"%s\", %lu, %lu) = %u > %d\n", + str1, str2, str1_length, str2_length, unsigned_result, INT_MAX + ); + } + result = (int)unsigned_result; /** Cleanup. **/ - for (unsigned int i = 0u; i < str1_len + 1u; i++) - nmFree(lev_matrix[i], (str2_len + 1) * sizeof(unsigned int)); + end: + if (lev_matrix != NULL) + { + for (unsigned int i = 0u; i < str1_len + 1u; i++) + { + if (lev_matrix[i] == NULL) break; + else nmSysFree(lev_matrix[i]); + } + nmSysFree(lev_matrix); + } + /** Done. **/ return result; } @@ -527,7 +571,7 @@ double ca_cos_compare(void* v1, void* v2) *** *** @param str1 A `char*` to the first string to compare. *** @param str2 A `char*` to the second string to compare. - *** @returns The levenshtein similarity between the two strings. + *** @returns The levenshtein similarity between the two strings, or NAN on failure. ***/ double ca_lev_compare(void* str1, void* str2) { @@ -543,7 +587,8 @@ double ca_lev_compare(void* str1, void* str2) if (len1 == 0lu && len2 != 0lu) return 0.0; /** Compute levenshtein edit distance. **/ - const unsigned int dist = edit_dist((const char*)str1, (const char*)str2, len1, len2); + const int dist = check_neg(edit_dist((const char*)str1, (const char*)str2, len1, len2)); + if (dist < 0) return NAN; /** Normalize edit distance into a similarity measure. **/ const double normalized_similarity = 1.0 - (double)dist / (double)max(len1, len2); @@ -583,11 +628,14 @@ static double get_cluster_size( pCentroid* centroids, const unsigned int num_clusters) { + double result = NAN; /** Could be up to around 1KB on the stack, but I think that's fine. 
**/ - double cluster_sums[num_clusters]; - unsigned int cluster_counts[num_clusters]; - memset(cluster_sums, 0, sizeof(cluster_sums)); - memset(cluster_counts, 0, sizeof(cluster_counts)); + double* cluster_sums = check_ptr(nmSysMalloc(num_clusters * sizeof(double))); + unsigned int* cluster_counts = check_ptr(nmSysMalloc(num_clusters * sizeof(unsigned int))); + if (cluster_sums == NULL) goto end; + if (cluster_counts == NULL) goto end; + memset(cluster_sums, 0, sizeof(num_clusters * sizeof(double))); + memset(cluster_counts, 0, sizeof(num_clusters * sizeof(unsigned int))); /** Sum the difference from each vector to its cluster centroid. **/ for (unsigned int i = 0u; i < num_vectors; i++) @@ -609,37 +657,15 @@ static double get_cluster_size( num_valid_clusters++; } - /** Return average sizes. **/ - return cluster_total / num_valid_clusters; - } - -/*** Compute the param_value for `k` (number of clusters), given a dataset of with - *** a size of `n`. - *** - *** The following table shows data sizes vs.selected cluster size. In testing, - *** these numbers tended to give a good balance of accuracy and duplicates detected. - *** - *** ```csv - *** Data Size, Actual - *** 10k, 12 - *** 100k, 33 - *** 1M, 67 - *** 4M, 93 - *** ``` - *** - *** This function is not intended for datasets smaller than (`n < ~2000`). - *** These should be handled using complete search. - *** - *** LaTeX Notation: \log_{36}\left(n\right)^{3.1}-8 - *** - *** @param n The size of the dataset. - *** @returns k, the number of clusters to use. - *** - *** Complexity: `O(1)` - ***/ -unsigned int compute_k(const unsigned int n) - { - return (unsigned)max(2, pow(log(n) / log(36), 3.2) - 8); + /** Calculate average sizes. **/ + result = cluster_total / num_valid_clusters; + + end: + /** Clean up. **/ + if (cluster_sums != NULL) nmSysFree(cluster_sums); + if (cluster_counts != NULL) nmSysFree(cluster_counts); + + return result; } /*** Executes the k-means clustering algorithm. 
Selects NUM_CLUSTERS random @@ -696,22 +722,19 @@ int ca_kmeans( /** Allocate space to store centroids and new_centroids. **/ /** Dynamic allocation is required because these densely allocated arrays might be up to 500KB! **/ const size_t centroids_size = num_clusters * sizeof(pCentroid); - pCentroid* centroids = (pCentroid*)check_ptr(nmMalloc(centroids_size)); + pCentroid* centroids = check_ptr(nmMalloc(centroids_size)); + pCentroid* new_centroids = check_ptr(nmMalloc(centroids_size)); if (centroids == NULL) goto end; + if (new_centroids == NULL) goto end; memset(centroids, 0, centroids_size); - pCentroid* new_centroids = (pCentroid*)check_ptr(nmMalloc(centroids_size)); - if (new_centroids == NULL) goto end_free_centroids; memset(new_centroids, 0, centroids_size); for (unsigned int i = 0u; i < num_clusters; i++) { - /** Malloc each centroid. **/ - centroids[i] = (pCentroid)check_ptr(nmMalloc(pCentroidSize)); - if (centroids[i] == NULL) goto end_deep_free_centroids; + centroids[i] = check_ptr(nmMalloc(pCentroidSize)); + new_centroids[i] = check_ptr(nmMalloc(pCentroidSize)); + if (centroids[i] == NULL) goto end; + if (new_centroids[i] == NULL) goto end; memset(centroids[i], 0, pCentroidSize); - - /** Malloc each new centroid. **/ - new_centroids[i] = (pCentroid)check_ptr(nmMalloc(pCentroidSize)); - if (new_centroids[i] == NULL) goto end_deep_free_centroids; memset(new_centroids[i], 0, pCentroidSize); } @@ -797,8 +820,9 @@ int ca_kmeans( } /** Is there enough improvement? **/ - if (min_improvement < -1) continue; /** Skip check if it will always fail. **/ - const double average_cluster_size = get_cluster_size(vectors, num_vectors, labels, centroids, num_clusters); + if (min_improvement < -1) continue; /** Skip check if it will never end the loop. 
**/ + const double average_cluster_size = check_double(get_cluster_size(vectors, num_vectors, labels, centroids, num_clusters)); + if (isnan(average_cluster_size)) goto end; const double improvement = old_average_cluster_size - average_cluster_size; if (improvement < min_improvement) break; old_average_cluster_size = average_cluster_size; @@ -815,22 +839,25 @@ int ca_kmeans( successful = true; /** Clean up. **/ - end_deep_free_centroids: - for (unsigned int i = 0u; i < num_clusters; i++) + end: + if (centroids != NULL) + { + for (unsigned int i = 0u; i < num_clusters; i++) + { + if (centroids[i] != NULL) nmFree(centroids[i], pCentroidSize); + else break; + } + nmFree(centroids, num_clusters * sizeof(pCentroid)); + } + if (new_centroids != NULL) { - if (centroids[i] != NULL) nmFree(centroids[i], pCentroidSize); - else break; - if (new_centroids[i] != NULL) nmFree(new_centroids[i], pCentroidSize); - else break; + for (unsigned int i = 0u; i < num_clusters; i++) + { + if (new_centroids[i] != NULL) nmFree(new_centroids[i], pCentroidSize); + else break; + } + nmFree(new_centroids, num_clusters * sizeof(pCentroid)); } - - // end_free_new_centroids: - nmFree(new_centroids, num_clusters * sizeof(pCentroid)); - - end_free_centroids: - nmFree(centroids, num_clusters * sizeof(pCentroid)); - - end: return (successful) ? 0 : -1; } @@ -860,7 +887,8 @@ void* ca_most_similar( double best_sim = -INFINITY; for (unsigned int i = 0u; (num_data == 0u) ? (data[i] != NULL) : (i < num_data); i++) { - const double sim = similarity(target, data[i]); + const double sim = check_double(similarity(target, data[i])); + if (isnan(sim)) continue; /* Skip this comparison. */ if (sim > best_sim && sim > threshold) { most_similar = data[i]; @@ -889,8 +917,8 @@ void* ca_most_similar( *** struct. If this variable is null, these values are also left null. *** @param maybe_dups A pointer to an xArray in which dups should be found. *** Pass NULL to allocate a new one. 
- *** @returns An xArray holding all of the duplocates found. If maybe_dups is - *** not NULL, this will be that xArray, to allow for chaining. + *** @returns An xArray holding all of the duplocates found, or NULL if an + *** error occurs. ***/ pXArray ca_sliding_search( void** data, @@ -919,7 +947,8 @@ pXArray ca_sliding_search( const unsigned int window_end = min(i + window_size, num_data); for (unsigned int j = window_start; j < window_end; j++) { - const double sim = similarity(data[i], data[j]); + const double sim = check_double(similarity(data[i], data[j])); + if (isnan(sim)) goto err_free_dups; if (sim > threshold) /* Dup found! */ { Dup* dup = (Dup*)check_ptr(nmMalloc(sizeof(Dup))); diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index dd65a0c52..4932bcf1a 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -4316,7 +4316,15 @@ static int exp_fn_compare(pExpression tree, pParamObjects obj_list, const char* } else { /* lev_compare() */ - tree->Types.Double = ca_lev_compare(str1, str2); + double lev_sim = check_double(ca_lev_compare(str1, str2)); + if (isnan(lev_sim)) + { + mssErrorf(1, "EXP", "%s(\"%s\", \"%s\") Failed to compute levenstein edit distance."); + return -1; + } + + /** Return the computed result. **/ + tree->Types.Double = lev_sim; tree->DataType = DATA_T_DOUBLE; return 0; } @@ -4359,7 +4367,11 @@ int exp_fn_levenshtein(pExpression tree, pParamObjects obj_list) /** Compute edit distance. **/ /** Length 0 is provided for both strings so that the function will compute it for us. **/ - tree->Integer = edit_dist(str1, str2, 0lu, 0lu); + int dist = check_neg(edit_dist(str1, str2, 0lu, 0lu)); + if (dist < 0) return -1; + + /** Return the computed distance. 
**/ + tree->Integer = dist; tree->DataType = DATA_T_INTEGER; return 0; } diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index 8fb97b184..e38fb3e68 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -2546,7 +2546,6 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) if (labels == NULL) goto err_free_sims; /** Run kmeans. **/ - Timer timer_i, *timer = timer_start(timer_init(&timer_i)); const bool successful = check(ca_kmeans( source_data->Vectors, source_data->nVectors, @@ -2556,7 +2555,6 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) labels, cluster_data->Sims )); - timer_stop(timer); if (!successful) goto err_free_sims; /** Convert the labels into clusters. **/ @@ -2675,6 +2673,11 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) (void**)cluster_data->SourceData->Keys, dups )); + if (dups_temp == NULL) + { + mssErrorf(1, "Cluster", "Failed to compute sliding search with cosine similarity measure."); + goto err_free; + } } else { @@ -2688,7 +2691,11 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) (void**)cluster_data->SourceData->Keys, dups )); - if (dups_temp == NULL) goto err_free; + if (dups_temp == NULL) + { + mssErrorf(1, "Cluster", "Failed to compute complete search with cosine similarity measure."); + goto err_free; + } else dups = dups_temp; } } @@ -2708,6 +2715,11 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) (void**)cluster_data->SourceData->Keys, dups )); + if (dups_temp == NULL) + { + mssErrorf(1, "Cluster", "Failed to compute sliding search with Levenstein similarity measure."); + goto err_free; + } } else { @@ -2721,7 +2733,11 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) (void**)cluster_data->SourceData->Keys, dups )); - if (dups_temp == NULL) goto err_free; + if 
(dups_temp == NULL) + { + mssErrorf(1, "Cluster", "Failed to compute complete search with Levenstein similarity measure."); + goto err_free; + } else dups = dups_temp; } } From 394764efdbc6548162b1b5b7a9401428ec8e6aaa Mon Sep 17 00:00:00 2001 From: Israel Date: Wed, 19 Nov 2025 15:37:31 -0700 Subject: [PATCH 19/30] Remove unnecessary requests for the driver name in objQueryFetch(). --- centrallix/objectsystem/obj_query.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/centrallix/objectsystem/obj_query.c b/centrallix/objectsystem/obj_query.c index 9b64f241f..c4dff40ac 100644 --- a/centrallix/objectsystem/obj_query.c +++ b/centrallix/objectsystem/obj_query.c @@ -414,7 +414,6 @@ objQueryFetch(pObjQuery this, int mode) { pObject obj = NULL; void* obj_data; - char* name; char buf[OBJSYS_MAX_PATH + 32]; pObjQuerySortItem sort_item; int rval; @@ -529,14 +528,6 @@ objQueryFetch(pObjQuery this, int mode) goto error; } obj->Data = obj_data; - - this->Obj->Driver->GetAttrValue(obj_data, "name", DATA_T_STRING, &name, NULL); - if (strlen(name) + strlen(this->Obj->Pathname->Pathbuf) + 2 > OBJSYS_MAX_PATH) - { - mssError(1,"OSML","Filename in query result exceeded internal limits"); - OSMLDEBUG(OBJ_DEBUG_F_APITRACE, " null\n"); - goto error; - } /** If we need to check it, do so now. **/ if (!(this->Flags & OBJ_QY_F_FULLQUERY) && this->Tree) @@ -778,4 +769,3 @@ objGetQueryIdentityPath(pObjQuery this, char* pathbuf, int maxlen) return 0; } - From 9b8cc19754e2109004de9bac1cb944ce378fbdf9 Mon Sep 17 00:00:00 2001 From: Israel Date: Thu, 20 Nov 2025 09:52:50 -0700 Subject: [PATCH 20/30] Fix bugs that caused regressions after the updates to the cluster library. Round similarity results to avoid floating point errors. Enable caching for memory allocated in get_cluster_size(). Rename edit_dist() to ca_edit_dist() to match format for public functions. Rename print_diagnostics() to print_err(). 
--- centrallix-lib/include/clusters.h | 2 +- centrallix-lib/include/util.h | 12 +++---- centrallix-lib/src/clusters.c | 51 ++++++++++++++++----------- centrallix-lib/src/util.c | 2 +- centrallix/expression/exp_functions.c | 10 ++++-- centrallix/osdrivers/objdrv_cluster.c | 5 +-- 6 files changed, 48 insertions(+), 34 deletions(-) diff --git a/centrallix-lib/include/clusters.h b/centrallix-lib/include/clusters.h index 218422253..879ac652a 100644 --- a/centrallix-lib/include/clusters.h +++ b/centrallix-lib/include/clusters.h @@ -76,7 +76,7 @@ typedef struct nmRegister(sizeof(Dup), "Dup") /** Edit distance function. **/ -int edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length); +int ca_edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length); /** Vector functions. **/ pVector ca_build_vector(const char* str); diff --git a/centrallix-lib/include/util.h b/centrallix-lib/include/util.h index 2c8537327..03b63abaf 100644 --- a/centrallix-lib/include/util.h +++ b/centrallix-lib/include/util.h @@ -100,7 +100,7 @@ extern "C" { }) /** Error Handling. **/ -void print_diagnostics(int code, const char* function_name, const char* file_name, const int line_number); +void print_err(int code, const char* function_name, const char* file_name, const int line_number); /*** Ensures that developer diagnostics are printed if the result of the *** passed function call is not zero. Not intended for user errors. @@ -113,7 +113,7 @@ void print_diagnostics(int code, const char* function_name, const char* file_nam errno = 0; /* Reset errno to prevent confusion. 
*/ \ __typeof__ (result) _r = (result); \ const bool success = (_r == 0); \ - if (!success) print_diagnostics(_r, #result, __FILE__, __LINE__); \ + if (!success) print_err(_r, #result, __FILE__, __LINE__); \ success; \ }) @@ -128,7 +128,7 @@ void print_diagnostics(int code, const char* function_name, const char* file_nam errno = 0; /* Reset errno to prevent confusion. */ \ __typeof__ (result) _r = (result); \ const bool success = (_r >= 0); \ - if (!success) print_diagnostics(_r, #result, __FILE__, __LINE__); \ + if (!success) print_err(_r, #result, __FILE__, __LINE__); \ success; \ }) @@ -143,7 +143,7 @@ void print_diagnostics(int code, const char* function_name, const char* file_nam errno = 0; /* Reset errno to prevent confusion. */ \ __typeof__ (result) _r = (result); \ const bool success = (_r != -1); \ - if (!success) print_diagnostics(_r, #result, __FILE__, __LINE__); \ + if (!success) print_err(_r, #result, __FILE__, __LINE__); \ success; \ }) @@ -157,7 +157,7 @@ void print_diagnostics(int code, const char* function_name, const char* file_nam ({ \ errno = 0; /* Reset errno to prevent confusion. */ \ __typeof__ (result) _r = (result); \ - if (isnan(_r)) print_diagnostics(0, #result, __FILE__, __LINE__); \ + if (isnan(_r)) print_err(0, #result, __FILE__, __LINE__); \ _r; \ }) @@ -171,7 +171,7 @@ void print_diagnostics(int code, const char* function_name, const char* file_nam ({ \ errno = 0; /* Reset errno to prevent confusion. 
*/ \ __typeof__ (result) _r = (result); \ - if (_r == NULL) print_diagnostics(0, #result, __FILE__, __LINE__); \ + if (_r == NULL) print_err(0, #result, __FILE__, __LINE__); \ _r; \ }) diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index 6487e28e7..92125ce03 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -433,7 +433,7 @@ static double sparse_similarity_to_centroid(const pVector v1, const pCentroid c2 *** @attention - `Complexity`: O(nm), where n and m are the lengths of str1 *** and str2 (respectively). ***/ -int edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length) +int ca_edit_dist(const char* str1, const char* str2, const size_t str1_length, const size_t str2_length) { int result = -1; @@ -473,12 +473,12 @@ int edit_dist(const char* str1, const char* str2, const size_t str1_length, cons for (unsigned int j = 1u; j <= str2_len; j++) lev_matrix[0][j] = j; - /** General Case **/ + /** General Case. **/ for (unsigned int i = 1u; i <= str1_len; i++) { for (unsigned int j = 1u; j <= str2_len; j++) { - /** Equal characters need no changes. **/ + /** If the characters are equal, no change is needed. **/ if (str1[i - 1] == str2[j - 1]) lev_matrix[i][j] = lev_matrix[i - 1][j - 1]; @@ -510,7 +510,7 @@ int edit_dist(const char* str1, const char* str2, const size_t str1_length, cons if (unsigned_result > INT_MAX) { fprintf(stderr, - "Warning: Integer overflow detected in edit_dist(\"%s\", \"%s\", %lu, %lu) = %u > %d\n", + "Warning: Integer overflow detected in ca_edit_dist(\"%s\", \"%s\", %lu, %lu) = %u > %d\n", str1, str2, str1_length, str2_length, unsigned_result, INT_MAX ); } @@ -556,8 +556,8 @@ double ca_cos_compare(void* v1, void* v2) if (v1_empty && !v2_empty) return 0.0; if (!v1_empty && v2_empty) return 0.0; - /** Return the sparse similarity. 
**/ - return sparse_similarity(vec1, vec2); + /** Apply rounding to avoid annoying floating point issues before returning. **/ + return round(sparse_similarity(vec1, vec2) * 1000000) / 1000000; } /*** Compares two strings using their Levenshtein edit distance to compute a @@ -587,14 +587,14 @@ double ca_lev_compare(void* str1, void* str2) if (len1 == 0lu && len2 != 0lu) return 0.0; /** Compute levenshtein edit distance. **/ - const int dist = check_neg(edit_dist((const char*)str1, (const char*)str2, len1, len2)); - if (dist < 0) return NAN; + const int edit_dist = ca_edit_dist((const char*)str1, (const char*)str2, len1, len2); + if (!check_neg(edit_dist)) return NAN; /** Normalize edit distance into a similarity measure. **/ - const double normalized_similarity = 1.0 - (double)dist / (double)max(len1, len2); + const double normalized_similarity = 1.0 - (double)edit_dist / (double)max(len1, len2); - /** Done. **/ - return normalized_similarity; + /** Apply rounding to avoid annoying floating point issues before returning. **/ + return round(normalized_similarity * 1000000) / 1000000; } /*** Check if two sparse vectors are identical. @@ -629,9 +629,15 @@ static double get_cluster_size( const unsigned int num_clusters) { double result = NAN; - /** Could be up to around 1KB on the stack, but I think that's fine. **/ - double* cluster_sums = check_ptr(nmSysMalloc(num_clusters * sizeof(double))); - unsigned int* cluster_counts = check_ptr(nmSysMalloc(num_clusters * sizeof(unsigned int))); + + /** Allocate space to store clusters as averages are computed. **/ + /*** We use nmMalloc() here because this function is usually called + *** repeatedly with the same number of clusters in the k-means loop. + *** Also, it is likely that k-means may be invoked multiple times with + *** the same k value, leading to additional caching benefits. 
+ ***/ + double* cluster_sums = check_ptr(nmMalloc(num_clusters * sizeof(double))); + unsigned int* cluster_counts = check_ptr(nmMalloc(num_clusters * sizeof(unsigned int))); if (cluster_sums == NULL) goto end; if (cluster_counts == NULL) goto end; memset(cluster_sums, 0, sizeof(num_clusters * sizeof(double))); @@ -662,8 +668,8 @@ static double get_cluster_size( end: /** Clean up. **/ - if (cluster_sums != NULL) nmSysFree(cluster_sums); - if (cluster_counts != NULL) nmSysFree(cluster_counts); + if (cluster_sums != NULL) nmFree(cluster_sums, num_clusters * sizeof(double)); + if (cluster_counts != NULL) nmFree(cluster_counts, num_clusters * sizeof(unsigned int)); return result; } @@ -939,7 +945,7 @@ pXArray ca_sliding_search( if (dups == NULL) goto err; } const int num_starting_dups = dups->nItems; - + /** Search for dups. **/ for (unsigned int i = 0u; i < num_data; i++) { @@ -948,7 +954,11 @@ pXArray ca_sliding_search( for (unsigned int j = window_start; j < window_end; j++) { const double sim = check_double(similarity(data[i], data[j])); - if (isnan(sim)) goto err_free_dups; + if (isnan(sim) || sim < 0.0 || 1.0 < sim) + { + fprintf(stderr, "Invalid similarity %g %lf.\n", sim, sim); + goto err_free_dups; + } if (sim > threshold) /* Dup found! */ { Dup* dup = (Dup*)check_ptr(nmMalloc(sizeof(Dup))); @@ -968,11 +978,10 @@ pXArray ca_sliding_search( return dups; /** Error cleanup. **/ - err_free_dups: - /** Free the dups we added to the XArray. */ + /** Free the dups that we added to the XArray. **/ while (dups->nItems > num_starting_dups) - nmFree(dups->Items[dups->nItems--], sizeof(Dup)); + nmFree(dups->Items[--dups->nItems], sizeof(Dup)); if (maybe_dups == NULL) check(xaDeInit(dups)); /* Failure ignored. 
*/ err: diff --git a/centrallix-lib/src/util.c b/centrallix-lib/src/util.c index e39572f95..d326944d1 100644 --- a/centrallix-lib/src/util.c +++ b/centrallix-lib/src/util.c @@ -268,7 +268,7 @@ void timer_free(pTimer timer) /*** Function for failing on error, assuming the error came from a library or *** system function call, so that the error buffer is set to a valid value. ***/ -void print_diagnostics(int code, const char* function_name, const char* file_name, const int line_number) +void print_err(int code, const char* function_name, const char* file_name, const int line_number) { /** Create a descriptive error message. **/ char error_buf[BUFSIZ]; diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index 4932bcf1a..b2e3e84a8 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -4367,11 +4367,15 @@ int exp_fn_levenshtein(pExpression tree, pParamObjects obj_list) /** Compute edit distance. **/ /** Length 0 is provided for both strings so that the function will compute it for us. **/ - int dist = check_neg(edit_dist(str1, str2, 0lu, 0lu)); - if (dist < 0) return -1; + int edit_dist = ca_edit_dist(str1, str2, 0lu, 0lu); + if (!check_neg(edit_dist)) + { + mssErrorf(1, "EXP", "%s(\"%s\", \"%s\") Failed to compute edit distance.\n", fn_name, str1, str2); + return -1; + } /** Return the computed distance. 
**/ - tree->Integer = dist; + tree->Integer = edit_dist; tree->DataType = DATA_T_INTEGER; return 0; } diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index e38fb3e68..a264d886b 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -2691,7 +2691,7 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) (void**)cluster_data->SourceData->Keys, dups )); - if (dups_temp == NULL) + if (dups_temp == NULL) { mssErrorf(1, "Cluster", "Failed to compute complete search with cosine similarity measure."); goto err_free; @@ -2733,7 +2733,7 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) (void**)cluster_data->SourceData->Keys, dups )); - if (dups_temp == NULL) + if (dups_temp == NULL) { mssErrorf(1, "Cluster", "Failed to compute complete search with Levenstein similarity measure."); goto err_free; @@ -2753,6 +2753,7 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) } if (dups_temp == NULL) goto err_free; else dups = dups_temp; + // fprintf(stderr, "Done searching, found %d dups.\n", dups->nItems); /** Store dups. **/ search_data->nDups = dups->nItems; From 17156b7344d1aa2c32a839c66960b810b67b4d60 Mon Sep 17 00:00:00 2001 From: Israel Date: Thu, 20 Nov 2025 13:53:39 -0700 Subject: [PATCH 21/30] Fix an invalid free (nmFree used instead of nmSysFree()). Fix a possible uninitialized read. Fix memset() not initializing data. --- centrallix-lib/src/clusters.c | 23 ++++++++++++++++------- centrallix/osdrivers/objdrv_cluster.c | 2 +- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index 92125ce03..4504f53d7 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -154,7 +154,7 @@ pVector ca_build_vector(const char* str) /** Compute hash values for char pairs. 
**/ CharPair* char_pairs = check_ptr(nmSysMalloc(num_chars * sizeof(CharPair))); - if (char_pairs == NULL) goto err; + if (char_pairs == NULL) goto err_free_chars; const unsigned int num_pairs = num_chars - 1u; for (unsigned int i = 0u; i < num_pairs; i++) { @@ -176,7 +176,7 @@ pVector ca_build_vector(const char* str) /** Allocate space for the sparse vector. **/ pVector sparse_vector = check_ptr(nmSysMalloc((num_pairs * 2u + 1u) * sizeof(int))); - if (sparse_vector == NULL) goto err; + if (sparse_vector == NULL) goto err_free_char_pairs; /** Build the sparse vector. **/ unsigned int cur = 0u, dim = 0u; @@ -213,16 +213,22 @@ pVector ca_build_vector(const char* str) /** Trim extra space wasted by identical hashes. **/ pVector trimmed_sparse_vector = check_ptr(nmSysRealloc(sparse_vector, cur * sizeof(int))); - if (trimmed_sparse_vector == NULL) goto err; + if (trimmed_sparse_vector == NULL) goto err_free_sparse_vector; sparse_vector = NULL; /* Mark memory freed by nmSysRealloc() no longer valid. */ /** Return the result. **/ return trimmed_sparse_vector; - err: + err_free_sparse_vector: if (sparse_vector != NULL) nmSysFree(sparse_vector); + + err_free_char_pairs: if (char_pairs != NULL) nmSysFree(char_pairs); + + err_free_chars: if (chars != NULL) nmSysFree(chars); + + err: return NULL; } @@ -640,8 +646,11 @@ static double get_cluster_size( unsigned int* cluster_counts = check_ptr(nmMalloc(num_clusters * sizeof(unsigned int))); if (cluster_sums == NULL) goto end; if (cluster_counts == NULL) goto end; - memset(cluster_sums, 0, sizeof(num_clusters * sizeof(double))); - memset(cluster_counts, 0, sizeof(num_clusters * sizeof(unsigned int))); + for (unsigned int i = 0u; i < num_clusters; i++) + { + cluster_sums[i] = 0.0; + cluster_counts[i] = 0u; + } /** Sum the difference from each vector to its cluster centroid. 
**/ for (unsigned int i = 0u; i < num_vectors; i++) @@ -757,7 +766,7 @@ int ca_kmeans( { const int token = vector[i++]; if (token > 0) centroid[dim++] = (double)token; - else for (unsigned int j = 0u; j < -token; j++) centroid[dim++] = 0.0; + else for (unsigned int j = 0u; j < (unsigned)-token; j++) centroid[dim++] = 0.0; } } diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index a264d886b..8561eaba1 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -2567,7 +2567,7 @@ static int ci_ComputeClusterData(pClusterData cluster_data, pNodeData node_data) /** Iterate through each label and add the index of the specified cluster to the xArray. **/ for (unsigned long long i = 0llu; i < source_data->nVectors; i++) if (!check_neg(xaAddItem(&indexes_in_cluster[labels[i]], (void*)i))) goto err_free_sims; - nmFree(labels, lables_size); /* Free unused data. */ + nmSysFree(labels); /* Free unused data. */ /** Iterate through each cluster, store it, and free the xArray. **/ for (unsigned int i = 0u; i < cluster_data->nClusters; i++) From 29640a165ae7041158bcca22113a08941006f466 Mon Sep 17 00:00:00 2001 From: Israel Date: Thu, 20 Nov 2025 16:21:45 -0700 Subject: [PATCH 22/30] Minor improvements and clean up. 
--- centrallix/osdrivers/objdrv_cluster.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index 8561eaba1..c09939b66 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -392,6 +392,7 @@ char* const ATTR_SEARCH_ENTRY[] = char* const METHOD_NAME[] = { "cache", + "stat", END_OF_ARRAY, }; @@ -549,7 +550,7 @@ typedef struct _SEARCH { char* Name; char* Key; - pClusterData Source; + pClusterData SourceCluster; double Threshold; pDup* Dups; unsigned int nDups; @@ -1376,8 +1377,8 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) pClusterData cluster_data = node_data->ClusterDatas[i]; if (strcmp(source_cluster_name, cluster_data->Name) == 0) { - /** Source found. **/ - search_data->Source = cluster_data; + /** SourceCluster found. **/ + search_data->SourceCluster = cluster_data; break; } @@ -1385,7 +1386,7 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) } /** Did we find the requested source? **/ - if (search_data->Source == NULL) + if (search_data->SourceCluster == NULL) { /** Print error. **/ mssErrorf(1, "Cluster", "Could not find cluster \"%s\" for search \"%s\".", source_cluster_name, search_data->Name); @@ -1479,7 +1480,7 @@ static pSearchData ci_ParseSearchData(pStructInf inf, pNodeData node_data) } /** Create cache entry key. **/ - char* source_key = search_data->Source->Key; + char* source_key = search_data->SourceCluster->Key; const size_t len = strlen(source_key) + strlen(search_data->Name) + 16lu; char* key = check_ptr(nmSysMalloc(len * sizeof(char))); if (key == NULL) goto err_free_search; @@ -2645,7 +2646,7 @@ static int ci_ComputeSearchData(pSearchData search_data, pNodeData node_data) if (search_data->Dups != NULL) return 0; /** We need the cluster data to be computed before we search it. 
**/ - pClusterData cluster_data = search_data->Source; + pClusterData cluster_data = search_data->SourceCluster; ret = ci_ComputeClusterData(cluster_data, node_data); if (ret != 0) { @@ -3525,7 +3526,7 @@ int clusterGetAttrValue(void* inf_v, char* attr_name, int datatype, pObjData val if (strcmp(attr_name, "source") == 0) { - val->String = target->Source->Name; + val->String = target->SourceCluster->Name; return 0; } if (strcmp(attr_name, "similarity_measure") == 0) From 0fa62d3e487bae818a5e23a06ef99b006c09bcd4 Mon Sep 17 00:00:00 2001 From: Israel Date: Thu, 20 Nov 2025 16:38:48 -0700 Subject: [PATCH 23/30] Correct minor mistakes. Improve documentation. --- centrallix-lib/include/clusters.h | 4 ++++ centrallix-lib/src/clusters.c | 17 ++++++++++++++ centrallix/osdrivers/objdrv_cluster.c | 33 ++++++++++++++++----------- 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/centrallix-lib/include/clusters.h b/centrallix-lib/include/clusters.h index 879ac652a..8f916210e 100644 --- a/centrallix-lib/include/clusters.h +++ b/centrallix-lib/include/clusters.h @@ -47,6 +47,10 @@ /*** 2147483629 is the signed int max, and is also a prime number. *** Using this value ensures that the longest run of 0s will not *** cause an int underflow with the current encoding scheme. + *** + *** Unfortunately, we can't use a number this large yet because + *** kmeans algorithm creates densely allocated centroids with + *** `CA_NUM_DIMS` dimensions, so a large number causes it to fail. ***/ #define CA_NUM_DIMS 251 //2147483629 /* aka. The vector table size. */ diff --git a/centrallix-lib/src/clusters.c b/centrallix-lib/src/clusters.c index 4504f53d7..48119176a 100644 --- a/centrallix-lib/src/clusters.c +++ b/centrallix-lib/src/clusters.c @@ -64,6 +64,14 @@ static unsigned int hash_char_pair(const unsigned char c1, const unsigned char c return hash % CA_NUM_DIMS; } +/*** An internal struct for temporarily storing character pairs while building + *** sparse vectors. 
+ *** + *** @param c1 The first character in the character pair. + *** @param c2 The second character in the character pair. + *** @param hash The hash for the two characters, calculated by calling the + *** hash_char_pair() function (above). + **/ typedef struct { unsigned char c1, c2; @@ -71,6 +79,15 @@ typedef struct } CharPair, *pCharPair; +/*** Internal function to compare two character pairs to allow us to sort them + *** by hash (ascending). + *** + *** @param p1 The first pCharPair. + *** @param p2 The second pCharPair. + *** @returns An int > 0 if p1's hash is larger. + *** An int < 0 if p2's hash is larger. + *** 0 if p1 and p2 have identical hashes. + ***/ static int charpair_cmp(const void *p1, const void *p2) { const CharPair *a = p1, *b = p2; diff --git a/centrallix/osdrivers/objdrv_cluster.c b/centrallix/osdrivers/objdrv_cluster.c index c09939b66..a41ffbd21 100644 --- a/centrallix/osdrivers/objdrv_cluster.c +++ b/centrallix/osdrivers/objdrv_cluster.c @@ -350,11 +350,14 @@ typedef unsigned char TargetType; /** Attribute name lists by TargetType. 
**/ #define END_OF_ARRAY NULL -char* const ATTR_ROOT[] = { +char* const ATTR_ROOT[] = + { "source", "attr_name", + "date_created", + "date_computed", END_OF_ARRAY, -}; + }; char* const ATTR_CLUSTER[] = { "algorithm", @@ -378,6 +381,8 @@ char* const ATTR_SEARCH[] = char* const ATTR_CLUSTER_ENTRY[] = { "items", + "date_created", + "date_computed", END_OF_ARRAY, }; char* const ATTR_SEARCH_ENTRY[] = @@ -385,6 +390,8 @@ char* const ATTR_SEARCH_ENTRY[] = "key1", "key2", "sim", + "date_created", + "date_computed", END_OF_ARRAY, }; @@ -432,17 +439,17 @@ char* const METHOD_NAME[] = ***/ typedef struct _SOURCE { - char* Name; - char* Key; - char* SourcePath; - char* KeyAttr; - char* NameAttr; - char** Keys; - char** Strings; - pVector* Vectors; - unsigned int nVectors; - DateTime DateCreated; - DateTime DateComputed; + char* Name; + char* Key; + char* SourcePath; + char* KeyAttr; + char* NameAttr; + char** Keys; + char** Strings; + pVector* Vectors; + unsigned int nVectors; + DateTime DateCreated; + DateTime DateComputed; } SourceData, *pSourceData; From 06bae81769b36244c3fb555c4041c7d1965d334e Mon Sep 17 00:00:00 2001 From: Israel Date: Fri, 21 Nov 2025 14:36:37 -0700 Subject: [PATCH 24/30] Implement a more extendable schema verification system. --- centrallix/expression/exp_functions.c | 193 ++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index b2e3e84a8..2e98cfd7c 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -99,6 +99,199 @@ static char* ci_TypeToStr(const int type) return "Invalid"; /* Shall not parse to a valid type in ci_TypeFromStr(). */ } +/*** Specifies expectations about an argument. + *** + *** @param Datatypes An array of datatypes (terminated with a -1). Set to NULL + *** to accept any datatype as valid for this argument. + *** @param Flags Flags to require other properties about an argument. 
If the + *** flag specifies a required behavior for specific types, the requirement will be + *** skipped for other types. + *** + *** Valid Flags: + *** - `EXP_ARG_NOT_NULL`: Expect the arg to not be null. + *** - `EXP_ARG_FORCE_TYPE`: Run type check on null args (not recommended). + *** + *** @attention - Checks like `EXP_ARG_NON_EMPTY`, `EXP_ARG_NON_NAN`, etc. also + *** succeed for `NULL` values. To avoid this, specify `EXP_ARG_NOT_NULL`. + ***/ +typedef struct + { + int* Datatypes; + int Flags; + } + ArgExpect, *pArgExpect; + +#define EXP_ARG_NO_FLAGS (0) +#define EXP_ARG_NOT_NULL (1 << 0) +#define EXP_ARG_FORCE_TYPE (1 << 1) + +/*** An internal function used by the schema verifier (below) to verify each + *** argument of the schema. + ***/ +static int verify_arg(const char* fn_name, pExpression arg, const ArgExpect* arg_expect) + { + /** The expectation struct cannot be NULL. **/ + if (arg_expect == NULL) + { + mssErrorf(1, "EXP", + "%s(...): Expectation struct cannot be NULL", + fn_name + ); + return -1; + } + + /** Extract values. **/ + ASSERTMAGIC(arg, MGK_EXPRESSION); + int actual_datatype = arg->DataType; + + /** Check for a provided NULL value. **/ + if (arg->Flags & EXPR_F_NULL) + { + if (arg_expect->Flags & EXP_ARG_NOT_NULL) + { + mssErrorf(1, "EXP", + "%s(...): Expects a non-null value, but got NULL : %s (%d).", + fn_name, ci_TypeToStr(actual_datatype), actual_datatype + ); + return -1; + } + + /** Skip type checks unless forced. **/ + if (!(arg_expect->Flags & EXP_ARG_FORCE_TYPE)) goto skip_type_checks; + } + + /** No type checking required. **/ + if (arg_expect->Datatypes == NULL) goto skip_type_checks; + + /** No types given: Probably a mistake. **/ + if (arg_expect->Datatypes[0] == -1) + { + mssErrorf(1, "EXP", + "%s(...): Array of allowed Datatypes is empty.", + fn_name + ); + return -1; + } + + /** Verify Datatypes. 
**/ + bool found = false; + for (int j = 0; arg_expect->Datatypes[j] != -1; j++) + { + const int expected_datatype = arg_expect->Datatypes[j]; + if (expected_datatype == actual_datatype) + { + found = true; + break; + } + } + + /** Handle failure. **/ + if (!found) + { + /** Accumulate additional valid types. **/ + char buf[256] = {'\0'}; + int cur = 0, j = 1; + while (true) + { + int datatype = arg_expect->Datatypes[j++]; + if (datatype == -1) break; + + cur += snprintf( + buf + cur, 256 - cur, + " or %s (%d)", + ci_TypeToStr(datatype), datatype + ); + } + + /** Print error. **/ + int first_datatype = arg_expect->Datatypes[0]; + mssErrorf(1, "EXP", + "%s(...): Expects type %s (%d)%s but got type %s (%d).", + fn_name, ci_TypeToStr(first_datatype), first_datatype, buf, ci_TypeToStr(actual_datatype), actual_datatype + ); + return -1; + } + + skip_type_checks: + return 0; + } + +/*** Verify that arguments passed to a function match some expected values. + *** + *** @param fn_name The name of the function (for error messages). + *** @param arg_expects A pointer to an array of ArgExpect structs, each + *** representing expectations for a single argument, in the order they + *** are passed to the function. + *** @param num_args The number of arguments to expect to be passed to the + *** function (and the length of arg_expects). + *** @param tree The tree containing the actual arguments passed. + *** @param obj_list The object list scope which was passed to the function. + *** @returns 0 if all arguments are successfully verified, or + *** -1 if an error occurs or arguments are incorrect. + *** + *** @attention - Promises that an error message will be printed with a call + *** to mssError() if an error occurs. 
+ *** + *** Example: + *** ```c + *** char fn_name[] = "example"; + *** if (verify_schema(fn_name, + *** (ArgExpect[]){ + *** {(int[]){DATA_T_INTEGER, DATA_T_DOUBLE, -1}, EXP_ARG_NOT_NULL}, + *** {(int[]){DATA_T_STRING, -1}, 0} + *** }, 2, + *** tree, obj_list + *** ) != 0) + *** { + *** mssErrorf(0, "EXP", "%s(?) Call does not match function schema.", fn_name); + *** return -1; + *** } + *** ``` + ***/ +static int verify_schema( + const char* fn_name, + const ArgExpect* arg_expects, + const int num_args, + pExpression tree, + pParamObjects obj_list) + { + /** Verify object list and session. **/ + if (obj_list == NULL) + { + mssErrorf(1, "EXP", "%s(\?\?\?): No object list?", fn_name); + return -1; + } + ASSERTMAGIC(obj_list->Session, MGK_OBJSESSION); + + /** Verify expression tree. **/ + ASSERTMAGIC(tree, MGK_EXPRESSION); + + /** Verify argument count. **/ + const int num_args_actual = tree->Children.nItems; + if (num_args != num_args_actual) + { + mssErrorf(1, "EXP", + "%s(?): Expects %u argument%s, got %d argument%s.", + fn_name, num_args, (num_args == 1) ? "" : "s", num_args_actual, (num_args_actual == 1) ? "" : "s" + ); + return -1; + } + + /** Verify argument datatypes. **/ + for (int i = 0; i < num_args; i++) + { + if (verify_arg(fn_name, tree->Children.Items[i], &arg_expects[i]) != 0) + { + mssErrorf(0, "EXP", "%s(...): Error while reading arg #%d/%d.", fn_name, i + 1, num_args); + return -1; + } + } + + /** Pass. **/ + return 0; + } + + /****** Evaluator functions follow for expEvalFunction ******/ int exp_fn_getdate(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) From 13fd4b7017bcf3da971129d3d8e1a6b3f86d5fe3 Mon Sep 17 00:00:00 2001 From: Israel Date: Fri, 21 Nov 2025 14:38:24 -0700 Subject: [PATCH 25/30] Replace old schema verification with the new system. 
--- centrallix/expression/exp_functions.c | 94 ++++++++------------------- 1 file changed, 26 insertions(+), 68 deletions(-) diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index 2e98cfd7c..50e3ec745 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -4344,72 +4344,18 @@ int exp_fn_nth(pExpression tree, pParamObjects objlist, pExpression i0, pExpress return 0; } -static int exp_fn_verify_schema( - const char* fn_name, - const int* param_types, - const int num_params, - pExpression tree, - pParamObjects obj_list) - { - /** Verify object list and session. **/ - if (obj_list == NULL) - { - mssErrorf(1, "EXP", "%s(\?\?\?) no object list?", fn_name); - return -1; - } - ASSERTMAGIC(obj_list->Session, MGK_OBJSESSION); - - /** Verify expression tree. **/ - ASSERTMAGIC(tree, MGK_EXPRESSION); - - /** Verify parameter number. **/ - const int num_params_actual = tree->Children.nItems; - if (num_params != num_params_actual) - { - mssErrorf(1, "EXP", - "%s(?) expects %u param%s, got %d param%s.", - fn_name, num_params, (num_params > 1) ? "s" : "", num_params_actual, (num_params_actual > 1) ? "s" : "" - ); - return -1; - } - - /** Verify parameter datatypes. **/ - for (int i = 0; i < num_params; i++) - { - const pExpression arg = tree->Children.Items[i]; - ASSERTMAGIC(arg, MGK_EXPRESSION); - - /** Skip null values. **/ - if (arg->Flags & EXPR_F_NULL) continue; - - /** Extract datatypes. **/ - const int expected_datatype = param_types[i]; - const int actual_datatype = arg->DataType; - - /** Verify datatypes. **/ - if (expected_datatype != actual_datatype) - { - mssErrorf(1, "EXP", - "%s(...) param #%d/%d expects type %s (%d) but got type %s (%d).", - fn_name, i + 1, num_params, ci_TypeToStr(expected_datatype), expected_datatype, ci_TypeToStr(actual_datatype), actual_datatype - ); - return -1; - } - } - - /** Pass. 
**/ - return 0; - } - int exp_fn_metaphone(pExpression tree, pParamObjects obj_list) { const char fn_name[] = "metaphone"; /** Verify function schema. **/ - if (exp_fn_verify_schema(fn_name, (int[]){ DATA_T_STRING }, 1, tree, obj_list) != 0) + if (verify_schema(fn_name, + (ArgExpect[]){{(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}}, 1, + tree, obj_list + ) != 0) { - mssErrorf(0, "EXP", "%s(?) Call does not match function schema.", fn_name); + mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", fn_name); return -1; } @@ -4460,9 +4406,15 @@ int exp_fn_metaphone(pExpression tree, pParamObjects obj_list) static int exp_fn_compare(pExpression tree, pParamObjects obj_list, const char* fn_name) { /** Verify function schema. **/ - if (exp_fn_verify_schema(fn_name, (int[]){ DATA_T_STRING, DATA_T_STRING }, 2, tree, obj_list) != 0) - { - mssErrorf(0, "EXP", "%s(?) Call does not match function schema.", fn_name); + if (verify_schema(fn_name, + (ArgExpect[]){ + {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}, + {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS} + }, 2, + tree, obj_list + ) != 0) + { + mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", fn_name); return -1; } @@ -4489,7 +4441,7 @@ static int exp_fn_compare(pExpression tree, pParamObjects obj_list, const char* if (v1 == NULL || v2 == NULL) { mssErrorf(1, "EXP", - "%s(\"%s\", \"%s\") - Failed to build vectors.", + "%s(\"%s\", \"%s\"): Failed to build vectors.", fn_name, str1, str2 ); ret = -1; @@ -4512,7 +4464,7 @@ static int exp_fn_compare(pExpression tree, pParamObjects obj_list, const char* double lev_sim = check_double(ca_lev_compare(str1, str2)); if (isnan(lev_sim)) { - mssErrorf(1, "EXP", "%s(\"%s\", \"%s\") Failed to compute levenstein edit distance."); + mssErrorf(1, "EXP", "%s(\"%s\", \"%s\"): Failed to compute levenshtein edit distance.", fn_name, str1, str2); return -1; } @@ -4540,9 +4492,15 @@ int exp_fn_levenshtein(pExpression tree, pParamObjects obj_list) const char fn_name[] = "levenshtein"; /**
Verify function schema. **/ - if (exp_fn_verify_schema(fn_name, (int[]){ DATA_T_STRING, DATA_T_STRING }, 2, tree, obj_list) != 0) - { - mssErrorf(0, "EXP", "%s(?) Call does not match function schema.", fn_name); + if (verify_schema(fn_name, + (ArgExpect[]){ + {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}, + {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS} + }, 2, + tree, obj_list + ) != 0) + { + mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", fn_name); return -1; } @@ -4563,7 +4521,7 @@ int exp_fn_levenshtein(pExpression tree, pParamObjects obj_list) int edit_dist = ca_edit_dist(str1, str2, 0lu, 0lu); if (!check_neg(edit_dist)) { - mssErrorf(1, "EXP", "%s(\"%s\", \"%s\") Failed to compute edit distance.\n", fn_name, str1, str2); + mssErrorf(1, "EXP", "%s(\"%s\", \"%s\"): Failed to compute edit distance.\n", fn_name, str1, str2); return -1; } From e83c15f8255ea451e3310019400031576225ad12 Mon Sep 17 00:00:00 2001 From: Israel Date: Fri, 21 Nov 2025 14:41:00 -0700 Subject: [PATCH 26/30] Expand the new schema verification system with extra data validation features. --- centrallix/expression/exp_functions.c | 181 ++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index 50e3ec745..7f8ad875d 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -110,6 +110,13 @@ static char* ci_TypeToStr(const int type) *** Valid Flags: *** - `EXP_ARG_NOT_NULL`: Expect the arg to not be null. *** - `EXP_ARG_FORCE_TYPE`: Run type check on null args (not recommended). + *** - `EXP_ARG_NON_EMPTY`: Expect string to be non-empty. Expect a + *** stringvec or intvec to have elements (does not check them). + *** - `EXP_ARG_POSITIVE`: Expect a positive or zero value for int, double, + *** money, or datetime. (Includes NON_NAN: NAN is not positive). 
+ *** - `EXP_ARG_NEGATIVE`: Expect a negative or zero value for int, double, + *** money, or datetime. (Includes NON_NAN: NAN is not negative). + *** - `EXP_ARG_NON_NAN`: Expect a double to be a number, not NAN. *** *** @attention - Checks like `EXP_ARG_NON_EMPTY`, `EXP_ARG_NON_NAN`, etc. also *** succeed for `NULL` values. To avoid this, specify `EXP_ARG_NOT_NULL`. @@ -124,6 +131,10 @@ typedef struct #define EXP_ARG_NO_FLAGS (0) #define EXP_ARG_NOT_NULL (1 << 0) #define EXP_ARG_FORCE_TYPE (1 << 1) +#define EXP_ARG_NON_EMPTY (1 << 2) +#define EXP_ARG_NEGATIVE (1 << 3) +#define EXP_ARG_POSITIVE (1 << 4) +#define EXP_ARG_NON_NAN (1 << 5) /*** An internal function used by the schema verifier (below) to verify each *** argument of the schema. @@ -213,6 +224,150 @@ static int verify_arg(const char* fn_name, pExpression arg, const ArgExpect* arg } skip_type_checks: + /** All flag checks not implemented above should pass on NULL values. **/ + if (arg->Flags & EXPR_F_NULL) return 0; + + /** Verify other Flags by type, if specified. 
**/ + switch (actual_datatype) + { + case DATA_T_INTEGER: + { + int value = arg->Integer; + if (arg_expect->Flags & EXP_ARG_POSITIVE && value < 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects positive int but got %d.", + fn_name, value + ); + return -1; + } + if (arg_expect->Flags & EXP_ARG_NEGATIVE && value > 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects negative int but got %d.", + fn_name, value + ); + return -1; + } + break; + } + + case DATA_T_DOUBLE: + { + double value = arg->Types.Double; + if (arg_expect->Flags & EXP_ARG_NON_NAN && isnan(value)) + { + mssErrorf(1, "EXP", + "%s(...): Expects non-nan double but got %g.", + fn_name, value + ); + return -1; + } + if (arg_expect->Flags & EXP_ARG_POSITIVE && value < 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects positive double but got %g.", + fn_name, value + ); + return -1; + } + if (arg_expect->Flags & EXP_ARG_NEGATIVE && value > 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects negative double but got %g.", + fn_name, value + ); + return -1; + } + break; + } + + case DATA_T_STRING: + { + char* str = arg->String; + if (arg_expect->Flags & EXP_ARG_NON_EMPTY && str[0] == '\0') + { + mssErrorf(1, "EXP", + "%s(...): Expects string to contain characters, but got \"\".", + fn_name + ); + return -1; + } + break; + } + + case DATA_T_DATETIME: + { + pDateTime value = &arg->Types.Date; + if (arg_expect->Flags & EXP_ARG_POSITIVE && value->Value < 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects positive date offset but got %llu.", + fn_name, value->Value + ); + return -1; + } + if (arg_expect->Flags & EXP_ARG_NEGATIVE && value->Value > 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects negative date offset but got %llu.", + fn_name, value->Value + ); + return -1; + } + break; + } + + case DATA_T_MONEY: + { + pMoneyType value = &arg->Types.Money; + if (arg_expect->Flags & EXP_ARG_POSITIVE && value->WholePart < 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects positive money value but got $%d.%d.", + fn_name, 
value->WholePart, value->FractionPart + ); + return -1; + } + if (arg_expect->Flags & EXP_ARG_NEGATIVE && value->WholePart > 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects negative money value but got $%d.%d.", + fn_name, value->WholePart, value->FractionPart + ); + return -1; + } + break; + } + + case DATA_T_STRINGVEC: + { + pStringVec str_vec = &arg->Types.StrVec; + if (arg_expect->Flags & EXP_ARG_NON_EMPTY && str_vec->nStrings == 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects StringVec to contain strings, but got [].", + fn_name + ); + return -1; + } + break; + } + + case DATA_T_INTVEC: + { + pIntVec int_vec = &arg->Types.IntVec; + if (arg_expect->Flags & EXP_ARG_NON_EMPTY && int_vec->nIntegers == 0) + { + mssErrorf(1, "EXP", + "%s(...): Expects IntVec to contain integers, but got [].", + fn_name + ); + return -1; + } + break; + } + } + + return 0; + } @@ -291,6 +446,31 @@ static int verify_schema( return 0; } +int exp_fn_test(pExpression tree, pParamObjects obj_list) + { + char fn_name[] = "test"; + if (verify_schema(fn_name, + (ArgExpect[]){ + {(int[]){DATA_T_INTEGER, DATA_T_DOUBLE, -1}, EXP_ARG_NOT_NULL | EXP_ARG_NON_NAN}, + {(int[]){DATA_T_STRING, -1}, EXP_ARG_NOT_NULL | EXP_ARG_NON_EMPTY} + }, 2, + tree, obj_list + ) != 0) + { + mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", fn_name); + return -1; + } + + pExpression arg1 = tree->Children.Items[0]; + pExpression arg2 = tree->Children.Items[1]; + if (arg1->DataType == DATA_T_INTEGER) printf("Success: %d, '%s'.\n", arg1->Integer, arg2->String); + else printf("Success: %g, '%s'.\n", arg1->Types.Double, arg2->String); + + tree->DataType = DATA_T_INTEGER; + tree->Flags |= EXPR_F_NULL; + return 0; + } + /****** Evaluator functions follow for expEvalFunction ******/ @@ -4678,6 +4858,7 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "pbkdf2", (char*)exp_fn_pbkdf2); xhAdd(&EXP.Functions, "octet_length", (char*)exp_fn_octet_length); xhAdd(&EXP.Functions, "argon2id",(char*)exp_fn_argon2id); +
xhAdd(&EXP.Functions, "test", (char*)exp_fn_test); /** Dates. **/ xhAdd(&EXP.Functions, "getdate", (char*)exp_fn_getdate); From 070cfe3191265ee9c5546066799cfdcedf3572d0 Mon Sep 17 00:00:00 2001 From: Israel Date: Fri, 21 Nov 2025 16:04:14 -0700 Subject: [PATCH 27/30] Clean up, bug fixes, and naming convention updates. Remove test function. --- centrallix/expression/exp_functions.c | 54 +++++++-------------------- 1 file changed, 14 insertions(+), 40 deletions(-) diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index 7f8ad875d..08867f8af 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -139,7 +139,7 @@ typedef struct /*** An internal function used by the schema verifier (below) to verify each *** argument of the schema. ***/ -static int verify_arg(const char* fn_name, pExpression arg, const ArgExpect* arg_expect) +static int exp_fn_i_verify_arg(const char* fn_name, pExpression arg, const ArgExpect* arg_expect) { /** The expectation struct cannot be NULL. 
**/ if (arg_expect == NULL) @@ -251,7 +251,7 @@ static int verify_arg(const char* fn_name, pExpression arg, const ArgExpect* arg } break; } - + case DATA_T_DOUBLE: { double value = arg->Types.Double; @@ -281,7 +281,7 @@ static int verify_arg(const char* fn_name, pExpression arg, const ArgExpect* arg } break; } - + case DATA_T_STRING: { char* str = arg->String; @@ -295,7 +295,7 @@ static int verify_arg(const char* fn_name, pExpression arg, const ArgExpect* arg } break; } - + case DATA_T_DATETIME: { pDateTime value = &arg->Types.Date; @@ -317,15 +317,15 @@ static int verify_arg(const char* fn_name, pExpression arg, const ArgExpect* arg } break; } - + case DATA_T_MONEY: { pMoneyType value = &arg->Types.Money; if (arg_expect->Flags & EXP_ARG_POSITIVE && value->WholePart < 0) { mssErrorf(1, "EXP", - "%s(...): Expects positive money value but got $%d.%d.", - fn_name, value->WholePart, value->FractionPart + "%s(...): Expects positive money value but got $%d.%02d.", + fn_name, value->WholePart, value->FractionPart ); return -1; } @@ -333,7 +333,7 @@ static int verify_arg(const char* fn_name, pExpression arg, const ArgExpect* arg { mssErrorf(1, "EXP", "%s(...): Expects negative money value but got $%d.%d.", - fn_name, value->WholePart, value->FractionPart + fn_name, value->WholePart, value->FractionPart ); return -1; } @@ -390,7 +390,7 @@ static int verify_arg(const char* fn_name, pExpression arg, const ArgExpect* arg *** Example: *** ```c *** char fn_name[] = "example"; - *** if (verify_schema(fn_name, + *** if (exp_fn_i_verify_schema(fn_name, *** (ArgExpect[]){ *** {(int[]){DATA_T_INTEGER, DATA_T_DOUBLE, -1}, EXP_PARAM_NOT_NULL}, *** {(int[]){DATA_T_STRING, -1}, 0} @@ -403,7 +403,7 @@ static int verify_arg(const char* fn_name, pExpression arg, const ArgExpect* arg *** } *** ``` ***/ -static int verify_schema( +static int exp_fn_i_verify_schema( const char* fn_name, const ArgExpect* arg_expects, const int num_args, pExpression tree, pParamObjects obj_list) @@ -435,7 +435,7 @@ static
int verify_schema( /** Verify argument datatypes. **/ for (int i = 0; i < num_args; i++) { - if (verify_arg(fn_name, tree->Children.Items[i], &arg_expects[i]) != 0) + if (exp_fn_i_verify_arg(fn_name, tree->Children.Items[i], &arg_expects[i]) != 0) { mssErrorf(0, "EXP", "%s(...): Error while reading arg #%d/%d.", fn_name, i + 1, num_args); return -1; @@ -446,31 +446,6 @@ static int verify_schema( return 0; } -int exp_fn_test(pExpression tree, pParamObjects obj_list) - { - char fn_name[] = "test"; - if (verify_schema(fn_name, - (ArgExpect[]){ - {(int[]){DATA_T_INTEGER, DATA_T_DOUBLE, -1}, EXP_ARG_NOT_NULL | EXP_ARG_NON_NAN}, - {(int[]){DATA_T_STRING, -1}, EXP_ARG_NOT_NULL | EXP_ARG_NON_EMPTY} - }, 2, - tree, obj_list - ) != 0) - { - mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", fn_name); - return -1; - } - - pExpression arg1 = tree->Children.Items[0]; - pExpression arg2 = tree->Children.Items[1]; - if (arg1->DataType == DATA_T_INTEGER) printf("Success: %d, '%s'.\n", arg1->Integer, arg2->String); - else printf("Success: %g, '%s'.\n", arg1->Types.Double, arg2->String); - - tree->DataType = DATA_T_INTEGER; - tree->Flags |= EXPR_F_NULL; - return 0; - } - /****** Evaluator functions follow for expEvalFunction ******/ @@ -4530,7 +4505,7 @@ int exp_fn_metaphone(pExpression tree, pParamObjects obj_list) const char fn_name[] = "metaphone"; /** Verify function schema. **/ - if (verify_schema(fn_name, + if (exp_fn_i_verify_schema(fn_name, (ArgExpect[]){{(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}}, 1, tree, obj_list ) != 0) @@ -4586,7 +4561,7 @@ int exp_fn_metaphone(pExpression tree, pParamObjects obj_list) static int exp_fn_compare(pExpression tree, pParamObjects obj_list, const char* fn_name) { /** Verify function schema. 
**/ - if (verify_schema(fn_name, + if (exp_fn_i_verify_schema(fn_name, (ArgExpect[]){ {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}, {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS} @@ -4672,7 +4647,7 @@ int exp_fn_levenshtein(pExpression tree, pParamObjects obj_list) const char fn_name[] = "levenshtein"; /** Verify function schema. **/ - if (verify_schema(fn_name, + if (exp_fn_i_verify_schema(fn_name, (ArgExpect[]){ {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS}, {(int[]){DATA_T_STRING, -1}, EXP_ARG_NO_FLAGS} @@ -4858,7 +4833,6 @@ int exp_internal_DefineFunctions() xhAdd(&EXP.Functions, "pbkdf2", (char*)exp_fn_pbkdf2); xhAdd(&EXP.Functions, "octet_length", (char*)exp_fn_octet_length); xhAdd(&EXP.Functions, "argon2id",(char*)exp_fn_argon2id); - xhAdd(&EXP.Functions, "test", (char*)exp_fn_test); /** Dates. **/ xhAdd(&EXP.Functions, "getdate", (char*)exp_fn_getdate); From 8795aaf1b89c934ca6ec05797e559077b7551e50 Mon Sep 17 00:00:00 2001 From: Israel Date: Fri, 21 Nov 2025 17:44:12 -0700 Subject: [PATCH 28/30] Add tests for log and power functions. 
--- centrallix/tests/test_expfn_log_00.cmp | 34 ++++++++++++++++++++ centrallix/tests/test_expfn_log_00.to | 44 ++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 centrallix/tests/test_expfn_log_00.cmp create mode 100644 centrallix/tests/test_expfn_log_00.to diff --git a/centrallix/tests/test_expfn_log_00.cmp b/centrallix/tests/test_expfn_log_00.cmp new file mode 100644 index 000000000..c072b68f2 --- /dev/null +++ b/centrallix/tests/test_expfn_log_00.cmp @@ -0,0 +1,34 @@ +Attribute [ln(1)]: double 0.0 +Attribute [ln(e)]: double 1.0 +Attribute [ln(0)]: double -inf.0 +Attribute [ln(-1)]: double nan.0 +Attribute [ln(10)]: double 2.30258509 +Attribute [ln(1.5)]: double 0.40546511 +Attribute [ln(1e-10)]: integer 1 +Attribute [ln(1e10)+]: integer 1 +Attribute [ln(1e10)-]: integer 1 +Attribute [log10(1)]: double 0.0 +Attribute [log10(10)]: double 1.0 +Attribute [log10(0)]: double -inf.0 +Attribute [log10(-10)]: double nan.0 +Attribute [log10(100)]: double 2.0 +Attribute [log10(0.01)]: double -2.0 +Attribute [log10(1.234)]: double 0.09131516 +Attribute [log10(1e-10)]: double -10.0 +Attribute [log10(1e10)]: double 10.0 +Attribute [logn(8, 2)]: double 3.0 +Attribute [logn(1000, 10)]: double 3.0 +Attribute [logn(10, 0)]: double -0.0 +Attribute [logn(10, 1)]: double inf.0 +Attribute [logn(8, -2)]: double nan.0 +Attribute [logn(0, 2)]: double -inf.0 +Attribute [logn(-8, 2)]: double nan.0 +Attribute [logn(1, 2)]: double 0.0 +Attribute [logn(1e10, 10)]: double 10.0 +Attribute [logn(8, 0.5)]: double -3.0 +Attribute [ln(2.718281828)]: double 1.0 +Attribute [log10(3.14159)]: double 0.49714951 +Attribute [logn(10, 1.1)]: double 0.04139269 +Attribute [logn(1.1, 10)]: double 24.15885793 +Attribute [logn(10, 0.001)]: double -3.0 +Attribute [logn(0.1, 1000)]: double -3.0 diff --git a/centrallix/tests/test_expfn_log_00.to b/centrallix/tests/test_expfn_log_00.to new file mode 100644 index 000000000..e454e4003 --- /dev/null +++ 
b/centrallix/tests/test_expfn_log_00.to @@ -0,0 +1,44 @@ +##NAME log() functions + +# Natural Log: ln(x) +query select 'ln(1)' = ln(1) -- Expect 0. +query select 'ln(e)' = ln(2.718281828459045) -- Expect 1. +query select 'ln(0)' = ln(0) -- Expect -inf (log approaches infinity). +query select 'ln(-1)' = ln(-1) -- Expect NaN (log undefined for negative). +query select 'ln(10)' = round(ln(10), 8) -- Expect ~2.30258509. +query select 'ln(1.5)' = round(ln(1.5), 8) -- Expect ~0.40546511. +query select 'ln(1e-10)' = ln(0.0000000001) < 0.0000000001 -- Expect true (value is very small). +query select 'ln(1e10)+' = ln(10000000000.0) > 23.0 -- Expect true (value is ~23.02585). +query select 'ln(1e10)-' = ln(10000000000.0) < 23.1 -- Expect true (value is ~23.02585). + +# Log base 10: log10(x) +query select 'log10(1)' = log10(1) -- Expect 0. +query select 'log10(10)' = log10(10) -- Expect 1. +query select 'log10(0)' = log10(0) -- Expect -inf. +query select 'log10(-10)' = log10(-10) -- Expect NaN. +query select 'log10(100)' = log10(100) -- Expect 2. +query select 'log10(0.01)' = log10(0.01) -- Expect -2. +query select 'log10(1.234)' = round(log10(1.234), 8) -- Expect ~0.091315. +query select 'log10(1e-10)' = log10(0.0000000001) -- Expect ~-10. +query select 'log10(1e10)' = log10(10000000000.0) -- Expect ~10. + +# General base n of x: logn(x, n) +# Edge cases: base <= 0 or base == 1 (invalid), x <= 0 (invalid) +query select 'logn(8, 2)' = logn(8, 2) -- Expect 3. +query select 'logn(1000, 10)' = logn(1000, 10) -- Expect 3. +query select 'logn(10, 0)' = logn(10, 0) -- Expect -0.0 (base 0 is undefined). +query select 'logn(10, 1)' = logn(10, 1) -- Expect inf (base 1 is undefined). +query select 'logn(8, -2)' = logn(8, -2) -- Expect NaN (negative base). +query select 'logn(0, 2)' = logn(0, 2) -- Expect -inf (x=0). +query select 'logn(-8, 2)' = logn(-8, 2) -- Expect NaN or error (x negative). +query select 'logn(1, 2)' = logn(1, 2) -- Expect 0. 
+query select 'logn(1e10, 10)' = logn(10000000000.0, 10) -- Expect 10. +query select 'logn(8, 0.5)' = logn(8, 0.5) -- Expect negative value. + +-- Additional double/int mixed cases +query select 'ln(2.718281828)' = round(ln(2.718281828), 8) -- Expect ~1 (close to e). +query select 'log10(3.14159)' = round(log10(3.14159), 8) -- Expect ~0.49715. +query select 'logn(10, 1.1)' = round(logn(1.1, 10), 8) -- Expect 0.04139269. +query select 'logn(1.1, 10)' = round(logn(10, 1.1), 8) -- Expect 24.15885793. +query select 'logn(10, 0.001)' = round(logn(0.001, 10), 8) -- Expect -3 (log base 10 of 0.001). +query select 'logn(0.1, 1000)' = round(logn(1000, 0.1), 8) -- Expect -3 (log base 0.1 of 1000). From 2e948d8028f8c88aa933d17c172a4b089c5795b3 Mon Sep 17 00:00:00 2001 From: Israel Date: Fri, 21 Nov 2025 17:44:53 -0700 Subject: [PATCH 29/30] Add exp_fn_i_get_number(). --- centrallix/expression/exp_functions.c | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index 08867f8af..e8d67976d 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -446,6 +446,47 @@ static int exp_fn_i_verify_schema( return 0; } +/*** Extract a number from a numeric expression. + *** + *** @param numeric_expr The numeric expression to be extracted. + *** @param result_ptr A pointer to a double where the result is stored. + *** @returns 0 on success, + *** -1 on failure, + *** 1 if the expression is NULL. + ***/ +static int exp_fn_i_get_number(pExpression numeric_expr, double* result_ptr) + { + /** Check for null values. **/ + if (numeric_expr == NULL || numeric_expr->Flags & EXPR_F_NULL) return 1; + + /** Check for null destination. **/ + if (result_ptr == NULL) + { + mssError(1, "EXP", "Null location provided to store numeric result."); + return -1; + } + + /** Get the numeric value.
**/ + double n; + switch(numeric_expr->DataType) + { + case DATA_T_INTEGER: n = numeric_expr->Integer; break; + case DATA_T_DOUBLE: n = numeric_expr->Types.Double; break; + case DATA_T_MONEY: n = objDataToDouble(DATA_T_MONEY, &(numeric_expr->Types.Money)); break; + default: + mssError(1, "EXP", + "%s (%d) is not a numeric type.", + ci_TypeToStr(numeric_expr->DataType), numeric_expr->DataType + ); + return -1; + } + + /** Store the result. **/ + *result_ptr = n; + + return 0; + } + /****** Evaluator functions follow for expEvalFunction ******/ From 4c347be808bf95a80fa7ee8e5346cf95900ab36a Mon Sep 17 00:00:00 2001 From: Israel Date: Fri, 21 Nov 2025 17:45:44 -0700 Subject: [PATCH 30/30] Add exp_fn_i_do_math() to bring the power of schema verification to logarithm and power functions. --- centrallix/expression/exp_functions.c | 223 ++++++-------------------- 1 file changed, 52 insertions(+), 171 deletions(-) diff --git a/centrallix/expression/exp_functions.c b/centrallix/expression/exp_functions.c index e8d67976d..208bd48f8 100644 --- a/centrallix/expression/exp_functions.c +++ b/centrallix/expression/exp_functions.c @@ -3691,194 +3691,75 @@ int exp_fn_from_base64(pExpression tree, pParamObjects objlist, pExpression i0, return -1; } - -int exp_fn_log10(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) +static int exp_fn_i_do_math(pExpression tree, pParamObjects obj_list, const char* fn_name, double (*math)(), int arg_num) { - double n; - - if (!i0) - { - mssError(1, "EXP", "log10() requires a number as its first parameter"); - goto error; - } - if (i0->Flags & EXPR_F_NULL) - { - tree->DataType = DATA_T_DOUBLE; - tree->Flags |= EXPR_F_NULL; - return 0; - } - switch(i0->DataType) - { - case DATA_T_INTEGER: - n = i0->Integer; - break; - case DATA_T_DOUBLE: - n = i0->Types.Double; - break; - case DATA_T_MONEY: - n = objDataToDouble(DATA_T_MONEY, &(i0->Types.Money)); - break; - default: - mssError(1, "EXP", "log10() requires a number 
as its first parameter"); - goto error; - } - if (n < 0) - { - mssError(1, "EXP", "log10(): cannot compute the logarithm of a negative number"); - goto error; - } - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = log10(n); - return 0; - - error: + /** Verify function schema: expect arg_num numeric values. **/ + ArgExpect expects[arg_num]; + for (int i = 0; i < arg_num; i++) + expects[i] = (ArgExpect){(int[]){DATA_T_INTEGER, DATA_T_DOUBLE, DATA_T_MONEY, -1}, EXP_ARG_NO_FLAGS}; + if (exp_fn_i_verify_schema(fn_name, expects, arg_num, tree, obj_list) != 0) + { + mssErrorf(0, "EXP", "%s(?): Call does not match function schema.", fn_name); return -1; - } - - -int exp_fn_log_natural(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) - { - double n; - - if (!i0) - { - mssError(1, "EXP", "ln() requires a number as its first parameter"); - goto error; - } - if (i0->Flags & EXPR_F_NULL) + } + + /** Null checks. **/ + for (int i = 0; i < arg_num; i++) + { + pExpression arg = tree->Children.Items[i]; + if (arg->Flags & EXPR_F_NULL) { tree->DataType = DATA_T_DOUBLE; tree->Flags |= EXPR_F_NULL; return 0; } - switch(i0->DataType) - { - case DATA_T_INTEGER: - n = i0->Integer; - break; - case DATA_T_DOUBLE: - n = i0->Types.Double; - break; - case DATA_T_MONEY: - n = objDataToDouble(DATA_T_MONEY, &(i0->Types.Money)); - break; - default: - mssError(1, "EXP", "ln() requires a number as its first parameter"); - goto error; - } - if (n < 0) - { - mssError(1, "EXP", "ln(): cannot compute the logarithm of a negative number"); - goto error; - } - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = log(n); - return 0; + } - error: + /** Maximum supported args. **/ + if (arg_num > 4) + { + mssErrorf(1, "EXP", "%s(...): exp_fn_i_do_math() does not support functions with more than 4 arguments. 
If this is an issue, please increase the number of arguments here: %s:%d", fn_name, __FILE__, __LINE__); return -1; - } - - -int exp_fn_log_base_n(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) - { - double n, p; - - if (!i0 || !i1) - { - mssError(1, "EXP", "logn() requires numbers as its first and second parameters"); - goto error; - } - if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL)) - { - tree->DataType = DATA_T_DOUBLE; - tree->Flags |= EXPR_F_NULL; - return 0; - } - switch(i0->DataType) - { - case DATA_T_INTEGER: - n = i0->Integer; - break; - case DATA_T_DOUBLE: - n = i0->Types.Double; - break; - case DATA_T_MONEY: - n = objDataToDouble(DATA_T_MONEY, &(i0->Types.Money)); - break; - default: - mssError(1, "EXP", "logn() requires a number as its first parameter"); - goto error; - } - switch(i1->DataType) + } + + /** Get the numbers for the args. **/ + double n[4]; + for (int i = 0; i < arg_num; i++) + { + if (!check(exp_fn_i_get_number(tree->Children.Items[i], &(n[i])))) { - case DATA_T_INTEGER: - p = i1->Integer; - break; - case DATA_T_DOUBLE: - p = i1->Types.Double; - break; - default: - mssError(1, "EXP", "logn() requires an integer or double as its second parameter"); - goto error; + mssErrorf(0, "EXP", "%s(...): Failed to get arg%d.", fn_name, i); + return -1; } - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = log(n) / log(p); - return 0; + } - error: - return -1; + tree->DataType = DATA_T_DOUBLE; + tree->Types.Double = math(n[0], n[1], n[2], n[3]); /* Call function with max supported args. 
*/ + return 0; } +int exp_fn_log_natural(pExpression tree, pParamObjects obj_list) + { + return exp_fn_i_do_math(tree, obj_list, "ln", log, 1); + } +int exp_fn_log10(pExpression tree, pParamObjects obj_list) + { + return exp_fn_i_do_math(tree, obj_list, "log10", log10, 1); + } -int exp_fn_power(pExpression tree, pParamObjects objlist, pExpression i0, pExpression i1, pExpression i2) +/** This is why we need lambdas in C. **/ +double exp_fn_i_log_base_n(double x, double base) { - double n, p; + return log(x) / log(base); + } - if (!i0 || !i1) - { - mssError(1, "EXP", "power() requires numbers as its first and second parameters"); - goto error; - } - if ((i0->Flags & EXPR_F_NULL) || (i1->Flags & EXPR_F_NULL)) - { - tree->DataType = DATA_T_DOUBLE; - tree->Flags |= EXPR_F_NULL; - return 0; - } - switch(i0->DataType) - { - case DATA_T_INTEGER: - n = i0->Integer; - break; - case DATA_T_DOUBLE: - n = i0->Types.Double; - break; - case DATA_T_MONEY: - n = objDataToDouble(DATA_T_MONEY, &(i0->Types.Money)); - break; - default: - mssError(1, "EXP", "power() requires a number as its first parameter"); - goto error; - } - switch(i1->DataType) - { - case DATA_T_INTEGER: - p = i1->Integer; - break; - case DATA_T_DOUBLE: - p = i1->Types.Double; - break; - default: - mssError(1, "EXP", "power() requires an integer or double as its second parameter"); - goto error; - } - tree->DataType = DATA_T_DOUBLE; - tree->Types.Double = pow(n, p); - return 0; - - error: - return -1; +int exp_fn_log_base_n(pExpression tree, pParamObjects obj_list) + { + return exp_fn_i_do_math(tree, obj_list, "logn", exp_fn_i_log_base_n, 2); + } +int exp_fn_power(pExpression tree, pParamObjects obj_list) + { + return exp_fn_i_do_math(tree, obj_list, "power", pow, 2); }