From d5cfd0819eebb4faadaaae458de527d1d3be7c4c Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Wed, 19 Nov 2025 18:12:58 +0300 Subject: [PATCH 01/28] utf8-utils.cpp: constexpr std::array for constants --- common/unicode/utf8-utils.cpp | 328 +++++++++++++++++----------------- 1 file changed, 165 insertions(+), 163 deletions(-) diff --git a/common/unicode/utf8-utils.cpp b/common/unicode/utf8-utils.cpp index fbf65bee80..2c67a5f5a9 100644 --- a/common/unicode/utf8-utils.cpp +++ b/common/unicode/utf8-utils.cpp @@ -1,9 +1,10 @@ // Compiler for PHP (aka KPHP) -// Copyright (c) 2020 LLC «V Kontakte» +// Copyright (c) 2025 LLC «V Kontakte» // Distributed under the GPL v3 License, see LICENSE.notice.txt #include "common/unicode/utf8-utils.h" +#include #include #include #include @@ -991,164 +992,165 @@ int simplify_character(int c) { } } -const int _s_1__[] = {97, 0}; -const int _v_1__[] = {1072, 0}; -const int _s_2__[] = {98, 0}; -const int _v_2__[] = {1073, 0}; -const int _s_3__[] = {99, 0}; -const int _v_3__[] = {1082, 0}; -const int _s_4__[] = {99, 104, 0}; -const int _v_4__[] = {1095, 0}; -const int _s_5__[] = {100, 0}; -const int _v_5__[] = {1076, 0}; -const int _s_6__[] = {101, 0}; -const int _v_6__[] = {1077, 0}; -const int _s_7__[] = {101, 105, 0}; -const int _v_7__[] = {1077, 1081, 0}; -const int _s_8__[] = {101, 121, 0}; -const int _v_8__[] = {1077, 1081, 0}; -const int _s_9__[] = {102, 0}; -const int _v_9__[] = {1092, 0}; -const int _s_10__[] = {103, 0}; -const int _v_10__[] = {1075, 0}; -const int _s_11__[] = {104, 0}; -const int _v_11__[] = {1093, 0}; -const int _s_12__[] = {105, 0}; -const int _v_12__[] = {1080, 0}; -const int _s_13__[] = {105, 97, 0}; -const int _v_13__[] = {1080, 1103, 0}; -const int _s_14__[] = {105, 121, 0}; -const int _v_14__[] = {1080, 1081, 0}; -const int _s_15__[] = {106, 0}; -const int _v_15__[] = {1081, 0}; -const int _s_16__[] = {106, 111, 0}; -const int _v_16__[] = {1077, 0}; -const int _s_17__[] = {106, 117, 0}; -const int _v_17__[] = {1102, 0}; -const int _s_18__[] = {106, 97, 0}; -const int _v_18__[] = {1103, 0}; -const int _s_19__[] = {107, 0}; -const int _v_19__[] = {1082, 0}; -const int _s_20__[] = {107, 104, 0}; -const int _v_20__[] = {1093, 0}; -const int _s_21__[] = {108, 0}; -const int _v_21__[] = {1083, 0}; -const int _s_22__[] = {109, 0}; -const int _v_22__[] = {1084, 0}; -const int _s_23__[] = {110, 0}; -const int _v_23__[] = {1085, 0}; -const int _s_24__[] = {111, 0}; -const int _v_24__[] = {1086, 0}; -const int _s_25__[] = {112, 0}; -const int _v_25__[] = {1087, 0}; -const int _s_26__[] = {113, 0}; -const int _v_26__[] = {1082, 0}; -const int _s_27__[] = {114, 0}; -const int _v_27__[] = {1088, 0}; -const int _s_28__[] = {115, 0}; -const int _v_28__[] = {1089, 0}; -const int _s_29__[] = {115, 104, 0}; -const int _v_29__[] = {1096, 0}; -const int _s_30__[] = {115, 104, 99, 104, 0}; -const int _v_30__[] = {1097, 0}; -const int _s_31__[] = {115, 99, 104, 0}; -const int _v_31__[] = {1097, 0}; -const int _s_32__[] = {116, 0}; -const int _v_32__[] = {1090, 0}; -const int _s_33__[] = {116, 115, 0}; -const int _v_33__[] = {1094, 0}; -const int _s_34__[] = {117, 0}; -const int _v_34__[] = {1091, 0}; -const int _s_35__[] = {118, 0}; -const int _v_35__[] = {1074, 0}; -const int _s_36__[] = {119, 0}; -const int _v_36__[] = {1074, 0}; -const int _s_37__[] = {120, 0}; -const int _v_37__[] = {1082, 1089, 0}; -const int _s_38__[] = {121, 0}; -const int _v_38__[] = {1080, 0}; -const int _s_39__[] = {121, 111, 0}; -const int _v_39__[] = {1077, 0}; -const int _s_40__[] = {121, 117, 0}; -const int _v_40__[] = {1102, 0}; -const int _s_41__[] = {121, 97, 0}; -const int _v_41__[] = {1103, 0}; -const int _s_42__[] = {122, 0}; -const int _v_42__[] = {1079, 0}; -const int _s_43__[] = {122, 104, 0}; -const int _v_43__[] = {1078, 0}; -const int _s_44__[] = {1072, 0}; -const int _v_44__[] = {97, 0}; -const int _s_45__[] = {1073, 0}; -const int _v_45__[] = {98, 0}; -const int _s_46__[] = {1074, 0}; -const int _v_46__[] = {118, 0}; -const int _s_47__[] = {1075, 0}; -const int _v_47__[] = {103, 0}; -const int _s_48__[] = {1076, 0}; -const int _v_48__[] = {100, 0}; -const int _s_49__[] = {1077, 0}; -const int _v_49__[] = {101, 0}; -const int _s_50__[] = {1105, 0}; -const int _v_50__[] = {101, 0}; -const int _s_51__[] = {1078, 0}; -const int _v_51__[] = {122, 104, 0}; -const int _s_52__[] = {1079, 0}; -const int _v_52__[] = {122, 0}; -const int _s_53__[] = {1080, 0}; -const int _v_53__[] = {105, 0}; -const int _s_54__[] = {1080, 1081, 0}; -const int _v_54__[] = {121, 0}; -const int _s_55__[] = {1080, 1103, 0}; -const int _v_55__[] = {105, 97, 0}; -const int _s_56__[] = {1081, 0}; -const int _v_56__[] = {121, 0}; -const int _s_57__[] = {1082, 0}; -const int _v_57__[] = {107, 0}; -const int _s_58__[] = {1082, 1089, 0}; -const int _v_58__[] = {120, 0}; -const int _s_59__[] = {1083, 0}; -const int _v_59__[] = {108, 0}; -const int _s_60__[] = {1084, 0}; -const int _v_60__[] = {109, 0}; -const int _s_61__[] = {1085, 0}; -const int _v_61__[] = {110, 0}; -const int _s_62__[] = {1086, 0}; -const int _v_62__[] = {111, 0}; -const int _s_63__[] = {1087, 0}; -const int _v_63__[] = {112, 0}; -const int _s_64__[] = {1088, 0}; -const int _v_64__[] = {114, 0}; -const int _s_65__[] = {1089, 0}; -const int _v_65__[] = {115, 0}; -const int _s_66__[] = {1090, 0}; -const int _v_66__[] = {116, 0}; -const int _s_67__[] = {1091, 0}; -const int _v_67__[] = {117, 0}; -const int _s_68__[] = {1092, 0}; -const int _v_68__[] = {102, 0}; -const int _s_69__[] = {1093, 0}; -const int _v_69__[] = {107, 104, 0}; -const int _s_70__[] = {1094, 0}; -const int _v_70__[] = {116, 115, 0}; -const int _s_71__[] = {1095, 0}; -const int _v_71__[] = {99, 104, 0}; -const int _s_72__[] = {1096, 0}; -const int _v_72__[] = {115, 104, 0}; -const int _s_73__[] = {1097, 0}; -const int _v_73__[] = {115, 104, 99, 104, 0}; -const int _s_74__[] = {1098, 0}; -const int _v_74__[] = {0}; -const int _s_75__[] = {1099, 0}; -const int _v_75__[] = {121, 0}; -const int _s_76__[] = {1100, 0}; -const int _v_76__[] = {0}; -const int _s_77__[] = {1101, 0}; -const int _v_77__[] = {101, 0}; -const int _s_78__[] = {1102, 0}; -const int _v_78__[] = {121, 117, 0}; -const int _s_79__[] = {1103, 0}; -const int _v_79__[] = {121, 97, 0}; +// TODO does constexpr std::array enough for safe use in runtime-light ? +constexpr std::array _s_1__{97, 0}; +constexpr std::array _v_1__{1072, 0}; +constexpr std::array _s_2__{98, 0}; +constexpr std::array _v_2__{1073, 0}; +constexpr std::array _s_3__{99, 0}; +constexpr std::array _v_3__{1082, 0}; +constexpr std::array _s_4__{99, 104, 0}; +constexpr std::array _v_4__{1095, 0}; +constexpr std::array _s_5__{100, 0}; +constexpr std::array _v_5__{1076, 0}; +constexpr std::array _s_6__{101, 0}; +constexpr std::array _v_6__{1077, 0}; +constexpr std::array _s_7__{101, 105, 0}; +constexpr std::array _v_7__{1077, 1081, 0}; +constexpr std::array _s_8__{101, 121, 0}; +constexpr std::array _v_8__{1077, 1081, 0}; +constexpr std::array _s_9__{102, 0}; +constexpr std::array _v_9__{1092, 0}; +constexpr std::array _s_10__{103, 0}; +constexpr std::array _v_10__{1075, 0}; +constexpr std::array _s_11__{104, 0}; +constexpr std::array _v_11__{1093, 0}; +constexpr std::array _s_12__{105, 0}; +constexpr std::array _v_12__{1080, 0}; +constexpr std::array _s_13__{105, 97, 0}; +constexpr std::array _v_13__{1080, 1103, 0}; +constexpr std::array _s_14__{105, 121, 0}; +constexpr std::array _v_14__{1080, 1081, 0}; +constexpr std::array _s_15__{106, 0}; +constexpr std::array _v_15__{1081, 0}; +constexpr std::array _s_16__{106, 111, 0}; +constexpr std::array _v_16__{1077, 0}; +constexpr std::array _s_17__{106, 117, 0}; +constexpr std::array _v_17__{1102, 0}; +constexpr std::array _s_18__{106, 97, 0}; +constexpr std::array _v_18__{1103, 0}; +constexpr std::array _s_19__{107, 0}; +constexpr std::array _v_19__{1082, 0}; +constexpr std::array _s_20__{107, 104, 0}; +constexpr std::array _v_20__{1093, 0}; +constexpr std::array _s_21__{108, 0}; +constexpr std::array _v_21__{1083, 0}; +constexpr std::array _s_22__{109, 0}; +constexpr std::array _v_22__{1084, 0}; +constexpr std::array _s_23__{110, 0}; +constexpr std::array _v_23__{1085, 0}; +constexpr std::array _s_24__{111, 0}; +constexpr std::array _v_24__{1086, 0}; +constexpr std::array _s_25__{112, 0}; +constexpr std::array _v_25__{1087, 0}; +constexpr std::array _s_26__{113, 0}; +constexpr std::array _v_26__{1082, 0}; +constexpr std::array _s_27__{114, 0}; +constexpr std::array _v_27__{1088, 0}; +constexpr std::array _s_28__{115, 0}; +constexpr std::array _v_28__{1089, 0}; +constexpr std::array _s_29__{115, 104, 0}; +constexpr std::array _v_29__{1096, 0}; +constexpr std::array _s_30__{115, 104, 99, 104, 0}; +constexpr std::array _v_30__{1097, 0}; +constexpr std::array _s_31__{115, 99, 104, 0}; +constexpr std::array _v_31__{1097, 0}; +constexpr std::array _s_32__{116, 0}; +constexpr std::array _v_32__{1090, 0}; +constexpr std::array _s_33__{116, 115, 0}; +constexpr std::array _v_33__{1094, 0}; +constexpr std::array _s_34__{117, 0}; +constexpr std::array _v_34__{1091, 0}; +constexpr std::array _s_35__{118, 0}; +constexpr std::array _v_35__{1074, 0}; +constexpr std::array _s_36__{119, 0}; +constexpr std::array _v_36__{1074, 0}; +constexpr std::array _s_37__{120, 0}; +constexpr std::array _v_37__{1082, 1089, 0}; +constexpr std::array _s_38__{121, 0}; +constexpr std::array _v_38__{1080, 0}; +constexpr std::array _s_39__{121, 111, 0}; +constexpr std::array _v_39__{1077, 0}; +constexpr std::array _s_40__{121, 117, 0}; +constexpr std::array _v_40__{1102, 0}; +constexpr std::array _s_41__{121, 97, 0}; +constexpr std::array _v_41__{1103, 0}; +constexpr std::array _s_42__{122, 0}; +constexpr std::array _v_42__{1079, 0}; +constexpr std::array _s_43__{122, 104, 0}; +constexpr std::array _v_43__{1078, 0}; +constexpr std::array _s_44__{1072, 0}; +constexpr std::array _v_44__{97, 0}; +constexpr std::array _s_45__{1073, 0}; +constexpr std::array _v_45__{98, 0}; +constexpr std::array _s_46__{1074, 0}; +constexpr std::array _v_46__{118, 0}; +constexpr std::array _s_47__{1075, 0}; +constexpr std::array _v_47__{103, 0}; +constexpr std::array _s_48__{1076, 0}; +constexpr std::array _v_48__{100, 0}; +constexpr std::array _s_49__{1077, 0}; +constexpr std::array _v_49__{101, 0}; +constexpr std::array _s_50__{1105, 0}; +constexpr std::array _v_50__{101, 0}; +constexpr std::array _s_51__{1078, 0}; +constexpr std::array _v_51__{122, 104, 0}; +constexpr std::array _s_52__{1079, 0}; +constexpr std::array _v_52__{122, 0}; +constexpr std::array _s_53__{1080, 0}; +constexpr std::array _v_53__{105, 0}; +constexpr std::array _s_54__{1080, 1081, 0}; +constexpr std::array _v_54__{121, 0}; +constexpr std::array _s_55__{1080, 1103, 0}; +constexpr std::array _v_55__{105, 97, 0}; +constexpr std::array _s_56__{1081, 0}; +constexpr std::array _v_56__{121, 0}; +constexpr std::array _s_57__{1082, 0}; +constexpr std::array _v_57__{107, 0}; +constexpr std::array _s_58__{1082, 1089, 0}; +constexpr std::array _v_58__{120, 0}; +constexpr std::array _s_59__{1083, 0}; +constexpr std::array _v_59__{108, 0}; +constexpr std::array _s_60__{1084, 0}; +constexpr std::array _v_60__{109, 0}; +constexpr std::array _s_61__{1085, 0}; +constexpr std::array _v_61__{110, 0}; +constexpr std::array _s_62__{1086, 0}; +constexpr std::array _v_62__{111, 0}; +constexpr std::array _s_63__{1087, 0}; +constexpr std::array _v_63__{112, 0}; +constexpr std::array _s_64__{1088, 0}; +constexpr std::array _v_64__{114, 0}; +constexpr std::array _s_65__{1089, 0}; +constexpr std::array _v_65__{115, 0}; +constexpr std::array _s_66__{1090, 0}; +constexpr std::array _v_66__{116, 0}; +constexpr std::array _s_67__{1091, 0}; +constexpr std::array _v_67__{117, 0}; +constexpr std::array _s_68__{1092, 0}; +constexpr std::array _v_68__{102, 0}; +constexpr std::array _s_69__{1093, 0}; +constexpr std::array _v_69__{107, 104, 0}; +constexpr std::array _s_70__{1094, 0}; +constexpr std::array _v_70__{116, 115, 0}; +constexpr std::array _s_71__{1095, 0}; +constexpr std::array _v_71__{99, 104, 0}; +constexpr std::array _s_72__{1096, 0}; +constexpr std::array _v_72__{115, 104, 0}; +constexpr std::array _s_73__{1097, 0}; +constexpr std::array _v_73__{115, 104, 99, 104, 0}; +constexpr std::array _s_74__{1098, 0}; +constexpr std::array _v_74__{0}; +constexpr std::array _s_75__{1099, 0}; +constexpr std::array _v_75__{121, 0}; +constexpr std::array _s_76__{1100, 0}; +constexpr std::array _v_76__{0}; +constexpr std::array _s_77__{1101, 0}; +constexpr std::array _v_77__{101, 0}; +constexpr std::array _s_78__{1102, 0}; +constexpr std::array _v_78__{121, 117, 0}; +constexpr std::array _s_79__{1103, 0}; +constexpr std::array _v_79__{121, 97, 0}; int translit_string_utf8_from_en_to_ru(int* input, int* output) { @@ -1158,8 +1160,8 @@ int translit_string_utf8_from_en_to_ru(int* input, int* output) { k++; \ } \ if (!s[k]) { \ - match_v = v; \ - match_s = s; \ + match_v = v.data(); \ + match_s = s.data(); \ } int i = 0, j = 0, k = 0; @@ -1340,8 +1342,8 @@ int translit_string_utf8_from_ru_to_en(int* input, int* output) { k++; \ } \ if (!s[k]) { \ - match_v = v; \ - match_s = s; \ + match_v = v.data(); \ + match_s = s.data(); \ } int i = 0, j = 0, k = 0; From 03786a3a672b254bde0ad9b051495dfe0c334be3 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Fri, 21 Nov 2025 16:36:19 +0300 Subject: [PATCH 02/28] implemented core of f$prepare_search_query --- .../stdlib/string/string_functions.h | 237 ++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 runtime-light/stdlib/string/string_functions.h diff --git a/runtime-light/stdlib/string/string_functions.h b/runtime-light/stdlib/string/string_functions.h new file mode 100644 index 0000000000..7c6022ca93 --- /dev/null +++ b/runtime-light/stdlib/string/string_functions.h @@ -0,0 +1,237 @@ +// Compiler for PHP (aka KPHP) +// Copyright (c) 2025 LLC «V Kontakte» +// Distributed under the GPL v3 License, see LICENSE.notice.txt + +#pragma once + +#include +#include +#include +#include +#include + +#include "auto/common/unicode-utils-auto.h" +#include "common/unicode/utf8-utils.h" +#include "runtime-common/core/runtime-core.h" + +namespace string_functions_impl_ { + +// TODO May be better extract MAX_NAME_SIZE to utf8-utils.h instead of copy-pasting it ? +inline constexpr size_t MAX_NAME_SIZE{65536}; +inline constexpr size_t MAX_BYTES_SPAN_SIZE{MAX_NAME_SIZE * 4 + 4}; +inline constexpr size_t MAX_CODE_POINTS_SPAN_SIZE{MAX_NAME_SIZE + 4}; + +inline constexpr int32_t MAX_UTF8_CODE_POINT{0x10ffff}; + +/* Search generated ranges for specified character */ +inline int32_t binary_search_ranges(int32_t code) noexcept { + size_t r{prepare_table_ranges_size}; + // TODO code points must be uint32_t ?! + if ((uint32_t)code > MAX_UTF8_CODE_POINT) { + return 0; + } + + size_t l{0}; + while (l < r) { + // TODO verify this formula + size_t m{((l + r + 2) >> 2) << 1}; + if (prepare_table_ranges[m] <= code) { + l = m; + } else { + // TODO why `- 2` ? + r = m - 2; + } + } + + // prepare_table_ranges[l] - key + // prepare_table_ranges[l + 1] - value + int32_t t{prepare_table_ranges[l + 1]}; + if (t < 0) { + // TODO блять что это ?? + return code - prepare_table_ranges[l] + (~t); + } + if (t <= 0x10ffff) { + return t; + } + switch (t - 0x200000) { + case 0: + // TODO а это + return (code & -2); + case 1: + // TODO и это ещё + return (code | 1); + case 2: + // TODO ?? + return ((code - 1) | 1); + default: + // TODO тут делаем k2_exit ?? + assert(0); + exit(1); + } +} + +inline constexpr int32_t WHITESPACE{static_cast(' ')}; +inline constexpr int32_t PLUS{static_cast('+')}; + +// TODO naming +/* Prepares unicode 0-terminated string input for search, + leaving only digits and letters with diacritics. + Length of string can decrease. + Returns length of result. */ +inline void prepare_search_string(std::span& code_points) noexcept { + size_t output_size{}; + for (size_t i{}; i < code_points.size(); ++i) { + int32_t c{code_points[i]}; + int32_t new_c{}; + if (static_cast(c) < static_cast(TABLE_SIZE)) { + // Таблица каких-то преобразований для первых 1280 символов + new_c = static_cast(prepare_table[c]); + } else { + // Бинпоиск по мапе преобразований сразу целых range'ей + // prepare_table_ranges - мапа, закодированная в массиве, ага + new_c = binary_search_ranges(c); + } + // TODO replace with `new_c != 0` ? + if (new_c) { + // we forbid 2 whitespaces after each other and starting whitespace + if (new_c != WHITESPACE || (output_size > 0 && code_points[output_size - 1] != WHITESPACE)) { + code_points[output_size++] = new_c; + } + } + } + if (output_size > 0 && code_points[output_size - 1] == WHITESPACE) { + // throw out terminating whitespace + --output_size; + } + code_points[output_size] = 0; + code_points = code_points.subspan(output_size); +} + +// TODO naming +inline std::span prepare_str_unicode(std::span code_points) noexcept { + prepare_search_string(code_points); + code_points[code_points.size()] = WHITESPACE; + + // TODO init + std::span word_start_indices{TODO_string_buffer_pointer, TODO_size}; // indices of first char of every word in `code_points`. + size_t words_count{}; + size_t i{}; + // looking for the beginnings of the words + while (i < code_points.size()) { + word_start_indices[words_count++] = i; + while (i < code_points.size() && code_points[i] != ' ') { + i++; + } + i++; + } + word_start_indices = word_start_indices.subspan(words_count); + + auto word_less_cmp{[&code_points](size_t x, size_t y) noexcept -> bool { + while (code_points[x] != WHITESPACE && code_points[x] == code_points[y]) { + ++x; + ++y; + } + if (code_points[x] == WHITESPACE) { + return code_points[y] != WHITESPACE; + } + if (code_points[y] == WHITESPACE) { + return false; + } + return code_points[x] < code_points[y]; + }}; + + std::sort(word_start_indices.begin(), word_start_indices.end(), word_less_cmp); + + size_t uniq_words_count{}; + for (i = 0; i < words_count; i++) { + // drop duplicates + if (uniq_words_count == 0 || word_less_cmp(word_start_indices[uniq_words_count - 1], word_start_indices[i])) { + word_start_indices[uniq_words_count++] = word_start_indices[i]; + } else { + // TODO разобраться, зачем сохранять именно последний элемент из дубликатов? + word_start_indices[uniq_words_count - 1] = word_start_indices[i]; + } + } + + std::span result{TODO, TODO}; + size_t result_size{}; + // output words with '+' separator + for (i = 0; i < uniq_words_count; i++) { + size_t ind = word_start_indices[i]; + while (code_points[ind] != WHITESPACE) { + result[result_size++] = code_points[ind++]; + } + result[result_size++] = PLUS; + } + result[result_size++] = 0; + + // TODO assert !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + assert(result_size < MAX_NAME_SIZE); + result = result.subspan(result_size); + return result; +} + +// TODO naming +inline std::span clean_str_unicode(std::span code_points) noexcept { + // TODO prepare_str_unicode надо переписать для runtime-light + std::span prepared_code_points{prepare_str_unicode(code_points)}; + // put_string_utf8 можно использовать в runtime-light + std::span utf8_result{TODO, TODO}; + auto length{static_cast(put_string_utf8(prepared_code_points.data(), reinterpret_cast(utf8_result.data())))}; + // TODO assert !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + assert(length < utf8_result.size()); + utf8_result = utf8_result.subspan(length); + + size_t i{}; + size_t result_size{}; + while (i < utf8_result.size()) { + char* c{reinterpret_cast(&utf8_result[i])}; + bool skip{!std::strncmp(c, "amp+", 4) || !std::strncmp(c, "gt+", 3) || !std::strncmp(c, "lt+", 3) || !std::strncmp(c, "quot+", 5) || + !std::strncmp(c, "ft+", 3) || !std::strncmp(c, "feat+", 5) || + // скипаем год ? + (((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') && c[4] == '+') || + !std::strncmp(c, "092+", 4) || !std::strncmp(c, "33+", 3) || !std::strncmp(c, "34+", 3) || !std::strncmp(c, "36+", 3) || + !std::strncmp(c, "39+", 3) || !std::strncmp(c, "60+", 3) || !std::strncmp(c, "62+", 3) || !std::strncmp(c, "8232+", 5) || + !std::strncmp(c, "8233+", 5)}; + do { + // TODO почему это присваивание не внутри следующего if'a? + // Оно же потом будет перетёрто либо следующим присваиванием, либо в `*s = 0` + utf8_result[result_size] = utf8_result[i]; + if (!skip) { + ++result_size; + } + } while (i++ != '+'); + } + utf8_result[result_size] = static_cast(0); + + return utf8_result; +} + +inline std::span prepare_search_query_impl(std::span x) noexcept { + if (x.empty() || x.size() >= MAX_NAME_SIZE) { + return x; + } + + // TODO what is better, RuntimeContext.static_SB or StringLibContext.static_buf ? + RuntimeContext::get().static_SB.clean(); + RuntimeContext::get().static_SB.reserve(??? + ??? + ???); + + // TODO is int32_t canonical way of representing code points? + // May be replace with uint32_t? + std::span utf8_code_points{ + // TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO + MAX_CODE_POINTS_SPAN_SIZE, + }; + + // html_string_to_utf8 можно полностью использовать в runtime-light + html_string_to_utf8(reinterpret_cast(x.data()), utf8_code_points.data()); + return clean_str_unicode(utf8_code_points); +} + +} // namespace string_functions_impl_ + +inline string f$prepare_search_query(const string& query) noexcept { + std::span s{ + string_functions_impl_::prepare_search_query_impl({reinterpret_cast(query.c_str()), static_cast(query.size())})}; + return {reinterpret_cast(s.data()), static_cast(s.size())}; +} From 8a3942ae7663d3d20caf3608e57827880588e8e8 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Fri, 21 Nov 2025 16:48:07 +0300 Subject: [PATCH 03/28] clean up a bit --- runtime-light/stdlib/string/string_functions.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/runtime-light/stdlib/string/string_functions.h b/runtime-light/stdlib/string/string_functions.h index 7c6022ca93..9ae26a40fe 100644 --- a/runtime-light/stdlib/string/string_functions.h +++ b/runtime-light/stdlib/string/string_functions.h @@ -25,13 +25,13 @@ inline constexpr int32_t MAX_UTF8_CODE_POINT{0x10ffff}; /* Search generated ranges for specified character */ inline int32_t binary_search_ranges(int32_t code) noexcept { - size_t r{prepare_table_ranges_size}; // TODO code points must be uint32_t ?! if ((uint32_t)code > MAX_UTF8_CODE_POINT) { return 0; } size_t l{0}; + size_t r{prepare_table_ranges_size}; while (l < r) { // TODO verify this formula size_t m{((l + r + 2) >> 2) << 1}; @@ -119,7 +119,7 @@ inline std::span prepare_str_unicode(std::span code_points) no // looking for the beginnings of the words while (i < code_points.size()) { word_start_indices[words_count++] = i; - while (i < code_points.size() && code_points[i] != ' ') { + while (i < code_points.size() && code_points[i] != WHITESPACE) { i++; } i++; @@ -157,7 +157,7 @@ inline std::span prepare_str_unicode(std::span code_points) no size_t result_size{}; // output words with '+' separator for (i = 0; i < uniq_words_count; i++) { - size_t ind = word_start_indices[i]; + size_t ind{word_start_indices[i]}; while (code_points[ind] != WHITESPACE) { result[result_size++] = code_points[ind++]; } @@ -173,13 +173,11 @@ inline std::span prepare_str_unicode(std::span code_points) no // TODO naming inline std::span clean_str_unicode(std::span code_points) noexcept { - // TODO prepare_str_unicode надо переписать для runtime-light std::span prepared_code_points{prepare_str_unicode(code_points)}; // put_string_utf8 можно использовать в runtime-light std::span utf8_result{TODO, TODO}; auto length{static_cast(put_string_utf8(prepared_code_points.data(), reinterpret_cast(utf8_result.data())))}; - // TODO assert !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - assert(length < utf8_result.size()); + TODO assert(length < utf8_result.size()); utf8_result = utf8_result.subspan(length); size_t i{}; @@ -194,13 +192,11 @@ inline std::span clean_str_unicode(std::span code_poin !std::strncmp(c, "39+", 3) || !std::strncmp(c, "60+", 3) || !std::strncmp(c, "62+", 3) || !std::strncmp(c, "8232+", 5) || !std::strncmp(c, "8233+", 5)}; do { - // TODO почему это присваивание не внутри следующего if'a? - // Оно же потом будет перетёрто либо следующим присваиванием, либо в `*s = 0` - utf8_result[result_size] = utf8_result[i]; if (!skip) { + utf8_result[result_size] = utf8_result[i]; ++result_size; } - } while (i++ != '+'); + } while (utf8_result[i++] != static_cast('+')); } utf8_result[result_size] = static_cast(0); From b3dd2f1fbdc0eef39f7b5a7e9bab111398f634be Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Mon, 24 Nov 2025 18:56:26 +0300 Subject: [PATCH 04/28] RuntimeContext::get().static_SB used for prepare_search_query --- .../stdlib/string/string_functions.h | 51 ++++++++++--------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/runtime-light/stdlib/string/string_functions.h b/runtime-light/stdlib/string/string_functions.h index 9ae26a40fe..b746aa9f2f 100644 --- a/runtime-light/stdlib/string/string_functions.h +++ b/runtime-light/stdlib/string/string_functions.h @@ -13,20 +13,28 @@ #include "auto/common/unicode-utils-auto.h" #include "common/unicode/utf8-utils.h" #include "runtime-common/core/runtime-core.h" +#include "runtime-light/stdlib/diagnostics/logs.h" namespace string_functions_impl_ { // TODO May be better extract MAX_NAME_SIZE to utf8-utils.h instead of copy-pasting it ? -inline constexpr size_t MAX_NAME_SIZE{65536}; -inline constexpr size_t MAX_BYTES_SPAN_SIZE{MAX_NAME_SIZE * 4 + 4}; -inline constexpr size_t MAX_CODE_POINTS_SPAN_SIZE{MAX_NAME_SIZE + 4}; +inline constexpr size_t MAX_NAME_SIZE = 65536; +inline constexpr size_t MAX_NAME_INDICES_SIZE = MAX_NAME_SIZE + 4; +inline constexpr size_t MAX_NAME_CODE_POINTS_SIZE = MAX_NAME_SIZE + 4; +inline constexpr size_t MAX_NAME_BYTES_SIZE = MAX_NAME_SIZE * 4 + 4; + +// TODO как учитывать align ? +inline constexpr size_t SOURCE_CODE_POINTS_SPAN_BEGIN = 0; +inline constexpr size_t WORD_INDICES_SPAN_BEGIN = SOURCE_CODE_POINTS_SPAN_BEGIN + sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t RESULT_CODE_POINTS_SPAN_BEGIN = WORD_INDICES_SPAN_BEGIN + sizeof(size_t) * MAX_NAME_INDICES_SIZE; +inline constexpr size_t RESULT_BYTES_SPAN_BEGIN = RESULT_CODE_POINTS_SPAN_BEGIN + sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t RESULT_BYTES_SPAN_END = RESULT_BYTES_SPAN_BEGIN + sizeof(int32_t) * MAX_NAME_BYTES_SIZE; inline constexpr int32_t MAX_UTF8_CODE_POINT{0x10ffff}; /* Search generated ranges for specified character */ inline int32_t binary_search_ranges(int32_t code) noexcept { - // TODO code points must be uint32_t ?! - if ((uint32_t)code > MAX_UTF8_CODE_POINT) { + if (code > MAX_UTF8_CODE_POINT) { return 0; } @@ -64,9 +72,7 @@ inline int32_t binary_search_ranges(int32_t code) noexcept { // TODO ?? return ((code - 1) | 1); default: - // TODO тут делаем k2_exit ?? - assert(0); - exit(1); + k2::exit(1); } } @@ -112,8 +118,8 @@ inline std::span prepare_str_unicode(std::span code_points) no prepare_search_string(code_points); code_points[code_points.size()] = WHITESPACE; - // TODO init - std::span word_start_indices{TODO_string_buffer_pointer, TODO_size}; // indices of first char of every word in `code_points`. + auto* word_indices_begin{reinterpret_cast(RuntimeContext::get().static_SB.buffer()[WORD_INDICES_SPAN_BEGIN])}; + std::span word_start_indices{word_indices_begin, MAX_NAME_INDICES_SIZE}; // indices of first char of every word in `code_points`. size_t words_count{}; size_t i{}; // looking for the beginnings of the words @@ -148,12 +154,12 @@ inline std::span prepare_str_unicode(std::span code_points) no if (uniq_words_count == 0 || word_less_cmp(word_start_indices[uniq_words_count - 1], word_start_indices[i])) { word_start_indices[uniq_words_count++] = word_start_indices[i]; } else { - // TODO разобраться, зачем сохранять именно последний элемент из дубликатов? word_start_indices[uniq_words_count - 1] = word_start_indices[i]; } } - std::span result{TODO, TODO}; + auto* result_begin{reinterpret_cast(RuntimeContext::get().static_SB.buffer()[RESULT_CODE_POINTS_SPAN_BEGIN])}; + std::span result{result_begin, MAX_NAME_CODE_POINTS_SIZE}; size_t result_size{}; // output words with '+' separator for (i = 0; i < uniq_words_count; i++) { @@ -165,8 +171,7 @@ inline std::span prepare_str_unicode(std::span code_points) no } result[result_size++] = 0; - // TODO assert !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - assert(result_size < MAX_NAME_SIZE); + kphp::log::assertion(result_size < MAX_NAME_SIZE); result = result.subspan(result_size); return result; } @@ -174,10 +179,12 @@ inline std::span prepare_str_unicode(std::span code_points) no // TODO naming inline std::span clean_str_unicode(std::span code_points) noexcept { std::span prepared_code_points{prepare_str_unicode(code_points)}; + + auto* utf8_result_begin{reinterpret_cast(prepared_code_points.begin()[RESULT_CODE_POINTS_SPAN_BEGIN])}; + std::span utf8_result{utf8_result_begin, MAX_NAME_BYTES_SIZE}; // put_string_utf8 можно использовать в runtime-light - std::span utf8_result{TODO, TODO}; auto length{static_cast(put_string_utf8(prepared_code_points.data(), reinterpret_cast(utf8_result.data())))}; - TODO assert(length < utf8_result.size()); + kphp::log::assertion(length < utf8_result.size()); utf8_result = utf8_result.subspan(length); size_t i{}; @@ -208,15 +215,13 @@ inline std::span prepare_search_query_impl(std::span(RuntimeContext::get().static_SB.buffer())}; std::span utf8_code_points{ - // TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO - MAX_CODE_POINTS_SPAN_SIZE, + utf8_code_points_begin, + MAX_NAME_CODE_POINTS_SIZE, }; // html_string_to_utf8 можно полностью использовать в runtime-light From 232a5e89f250abf576aef2b1eef2432cf5cc69de Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Mon, 24 Nov 2025 19:05:03 +0300 Subject: [PATCH 05/28] fixes --- runtime-light/stdlib/string/string_functions.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/runtime-light/stdlib/string/string_functions.h b/runtime-light/stdlib/string/string_functions.h index b746aa9f2f..72a2cc52c2 100644 --- a/runtime-light/stdlib/string/string_functions.h +++ b/runtime-light/stdlib/string/string_functions.h @@ -13,6 +13,7 @@ #include "auto/common/unicode-utils-auto.h" #include "common/unicode/utf8-utils.h" #include "runtime-common/core/runtime-core.h" +#include "runtime-light/k2-platform/k2-api.h" #include "runtime-light/stdlib/diagnostics/logs.h" namespace string_functions_impl_ { @@ -28,7 +29,7 @@ inline constexpr size_t SOURCE_CODE_POINTS_SPAN_BEGIN = 0; inline constexpr size_t WORD_INDICES_SPAN_BEGIN = SOURCE_CODE_POINTS_SPAN_BEGIN + sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; inline constexpr size_t RESULT_CODE_POINTS_SPAN_BEGIN = WORD_INDICES_SPAN_BEGIN + sizeof(size_t) * MAX_NAME_INDICES_SIZE; inline constexpr size_t RESULT_BYTES_SPAN_BEGIN = RESULT_CODE_POINTS_SPAN_BEGIN + sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; -inline constexpr size_t RESULT_BYTES_SPAN_END = RESULT_BYTES_SPAN_BEGIN + sizeof(int32_t) * MAX_NAME_BYTES_SIZE; +inline constexpr size_t RESULT_BYTES_SPAN_END = RESULT_BYTES_SPAN_BEGIN + sizeof(std::byte) * MAX_NAME_BYTES_SIZE; inline constexpr int32_t MAX_UTF8_CODE_POINT{0x10ffff}; @@ -215,10 +216,11 @@ inline std::span prepare_search_query_impl(std::span(RuntimeContext::get().static_SB.buffer())}; + auto* utf8_code_points_begin{reinterpret_cast(runtime_context.static_SB.buffer())}; std::span utf8_code_points{ utf8_code_points_begin, MAX_NAME_CODE_POINTS_SIZE, From 5c60f22143918016ee4352ddb77a02e0cfadaaf3 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Mon, 24 Nov 2025 20:07:04 +0300 Subject: [PATCH 06/28] std::addressof --- runtime-light/stdlib/string/string_functions.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/string_functions.h b/runtime-light/stdlib/string/string_functions.h index 72a2cc52c2..69249431bc 100644 --- a/runtime-light/stdlib/string/string_functions.h +++ b/runtime-light/stdlib/string/string_functions.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "auto/common/unicode-utils-auto.h" @@ -191,7 +192,7 @@ inline std::span clean_str_unicode(std::span code_poin size_t i{}; size_t result_size{}; while (i < utf8_result.size()) { - char* c{reinterpret_cast(&utf8_result[i])}; + char* c{reinterpret_cast(std::addressof(utf8_result[i]))}; bool skip{!std::strncmp(c, "amp+", 4) || !std::strncmp(c, "gt+", 3) || !std::strncmp(c, "lt+", 3) || !std::strncmp(c, "quot+", 5) || !std::strncmp(c, "ft+", 3) || !std::strncmp(c, "feat+", 5) || // скипаем год ? From 7f857774996056cbc6875d24ec11c2cde5655c3b Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Mon, 24 Nov 2025 20:10:13 +0300 Subject: [PATCH 07/28] removed 2025 --- common/unicode/utf8-utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/unicode/utf8-utils.cpp b/common/unicode/utf8-utils.cpp index 2c67a5f5a9..16c6be0aaa 100644 --- a/common/unicode/utf8-utils.cpp +++ b/common/unicode/utf8-utils.cpp @@ -1,5 +1,5 @@ // Compiler for PHP (aka KPHP) -// Copyright (c) 2025 LLC «V Kontakte» +// Copyright (c) 2020 LLC «V Kontakte» // Distributed under the GPL v3 License, see LICENSE.notice.txt #include "common/unicode/utf8-utils.h" From 7204dfa0273711aab7011630d11fe6c06948b212 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Tue, 25 Nov 2025 13:57:11 +0300 Subject: [PATCH 08/28] f$prepare_search_query: split string_buf into 4 spans --- common/unicode/unicode-utils.cpp | 9 ++- common/unicode/unicode-utils.h | 6 ++ .../stdlib/string/string_functions.h | 69 +++++++++---------- 3 files changed, 41 insertions(+), 43 deletions(-) diff --git a/common/unicode/unicode-utils.cpp b/common/unicode/unicode-utils.cpp index 646997ab86..ebeb7b05ed 100644 --- a/common/unicode/unicode-utils.cpp +++ b/common/unicode/unicode-utils.cpp @@ -93,11 +93,10 @@ int prepare_search_string(int* input) { return output - input; } -#define MAX_NAME_SIZE 65536 -static char prep_buf[4 * MAX_NAME_SIZE + 4]; -int prep_ibuf[MAX_NAME_SIZE + 4]; -static int prep_ibuf_res[MAX_NAME_SIZE + 4]; -static int* words_ibuf[MAX_NAME_SIZE + 4]; +static char prep_buf[MAX_NAME_BYTES_SIZE]; +int prep_ibuf[MAX_NAME_CODE_POINTS_SIZE]; +static int prep_ibuf_res[MAX_NAME_CODE_POINTS_SIZE]; +static int* words_ibuf[MAX_NAME_CODE_POINTS_SIZE]; int stricmp_void(const void* x, const void* y) { const int* s1 = *(const int**)x; diff --git a/common/unicode/unicode-utils.h b/common/unicode/unicode-utils.h index fbbbe516b5..fb214488c0 100644 --- a/common/unicode/unicode-utils.h +++ b/common/unicode/unicode-utils.h @@ -4,6 +4,12 @@ #pragma once +#include + +inline constexpr size_t MAX_NAME_SIZE = 65536; +inline constexpr size_t MAX_NAME_BYTES_SIZE = 4 * MAX_NAME_SIZE + 4; +inline constexpr size_t MAX_NAME_CODE_POINTS_SIZE = MAX_NAME_SIZE + 4; + int unicode_toupper(int code); int unicode_tolower(int code); const char* clean_str(const char* x); diff --git a/runtime-light/stdlib/string/string_functions.h b/runtime-light/stdlib/string/string_functions.h index 69249431bc..5aef128983 100644 --- a/runtime-light/stdlib/string/string_functions.h +++ b/runtime-light/stdlib/string/string_functions.h @@ -12,25 +12,31 @@ #include #include "auto/common/unicode-utils-auto.h" +#include "common/unicode/unicode-utils.h" #include "common/unicode/utf8-utils.h" #include "runtime-common/core/runtime-core.h" +#include "runtime-common/stdlib/string/string-context.h" #include "runtime-light/k2-platform/k2-api.h" #include "runtime-light/stdlib/diagnostics/logs.h" namespace string_functions_impl_ { -// TODO May be better extract MAX_NAME_SIZE to utf8-utils.h instead of copy-pasting it ? -inline constexpr size_t MAX_NAME_SIZE = 65536; -inline constexpr size_t MAX_NAME_INDICES_SIZE = MAX_NAME_SIZE + 4; -inline constexpr size_t MAX_NAME_CODE_POINTS_SIZE = MAX_NAME_SIZE + 4; -inline constexpr size_t MAX_NAME_BYTES_SIZE = MAX_NAME_SIZE * 4 + 4; +// TODO naming +inline constexpr size_t __MAX_SIZEOF = std::max({sizeof(int32_t), sizeof(size_t), sizeof(std::byte)}); + +inline constexpr size_t __SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES = (sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE + __MAX_SIZEOF - 1) & ~(__MAX_SIZEOF - 1); +inline constexpr size_t __WORD_INDICES_SPAN_SIZE_IN_BYTES = (sizeof(size_t) * MAX_NAME_CODE_POINTS_SIZE + __MAX_SIZEOF - 1) & ~(__MAX_SIZEOF - 1); +inline constexpr size_t __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES = (sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE + __MAX_SIZEOF - 1) & ~(__MAX_SIZEOF - 1); +inline constexpr size_t __RESULT_BYTES_SPAN_SIZE_IN_BYTES = (sizeof(std::byte) * MAX_NAME_BYTES_SIZE + __MAX_SIZEOF - 1) & ~(__MAX_SIZEOF - 1); + +static_assert(__SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES + __WORD_INDICES_SPAN_SIZE_IN_BYTES + __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES + + __RESULT_BYTES_SPAN_SIZE_IN_BYTES < + StringLibContext::STATIC_BUFFER_LENGTH); -// TODO как учитывать align ? inline constexpr size_t SOURCE_CODE_POINTS_SPAN_BEGIN = 0; -inline constexpr size_t WORD_INDICES_SPAN_BEGIN = SOURCE_CODE_POINTS_SPAN_BEGIN + sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; -inline constexpr size_t RESULT_CODE_POINTS_SPAN_BEGIN = WORD_INDICES_SPAN_BEGIN + sizeof(size_t) * MAX_NAME_INDICES_SIZE; -inline constexpr size_t RESULT_BYTES_SPAN_BEGIN = RESULT_CODE_POINTS_SPAN_BEGIN + sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; -inline constexpr size_t RESULT_BYTES_SPAN_END = RESULT_BYTES_SPAN_BEGIN + sizeof(std::byte) * MAX_NAME_BYTES_SIZE; +inline constexpr size_t WORD_INDICES_SPAN_BEGIN = SOURCE_CODE_POINTS_SPAN_BEGIN + __SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES; +inline constexpr size_t RESULT_CODE_POINTS_SPAN_BEGIN = WORD_INDICES_SPAN_BEGIN + __WORD_INDICES_SPAN_SIZE_IN_BYTES; +inline constexpr size_t RESULT_BYTES_SPAN_BEGIN = RESULT_CODE_POINTS_SPAN_BEGIN + __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES; inline constexpr int32_t MAX_UTF8_CODE_POINT{0x10ffff}; @@ -43,12 +49,10 @@ inline int32_t binary_search_ranges(int32_t code) noexcept { size_t l{0}; size_t r{prepare_table_ranges_size}; while (l < r) { - // TODO verify this formula size_t m{((l + r + 2) >> 2) << 1}; if (prepare_table_ranges[m] <= code) { l = m; } else { - // TODO why `- 2` ? r = m - 2; } } @@ -57,7 +61,6 @@ inline int32_t binary_search_ranges(int32_t code) noexcept { // prepare_table_ranges[l + 1] - value int32_t t{prepare_table_ranges[l + 1]}; if (t < 0) { - // TODO блять что это ?? return code - prepare_table_ranges[l] + (~t); } if (t <= 0x10ffff) { @@ -65,13 +68,10 @@ inline int32_t binary_search_ranges(int32_t code) noexcept { } switch (t - 0x200000) { case 0: - // TODO а это return (code & -2); case 1: - // TODO и это ещё return (code | 1); case 2: - // TODO ?? return ((code - 1) | 1); default: k2::exit(1); @@ -92,15 +92,11 @@ inline void prepare_search_string(std::span& code_points) noexcept { int32_t c{code_points[i]}; int32_t new_c{}; if (static_cast(c) < static_cast(TABLE_SIZE)) { - // Таблица каких-то преобразований для первых 1280 символов new_c = static_cast(prepare_table[c]); } else { - // Бинпоиск по мапе преобразований сразу целых range'ей - // prepare_table_ranges - мапа, закодированная в массиве, ага new_c = binary_search_ranges(c); } - // TODO replace with `new_c != 0` ? - if (new_c) { + if (new_c != 0) { // we forbid 2 whitespaces after each other and starting whitespace if (new_c != WHITESPACE || (output_size > 0 && code_points[output_size - 1] != WHITESPACE)) { code_points[output_size++] = new_c; @@ -120,8 +116,9 @@ inline std::span prepare_str_unicode(std::span code_points) no prepare_search_string(code_points); code_points[code_points.size()] = WHITESPACE; - auto* word_indices_begin{reinterpret_cast(RuntimeContext::get().static_SB.buffer()[WORD_INDICES_SPAN_BEGIN])}; - std::span word_start_indices{word_indices_begin, MAX_NAME_INDICES_SIZE}; // indices of first char of every word in `code_points`. + auto& string_lib_ctx{StringLibContext::get()}; + auto* word_indices_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), WORD_INDICES_SPAN_BEGIN))}; + std::span word_start_indices{word_indices_begin, MAX_NAME_CODE_POINTS_SIZE}; // indices of first char of every word in `code_points`. size_t words_count{}; size_t i{}; // looking for the beginnings of the words @@ -160,7 +157,7 @@ inline std::span prepare_str_unicode(std::span code_points) no } } - auto* result_begin{reinterpret_cast(RuntimeContext::get().static_SB.buffer()[RESULT_CODE_POINTS_SPAN_BEGIN])}; + auto* result_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), RESULT_CODE_POINTS_SPAN_BEGIN))}; std::span result{result_begin, MAX_NAME_CODE_POINTS_SIZE}; size_t result_size{}; // output words with '+' separator @@ -179,12 +176,12 @@ inline std::span prepare_str_unicode(std::span code_points) no } // TODO naming -inline std::span clean_str_unicode(std::span code_points) noexcept { - std::span prepared_code_points{prepare_str_unicode(code_points)}; +inline std::span clean_str_unicode(std::span source_code_points) noexcept { + std::span prepared_code_points{prepare_str_unicode(source_code_points)}; - auto* utf8_result_begin{reinterpret_cast(prepared_code_points.begin()[RESULT_CODE_POINTS_SPAN_BEGIN])}; + auto& string_lib_ctx{StringLibContext::get()}; + auto* utf8_result_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), RESULT_BYTES_SPAN_BEGIN))}; std::span utf8_result{utf8_result_begin, MAX_NAME_BYTES_SIZE}; - // put_string_utf8 можно использовать в runtime-light auto length{static_cast(put_string_utf8(prepared_code_points.data(), reinterpret_cast(utf8_result.data())))}; kphp::log::assertion(length < utf8_result.size()); utf8_result = utf8_result.subspan(length); @@ -217,19 +214,15 @@ inline std::span prepare_search_query_impl(std::span(runtime_context.static_SB.buffer())}; - std::span utf8_code_points{ - utf8_code_points_begin, + auto& string_lib_ctx{StringLibContext::get()}; + auto* source_code_points_begin{reinterpret_cast((std::next(string_lib_ctx.static_buf.get(), SOURCE_CODE_POINTS_SPAN_BEGIN)))}; + std::span source_code_points{ + source_code_points_begin, MAX_NAME_CODE_POINTS_SIZE, }; - // html_string_to_utf8 можно полностью использовать в runtime-light - html_string_to_utf8(reinterpret_cast(x.data()), utf8_code_points.data()); - return clean_str_unicode(utf8_code_points); + html_string_to_utf8(reinterpret_cast(x.data()), source_code_points.data()); + return clean_str_unicode(source_code_points); } } // namespace string_functions_impl_ From 9b9070b1257aa0ab7c5842fd54d9f309896c4665 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Tue, 25 Nov 2025 17:08:03 +0300 Subject: [PATCH 09/28] small clean up --- runtime-light/stdlib/string/string_functions.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/runtime-light/stdlib/string/string_functions.h b/runtime-light/stdlib/string/string_functions.h index 5aef128983..a9df732bde 100644 --- a/runtime-light/stdlib/string/string_functions.h +++ b/runtime-light/stdlib/string/string_functions.h @@ -21,7 +21,6 @@ namespace string_functions_impl_ { -// TODO naming inline constexpr size_t __MAX_SIZEOF = std::max({sizeof(int32_t), sizeof(size_t), sizeof(std::byte)}); inline constexpr size_t __SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES = (sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE + __MAX_SIZEOF - 1) & ~(__MAX_SIZEOF - 1); @@ -81,7 +80,6 @@ inline int32_t binary_search_ranges(int32_t code) noexcept { inline constexpr int32_t WHITESPACE{static_cast(' ')}; inline constexpr int32_t PLUS{static_cast('+')}; -// TODO naming /* Prepares unicode 0-terminated string input for search, leaving only digits and letters with diacritics. Length of string can decrease. @@ -111,23 +109,23 @@ inline void prepare_search_string(std::span& code_points) noexcept { code_points = code_points.subspan(output_size); } -// TODO naming inline std::span prepare_str_unicode(std::span code_points) noexcept { prepare_search_string(code_points); code_points[code_points.size()] = WHITESPACE; auto& string_lib_ctx{StringLibContext::get()}; auto* word_indices_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), WORD_INDICES_SPAN_BEGIN))}; - std::span word_start_indices{word_indices_begin, MAX_NAME_CODE_POINTS_SIZE}; // indices of first char of every word in `code_points`. + // indices of first char of every word in `code_points`. + std::span word_start_indices{word_indices_begin, MAX_NAME_CODE_POINTS_SIZE}; size_t words_count{}; size_t i{}; // looking for the beginnings of the words while (i < code_points.size()) { word_start_indices[words_count++] = i; while (i < code_points.size() && code_points[i] != WHITESPACE) { - i++; + ++i; } - i++; + ++i; } word_start_indices = word_start_indices.subspan(words_count); @@ -148,7 +146,7 @@ inline std::span prepare_str_unicode(std::span code_points) no std::sort(word_start_indices.begin(), word_start_indices.end(), word_less_cmp); size_t uniq_words_count{}; - for (i = 0; i < words_count; i++) { + for (i = 0; i < words_count; ++i) { // drop duplicates if (uniq_words_count == 0 || word_less_cmp(word_start_indices[uniq_words_count - 1], word_start_indices[i])) { word_start_indices[uniq_words_count++] = word_start_indices[i]; @@ -161,7 +159,7 @@ inline std::span prepare_str_unicode(std::span code_points) no std::span result{result_begin, MAX_NAME_CODE_POINTS_SIZE}; size_t result_size{}; // output words with '+' separator - for (i = 0; i < uniq_words_count; i++) { + for (i = 0; i < uniq_words_count; ++i) { size_t ind{word_start_indices[i]}; while (code_points[ind] != WHITESPACE) { result[result_size++] = code_points[ind++]; @@ -175,7 +173,6 @@ inline std::span prepare_str_unicode(std::span code_points) no return result; } -// TODO naming inline std::span clean_str_unicode(std::span source_code_points) noexcept { std::span prepared_code_points{prepare_str_unicode(source_code_points)}; @@ -191,8 +188,7 @@ inline std::span clean_str_unicode(std::span source_co while (i < utf8_result.size()) { char* c{reinterpret_cast(std::addressof(utf8_result[i]))}; bool skip{!std::strncmp(c, "amp+", 4) || !std::strncmp(c, "gt+", 3) || !std::strncmp(c, "lt+", 3) || !std::strncmp(c, "quot+", 5) || - !std::strncmp(c, "ft+", 3) || !std::strncmp(c, "feat+", 5) || - // скипаем год ? + !std::strncmp(c, "ft+", 3) || !std::strncmp(c, "feat+", 5) | (((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') && c[4] == '+') || !std::strncmp(c, "092+", 4) || !std::strncmp(c, "33+", 3) || !std::strncmp(c, "34+", 3) || !std::strncmp(c, "36+", 3) || !std::strncmp(c, "39+", 3) || !std::strncmp(c, "60+", 3) || !std::strncmp(c, "62+", 3) || !std::strncmp(c, "8232+", 5) || From ac810b12ccac023d100068004d1fff0a22dfb653 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Tue, 25 Nov 2025 18:03:26 +0300 Subject: [PATCH 10/28] removed alignment of span sizes --- runtime-light/stdlib/string/string_functions.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/runtime-light/stdlib/string/string_functions.h b/runtime-light/stdlib/string/string_functions.h index a9df732bde..dfd2e10040 100644 --- a/runtime-light/stdlib/string/string_functions.h +++ b/runtime-light/stdlib/string/string_functions.h @@ -21,12 +21,10 @@ namespace string_functions_impl_ { -inline constexpr size_t __MAX_SIZEOF = std::max({sizeof(int32_t), sizeof(size_t), sizeof(std::byte)}); - -inline constexpr size_t __SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES = (sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE + __MAX_SIZEOF - 1) & ~(__MAX_SIZEOF - 1); -inline constexpr size_t __WORD_INDICES_SPAN_SIZE_IN_BYTES = (sizeof(size_t) * MAX_NAME_CODE_POINTS_SIZE + __MAX_SIZEOF - 1) & ~(__MAX_SIZEOF - 1); -inline constexpr size_t __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES = (sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE + __MAX_SIZEOF - 1) & ~(__MAX_SIZEOF - 1); -inline constexpr size_t __RESULT_BYTES_SPAN_SIZE_IN_BYTES = (sizeof(std::byte) * MAX_NAME_BYTES_SIZE + __MAX_SIZEOF - 1) & ~(__MAX_SIZEOF - 1); +inline constexpr size_t __SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t __WORD_INDICES_SPAN_SIZE_IN_BYTES = sizeof(size_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t __RESULT_BYTES_SPAN_SIZE_IN_BYTES = sizeof(std::byte) * MAX_NAME_BYTES_SIZE; static_assert(__SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES + __WORD_INDICES_SPAN_SIZE_IN_BYTES + __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES + __RESULT_BYTES_SPAN_SIZE_IN_BYTES < @@ -188,7 +186,7 @@ inline std::span clean_str_unicode(std::span source_co while (i < utf8_result.size()) { char* c{reinterpret_cast(std::addressof(utf8_result[i]))}; bool skip{!std::strncmp(c, "amp+", 4) || !std::strncmp(c, "gt+", 3) || !std::strncmp(c, "lt+", 3) || !std::strncmp(c, "quot+", 5) || - !std::strncmp(c, "ft+", 3) || !std::strncmp(c, "feat+", 5) | + !std::strncmp(c, "ft+", 3) || !std::strncmp(c, "feat+", 5) || (((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') && c[4] == '+') || !std::strncmp(c, "092+", 4) || !std::strncmp(c, "33+", 3) || !std::strncmp(c, "34+", 3) || !std::strncmp(c, "36+", 3) || !std::strncmp(c, "39+", 3) || !std::strncmp(c, "60+", 3) || !std::strncmp(c, "62+", 3) || !std::strncmp(c, "8232+", 5) || @@ -211,7 +209,7 @@ inline std::span prepare_search_query_impl(std::span((std::next(string_lib_ctx.static_buf.get(), SOURCE_CODE_POINTS_SPAN_BEGIN)))}; + auto* source_code_points_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), SOURCE_CODE_POINTS_SPAN_BEGIN))}; std::span source_code_points{ source_code_points_begin, MAX_NAME_CODE_POINTS_SIZE, From 2433a7190fdc25c57b776454df50a944a4b99d18 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Tue, 25 Nov 2025 18:12:42 +0300 Subject: [PATCH 11/28] removed attributes "@kphp-extern-func-info stub generation-required" from prepare_search_query builtin --- builtin-functions/kphp-light/stdlib/server-functions.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/builtin-functions/kphp-light/stdlib/server-functions.txt b/builtin-functions/kphp-light/stdlib/server-functions.txt index 3a348d9f4e..0e46d2a141 100644 --- a/builtin-functions/kphp-light/stdlib/server-functions.txt +++ b/builtin-functions/kphp-light/stdlib/server-functions.txt @@ -69,6 +69,8 @@ function setlocale ($category ::: int, $locale ::: string) ::: string | false; function memory_get_detailed_stats() ::: int[]; +function prepare_search_query ($query ::: string) ::: string; + function memory_get_total_usage() ::: int; function inet_pton ($address ::: string) ::: string | false; @@ -131,7 +133,3 @@ function flush() ::: void; define('PHP_QUERY_RFC1738', 1); define('PHP_QUERY_RFC3986', 2); - -/** @kphp-extern-func-info stub generation-required */ -function prepare_search_query ($query ::: string) ::: string; - From 82d5b64f0be10b144c6b91693bfaf87d48744cc0 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Wed, 26 Nov 2025 13:42:37 +0300 Subject: [PATCH 12/28] added test_prepare_search_query.py and one test case --- .../tests/prepare_search_query/__init__.py | 0 .../tests/prepare_search_query/data/example1 | 17 ++++++++++++++++ .../data/example1_prepared | 1 + .../tests/prepare_search_query/php/index.php | 7 +++++++ .../test_prepare_search_query.py | 20 +++++++++++++++++++ 5 files changed, 45 insertions(+) create mode 100644 tests/python/tests/prepare_search_query/__init__.py create mode 100644 tests/python/tests/prepare_search_query/data/example1 create mode 100644 tests/python/tests/prepare_search_query/data/example1_prepared create mode 100644 tests/python/tests/prepare_search_query/php/index.php create mode 100644 tests/python/tests/prepare_search_query/test_prepare_search_query.py diff --git a/tests/python/tests/prepare_search_query/__init__.py b/tests/python/tests/prepare_search_query/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/python/tests/prepare_search_query/data/example1 b/tests/python/tests/prepare_search_query/data/example1 new file mode 100644 index 0000000000..c59e725370 --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example1 @@ -0,0 +1,17 @@ + abacaba dAbAcAbA АбАсАБа йфяывчАСПМИРТОЬЛЩЗБДЮЭ. + К4ЙГЩ ЩГ рщг №кКЙ РШ зй021к 01293г0129 г + + ++_+ +_ +_ +__ ++_ Щ+!"_ №+!_" №+!"_ №+_ "Щ+_ "Щ + + йк й3 к2 + + + +7 88 76кн 68е79 н8г9 ншп + + test test + + test test +test +test +test test TeSt tEsT diff --git a/tests/python/tests/prepare_search_query/data/example1_prepared b/tests/python/tests/prepare_search_query/data/example1_prepared new file mode 100644 index 0000000000..391798b66d --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example1_prepared @@ -0,0 +1 @@ +01293г0129+68е79+76кн+88+abacaba+dabacaba+test+testtesttesttest+абасаба+г+зй021к+й3+йк+йфяывчаспмиртоьлщзбдюэ+к27+к4йгщ+ккй+н8г9+ншп+рш+рщг+щ+щг+ \ No newline at end of file diff --git a/tests/python/tests/prepare_search_query/php/index.php b/tests/python/tests/prepare_search_query/php/index.php new file mode 100644 index 0000000000..7fef3c802c --- /dev/null +++ b/tests/python/tests/prepare_search_query/php/index.php @@ -0,0 +1,7 @@ + Date: Wed, 26 Nov 2025 16:38:34 +0300 Subject: [PATCH 13/28] #include "auto/common/unicode-utils-auto.h" moved to .cpp --- .../stdlib/string/string_functions.cpp | 83 +++++++++++++++++++ .../stdlib/string/string_functions.h | 67 +-------------- 2 files changed, 87 insertions(+), 63 deletions(-) create mode 100644 runtime-light/stdlib/string/string_functions.cpp diff --git a/runtime-light/stdlib/string/string_functions.cpp b/runtime-light/stdlib/string/string_functions.cpp new file mode 100644 index 0000000000..0b8683b7f3 --- /dev/null +++ b/runtime-light/stdlib/string/string_functions.cpp @@ -0,0 +1,83 @@ +// Compiler for PHP (aka KPHP) +// Copyright (c) 2025 LLC «V Kontakte» +// Distributed under the GPL v3 License, see LICENSE.notice.txt + +#include "runtime-light/stdlib/string/string_functions.h" + +#include +#include +#include + +#include "auto/common/unicode-utils-auto.h" +#include "runtime-light/k2-platform/k2-api.h" + +namespace string_functions_impl_ { + +/* Search generated ranges for specified character */ +int32_t binary_search_ranges(int32_t code) noexcept { + if (code > MAX_UTF8_CODE_POINT) { + return 0; + } + + size_t l{0}; + size_t r{prepare_table_ranges_size}; + while (l < r) { + size_t m{((l + r + 2) >> 2) << 1}; + if (prepare_table_ranges[m] <= code) { + l = m; + } else { + r = m - 2; + } + } + + // prepare_table_ranges[l] - key + // prepare_table_ranges[l + 1] - value + int32_t t{prepare_table_ranges[l + 1]}; + if (t < 0) { + return code - prepare_table_ranges[l] + (~t); + } + if (t <= 0x10ffff) { + return t; + } + switch (t - 0x200000) { + case 0: + return (code & -2); + case 1: + return (code | 1); + case 2: + return ((code - 1) | 1); + default: + k2::exit(1); + } +} + +/* Prepares unicode 0-terminated string input for search, + leaving only digits and letters with diacritics. + Length of string can decrease. + Returns length of result. */ +void prepare_search_string(std::span& code_points) noexcept { + size_t output_size{}; + for (size_t i{}; i < code_points.size(); ++i) { + int32_t c{code_points[i]}; + int32_t new_c{}; + if (static_cast(c) < static_cast(TABLE_SIZE)) { + new_c = static_cast(prepare_table[c]); + } else { + new_c = binary_search_ranges(c); + } + if (new_c != 0) { + // we forbid 2 whitespaces after each other and starting whitespace + if (new_c != WHITESPACE || (output_size > 0 && code_points[output_size - 1] != WHITESPACE)) { + code_points[output_size++] = new_c; + } + } + } + if (output_size > 0 && code_points[output_size - 1] == WHITESPACE) { + // throw out terminating whitespace + --output_size; + } + code_points[output_size] = 0; + code_points = code_points.subspan(output_size); +} + +} // namespace string_functions_impl_ diff --git a/runtime-light/stdlib/string/string_functions.h b/runtime-light/stdlib/string/string_functions.h index dfd2e10040..c94d11d977 100644 --- a/runtime-light/stdlib/string/string_functions.h +++ b/runtime-light/stdlib/string/string_functions.h @@ -11,7 +11,6 @@ #include #include -#include "auto/common/unicode-utils-auto.h" #include "common/unicode/unicode-utils.h" #include "common/unicode/utf8-utils.h" #include "runtime-common/core/runtime-core.h" @@ -37,75 +36,17 @@ inline constexpr size_t RESULT_BYTES_SPAN_BEGIN = RESULT_CODE_POINTS_SPAN_BEGIN inline constexpr int32_t MAX_UTF8_CODE_POINT{0x10ffff}; -/* Search generated ranges for specified character */ -inline int32_t binary_search_ranges(int32_t code) noexcept { - if (code > MAX_UTF8_CODE_POINT) { - return 0; - } - - size_t l{0}; - size_t r{prepare_table_ranges_size}; - while (l < r) { - size_t m{((l + r + 2) >> 2) << 1}; - if (prepare_table_ranges[m] <= code) { - l = m; - } else { - r = m - 2; - } - } - - // prepare_table_ranges[l] - key - // prepare_table_ranges[l + 1] - value - int32_t t{prepare_table_ranges[l + 1]}; - if (t < 0) { - return code - prepare_table_ranges[l] + (~t); - } - if (t <= 0x10ffff) { - return t; - } - switch (t - 0x200000) { - case 0: - return (code & -2); - case 1: - return (code | 1); - case 2: - return ((code - 1) | 1); - default: - k2::exit(1); - } -} - inline constexpr int32_t WHITESPACE{static_cast(' ')}; inline constexpr int32_t PLUS{static_cast('+')}; +/* Search generated ranges for specified character */ +int32_t binary_search_ranges(int32_t code) noexcept; + /* Prepares unicode 0-terminated string input for search, leaving only digits and letters with diacritics. Length of string can decrease. Returns length of result. */ -inline void prepare_search_string(std::span& code_points) noexcept { - size_t output_size{}; - for (size_t i{}; i < code_points.size(); ++i) { - int32_t c{code_points[i]}; - int32_t new_c{}; - if (static_cast(c) < static_cast(TABLE_SIZE)) { - new_c = static_cast(prepare_table[c]); - } else { - new_c = binary_search_ranges(c); - } - if (new_c != 0) { - // we forbid 2 whitespaces after each other and starting whitespace - if (new_c != WHITESPACE || (output_size > 0 && code_points[output_size - 1] != WHITESPACE)) { - code_points[output_size++] = new_c; - } - } - } - if (output_size > 0 && code_points[output_size - 1] == WHITESPACE) { - // throw out terminating whitespace - --output_size; - } - code_points[output_size] = 0; - code_points = code_points.subspan(output_size); -} +void prepare_search_string(std::span& code_points) noexcept; inline std::span prepare_str_unicode(std::span code_points) noexcept { prepare_search_string(code_points); From fd49228d41d074fc92aeb0fb9242fd143a57a37a Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Thu, 27 Nov 2025 18:13:46 +0300 Subject: [PATCH 14/28] added test_prepare_search_query.py test --- runtime-light/stdlib/stdlib.cmake | 1 + ...ing_functions.cpp => string-functions.cpp} | 6 +- .../stdlib/string/string-functions.h | 159 ++++++++++++++++ .../stdlib/string/string_functions.h | 169 ------------------ .../php/data/component-config.yaml | 9 + .../tests/prepare_search_query/php/index.php | 6 +- .../test_prepare_search_query.py | 15 +- 7 files changed, 186 insertions(+), 179 deletions(-) rename runtime-light/stdlib/string/{string_functions.cpp => string-functions.cpp} (92%) delete mode 100644 runtime-light/stdlib/string/string_functions.h create mode 100644 tests/python/tests/prepare_search_query/php/data/component-config.yaml diff --git a/runtime-light/stdlib/stdlib.cmake b/runtime-light/stdlib/stdlib.cmake index 3831da6b07..47f3648188 100644 --- a/runtime-light/stdlib/stdlib.cmake +++ b/runtime-light/stdlib/stdlib.cmake @@ -39,6 +39,7 @@ prepend( string/regex-functions.cpp string/regex-state.cpp string/string-state.cpp + string/string-functions.cpp system/system-functions.cpp system/system-state.cpp time/date-interval.cpp diff --git a/runtime-light/stdlib/string/string_functions.cpp b/runtime-light/stdlib/string/string-functions.cpp similarity index 92% rename from runtime-light/stdlib/string/string_functions.cpp rename to runtime-light/stdlib/string/string-functions.cpp index 0b8683b7f3..958607b375 100644 --- a/runtime-light/stdlib/string/string_functions.cpp +++ b/runtime-light/stdlib/string/string-functions.cpp @@ -2,7 +2,7 @@ // Copyright (c) 2025 LLC «V Kontakte» // Distributed under the GPL v3 License, see LICENSE.notice.txt -#include "runtime-light/stdlib/string/string_functions.h" +#include "runtime-light/stdlib/string/string-functions.h" #include #include @@ -57,7 +57,7 @@ int32_t binary_search_ranges(int32_t code) noexcept { Returns length of result. */ void prepare_search_string(std::span& code_points) noexcept { size_t output_size{}; - for (size_t i{}; i < code_points.size(); ++i) { + for (size_t i{}; code_points[i] != 0; ++i) { int32_t c{code_points[i]}; int32_t new_c{}; if (static_cast(c) < static_cast(TABLE_SIZE)) { @@ -77,7 +77,7 @@ void prepare_search_string(std::span& code_points) noexcept { --output_size; } code_points[output_size] = 0; - code_points = code_points.subspan(output_size); + code_points = code_points.first(output_size); } } // namespace string_functions_impl_ diff --git a/runtime-light/stdlib/string/string-functions.h b/runtime-light/stdlib/string/string-functions.h index 28b7ad35c6..0f1480ab5d 100644 --- a/runtime-light/stdlib/string/string-functions.h +++ b/runtime-light/stdlib/string/string-functions.h @@ -4,10 +4,169 @@ #pragma once +#include +#include #include +#include +#include +#include +#include "common/unicode/unicode-utils.h" +#include "common/unicode/utf8-utils.h" #include "runtime-common/core/runtime-core.h" +#include "runtime-common/stdlib/string/string-context.h" #include "runtime-light/k2-platform/k2-api.h" +#include "runtime-light/stdlib/diagnostics/logs.h" + +namespace string_functions_impl_ { + +inline constexpr size_t __SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t __WORD_INDICES_SPAN_SIZE_IN_BYTES = sizeof(size_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t __RESULT_BYTES_SPAN_SIZE_IN_BYTES = sizeof(std::byte) * MAX_NAME_BYTES_SIZE; + +static_assert(__SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES + __WORD_INDICES_SPAN_SIZE_IN_BYTES + __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES + + __RESULT_BYTES_SPAN_SIZE_IN_BYTES < + StringLibContext::STATIC_BUFFER_LENGTH); + +inline constexpr size_t SOURCE_CODE_POINTS_SPAN_BEGIN = 0; +inline constexpr size_t WORD_INDICES_SPAN_BEGIN = SOURCE_CODE_POINTS_SPAN_BEGIN + __SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES; +inline constexpr size_t RESULT_CODE_POINTS_SPAN_BEGIN = WORD_INDICES_SPAN_BEGIN + __WORD_INDICES_SPAN_SIZE_IN_BYTES; +inline constexpr size_t RESULT_BYTES_SPAN_BEGIN = RESULT_CODE_POINTS_SPAN_BEGIN + __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES; + +inline constexpr int32_t MAX_UTF8_CODE_POINT{0x10ffff}; + +inline constexpr int32_t WHITESPACE{static_cast(' ')}; +inline constexpr int32_t PLUS{static_cast('+')}; + +/* Search generated ranges for specified character */ +int32_t binary_search_ranges(int32_t code) noexcept; + +/* Prepares unicode 0-terminated string input for search, + leaving only digits and letters with diacritics. + Length of string can decrease. + Returns length of result. */ +void prepare_search_string(std::span& code_points) noexcept; + +inline std::span prepare_str_unicode(std::span code_points) noexcept { + prepare_search_string(code_points); + code_points[code_points.size()] = WHITESPACE; + + auto& string_lib_ctx{StringLibContext::get()}; + auto* word_indices_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), WORD_INDICES_SPAN_BEGIN))}; + // indices of first char of every word in `code_points`. + std::span word_start_indices{word_indices_begin, MAX_NAME_CODE_POINTS_SIZE}; + size_t words_count{}; + size_t i{}; + // looking for the beginnings of the words + while (i < code_points.size()) { + word_start_indices[words_count++] = i; + while (i < code_points.size() && code_points[i] != WHITESPACE) { + ++i; + } + ++i; + } + word_start_indices = word_start_indices.first(words_count); + + auto word_less_cmp{[&code_points](size_t x, size_t y) noexcept -> bool { + while (code_points[x] != WHITESPACE && code_points[x] == code_points[y]) { + ++x; + ++y; + } + if (code_points[x] == WHITESPACE) { + return code_points[y] != WHITESPACE; + } + if (code_points[y] == WHITESPACE) { + return false; + } + return code_points[x] < code_points[y]; + }}; + + std::sort(word_start_indices.begin(), word_start_indices.end(), word_less_cmp); + + size_t uniq_words_count{}; + for (i = 0; i < words_count; ++i) { + // drop duplicates + if (uniq_words_count == 0 || word_less_cmp(word_start_indices[uniq_words_count - 1], word_start_indices[i])) { + word_start_indices[uniq_words_count++] = word_start_indices[i]; + } else { + word_start_indices[uniq_words_count - 1] = word_start_indices[i]; + } + } + + auto* result_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), RESULT_CODE_POINTS_SPAN_BEGIN))}; + std::span result{result_begin, MAX_NAME_CODE_POINTS_SIZE}; + size_t result_size{}; + // output words with '+' separator + for (i = 0; i < uniq_words_count; ++i) { + size_t ind{word_start_indices[i]}; + while (code_points[ind] != WHITESPACE) { + result[result_size++] = code_points[ind++]; + } + result[result_size++] = PLUS; + } + result[result_size++] = 0; + + kphp::log::assertion(result_size < MAX_NAME_SIZE); + result = result.first(result_size); + return result; +} + +inline std::span clean_str_unicode(std::span source_code_points) noexcept { + std::span prepared_code_points{prepare_str_unicode(source_code_points)}; + + auto& string_lib_ctx{StringLibContext::get()}; + auto* utf8_result_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), RESULT_BYTES_SPAN_BEGIN))}; + std::span utf8_result{utf8_result_begin, MAX_NAME_BYTES_SIZE}; + auto length{static_cast(put_string_utf8(prepared_code_points.data(), reinterpret_cast(utf8_result.data())))}; + kphp::log::assertion(length < utf8_result.size()); + utf8_result = utf8_result.first(length); + + size_t i{}; + size_t result_size{}; + while (i < utf8_result.size()) { + char* c{reinterpret_cast(std::addressof(utf8_result[i]))}; + bool skip{!std::strncmp(c, "amp+", 4) || !std::strncmp(c, "gt+", 3) || !std::strncmp(c, "lt+", 3) || !std::strncmp(c, "quot+", 5) || + !std::strncmp(c, "ft+", 3) || !std::strncmp(c, "feat+", 5) || + (((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') && c[4] == '+') || + !std::strncmp(c, "092+", 4) || !std::strncmp(c, "33+", 3) || !std::strncmp(c, "34+", 3) || !std::strncmp(c, "36+", 3) || + !std::strncmp(c, "39+", 3) || !std::strncmp(c, "60+", 3) || !std::strncmp(c, "62+", 3) || !std::strncmp(c, "8232+", 5) || + !std::strncmp(c, "8233+", 5)}; + do { + if (!skip) { + utf8_result[result_size] = utf8_result[i]; + ++result_size; + } + } while (utf8_result[i++] != static_cast('+')); + } + utf8_result[result_size] = static_cast(0); + + return utf8_result; +} + +inline std::span prepare_search_query_impl(std::span x) noexcept { + if (x.empty() || x.size() >= MAX_NAME_SIZE) { + return x; + } + + auto& string_lib_ctx{StringLibContext::get()}; + auto* source_code_points_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), SOURCE_CODE_POINTS_SPAN_BEGIN))}; + std::span source_code_points{ + source_code_points_begin, + MAX_NAME_CODE_POINTS_SIZE, + }; + + html_string_to_utf8(reinterpret_cast(x.data()), source_code_points.data()); + return clean_str_unicode(source_code_points); +} + +} // namespace string_functions_impl_ + +inline string f$prepare_search_query(const string& query) noexcept { + std::span s{ + string_functions_impl_::prepare_search_query_impl({reinterpret_cast(query.c_str()), static_cast(query.size())})}; + return {reinterpret_cast(s.data()), static_cast(s.size())}; +} inline Optional f$setlocale(int64_t category, const string& locale) noexcept { const int32_t i32category{static_cast(category)}; diff --git a/runtime-light/stdlib/string/string_functions.h b/runtime-light/stdlib/string/string_functions.h deleted file mode 100644 index c94d11d977..0000000000 --- a/runtime-light/stdlib/string/string_functions.h +++ /dev/null @@ -1,169 +0,0 @@ -// Compiler for PHP (aka KPHP) -// Copyright (c) 2025 LLC «V Kontakte» -// Distributed under the GPL v3 License, see LICENSE.notice.txt - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "common/unicode/unicode-utils.h" -#include "common/unicode/utf8-utils.h" -#include "runtime-common/core/runtime-core.h" -#include "runtime-common/stdlib/string/string-context.h" -#include "runtime-light/k2-platform/k2-api.h" -#include "runtime-light/stdlib/diagnostics/logs.h" - -namespace string_functions_impl_ { - -inline constexpr size_t __SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; -inline constexpr size_t __WORD_INDICES_SPAN_SIZE_IN_BYTES = sizeof(size_t) * MAX_NAME_CODE_POINTS_SIZE; -inline constexpr size_t __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; -inline constexpr size_t __RESULT_BYTES_SPAN_SIZE_IN_BYTES = sizeof(std::byte) * MAX_NAME_BYTES_SIZE; - -static_assert(__SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES + __WORD_INDICES_SPAN_SIZE_IN_BYTES + __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES + - __RESULT_BYTES_SPAN_SIZE_IN_BYTES < - StringLibContext::STATIC_BUFFER_LENGTH); - -inline constexpr size_t SOURCE_CODE_POINTS_SPAN_BEGIN = 0; -inline constexpr size_t WORD_INDICES_SPAN_BEGIN = SOURCE_CODE_POINTS_SPAN_BEGIN + __SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES; -inline constexpr size_t RESULT_CODE_POINTS_SPAN_BEGIN = WORD_INDICES_SPAN_BEGIN + __WORD_INDICES_SPAN_SIZE_IN_BYTES; -inline constexpr size_t RESULT_BYTES_SPAN_BEGIN = RESULT_CODE_POINTS_SPAN_BEGIN + __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES; - -inline constexpr int32_t MAX_UTF8_CODE_POINT{0x10ffff}; - -inline constexpr int32_t WHITESPACE{static_cast(' ')}; -inline constexpr int32_t PLUS{static_cast('+')}; - -/* Search generated ranges for specified character */ -int32_t binary_search_ranges(int32_t code) noexcept; - -/* Prepares unicode 0-terminated string input for search, - leaving only digits and letters with diacritics. - Length of string can decrease. - Returns length of result. */ -void prepare_search_string(std::span& code_points) noexcept; - -inline std::span prepare_str_unicode(std::span code_points) noexcept { - prepare_search_string(code_points); - code_points[code_points.size()] = WHITESPACE; - - auto& string_lib_ctx{StringLibContext::get()}; - auto* word_indices_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), WORD_INDICES_SPAN_BEGIN))}; - // indices of first char of every word in `code_points`. - std::span word_start_indices{word_indices_begin, MAX_NAME_CODE_POINTS_SIZE}; - size_t words_count{}; - size_t i{}; - // looking for the beginnings of the words - while (i < code_points.size()) { - word_start_indices[words_count++] = i; - while (i < code_points.size() && code_points[i] != WHITESPACE) { - ++i; - } - ++i; - } - word_start_indices = word_start_indices.subspan(words_count); - - auto word_less_cmp{[&code_points](size_t x, size_t y) noexcept -> bool { - while (code_points[x] != WHITESPACE && code_points[x] == code_points[y]) { - ++x; - ++y; - } - if (code_points[x] == WHITESPACE) { - return code_points[y] != WHITESPACE; - } - if (code_points[y] == WHITESPACE) { - return false; - } - return code_points[x] < code_points[y]; - }}; - - std::sort(word_start_indices.begin(), word_start_indices.end(), word_less_cmp); - - size_t uniq_words_count{}; - for (i = 0; i < words_count; ++i) { - // drop duplicates - if (uniq_words_count == 0 || word_less_cmp(word_start_indices[uniq_words_count - 1], word_start_indices[i])) { - word_start_indices[uniq_words_count++] = word_start_indices[i]; - } else { - word_start_indices[uniq_words_count - 1] = word_start_indices[i]; - } - } - - auto* result_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), RESULT_CODE_POINTS_SPAN_BEGIN))}; - std::span result{result_begin, MAX_NAME_CODE_POINTS_SIZE}; - size_t result_size{}; - // output words with '+' separator - for (i = 0; i < uniq_words_count; ++i) { - size_t ind{word_start_indices[i]}; - while (code_points[ind] != WHITESPACE) { - result[result_size++] = code_points[ind++]; - } - result[result_size++] = PLUS; - } - result[result_size++] = 0; - - kphp::log::assertion(result_size < MAX_NAME_SIZE); - result = result.subspan(result_size); - return result; -} - -inline std::span clean_str_unicode(std::span source_code_points) noexcept { - std::span prepared_code_points{prepare_str_unicode(source_code_points)}; - - auto& string_lib_ctx{StringLibContext::get()}; - auto* utf8_result_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), RESULT_BYTES_SPAN_BEGIN))}; - std::span utf8_result{utf8_result_begin, MAX_NAME_BYTES_SIZE}; - auto length{static_cast(put_string_utf8(prepared_code_points.data(), reinterpret_cast(utf8_result.data())))}; - kphp::log::assertion(length < utf8_result.size()); - utf8_result = utf8_result.subspan(length); - - size_t i{}; - size_t result_size{}; - while (i < utf8_result.size()) { - char* c{reinterpret_cast(std::addressof(utf8_result[i]))}; - bool skip{!std::strncmp(c, "amp+", 4) || !std::strncmp(c, "gt+", 3) || !std::strncmp(c, "lt+", 3) || !std::strncmp(c, "quot+", 5) || - !std::strncmp(c, "ft+", 3) || !std::strncmp(c, "feat+", 5) || - (((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') && c[4] == '+') || - !std::strncmp(c, "092+", 4) || !std::strncmp(c, "33+", 3) || !std::strncmp(c, "34+", 3) || !std::strncmp(c, "36+", 3) || - !std::strncmp(c, "39+", 3) || !std::strncmp(c, "60+", 3) || !std::strncmp(c, "62+", 3) || !std::strncmp(c, "8232+", 5) || - !std::strncmp(c, "8233+", 5)}; - do { - if (!skip) { - utf8_result[result_size] = utf8_result[i]; - ++result_size; - } - } while (utf8_result[i++] != static_cast('+')); - } - utf8_result[result_size] = static_cast(0); - - return utf8_result; -} - -inline std::span prepare_search_query_impl(std::span x) noexcept { - if (x.empty() || x.size() >= MAX_NAME_SIZE) { - return x; - } - - auto& string_lib_ctx{StringLibContext::get()}; - auto* source_code_points_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), SOURCE_CODE_POINTS_SPAN_BEGIN))}; - std::span source_code_points{ - source_code_points_begin, - MAX_NAME_CODE_POINTS_SIZE, - }; - - html_string_to_utf8(reinterpret_cast(x.data()), source_code_points.data()); - return clean_str_unicode(source_code_points); -} - -} // namespace string_functions_impl_ - -inline string f$prepare_search_query(const string& query) noexcept { - std::span s{ - string_functions_impl_::prepare_search_query_impl({reinterpret_cast(query.c_str()), static_cast(query.size())})}; - return {reinterpret_cast(s.data()), static_cast(s.size())}; -} diff --git a/tests/python/tests/prepare_search_query/php/data/component-config.yaml b/tests/python/tests/prepare_search_query/php/data/component-config.yaml new file mode 100644 index 0000000000..5683788426 --- /dev/null +++ b/tests/python/tests/prepare_search_query/php/data/component-config.yaml @@ -0,0 +1,9 @@ +entry: script +components: + script: + image: KPHP + scope: Request + args: + ini hello: "world" + runtime-config: ${RUNTIME_CONFIG_PATH} + links: {} diff --git a/tests/python/tests/prepare_search_query/php/index.php b/tests/python/tests/prepare_search_query/php/index.php index 7fef3c802c..67bc296318 100644 --- a/tests/python/tests/prepare_search_query/php/index.php +++ b/tests/python/tests/prepare_search_query/php/index.php @@ -1,7 +1,11 @@ $res); + echo json_encode($resp); } main(); diff --git a/tests/python/tests/prepare_search_query/test_prepare_search_query.py b/tests/python/tests/prepare_search_query/test_prepare_search_query.py index 1f989ba728..f210b2330d 100644 --- a/tests/python/tests/prepare_search_query/test_prepare_search_query.py +++ b/tests/python/tests/prepare_search_query/test_prepare_search_query.py @@ -1,20 +1,23 @@ +import json import os from python.lib.testcase import WebServerAutoTestCase -directory_path = "data" +directory_path = "kphp/tests/python/tests/prepare_search_query/data" prepared_suffix = "_prepared" -class TestShutdownFunctions(WebServerAutoTestCase): +class TestPrepareSearchQuery(WebServerAutoTestCase): def test_prepare_search_query(self): for file in os.listdir(directory_path): if not os.path.basename(file).endswith(prepared_suffix): - with open(file, "r", encoding="utf-8") as query_file: - with open(file + prepared_suffix, "r", encoding="utf-8") as prepared_query_file: + with open(os.path.join(directory_path, file), "r") as query_file: + with open(os.path.join(directory_path, file + prepared_suffix), "r") as prepared_query_file: query = query_file.read() expected_prepared_query = prepared_query_file.read() - resp = self.web_server.http_post(query) + d = {"post": query} + resp = self.web_server.http_post(json=d) self.assertEqual(resp.status_code, 200) - self.assertEqual(resp.text, expected_prepared_query) + result = json.loads(resp.text)["POST_BODY"] + self.assertEqual(result, expected_prepared_query) From 877b96ca21d74380cfcc077ed957e320cc061434 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Thu, 27 Nov 2025 19:27:09 +0300 Subject: [PATCH 15/28] json data replaced with raw binary data --- .../python/tests/prepare_search_query/data/example1_prepared | 2 +- tests/python/tests/prepare_search_query/php/index.php | 3 +-- .../tests/prepare_search_query/test_prepare_search_query.py | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/python/tests/prepare_search_query/data/example1_prepared b/tests/python/tests/prepare_search_query/data/example1_prepared index 391798b66d..b97f42cdc2 100644 --- a/tests/python/tests/prepare_search_query/data/example1_prepared +++ b/tests/python/tests/prepare_search_query/data/example1_prepared @@ -1 +1 @@ -01293г0129+68е79+76кн+88+abacaba+dabacaba+test+testtesttesttest+абасаба+г+зй021к+й3+йк+йфяывчаспмиртоьлщзбдюэ+к27+к4йгщ+ккй+н8г9+ншп+рш+рщг+щ+щг+ \ No newline at end of file +01293г0129+68е79+7+76кн+88+abacaba+dabacaba+test+абасаба+г+зй021к+й3+йк+йфяывчаспмиртоьлщзбдюэ+к2+к4йгщ+ккй+н8г9+ншп+рш+рщг+щ+щг+ \ No newline at end of file diff --git a/tests/python/tests/prepare_search_query/php/index.php b/tests/python/tests/prepare_search_query/php/index.php index 67bc296318..7047b41c85 100644 --- a/tests/python/tests/prepare_search_query/php/index.php +++ b/tests/python/tests/prepare_search_query/php/index.php @@ -2,8 +2,7 @@ function main() { $raw_post_data = file_get_contents('php://input'); - $post_data = json_decode($raw_post_data, $associative=true); - $res = prepare_search_query($post_data["post"]); + $res = prepare_search_query($raw_post_data); $resp = array("POST_BODY" => $res); echo json_encode($resp); } diff --git a/tests/python/tests/prepare_search_query/test_prepare_search_query.py b/tests/python/tests/prepare_search_query/test_prepare_search_query.py index f210b2330d..6d3634ddd2 100644 --- a/tests/python/tests/prepare_search_query/test_prepare_search_query.py +++ b/tests/python/tests/prepare_search_query/test_prepare_search_query.py @@ -15,8 +15,8 @@ def test_prepare_search_query(self): query = query_file.read() expected_prepared_query = prepared_query_file.read() - d = {"post": query} - resp = self.web_server.http_post(json=d) + headers = {"Content-Type": "text/plain; charset=utf-8"} + resp = self.web_server.http_post(headers=headers, data=query.encode("utf-8")) self.assertEqual(resp.status_code, 200) result = json.loads(resp.text)["POST_BODY"] From a2ef9cf783af8104f4f484a3dacc4ef270e3fc5e Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Thu, 27 Nov 2025 19:58:27 +0300 Subject: [PATCH 16/28] added utf-8 examples for prepare_search_query test --- .../tests/prepare_search_query/data/example10 | Bin 0 -> 1408 bytes .../prepare_search_query/data/example10_prepared | 1 + .../tests/prepare_search_query/data/example2 | 1 + .../prepare_search_query/data/example2_prepared | 1 + .../tests/prepare_search_query/data/example3 | 1 + .../prepare_search_query/data/example3_prepared | 1 + .../tests/prepare_search_query/data/example4 | 1 + .../prepare_search_query/data/example4_prepared | 1 + .../tests/prepare_search_query/data/example5 | Bin 0 -> 31 bytes .../prepare_search_query/data/example5_prepared | 1 + .../tests/prepare_search_query/data/example6 | 1 + .../prepare_search_query/data/example6_prepared | 1 + .../tests/prepare_search_query/data/example7 | 1 + .../prepare_search_query/data/example7_prepared | 1 + .../tests/prepare_search_query/data/example8 | Bin 0 -> 148 bytes .../prepare_search_query/data/example8_prepared | 1 + .../tests/prepare_search_query/data/example9 | 2 ++ .../prepare_search_query/data/example9_prepared | 1 + 18 files changed, 16 insertions(+) create mode 100644 tests/python/tests/prepare_search_query/data/example10 create mode 100644 tests/python/tests/prepare_search_query/data/example10_prepared create mode 100644 tests/python/tests/prepare_search_query/data/example2 create mode 100644 tests/python/tests/prepare_search_query/data/example2_prepared create mode 100644 tests/python/tests/prepare_search_query/data/example3 create mode 100644 tests/python/tests/prepare_search_query/data/example3_prepared create mode 100644 tests/python/tests/prepare_search_query/data/example4 create mode 100644 tests/python/tests/prepare_search_query/data/example4_prepared create mode 100644 tests/python/tests/prepare_search_query/data/example5 create mode 100644 tests/python/tests/prepare_search_query/data/example5_prepared create mode 100644 tests/python/tests/prepare_search_query/data/example6 create mode 100644 tests/python/tests/prepare_search_query/data/example6_prepared create mode 100644 tests/python/tests/prepare_search_query/data/example7 create mode 100644 tests/python/tests/prepare_search_query/data/example7_prepared create mode 100644 tests/python/tests/prepare_search_query/data/example8 create mode 100644 tests/python/tests/prepare_search_query/data/example8_prepared create mode 100644 tests/python/tests/prepare_search_query/data/example9 create mode 100644 tests/python/tests/prepare_search_query/data/example9_prepared diff --git a/tests/python/tests/prepare_search_query/data/example10 b/tests/python/tests/prepare_search_query/data/example10 new file mode 100644 index 0000000000000000000000000000000000000000..a964b269f1856673ec866853cf90ba623885d3af GIT binary patch literal 1408 zcmV-`1%LWSKSSuXqRyB?3|Qujs8ewhNk-I`FzSo96;zSI>q+;^AorZ6J zc~WyhZxI7zKn5_|kHv%JqK+U_EoUOxuIZ4l;MM#sHt%786-c^B%v zf-7OwhTW0HtS2vWN=ZKz1XE-Na#Pl_9xH2RK3fQIEIHqfJX?7!WNKd(Y1gE~fdWEz zbaFZq@v*m^HCu1nqyFmx#lfQIm;oTnZ@ViquWa9z`fZ^W+wG{}MLuBhgSuN&Ef=#q^h zNhC$elw?Zug^IOJVp|m9q=7CeZs)d*$E9~nC0ShtX>-Y*W!0TGFG@}q=DeQMo-G41 zO={(;xG2w`&aoudtyW}RI$CVyfV}ahjhkIv)UkLNMJ+(^u$!dRw|vQmD&CtaKPuLg zbYJ4Qn|whH$%|IS77d;r9>xlatiK?jzA|HP#{lPMF40AJxtb)MiMj1j6w+Ey_Dj& zr~=!n0Y>SYm+zRm*q>^1P8leF1q#rMK20S#CFY-)Wo}F5udW)^h-FGAc^zlNs5#cU zXBcKXdSt@3Rn&*#s)9B>@Eupe~po1Pd!%&X0xQ546zOg+<) z&zW}u=Z&~QYibNI+@MbhJMD(KGgA-kiK8A1E)_;;IZQ4KFJLWY2_5f~ry@-)-KNZd z^1Yy*R66sCi!Gw__6mF1Y45O68unS>rd zGc`#7?TeQ+TjGhX97OYuzp5cND>Fb_9NxVeAbe;L;Jt@i$BRJEx9_pP1OVcPg7c}M zhB#d4zOUu7hc_tzHQ=(VYz89Xy^(id6vD4+Mj%iCFB5MAVA7R;eH1t_>ztlS$%EUM zbSWTVWf~n>D@zCyQb{BTcy|rQtO7i7ck{2eo?=fq#J_w>9Pg=tT{#UuLRL*NdRtHQ zfTf2c;H9I)jRnhrM&YcXGHQL;gvGVWxP0EAKL*dPNmm3(6-PrULw`dEVBw5|RbI%J zE=ph%6=w`WDr$U6bU+nKMt)`wEd^)dii!w=(OE5ne-qTQa- zq!?-CjfK{TDQeiTYSXp{A8_BV2QSf#9NDt!k*jA1(3sD?5qxV#E=%dTw;VUyh3k~R z3{lIWTOM`7um|v?h@e?v0D2874O&P^csU4j9e5{GOT@WW0_Cca#GKuj>4lf{hPkRU zZCJ*$8B|Eau@Q9&BN}@+@1Bt!Iy+e%Hpsu(q)P|igd*9$O%F_1E%BF?ltmv-*|^$| z-Ht>Od1}YFVR9ZpHYDJ!pc3<^ldLF3c~S4alkSq0=(?FS%eyajPa4UtTX0V?E6;>K z^RSkjQ%8KzmE5XhO=)*gVoX-ek~MDQl9~)EVj>I@bLOyt6CW+;poMU2Y%1lCx5$=X zKzwfMw1zeeP$6g2vEE|_`% literal 0 HcmV?d00001 diff --git a/tests/python/tests/prepare_search_query/data/example10_prepared b/tests/python/tests/prepare_search_query/data/example10_prepared new file mode 100644 index 0000000000..01668895ae --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example10_prepared @@ -0,0 +1 @@ +0wp+0ۏń䢎+2mj媸+4ȁ뮨戯+7+9ߎ+a+aնݑŭ+a꾙n+a􇁃6v+b+bwtr+c赢θb+d+dj+e+eʔdj􅊵mb+fǽkʀ+g+hq+hyb酢+i+ifԗ0ꋷ+isɞe+iւz+iפá+jn+kf+ne󷆤p7㮐ӷae+n綍ǥwl+o+oye+p+rji+rs+s+sd+sr+t+u+w+x挨sq+y+yrsao+yy+zl值񦍛+ņ+ȯ+ɇ+ɋf2ʄa+ρ+ϟα+є+ӈoį+ӡ᫉v+աx+׭vd+ل蒍+ٮ駗+ޛ+ᤁ+ᵯǘ+嫢򙫓+漞ӟ+꽂+뙚դb򺝆o++d++𰛤է+񱷝5+󿀆𥜧+ \ No newline at end of file diff --git a/tests/python/tests/prepare_search_query/data/example2 b/tests/python/tests/prepare_search_query/data/example2 new file mode 100644 index 0000000000..24de910c13 --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example2 @@ -0,0 +1 @@ +Y \ No newline at end of file diff --git a/tests/python/tests/prepare_search_query/data/example2_prepared b/tests/python/tests/prepare_search_query/data/example2_prepared new file mode 100644 index 0000000000..6e4f379512 --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example2_prepared @@ -0,0 +1 @@ +y+ \ No newline at end of file diff --git a/tests/python/tests/prepare_search_query/data/example3 b/tests/python/tests/prepare_search_query/data/example3 new file mode 100644 index 0000000000..63d8dbd40c --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example3 @@ -0,0 +1 @@ +b \ No newline at end of file diff --git a/tests/python/tests/prepare_search_query/data/example3_prepared b/tests/python/tests/prepare_search_query/data/example3_prepared new file mode 100644 index 0000000000..950b67b138 --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example3_prepared @@ -0,0 +1 @@ +b+ \ No newline at end of file diff --git a/tests/python/tests/prepare_search_query/data/example4 b/tests/python/tests/prepare_search_query/data/example4 new file mode 100644 index 0000000000..01e6c6a5f9 --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example4 @@ -0,0 +1 @@ +⚞žPuRZC[ \ No newline at end of file diff --git a/tests/python/tests/prepare_search_query/data/example4_prepared b/tests/python/tests/prepare_search_query/data/example4_prepared new file mode 100644 index 0000000000..2dfb253acd --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example4_prepared @@ -0,0 +1 @@ +urzc+žp+ \ No newline at end of file diff --git a/tests/python/tests/prepare_search_query/data/example5 b/tests/python/tests/prepare_search_query/data/example5 new file mode 100644 index 0000000000000000000000000000000000000000..8dd45ae465adf49cc15fa834161bc0d1b0b85159 GIT binary patch literal 31 ncmcDx+FJK;+A?v63vFMT_jftE`k8AbTQPjtwP;~Z&K^1Uc>4mE_GvK3($%r&r(X0?7 zV`@n-FBQ&)(6>ry;;gaar;*^ZsvgvgVQurCppV?8Q_hG-%(NOqR0a|?(W>Q~y3Dz6 C(@XmR literal 0 HcmV?d00001 diff --git a/tests/python/tests/prepare_search_query/data/example8_prepared b/tests/python/tests/prepare_search_query/data/example8_prepared new file mode 100644 index 0000000000..3dbe59fb32 --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example8_prepared @@ -0,0 +1 @@ +exk+n2+բq+ֆо+즕tŏ+ \ No newline at end of file diff --git a/tests/python/tests/prepare_search_query/data/example9 b/tests/python/tests/prepare_search_query/data/example9 new file mode 100644 index 0000000000..4690309332 --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example9 @@ -0,0 +1,2 @@ +l~' Date: Thu, 27 Nov 2025 20:25:24 +0300 Subject: [PATCH 17/28] added newline to test files --- tests/python/tests/prepare_search_query/data/example10_prepared | 2 +- tests/python/tests/prepare_search_query/data/example1_prepared | 2 +- tests/python/tests/prepare_search_query/data/example2 | 2 +- tests/python/tests/prepare_search_query/data/example2_prepared | 2 +- tests/python/tests/prepare_search_query/data/example3 | 2 +- tests/python/tests/prepare_search_query/data/example3_prepared | 2 +- tests/python/tests/prepare_search_query/data/example4 | 2 +- tests/python/tests/prepare_search_query/data/example4_prepared | 2 +- tests/python/tests/prepare_search_query/data/example5_prepared | 2 +- tests/python/tests/prepare_search_query/data/example6 | 2 +- tests/python/tests/prepare_search_query/data/example6_prepared | 2 +- tests/python/tests/prepare_search_query/data/example7 | 2 +- tests/python/tests/prepare_search_query/data/example7_prepared | 2 +- tests/python/tests/prepare_search_query/data/example8_prepared | 2 +- tests/python/tests/prepare_search_query/data/example9 | 2 +- tests/python/tests/prepare_search_query/data/example9_prepared | 2 +- .../tests/prepare_search_query/test_prepare_search_query.py | 2 ++ 17 files changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/python/tests/prepare_search_query/data/example10_prepared b/tests/python/tests/prepare_search_query/data/example10_prepared index 01668895ae..7311741531 100644 --- a/tests/python/tests/prepare_search_query/data/example10_prepared +++ b/tests/python/tests/prepare_search_query/data/example10_prepared @@ -1 +1 @@ -0wp+0ۏń䢎+2mj媸+4ȁ뮨戯+7+9ߎ+a+aնݑŭ+a꾙n+a􇁃6v+b+bwtr+c赢θb+d+dj+e+eʔdj􅊵mb+fǽkʀ+g+hq+hyb酢+i+ifԗ0ꋷ+isɞe+iւz+iפá+jn+kf+ne󷆤p7㮐ӷae+n綍ǥwl+o+oye+p+rji+rs+s+sd+sr+t+u+w+x挨sq+y+yrsao+yy+zl值񦍛+ņ+ȯ+ɇ+ɋf2ʄa+ρ+ϟα+є+ӈoį+ӡ᫉v+աx+׭vd+ل蒍+ٮ駗+ޛ+ᤁ+ᵯǘ+嫢򙫓+漞ӟ+꽂+뙚դb򺝆o++d++𰛤է+񱷝5+󿀆𥜧+ \ No newline at end of file +0wp+0ۏń䢎+2mj媸+4ȁ뮨戯+7+9ߎ+a+aնݑŭ+a꾙n+a􇁃6v+b+bwtr+c赢θb+d+dj+e+eʔdj􅊵mb+fǽkʀ+g+hq+hyb酢+i+ifԗ0ꋷ+isɞe+iւz+iפá+jn+kf+ne󷆤p7㮐ӷae+n綍ǥwl+o+oye+p+rji+rs+s+sd+sr+t+u+w+x挨sq+y+yrsao+yy+zl值񦍛+ņ+ȯ+ɇ+ɋf2ʄa+ρ+ϟα+є+ӈoį+ӡ᫉v+աx+׭vd+ل蒍+ٮ駗+ޛ+ᤁ+ᵯǘ+嫢򙫓+漞ӟ+꽂+뙚դb򺝆o++d++𰛤է+񱷝5+󿀆𥜧+ diff --git a/tests/python/tests/prepare_search_query/data/example1_prepared b/tests/python/tests/prepare_search_query/data/example1_prepared index b97f42cdc2..1dc52ae97e 100644 --- a/tests/python/tests/prepare_search_query/data/example1_prepared +++ b/tests/python/tests/prepare_search_query/data/example1_prepared @@ -1 +1 @@ -01293г0129+68е79+7+76кн+88+abacaba+dabacaba+test+абасаба+г+зй021к+й3+йк+йфяывчаспмиртоьлщзбдюэ+к2+к4йгщ+ккй+н8г9+ншп+рш+рщг+щ+щг+ \ No newline at end of file +01293г0129+68е79+7+76кн+88+abacaba+dabacaba+test+абасаба+г+зй021к+й3+йк+йфяывчаспмиртоьлщзбдюэ+к2+к4йгщ+ккй+н8г9+ншп+рш+рщг+щ+щг+ diff --git a/tests/python/tests/prepare_search_query/data/example2 b/tests/python/tests/prepare_search_query/data/example2 index 24de910c13..9bda8c35c2 100644 --- a/tests/python/tests/prepare_search_query/data/example2 +++ b/tests/python/tests/prepare_search_query/data/example2 @@ -1 +1 @@ -Y \ No newline at end of file +Y diff --git a/tests/python/tests/prepare_search_query/data/example2_prepared b/tests/python/tests/prepare_search_query/data/example2_prepared index 6e4f379512..469527404f 100644 --- a/tests/python/tests/prepare_search_query/data/example2_prepared +++ b/tests/python/tests/prepare_search_query/data/example2_prepared @@ -1 +1 @@ -y+ \ No newline at end of file +y+ diff --git a/tests/python/tests/prepare_search_query/data/example3 b/tests/python/tests/prepare_search_query/data/example3 index 63d8dbd40c..6178079822 100644 --- a/tests/python/tests/prepare_search_query/data/example3 +++ b/tests/python/tests/prepare_search_query/data/example3 @@ -1 +1 @@ -b \ No newline at end of file +b diff --git a/tests/python/tests/prepare_search_query/data/example3_prepared b/tests/python/tests/prepare_search_query/data/example3_prepared index 950b67b138..071dc66971 100644 --- a/tests/python/tests/prepare_search_query/data/example3_prepared +++ b/tests/python/tests/prepare_search_query/data/example3_prepared @@ -1 +1 @@ -b+ \ No newline at end of file +b+ diff --git a/tests/python/tests/prepare_search_query/data/example4 b/tests/python/tests/prepare_search_query/data/example4 index 01e6c6a5f9..36774f9fe7 100644 --- a/tests/python/tests/prepare_search_query/data/example4 +++ b/tests/python/tests/prepare_search_query/data/example4 @@ -1 +1 @@ -⚞žPuRZC[ \ No newline at end of file +⚞žPuRZC[ diff --git a/tests/python/tests/prepare_search_query/data/example4_prepared b/tests/python/tests/prepare_search_query/data/example4_prepared index 2dfb253acd..f31ecc781f 100644 --- a/tests/python/tests/prepare_search_query/data/example4_prepared +++ b/tests/python/tests/prepare_search_query/data/example4_prepared @@ -1 +1 @@ -urzc+žp+ \ No newline at end of file +urzc+žp+ diff --git a/tests/python/tests/prepare_search_query/data/example5_prepared b/tests/python/tests/prepare_search_query/data/example5_prepared index 199b158083..2daa175e5d 100644 --- a/tests/python/tests/prepare_search_query/data/example5_prepared +++ b/tests/python/tests/prepare_search_query/data/example5_prepared @@ -1 +1 @@ -8յ+c+і􃿊aen7+ᖦ+ \ No newline at end of file +8յ+c+і􃿊aen7+ᖦ+ diff --git a/tests/python/tests/prepare_search_query/data/example6 b/tests/python/tests/prepare_search_query/data/example6 index 6a587efb0c..95cc2606dc 100644 --- a/tests/python/tests/prepare_search_query/data/example6 +++ b/tests/python/tests/prepare_search_query/data/example6 @@ -1 +1 @@ -׬qԻė^#xܵ칈T8y+䣳 V,ڦAڍ< 0 and expected_prepared_query[-1] == '\n': + expected_prepared_query = expected_prepared_query[:-1] headers = {"Content-Type": "text/plain; charset=utf-8"} resp = self.web_server.http_post(headers=headers, data=query.encode("utf-8")) From 37aefd38c63a0e79854fe98f9bf55047216584c2 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Fri, 28 Nov 2025 14:15:59 +0300 Subject: [PATCH 18/28] relative path for example files --- .../tests/prepare_search_query/test_prepare_search_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/tests/prepare_search_query/test_prepare_search_query.py b/tests/python/tests/prepare_search_query/test_prepare_search_query.py index d323a1779a..8b3dd04242 100644 --- a/tests/python/tests/prepare_search_query/test_prepare_search_query.py +++ b/tests/python/tests/prepare_search_query/test_prepare_search_query.py @@ -2,7 +2,7 @@ import os from python.lib.testcase import WebServerAutoTestCase -directory_path = "kphp/tests/python/tests/prepare_search_query/data" +directory_path = os.path.join(os.path.dirname(__file__), "data") prepared_suffix = "_prepared" From e9b49ef5ef12a758cb530d568dbbffef7bfc9415 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Fri, 28 Nov 2025 20:00:54 +0300 Subject: [PATCH 19/28] removed args from component-config.yaml --- .../tests/prepare_search_query/php/data/component-config.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/python/tests/prepare_search_query/php/data/component-config.yaml b/tests/python/tests/prepare_search_query/php/data/component-config.yaml index 5683788426..2ed98fed14 100644 --- a/tests/python/tests/prepare_search_query/php/data/component-config.yaml +++ b/tests/python/tests/prepare_search_query/php/data/component-config.yaml @@ -3,7 +3,4 @@ components: script: image: KPHP scope: Request - args: - ini hello: "world" - runtime-config: ${RUNTIME_CONFIG_PATH} links: {} From 2296f42f43523bfff9b229e228a3454fa3dbfd2c Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Thu, 12 Mar 2026 18:05:38 +0300 Subject: [PATCH 20/28] common prepare_search_query_impl --- common/unicode/unicode-utils.cpp | 184 ++++++++++-------- common/unicode/unicode-utils.h | 3 +- .../stdlib/string/string-functions.h | 32 +++ .../stdlib/string/string-functions.cpp | 3 +- .../stdlib/string/string-functions.h | 8 +- runtime/string_functions.cpp | 6 +- 6 files changed, 140 insertions(+), 96 deletions(-) diff --git a/common/unicode/unicode-utils.cpp b/common/unicode/unicode-utils.cpp index ebeb7b05ed..0a47bb748f 100644 --- a/common/unicode/unicode-utils.cpp +++ b/common/unicode/unicode-utils.cpp @@ -4,7 +4,11 @@ #include "common/unicode/unicode-utils.h" +#include #include +#include +#include +#include #include #include @@ -13,7 +17,7 @@ #include "common/unicode/utf8-utils.h" /* Search generated ranges for specified character */ -static int binary_search_ranges(const int* ranges, int r, int code) { +static int binary_search_ranges(const int* ranges, int r, int code, std::function assertf) { if ((unsigned int)code > 0x10ffff) { return 0; } @@ -43,9 +47,9 @@ static int binary_search_ranges(const int* ranges, int r, int code) { case 2: return ((code - 1) | 1); default: - assert(0); - exit(1); + assertf(false); } + return 0; } /* Convert character to upper case */ @@ -66,38 +70,38 @@ int unicode_tolower(int code) { } } +inline constexpr int32_t WHITESPACE_CODE_POINT{static_cast(' ')}; +inline constexpr int32_t PLUS_CODE_POINT{static_cast('+')}; + /* Prepares unicode 0-terminated string input for search, leaving only digits and letters with diacritics. Length of string can decrease. Returns length of result. */ -int prepare_search_string(int* input) { - int i; - int* output = input; - for (i = 0; input[i]; i++) { - int c = input[i], new_c; - if ((unsigned int)c < (unsigned int)TABLE_SIZE) { - new_c = prepare_table[c]; +size_t prepare_search_string(int32_t* code_points, std::function assertf) noexcept { + size_t output_size{}; + for (size_t i{}; code_points[i] != 0; ++i) { + int32_t c{code_points[i]}; + int32_t new_c{}; + if (static_cast(c) < static_cast(TABLE_SIZE)) { + new_c = static_cast(prepare_table[c]); } else { - new_c = binary_search_ranges(prepare_table_ranges, prepare_table_ranges_size, c); + new_c = binary_search_ranges(prepare_table_ranges, prepare_table_ranges_size, c, assertf); } - if (new_c) { - if (new_c != 0x20 || (output > input && output[-1] != 0x20)) { - *output++ = new_c; + if (new_c != 0) { + // we forbid 2 whitespaces after each other and starting whitespace + if (new_c != WHITESPACE_CODE_POINT || (output_size > 0 && code_points[output_size - 1] != WHITESPACE_CODE_POINT)) { + code_points[output_size++] = new_c; } } } - if (output > input && output[-1] == 0x20) { - output--; + if (output_size > 0 && code_points[output_size - 1] == WHITESPACE_CODE_POINT) { + // throw out terminating whitespace + --output_size; } - *output = 0; - return output - input; + code_points[output_size] = 0; + return output_size; } -static char prep_buf[MAX_NAME_BYTES_SIZE]; -int prep_ibuf[MAX_NAME_CODE_POINTS_SIZE]; -static int prep_ibuf_res[MAX_NAME_CODE_POINTS_SIZE]; -static int* words_ibuf[MAX_NAME_CODE_POINTS_SIZE]; - int stricmp_void(const void* x, const void* y) { const int* s1 = *(const int**)x; const int* s2 = *(const int**)y; @@ -106,88 +110,102 @@ int stricmp_void(const void* x, const void* y) { return *s1 - *s2; } -int* prepare_str_unicode(const int* x) { - int* v = prep_ibuf; - - int n; - if (v != x) { - for (n = 0; x[n]; n++) { - v[n] = x[n]; +inline size_t prepare_str_unicode(int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::function assertf) noexcept { + size_t code_points_length = prepare_search_string(code_points, assertf); + code_points[code_points_length] = WHITESPACE_CODE_POINT; + + size_t words_count{}; + size_t i{}; + // looking for the beginnings of the words + while (i < code_points_length) { + word_start_indices[words_count++] = i; + while (i < code_points_length && code_points[i] != WHITESPACE_CODE_POINT) { + ++i; } - v[n] = 0; + ++i; } - n = prepare_search_string(v); - v[n] = ' '; - - int i = 0, k = 0; - while (i < n) { - words_ibuf[k++] = v + i; - while (v[i] && v[i] != ' ') { - i++; + auto word_less_cmp{[&code_points](size_t x, size_t y) noexcept -> bool { + while (code_points[x] != WHITESPACE_CODE_POINT && code_points[x] == code_points[y]) { + ++x; + ++y; } - i++; - } + if (code_points[x] == WHITESPACE_CODE_POINT) { + return code_points[y] != WHITESPACE_CODE_POINT; + } + if (code_points[y] == WHITESPACE_CODE_POINT) { + return false; + } + return code_points[x] < code_points[y]; + }}; - qsort(words_ibuf, (size_t)k, sizeof(int*), stricmp_void); + std::sort(word_start_indices, std::next(word_start_indices, words_count), word_less_cmp); - int j = 0; - for (i = 0; i < k; i++) { - if (j == 0 || stricmp_void(&words_ibuf[j - 1], &words_ibuf[i])) { - words_ibuf[j++] = words_ibuf[i]; + size_t uniq_words_count{}; + for (i = 0; i < words_count; ++i) { + // drop duplicates + if (uniq_words_count == 0 || word_less_cmp(word_start_indices[uniq_words_count - 1], word_start_indices[i])) { + word_start_indices[uniq_words_count++] = word_start_indices[i]; } else { - words_ibuf[j - 1] = words_ibuf[i]; + word_start_indices[uniq_words_count - 1] = word_start_indices[i]; } } - k = j; - int* res = prep_ibuf_res; - for (i = 0; i < k; i++) { - int* tmp = words_ibuf[i]; - while (*tmp != ' ') { - *res++ = *tmp++; + size_t result_size{}; + // output words with '+' separator + for (i = 0; i < uniq_words_count; ++i) { + size_t ind{word_start_indices[i]}; + while (code_points[ind] != WHITESPACE_CODE_POINT) { + prepared_code_points[result_size++] = code_points[ind++]; } - *res++ = '+'; + prepared_code_points[result_size++] = PLUS_CODE_POINT; } - *res++ = 0; + prepared_code_points[result_size++] = 0; - assert(res - prep_ibuf_res < MAX_NAME_SIZE); - return prep_ibuf_res; + assertf(result_size < MAX_NAME_SIZE); + return result_size; } -const char* clean_str_unicode(const int* xx) { - assert(xx != NULL); - - int* v = prepare_str_unicode(xx); - int l = put_string_utf8(v, prep_buf); - assert(l < sizeof(prep_buf)); - - char *s = prep_buf, *x = prep_buf; - int skip; - - while (*x != 0) { - skip = !strncmp(x, "amp+", 4) || !strncmp(x, "gt+", 3) || !strncmp(x, "lt+", 3) || !strncmp(x, "quot+", 5) || !strncmp(x, "ft+", 3) || - !strncmp(x, "feat+", 5) || - (((x[0] == '1' && x[1] == '9') || (x[0] == '2' && x[1] == '0')) && ('0' <= x[2] && x[2] <= '9') && ('0' <= x[3] && x[3] <= '9') && x[4] == '+') || - !strncmp(x, "092+", 4) || !strncmp(x, "33+", 3) || !strncmp(x, "34+", 3) || !strncmp(x, "36+", 3) || !strncmp(x, "39+", 3) || - !strncmp(x, "60+", 3) || !strncmp(x, "62+", 3) || !strncmp(x, "8232+", 5) || !strncmp(x, "8233+", 5); +inline size_t clean_str_unicode(int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, + std::function assertf) noexcept { + prepare_str_unicode(code_points, word_start_indices, prepared_code_points, assertf); + + auto length{static_cast(put_string_utf8(prepared_code_points, reinterpret_cast(utf8_result)))}; + assertf(length < MAX_NAME_BYTES_SIZE); + + size_t i{}; + size_t result_size{}; + while (i < length) { + char* c{reinterpret_cast(std::addressof(utf8_result[i]))}; + bool skip{!std::strncmp(c, "amp+", 4) || !std::strncmp(c, "gt+", 3) || !std::strncmp(c, "lt+", 3) || !std::strncmp(c, "quot+", 5) || + !std::strncmp(c, "ft+", 3) || !std::strncmp(c, "feat+", 5) || + (((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') && c[4] == '+') || + !std::strncmp(c, "092+", 4) || !std::strncmp(c, "33+", 3) || !std::strncmp(c, "34+", 3) || !std::strncmp(c, "36+", 3) || + !std::strncmp(c, "39+", 3) || !std::strncmp(c, "60+", 3) || !std::strncmp(c, "62+", 3) || !std::strncmp(c, "8232+", 5) || + !std::strncmp(c, "8233+", 5)}; do { - *s = *x; if (!skip) { - s++; + utf8_result[result_size] = utf8_result[i]; + ++result_size; } - } while (*x++ != '+'); + } while (utf8_result[i++] != static_cast('+')); } - *s = 0; + utf8_result[result_size] = static_cast(0); - return prep_buf; + return result_size; } -const char* clean_str(const char* x) { - if (x == NULL || strlen(x) >= MAX_NAME_SIZE) { - return x; +size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, + std::function assertf) { + size_t x_len = strlen(x); + if (x == NULL || x_len >= MAX_NAME_SIZE) { + for (size_t i = 0; i < x_len; ++i) { + utf8_result[i] = static_cast(x[i]); + } + utf8_result[x_len] = static_cast(0); + return x_len; } - html_string_to_utf8(x, prep_ibuf); - return clean_str_unicode(prep_ibuf); + html_string_to_utf8(x, code_points); + return clean_str_unicode(code_points, word_start_indices, prepared_code_points, utf8_result, assertf); } diff --git a/common/unicode/unicode-utils.h b/common/unicode/unicode-utils.h index fb214488c0..60da63ad82 100644 --- a/common/unicode/unicode-utils.h +++ b/common/unicode/unicode-utils.h @@ -5,6 +5,7 @@ #pragma once #include +#include inline constexpr size_t MAX_NAME_SIZE = 65536; inline constexpr size_t MAX_NAME_BYTES_SIZE = 4 * MAX_NAME_SIZE + 4; @@ -12,4 +13,4 @@ inline constexpr size_t MAX_NAME_CODE_POINTS_SIZE = MAX_NAME_SIZE + 4; int unicode_toupper(int code); int unicode_tolower(int code); -const char* clean_str(const char* x); +size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result); diff --git a/runtime-common/stdlib/string/string-functions.h b/runtime-common/stdlib/string/string-functions.h index d200a644b8..a87fc3ddac 100644 --- a/runtime-common/stdlib/string/string-functions.h +++ b/runtime-common/stdlib/string/string-functions.h @@ -11,6 +11,7 @@ #include #include +#include "common/unicode/unicode-utils.h" #include "runtime-common/core/runtime-core.h" #include "runtime-common/core/utils/kphp-assert-core.h" #include "runtime-common/stdlib/string/string-context.h" @@ -542,3 +543,34 @@ string str_concat(str_concat_arg s1, str_concat_arg s2) noexcept; string str_concat(str_concat_arg s1, str_concat_arg s2, str_concat_arg s3) noexcept; string str_concat(str_concat_arg s1, str_concat_arg s2, str_concat_arg s3, str_concat_arg s4) noexcept; string str_concat(str_concat_arg s1, str_concat_arg s2, str_concat_arg s3, str_concat_arg s4, str_concat_arg s5) noexcept; + +namespace prepare_search_query_impl_ { + +inline constexpr size_t SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t WORD_INDICES_SPAN_SIZE_IN_BYTES = sizeof(size_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t RESULT_BYTES_SPAN_SIZE_IN_BYTES = sizeof(std::byte) * MAX_NAME_BYTES_SIZE; + +static_assert(SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES + WORD_INDICES_SPAN_SIZE_IN_BYTES + RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES + + RESULT_BYTES_SPAN_SIZE_IN_BYTES < + StringLibContext::STATIC_BUFFER_LENGTH); + +inline constexpr size_t SOURCE_CODE_POINTS_SPAN_BEGIN = 0; +inline constexpr size_t WORD_INDICES_SPAN_BEGIN = SOURCE_CODE_POINTS_SPAN_BEGIN + SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES; +inline constexpr size_t RESULT_CODE_POINTS_SPAN_BEGIN = WORD_INDICES_SPAN_BEGIN + WORD_INDICES_SPAN_SIZE_IN_BYTES; +inline constexpr size_t RESULT_BYTES_SPAN_BEGIN = RESULT_CODE_POINTS_SPAN_BEGIN + RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES; + +inline string prepare_search_query(const string& query, std::function assertf) noexcept { + auto& string_lib_ctx{StringLibContext::get()}; + int32_t* code_points{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), prepare_search_query_impl_::SOURCE_CODE_POINTS_SPAN_BEGIN))}; + size_t* word_start_indices{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), prepare_search_query_impl_::WORD_INDICES_SPAN_BEGIN))}; + int32_t* prepared_code_points{ + reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), prepare_search_query_impl_::RESULT_CODE_POINTS_SPAN_BEGIN))}; + std::byte* utf8_result{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), prepare_search_query_impl_::RESULT_BYTES_SPAN_BEGIN))}; + + size_t length{clean_str(query.c_str(), code_points, word_start_indices, prepared_code_points, utf8_result, assertf)}; + + return {reinterpret_cast(utf8_result), static_cast(length)}; +} + +} // namespace prepare_search_query_impl_ diff --git a/runtime-light/stdlib/string/string-functions.cpp b/runtime-light/stdlib/string/string-functions.cpp index 958607b375..a16dda76de 100644 --- a/runtime-light/stdlib/string/string-functions.cpp +++ b/runtime-light/stdlib/string/string-functions.cpp @@ -53,8 +53,7 @@ int32_t binary_search_ranges(int32_t code) noexcept { /* Prepares unicode 0-terminated string input for search, leaving only digits and letters with diacritics. - Length of string can decrease. - Returns length of result. */ + Length of string can decrease. */ void prepare_search_string(std::span& code_points) noexcept { size_t output_size{}; for (size_t i{}; code_points[i] != 0; ++i) { diff --git a/runtime-light/stdlib/string/string-functions.h b/runtime-light/stdlib/string/string-functions.h index 0f1480ab5d..342ebcfb9a 100644 --- a/runtime-light/stdlib/string/string-functions.h +++ b/runtime-light/stdlib/string/string-functions.h @@ -15,6 +15,7 @@ #include "common/unicode/utf8-utils.h" #include "runtime-common/core/runtime-core.h" #include "runtime-common/stdlib/string/string-context.h" +#include "runtime-common/stdlib/string/string-functions.h" #include "runtime-light/k2-platform/k2-api.h" #include "runtime-light/stdlib/diagnostics/logs.h" @@ -34,8 +35,6 @@ inline constexpr size_t WORD_INDICES_SPAN_BEGIN = SOURCE_CODE_POINTS_SPAN_BEGIN inline constexpr size_t RESULT_CODE_POINTS_SPAN_BEGIN = WORD_INDICES_SPAN_BEGIN + __WORD_INDICES_SPAN_SIZE_IN_BYTES; inline constexpr size_t RESULT_BYTES_SPAN_BEGIN = RESULT_CODE_POINTS_SPAN_BEGIN + __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES; -inline constexpr int32_t MAX_UTF8_CODE_POINT{0x10ffff}; - inline constexpr int32_t WHITESPACE{static_cast(' ')}; inline constexpr int32_t PLUS{static_cast('+')}; @@ -163,9 +162,8 @@ inline std::span prepare_search_query_impl(std::span s{ - string_functions_impl_::prepare_search_query_impl({reinterpret_cast(query.c_str()), static_cast(query.size())})}; - return {reinterpret_cast(s.data()), static_cast(s.size())}; + // TODO no problem if std::function allocate? + return prepare_search_query_impl_::prepare_search_query(query, [](bool condition) { kphp::log::assertion(condition); }); } inline Optional f$setlocale(int64_t category, const string& locale) noexcept { diff --git a/runtime/string_functions.cpp b/runtime/string_functions.cpp index 62faaad12b..3e5e70f9b9 100644 --- a/runtime/string_functions.cpp +++ b/runtime/string_functions.cpp @@ -34,11 +34,7 @@ Optional f$setlocale(int64_t category, const string& locale) noexcept { } string f$prepare_search_query(const string& query) noexcept { - const char* s = clean_str(query.c_str()); - if (s == nullptr) { - s = ""; - } - return string(s); + return prepare_search_query_impl_::prepare_search_query(query, [](bool condition) { assert(condition); }); } // Based on `getcsv` from `streams` From 892dca1c974105bc32f9e599d6e9ac5e01011ba6 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Thu, 12 Mar 2026 20:17:37 +0300 Subject: [PATCH 21/28] fixed --- common/unicode/unicode-utils.cpp | 14 +- common/unicode/unicode-utils.h | 3 +- runtime-light/stdlib/stdlib.cmake | 1 - .../stdlib/string/string-functions.cpp | 82 ---------- .../stdlib/string/string-functions.h | 142 ------------------ .../php/data/component-config.yaml | 1 + 6 files changed, 10 insertions(+), 233 deletions(-) delete mode 100644 runtime-light/stdlib/string/string-functions.cpp diff --git a/common/unicode/unicode-utils.cpp b/common/unicode/unicode-utils.cpp index 0a47bb748f..1bc0d046d7 100644 --- a/common/unicode/unicode-utils.cpp +++ b/common/unicode/unicode-utils.cpp @@ -57,7 +57,7 @@ int unicode_toupper(int code) { if ((unsigned int)code < (unsigned int)TABLE_SIZE) { return to_upper_table[code]; } else { - return binary_search_ranges(to_upper_table_ranges, to_upper_table_ranges_size, code); + return binary_search_ranges(to_upper_table_ranges, to_upper_table_ranges_size, code, [](bool condition) { assert(condition); }); } } @@ -66,7 +66,7 @@ int unicode_tolower(int code) { if ((unsigned int)code < (unsigned int)TABLE_SIZE) { return to_lower_table[code]; } else { - return binary_search_ranges(to_lower_table_ranges, to_lower_table_ranges_size, code); + return binary_search_ranges(to_lower_table_ranges, to_lower_table_ranges_size, code, [](bool condition) { assert(condition); }); } } @@ -177,12 +177,12 @@ inline size_t clean_str_unicode(int32_t* code_points, size_t* word_start_indices size_t result_size{}; while (i < length) { char* c{reinterpret_cast(std::addressof(utf8_result[i]))}; - bool skip{!std::strncmp(c, "amp+", 4) || !std::strncmp(c, "gt+", 3) || !std::strncmp(c, "lt+", 3) || !std::strncmp(c, "quot+", 5) || - !std::strncmp(c, "ft+", 3) || !std::strncmp(c, "feat+", 5) || + bool skip{!strncmp(c, "amp+", 4) || !strncmp(c, "gt+", 3) || !strncmp(c, "lt+", 3) || !strncmp(c, "quot+", 5) || + !strncmp(c, "ft+", 3) || !strncmp(c, "feat+", 5) || (((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') && c[4] == '+') || - !std::strncmp(c, "092+", 4) || !std::strncmp(c, "33+", 3) || !std::strncmp(c, "34+", 3) || !std::strncmp(c, "36+", 3) || - !std::strncmp(c, "39+", 3) || !std::strncmp(c, "60+", 3) || !std::strncmp(c, "62+", 3) || !std::strncmp(c, "8232+", 5) || - !std::strncmp(c, "8233+", 5)}; + !strncmp(c, "092+", 4) || !strncmp(c, "33+", 3) || !strncmp(c, "34+", 3) || !strncmp(c, "36+", 3) || + !strncmp(c, "39+", 3) || !strncmp(c, "60+", 3) || !strncmp(c, "62+", 3) || !strncmp(c, "8232+", 5) || + !strncmp(c, "8233+", 5)}; do { if (!skip) { utf8_result[result_size] = utf8_result[i]; diff --git a/common/unicode/unicode-utils.h b/common/unicode/unicode-utils.h index 60da63ad82..18fdf00aa6 100644 --- a/common/unicode/unicode-utils.h +++ b/common/unicode/unicode-utils.h @@ -6,6 +6,7 @@ #include #include +#include inline constexpr size_t MAX_NAME_SIZE = 65536; inline constexpr size_t MAX_NAME_BYTES_SIZE = 4 * MAX_NAME_SIZE + 4; @@ -13,4 +14,4 @@ inline constexpr size_t MAX_NAME_CODE_POINTS_SIZE = MAX_NAME_SIZE + 4; int unicode_toupper(int code); int unicode_tolower(int code); -size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result); +size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, std::function assertf); diff --git a/runtime-light/stdlib/stdlib.cmake b/runtime-light/stdlib/stdlib.cmake index 47f3648188..3831da6b07 100644 --- a/runtime-light/stdlib/stdlib.cmake +++ b/runtime-light/stdlib/stdlib.cmake @@ -39,7 +39,6 @@ prepend( string/regex-functions.cpp string/regex-state.cpp string/string-state.cpp - string/string-functions.cpp system/system-functions.cpp system/system-state.cpp time/date-interval.cpp diff --git a/runtime-light/stdlib/string/string-functions.cpp b/runtime-light/stdlib/string/string-functions.cpp deleted file mode 100644 index a16dda76de..0000000000 --- a/runtime-light/stdlib/string/string-functions.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// Compiler for PHP (aka KPHP) -// Copyright (c) 2025 LLC «V Kontakte» -// Distributed under the GPL v3 License, see LICENSE.notice.txt - -#include "runtime-light/stdlib/string/string-functions.h" - -#include -#include -#include - -#include "auto/common/unicode-utils-auto.h" -#include "runtime-light/k2-platform/k2-api.h" - -namespace string_functions_impl_ { - -/* Search generated ranges for specified character */ -int32_t binary_search_ranges(int32_t code) noexcept { - if (code > MAX_UTF8_CODE_POINT) { - return 0; - } - - size_t l{0}; - size_t r{prepare_table_ranges_size}; - while (l < r) { - size_t m{((l + r + 2) >> 2) << 1}; - if (prepare_table_ranges[m] <= code) { - l = m; - } else { - r = m - 2; - } - } - - // prepare_table_ranges[l] - key - // prepare_table_ranges[l + 1] - value - int32_t t{prepare_table_ranges[l + 1]}; - if (t < 0) { - return code - prepare_table_ranges[l] + (~t); - } - if (t <= 0x10ffff) { - return t; - } - switch (t - 0x200000) { - case 0: - return (code & -2); - case 1: - return (code | 1); - case 2: - return ((code - 1) | 1); - default: - k2::exit(1); - } -} - -/* Prepares unicode 0-terminated string input for search, - leaving only digits and letters with diacritics. - Length of string can decrease. */ -void prepare_search_string(std::span& code_points) noexcept { - size_t output_size{}; - for (size_t i{}; code_points[i] != 0; ++i) { - int32_t c{code_points[i]}; - int32_t new_c{}; - if (static_cast(c) < static_cast(TABLE_SIZE)) { - new_c = static_cast(prepare_table[c]); - } else { - new_c = binary_search_ranges(c); - } - if (new_c != 0) { - // we forbid 2 whitespaces after each other and starting whitespace - if (new_c != WHITESPACE || (output_size > 0 && code_points[output_size - 1] != WHITESPACE)) { - code_points[output_size++] = new_c; - } - } - } - if (output_size > 0 && code_points[output_size - 1] == WHITESPACE) { - // throw out terminating whitespace - --output_size; - } - code_points[output_size] = 0; - code_points = code_points.first(output_size); -} - -} // namespace string_functions_impl_ diff --git a/runtime-light/stdlib/string/string-functions.h b/runtime-light/stdlib/string/string-functions.h index 342ebcfb9a..81f8c7008a 100644 --- a/runtime-light/stdlib/string/string-functions.h +++ b/runtime-light/stdlib/string/string-functions.h @@ -19,148 +19,6 @@ #include "runtime-light/k2-platform/k2-api.h" #include "runtime-light/stdlib/diagnostics/logs.h" -namespace string_functions_impl_ { - -inline constexpr size_t __SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; -inline constexpr size_t __WORD_INDICES_SPAN_SIZE_IN_BYTES = sizeof(size_t) * MAX_NAME_CODE_POINTS_SIZE; -inline constexpr size_t __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; -inline constexpr size_t __RESULT_BYTES_SPAN_SIZE_IN_BYTES = sizeof(std::byte) * MAX_NAME_BYTES_SIZE; - -static_assert(__SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES + __WORD_INDICES_SPAN_SIZE_IN_BYTES + __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES + - __RESULT_BYTES_SPAN_SIZE_IN_BYTES < - StringLibContext::STATIC_BUFFER_LENGTH); - -inline constexpr size_t SOURCE_CODE_POINTS_SPAN_BEGIN = 0; -inline constexpr size_t WORD_INDICES_SPAN_BEGIN = SOURCE_CODE_POINTS_SPAN_BEGIN + __SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES; -inline constexpr size_t RESULT_CODE_POINTS_SPAN_BEGIN = WORD_INDICES_SPAN_BEGIN + __WORD_INDICES_SPAN_SIZE_IN_BYTES; -inline constexpr size_t RESULT_BYTES_SPAN_BEGIN = RESULT_CODE_POINTS_SPAN_BEGIN + __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES; - -inline constexpr int32_t WHITESPACE{static_cast(' ')}; -inline constexpr int32_t PLUS{static_cast('+')}; - -/* Search generated ranges for specified character */ -int32_t binary_search_ranges(int32_t code) noexcept; - -/* Prepares unicode 0-terminated string input for search, - leaving only digits and letters with diacritics. - Length of string can decrease. - Returns length of result. */ -void prepare_search_string(std::span& code_points) noexcept; - -inline std::span prepare_str_unicode(std::span code_points) noexcept { - prepare_search_string(code_points); - code_points[code_points.size()] = WHITESPACE; - - auto& string_lib_ctx{StringLibContext::get()}; - auto* word_indices_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), WORD_INDICES_SPAN_BEGIN))}; - // indices of first char of every word in `code_points`. - std::span word_start_indices{word_indices_begin, MAX_NAME_CODE_POINTS_SIZE}; - size_t words_count{}; - size_t i{}; - // looking for the beginnings of the words - while (i < code_points.size()) { - word_start_indices[words_count++] = i; - while (i < code_points.size() && code_points[i] != WHITESPACE) { - ++i; - } - ++i; - } - word_start_indices = word_start_indices.first(words_count); - - auto word_less_cmp{[&code_points](size_t x, size_t y) noexcept -> bool { - while (code_points[x] != WHITESPACE && code_points[x] == code_points[y]) { - ++x; - ++y; - } - if (code_points[x] == WHITESPACE) { - return code_points[y] != WHITESPACE; - } - if (code_points[y] == WHITESPACE) { - return false; - } - return code_points[x] < code_points[y]; - }}; - - std::sort(word_start_indices.begin(), word_start_indices.end(), word_less_cmp); - - size_t uniq_words_count{}; - for (i = 0; i < words_count; ++i) { - // drop duplicates - if (uniq_words_count == 0 || word_less_cmp(word_start_indices[uniq_words_count - 1], word_start_indices[i])) { - word_start_indices[uniq_words_count++] = word_start_indices[i]; - } else { - word_start_indices[uniq_words_count - 1] = word_start_indices[i]; - } - } - - auto* result_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), RESULT_CODE_POINTS_SPAN_BEGIN))}; - std::span result{result_begin, MAX_NAME_CODE_POINTS_SIZE}; - size_t result_size{}; - // output words with '+' separator - for (i = 0; i < uniq_words_count; ++i) { - size_t ind{word_start_indices[i]}; - while (code_points[ind] != WHITESPACE) { - result[result_size++] = code_points[ind++]; - } - result[result_size++] = PLUS; - } - result[result_size++] = 0; - - kphp::log::assertion(result_size < MAX_NAME_SIZE); - result = result.first(result_size); - return result; -} - -inline std::span clean_str_unicode(std::span source_code_points) noexcept { - std::span prepared_code_points{prepare_str_unicode(source_code_points)}; - - auto& string_lib_ctx{StringLibContext::get()}; - auto* utf8_result_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), RESULT_BYTES_SPAN_BEGIN))}; - std::span utf8_result{utf8_result_begin, MAX_NAME_BYTES_SIZE}; - auto length{static_cast(put_string_utf8(prepared_code_points.data(), reinterpret_cast(utf8_result.data())))}; - kphp::log::assertion(length < utf8_result.size()); - utf8_result = utf8_result.first(length); - - size_t i{}; - size_t result_size{}; - while (i < utf8_result.size()) { - char* c{reinterpret_cast(std::addressof(utf8_result[i]))}; - bool skip{!std::strncmp(c, "amp+", 4) || !std::strncmp(c, "gt+", 3) || !std::strncmp(c, "lt+", 3) || !std::strncmp(c, "quot+", 5) || - !std::strncmp(c, "ft+", 3) || !std::strncmp(c, "feat+", 5) || - (((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') && c[4] == '+') || - !std::strncmp(c, "092+", 4) || !std::strncmp(c, "33+", 3) || !std::strncmp(c, "34+", 3) || !std::strncmp(c, "36+", 3) || - !std::strncmp(c, "39+", 3) || !std::strncmp(c, "60+", 3) || !std::strncmp(c, "62+", 3) || !std::strncmp(c, "8232+", 5) || - !std::strncmp(c, "8233+", 5)}; - do { - if (!skip) { - utf8_result[result_size] = utf8_result[i]; - ++result_size; - } - } while (utf8_result[i++] != static_cast('+')); - } - utf8_result[result_size] = static_cast(0); - - return utf8_result; -} - -inline std::span prepare_search_query_impl(std::span x) noexcept { - if (x.empty() || x.size() >= MAX_NAME_SIZE) { - return x; - } - - auto& string_lib_ctx{StringLibContext::get()}; - auto* source_code_points_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), SOURCE_CODE_POINTS_SPAN_BEGIN))}; - std::span source_code_points{ - source_code_points_begin, - MAX_NAME_CODE_POINTS_SIZE, - }; - - html_string_to_utf8(reinterpret_cast(x.data()), source_code_points.data()); - return clean_str_unicode(source_code_points); -} - -} // namespace string_functions_impl_ - inline string f$prepare_search_query(const string& query) noexcept { // TODO no problem if std::function allocate? return prepare_search_query_impl_::prepare_search_query(query, [](bool condition) { kphp::log::assertion(condition); }); diff --git a/tests/python/tests/prepare_search_query/php/data/component-config.yaml b/tests/python/tests/prepare_search_query/php/data/component-config.yaml index 2ed98fed14..5ec7d22c4e 100644 --- a/tests/python/tests/prepare_search_query/php/data/component-config.yaml +++ b/tests/python/tests/prepare_search_query/php/data/component-config.yaml @@ -3,4 +3,5 @@ components: script: image: KPHP scope: Request + args: {} links: {} From 68c095ffca175a37496e0ff43dfcec40b03f732c Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Thu, 12 Mar 2026 20:33:10 +0300 Subject: [PATCH 22/28] fmt --- common/unicode/unicode-utils.cpp | 9 ++++----- common/unicode/unicode-utils.h | 3 ++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/common/unicode/unicode-utils.cpp b/common/unicode/unicode-utils.cpp index 1bc0d046d7..8594e27b7a 100644 --- a/common/unicode/unicode-utils.cpp +++ b/common/unicode/unicode-utils.cpp @@ -177,12 +177,11 @@ inline size_t clean_str_unicode(int32_t* code_points, size_t* word_start_indices size_t result_size{}; while (i < length) { char* c{reinterpret_cast(std::addressof(utf8_result[i]))}; - bool skip{!strncmp(c, "amp+", 4) || !strncmp(c, "gt+", 3) || !strncmp(c, "lt+", 3) || !strncmp(c, "quot+", 5) || - !strncmp(c, "ft+", 3) || !strncmp(c, "feat+", 5) || + bool skip{!strncmp(c, "amp+", 4) || !strncmp(c, "gt+", 3) || !strncmp(c, "lt+", 3) || !strncmp(c, "quot+", 5) || !strncmp(c, "ft+", 3) || + !strncmp(c, "feat+", 5) || (((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') && c[4] == '+') || - !strncmp(c, "092+", 4) || !strncmp(c, "33+", 3) || !strncmp(c, "34+", 3) || !strncmp(c, "36+", 3) || - !strncmp(c, "39+", 3) || !strncmp(c, "60+", 3) || !strncmp(c, "62+", 3) || !strncmp(c, "8232+", 5) || - !strncmp(c, "8233+", 5)}; + !strncmp(c, "092+", 4) || !strncmp(c, "33+", 3) || !strncmp(c, "34+", 3) || !strncmp(c, "36+", 3) || !strncmp(c, "39+", 3) || + !strncmp(c, "60+", 3) || !strncmp(c, "62+", 3) || !strncmp(c, "8232+", 5) || !strncmp(c, "8233+", 5)}; do { if (!skip) { utf8_result[result_size] = utf8_result[i]; diff --git a/common/unicode/unicode-utils.h b/common/unicode/unicode-utils.h index 18fdf00aa6..7088b08da9 100644 --- a/common/unicode/unicode-utils.h +++ b/common/unicode/unicode-utils.h @@ -14,4 +14,5 @@ inline constexpr size_t MAX_NAME_CODE_POINTS_SIZE = MAX_NAME_SIZE + 4; int unicode_toupper(int code); int unicode_tolower(int code); -size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, std::function assertf); +size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, + std::function assertf); From 04d8d5ebfb0e68d789f01bdcff57dab62d3efb81 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Thu, 12 Mar 2026 20:37:52 +0300 Subject: [PATCH 23/28] brace init --- common/unicode/unicode-utils.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/common/unicode/unicode-utils.cpp b/common/unicode/unicode-utils.cpp index 8594e27b7a..c7dfcd34a6 100644 --- a/common/unicode/unicode-utils.cpp +++ b/common/unicode/unicode-utils.cpp @@ -102,14 +102,6 @@ size_t prepare_search_string(int32_t* code_points, std::function ass return output_size; } -int stricmp_void(const void* x, const void* y) { - const int* s1 = *(const int**)x; - const int* s2 = *(const int**)y; - while (*s1 == *s2 && *s1 != ' ') - s1++, s2++; - return *s1 - *s2; -} - inline size_t prepare_str_unicode(int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::function assertf) noexcept { size_t code_points_length = prepare_search_string(code_points, assertf); code_points[code_points_length] = WHITESPACE_CODE_POINT; @@ -196,7 +188,7 @@ inline size_t clean_str_unicode(int32_t* code_points, size_t* word_start_indices size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, std::function assertf) { - size_t x_len = strlen(x); + size_t x_len{strlen(x)}; if (x == NULL || x_len >= MAX_NAME_SIZE) { for (size_t i = 0; i < x_len; ++i) { utf8_result[i] = static_cast(x[i]); From 2eb75245a3cbb309b10cfdf3f5416c481a5a7685 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Thu, 12 Mar 2026 21:44:27 +0300 Subject: [PATCH 24/28] std::function removed with function pointer --- common/unicode/unicode-utils.cpp | 12 ++++++------ common/unicode/unicode-utils.h | 2 +- runtime-common/stdlib/string/string-functions.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/common/unicode/unicode-utils.cpp b/common/unicode/unicode-utils.cpp index c7dfcd34a6..5e642319df 100644 --- a/common/unicode/unicode-utils.cpp +++ b/common/unicode/unicode-utils.cpp @@ -17,7 +17,7 @@ #include "common/unicode/utf8-utils.h" /* Search generated ranges for specified character */ -static int binary_search_ranges(const int* ranges, int r, int code, std::function assertf) { +static int binary_search_ranges(const int* ranges, int r, int code, void (*assertf)(bool)) { if ((unsigned int)code > 0x10ffff) { return 0; } @@ -77,7 +77,7 @@ inline constexpr int32_t PLUS_CODE_POINT{static_cast('+')}; leaving only digits and letters with diacritics. Length of string can decrease. Returns length of result. */ -size_t prepare_search_string(int32_t* code_points, std::function assertf) noexcept { +size_t prepare_search_string(int32_t* code_points, void (*assertf)(bool)) noexcept { size_t output_size{}; for (size_t i{}; code_points[i] != 0; ++i) { int32_t c{code_points[i]}; @@ -102,7 +102,7 @@ size_t prepare_search_string(int32_t* code_points, std::function ass return output_size; } -inline size_t prepare_str_unicode(int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::function assertf) noexcept { +inline size_t prepare_str_unicode(int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, void (*assertf)(bool)) noexcept { size_t code_points_length = prepare_search_string(code_points, assertf); code_points[code_points_length] = WHITESPACE_CODE_POINT; @@ -159,7 +159,7 @@ inline size_t prepare_str_unicode(int32_t* code_points, size_t* word_start_indic } inline size_t clean_str_unicode(int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, - std::function assertf) noexcept { + void (*assertf)(bool)) noexcept { prepare_str_unicode(code_points, word_start_indices, prepared_code_points, assertf); auto length{static_cast(put_string_utf8(prepared_code_points, reinterpret_cast(utf8_result)))}; @@ -187,9 +187,9 @@ inline size_t clean_str_unicode(int32_t* code_points, size_t* word_start_indices } size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, - std::function assertf) { + void (*assertf)(bool)) { size_t x_len{strlen(x)}; - if (x == NULL || x_len >= MAX_NAME_SIZE) { + if (assertf == nullptr || x == NULL || x_len >= MAX_NAME_SIZE) { for (size_t i = 0; i < x_len; ++i) { utf8_result[i] = static_cast(x[i]); } diff --git a/common/unicode/unicode-utils.h b/common/unicode/unicode-utils.h index 7088b08da9..a4268b9912 100644 --- a/common/unicode/unicode-utils.h +++ b/common/unicode/unicode-utils.h @@ -15,4 +15,4 @@ inline constexpr size_t MAX_NAME_CODE_POINTS_SIZE = MAX_NAME_SIZE + 4; int unicode_toupper(int code); int unicode_tolower(int code); size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, - std::function assertf); + void (*assertf)(bool)); diff --git a/runtime-common/stdlib/string/string-functions.h b/runtime-common/stdlib/string/string-functions.h index a87fc3ddac..ce0784e8b6 100644 --- a/runtime-common/stdlib/string/string-functions.h +++ b/runtime-common/stdlib/string/string-functions.h @@ -560,7 +560,7 @@ inline constexpr size_t WORD_INDICES_SPAN_BEGIN = SOURCE_CODE_POINTS_SPAN_BEGIN inline constexpr size_t RESULT_CODE_POINTS_SPAN_BEGIN = WORD_INDICES_SPAN_BEGIN + WORD_INDICES_SPAN_SIZE_IN_BYTES; inline constexpr size_t RESULT_BYTES_SPAN_BEGIN = RESULT_CODE_POINTS_SPAN_BEGIN + RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES; -inline string prepare_search_query(const string& query, std::function assertf) noexcept { +inline string prepare_search_query(const string& query, void (*assertf)(bool)) noexcept { auto& string_lib_ctx{StringLibContext::get()}; int32_t* code_points{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), prepare_search_query_impl_::SOURCE_CODE_POINTS_SPAN_BEGIN))}; size_t* word_start_indices{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), prepare_search_query_impl_::WORD_INDICES_SPAN_BEGIN))}; From d221bd8c34fd95fb8e90a53a5e0ffaa3463cab30 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Thu, 12 Mar 2026 21:58:47 +0300 Subject: [PATCH 25/28] fmt --- common/unicode/unicode-utils.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common/unicode/unicode-utils.h b/common/unicode/unicode-utils.h index a4268b9912..7375280eb8 100644 --- a/common/unicode/unicode-utils.h +++ b/common/unicode/unicode-utils.h @@ -14,5 +14,4 @@ inline constexpr size_t MAX_NAME_CODE_POINTS_SIZE = MAX_NAME_SIZE + 4; int unicode_toupper(int code); int unicode_tolower(int code); -size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, - void (*assertf)(bool)); +size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, void (*assertf)(bool)); From 463cda2348032c493dd187041e9eb7da2a9e6bd5 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Fri, 13 Mar 2026 19:15:39 +0300 Subject: [PATCH 26/28] removed unused asserts --- common/unicode/unicode-utils.cpp | 5 ++--- common/unicode/unicode-utils.h | 1 - common/unicode/utf8-utils.cpp | 2 -- runtime-light/stdlib/string/string-functions.h | 11 +---------- runtime/string_functions.cpp | 2 +- 5 files changed, 4 insertions(+), 17 deletions(-) diff --git a/common/unicode/unicode-utils.cpp b/common/unicode/unicode-utils.cpp index 5e642319df..5dd46e6fef 100644 --- a/common/unicode/unicode-utils.cpp +++ b/common/unicode/unicode-utils.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -57,7 +56,7 @@ int unicode_toupper(int code) { if ((unsigned int)code < (unsigned int)TABLE_SIZE) { return to_upper_table[code]; } else { - return binary_search_ranges(to_upper_table_ranges, to_upper_table_ranges_size, code, [](bool condition) { assert(condition); }); + return binary_search_ranges(to_upper_table_ranges, to_upper_table_ranges_size, code, [](bool condition) noexcept { assert(condition); }); } } @@ -66,7 +65,7 @@ int unicode_tolower(int code) { if ((unsigned int)code < (unsigned int)TABLE_SIZE) { return to_lower_table[code]; } else { - return binary_search_ranges(to_lower_table_ranges, to_lower_table_ranges_size, code, [](bool condition) { assert(condition); }); + return binary_search_ranges(to_lower_table_ranges, to_lower_table_ranges_size, code, [](bool condition) noexcept { assert(condition); }); } } diff --git a/common/unicode/unicode-utils.h b/common/unicode/unicode-utils.h index 7375280eb8..72a894827a 100644 --- a/common/unicode/unicode-utils.h +++ b/common/unicode/unicode-utils.h @@ -6,7 +6,6 @@ #include #include -#include inline constexpr size_t MAX_NAME_SIZE = 65536; inline constexpr size_t MAX_NAME_BYTES_SIZE = 4 * MAX_NAME_SIZE + 4; diff --git a/common/unicode/utf8-utils.cpp b/common/unicode/utf8-utils.cpp index 16c6be0aaa..ad0dfb39b1 100644 --- a/common/unicode/utf8-utils.cpp +++ b/common/unicode/utf8-utils.cpp @@ -5,7 +5,6 @@ #include "common/unicode/utf8-utils.h" #include -#include #include #include @@ -992,7 +991,6 @@ int simplify_character(int c) { } } -// TODO does constexpr std::array enough for safe use in runtime-light ? constexpr std::array _s_1__{97, 0}; constexpr std::array _v_1__{1072, 0}; constexpr std::array _s_2__{98, 0}; diff --git a/runtime-light/stdlib/string/string-functions.h b/runtime-light/stdlib/string/string-functions.h index 81f8c7008a..aa6b147dde 100644 --- a/runtime-light/stdlib/string/string-functions.h +++ b/runtime-light/stdlib/string/string-functions.h @@ -4,24 +4,15 @@ #pragma once -#include -#include #include -#include -#include -#include -#include "common/unicode/unicode-utils.h" -#include "common/unicode/utf8-utils.h" #include "runtime-common/core/runtime-core.h" -#include "runtime-common/stdlib/string/string-context.h" #include "runtime-common/stdlib/string/string-functions.h" #include "runtime-light/k2-platform/k2-api.h" #include "runtime-light/stdlib/diagnostics/logs.h" inline string f$prepare_search_query(const string& query) noexcept { - // TODO no problem if std::function allocate? - return prepare_search_query_impl_::prepare_search_query(query, [](bool condition) { kphp::log::assertion(condition); }); + return prepare_search_query_impl_::prepare_search_query(query, [](bool condition) noexcept { kphp::log::assertion(condition); }); } inline Optional f$setlocale(int64_t category, const string& locale) noexcept { diff --git a/runtime/string_functions.cpp b/runtime/string_functions.cpp index 3e5e70f9b9..28f7c378dc 100644 --- a/runtime/string_functions.cpp +++ b/runtime/string_functions.cpp @@ -34,7 +34,7 @@ Optional f$setlocale(int64_t category, const string& locale) noexcept { } string f$prepare_search_query(const string& query) noexcept { - return prepare_search_query_impl_::prepare_search_query(query, [](bool condition) { assert(condition); }); + return prepare_search_query_impl_::prepare_search_query(query, [](bool condition) noexcept { assert(condition); }); } // Based on `getcsv` from `streams` From 7a10bbbfa1b6edfa98d2980b489ee6d7cc56c9f7 Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Fri, 13 Mar 2026 19:44:19 +0300 Subject: [PATCH 27/28] assertf passed to unicode_toupper() and unicode_tolower() --- common/unicode/unicode-utils.cpp | 12 ++++---- common/unicode/unicode-utils.h | 4 +-- .../stdlib/string/mbstring-functions.cpp | 12 ++++---- .../stdlib/string/mbstring-functions.h | 6 ++-- .../stdlib/string/mbstring-functions.h | 28 +++++++++++++++++++ runtime/mbstring-functions.h | 28 +++++++++++++++++++ 6 files changed, 74 insertions(+), 16 deletions(-) create mode 100644 runtime-light/stdlib/string/mbstring-functions.h create mode 100644 runtime/mbstring-functions.h diff --git a/common/unicode/unicode-utils.cpp b/common/unicode/unicode-utils.cpp index 5dd46e6fef..88d535eee0 100644 --- a/common/unicode/unicode-utils.cpp +++ b/common/unicode/unicode-utils.cpp @@ -46,26 +46,28 @@ static int binary_search_ranges(const int* ranges, int r, int code, void (*asser case 2: return ((code - 1) | 1); default: - assertf(false); + if (assertf != nullptr) { + assertf(false); + } } return 0; } /* Convert character to upper case */ -int unicode_toupper(int code) { +int unicode_toupper(int code, void (*assertf)(bool)) { if ((unsigned int)code < (unsigned int)TABLE_SIZE) { return to_upper_table[code]; } else { - return binary_search_ranges(to_upper_table_ranges, to_upper_table_ranges_size, code, [](bool condition) noexcept { assert(condition); }); + return binary_search_ranges(to_upper_table_ranges, to_upper_table_ranges_size, code, assertf); } } /* Convert character to lower case */ -int unicode_tolower(int code) { +int unicode_tolower(int code, void (*assertf)(bool)) { if ((unsigned int)code < (unsigned int)TABLE_SIZE) { return to_lower_table[code]; } else { - return binary_search_ranges(to_lower_table_ranges, to_lower_table_ranges_size, code, [](bool condition) noexcept { assert(condition); }); + return binary_search_ranges(to_lower_table_ranges, to_lower_table_ranges_size, code, assertf); } } diff --git a/common/unicode/unicode-utils.h b/common/unicode/unicode-utils.h index 72a894827a..50c59af432 100644 --- a/common/unicode/unicode-utils.h +++ b/common/unicode/unicode-utils.h @@ -11,6 +11,6 @@ inline constexpr size_t MAX_NAME_SIZE = 65536; inline constexpr size_t MAX_NAME_BYTES_SIZE = 4 * MAX_NAME_SIZE + 4; inline constexpr size_t MAX_NAME_CODE_POINTS_SIZE = MAX_NAME_SIZE + 4; -int unicode_toupper(int code); -int unicode_tolower(int code); +int unicode_toupper(int code, void (*assertf)(bool)); +int unicode_tolower(int code, void (*assertf)(bool)); size_t clean_str(const char* x, int32_t* code_points, size_t* word_start_indices, int32_t* prepared_code_points, std::byte* utf8_result, void (*assertf)(bool)); diff --git a/runtime-common/stdlib/string/mbstring-functions.cpp b/runtime-common/stdlib/string/mbstring-functions.cpp index ef84817f03..6faf0566db 100644 --- a/runtime-common/stdlib/string/mbstring-functions.cpp +++ b/runtime-common/stdlib/string/mbstring-functions.cpp @@ -130,7 +130,7 @@ int64_t f$mb_strlen(const string& str, const string& encoding) noexcept { return mb_UTF8_strlen(str.c_str()); } -string f$mb_strtolower(const string& str, const string& encoding) noexcept { +string mb_strtolower_impl(const string& str, void (*assertf)(bool), const string& encoding) noexcept { int encoding_num = mb_detect_encoding(encoding); if (encoding_num < 0) { php_critical_error("encoding \"%s\" doesn't supported in mb_strtolower", encoding.c_str()); @@ -184,7 +184,7 @@ string f$mb_strtolower(const string& str, const string& encoding) noexcept { int ch = 0; while ((p = get_char_utf8(&ch, s)) > 0) { s += p; - res_len += put_char_utf8(unicode_tolower(ch), &res[res_len]); + res_len += put_char_utf8(unicode_tolower(ch, assertf), &res[res_len]); } if (p < 0) { php_warning("Incorrect UTF-8 string \"%s\" in function mb_strtolower", str.c_str()); @@ -195,7 +195,7 @@ string f$mb_strtolower(const string& str, const string& encoding) noexcept { } } -string f$mb_strtoupper(const string& str, const string& encoding) noexcept { +string mb_strtoupper_impl(const string& str, void (*assertf)(bool), const string& encoding) noexcept { int encoding_num = mb_detect_encoding(encoding); if (encoding_num < 0) { php_critical_error("encoding \"%s\" doesn't supported in mb_strtoupper", encoding.c_str()); @@ -254,7 +254,7 @@ string f$mb_strtoupper(const string& str, const string& encoding) noexcept { int ch = 0; while ((p = get_char_utf8(&ch, s)) > 0) { s += p; - res_len += put_char_utf8(unicode_toupper(ch), &res[res_len]); + res_len += put_char_utf8(unicode_toupper(ch, assertf), &res[res_len]); } if (p < 0) { php_warning("Incorrect UTF-8 string \"%s\" in function mb_strtoupper", str.c_str()); @@ -307,9 +307,9 @@ Optional f$mb_strpos(const string& haystack, const string& needle, int6 return false; } -Optional f$mb_stripos(const string& haystack, const string& needle, int64_t offset, const string& encoding) noexcept { +Optional mb_stripos_impl(const string& haystack, const string& needle, void (*assertf)(bool), int64_t offset, const string& encoding) noexcept { if (const int encoding_num = check_strpos_agrs("mb_stripos", needle, offset, encoding)) { - return mp_strpos_impl(f$mb_strtolower(haystack, encoding), f$mb_strtolower(needle, encoding), offset, encoding_num); + return mp_strpos_impl(mb_strtolower_impl(haystack, assertf, encoding), mb_strtolower_impl(needle, assertf, encoding), offset, encoding_num); } return false; } diff --git a/runtime-common/stdlib/string/mbstring-functions.h b/runtime-common/stdlib/string/mbstring-functions.h index 6d0432ac9b..a15f75f5cf 100644 --- a/runtime-common/stdlib/string/mbstring-functions.h +++ b/runtime-common/stdlib/string/mbstring-functions.h @@ -15,14 +15,14 @@ bool f$mb_check_encoding(const string& str, const string& encoding = StringLibCo int64_t f$mb_strlen(const string& str, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept; -string f$mb_strtolower(const string& str, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept; +string mb_strtolower_impl(const string& str, void (*assertf)(bool), const string& encoding = StringLibConstants::get().CP1251_STR) noexcept; -string f$mb_strtoupper(const string& str, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept; +string mb_strtoupper_impl(const string& str, void (*assertf)(bool), const string& encoding = StringLibConstants::get().CP1251_STR) noexcept; Optional f$mb_strpos(const string& haystack, const string& needle, int64_t offset = 0, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept; -Optional f$mb_stripos(const string& haystack, const string& needle, int64_t offset = 0, +Optional mb_stripos_impl(const string& haystack, const string& needle, void (*assertf)(bool), int64_t offset = 0, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept; string f$mb_substr(const string& str, int64_t start, const mixed& length = std::numeric_limits::max(), diff --git a/runtime-light/stdlib/string/mbstring-functions.h b/runtime-light/stdlib/string/mbstring-functions.h new file mode 100644 index 0000000000..16f7466fb4 --- /dev/null +++ b/runtime-light/stdlib/string/mbstring-functions.h @@ -0,0 +1,28 @@ +// Compiler for PHP (aka KPHP) +// Copyright (c) 2026 LLC «V Kontakte» +// Distributed under the GPL v3 License, see LICENSE.notice.txt + +#pragma once + +#include + +#include "runtime-common/core/runtime-core.h" +#include "runtime-common/stdlib/string/mbstring-functions.h" +#include "runtime-common/stdlib/string/string-context.h" +#include "runtime-light/stdlib/diagnostics/logs.h" + +inline string f$mb_strtolower(const string& str, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept { + return mb_strtolower_impl( + str, [](bool condition) noexcept { kphp::log::assertion(condition); }, encoding); +} + +inline string f$mb_strtoupper(const string& str, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept { + return mb_strtoupper_impl( + str, [](bool condition) noexcept { kphp::log::assertion(condition); }, encoding); +} + +inline Optional f$mb_stripos(const string& haystack, const string& needle, int64_t offset = 0, + const string& encoding = StringLibConstants::get().CP1251_STR) noexcept { + return mb_stripos_impl( + haystack, needle, [](bool condition) noexcept { kphp::log::assertion(condition); }, offset, encoding); +} diff --git a/runtime/mbstring-functions.h b/runtime/mbstring-functions.h new file mode 100644 index 0000000000..4123062fea --- /dev/null +++ b/runtime/mbstring-functions.h @@ -0,0 +1,28 @@ +// Compiler for PHP (aka KPHP) +// Copyright (c) 2026 LLC «V Kontakte» +// Distributed under the GPL v3 License, see LICENSE.notice.txt + +#pragma once + +#include +#include + +#include "runtime-common/core/runtime-core.h" +#include "runtime-common/stdlib/string/mbstring-functions.h" +#include "runtime-common/stdlib/string/string-context.h" + +inline string f$mb_strtolower(const string& str, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept { + return mb_strtolower_impl( + str, [](bool condition) noexcept { assert(condition); }, encoding); +} + +inline string f$mb_strtoupper(const string& str, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept { + return mb_strtoupper_impl( + str, [](bool condition) noexcept { assert(condition); }, encoding); +} + +inline Optional f$mb_stripos(const string& haystack, const string& needle, int64_t offset = 0, + const string& encoding = StringLibConstants::get().CP1251_STR) noexcept { + return mb_stripos_impl( + haystack, needle, [](bool condition) noexcept { assert(condition); }, offset, encoding); +} From 61bf6a0a5d45d3f589bf525c1b966a2f404c16be Mon Sep 17 00:00:00 2001 From: Nikita Siniachenko Date: Fri, 13 Mar 2026 19:48:56 +0300 Subject: [PATCH 28/28] fmt --- runtime-common/stdlib/string/mbstring-functions.h | 2 +- runtime-light/stdlib/string/mbstring-functions.h | 9 +++------ runtime/mbstring-functions.h | 9 +++------ 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/runtime-common/stdlib/string/mbstring-functions.h b/runtime-common/stdlib/string/mbstring-functions.h index a15f75f5cf..3f1516d2f3 100644 --- a/runtime-common/stdlib/string/mbstring-functions.h +++ b/runtime-common/stdlib/string/mbstring-functions.h @@ -23,7 +23,7 @@ Optional f$mb_strpos(const string& haystack, const string& needle, int6 const string& encoding = StringLibConstants::get().CP1251_STR) noexcept; Optional mb_stripos_impl(const string& haystack, const string& needle, void (*assertf)(bool), int64_t offset = 0, - const string& encoding = StringLibConstants::get().CP1251_STR) noexcept; + const string& encoding = StringLibConstants::get().CP1251_STR) noexcept; string f$mb_substr(const string& str, int64_t start, const mixed& length = std::numeric_limits::max(), const string& encoding = StringLibConstants::get().CP1251_STR) noexcept; diff --git a/runtime-light/stdlib/string/mbstring-functions.h b/runtime-light/stdlib/string/mbstring-functions.h index 16f7466fb4..5cddc4d954 100644 --- a/runtime-light/stdlib/string/mbstring-functions.h +++ b/runtime-light/stdlib/string/mbstring-functions.h @@ -12,17 +12,14 @@ #include "runtime-light/stdlib/diagnostics/logs.h" inline string f$mb_strtolower(const string& str, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept { - return mb_strtolower_impl( - str, [](bool condition) noexcept { kphp::log::assertion(condition); }, encoding); + return mb_strtolower_impl(str, [](bool condition) noexcept { kphp::log::assertion(condition); }, encoding); } inline string f$mb_strtoupper(const string& str, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept { - return mb_strtoupper_impl( - str, [](bool condition) noexcept { kphp::log::assertion(condition); }, encoding); + return mb_strtoupper_impl(str, [](bool condition) noexcept { kphp::log::assertion(condition); }, encoding); } inline Optional f$mb_stripos(const string& haystack, const string& needle, int64_t offset = 0, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept { - return mb_stripos_impl( - haystack, needle, [](bool condition) noexcept { kphp::log::assertion(condition); }, offset, encoding); + return mb_stripos_impl(haystack, needle, [](bool condition) noexcept { kphp::log::assertion(condition); }, offset, encoding); } diff --git a/runtime/mbstring-functions.h b/runtime/mbstring-functions.h index 4123062fea..c61679b1cd 100644 --- a/runtime/mbstring-functions.h +++ b/runtime/mbstring-functions.h @@ -12,17 +12,14 @@ #include "runtime-common/stdlib/string/string-context.h" inline string f$mb_strtolower(const string& str, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept { - return mb_strtolower_impl( - str, [](bool condition) noexcept { assert(condition); }, encoding); + return mb_strtolower_impl(str, [](bool condition) noexcept { assert(condition); }, encoding); } inline string f$mb_strtoupper(const string& str, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept { - return mb_strtoupper_impl( - str, [](bool condition) noexcept { assert(condition); }, encoding); + return mb_strtoupper_impl(str, [](bool condition) noexcept { assert(condition); }, encoding); } inline Optional f$mb_stripos(const string& haystack, const string& needle, int64_t offset = 0, const string& encoding = StringLibConstants::get().CP1251_STR) noexcept { - return mb_stripos_impl( - haystack, needle, [](bool condition) noexcept { assert(condition); }, offset, encoding); + return mb_stripos_impl(haystack, needle, [](bool condition) noexcept { assert(condition); }, offset, encoding); }