From 3e21dc77c83628bb8114d1974999bad35719d41f Mon Sep 17 00:00:00 2001 From: ruslandoga Date: Mon, 18 May 2026 23:52:24 +0300 Subject: [PATCH] Add type parser property tests and escaping --- lib/ch/types.ex | 56 +++-- test/ch/types_test.exs | 486 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 526 insertions(+), 16 deletions(-) diff --git a/lib/ch/types.ex b/lib/ch/types.ex index fc28032..b3e945f 100644 --- a/lib/ch/types.ex +++ b/lib/ch/types.ex @@ -419,7 +419,7 @@ defmodule Ch.Types do end defp decode([:string | stack], <>, acc) do - decode_string(rest, 0, rest, stack, acc) + decode_string(rest, [], stack, acc) end defp decode([:int | stack], <>, acc) do @@ -514,22 +514,30 @@ defmodule Ch.Types do Enum.sort_by(types, fn t -> IO.iodata_to_binary(encode(t)) end) end - # TODO '', \' + defp decode_string(<>, parts, stack, acc) do + decode_string(rest, [?' | parts], stack, acc) + end - defp decode_string(<>, len, original, stack, acc) do - part = :binary.part(original, 0, len) - decode(stack, rest, [:binary.copy(part) | acc]) + defp decode_string(<>, parts, stack, acc) do + string = parts |> :lists.reverse() |> IO.iodata_to_binary() + decode(stack, rest, [string | acc]) + end + + defp decode_string(<>, parts, stack, acc) do + decode_string(rest, [unescape_string_char(escaped) | parts], stack, acc) end - defp decode_string(<>, len, original, stack, acc) do - decode_string(rest, len + utf8_size(u), original, stack, acc) + defp decode_string(<>, parts, stack, acc) do + decode_string(rest, [<> | parts], stack, acc) end - @compile inline: [utf8_size: 1] - defp utf8_size(codepoint) when codepoint <= 0x7F, do: 1 - defp utf8_size(codepoint) when codepoint <= 0x7FF, do: 2 - defp utf8_size(codepoint) when codepoint <= 0xFFFF, do: 3 - defp utf8_size(codepoint) when codepoint <= 0x10FFFF, do: 4 + defp unescape_string_char(?0), do: 0 + defp unescape_string_char(?b), do: ?\b + defp unescape_string_char(?f), do: ?\f + defp unescape_string_char(?n), do: ?\n + defp unescape_string_char(?r), do: ?\r + defp unescape_string_char(?t), do: ?\t + defp unescape_string_char(char), do: char defguardp is_alpha(a) when (a >= ?a and a <= ?z) or (a >= ?A and a <= ?Z) defguardp is_numeric(char) when char >= ?0 and char <= ?9 @@ -613,7 +621,7 @@ defmodule Ch.Types do end def encode({:datetime, timezone}) when is_binary(timezone) do - ["DateTime('", timezone, "')"] + ["DateTime('", encode_string(timezone), "')"] end def encode({:datetime64, precision}) do @@ -621,7 +629,7 @@ defmodule Ch.Types do end def encode({:datetime64, precision, timezone}) when is_binary(timezone) do - ["DateTime64(", Integer.to_string(precision), ", '", timezone, "')"] + ["DateTime64(", Integer.to_string(precision), ", '", encode_string(timezone), "')"] end def encode({:enum8, mapping}) do @@ -647,12 +655,28 @@ defmodule Ch.Types do defp encode_intersperse([] = empty, _separator), do: empty defp encode_mapping([{k, v}]) when is_binary(k) do - [k, "' = ", Integer.to_string(v)] + [encode_string(k), "' = ", Integer.to_string(v)] end defp encode_mapping([{k, v} | mapping]) when is_binary(k) do - [k, "' = ", Integer.to_string(v), ", '" | encode_mapping(mapping)] + [encode_string(k), "' = ", Integer.to_string(v), ", '" | encode_mapping(mapping)] end defp encode_mapping([] = empty), do: empty + + defp encode_string(string) do + for <> do + escape_string_char(char) + end + end + + defp escape_string_char(?\0), do: "\\0" + defp escape_string_char(?\b), do: "\\b" + defp escape_string_char(?\f), do: "\\f" + defp escape_string_char(?\n), do: "\\n" + defp escape_string_char(?\r), do: "\\r" + defp escape_string_char(?\t), do: "\\t" + defp escape_string_char(?'), do: "\\'" + defp escape_string_char(?\\), do: "\\\\" + defp escape_string_char(char), do: <> end diff --git a/test/ch/types_test.exs b/test/ch/types_test.exs index 9c8b236..da113bb 100644 --- a/test/ch/types_test.exs +++ b/test/ch/types_test.exs @@ -1,6 +1,8 @@ defmodule Ch.TypesTest do use ExUnit.Case, async: true + import Ch.Types, only: [decode: 1] + doctest Ch.Types, import: true describe "decode/1" do @@ -116,6 +118,11 @@ defmodule Ch.TypesTest do assert decode("DateTime('UTC')") == {:datetime, "UTC"} assert decode("DateTime('Asia/Tokyo')") == {:datetime, "Asia/Tokyo"} assert decode(" DateTime ( 'UTC' ) ") == {:datetime, "UTC"} + + assert decode("DateTime('has\\'quote\\\\slash\\t\\n\\r')") == + {:datetime, "has'quote\\slash\t\n\r"} + + assert decode("DateTime('has''quote')") == {:datetime, "has'quote"} end test "datetime64" do @@ -123,6 +130,11 @@ defmodule Ch.TypesTest do assert decode("DateTime64(3, 'UTC')") == {:datetime64, 3, "UTC"} assert decode("DateTime64(3, 'Asia/Tokyo')") == {:datetime64, 3, "Asia/Tokyo"} assert decode(" DateTime64 ( 3 , 'Asia/Taipei' ) ") == {:datetime64, 3, "Asia/Taipei"} + + assert decode("DateTime64(3, 'has\\'quote\\\\slash\\t\\n\\r')") == + {:datetime64, 3, "has'quote\\slash\t\n\r"} + + assert decode("DateTime64(3, 'has''quote')") == {:datetime64, 3, "has'quote"} end test "time64" do @@ -152,6 +164,12 @@ defmodule Ch.TypesTest do assert decode("Enum8('hello' = 1, 'world' = 2)") == {:enum8, [{"hello", 1}, {"world", 2}]} assert decode("Enum8('é€𐍈' = 1)") == {:enum8, [{"é€𐍈", 1}]} + assert decode("Enum8('has\\'quote' = 1, 'has''quote' = 2, 'has\\\\slash' = 3)") == + {:enum8, [{"has'quote", 1}, {"has'quote", 2}, {"has\\slash", 3}]} + + assert decode("Enum8('tabs\\tnewlines\\nreturns\\r' = 1)") == + {:enum8, [{"tabs\tnewlines\nreturns\r", 1}]} + assert decode("Enum16('hello' = -1, 'world' = 2)") == {:enum16, [{"hello", -1}, {"world", 2}]} @@ -199,3 +217,471 @@ defmodule Ch.TypesTest do end end end + +defmodule Ch.TypesPropertyTest do + use ExUnit.Case, async: true + use ExUnitProperties + + import Ch.Types, only: [decode: 1, encode: 1] + + @scalar_types [ + :string, + :boolean, + :i8, + :i16, + :i32, + :i64, + :i128, + :i256, + :u8, + :u16, + :u32, + :u64, + :u128, + :u256, + :f32, + :f64, + :date, + :datetime, + :date32, + :time, + :uuid, + :ipv4, + :ipv6, + :point, + :ring, + :polygon, + :multipolygon, + :nothing, + :json, + :dynamic + ] + + describe "encode/decode properties" do + property "quoted string arguments round-trip through escaping" do + check all string <- quoted_string_gen(), + max_runs: 300 do + assert string + |> then(&{:datetime, &1}) + |> encode() + |> IO.iodata_to_binary() + |> decode() == {:datetime, string} + + assert string + |> then(&{:datetime64, 3, &1}) + |> encode() + |> IO.iodata_to_binary() + |> decode() == {:datetime64, 3, string} + + assert [{"enum", 1}, {string, 2}] + |> then(&{:enum8, &1}) + |> encode() + |> IO.iodata_to_binary() + |> decode() == {:enum8, [{"enum", 1}, {string, 2}]} + end + end + + property "generated types round-trip through encode/decode" do + check all type <- type_gen(), + max_runs: 500 do + assert type |> encode() |> IO.iodata_to_binary() |> decode() == normalize_type(type) + end + end + + property "generated types decode with arbitrary whitespace" do + check all type <- type_gen(), + whitespace <- whitespace_gen(), + max_runs: 500 do + assert type |> render_type(whitespace) |> decode() == normalize_decoded_type(type) + end + end + + property "decode accepts scalar types with arbitrary surrounding whitespace" do + check all type <- member_of(@scalar_types), + left <- whitespace_gen(), + right <- whitespace_gen() do + encoded = IO.iodata_to_binary([left, encode(type), right]) + + assert decode(encoded) == type + end + end + + property "decode rejects non-empty junk after a valid type" do + check all type <- type_gen() |> filter(&(&1 != :json)), + junk <- junk_suffix_gen() do + encoded = type |> encode() |> IO.iodata_to_binary() + + assert_raise ArgumentError, ~r/unexpected character/, fn -> + decode(encoded <> junk) + end + end + end + + property "decode rejects truncated parameterized types" do + check all type <- parameterized_type_gen() do + encoded = type |> encode() |> IO.iodata_to_binary() + size = byte_size(encoded) + truncated = binary_part(encoded, 0, size - 1) + + assert_raise ArgumentError, fn -> + decode(truncated) + end + end + end + end + + defp type_gen do + tree(member_of(@scalar_types), ¶meterized_type_gen/1) + end + + defp parameterized_type_gen do + type_gen() + |> bind(fn + atom when atom in @scalar_types -> + parameterized_type_gen(constant(atom)) + + type -> + constant(type) + end) + end + + defp parameterized_type_gen(type_gen) do + gen all kind <- + member_of([ + :array, + :tuple, + :variant, + :map, + :nullable, + :low_cardinality, + :fixed_string, + :datetime, + :datetime64, + :time64, + :decimal, + :decimal_sized, + :enum, + :simple_aggregate_function + ]) do + kind + end + |> bind(fn + :array -> + map(type_gen, &{:array, &1}) + + :tuple -> + map(list_of(type_gen, max_length: 6), &{:tuple, &1}) + + :variant -> + map(list_of(type_gen, min_length: 1, max_length: 6), &{:variant, &1}) + + :map -> + map({type_gen, type_gen}, fn {key_type, value_type} -> {:map, key_type, value_type} end) + + :nullable -> + map(type_gen, &{:nullable, &1}) + + :low_cardinality -> + map(type_gen, &{:low_cardinality, &1}) + + :fixed_string -> + map(integer(1..1024), &{:fixed_string, &1}) + + :datetime -> + map(timezone_gen(), &{:datetime, &1}) + + :datetime64 -> + one_of([ + map(integer(0..9), &{:datetime64, &1}), + map({integer(0..9), timezone_gen()}, fn {precision, timezone} -> + {:datetime64, precision, timezone} + end) + ]) + + :time64 -> + map(integer(0..9), &{:time64, &1}) + + :decimal -> + gen all precision <- integer(1..76), + scale <- integer(0..precision) do + {:decimal, precision, scale} + end + + :decimal_sized -> + gen all type <- member_of([:decimal32, :decimal64, :decimal128, :decimal256]), + scale <- integer(0..9) do + {type, scale} + end + + :enum -> + gen all type <- member_of([:enum8, :enum16]), + mapping <- enum_mapping_gen(type) do + {type, mapping} + end + + :simple_aggregate_function -> + gen all function <- identifier_gen(), + type <- type_gen do + {:simple_aggregate_function, function, type} + end + end) + end + + defp enum_mapping_gen(:enum8) do + uniq_list_of({enum_name_gen(), integer(-128..127)}, min_length: 1, max_length: 8) + end + + defp enum_mapping_gen(:enum16) do + uniq_list_of({enum_name_gen(), integer(-32768..32767)}, min_length: 1, max_length: 8) + end + + defp timezone_gen do + one_of([ + member_of(["UTC", "Europe/Vienna", "Asia/Tokyo", "America/New_York", "Etc/GMT-3"]), + quoted_string_gen() + ]) + end + + defp identifier_gen do + gen all first <- string([?a..?z, ?A..?Z], length: 1), + rest <- string([?a..?z, ?A..?Z, ?0..?9, ?_], max_length: 12) do + first <> rest + end + end + + defp enum_name_gen do + quoted_string_gen() + end + + defp quoted_string_gen do + string( + [ + ?a..?z, + ?A..?Z, + ?0..?9, + ?_, + ?\s, + ?\t, + ?\n, + ?\r, + ?-, + ?., + ?/, + ?', + ?\\, + ?é, + ?€, + ?𐍈 + ], + min_length: 1, + max_length: 20 + ) + end + + defp whitespace_gen do + string([?\s, ?\t, ?\n, ?\r], max_length: 6) + end + + defp junk_suffix_gen do + string([?$], min_length: 1, max_length: 4) + end + + defp normalize_type({:variant, types}) do + {:variant, + Enum.sort_by(Enum.map(types, &normalize_type/1), fn t -> IO.iodata_to_binary(encode(t)) end)} + end + + defp normalize_type({:decimal32, scale}), do: {:decimal, 9, scale} + defp normalize_type({:decimal64, scale}), do: {:decimal, 18, scale} + defp normalize_type({:decimal128, scale}), do: {:decimal, 38, scale} + defp normalize_type({:decimal256, scale}), do: {:decimal, 76, scale} + defp normalize_type({:array, type}), do: {:array, normalize_type(type)} + defp normalize_type({:tuple, types}), do: {:tuple, Enum.map(types, &normalize_type/1)} + + defp normalize_type({:map, key_type, value_type}), + do: {:map, normalize_type(key_type), normalize_type(value_type)} + + defp normalize_type({:nullable, type}), do: {:nullable, normalize_type(type)} + defp normalize_type({:low_cardinality, type}), do: {:low_cardinality, normalize_type(type)} + + defp normalize_type({:simple_aggregate_function, function, type}) do + {:simple_aggregate_function, function, normalize_type(type)} + end + + defp normalize_type(type), do: type + + defp normalize_decoded_type({:variant, types}) do + {:variant, + Enum.sort_by(Enum.map(types, &normalize_decoded_type/1), fn t -> + IO.iodata_to_binary(encode(t)) + end)} + end + + defp normalize_decoded_type({:array, type}), do: {:array, normalize_decoded_type(type)} + + defp normalize_decoded_type({:tuple, types}), + do: {:tuple, Enum.map(types, &normalize_decoded_type/1)} + + defp normalize_decoded_type({:map, key_type, value_type}), + do: {:map, normalize_decoded_type(key_type), normalize_decoded_type(value_type)} + + defp normalize_decoded_type({:nullable, type}), do: {:nullable, normalize_decoded_type(type)} + + defp normalize_decoded_type({:low_cardinality, type}), + do: {:low_cardinality, normalize_decoded_type(type)} + + defp normalize_decoded_type({:simple_aggregate_function, function, type}) do + {:simple_aggregate_function, function, normalize_decoded_type(type)} + end + + defp normalize_decoded_type(type), do: type + + defp render_type(type, whitespace) do + [whitespace, render_type!(type, whitespace), whitespace] |> IO.iodata_to_binary() + end + + defp render_type!(type, _whitespace) when type in @scalar_types, do: encode(type) + + defp render_type!({:fixed_string, n}, whitespace), + do: ["FixedString", whitespace, ?(, whitespace, Integer.to_string(n), whitespace, ?)] + + defp render_type!({:time64, p}, whitespace), + do: ["Time64", whitespace, ?(, whitespace, Integer.to_string(p), whitespace, ?)] + + defp render_type!({:decimal, p, s}, whitespace), + do: [ + "Decimal", + whitespace, + ?(, + whitespace, + Integer.to_string(p), + whitespace, + ?,, + whitespace, + Integer.to_string(s), + whitespace, + ?) + ] + + defp render_type!({:decimal32, s}, whitespace), + do: render_decimal_sized("Decimal32", s, whitespace) + + defp render_type!({:decimal64, s}, whitespace), + do: render_decimal_sized("Decimal64", s, whitespace) + + defp render_type!({:decimal128, s}, whitespace), + do: render_decimal_sized("Decimal128", s, whitespace) + + defp render_type!({:decimal256, s}, whitespace), + do: render_decimal_sized("Decimal256", s, whitespace) + + defp render_type!({:datetime, timezone}, whitespace), + do: ["DateTime", whitespace, ?(, whitespace, ?', render_string(timezone), ?', whitespace, ?)] + + defp render_type!({:datetime64, p}, whitespace), + do: ["DateTime64", whitespace, ?(, whitespace, Integer.to_string(p), whitespace, ?)] + + defp render_type!({:datetime64, p, timezone}, whitespace) do + [ + "DateTime64", + whitespace, + ?(, + whitespace, + Integer.to_string(p), + whitespace, + ?,, + whitespace, + ?', + render_string(timezone), + ?', + whitespace, + ?) + ] + end + + defp render_type!({:array, type}, whitespace), do: render_unary("Array", type, whitespace) + defp render_type!({:nullable, type}, whitespace), do: render_unary("Nullable", type, whitespace) + + defp render_type!({:low_cardinality, type}, whitespace), + do: render_unary("LowCardinality", type, whitespace) + + defp render_type!({:tuple, types}, whitespace) do + ["Tuple", whitespace, ?(, whitespace, render_types(types, whitespace), whitespace, ?)] + end + + defp render_type!({:variant, types}, whitespace) do + ["Variant", whitespace, ?(, whitespace, render_types(types, whitespace), whitespace, ?)] + end + + defp render_type!({:map, key_type, value_type}, whitespace) do + [ + "Map", + whitespace, + ?(, + whitespace, + render_type!(key_type, whitespace), + whitespace, + ?,, + whitespace, + render_type!(value_type, whitespace), + whitespace, + ?) + ] + end + + defp render_type!({:enum8, mapping}, whitespace), do: render_enum("Enum8", mapping, whitespace) + + defp render_type!({:enum16, mapping}, whitespace), + do: render_enum("Enum16", mapping, whitespace) + + defp render_type!({:simple_aggregate_function, function, type}, whitespace) do + [ + "SimpleAggregateFunction", + whitespace, + ?(, + whitespace, + function, + whitespace, + ?,, + whitespace, + render_type!(type, whitespace), + whitespace, + ?) + ] + end + + defp render_decimal_sized(name, scale, whitespace) do + [name, whitespace, ?(, whitespace, Integer.to_string(scale), whitespace, ?)] + end + + defp render_unary(name, type, whitespace) do + [name, whitespace, ?(, whitespace, render_type!(type, whitespace), whitespace, ?)] + end + + defp render_types(types, whitespace) do + types + |> Enum.map(&render_type!(&1, whitespace)) + |> Enum.intersperse([whitespace, ?,, whitespace]) + end + + defp render_enum(name, mapping, whitespace) do + [name, whitespace, ?(, whitespace, render_mapping(mapping, whitespace), whitespace, ?)] + end + + defp render_mapping(mapping, whitespace) do + mapping + |> Enum.map(fn {key, value} -> + [?', render_string(key), ?', whitespace, ?=, whitespace, Integer.to_string(value)] + end) + |> Enum.intersperse([whitespace, ?,, whitespace]) + end + + defp render_string(string) do + string + |> String.replace("\\", "\\\\") + |> String.replace("'", "\\'") + |> String.replace("\t", "\\t") + |> String.replace("\n", "\\n") + |> String.replace("\r", "\\r") + end +end