diff --git a/.gitmodules b/.gitmodules index d85acea564..74d6d07e45 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,9 @@ [submodule "cpp/third-party/googletest"] path = cpp/third-party/googletest url = git@github.com:google/googletest.git +[submodule "cpp/FlameGraph"] + path = cpp/FlameGraph + url = git@github.com:brendangregg/FlameGraph.git +[submodule "cpp/third-party/flatbuffers"] + path = cpp/third-party/flatbuffers + url = https://github.com/google/flatbuffers.git diff --git a/cpp/.gitignore b/cpp/.gitignore index 6021de86a8..eedd6d3627 100644 --- a/cpp/.gitignore +++ b/cpp/.gitignore @@ -5,3 +5,8 @@ plot **/cmake-build-debug **/CMakeCache.txt **/CMakeFiles +# remove perf svg +cpp/testcase/perf-*/*.svg/*.svg +*.csv +*.txt +*.svg diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e2a9f72867..f83e373eea 100755 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,22 +1,20 @@ cmake_minimum_required(VERSION 3.19) -set(CMAKE_CXX_STANDARD 14) -# Set extension name here + +set(CMAKE_CXX_STANDARD 20) set(TARGET_NAME pixels) set(DCMAKE_EXPORT_COMPILE_COMMANDS=1) set(EXTENSION_NAME ${TARGET_NAME}_extension) project(${TARGET_NAME}) + +add_definitions(-DDUCKDB_EXTENSION_LIBRARY) + include_directories(include) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) set(EXTENSION_SOURCES pixels-duckdb/pixels_extension.cpp pixels-duckdb/PixelsScanFunction.cpp ) -add_library(${EXTENSION_NAME} STATIC ${EXTENSION_SOURCES}) - -find_package(Protobuf REQUIRED) -include_directories(${Protobuf_INCLUDE_DIRS}) - -include_directories(${CMAKE_CURRENT_BINARY_DIR}) add_subdirectory(pixels-common) add_subdirectory(pixels-core) @@ -29,18 +27,18 @@ include_directories(pixels-core/include) include_directories(${CMAKE_CURRENT_BINARY_DIR}) include_directories(${CMAKE_CURRENT_BINARY_DIR}/pixels-common/liburing/src/include) -target_link_libraries( - ${EXTENSION_NAME} +build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES}) +set(PARAMETERS "-warnings") 
+build_loadable_extension(${TARGET_NAME} ${PARAMETERS} ${EXTENSION_SOURCES}) + +target_link_libraries(${EXTENSION_NAME} pixels-common pixels-core ) # Add the subdirectory that contains the build_loadable_extension definition - -set(PARAMETERS "-warnings") -build_loadable_extension(${TARGET_NAME} ${PARAMETERS} ${EXTENSION_SOURCES}) - message("duckdb export set: ${DUCKDB_EXPORT_SET}" ) +message("TARGET NAME: ${TARGET_NAME} EXTENSION NAME: ${EXTENSION_NAME}") install( TARGETS ${EXTENSION_NAME} pixels-core pixels-common diff --git a/cpp/FlameGraph b/cpp/FlameGraph new file mode 160000 index 0000000000..41fee1f99f --- /dev/null +++ b/cpp/FlameGraph @@ -0,0 +1 @@ +Subproject commit 41fee1f99f9276008b7cd112fca19dc3ea84ac32 diff --git a/cpp/Makefile b/cpp/Makefile index 418f175a2f..12a39470cb 100644 --- a/cpp/Makefile +++ b/cpp/Makefile @@ -1,4 +1,4 @@ -.PHONY: all clean debug release pull update deps +.PHONY: all clean debug release pull update fb-release fb-debug all: release @@ -14,21 +14,22 @@ ifeq (${STATIC_LIBCPP}, 1) endif ifeq ($(GEN),ninja) - GENERATOR=-G "Ninja" - FORCE_COLOR=-DFORCE_COLORED_OUTPUT=1 + GENERATOR=-G "Ninja" + FORCE_COLOR=-DFORCE_COLORED_OUTPUT=1 endif -PROTOBUF_DIR=third-party/protobuf +# remove protobuf, use flatbuffer instead BUILD_FLAGS=-DEXTENSION_STATIC_BUILD=1 -DBUILD_TPCH_EXTENSION=1 -DBUILD_BENCHMARKS=1 -DBUILD_PARQUET_EXTENSION=1 \ -${OSX_BUILD_UNIVERSAL_FLAG} ${STATIC_LIBCPP} CLIENT_FLAGS := +PIXELS_BASE_DIR := $(shell dirname $(shell pwd)) -# These flags will make DuckDB build the extension - -EXTENSION_FLAGS=-DDUCKDB_EXTENSION_NAMES="pixels" -DDUCKDB_EXTENSION_PIXELS_PATH="$(PROJ_DIR)" \ --DDUCKDB_EXTENSION_PIXELS_SHOULD_LINK="TRUE" -DDUCKDB_EXTENSION_PIXELS_INCLUDE_PATH="$(PROJ_DIR)include" \ --DCMAKE_PREFIX_PATH=$(PROJ_DIR)third-party/protobuf/cmake/build -DPIXELS_SRC="$(dirname $(pwd))" +FB_FLAGS=-DUSE_FLATBUFFERS=ON \ +-DDUCKDB_EXTENSION_NAMES="pixels" \ +-DDUCKDB_EXTENSION_PIXELS_PATH="$(PROJ_DIR)" \ 
+-DDUCKDB_EXTENSION_PIXELS_SHOULD_LINK="TRUE" \ +-DDUCKDB_EXTENSION_PIXELS_INCLUDE_PATH="$(PROJ_DIR)include" \ +-DPIXELS_SRC="$(PIXELS_BASE_DIR)" pull: git submodule init @@ -37,24 +38,23 @@ pull: update: git submodule update --remote --merge pixels-duckdb/duckdb git -C third-party/googletest checkout v1.15.2 - git -C third-party/protobuf checkout v3.21.6 - -deps: - mkdir -p "${PROTOBUF_DIR}/cmake/build" && cd "third-party/protobuf/cmake/build" && \ - cmake -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=Release ../.. -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -Dprotobuf_BUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=./ && \ - make -j install clean: - rm -rf build/release - rm -rf build/debug + rm -rf build/fb-release + rm -rf build/fb-debug cd pixels-duckdb/duckdb && make clean -# Main build -debug: deps - cmake $(GENERATOR) $(FORCE_COLOR) $(EXTENSION_FLAGS) ${CLIENT_FLAGS} -DEXTENSION_STATIC_BUILD=1 -DCMAKE_BUILD_TYPE=Debug ${BUILD_FLAGS} -S pixels-duckdb/duckdb -B build/debug && \ - cmake --build build/debug --config Debug +debug: fb-debug +release: fb-release + +fb-release: + cmake $(GENERATOR) $(FORCE_COLOR) $(FB_FLAGS) ${CLIENT_FLAGS} \ + -DEXTENSION_STATIC_BUILD=1 -DCMAKE_BUILD_TYPE=Release ${BUILD_FLAGS} \ + -S pixels-duckdb/duckdb -B build/release && \ + cmake --build build/release --config Release -release: deps - cmake $(GENERATOR) $(FORCE_COLOR) $(EXTENSION_FLAGS) ${CLIENT_FLAGS} -DEXTENSION_STATIC_BUILD=1 -DCMAKE_BUILD_TYPE=Release ${BUILD_FLAGS} -S pixels-duckdb/duckdb -B build/release && \ - cmake --build build/release --config Release \ No newline at end of file +fb-debug: + cmake $(GENERATOR) $(FORCE_COLOR) $(FB_FLAGS) ${CLIENT_FLAGS} \ + -DEXTENSION_STATIC_BUILD=1 -DCMAKE_BUILD_TYPE=Debug ${BUILD_FLAGS} \ + -S pixels-duckdb/duckdb -B build/debug && \ + cmake --build build/debug --config Debug \ No newline at end of file diff --git a/cpp/include/PixelsScanFunction.hpp b/cpp/include/PixelsScanFunction.hpp index 297500a068..fb29fc6a1e 100644 --- 
a/cpp/include/PixelsScanFunction.hpp +++ b/cpp/include/PixelsScanFunction.hpp @@ -113,7 +113,7 @@ namespace duckdb PixelsScanInitLocal(ExecutionContext &context, TableFunctionInitInput &input, GlobalTableFunctionState *gstate_p); - static bool PixelsParallelStateNext(ClientContext &context, const PixelsReadBindData &bind_data, + static bool PixelsParallelStateNext(ClientContext &context, PixelsReadBindData &bind_data, PixelsReadLocalState &scan_data, PixelsReadGlobalState ¶llel_state, bool is_init_state = false); diff --git a/cpp/pixels-cli/CMakeLists.txt b/cpp/pixels-cli/CMakeLists.txt index ef71129255..d3376c527e 100644 --- a/cpp/pixels-cli/CMakeLists.txt +++ b/cpp/pixels-cli/CMakeLists.txt @@ -1,6 +1,6 @@ project(pixels-cli) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) include(ExternalProject) include(ProcessorCount) diff --git a/cpp/pixels-cli/include/executor/LoadExecutor.h b/cpp/pixels-cli/include/executor/LoadExecutor.h index 942b6a7c9d..02d0f093b8 100644 --- a/cpp/pixels-cli/include/executor/LoadExecutor.h +++ b/cpp/pixels-cli/include/executor/LoadExecutor.h @@ -36,6 +36,6 @@ class LoadExecutor : public CommandExecutor private: bool startConsumers(const std::vector &inputFiles, Parameters parameters, - const std::vector &loadedFiles); + const std::vector &loadedFiles, int concurrency); }; #endif //PIXELS_LOADEXECUTOR_H diff --git a/cpp/pixels-cli/include/load/PixelsConsumer.h b/cpp/pixels-cli/include/load/PixelsConsumer.h index 805b5ddf8d..de2f30e367 100644 --- a/cpp/pixels-cli/include/load/PixelsConsumer.h +++ b/cpp/pixels-cli/include/load/PixelsConsumer.h @@ -27,6 +27,7 @@ #include #include +#include #include class PixelsConsumer @@ -39,6 +40,7 @@ class PixelsConsumer private: static int GlobalTargetPathId; + static std::mutex globalMutex; // Mutex to protect GlobalTargetPathId std::vector queue; Parameters parameters; std::vector loadedFiles; diff --git a/cpp/pixels-cli/lib/executor/LoadExecutor.cpp 
b/cpp/pixels-cli/lib/executor/LoadExecutor.cpp index 5d7e354741..0a99c7c5a0 100644 --- a/cpp/pixels-cli/lib/executor/LoadExecutor.cpp +++ b/cpp/pixels-cli/lib/executor/LoadExecutor.cpp @@ -24,21 +24,35 @@ */ #include #include +#include +#include #include #include #include #include +#include #include void LoadExecutor::execute(const bpo::variables_map &ns, const std::string &command) { std::string schema = ns["schema"].as(); + if (std::filesystem::exists(schema) && std::filesystem::is_regular_file(schema)) + { + std::ifstream ifs(schema); + if (ifs.is_open()) + { + std::stringstream buffer; + buffer << ifs.rdbuf(); + schema = buffer.str(); + } + } std::string origin = ns["origin"].as(); std::string target = ns["target"].as(); int rowNum = ns["row_num"].as(); std::string regex = ns["row_regex"].as(); EncodingLevel encodingLevel = EncodingLevel::from(ns["encoding_level"].as()); bool nullPadding = ns["nulls_padding"].as(); + int concurrency = ns["concurrency"].as(); if (origin.back() != '/') { @@ -55,7 +69,7 @@ void LoadExecutor::execute(const bpo::variables_map &ns, const std::string &comm } auto startTime = std::chrono::system_clock::now(); - if (startConsumers(inputFiles, parameters, loadedFiles)) + if (startConsumers(inputFiles, parameters, loadedFiles, concurrency)) { std::cout << command << " is successful" << std::endl; } @@ -65,14 +79,47 @@ void LoadExecutor::execute(const bpo::variables_map &ns, const std::string &comm } auto endTime = std::chrono::system_clock::now(); std::chrono::duration elapsedSeconds = endTime - startTime; - std::cout << "Text file in " << origin << " are loaded by 1 thread in " + std::cout << "Text file in " << origin << " are loaded by " << concurrency << " thread(s) in " << elapsedSeconds.count() << " seconds." 
<< std::endl; } bool LoadExecutor::startConsumers(const std::vector &inputFiles, Parameters parameters, - const std::vector &loadedFiles) + const std::vector &loadedFiles, int concurrency) { - PixelsConsumer consumer(inputFiles, parameters, loadedFiles); - consumer.run(); + if (concurrency <= 1 || inputFiles.size() <= 1) + { + // Single-threaded mode + PixelsConsumer consumer(inputFiles, parameters, loadedFiles); + consumer.run(); + } + else + { + // Multi-threaded mode: each thread processes one file + std::vector threads; + int numThreads = std::min(concurrency, static_cast(inputFiles.size())); + std::vector> inputfilesQueue(numThreads); + int currentThread=0; + for (int i = 0; i < inputFiles.size(); ++i) + { + inputfilesQueue[(currentThread++)%numThreads].push_back(inputFiles[i]); + } + // Each thread gets one file queue to process + for (int i=0;i #include #include +#include int PixelsConsumer::GlobalTargetPathId = 0; +std::mutex PixelsConsumer::globalMutex; PixelsConsumer::PixelsConsumer(const std::vector &queue, const Parameters ¶meters, const std::vector &loadedFiles) @@ -106,8 +108,13 @@ void PixelsConsumer::run() if (initPixelsFile) { LocalFS targetStorage; + int fileId; + { + std::lock_guard lock(globalMutex); + fileId = GlobalTargetPathId++; + } targetFileName = std::to_string(std::chrono::system_clock::to_time_t(std::chrono::system_clock::now())) + \ - "_" + std::to_string(this->loadedFiles.size()) + ".pxl"; + "_" + std::to_string(fileId) + ".pxl"; targetFilePath = targetPath + targetFileName; pixelsWriter = std::make_shared(schema, pixelsStride, rowGroupSize, targetFilePath, blockSize, @@ -137,10 +144,7 @@ void PixelsConsumer::run() if (rowBatch->rowCount == rowBatch->getMaxSize()) { - std::cout << "writing row group to file: " << targetFilePath << " rowCount:" << rowBatch->rowCount - << std::endl; pixelsWriter->addRowBatch(rowBatch); - rowBatch->reset(); } @@ -173,4 +177,4 @@ void PixelsConsumer::run() this->loadedFiles.push_back(targetFilePath); } 
std::cout << "Exit PixelsConsumer" << std::endl; -} \ No newline at end of file +} diff --git a/cpp/pixels-cli/main.cpp b/cpp/pixels-cli/main.cpp index 0173ea1d24..7f1cdcc041 100644 --- a/cpp/pixels-cli/main.cpp +++ b/cpp/pixels-cli/main.cpp @@ -110,7 +110,9 @@ int main() ("encoding_level,e", bpo::value()->default_value(2), "specify the encoding level for data loading") ("nulls_padding,p", bpo::value()->default_value(false), - "specify whether nulls padding is enabled"); + "specify whether nulls padding is enabled") + ("concurrency,c", bpo::value()->default_value(1), + "specify the number of threads for data loading"); bpo::variables_map vm; try @@ -127,10 +129,8 @@ int main() { std::cerr << "Error parsing options: " << e.what() << "\n"; } - // try { - LoadExecutor *loadExecutor = new LoadExecutor(); + std::unique_ptr loadExecutor = std::make_unique(); loadExecutor->execute(vm, command); - // } catch } else if (command == "QUERY") { @@ -160,6 +160,7 @@ int main() { std::cout << "Command " << command << " not found" << std::endl; } + for (char* p : argv) free(p); } // end of while loop return 0; -} \ No newline at end of file +} diff --git a/cpp/pixels-common/CMakeLists.txt b/cpp/pixels-common/CMakeLists.txt index 1ed1f4f03d..3f680a57b7 100644 --- a/cpp/pixels-common/CMakeLists.txt +++ b/cpp/pixels-common/CMakeLists.txt @@ -5,63 +5,79 @@ set(CMAKE_CXX_STANDARD 17) include(ExternalProject) set(pixels_common_cxx - lib/physical/storage/LocalFS.cpp + lib/physical/storage/LocalFS.cpp lib/physical/storage/LocalFSProvider.cpp lib/physical/storage/PhysicalLocalWriter.cpp lib/physical/PhysicalWriterOption.cpp lib/physical/Status.cpp - lib/physical/Storage.cpp + lib/physical/Storage.cpp lib/physical/FilePath.cpp - lib/physical/natives/PixelsRandomAccessFile.cpp - lib/physical/natives/DirectRandomAccessFile.cpp - lib/physical/natives/ByteBuffer.cpp - lib/physical/io/PhysicalLocalReader.cpp - lib/physical/StorageFactory.cpp - lib/physical/Request.cpp - 
lib/physical/RequestBatch.cpp - lib/physical/scheduler/NoopScheduler.cpp - lib/physical/SchedulerFactory.cpp - lib/exception/InvalidArgumentException.cpp - lib/utils/Constants.cpp - lib/utils/String.cpp - include/physical/natives/DirectIoLib.h - lib/physical/natives/DirectIoLib.cpp - include/utils/ConfigFactory.h - lib/utils/ConfigFactory.cpp - include/physical/MergedRequest.h - include/physical/scheduler/SortMergeScheduler.h - lib/physical/scheduler/SortMergeScheduler.cpp - lib/MergedRequest.cpp include/profiler/TimeProfiler.h - lib/profiler/TimeProfiler.cpp - include/profiler/CountProfiler.h - lib/profiler/CountProfiler.cpp - include/profiler/AbstractProfiler.h - include/physical/allocator/Allocator.h - include/physical/allocator/OrdinaryAllocator.h - lib/physical/allocator/OrdinaryAllocator.cpp - include/physical/allocator/BufferPoolAllocator.h - lib/physical/allocator/BufferPoolAllocator.cpp - include/physical/BufferPool.h - lib/physical/BufferPool.cpp - include/physical/natives/DirectUringRandomAccessFile.h - lib/physical/natives/DirectUringRandomAccessFile.cpp - include/utils/ColumnSizeCSVReader.h lib/utils/ColumnSizeCSVReader.cpp - include/physical/StorageArrayScheduler.h lib/physical/StorageArrayScheduler.cpp + lib/physical/natives/PixelsRandomAccessFile.cpp + lib/physical/natives/DirectRandomAccessFile.cpp + lib/physical/natives/ByteBuffer.cpp + lib/physical/io/PhysicalLocalReader.cpp + lib/physical/StorageFactory.cpp + lib/physical/Request.cpp + lib/physical/RequestBatch.cpp + lib/physical/scheduler/NoopScheduler.cpp + lib/physical/SchedulerFactory.cpp + lib/exception/InvalidArgumentException.cpp + lib/utils/Constants.cpp + lib/utils/String.cpp + include/physical/natives/DirectIoLib.h + lib/physical/natives/DirectIoLib.cpp + include/utils/ConfigFactory.h + lib/utils/ConfigFactory.cpp + include/physical/MergedRequest.h + include/physical/scheduler/SortMergeScheduler.h + lib/physical/scheduler/SortMergeScheduler.cpp + lib/MergedRequest.cpp + 
include/profiler/TimeProfiler.h + lib/profiler/TimeProfiler.cpp + include/profiler/CountProfiler.h + lib/profiler/CountProfiler.cpp + include/profiler/AbstractProfiler.h + include/physical/allocator/Allocator.h + include/physical/allocator/OrdinaryAllocator.h + lib/physical/allocator/OrdinaryAllocator.cpp + include/physical/allocator/BufferPoolAllocator.h + lib/physical/allocator/BufferPoolAllocator.cpp + include/physical/BufferPool.h + lib/physical/BufferPool.cpp + include/physical/natives/DirectUringRandomAccessFile.h + lib/physical/natives/DirectUringRandomAccessFile.cpp + include/utils/ColumnSizeCSVReader.h + lib/utils/ColumnSizeCSVReader.cpp + include/physical/StorageArrayScheduler.h + lib/physical/StorageArrayScheduler.cpp include/physical/natives/ByteOrder.h ) -include_directories(include) +set(FBS_SRC "${CMAKE_CURRENT_SOURCE_DIR}/../../proto/pixels.fbs") +set(GEN_HEADER_DIR "${CMAKE_CURRENT_BINARY_DIR}/flatbuffers_out") +set(FBS_HEADER "${GEN_HEADER_DIR}/pixels_generated.h") -if(NOT DEFINED ENV{PIXELS_SRC}) - message(FATAL_ERROR "You must set PIXELS_SRC environment variable. 
The value should be set to the Pixels base directory.") -endif() +file(MAKE_DIRECTORY ${GEN_HEADER_DIR}) -protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS $ENV{PIXELS_SRC}/proto/pixels.proto) +add_custom_command( + OUTPUT "${FBS_HEADER}" + COMMAND flatc --cpp --gen-object-api -o "${GEN_HEADER_DIR}" "${FBS_SRC}" + DEPENDS "${FBS_SRC}" + COMMENT "Running FlatBuffers compiler (flatc) with --gen-object-api on ${FBS_SRC}" + VERBATIM +) + +add_custom_target(pixels_fb_gen DEPENDS "${FBS_HEADER}") -add_library(pixels-common ${pixels_common_cxx} ${PROTO_SRCS} ${PROTO_HDRS}) +add_library(pixels-common ${pixels_common_cxx} "${FBS_HEADER}") +add_dependencies(pixels-common pixels_fb_gen) -# liburing -set(LIBURING_GIT_REPOSITORY git@github.com:axboe/liburing.git) +target_include_directories(pixels-common PUBLIC + $ + $ +) + +set(LIBURING_GIT_REPOSITORY https://github.com/axboe/liburing.git) set(LIBURING_GIT_TAG liburing-2.2) set(LIBURING_BUILD_COMMAND make -j) @@ -74,12 +90,17 @@ ExternalProject_Add(liburing INSTALL_COMMAND "" BUILD_COMMAND ${LIBURING_BUILD_COMMAND} BUILD_IN_SOURCE true - ) +) add_dependencies(pixels-common liburing) -include_directories(${CMAKE_CURRENT_BINARY_DIR}/liburing/src/include) -link_directories(${CMAKE_CURRENT_BINARY_DIR}/liburing/src) -message(${CMAKE_CURRENT_BINARY_DIR}/liburing/src) + +set(LIBURING_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/liburing/src/include) +set(LIBURING_LIB ${CMAKE_CURRENT_BINARY_DIR}/liburing/src/liburing.a) + +target_include_directories(pixels-common PRIVATE ${LIBURING_INCLUDE_DIR}) + target_link_libraries(pixels-common - ${Protobuf_LIBRARIES} - ${CMAKE_CURRENT_BINARY_DIR}/liburing/src/liburing.a) + PRIVATE + ${LIBURING_LIB} + pthread # liburing usually depends on pthread +) diff --git a/cpp/pixels-common/include/physical/BufferPool.h b/cpp/pixels-common/include/physical/BufferPool.h index ec810e29a9..63476a4652 100644 --- a/cpp/pixels-common/include/physical/BufferPool.h +++ b/cpp/pixels-common/include/physical/BufferPool.h @@ 
-33,6 +33,7 @@ #include "exception/InvalidArgumentException.h" #include "utils/ColumnSizeCSVReader.h" #include +#include // when allocating buffer pool, we use the size of the first pxl file. Consider that // the remaining pxl file has larger size than the first file, we allocate some extra @@ -65,7 +66,7 @@ class BufferPool static thread_local bool isInitialized; static thread_local std::map> buffers[2]; - static std::shared_ptr directIoLib; + static thread_local std::shared_ptr directIoLib; static thread_local int currBufferIdx; static thread_local int nextBufferIdx; friend class DirectUringRandomAccessFile; diff --git a/cpp/pixels-common/include/physical/natives/ByteBuffer.h b/cpp/pixels-common/include/physical/natives/ByteBuffer.h index ab64114641..20bab30168 100644 --- a/cpp/pixels-common/include/physical/natives/ByteBuffer.h +++ b/cpp/pixels-common/include/physical/natives/ByteBuffer.h @@ -159,6 +159,10 @@ class ByteBuffer void printPosition(); + // for auto-resize + + void ensureCapacity(uint32_t minCapacity); + protected: uint32_t wpos; mutable uint32_t rpos; @@ -197,10 +201,11 @@ class ByteBuffer { uint32_t s = sizeof(data); - if (size() < (wpos + s)) - { - throw std::runtime_error("Append exceeds the size of buffer"); - } + // if (size() < (wpos + s)) + // { + // throw std::runtime_error("Append exceeds the size of buffer"); + // } + ensureCapacity(wpos + s); memcpy(&buf[wpos], (uint8_t * ) & data, s); //printf("writing %c to %i\n", (uint8_t)data, wpos); @@ -210,14 +215,17 @@ class ByteBuffer template void insert(T data, uint32_t index) { - if ((index + sizeof(data)) > size()) - { - throw std::runtime_error("Insert exceeds the size of buffer"); - } + // if ((index + sizeof(data)) > size()) + // { + // throw std::runtime_error("Insert exceeds the size of buffer"); + // } + uint32_t s = sizeof(data); + // 确保 index 处的写入不会越界 + ensureCapacity(index + s); memcpy(&buf[index], (uint8_t * ) & data, sizeof(data)); - wpos = index + sizeof(data); + wpos = index + s; 
} }; diff --git a/cpp/pixels-common/lib/physical/BufferPool.cpp b/cpp/pixels-common/lib/physical/BufferPool.cpp index 4dcc7bdeab..d899e6ce1b 100644 --- a/cpp/pixels-common/lib/physical/BufferPool.cpp +++ b/cpp/pixels-common/lib/physical/BufferPool.cpp @@ -34,7 +34,7 @@ BufferPool::buffers[2]; // since we call switch function first. thread_local int BufferPool::currBufferIdx = 1; thread_local int BufferPool::nextBufferIdx = 0; -std::shared_ptr BufferPool::directIoLib; +thread_local std::shared_ptr BufferPool::directIoLib; void BufferPool::Initialize(std::vector colIds, std::vector bytes, std::vector columnNames) diff --git a/cpp/pixels-common/lib/physical/natives/ByteBuffer.cpp b/cpp/pixels-common/lib/physical/natives/ByteBuffer.cpp index bb929e2d4f..0feecf04d5 100644 --- a/cpp/pixels-common/lib/physical/natives/ByteBuffer.cpp +++ b/cpp/pixels-common/lib/physical/natives/ByteBuffer.cpp @@ -402,6 +402,30 @@ void ByteBuffer::printPosition() << wpos << std::endl; } +/** + * ensure enough space + * @param minCapacity + */ +void ByteBuffer::ensureCapacity(uint32_t minCapacity) { + if (minCapacity > bufSize) { + uint32_t newSize = bufSize * 2; + if (newSize < minCapacity) { + newSize = minCapacity; + } + + uint8_t* newBuf = new uint8_t[newSize]; + + if (buf != nullptr) { + memcpy(newBuf, buf, bufSize); + delete[] buf; + } + + buf = newBuf; + bufSize = newSize; + } +} + + ByteBuffer::~ByteBuffer() { if (!fromOtherBB) diff --git a/cpp/pixels-core/include/PixelsFooterCache.h b/cpp/pixels-core/include/PixelsFooterCache.h index e6b8766132..b5cdef5d7d 100644 --- a/cpp/pixels-core/include/PixelsFooterCache.h +++ b/cpp/pixels-core/include/PixelsFooterCache.h @@ -27,29 +27,28 @@ #include #include -#include "pixels-common/pixels.pb.h" +#include "pixels_generated.h" #include -using namespace pixels::proto; -typedef std::unordered_map > FileTailTable; -typedef std::unordered_map > RGFooterTable; +typedef std::unordered_map FileTailTable; +typedef std::unordered_map RGFooterTable; 
class PixelsFooterCache { public: PixelsFooterCache(); - void putFileTail(const std::string &id, std::shared_ptr fileTail); + void putFileTail(const std::string &id, const pixels::fb::FileTail* fileTail); bool containsFileTail(const std::string &id); - std::shared_ptr getFileTail(const std::string &id); + const pixels::fb::FileTail* getFileTail(const std::string &id); - void putRGFooter(const std::string &id, std::shared_ptr footer); + void putRGFooter(const std::string &id, const pixels::fb::RowGroupFooter* footer); bool containsRGFooter(const std::string &id); - std::shared_ptr getRGFooter(const std::string &id); + const pixels::fb::RowGroupFooter* getRGFooter(const std::string &id); private: FileTailTable fileTailCacheMap; diff --git a/cpp/pixels-core/include/PixelsReader.h b/cpp/pixels-core/include/PixelsReader.h index 2007dd5c23..29808284b2 100644 --- a/cpp/pixels-core/include/PixelsReader.h +++ b/cpp/pixels-core/include/PixelsReader.h @@ -38,14 +38,15 @@ #include "reader/PixelsRecordReader.h" #include "reader/PixelsReaderOption.h" #include "PixelsVersion.h" +#include "pixels_generated.h" -typedef ::google::protobuf::RepeatedPtrField<::pixels::proto::ColumnStatistic> +typedef flatbuffers::Vector> ColumnStatisticList; -typedef ::google::protobuf::RepeatedPtrField<::pixels::proto::RowGroupInformation> +typedef flatbuffers::Vector> RowGroupInfoList; -typedef ::google::protobuf::RepeatedPtrField<::pixels::proto::RowGroupStatistic> +typedef flatbuffers::Vector> RowGroupStatList; class PixelsReader @@ -64,7 +65,7 @@ class PixelsReader virtual long getNumberOfRows() = 0; - virtual pixels::proto::CompressionKind getCompressionKind() = 0; + virtual pixels::fb::CompressionKind getCompressionKind() = 0; virtual long getCompressionBlockSize() = 0; @@ -76,17 +77,17 @@ class PixelsReader virtual bool isPartitioned() = 0; - virtual ColumnStatisticList getColumnStats() = 0; + virtual const ColumnStatisticList* getColumnStats() = 0; - virtual pixels::proto::ColumnStatistic 
getColumnStat(std::string columnName) = 0; + virtual const pixels::fb::ColumnStatistic* getColumnStat(std::string columnName) = 0; - virtual RowGroupInfoList getRowGroupInfos() = 0; + virtual const RowGroupInfoList* getRowGroupInfos() = 0; - virtual pixels::proto::RowGroupInformation getRowGroupInfo(int rowGroupId) = 0; + virtual const pixels::fb::RowGroupInformation* getRowGroupInfo(int rowGroupId) = 0; - virtual pixels::proto::RowGroupStatistic getRowGroupStat(int rowGroupId) = 0; + virtual const pixels::fb::RowGroupStatistic* getRowGroupStat(int rowGroupId) = 0; - virtual RowGroupStatList getRowGroupStats() = 0; + virtual const RowGroupStatList* getRowGroupStats() = 0; virtual void close() = 0; diff --git a/cpp/pixels-core/include/PixelsReaderBuilder.h b/cpp/pixels-core/include/PixelsReaderBuilder.h index 496931a8c7..09f718c3d5 100644 --- a/cpp/pixels-core/include/PixelsReaderBuilder.h +++ b/cpp/pixels-core/include/PixelsReaderBuilder.h @@ -27,7 +27,8 @@ #include "PixelsReaderImpl.h" #include "physical/PhysicalReaderUtil.h" -#include "pixels-common/pixels.pb.h" +#include "pixels_generated.h" +#include "pixels_generated.h" #include "PixelsVersion.h" #include "PixelsFooterCache.h" #include "exception/PixelsReaderException.h" diff --git a/cpp/pixels-core/include/PixelsReaderImpl.h b/cpp/pixels-core/include/PixelsReaderImpl.h index 13e58e1d49..c35d703c66 100644 --- a/cpp/pixels-core/include/PixelsReaderImpl.h +++ b/cpp/pixels-core/include/PixelsReaderImpl.h @@ -29,7 +29,7 @@ #include "reader/PixelsRecordReaderImpl.h" #include #include -#include "pixels-common/pixels.pb.h" +#include "pixels_generated.h" #include "PixelsFooterCache.h" #include "reader/PixelsReaderOption.h" @@ -43,7 +43,7 @@ class PixelsReaderImpl : public PixelsReader PixelsReaderImpl(std::shared_ptr fileSchema, std::shared_ptr reader, - std::shared_ptr fileTail, + const pixels::fb::FileTail* fileTail, std::shared_ptr footerCache); ~PixelsReaderImpl(); @@ -54,7 +54,7 @@ class PixelsReaderImpl : public 
PixelsReader long getNumberOfRows() override; - pixels::proto::CompressionKind getCompressionKind() override; + pixels::fb::CompressionKind getCompressionKind() override; long getCompressionBlockSize() override; @@ -66,17 +66,17 @@ class PixelsReaderImpl : public PixelsReader bool isPartitioned() override; - ColumnStatisticList getColumnStats() override; + const ColumnStatisticList* getColumnStats() override; - pixels::proto::ColumnStatistic getColumnStat(std::string columnName) override; + const pixels::fb::ColumnStatistic* getColumnStat(std::string columnName) override; - RowGroupInfoList getRowGroupInfos() override; + const RowGroupInfoList* getRowGroupInfos() override; - pixels::proto::RowGroupInformation getRowGroupInfo(int rowGroupId) override; + const pixels::fb::RowGroupInformation* getRowGroupInfo(int rowGroupId) override; - pixels::proto::RowGroupStatistic getRowGroupStat(int rowGroupId) override; + const pixels::fb::RowGroupStatistic* getRowGroupStat(int rowGroupId) override; - RowGroupStatList getRowGroupStats() override; + const RowGroupStatList* getRowGroupStats() override; void close() override; @@ -85,8 +85,8 @@ class PixelsReaderImpl : public PixelsReader std::shared_ptr fileSchema; std::shared_ptr physicalReader; std::shared_ptr pixelsFooterCache; - pixels::proto::PostScript postScript; - pixels::proto::Footer footer; + const pixels::fb::Footer* footer; + const pixels::fb::PostScript* postScript; bool closed; }; diff --git a/cpp/pixels-core/include/PixelsWriterImpl.h b/cpp/pixels-core/include/PixelsWriterImpl.h index 6dd3ae3d4c..d8edcdd7e9 100644 --- a/cpp/pixels-core/include/PixelsWriterImpl.h +++ b/cpp/pixels-core/include/PixelsWriterImpl.h @@ -31,12 +31,21 @@ #include "writer/ColumnWriter.h" #include "utils/ConfigFactory.h" #include "stats/StatsRecorder.h" -#include "pixels-common/pixels.pb.h" +#include "pixels_generated.h" #include "vector/VectorizedRowBatch.h" #include #include #include +// temp metadata store in memory +struct RowGroupNative 
{ + uint64_t footerOffset; + uint32_t dataLength; + uint32_t footerLength; + uint32_t numberOfRows; +}; + + class PixelsWriterImpl : public PixelsWriter { public: @@ -49,6 +58,14 @@ class PixelsWriterImpl : public PixelsWriter void writeColumnVectors(std::vector > &columnVectors, int rowBatchSize); void writeRowGroup(); + // Split into four functions + int prepareRowGroup(); + + void writeRowGroupData(uint32_t totalLength); + + int writeRowGroupFooter(); + + void recordRowGroupMetadata(int rowGroupDataLength); void writeFileTail(); @@ -66,25 +83,30 @@ class PixelsWriterImpl : public PixelsWriter std::shared_ptr schema; int rowGroupSize; - pixels::proto::CompressionKind compressionKind; int compressionBlockSize; // std::unique_ptr timeZone; std::shared_ptr columnWriterOption; std::vector > columnWriters; std::vector fileColStatRecorders; - std::int64_t fileContentLength; - int fileRowNum; + std::int64_t fileContentLength = 0 ; + int fileRowNum = 0; std::int64_t writtenBytes = 0; std::int64_t curRowGroupOffset = 0; - std::int64_t curRowGroupFooterOffset = 0; std::int64_t curRowGroupNumOfRows = 0; int curRowGroupDataLength = 0; bool haseValueIsSet = false; int currHashValue = 0; bool partitioned; - std::vector rowGroupInfoList; - std::vector rowGroupStatisticList; std::shared_ptr physicalWriter; std::vector > children; + + // flatbuffers + // global fFlatBuffe + flatbuffers::FlatBufferBuilder fbb; + pixels::fb::CompressionKind compressionKind; + std::vector> rowGroupInfoList; + std::vector> rowGroupStatisticList; + + std::vector rowGroupMetadataList; }; #endif //PIXELS_PIXELSWRITERIMPL_H diff --git a/cpp/pixels-core/include/TypeDescription.h b/cpp/pixels-core/include/TypeDescription.h index f1edb1753d..231719a76d 100644 --- a/cpp/pixels-core/include/TypeDescription.h +++ b/cpp/pixels-core/include/TypeDescription.h @@ -36,7 +36,9 @@ #include #include #include -#include +#include + +#include "pixels_generated.h" #include #include "vector/LongColumnVector.h" #include 
"vector/ByteColumnVector.h" @@ -129,7 +131,7 @@ class TypeDescription : public std::enable_shared_from_this static std::shared_ptr createStruct(); static std::shared_ptr - createSchema(const std::vector > &types); + createSchema(std::span types); std::shared_ptr addField(const std::string &field, const std::shared_ptr &fieldType); @@ -200,8 +202,7 @@ class TypeDescription : public std::enable_shared_from_this static int MAX_TIMESTAMP_PRECISION; static int MAX_TIME_PRECISION; - - void writeTypes(std::shared_ptr footer); + std::vector> writeTypes(flatbuffers::FlatBufferBuilder& fbb); private: diff --git a/cpp/pixels-core/include/reader/ColumnReader.h b/cpp/pixels-core/include/reader/ColumnReader.h index 072c02d1fc..0a2a6bf27b 100644 --- a/cpp/pixels-core/include/reader/ColumnReader.h +++ b/cpp/pixels-core/include/reader/ColumnReader.h @@ -27,7 +27,7 @@ #include "TypeDescription.h" #include "physical/natives/ByteBuffer.h" -#include "pixels-common/pixels.pb.h" +#include "pixels_generated.h" #include "math.h" #include "duckdb.h" #include "duckdb/common/types/vector.hpp" @@ -61,10 +61,10 @@ class ColumnReader * @param chunkIndex the metadata of the column chunk to read. 
*/ virtual void read(std::shared_ptr input, - pixels::proto::ColumnEncoding &encoding, + const pixels::fb::ColumnEncoding* encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, + const pixels::fb::ColumnChunkIndex* chunkIndex, std::shared_ptr filterMask); void setValid(const std::shared_ptr &input, int pixelStride, diff --git a/cpp/pixels-core/include/reader/DateColumnReader.h b/cpp/pixels-core/include/reader/DateColumnReader.h index dd300959d2..d1233194d8 100644 --- a/cpp/pixels-core/include/reader/DateColumnReader.h +++ b/cpp/pixels-core/include/reader/DateColumnReader.h @@ -36,10 +36,10 @@ class DateColumnReader : public ColumnReader void close() override; void read(std::shared_ptr input, - pixels::proto::ColumnEncoding &encoding, + const pixels::fb::ColumnEncoding* encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, + const pixels::fb::ColumnChunkIndex* chunkIndex, std::shared_ptr filterMask) override; private: diff --git a/cpp/pixels-core/include/reader/DecimalColumnReader.h b/cpp/pixels-core/include/reader/DecimalColumnReader.h index 15f989aa4d..3b3159622b 100644 --- a/cpp/pixels-core/include/reader/DecimalColumnReader.h +++ b/cpp/pixels-core/include/reader/DecimalColumnReader.h @@ -35,10 +35,10 @@ class DecimalColumnReader : public ColumnReader void close() override; void read(std::shared_ptr input, - pixels::proto::ColumnEncoding &encoding, + const pixels::fb::ColumnEncoding* encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, + const pixels::fb::ColumnChunkIndex* chunkIndex, std::shared_ptr filterMask) override; private: diff --git a/cpp/pixels-core/include/reader/IntColumnReader.h b/cpp/pixels-core/include/reader/IntColumnReader.h index 38c8a069b6..628d7bddc5 100644 --- 
a/cpp/pixels-core/include/reader/IntColumnReader.h +++ b/cpp/pixels-core/include/reader/IntColumnReader.h @@ -37,10 +37,10 @@ class IntColumnReader : public ColumnReader void close() override; void read(std::shared_ptr input, - pixels::proto::ColumnEncoding &encoding, + const pixels::fb::ColumnEncoding* encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, + const pixels::fb::ColumnChunkIndex* chunkIndex, std::shared_ptr filterMask) override; private: diff --git a/cpp/pixels-core/include/reader/LongColumnReader.h b/cpp/pixels-core/include/reader/LongColumnReader.h index a9809a9204..c202812155 100644 --- a/cpp/pixels-core/include/reader/LongColumnReader.h +++ b/cpp/pixels-core/include/reader/LongColumnReader.h @@ -36,10 +36,10 @@ class LongColumnReader : public ColumnReader void close() override; void read(std::shared_ptr input, - pixels::proto::ColumnEncoding &encoding, + const pixels::fb::ColumnEncoding* encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, + const pixels::fb::ColumnChunkIndex* chunkIndex, std::shared_ptr filterMask) override; private: diff --git a/cpp/pixels-core/include/reader/PixelsRecordReaderImpl.h b/cpp/pixels-core/include/reader/PixelsRecordReaderImpl.h index 190663f574..934bc3ce83 100644 --- a/cpp/pixels-core/include/reader/PixelsRecordReaderImpl.h +++ b/cpp/pixels-core/include/reader/PixelsRecordReaderImpl.h @@ -30,7 +30,7 @@ #include "vector/VectorizedRowBatch.h" #include "physical/Scheduler.h" #include "physical/SchedulerFactory.h" -#include "pixels-common/pixels.pb.h" +#include "pixels_generated.h" #include "PixelsFooterCache.h" #include "reader/PixelsReaderOption.h" #include "utils/String.h" @@ -65,8 +65,8 @@ class PixelsRecordReaderImpl : public PixelsRecordReader { public: explicit PixelsRecordReaderImpl(std::shared_ptr reader, - const pixels::proto::PostScript 
&pixelsPostScript, - const pixels::proto::Footer &pixelsFooter, + const pixels::fb::PostScript* pixelsPostScript, + const pixels::fb::Footer* pixelsFooter, const PixelsReaderOption &opt, std::shared_ptr pixelsFooterCache ); @@ -99,8 +99,8 @@ class PixelsRecordReaderImpl : public PixelsRecordReader void UpdateRowGroupInfo(); std::shared_ptr physicalReader; - pixels::proto::Footer footer; - pixels::proto::PostScript postScript; + const pixels::fb::Footer* footer; + const pixels::fb::PostScript* postScript; std::shared_ptr footerCache; PixelsReaderOption option; duckdb::TableFilterSet *filter; @@ -119,10 +119,10 @@ class PixelsRecordReaderImpl : public PixelsRecordReader int curRGRowCount; bool enabledFilterPushDown; std::shared_ptr filterMask; - std::shared_ptr curRGFooter; - std::vector > curEncoding; + const pixels::fb::RowGroupFooter* curRGFooter; + std::vector curEncoding; std::vector curChunkBufferIndex; - std::vector > curChunkIndex; + std::vector curChunkIndex; /** * Columns included by reader option; if included, set true */ @@ -141,10 +141,10 @@ class PixelsRecordReaderImpl : public PixelsRecordReader std::vector resultColumns; std::vector resultColumnsEncoded; bool enableEncodedVector; - std::vector > rowGroupFooters; + std::vector rowGroupFooters; int includedColumnNum; // the number of columns to read - std::vector > includedColumnTypes; + std::vector includedColumnTypes; std::shared_ptr fileSchema; std::shared_ptr resultSchema; diff --git a/cpp/pixels-core/include/reader/StringColumnReader.h b/cpp/pixels-core/include/reader/StringColumnReader.h index 500248cff1..b4a316720a 100644 --- a/cpp/pixels-core/include/reader/StringColumnReader.h +++ b/cpp/pixels-core/include/reader/StringColumnReader.h @@ -27,6 +27,7 @@ #include "reader/ColumnReader.h" #include "encoding/RunLenIntDecoder.h" +#include "pixels_generated.h" class StringColumnReader : public ColumnReader { @@ -38,10 +39,10 @@ class StringColumnReader : public ColumnReader void close() override; void 
read(std::shared_ptr input, - pixels::proto::ColumnEncoding &encoding, + const pixels::fb::ColumnEncoding* encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, + const pixels::fb::ColumnChunkIndex* chunkIndex, std::shared_ptr filterMask) override; private: @@ -74,6 +75,6 @@ class StringColumnReader : public ColumnReader * In this method, we have reduced most of significant memory copies. */ void readContent(std::shared_ptr input, - uint32_t inputLength, pixels::proto::ColumnEncoding &encoding); + uint32_t inputLength, const pixels::fb::ColumnEncoding* encoding); }; #endif //PIXELS_STRINGCOLUMNREADER_H diff --git a/cpp/pixels-core/include/reader/TimestampColumnReader.h b/cpp/pixels-core/include/reader/TimestampColumnReader.h index d0745a3e61..596a78ac70 100644 --- a/cpp/pixels-core/include/reader/TimestampColumnReader.h +++ b/cpp/pixels-core/include/reader/TimestampColumnReader.h @@ -36,10 +36,10 @@ class TimestampColumnReader : public ColumnReader void close() override; void read(std::shared_ptr input, - pixels::proto::ColumnEncoding &encoding, + const pixels::fb::ColumnEncoding* encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, + const pixels::fb::ColumnChunkIndex* chunkIndex, std::shared_ptr filterMask) override; private: diff --git a/cpp/pixels-core/include/stats/StatsRecorder.h b/cpp/pixels-core/include/stats/StatsRecorder.h index 1f703553fb..18e3750266 100644 --- a/cpp/pixels-core/include/stats/StatsRecorder.h +++ b/cpp/pixels-core/include/stats/StatsRecorder.h @@ -27,7 +27,8 @@ #define PIXELS_STATSRECODER_H #include "TypeDescription.h" -#include "pixels-common/pixels.pb.h" +#include "pixels_generated.h" +#include "pixels_generated.h" class StatsRecorder { @@ -38,7 +39,7 @@ class StatsRecorder public: StatsRecorder(); - explicit StatsRecorder(const pixels::proto::ColumnStatistic &statistic); 
+ explicit StatsRecorder(const pixels::fb::ColumnStatistic* statistic); virtual ~StatsRecorder(); @@ -80,14 +81,14 @@ class StatsRecorder bool hasNullValue() const; - virtual pixels::proto::ColumnStatistic serialize() const; + virtual flatbuffers::Offset serialize(flatbuffers::FlatBufferBuilder& builder) const; static std::unique_ptr create(TypeDescription type); static std::unique_ptr - create(TypeDescription type, const pixels::proto::ColumnStatistic &statistic); + create(TypeDescription type, const pixels::fb::ColumnStatistic* statistic); static std::unique_ptr - create(TypeDescription::Category category, const pixels::proto::ColumnStatistic &statistic); + create(TypeDescription::Category category, const pixels::fb::ColumnStatistic* statistic); }; #endif // PIXELS_STATSRECODER_H diff --git a/cpp/pixels-core/include/writer/ColumnWriter.h b/cpp/pixels-core/include/writer/ColumnWriter.h index 5ef608c382..a39ab72181 100644 --- a/cpp/pixels-core/include/writer/ColumnWriter.h +++ b/cpp/pixels-core/include/writer/ColumnWriter.h @@ -27,8 +27,7 @@ #include "TypeDescription.h" #include "physical/natives/ByteBuffer.h" -#include "pixels-common/pixels.pb.h" -#include +#include "pixels_generated.h" #include #include "duckdb.h" #include "duckdb/common/types/vector.hpp" @@ -54,11 +53,9 @@ class ColumnWriter virtual bool decideNullsPadding(std::shared_ptr writerOption) = 0; - virtual pixels::proto::ColumnChunkIndex getColumnChunkIndex(); - - virtual std::shared_ptr getColumnChunkIndexPtr(); + virtual const flatbuffers::Offset getColumnChunkIndex(); - virtual pixels::proto::ColumnEncoding getColumnChunkEncoding() const; + virtual const flatbuffers::Offset getColumnChunkEncoding(flatbuffers::FlatBufferBuilder& fbb) const; virtual void reset(); @@ -69,12 +66,24 @@ class ColumnWriter // virtual virtual void newPixel(); + virtual flatbuffers::Offset buildColumnChunkIndex(flatbuffers::FlatBufferBuilder& fbb, uint64_t chunkOffset, uint32_t chunkLength,bool littleEndian); + private: 
static const int ISNULL_ALIGNMENT; static const std::vector ISNULL_PADDING_BUFFER; - std::shared_ptr columnChunkIndex{}; - std::shared_ptr columnChunkStat{}; + // Structure to hold pixel statistics data for delayed serialization + struct PixelStatSnapshot { + std::unique_ptr colStatObj; + }; + + // Accumulated data for building ColumnChunkIndex + std::vector pixelPositions; + std::vector pixelStatSnapshots; // Changed from pixelStatistics + + // Built flatbuffers objects + flatbuffers::Offset columnChunkIndex; + const pixels::fb::ColumnStatistic* columnChunkStat; int lastPixelPosition = 0; int curPixelPosition = 0; @@ -94,5 +103,6 @@ class ColumnWriter int curPixelVectorIndex = 0; const ByteOrder byteOrder; std::vector isNull{}; + int isNullOffset =0; }; #endif //PIXELS_COLUMNWRITER_H diff --git a/cpp/pixels-core/include/writer/IntColumnWriter.h b/cpp/pixels-core/include/writer/IntColumnWriter.h index 84fcc9e383..95e5c574df 100644 --- a/cpp/pixels-core/include/writer/IntColumnWriter.h +++ b/cpp/pixels-core/include/writer/IntColumnWriter.h @@ -40,7 +40,7 @@ class IntColumnWriter : public ColumnWriter bool decideNullsPadding(std::shared_ptr writerOption) override; - pixels::proto::ColumnEncoding getColumnChunkEncoding() const override; + const flatbuffers::Offset getColumnChunkEncoding(flatbuffers::FlatBufferBuilder& fbb) const override; private: bool runlengthEncoding; diff --git a/cpp/pixels-core/include/writer/LongColumnWriter.h b/cpp/pixels-core/include/writer/LongColumnWriter.h index 039f6f7eca..889995a95e 100644 --- a/cpp/pixels-core/include/writer/LongColumnWriter.h +++ b/cpp/pixels-core/include/writer/LongColumnWriter.h @@ -41,7 +41,7 @@ class LongColumnWriter : public ColumnWriter bool decideNullsPadding(std::shared_ptr writerOption) override; - pixels::proto::ColumnEncoding getColumnChunkEncoding() const override; + const flatbuffers::Offset getColumnChunkEncoding(flatbuffers::FlatBufferBuilder& fbb) const override; private: bool runlengthEncoding; diff 
--git a/cpp/pixels-core/lib/PixelsFilter.cpp b/cpp/pixels-core/lib/PixelsFilter.cpp index 90248a3937..643715eedc 100644 --- a/cpp/pixels-core/lib/PixelsFilter.cpp +++ b/cpp/pixels-core/lib/PixelsFilter.cpp @@ -298,6 +298,9 @@ void PixelsFilter::ApplyFilter(std::shared_ptr vector, duckdb::Ta case duckdb::TableFilterType::IS_NULL: // TODO: support is null break; + case duckdb::TableFilterType::OPTIONAL_FILTER: + // nothing to do + return; default: D_ASSERT(0); break; diff --git a/cpp/pixels-core/lib/PixelsFooterCache.cpp b/cpp/pixels-core/lib/PixelsFooterCache.cpp index 7b227f79c7..837f838159 100644 --- a/cpp/pixels-core/lib/PixelsFooterCache.cpp +++ b/cpp/pixels-core/lib/PixelsFooterCache.cpp @@ -29,12 +29,12 @@ PixelsFooterCache::PixelsFooterCache() { } -void PixelsFooterCache::putFileTail(const std::string &id, std::shared_ptr fileTail) +void PixelsFooterCache::putFileTail(const std::string &id, const pixels::fb::FileTail* fileTail) { fileTailCacheMap[id] = fileTail; } -std::shared_ptr PixelsFooterCache::getFileTail(const std::string &id) +const pixels::fb::FileTail* PixelsFooterCache::getFileTail(const std::string &id) { if (fileTailCacheMap.find(id) != fileTailCacheMap.end()) { @@ -46,7 +46,7 @@ std::shared_ptr PixelsFooterCache::getFileTail(const std::string &id) } } -void PixelsFooterCache::putRGFooter(const std::string &id, std::shared_ptr footer) +void PixelsFooterCache::putRGFooter(const std::string &id, const pixels::fb::RowGroupFooter* footer) { rowGroupFooterCacheMap[id] = footer; } @@ -56,7 +56,7 @@ bool PixelsFooterCache::containsFileTail(const std::string &id) return fileTailCacheMap.find(id) != fileTailCacheMap.end(); } -std::shared_ptr PixelsFooterCache::getRGFooter(const std::string &id) +const pixels::fb::RowGroupFooter* PixelsFooterCache::getRGFooter(const std::string &id) { if (rowGroupFooterCacheMap.find(id) != rowGroupFooterCacheMap.end()) { @@ -72,5 +72,3 @@ bool PixelsFooterCache::containsRGFooter(const std::string &id) { return 
rowGroupFooterCacheMap.find(id) != rowGroupFooterCacheMap.end(); } - - diff --git a/cpp/pixels-core/lib/PixelsReaderBuilder.cpp b/cpp/pixels-core/lib/PixelsReaderBuilder.cpp index 387448ee32..a58201a226 100644 --- a/cpp/pixels-core/lib/PixelsReaderBuilder.cpp +++ b/cpp/pixels-core/lib/PixelsReaderBuilder.cpp @@ -60,7 +60,7 @@ std::shared_ptr PixelsReaderBuilder::build() PhysicalReaderUtil::newPhysicalReader (builderStorage, builderPath); // try to get file tail from cache std::string fileName = fsReader->getName (); - std::shared_ptr fileTail; + const pixels::fb::FileTail* fileTail; if (builderPixelsFooterCache != nullptr && builderPixelsFooterCache->containsFileTail (fileName)) { fileTail = builderPixelsFooterCache->getFileTail (fileName); @@ -85,11 +85,10 @@ std::shared_ptr PixelsReaderBuilder::build() int fileTailLength = (int) (fileLen - fileTailOffset - sizeof (long)); fsReader->seek (fileTailOffset); std::shared_ptr fileTailBuffer = fsReader->readFully (fileTailLength); - fileTail = std::make_shared (); - if (!fileTail->ParseFromArray (fileTailBuffer->getPointer (), - fileTailLength)) + fileTail = pixels::fb::GetFileTail(fileTailBuffer->getPointer()); + if (fileTail == nullptr) { - throw InvalidArgumentException ("PixelsReaderBuilder::build: paring FileTail error!"); + throw InvalidArgumentException ("PixelsReaderBuilder::build: parsing FileTail error!"); } if (builderPixelsFooterCache != nullptr) { @@ -98,9 +97,9 @@ std::shared_ptr PixelsReaderBuilder::build() } // check file MAGIC and file version - pixels::proto::PostScript postScript = fileTail->postscript (); - uint32_t fileVersion = postScript.version (); - const std::string &fileMagic = postScript.magic (); + const pixels::fb::PostScript* postScript = fileTail->postscript(); + uint32_t fileVersion = postScript->version(); + const std::string fileMagic = postScript->magic()->str(); if (PixelsVersion::currentVersion () != fileVersion) { throw PixelsFileVersionInvalidException (fileVersion); @@ -110,10 
+109,10 @@ std::shared_ptr PixelsReaderBuilder::build() throw PixelsFileMagicInvalidException (fileMagic); } - auto fileColTypes = std::vector>{}; - for (const auto &type: fileTail->footer ().types ()) + auto fileColTypes = std::vector{}; + for (int i = 0; i < fileTail->footer()->types()->size(); i++) { - fileColTypes.emplace_back (std::make_shared (type)); + fileColTypes.emplace_back(fileTail->footer()->types()->Get(i)); } builderSchema = TypeDescription::createSchema (fileColTypes); @@ -122,5 +121,3 @@ std::shared_ptr PixelsReaderBuilder::build() return std::make_shared (builderSchema, fsReader, fileTail, builderPixelsFooterCache); } - - diff --git a/cpp/pixels-core/lib/PixelsReaderImpl.cpp b/cpp/pixels-core/lib/PixelsReaderImpl.cpp index ff01c3fb89..09074e092d 100644 --- a/cpp/pixels-core/lib/PixelsReaderImpl.cpp +++ b/cpp/pixels-core/lib/PixelsReaderImpl.cpp @@ -26,7 +26,7 @@ PixelsReaderImpl::PixelsReaderImpl(std::shared_ptr fileSchema, std::shared_ptr reader, - std::shared_ptr fileTail, + const pixels::fb::FileTail* fileTail, std::shared_ptr footerCache) { this->fileSchema = fileSchema; @@ -62,50 +62,50 @@ std::shared_ptr PixelsReaderImpl::getFileSchema() PixelsVersion::Version PixelsReaderImpl::getFileVersion() { - return PixelsVersion::from(postScript.version()); + return PixelsVersion::from(postScript->version()); } long PixelsReaderImpl::getNumberOfRows() { - return postScript.numberofrows(); + return postScript->numberOfRows(); } -pixels::proto::CompressionKind PixelsReaderImpl::getCompressionKind() +pixels::fb::CompressionKind PixelsReaderImpl::getCompressionKind() { - return postScript.compression(); + return postScript->compression(); } long PixelsReaderImpl::getCompressionBlockSize() { - return postScript.compressionblocksize(); + return postScript->compressionBlockSize(); } long PixelsReaderImpl::getPixelStride() { - return postScript.pixelstride(); + return postScript->pixelStride(); } std::string PixelsReaderImpl::getWriterTimeZone() { - return 
postScript.writertimezone(); + return postScript->writerTimezone()->str(); } int PixelsReaderImpl::getRowGroupNum() { - return footer.rowgroupinfos_size(); + return footer->rowGroupInfos()->size(); } bool PixelsReaderImpl::isPartitioned() { - return postScript.has_partitioned() && postScript.partitioned(); + return postScript->partitioned(); } -ColumnStatisticList PixelsReaderImpl::getColumnStats() +const ColumnStatisticList* PixelsReaderImpl::getColumnStats() { - return footer.columnstats(); + return footer->columnStats(); } -pixels::proto::ColumnStatistic PixelsReaderImpl::getColumnStat(std::string columnName) +const pixels::fb::ColumnStatistic* PixelsReaderImpl::getColumnStat(std::string columnName) { auto fieldNames = fileSchema->getFieldNames(); auto fieldIter = std::find(fieldNames.begin(), fieldNames.end(), columnName); @@ -115,35 +115,35 @@ pixels::proto::ColumnStatistic PixelsReaderImpl::getColumnStat(std::string colum columnName + " is not the field name!"); } int fieldId = fieldIter - fieldNames.begin(); - return footer.columnstats().Get(fieldId); + return footer->columnStats()->Get(fieldId); } -RowGroupInfoList PixelsReaderImpl::getRowGroupInfos() +const RowGroupInfoList* PixelsReaderImpl::getRowGroupInfos() { - return footer.rowgroupinfos(); + return footer->rowGroupInfos(); } -pixels::proto::RowGroupInformation PixelsReaderImpl::getRowGroupInfo(int rowGroupId) +const pixels::fb::RowGroupInformation* PixelsReaderImpl::getRowGroupInfo(int rowGroupId) { - if (rowGroupId < 0 || rowGroupId >= footer.columnstats_size()) + if (rowGroupId < 0 || rowGroupId >= footer->columnStats()->size()) { throw InvalidArgumentException("row group id is out of bound."); } - return footer.rowgroupinfos().Get(rowGroupId); + return footer->rowGroupInfos()->Get(rowGroupId); } -pixels::proto::RowGroupStatistic PixelsReaderImpl::getRowGroupStat(int rowGroupId) +const pixels::fb::RowGroupStatistic* PixelsReaderImpl::getRowGroupStat(int rowGroupId) { - if (rowGroupId < 0 || 
rowGroupId >= footer.columnstats_size()) + if (rowGroupId < 0 || rowGroupId >= footer->columnStats()->size()) { throw InvalidArgumentException("row group id is out of bound."); } - return footer.rowgroupstats().Get(rowGroupId); + return footer->rowGroupStats()->Get(rowGroupId); } -RowGroupStatList PixelsReaderImpl::getRowGroupStats() +const RowGroupStatList* PixelsReaderImpl::getRowGroupStats() { - return footer.rowgroupstats(); + return footer->rowGroupStats(); } PixelsReaderImpl::~PixelsReaderImpl() diff --git a/cpp/pixels-core/lib/PixelsWriterImpl.cpp b/cpp/pixels-core/lib/PixelsWriterImpl.cpp index aa89850371..83c2f30e42 100644 --- a/cpp/pixels-core/lib/PixelsWriterImpl.cpp +++ b/cpp/pixels-core/lib/PixelsWriterImpl.cpp @@ -31,7 +31,6 @@ #include "PixelsVersion.h" #include "utils/Endianness.h" #include "physical/PhysicalWriterUtil.h" -#include "pixels-common/pixels.pb.h" #include "reader/PixelsRecordReader.h" #include "reader/PixelsRecordReaderImpl.h" #include "utils/Endianness.h" @@ -53,7 +52,7 @@ PixelsWriterImpl::PixelsWriterImpl(std::shared_ptr schema, bool nullsPadding, bool partitioned, int compressionBlockSize) : schema(schema), rowGroupSize(rowGroupSize), - compressionBlockSize(compressionBlockSize) + compressionBlockSize(compressionBlockSize),fbb(1024) { this->columnWriterOption = std::make_shared() ->setPixelsStride(pixelsStride) @@ -61,7 +60,7 @@ PixelsWriterImpl::PixelsWriterImpl(std::shared_ptr schema, ->setNullsPadding(nullsPadding); this->physicalWriter = PhysicalWriterUtil::newPhysicalWriter( targetFilePath, blockSize, blockPadding, false); - this->compressionKind = pixels::proto::CompressionKind::NONE; + this->compressionKind = pixels::fb::CompressionKind::CompressionKind_NONE; // this->timeZone = // std::unique_ptr(icu::TimeZone::createDefault()); this->children = schema->getChildren(); @@ -77,7 +76,6 @@ PixelsWriterImpl::PixelsWriterImpl(std::shared_ptr schema, bool PixelsWriterImpl::addRowBatch( std::shared_ptr rowBatch) { - std::cout << 
"PixelsWriterImpl::addRowBatch" << std::endl; curRowGroupDataLength = 0; curRowGroupNumOfRows += rowBatch->count(); writeColumnVectors(rowBatch->cols, rowBatch->count()); @@ -90,45 +88,71 @@ bool PixelsWriterImpl::addRowBatch( } return true; } +// single thread +// void PixelsWriterImpl::writeColumnVectors( +// std::vector> &columnVectors, +// int rowBatchSize) +// { +// +// int batchDataLength = 0; +// int commonColumnLength = columnVectors.size(); +// +// +// for (int i = 0; i < commonColumnLength; ++i) +// { +// try +// { +// int columnWrittenSize = columnWriters[i]->write(columnVectors[i], rowBatchSize); +// batchDataLength += columnWrittenSize; +// } +// catch (const std::exception &e) +// { +// throw std::runtime_error("Single-threaded write failed at column [" + std::to_string(i) + +// "], name: " + ". Error: " + std::string(e.what())); +// } +// } +// curRowGroupDataLength += batchDataLength; +// } void PixelsWriterImpl::writeColumnVectors( std::vector> &columnVectors, int rowBatchSize) { - std::vector> futures; - std::atomic dataLength(0); + std::vector> futures; int commonColumnLength = columnVectors.size(); - // Writing regular columns + // Writing regular columns in parallel + // Each column writer now maintains its own statistics snapshots + // and doesn't need the shared FlatBufferBuilder until buildColumnChunkIndex for (int i = 0; i < commonColumnLength; ++i) { - // dataLength += columnWriters[i]->write(columnVectors[i], rowBatchSize); - futures.emplace_back(std::async(std::launch::async, [this, columnVectors, - rowBatchSize, i, - &dataLength]() + futures.emplace_back(std::async(std::launch::async, [this, &columnVectors, + rowBatchSize, i]() { try { - dataLength += columnWriters[i]->write(columnVectors[i], rowBatchSize); - } catch (const std::exception &e) + // Each thread uses a temporary FlatBufferBuilder for newPixel serialization + // The actual usage is internal to write() -> newPixel() + return columnWriters[i]->write(columnVectors[i], 
rowBatchSize); + } + catch (const std::exception &e) { - throw std::runtime_error("failed to write column vector: " + + throw std::runtime_error("failed to write column vector [" + std::to_string(i) + "]: " + std::string(e.what())); } })); } - // Wait for all futures to complete + // Wait for all futures to complete and accumulate data length + int dataLength = 0; for (auto &future : futures) { - future.get(); // Blocking until all tasks are completed + dataLength += future.get(); // Blocking until all tasks are completed } - // Simulate curRowGroupDataLength accumulation - curRowGroupDataLength += dataLength.load(); - std::cout << "Data length written: " << curRowGroupDataLength << std::endl; + // Accumulate current RowGroup data length + curRowGroupDataLength += dataLength; } - void PixelsWriterImpl::close() { try @@ -150,21 +174,76 @@ void PixelsWriterImpl::close() } } + + + +/* + * writeRowGroup + * + * Persist the current RowGroup (accumulated through multiple addRowBatch() calls) + * to disk, and collect its metadata to be written to the file footer. 
+ * + * Trigger conditions: + * 1) During addRowBatch(), when the accumulated RowGroup data size reaches + * the threshold (curRowGroupDataLength >= rowGroupSize) + * 2) Before close(), when there are remaining unwritten rows + * (curRowGroupNumOfRows != 0) + * + * The current implementation is divided into four stages + * (corresponding to the four private methods below): + * + * 1) prepareRowGroup() + * - Flush each ColumnWriter, materializing its internal buffers into + * column chunks + * - Compute the total data length to be written for this RowGroup, + * including alignment padding based on CHUNK_ALIGNMENT + * + * 2) writeRowGroupData(rowGroupDataLength) + * - Call physicalWriter->prepare() to reserve write space and obtain + * curRowGroupOffset + * - Insert alignment padding at the beginning of the RowGroup if needed + * (to avoid block boundary or alignment issues) + * - Sequentially write the raw data of each column chunk, inserting + * padding after each chunk when required + * + * 3) writeRowGroupFooter() + * - Use FlatBuffers (via the member fbb) to build the ColumnChunkIndex + * and Encoding information for this RowGroup + * - Reinitialize columnWriters[i] to begin writing the next RowGroup + * - Return the RowGroup data region length, which matches the value + * computed in prepareRowGroup() and is used for subsequent metadata + * + * 4) recordRowGroupMetadata(rowGroupDataLength) + * - Serialize the completed RowGroupFooter stored in fbb and write it + * to the physical file + * - Update rowGroupInfoList / rowGroupMetadataList as well as + * fileRowNum and fileContentLength + * - Finally, call fbb.Clear() to reset the builder state, preventing + * memory and object contamination from reuse + */ + void PixelsWriterImpl::writeRowGroup() { - // TODO - std::cout << "Try to write rowGroup" << std::endl; - int rowGroupDataLength = 0; - // pixels::proto::RowGroupStatistic curRowGroupStatistic; - pixels::proto::RowGroupInformation curRowGroupInfo; - 
pixels::proto::RowGroupIndex curRowGroupIndex; - pixels::proto::RowGroupEncoding curRowGroupEncoding; - // reset each column writer and get current row group content size in bytes - for (auto writer : columnWriters) + + int rowGroupDataLength=prepareRowGroup(); + + writeRowGroupData(rowGroupDataLength); + + rowGroupDataLength=writeRowGroupFooter(); + + recordRowGroupMetadata(rowGroupDataLength); + +} + + +int PixelsWriterImpl::prepareRowGroup() +{ + int rowGroupDataLength =0; + for (auto writer: columnWriters) { - // flush writes the isNull bit map into the internal output stream. + // flush residual rows to storage, a new pixels is created writer->flush(); - rowGroupDataLength += writer->getColumnChunkSize(); + rowGroupDataLength+=writer->getColumnChunkSize(); if (CHUNK_ALIGNMENT != 0 && rowGroupDataLength % CHUNK_ALIGNMENT != 0) { /* @@ -177,168 +256,221 @@ void PixelsWriterImpl::writeRowGroup() CHUNK_ALIGNMENT - rowGroupDataLength % CHUNK_ALIGNMENT; } } - // write and flush row group content + return rowGroupDataLength; +} + +void PixelsWriterImpl::writeRowGroupData(uint32_t rowGroupDataLength) +{ try { - curRowGroupOffset = physicalWriter->prepare(rowGroupDataLength); - if (curRowGroupOffset != -1) + curRowGroupOffset= physicalWriter->prepare(rowGroupDataLength); + if (curRowGroupOffset!=-1){ + // No need for double alignment + if (CHUNK_ALIGNMENT !=0 && curRowGroupOffset % CHUNK_ALIGNMENT) + { + int paddingNeeded = CHUNK_ALIGNMENT -(curRowGroupOffset% CHUNK_ALIGNMENT); + physicalWriter->append(CHUNK_PADDING_BUFFER.data(),0,paddingNeeded); + writtenBytes += paddingNeeded; + curRowGroupOffset = physicalWriter->prepare(rowGroupDataLength); + } + + for (auto & writer: columnWriters) { - int tryAlign = 0; - while (CHUNK_ALIGNMENT != 0 && curRowGroupOffset % CHUNK_ALIGNMENT != 0 && - tryAlign++ < 2) + auto rowGroupBuffer=writer->getColumnChunkContent(); + physicalWriter->append(rowGroupBuffer.data(),0,rowGroupBuffer.size()); + writtenBytes += rowGroupBuffer.size(); 
+ if (CHUNK_ALIGNMENT != 0 && + rowGroupBuffer.size() % CHUNK_ALIGNMENT != 0) { - int alignBytes = CHUNK_ALIGNMENT - curRowGroupOffset % CHUNK_ALIGNMENT; + int alignBytes = + CHUNK_ALIGNMENT - rowGroupBuffer.size() % CHUNK_ALIGNMENT; physicalWriter->append(CHUNK_PADDING_BUFFER.data(), 0, alignBytes); writtenBytes += alignBytes; - curRowGroupOffset = physicalWriter->prepare(rowGroupDataLength); - } - if (tryAlign > 2) - { - std::cerr << "Failed to align the start offset of the column chunks in " - "the row group" - << std::endl; - throw std::runtime_error("Failed to align the start offset of the " - "column chunks in the row group"); - } - - for (auto &writer : columnWriters) - { - auto rowGroupBuffer = writer->getColumnChunkContent(); - physicalWriter->append(rowGroupBuffer.data(), 0, rowGroupBuffer.size()); - writtenBytes += rowGroupBuffer.size(); - if (CHUNK_ALIGNMENT != 0 && - rowGroupBuffer.size() % CHUNK_ALIGNMENT != 0) - { - int alignBytes = - CHUNK_ALIGNMENT - rowGroupBuffer.size() % CHUNK_ALIGNMENT; - physicalWriter->append(CHUNK_PADDING_BUFFER.data(), 0, alignBytes); - writtenBytes += alignBytes; - } } - physicalWriter->flush(); - } else - { - std::cerr << "Write row group prepare failed" << std::endl; - throw std::runtime_error("Write row group prepare failed"); } - } catch (const std::exception &e) + physicalWriter->flush(); + } else + { + std::cerr << "Write row group prepare failed" << std::endl; + throw std::runtime_error("Write row group prepare failed"); + } + }catch (const std::exception &e) { std::cerr << e.what() << std::endl; throw; } +} + +int PixelsWriterImpl::writeRowGroupFooter() +{ + std::vector< + flatbuffers::Offset> + columnChunkIndexVector; + + std::vector< + flatbuffers::Offset> + columnChunkEncodingVector; + // update index and stats(necessary?) 
- rowGroupDataLength = 0; + int rowGroupDataLength = 0; for (int i = 0; i < columnWriters.size(); i++) { std::shared_ptr writer = columnWriters[i]; - auto chunkIndex = writer->getColumnChunkIndex(); - chunkIndex.set_chunkoffset(curRowGroupOffset + rowGroupDataLength); - chunkIndex.set_chunklength(writer->getColumnChunkSize()); - chunkIndex.set_littleendian(true); + auto chunkIndex = writer->buildColumnChunkIndex(fbb, + curRowGroupOffset + rowGroupDataLength,writer->getColumnChunkSize(),true); + rowGroupDataLength += writer->getColumnChunkSize(); if (CHUNK_ALIGNMENT != 0 && rowGroupDataLength % CHUNK_ALIGNMENT != 0) { rowGroupDataLength += CHUNK_ALIGNMENT - rowGroupDataLength % CHUNK_ALIGNMENT; } - *(curRowGroupIndex.add_columnchunkindexentries()) = chunkIndex; - *(curRowGroupEncoding.add_columnchunkencodings()) = - writer->getColumnChunkEncoding(); + columnChunkIndexVector.push_back(chunkIndex); + columnChunkEncodingVector.push_back(writer->getColumnChunkEncoding(fbb)); columnWriters[i] = ColumnWriterBuilder::newColumnWriter(children.at(i), columnWriterOption); } - // put curRowGroupIndex into rowGroupFooter - std::shared_ptr rowGroupFooter = - std::make_shared(); - - rowGroupFooter->mutable_rowgroupindexentry()->CopyFrom(curRowGroupIndex); - rowGroupFooter->mutable_rowgroupencoding()->CopyFrom(curRowGroupEncoding); - std::cout << "curRowGroupEncoding: " << curRowGroupEncoding.ByteSizeLong() - << std::endl; - std::cout << "curRowGroupEncoding: " << curRowGroupEncoding.ByteSizeLong() - << std::endl; + auto entriesOffset = fbb.CreateVector(columnChunkIndexVector); + auto curRowGroupIndex = pixels::fb::CreateRowGroupIndex(fbb, entriesOffset); + auto curRowGroupEncodingOffset = fbb.CreateVector(columnChunkEncodingVector); + auto curRowGroupEncoding=pixels::fb::CreateRowGroupEncoding(fbb,curRowGroupEncodingOffset,0); + flatbuffers::Offset rowGroupFooterOffset = + pixels::fb::CreateRowGroupFooter(fbb, curRowGroupIndex, curRowGroupEncoding); + + auto 
curRowGroupFooterOffset = rowGroupFooterOffset; + + fbb.Finish(curRowGroupFooterOffset); + + return rowGroupDataLength; + +} + +void PixelsWriterImpl::recordRowGroupMetadata(int rowGroupDataLength ) +{ + uint8_t *footerBufPtr = fbb.GetBufferPointer(); + size_t footerBufSize = fbb.GetSize(); + long uploadedFooterOffset = 0; + try { - ByteBuffer footerBuffer(rowGroupFooter->ByteSizeLong()); - rowGroupFooter->SerializeToArray(footerBuffer.getPointer(), - rowGroupFooter->ByteSizeLong()); - physicalWriter->prepare(footerBuffer.size()); - curRowGroupFooterOffset = physicalWriter->append(footerBuffer.getPointer(), - 0, footerBuffer.size()); - writtenBytes += footerBuffer.size(); + physicalWriter->prepare(footerBufSize); + uploadedFooterOffset = physicalWriter->append(footerBufPtr, 0, footerBufSize); + writtenBytes += footerBufSize; physicalWriter->flush(); } catch (const std::exception &e) { - std::cerr << e.what() << std::endl; + std::cerr << "Failed to write RowGroupFooter: " << e.what() << std::endl; throw; } + // Update RowGroupInformation and add it to the list - curRowGroupInfo.set_footeroffset(curRowGroupFooterOffset); - curRowGroupInfo.set_datalength(rowGroupDataLength); - curRowGroupInfo.set_footerlength(rowGroupFooter->ByteSizeLong()); - curRowGroupInfo.set_numberofrows(curRowGroupNumOfRows); + auto curRowGroupInfo = pixels::fb::CreateRowGroupInformation( + fbb, + uploadedFooterOffset, + rowGroupDataLength, + static_cast(footerBufSize), + curRowGroupNumOfRows, + 0 ); rowGroupInfoList.push_back(curRowGroupInfo); + // store rowgroupInfo + RowGroupNative nativeInfo; + nativeInfo.footerOffset = (uint64_t)uploadedFooterOffset; + nativeInfo.footerLength = (uint32_t)footerBufSize; + nativeInfo.dataLength = this->curRowGroupDataLength; + nativeInfo.numberOfRows = this->curRowGroupNumOfRows; + rowGroupMetadataList.push_back(nativeInfo); + this->fileRowNum += curRowGroupNumOfRows; this->fileContentLength += rowGroupDataLength; - std::cout << 
"PixelsWriterImpl::writeRowGroup" << std::endl; + + + // release fbb + fbb.Clear(); } + void PixelsWriterImpl::writeFileTail() { - std::shared_ptr footer = - std::make_shared(); - std::shared_ptr postScript = - std::make_shared(); - schema->writeTypes(footer); - for (auto rowGroupInformation : rowGroupInfoList) - { - *(footer->add_rowgroupinfos()) = rowGroupInformation; + fbb.Clear(); + + std::vector> infoOffsets; + for (size_t i = 0; i < rowGroupMetadataList.size(); i++) { + const auto& info = rowGroupMetadataList[i]; + + auto fbInfo = pixels::fb::CreateRowGroupInformation( + fbb, + info.footerOffset, + info.dataLength, + info.footerLength, + info.numberOfRows + ); + infoOffsets.push_back(fbInfo); } - postScript->set_version(PixelsVersion::V1); - std::string FILE_MAGIC = "PIXELS"; - postScript->set_contentlength(fileContentLength); - postScript->set_numberofrows(fileRowNum); - postScript->set_compression(compressionKind); - postScript->set_compressionblocksize(compressionBlockSize); - postScript->set_pixelstride(columnWriterOption->getPixelsStride()); - postScript->set_partitioned(partitioned); - postScript->set_columnchunkalignment(CHUNK_ALIGNMENT); - postScript->set_magic(FILE_MAGIC); - - // build fileTail - pixels::proto::FileTail fileTail; - *fileTail.mutable_footer() = *footer; - *fileTail.mutable_postscript() = *postScript; - fileTail.set_footerlength(footer->ByteSizeLong()); - fileTail.set_postscriptlength(postScript->ByteSizeLong()); - - // flush filetail - int fileTailLen = fileTail.ByteSizeLong() + 8; - physicalWriter->prepare(fileTailLen); - - std::shared_ptr fileTailBuffer = - std::make_shared(fileTail.ByteSizeLong()); - fileTail.SerializeToArray(fileTailBuffer->getPointer(), - fileTail.ByteSizeLong()); - long tailOffset = physicalWriter->append(fileTailBuffer->getPointer(), 0, - fileTail.ByteSizeLong()); - - if (Endianness::isLittleEndian()) - { - tailOffset = (long) __builtin_bswap64(tailOffset); - } + auto rowGroupInfoVector = 
fbb.CreateVector(infoOffsets); + + // === 1. build Footer === + size_t footerStart = fbb.GetSize(); + + auto typeOffsets = schema->writeTypes(fbb); + auto typeVector = fbb.CreateVector(typeOffsets); + auto footerOffset = pixels::fb::CreateFooter(fbb, typeVector, 0, rowGroupInfoVector); + + uint32_t footerLength = static_cast(fbb.GetSize() - footerStart); + + // === 2. build PostScript === + size_t psStart = fbb.GetSize(); + + auto magicOffset = fbb.CreateString("PIXELS"); + auto postScriptOffset = pixels::fb::CreatePostScript( + fbb, + PixelsVersion::V1, + fileContentLength, + fileRowNum, + compressionKind, + compressionBlockSize, + columnWriterOption->getPixelsStride(), + 0, // writerTimeZone + partitioned, + CHUNK_ALIGNMENT, + false, + magicOffset + ); + + uint32_t postScriptLength = static_cast(fbb.GetSize() - psStart); - std::shared_ptr tailOffsetBuffer = - std::make_shared(8); + // === 3. build root object FileTail === + auto fileTailOffset = pixels::fb::CreateFileTail( + fbb, + footerOffset, + postScriptOffset, + footerLength, + postScriptLength + ); + fbb.Finish(fileTailOffset); + + // === 4. 
physical write === + uint8_t *bufferPtr = fbb.GetBufferPointer(); + size_t bufferSize = fbb.GetSize(); + + int totalFileTailLen = static_cast(bufferSize + 8); + physicalWriter->prepare(totalFileTailLen); + + long tailOffset = physicalWriter->append(bufferPtr, 0, bufferSize); + + if (Endianness::isLittleEndian()) + { + tailOffset = (long) __builtin_bswap64(tailOffset); + } + + std::shared_ptr tailOffsetBuffer = std::make_shared(8); tailOffsetBuffer->putLong(tailOffset); physicalWriter->append(tailOffsetBuffer); - writtenBytes += fileTailLen; - physicalWriter->flush(); - std::cout << "PixelsWriterImpl::writeFileTail" << std::endl; -} \ No newline at end of file + writtenBytes += totalFileTailLen; + physicalWriter->flush(); +} diff --git a/cpp/pixels-core/lib/TypeDescription.cpp b/cpp/pixels-core/lib/TypeDescription.cpp index e553e99e48..ed5a28e1ca 100644 --- a/cpp/pixels-core/lib/TypeDescription.cpp +++ b/cpp/pixels-core/lib/TypeDescription.cpp @@ -139,61 +139,61 @@ TypeDescription::TypeDescription(Category c) } std::shared_ptr -TypeDescription::createSchema(const std::vector > &types) +TypeDescription::createSchema(std::span types) { std::shared_ptr schema = createStruct(); - for (const auto &type: types) + for (const auto type: types) { - const std::string &fieldName = type->name(); + const std::string &fieldName = type->name()->str(); std::shared_ptr fieldType; switch (type->kind()) { - case pixels::proto::Type_Kind_BOOLEAN: + case pixels::fb::TypeKind_BOOLEAN: fieldType = TypeDescription::createBoolean(); break; - case pixels::proto::Type_Kind_LONG: + case pixels::fb::TypeKind_LONG: fieldType = TypeDescription::createLong(); break; - case pixels::proto::Type_Kind_INT: + case pixels::fb::TypeKind_INT: fieldType = TypeDescription::createInt(); break; - case pixels::proto::Type_Kind_SHORT: + case pixels::fb::TypeKind_SHORT: fieldType = TypeDescription::createShort(); break; - case pixels::proto::Type_Kind_BYTE: + case pixels::fb::TypeKind_BYTE: fieldType = 
TypeDescription::createByte(); break; - case pixels::proto::Type_Kind_FLOAT: + case pixels::fb::TypeKind_FLOAT: fieldType = TypeDescription::createFloat(); break; - case pixels::proto::Type_Kind_DOUBLE: + case pixels::fb::TypeKind_DOUBLE: fieldType = TypeDescription::createDouble(); break; - case pixels::proto::Type_Kind_DECIMAL: + case pixels::fb::TypeKind_DECIMAL: fieldType = TypeDescription::createDecimal(type->precision(), type->scale()); break; - case pixels::proto::Type_Kind_VARCHAR: + case pixels::fb::TypeKind_VARCHAR: fieldType = TypeDescription::createVarchar(); - fieldType->maxLength = type->maximumlength(); + fieldType->maxLength = type->maximumLength(); break; - case pixels::proto::Type_Kind_CHAR: + case pixels::fb::TypeKind_CHAR: fieldType = TypeDescription::createChar(); - fieldType->maxLength = type->maximumlength(); + fieldType->maxLength = type->maximumLength(); break; - case pixels::proto::Type_Kind_STRING: + case pixels::fb::TypeKind_STRING: fieldType = TypeDescription::createString(); break; - case pixels::proto::Type_Kind_DATE: + case pixels::fb::TypeKind_DATE: fieldType = TypeDescription::createDate(); break; - case pixels::proto::Type_Kind_TIME: + case pixels::fb::TypeKind_TIME: fieldType = TypeDescription::createTime(); break; - case pixels::proto::Type_Kind_TIMESTAMP: + case pixels::fb::TypeKind_TIMESTAMP: fieldType = TypeDescription::createTimestamp(); break; default: - throw InvalidArgumentException("TypeDescription::createSchema: Unknown type: " + type->name()); + throw InvalidArgumentException("TypeDescription::createSchema: Unknown type: " + type->name()->str()); } schema->addField(fieldName, fieldType); } @@ -761,90 +761,84 @@ TypeDescription TypeDescription::withMaxLength(int maxLength) return *this; } -void TypeDescription::writeTypes(std::shared_ptr footer) +std::vector> TypeDescription::writeTypes(flatbuffers::FlatBufferBuilder& fbb) { - std::vector > children = this->getChildren(); - std::vector names = this->getFieldNames(); + 
std::vector> children = this->getChildren(); + std::vector names = this->getFieldNames(); + std::vector> typeVector; + if (children.empty()) { - return; + return typeVector; } + for (int i = 0; i < children.size(); i++) { - std::shared_ptr child = children.at(i); - std::shared_ptr tmpType = std::make_shared(); - tmpType->set_name(names.at(i)); + std::shared_ptr child = children.at(i); + + // 1. get and create FlatBuffers strin offset + flatbuffers::Offset fbName = 0; + if (i < names.size()) { + fbName = fbb.CreateString(names.at(i)); + } + switch (child->getCategory()) { case TypeDescription::Category::BOOLEAN: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_BOOLEAN); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_BOOLEAN, fbName)); break; case TypeDescription::Category::BYTE: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_BYTE); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_BYTE, fbName)); break; case TypeDescription::Category::SHORT: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_SHORT); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_SHORT, fbName)); break; case TypeDescription::Category::INT: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_INT); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_INT, fbName)); break; case TypeDescription::Category::LONG: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_LONG); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_LONG, fbName)); break; case TypeDescription::Category::FLOAT: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_FLOAT); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_FLOAT, fbName)); break; case TypeDescription::Category::DOUBLE: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_DOUBLE); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_DOUBLE, fbName)); break; case 
TypeDescription::Category::DECIMAL: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_DECIMAL); - tmpType->set_precision(child->getPrecision()); - tmpType->set_scale(child->getScale()); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_DECIMAL, fbName, 0, 0, child->getPrecision(), child->getScale())); break; case TypeDescription::Category::STRING: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_STRING); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_STRING, fbName)); break; case TypeDescription::Category::CHAR: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_CHAR); - tmpType->set_maximumlength(child->getMaxLength()); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_CHAR, fbName, 0, child->getMaxLength())); break; case TypeDescription::Category::VARCHAR: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_VARCHAR); - tmpType->set_maximumlength(child->getMaxLength()); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_VARCHAR, fbName, 0, child->getMaxLength())); break; case TypeDescription::Category::BINARY: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_BINARY); - tmpType->set_maximumlength(child->getMaxLength()); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_BINARY, fbName, 0, child->getMaxLength())); break; case TypeDescription::Category::VARBINARY: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_VARBINARY); - tmpType->set_maximumlength(child->getMaxLength()); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_VARBINARY, fbName, 0, child->getMaxLength())); break; case TypeDescription::Category::TIMESTAMP: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_TIMESTAMP); - tmpType->set_precision(child->getPrecision()); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_TIMESTAMP, fbName, 0, 0, child->getPrecision())); break; case TypeDescription::Category::DATE: - 
tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_DATE); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_DATE, fbName)); break; case TypeDescription::Category::TIME: - tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_TIME); - tmpType->set_precision(child->getPrecision()); + typeVector.push_back(pixels::fb::CreateType(fbb, pixels::fb::TypeKind_TIME, fbName, 0, 0, child->getPrecision())); break; -// case TypeDescription::Category::VECTOR: -// tmpType->set_kind(pixels::proto::Type_Kind::Type_Kind_VECTOR); -// tmpType->set_dimension(child->getDimension()); -// break; default: { std::string errorMsg = "Unknown category: "; - errorMsg += static_cast>(this->getCategory()); + errorMsg += std::to_string(static_cast>(child->getCategory())); throw std::runtime_error(errorMsg); } - - } - *(footer->add_types()) = *tmpType; } -} + return typeVector; +} \ No newline at end of file diff --git a/cpp/pixels-core/lib/encoding/RunLenIntEncoder.cpp b/cpp/pixels-core/lib/encoding/RunLenIntEncoder.cpp index 82828f7b0f..4840055df4 100644 --- a/cpp/pixels-core/lib/encoding/RunLenIntEncoder.cpp +++ b/cpp/pixels-core/lib/encoding/RunLenIntEncoder.cpp @@ -92,12 +92,10 @@ void RunLenIntEncoder::encode(long *values, int offset, int length, byte *result { for (int i = 0; i < length; ++i) { - // std::cout << encodingType << " value : " << values[i + offset] << std::endl; + std::cout << encodingType << " value : " << values[i + offset] << std::endl; this->write(values[i + offset]); } flush(); - // std::cout << "length: " << length << std::endl; - // std::cout << "buffer end: " << outputStream->getWritePos() << std::endl; resLen = outputStream->getWritePos(); outputStream->getBytes(results, resLen); outputStream->resetPosition(); diff --git a/cpp/pixels-core/lib/reader/ColumnReader.cpp b/cpp/pixels-core/lib/reader/ColumnReader.cpp index 07ba2b5855..48a9e0cfd7 100644 --- a/cpp/pixels-core/lib/reader/ColumnReader.cpp +++ b/cpp/pixels-core/lib/reader/ColumnReader.cpp 
@@ -73,9 +73,9 @@ std::shared_ptr ColumnReader::newColumnReader(std::shared_ptr input, pixels::proto::ColumnEncoding &encoding, int offset, int size, +ColumnReader::read(std::shared_ptr input,const pixels::fb::ColumnEncoding* encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, std::shared_ptr filterMask) + const pixels::fb::ColumnChunkIndex* chunkIndex, std::shared_ptr filterMask) { } diff --git a/cpp/pixels-core/lib/reader/DateColumnReader.cpp b/cpp/pixels-core/lib/reader/DateColumnReader.cpp index cb43c4ebe6..fa044bd88b 100644 --- a/cpp/pixels-core/lib/reader/DateColumnReader.cpp +++ b/cpp/pixels-core/lib/reader/DateColumnReader.cpp @@ -23,6 +23,7 @@ * @create 2023-04-06 */ #include "reader/DateColumnReader.h" +#include "vector/DateColumnVector.h" DateColumnReader::DateColumnReader(std::shared_ptr type) : ColumnReader(type) @@ -35,9 +36,10 @@ void DateColumnReader::close() } -void DateColumnReader::read(std::shared_ptr input, pixels::proto::ColumnEncoding &encoding, int offset, +void DateColumnReader::read(std::shared_ptr input, + const pixels::fb::ColumnEncoding* encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, std::shared_ptr filterMask) + const pixels::fb::ColumnChunkIndex* chunkIndex, std::shared_ptr filterMask) { std::shared_ptr columnVector = std::static_pointer_cast(vector); @@ -45,14 +47,14 @@ void DateColumnReader::read(std::shared_ptr input, pixels::proto::C { decoder = std::make_shared(input, true); elementIndex = 0; - isNullOffset = chunkIndex.isnulloffset(); + isNullOffset = chunkIndex->isNullOffset(); } int pixelId = elementIndex / pixelStride; - bool hasNull = chunkIndex.pixelstatistics(pixelId).statistic().hasnull(); + bool hasNull = chunkIndex->pixelStatistics()->Get(pixelId)->statistic()->hasNull(); setValid(input, pixelStride, vector, pixelId, hasNull); - if (encoding.kind() 
== pixels::proto::ColumnEncoding_Kind_RUNLENGTH) + if (encoding->kind() == pixels::fb::EncodingKind_RUNLENGTH) { for (int i = 0; i < size; i++) { diff --git a/cpp/pixels-core/lib/reader/DecimalColumnReader.cpp b/cpp/pixels-core/lib/reader/DecimalColumnReader.cpp index 77cbbb826a..023415aacd 100644 --- a/cpp/pixels-core/lib/reader/DecimalColumnReader.cpp +++ b/cpp/pixels-core/lib/reader/DecimalColumnReader.cpp @@ -23,6 +23,7 @@ * @create 2023-04-05 */ #include "reader/DecimalColumnReader.h" +#include "vector/DecimalColumnVector.h" /** * The column reader of decimals. @@ -39,9 +40,10 @@ void DecimalColumnReader::close() } -void DecimalColumnReader::read(std::shared_ptr input, pixels::proto::ColumnEncoding &encoding, int offset, +void DecimalColumnReader::read(std::shared_ptr input, + const pixels::fb::ColumnEncoding* encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, std::shared_ptr filterMask) + const pixels::fb::ColumnChunkIndex* chunkIndex, std::shared_ptr filterMask) { std::shared_ptr columnVector = std::static_pointer_cast(vector); @@ -57,12 +59,12 @@ void DecimalColumnReader::read(std::shared_ptr input, pixels::proto { // TODO: here we check null ColumnReader::elementIndex = 0; - isNullOffset = chunkIndex.isnulloffset(); + isNullOffset = chunkIndex->isNullOffset(); } // TODO: we didn't implement the run length encoded method int pixelId = elementIndex / pixelStride; - bool hasNull = chunkIndex.pixelstatistics(pixelId).statistic().hasnull(); + bool hasNull = chunkIndex->pixelStatistics()->Get(pixelId)->statistic()->hasNull(); setValid(input, pixelStride, vector, pixelId, hasNull); columnVector->vector = (long *) (input->getPointer() + input->getReadPos()); diff --git a/cpp/pixels-core/lib/reader/IntColumnReader.cpp b/cpp/pixels-core/lib/reader/IntColumnReader.cpp index 080eae2a5a..b64ab0a57b 100644 --- a/cpp/pixels-core/lib/reader/IntColumnReader.cpp +++ 
b/cpp/pixels-core/lib/reader/IntColumnReader.cpp @@ -38,10 +38,10 @@ void IntColumnReader::close() } void IntColumnReader::read(std::shared_ptr input, - pixels::proto::ColumnEncoding &encoding, int offset, + const pixels::fb::ColumnEncoding* encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, + const pixels::fb::ColumnChunkIndex* chunkIndex, std::shared_ptr filterMask) { std::shared_ptr columnVector = @@ -55,14 +55,14 @@ void IntColumnReader::read(std::shared_ptr input, { decoder = std::make_shared(input, true); ColumnReader::elementIndex = 0; - isNullOffset = chunkIndex.isnulloffset(); + isNullOffset = chunkIndex->isNullOffset(); } int pixelId = elementIndex / pixelStride; - bool hasNull = chunkIndex.pixelstatistics(pixelId).statistic().hasnull(); + bool hasNull = chunkIndex->pixelStatistics()->Get(pixelId)->statistic()->hasNull(); setValid(input, pixelStride, vector, pixelId, hasNull); - if (encoding.kind() == pixels::proto::ColumnEncoding_Kind_RUNLENGTH) + if (encoding->kind() == pixels::fb::EncodingKind_RUNLENGTH) { for (int i = 0; i < size; i++) { diff --git a/cpp/pixels-core/lib/reader/LongColumnReader.cpp b/cpp/pixels-core/lib/reader/LongColumnReader.cpp index c2d7763238..9923001700 100644 --- a/cpp/pixels-core/lib/reader/LongColumnReader.cpp +++ b/cpp/pixels-core/lib/reader/LongColumnReader.cpp @@ -23,6 +23,7 @@ */ #include "reader/LongColumnReader.h" +#include "vector/LongColumnVector.h" LongColumnReader::LongColumnReader(std::shared_ptr type) : ColumnReader(type) @@ -36,10 +37,10 @@ void LongColumnReader::close() } void LongColumnReader::read(std::shared_ptr input, - pixels::proto::ColumnEncoding &encoding, int offset, + const pixels::fb::ColumnEncoding *encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, + const pixels::fb::ColumnChunkIndex *chunkIndex, std::shared_ptr filterMask) { 
std::shared_ptr columnVector = @@ -53,14 +54,14 @@ void LongColumnReader::read(std::shared_ptr input, { decoder = std::make_shared(input, true); ColumnReader::elementIndex = 0; - isNullOffset = chunkIndex.isnulloffset(); + isNullOffset = chunkIndex->isNullOffset(); } int pixelId = elementIndex / pixelStride; - bool hasNull = chunkIndex.pixelstatistics(pixelId).statistic().hasnull(); + bool hasNull = chunkIndex->pixelStatistics()->Get(pixelId)->statistic()->hasNull(); setValid(input, pixelStride, vector, pixelId, hasNull); - if (encoding.kind() == pixels::proto::ColumnEncoding_Kind_RUNLENGTH) + if (encoding->kind() == pixels::fb::EncodingKind_RUNLENGTH) { for (int i = 0; i < size; i++) { @@ -73,4 +74,4 @@ void LongColumnReader::read(std::shared_ptr input, columnVector->longVector = (int64_t *) (input->getPointer() + input->getReadPos()); } -} \ No newline at end of file +} diff --git a/cpp/pixels-core/lib/reader/PixelsRecordReaderImpl.cpp b/cpp/pixels-core/lib/reader/PixelsRecordReaderImpl.cpp index 02f6fc1cf1..7322652281 100644 --- a/cpp/pixels-core/lib/reader/PixelsRecordReaderImpl.cpp +++ b/cpp/pixels-core/lib/reader/PixelsRecordReaderImpl.cpp @@ -27,8 +27,8 @@ #include "profiler/CountProfiler.h" PixelsRecordReaderImpl::PixelsRecordReaderImpl(std::shared_ptr reader, - const pixels::proto::PostScript &pixelsPostScript, - const pixels::proto::Footer &pixelsFooter, + const pixels::fb::PostScript* pixelsPostScript, + const pixels::fb::Footer* pixelsFooter, const PixelsReaderOption &opt, std::shared_ptr pixelsFooterCache) { @@ -72,11 +72,11 @@ PixelsRecordReaderImpl::PixelsRecordReaderImpl(std::shared_ptr void PixelsRecordReaderImpl::checkBeforeRead() { // get file schema - auto fileColTypesFooterTypes = footer.types(); - auto fileColTypes = std::vector < std::shared_ptr < pixels::proto::Type >> {}; - for (const auto &type: fileColTypesFooterTypes) + auto fileColTypesFooterTypes = footer->types(); + auto fileColTypes = std::vector{}; + for (int i = 0; i < 
fileColTypesFooterTypes->size(); i++) { - fileColTypes.emplace_back(std::make_shared<::pixels::proto::Type>(type)); + fileColTypes.emplace_back(fileColTypesFooterTypes->Get(i)); } // TODO: if fileCOlTypes == null fileSchema = TypeDescription::createSchema(fileColTypes); @@ -93,7 +93,7 @@ void PixelsRecordReaderImpl::checkBeforeRead() { for (int j = 0; j < fileColTypes.size(); j++) { - if (icompare(col, fileColTypes.at(j)->name())) + if (icompare(col, fileColTypes.at(j)->name()->str())) { optionColsIndices.emplace_back(j); includedColumns.at(j) = true; @@ -151,7 +151,7 @@ void PixelsRecordReaderImpl::checkBeforeRead() void PixelsRecordReaderImpl::UpdateRowGroupInfo() { // if not end of file, update row count - curRGRowCount = (int) footer.rowgroupinfos(targetRGs.at(curRGIdx)).numberofrows(); + curRGRowCount = (int) footer->rowGroupInfos()->Get(targetRGs.at(curRGIdx))->numberOfRows(); if (enabledFilterPushDown) { @@ -161,22 +161,20 @@ void PixelsRecordReaderImpl::UpdateRowGroupInfo() curRGFooter = rowGroupFooters.at(curRGIdx); // refresh resultColumnsEncoded for reading the column vectors in the next row group. 
- const pixels::proto::RowGroupEncoding &rgEncoding = rowGroupFooters.at(curRGIdx)->rowgroupencoding(); + const pixels::fb::RowGroupEncoding* rgEncoding = rowGroupFooters.at(curRGIdx)->rowGroupEncoding(); for (int i = 0; i < includedColumnNum; i++) { resultColumnsEncoded.at(i) = - rgEncoding.columnchunkencodings(resultColumns.at(i)) - .kind() != pixels::proto::ColumnEncoding_Kind_NONE + rgEncoding->columnChunkEncodings()->Get(resultColumns.at(i)) + ->kind() != pixels::fb::EncodingKind_NONE && enableEncodedVector; } for (int i = 0; i < resultColumns.size(); i++) { - curEncoding.at(i) = std::make_shared( - rgEncoding.columnchunkencodings(resultColumns.at(i))); + curEncoding.at(i) = rgEncoding->columnChunkEncodings()->Get(resultColumns.at(i)); curChunkBufferIndex.at(i) = resultColumns.at(i); - curChunkIndex.at(i) = std::make_shared(curRGFooter->rowgroupindexentry() - .columnchunkindexentries( - resultColumns.at(i))); + curChunkIndex.at(i) = curRGFooter->rowGroupIndexEntry() + ->columnChunkIndexEntries()->Get(resultColumns.at(i)); } // This flag makes sure that each row group invokes read() everRead = false; @@ -245,9 +243,9 @@ std::shared_ptr PixelsRecordReaderImpl::readBatch(bool reus int index = curChunkBufferIndex.at(i); auto &encoding = curEncoding.at(i); auto &chunkIndex = curChunkIndex.at(i); - readers.at(i)->read(chunkBuffers.at(index), *encoding, curRowInRG, curBatchSize, - postScript.pixelstride(), resultRowBatch->rowCount, - columnVectors.at(i), *chunkIndex, filterMask); + readers.at(i)->read(chunkBuffers.at(index), encoding, curRowInRG, curBatchSize, + postScript->pixelStride(), resultRowBatch->rowCount, + columnVectors.at(i), chunkIndex, filterMask); filterColumnIndex.emplace_back(index); PixelsFilter::ApplyFilter(columnVectors.at(i), *filterCol.second, *filterMask, resultSchema->getChildren().at(i)); @@ -271,9 +269,9 @@ std::shared_ptr PixelsRecordReaderImpl::readBatch(bool reus } auto &encoding = curEncoding.at(i); auto &chunkIndex = curChunkIndex.at(i); 
- readers.at(i)->read(chunkBuffers.at(index), *encoding, curRowInRG, curBatchSize, - postScript.pixelstride(), resultRowBatch->rowCount, - columnVectors.at(i), *chunkIndex, filterMask); + readers.at(i)->read(chunkBuffers.at(index), encoding, curRowInRG, curBatchSize, + postScript->pixelStride(), resultRowBatch->rowCount, + columnVectors.at(i), chunkIndex, filterMask); } // update current row index in the row group @@ -310,7 +308,7 @@ void PixelsRecordReaderImpl::prepareRead() for (int i = 0; i < RGLen; i++) { includedRGs.at(i) = true; - includedRowNum += footer.rowgroupinfos(RGStart + i).numberofrows(); + includedRowNum += footer->rowGroupInfos()->Get(RGStart + i)->numberOfRows(); } targetRGs.clear(); targetRGs.resize(RGLen); @@ -357,9 +355,9 @@ void PixelsRecordReaderImpl::prepareRead() else { // cache miss, read from disk and put it into cache - const pixels::proto::RowGroupInformation &rowGroupInformation = footer.rowgroupinfos(rgId); - uint64_t footerOffset = rowGroupInformation.footeroffset(); - uint64_t footerLength = rowGroupInformation.footerlength(); + const pixels::fb::RowGroupInformation* rowGroupInformation = footer->rowGroupInfos()->Get(rgId); + uint64_t footerOffset = rowGroupInformation->footerOffset(); + uint64_t footerLength = rowGroupInformation->footerLength(); fis.push_back(i); requestBatch.add(queryId, (int) footerOffset, (int) footerLength); rowGroupFooterCacheHit.at(i) = false; @@ -373,8 +371,8 @@ void PixelsRecordReaderImpl::prepareRead() { if (!rowGroupFooterCacheHit.at(i)) { - auto parsed = std::make_shared(); - parsed->ParseFromArray(bbs[i]->getPointer(), (int) bbs[i]->size()); + const pixels::fb::RowGroupFooter* parsed = + flatbuffers::GetRoot((bbs[i]->getPointer())); rowGroupFooters.at(fis[i]) = parsed; if (footerCache != nullptr) { @@ -439,17 +437,17 @@ bool PixelsRecordReaderImpl::read() // TODO: support cache read - const pixels::proto::RowGroupIndex &rowGroupIndex = - rowGroupFooters[curRGIdx]->rowgroupindexentry(); + const 
pixels::fb::RowGroupIndex* rowGroupIndex = + rowGroupFooters[curRGIdx]->rowGroupIndexEntry(); for (int colId: targetColumns) { - const pixels::proto::ColumnChunkIndex &chunkIndex = - rowGroupIndex.columnchunkindexentries(colId); - if (!chunkIndex.littleendian()) + const pixels::fb::ColumnChunkIndex* chunkIndex = + rowGroupIndex->columnChunkIndexEntries()->Get(colId); + if (!chunkIndex->littleEndian()) { throw InvalidArgumentException("Pixels C++ reader only supports little endianness. "); } - ChunkId chunk(curRGIdx, colId, chunkIndex.chunkoffset(), chunkIndex.chunklength()); + ChunkId chunk(curRGIdx, colId, chunkIndex->chunkOffset(), chunkIndex->chunkLength()); diskChunks.emplace_back(chunk); } @@ -466,8 +464,13 @@ bool PixelsRecordReaderImpl::read() requestBatch.add(queryId, chunk.offset, (int) chunk.length, ::BufferPool::GetBufferId(i)); colIds.emplace_back(chunk.columnId); bytes.emplace_back(chunk.length); + + // std::cout << "[DEBUG] Reading RowGroup. Offset: " << chunk.offset + // << ", Length: " << chunk.length << std::endl; } + ::DirectUringRandomAccessFile::Initialize(); ::BufferPool::Initialize(colIds, bytes, fileSchema->getFieldNames()); + ::DirectUringRandomAccessFile::RegisterBufferFromPool(colIds); std::vector > originalByteBuffers; for (int i = 0; i < colIds.size(); i++) @@ -519,7 +522,7 @@ std::shared_ptr PixelsRecordReaderImpl::getResultSchema() std::shared_ptr PixelsRecordReaderImpl::createEmptyEOFRowBatch(int size) { auto emptySchema = TypeDescription::createSchema( - std::vector < std::shared_ptr < pixels::proto::Type >> ()); + std::span()); auto emptyRowBatch = emptySchema->createRowBatch(0); emptyRowBatch->rowCount = 0; return emptyRowBatch; @@ -547,4 +550,3 @@ void PixelsRecordReaderImpl::close() includedColumnTypes.clear(); endOfFile = true; } - diff --git a/cpp/pixels-core/lib/reader/StringColumnReader.cpp b/cpp/pixels-core/lib/reader/StringColumnReader.cpp index e6f0be5a2f..f886bc633f 100644 --- 
a/cpp/pixels-core/lib/reader/StringColumnReader.cpp +++ b/cpp/pixels-core/lib/reader/StringColumnReader.cpp @@ -24,6 +24,7 @@ */ #include "reader/StringColumnReader.h" #include "profiler/CountProfiler.h" +#include "vector/BinaryColumnVector.h" StringColumnReader::StringColumnReader(std::shared_ptr type) : ColumnReader(type) { @@ -46,9 +47,9 @@ void StringColumnReader::close() } -void StringColumnReader::read(std::shared_ptr input, pixels::proto::ColumnEncoding &encoding, int offset, +void StringColumnReader::read(std::shared_ptr input, const pixels::fb::ColumnEncoding* encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, std::shared_ptr filterMask) + const pixels::fb::ColumnChunkIndex* chunkIndex, std::shared_ptr filterMask) { // TODO: support dictionary std::shared_ptr columnVector = @@ -58,20 +59,20 @@ void StringColumnReader::read(std::shared_ptr input, pixels::proto: { elementIndex = 0; bufferOffset = 0; - isNullOffset = chunkIndex.isnulloffset(); + isNullOffset = chunkIndex->isNullOffset(); readContent(input, input->bytesRemaining(), encoding); } int pixelId = elementIndex / pixelStride; - bool hasNull = chunkIndex.pixelstatistics(pixelId).statistic().hasnull(); + bool hasNull = chunkIndex->pixelStatistics()->Get(pixelId)->statistic()->hasNull(); setValid(input, pixelStride, vector, pixelId, hasNull); // TODO: if dictionary encoded - if (encoding.kind() == pixels::proto::ColumnEncoding_Kind_DICTIONARY) + if (encoding->kind() == pixels::fb::EncodingKind_DICTIONARY) { bool cascadeRLE = false; - if (encoding.has_cascadeencoding() && - encoding.cascadeencoding().kind() == pixels::proto::ColumnEncoding_Kind_RUNLENGTH) + if (encoding->cascadeEncoding() != nullptr && + encoding->cascadeEncoding()->kind() == pixels::fb::EncodingKind_RUNLENGTH) { cascadeRLE = true; } @@ -91,7 +92,7 @@ void StringColumnReader::read(std::shared_ptr input, pixels::proto: // use setRef instead of setVal to 
reduce memory copy. columnVector->setRef(i + vectorIndex, dictContentBuf->getPointer(), dictStarts[originId], tmpLen); } - else if (!valid && (!cascadeRLE) && chunkIndex.nullspadding()) + else if (!valid && (!cascadeRLE) && chunkIndex->nullsPadding()) { // is null: skip this number contentBuf->getInt(); @@ -152,9 +153,9 @@ void StringColumnReader::read(std::shared_ptr input, pixels::proto: void StringColumnReader::readContent(std::shared_ptr input, uint32_t inputLength, - pixels::proto::ColumnEncoding &encoding) + const pixels::fb::ColumnEncoding* encoding) { - if (encoding.kind() == pixels::proto::ColumnEncoding_Kind_DICTIONARY) + if (encoding->kind() == pixels::fb::EncodingKind_DICTIONARY) { input->markReaderIndex(); input->skipBytes(inputLength - 2 * sizeof(int)); @@ -170,14 +171,14 @@ void StringColumnReader::readContent(std::shared_ptr input, *input, dictStartsOffset, startsBufLength); int bufferStart = 0; - if (encoding.has_cascadeencoding() && - encoding.cascadeencoding().kind() == pixels::proto::ColumnEncoding_Kind::ColumnEncoding_Kind_RUNLENGTH) + if (encoding->cascadeEncoding() != nullptr && + encoding->cascadeEncoding()->kind() == pixels::fb::EncodingKind_RUNLENGTH) { std::shared_ptr startsDecoder = std::make_shared(startsBuf, false); - if (encoding.has_dictionarysize()) + if (encoding->dictionarySize() != 0) { - startsLength = (int) encoding.dictionarysize() + 1; + startsLength = (int) encoding->dictionarySize() + 1; dictStarts = new int[startsLength]; int i = 0; while (startsDecoder->hasNext()) @@ -199,7 +200,7 @@ void StringColumnReader::readContent(std::shared_ptr input, "StringColumnReader::readContent: the length of the starts array buffer is invalid. 
"); } int startsSize = startsBufLength / sizeof(int); - if (encoding.has_dictionarysize() && encoding.dictionarysize() + 1 != startsSize) + if (encoding->dictionarySize() != 0 && encoding->dictionarySize() + 1 != startsSize) { throw new InvalidArgumentException( "the dictionary size is inconsistent with the size of the starts array"); diff --git a/cpp/pixels-core/lib/reader/TimestampColumnReader.cpp b/cpp/pixels-core/lib/reader/TimestampColumnReader.cpp index 38c8e29117..180b42d915 100644 --- a/cpp/pixels-core/lib/reader/TimestampColumnReader.cpp +++ b/cpp/pixels-core/lib/reader/TimestampColumnReader.cpp @@ -23,6 +23,7 @@ * @create 2023-12-23 */ #include "reader/TimestampColumnReader.h" +#include "vector/TimestampColumnVector.h" TimestampColumnReader::TimestampColumnReader(std::shared_ptr type) : ColumnReader(type) { @@ -35,9 +36,10 @@ void TimestampColumnReader::close() } void -TimestampColumnReader::read(std::shared_ptr input, pixels::proto::ColumnEncoding &encoding, int offset, +TimestampColumnReader::read(std::shared_ptr input, + const pixels::fb::ColumnEncoding *encoding, int offset, int size, int pixelStride, int vectorIndex, std::shared_ptr vector, - pixels::proto::ColumnChunkIndex &chunkIndex, + const pixels::fb::ColumnChunkIndex *chunkIndex, std::shared_ptr filterMask) { std::shared_ptr columnVector = @@ -47,14 +49,14 @@ TimestampColumnReader::read(std::shared_ptr input, pixels::proto::C { decoder = std::make_shared(input, true); ColumnReader::elementIndex = 0; - isNullOffset = chunkIndex.isnulloffset(); + isNullOffset = chunkIndex->isNullOffset(); } int pixelId = elementIndex / pixelStride; - bool hasNull = chunkIndex.pixelstatistics(pixelId).statistic().hasnull(); + bool hasNull = chunkIndex->pixelStatistics()->Get(pixelId)->statistic()->hasNull(); setValid(input, pixelStride, vector, pixelId, hasNull); - if (encoding.kind() == pixels::proto::ColumnEncoding_Kind_RUNLENGTH) + if (encoding->kind() == pixels::fb::EncodingKind_RUNLENGTH) { for (int i = 0; i 
< size; i++) { diff --git a/cpp/pixels-core/lib/stats/StatsRecorder.cpp b/cpp/pixels-core/lib/stats/StatsRecorder.cpp index 5036a4dac9..338c031b62 100644 --- a/cpp/pixels-core/lib/stats/StatsRecorder.cpp +++ b/cpp/pixels-core/lib/stats/StatsRecorder.cpp @@ -30,9 +30,9 @@ StatsRecorder::StatsRecorder() : numberOfValues(0), hasNull(false) {} -StatsRecorder::StatsRecorder(const pixels::proto::ColumnStatistic &statistic) - : numberOfValues(statistic.has_numberofvalues() ? statistic.numberofvalues() : 0), - hasNull(statistic.has_hasnull() ? statistic.hasnull() : true) +StatsRecorder::StatsRecorder(const pixels::fb::ColumnStatistic* statistic) + : numberOfValues(statistic != nullptr ? statistic->numberOfValues() : 0), + hasNull(statistic != nullptr ? statistic->hasNull() : true) {} @@ -132,12 +132,22 @@ bool StatsRecorder::hasNullValue() const { return hasNull; } -pixels::proto::ColumnStatistic StatsRecorder::serialize() const +flatbuffers::Offset StatsRecorder::serialize(flatbuffers::FlatBufferBuilder& builder) const { - pixels::proto::ColumnStatistic statistic; - statistic.set_numberofvalues(numberOfValues); - statistic.set_hasnull(hasNull); - return statistic; + return pixels::fb::CreateColumnStatistic( + builder, + numberOfValues, // numberOfValues + 0, // intStatistics + 0, // doubleStatistics + 0, // stringStatistics + 0, // bucketStatistics + 0, // binaryStatistics + 0, // timestampStatistics + 0, // dateStatistics + 0, // timeStatistics + 0, // int128Statistics + hasNull // hasNull + ); } @@ -157,7 +167,7 @@ std::unique_ptr StatsRecorder::create(TypeDescription type) std::unique_ptr -StatsRecorder::create(TypeDescription type, const pixels::proto::ColumnStatistic &statistic) +StatsRecorder::create(TypeDescription type, const pixels::fb::ColumnStatistic* statistic) { switch (type.getCategory()) { @@ -169,7 +179,7 @@ StatsRecorder::create(TypeDescription type, const pixels::proto::ColumnStatistic std::unique_ptr -StatsRecorder::create(TypeDescription::Category 
category, const pixels::proto::ColumnStatistic &statistic) +StatsRecorder::create(TypeDescription::Category category, const pixels::fb::ColumnStatistic* statistic) { switch (category) { diff --git a/cpp/pixels-core/lib/vector/IntColumnVector.cpp b/cpp/pixels-core/lib/vector/IntColumnVector.cpp index ca12a9b8f3..8077b69def 100644 --- a/cpp/pixels-core/lib/vector/IntColumnVector.cpp +++ b/cpp/pixels-core/lib/vector/IntColumnVector.cpp @@ -49,11 +49,11 @@ void IntColumnVector::close() void IntColumnVector::print(int rowCount) { - throw InvalidArgumentException ("not support print longcolumnvector."); - // for(int i = 0; i < rowCount; i++) { - // std::cout<(outputStream->getWritePos() - outputStream->getReadPos()); } -pixels::proto::ColumnChunkIndex ColumnWriter::getColumnChunkIndex() +const flatbuffers::Offset ColumnWriter::getColumnChunkIndex() { - return *columnChunkIndex; + return columnChunkIndex; } -std::shared_ptr ColumnWriter::getColumnChunkIndexPtr() +flatbuffers::Offset ColumnWriter::buildColumnChunkIndex(flatbuffers::FlatBufferBuilder& fbb, uint64_t chunkOffset, uint32_t chunkLength, bool littleEndian) { - return columnChunkIndex; + // Build pixel positions vector + auto positions = fbb.CreateVector(pixelPositions); + + // Rebuild pixel statistics from saved snapshots + std::vector> pixelStats; + pixelStats.reserve(pixelStatSnapshots.size()); + + /** + * Traverse statistical snapshots to serialize C++ objects in memory + * to flattbuffers format + */ + + for (auto& snapshot : pixelStatSnapshots) { + // snapShot.colStatObj.get() get original pointers of C++ objects(generated by Object API) + // The Pack method will recursively write all fields of the object into the fbb buffer. 
+ auto colStatOffset = pixels::fb::ColumnStatistic::Pack(fbb, snapshot.colStatObj.get()); + + auto pixelStatOffset = pixels::fb::CreatePixelStatistic(fbb, colStatOffset); + pixelStats.push_back(pixelStatOffset); + } + + auto statistics = fbb.CreateVector(pixelStats); + + // Build ColumnStatistic for the chunk + auto chunkStat = columnChunkStatRecorder.serialize(fbb); + + // Create ColumnChunkIndex + auto index = pixels::fb::CreateColumnChunkIndex( + fbb, + chunkOffset, // chunkOffset + chunkLength, // chunkLength + isNullOffset, + positions, + statistics, + littleEndian, // littleEndian + nullsPadding, + ISNULL_ALIGNMENT + ); + return index; } -pixels::proto::ColumnEncoding ColumnWriter::getColumnChunkEncoding() const + +const flatbuffers::Offset ColumnWriter::getColumnChunkEncoding(flatbuffers::FlatBufferBuilder &fbb) const { - pixels::proto::ColumnEncoding encoding; - encoding.set_kind(pixels::proto::ColumnEncoding::Kind::ColumnEncoding_Kind_NONE); + // Build a simple ColumnEncoding with NONE kind + auto encoding = pixels::fb::CreateColumnEncoding( + fbb, + pixels::fb::EncodingKind_NONE, + 0, // dictionarySize + 0 // cascadeEncoding + ); return encoding; } + void ColumnWriter::flush() { if (curPixelEleIndex > 0) { newPixel(); } - int isNullOffset = static_cast(outputStream->getWritePos()); + + isNullOffset = static_cast(outputStream->getWritePos()); if (ISNULL_ALIGNMENT != 0 && isNullOffset % ISNULL_ALIGNMENT != 0) { int alignBytes = ISNULL_ALIGNMENT - (isNullOffset % ISNULL_ALIGNMENT); outputStream->putBytes(const_cast(ISNULL_PADDING_BUFFER.data()), alignBytes); isNullOffset += alignBytes; } - columnChunkIndex->set_isnulloffset(isNullOffset); + outputStream->putBytes(isNullStream->getPointer() + isNullStream->getReadPos(), isNullStream->getWritePos() - isNullStream->getReadPos()); + + // Issue: align the entire column chunk to CHUNK_ALIGNMENT + // to ensure getColumnChunkSize() returns the correct aligned size + static const int CHUNK_ALIGNMENT = 
std::stoi(ConfigFactory::Instance().getProperty("column.chunk.alignment")); + static const std::vector CHUNK_PADDING_BUFFER(CHUNK_ALIGNMENT, 0); + + int chunkSize = static_cast(outputStream->getWritePos()); + if (CHUNK_ALIGNMENT != 0 && chunkSize % CHUNK_ALIGNMENT != 0) + { + int alignBytes = CHUNK_ALIGNMENT - (chunkSize % CHUNK_ALIGNMENT); + outputStream->putBytes(const_cast(CHUNK_PADDING_BUFFER.data()), alignBytes); + } } void ColumnWriter::newPixel() @@ -84,6 +142,7 @@ void ColumnWriter::newPixel() isNullStream->putBytes(const_cast(compacted.data()), compacted.size()); pixelStatRecorder.setHasNull(); } + curPixelPosition = static_cast(outputStream->getWritePos()); curPixelEleIndex = 0; curPixelVectorIndex = 0; @@ -91,11 +150,18 @@ void ColumnWriter::newPixel() columnChunkStatRecorder.merge(pixelStatRecorder); - pixels::proto::PixelStatistic pixelStat; - *pixelStat.mutable_statistic() = pixelStatRecorder.serialize(); - columnChunkIndex->add_pixelpositions(lastPixelPosition); - auto new_pixelstatistic = columnChunkIndex->add_pixelstatistics(); - *new_pixelstatistic = pixelStat; + // Add pixel position + pixelPositions.push_back(lastPixelPosition); + + + // Object API needed + auto colStatT = std::make_unique(); + + + // save the object in memory,No need to loop in flatbuffers for this. 
+ PixelStatSnapshot snapshot; + snapshot.colStatObj = std::move(colStatT); + pixelStatSnapshots.push_back(std::move(snapshot)); lastPixelPosition = curPixelPosition; pixelStatRecorder.reset(); @@ -106,8 +172,13 @@ void ColumnWriter::reset() { lastPixelPosition = 0; curPixelPosition = 0; - columnChunkIndex->Clear(); - columnChunkStat->Clear(); + + // Clear flatbuffers data + pixelPositions.clear(); + pixelStatSnapshots.clear(); // Clear snapshots instead of pixelStatistics + columnChunkIndex = 0; + columnChunkStat = nullptr; + pixelStatRecorder.reset(); columnChunkStatRecorder.reset(); outputStream->resetPosition(); @@ -125,13 +196,15 @@ ColumnWriter::ColumnWriter(std::shared_ptr type, : pixelStride(writerOption->getPixelsStride()), encodingLevel(writerOption->getEncodingLevel()), byteOrder(writerOption->getByteOrder()), - nullsPadding(false),// default is false - isNull(pixelStride, false) + nullsPadding(writerOption->isNullsPadding()),// default is false + isNull(pixelStride, false), + columnChunkIndex(0), + columnChunkStat(nullptr) { - outputStream = std::make_shared(); - isNullStream = std::make_shared(); - columnChunkIndex = std::make_shared(); - columnChunkIndex->set_littleendian(byteOrder == ByteOrder::PIXELS_LITTLE_ENDIAN); - columnChunkIndex->set_nullspadding(nullsPadding); - columnChunkIndex->set_isnullalignment(ISNULL_ALIGNMENT); + outputStream = std::make_shared(pixelStride); + isNullStream = std::make_shared(pixelStride); + + // Reserve space for pixel positions and statistics snapshots + pixelPositions.reserve(100); // reasonable initial capacity + pixelStatSnapshots.reserve(100); } diff --git a/cpp/pixels-core/lib/writer/DateColumnWriter.cpp b/cpp/pixels-core/lib/writer/DateColumnWriter.cpp index f5dd27379e..070f3e232f 100644 --- a/cpp/pixels-core/lib/writer/DateColumnWriter.cpp +++ b/cpp/pixels-core/lib/writer/DateColumnWriter.cpp @@ -48,7 +48,10 @@ int DateColumnWriter::write(std::shared_ptr vector, int size) if (columnVector->isNull[i]) { hasNull = 
true; - encodingUtils.writeIntLE(outputStream, 0); + if (nullsPadding) + { + encodingUtils.writeIntLE(outputStream, 0); + } } else { if (byteOrder == ByteOrder::PIXELS_LITTLE_ENDIAN) diff --git a/cpp/pixels-core/lib/writer/DecimalColumnWriter.cpp b/cpp/pixels-core/lib/writer/DecimalColumnWriter.cpp index 298effa639..17c3eab452 100644 --- a/cpp/pixels-core/lib/writer/DecimalColumnWriter.cpp +++ b/cpp/pixels-core/lib/writer/DecimalColumnWriter.cpp @@ -49,7 +49,10 @@ int DecimalColumnWriter::write(std::shared_ptr vector, int size) if (columnVector->isNull[i]) { hasNull = true; - encodingUtils.writeLongLE(outputStream, 0L); + if (nullsPadding) + { + encodingUtils.writeLongLE(outputStream, 0L); + } } else { if (byteOrder == ByteOrder::PIXELS_LITTLE_ENDIAN) diff --git a/cpp/pixels-core/lib/writer/IntColumWriter.cpp b/cpp/pixels-core/lib/writer/IntColumWriter.cpp index 8ec0c6ea24..b0f038fe9f 100644 --- a/cpp/pixels-core/lib/writer/IntColumWriter.cpp +++ b/cpp/pixels-core/lib/writer/IntColumWriter.cpp @@ -41,7 +41,6 @@ IntColumnWriter::IntColumnWriter( int IntColumnWriter::write(std::shared_ptr vector, int size) { - std::cout << "In IntColumnWriter" << std::endl; auto columnVector = std::static_pointer_cast (vector); if (!columnVector) { @@ -159,17 +158,14 @@ void IntColumnWriter::newPixel() ColumnWriter::newPixel (); } -pixels::proto::ColumnEncoding IntColumnWriter::getColumnChunkEncoding() const +const flatbuffers::Offset IntColumnWriter::getColumnChunkEncoding(flatbuffers::FlatBufferBuilder& fbb) const { - pixels::proto::ColumnEncoding columnEncoding; + if (runlengthEncoding) { - columnEncoding.set_kind ( - pixels::proto::ColumnEncoding::Kind::ColumnEncoding_Kind_RUNLENGTH); + return pixels::fb::CreateColumnEncoding(fbb,pixels::fb::EncodingKind_RUNLENGTH); } else { - columnEncoding.set_kind ( - pixels::proto::ColumnEncoding::Kind::ColumnEncoding_Kind_NONE); + return pixels::fb::CreateColumnEncoding(fbb,pixels::fb::EncodingKind_NONE); } - return columnEncoding; -} \ No 
newline at end of file +} diff --git a/cpp/pixels-core/lib/writer/LongColumnWriter.cpp b/cpp/pixels-core/lib/writer/LongColumnWriter.cpp index 9dd872e0e9..1c54d1a6c8 100644 --- a/cpp/pixels-core/lib/writer/LongColumnWriter.cpp +++ b/cpp/pixels-core/lib/writer/LongColumnWriter.cpp @@ -40,7 +40,6 @@ LongColumnWriter::LongColumnWriter( int LongColumnWriter::write(std::shared_ptr vector, int size) { - std::cout << "In LongColumnWriter" << std::endl; auto columnVector = std::static_pointer_cast(vector); if (!columnVector) { @@ -125,7 +124,7 @@ void LongColumnWriter::newPixel() if (runlengthEncoding) { std::vector buffer(curPixelVectorIndex * - sizeof(int)); + sizeof(long)); int resLen; encoder->encode(curPixelVector.data(), buffer.data(), curPixelVectorIndex, resLen); @@ -136,20 +135,20 @@ void LongColumnWriter::newPixel() EncodingUtils encodingUtils; curVecPartitionBuffer = - std::make_shared(curPixelVectorIndex * sizeof(int)); + std::make_shared(curPixelVectorIndex * sizeof(long)); if (byteOrder == ByteOrder::PIXELS_LITTLE_ENDIAN) { for (int i = 0; i < curPixelVectorIndex; i++) { encodingUtils.writeLongLE(curVecPartitionBuffer, - (int) curPixelVector[i]); + (int64_t) curPixelVector[i]); } } else { for (int i = 0; i < curPixelVectorIndex; i++) { encodingUtils.writeLongBE(curVecPartitionBuffer, - (int) curPixelVector[i]); + (int64_t) curPixelVector[i]); } } @@ -160,17 +159,14 @@ void LongColumnWriter::newPixel() ColumnWriter::newPixel(); } -pixels::proto::ColumnEncoding LongColumnWriter::getColumnChunkEncoding() const +const flatbuffers::Offset LongColumnWriter::getColumnChunkEncoding(flatbuffers::FlatBufferBuilder& fbb) const { - pixels::proto::ColumnEncoding columnEncoding; + if (runlengthEncoding) { - columnEncoding.set_kind( - pixels::proto::ColumnEncoding::Kind::ColumnEncoding_Kind_RUNLENGTH); + return pixels::fb::CreateColumnEncoding(fbb,pixels::fb::EncodingKind_RUNLENGTH); } else { - columnEncoding.set_kind( - 
pixels::proto::ColumnEncoding::Kind::ColumnEncoding_Kind_NONE); + return pixels::fb::CreateColumnEncoding(fbb,pixels::fb::EncodingKind_NONE); } - return columnEncoding; -} \ No newline at end of file +} diff --git a/cpp/pixels-core/lib/writer/TimestampColumnWriter.cpp b/cpp/pixels-core/lib/writer/TimestampColumnWriter.cpp index c5b6f8d348..2e3a7cc962 100644 --- a/cpp/pixels-core/lib/writer/TimestampColumnWriter.cpp +++ b/cpp/pixels-core/lib/writer/TimestampColumnWriter.cpp @@ -49,7 +49,10 @@ int TimestampColumnWriter::write(std::shared_ptr vector, int size) if (columnVector->isNull[i]) { hasNull = true; - encodingUtils.writeLongLE(outputStream, 0L); + if (nullsPadding) + { + encodingUtils.writeLongLE(outputStream, 0L); + } } else { if (byteOrder == ByteOrder::PIXELS_LITTLE_ENDIAN) diff --git a/cpp/pixels-cpp.properties b/cpp/pixels-cpp.properties index 0a65c450ac..82f6be2402 100644 --- a/cpp/pixels-cpp.properties +++ b/cpp/pixels-cpp.properties @@ -44,3 +44,7 @@ column.chunk.alignment=32 # for DuckDB, it is only effective when column.chunk.alignment also meets the alignment of the isNull bitmap isnull.bitmap.alignment=8 + + +# pixels.doublebuffer +pixels.doublebuffer=false diff --git a/cpp/pixels-duckdb/PixelsScanFunction.cpp b/cpp/pixels-duckdb/PixelsScanFunction.cpp index c8aec8b7ef..1eb6afad2c 100644 --- a/cpp/pixels-duckdb/PixelsScanFunction.cpp +++ b/cpp/pixels-duckdb/PixelsScanFunction.cpp @@ -29,7 +29,7 @@ namespace duckdb { -bool PixelsScanFunction::enable_filter_pushdown = false; +bool PixelsScanFunction::enable_filter_pushdown = true; static idx_t PixelsScanGetBatchIndex(ClientContext &context, const FunctionData *bind_data_p, LocalTableFunctionState *local_state, @@ -63,8 +63,8 @@ TableFunctionSet PixelsScanFunction::GetFunctionSet() TableFunction table_function("pixels_scan", {LogicalType::VARCHAR}, PixelsScanImplementation, PixelsScanBind, PixelsScanInitGlobal, PixelsScanInitLocal); table_function.projection_pushdown = true; -// 
table_function.filter_pushdown = true; - //table_function.filter_prune = true; + table_function.filter_pushdown = true; + // table_function.filter_prune = true; enable_filter_pushdown = table_function.filter_pushdown; MultiFileReader::AddParameters(table_function); table_function.cardinality = PixelsCardinality; @@ -447,7 +447,7 @@ void PixelsScanFunction::TransformDuckdbChunk(PixelsReadLocalState &data, vectorizedRowBatch->increment(thisOutputChunkRows); } -bool PixelsScanFunction::PixelsParallelStateNext(ClientContext &context, const PixelsReadBindData &bind_data, +bool PixelsScanFunction::PixelsParallelStateNext(ClientContext &context, PixelsReadBindData &bind_data, PixelsReadLocalState &scan_data, PixelsReadGlobalState ¶llel_state, bool is_init_state) @@ -485,7 +485,7 @@ bool PixelsScanFunction::PixelsParallelStateNext(ClientContext &context, const P parallel_lock.unlock(); return false; } - + bind_data.curFileId++; scan_data.curr_file_index = scan_data.next_file_index; scan_data.curr_batch_index = scan_data.next_batch_index; scan_data.next_file_index = parallel_state.file_index.at(scan_data.deviceID); @@ -501,7 +501,12 @@ bool PixelsScanFunction::PixelsParallelStateNext(ClientContext &context, const P scan_data.currReader->close(); } - ::BufferPool::Switch(); + if (ConfigFactory::Instance().getProperty("pixels.doublebuffer")=="true") + { + ::BufferPool::Switch(); + } + // double/single buffer + scan_data.currReader = scan_data.nextReader; scan_data.currPixelsRecordReader = scan_data.nextPixelsRecordReader; // asyncReadComplete is not invoked in the first run (is_init_state = true) @@ -509,6 +514,12 @@ bool PixelsScanFunction::PixelsParallelStateNext(ClientContext &context, const P { auto currPixelsRecordReader = std::static_pointer_cast( scan_data.currPixelsRecordReader); + if (ConfigFactory::Instance().getProperty("pixels.doublebuffer")=="false") + { + //single buffer + currPixelsRecordReader->read(); + } + currPixelsRecordReader->asyncReadComplete((int) 
scan_data.column_names.size()); } if (scan_data.next_file_index < StorageInstance->getFileSum(scan_data.deviceID)) @@ -526,7 +537,13 @@ bool PixelsScanFunction::PixelsParallelStateNext(ClientContext &context, const P scan_data.nextPixelsRecordReader = scan_data.nextReader->read(option); auto nextPixelsRecordReader = std::static_pointer_cast( scan_data.nextPixelsRecordReader); - nextPixelsRecordReader->read(); + + if (ConfigFactory::Instance().getProperty("pixels.doublebuffer")=="true") + { + //double buffer + nextPixelsRecordReader->read(); + } + } else { scan_data.nextReader = nullptr; diff --git a/cpp/pixels-duckdb/duckdb b/cpp/pixels-duckdb/duckdb index c3dc6d34c9..e0af5da3aa 160000 --- a/cpp/pixels-duckdb/duckdb +++ b/cpp/pixels-duckdb/duckdb @@ -1 +1 @@ -Subproject commit c3dc6d34c905bc44f311bf670b1bbddef1c0c776 +Subproject commit e0af5da3aaddf16c7b54bc1acaba94d5c2bcdd73 diff --git a/cpp/pixels.fbs b/cpp/pixels.fbs new file mode 100644 index 0000000000..c23d59f1a9 --- /dev/null +++ b/cpp/pixels.fbs @@ -0,0 +1,207 @@ +// File format definition of Pixels (FlatBuffers version) + +namespace pixels.fb; + +enum CompressionKind : byte { + NONE = 0, + ZLIB = 1, + SNAPPY = 2, + LZO = 3, + LZ4 = 4, + ZSTD = 5 +} + +enum TypeKind : byte { + BOOLEAN = 0, + BYTE = 1, + SHORT = 2, + INT = 3, + LONG = 4, + FLOAT = 5, + DOUBLE = 6, + STRING = 7, + BINARY = 8, + TIMESTAMP = 9, + ARRAY = 10, + MAP = 11, + STRUCT = 12, + VARBINARY = 13, + DECIMAL = 14, + DATE = 15, + VARCHAR = 16, + CHAR = 17, + TIME = 18, + VECTOR = 19 +} + +enum EncodingKind : byte { + NONE = 0, + RUNLENGTH = 1, + DICTIONARY = 2 +} + +// --- Statistics Tables --- + +table IntegerStatistic { + minimum: long; + maximum: long; + sum: long; +} + +table Integer128Statistic { + minimum_high: uint64; + minimum_low: uint64; + maximum_high: long; + maximum_low: long; +} + +table DoubleStatistic { + minimum: double; + maximum: double; + sum: double; +} + +table StringStatistic { + minimum: string; + maximum: string; + 
sum: long; +} + +table BucketStatistic { + count: [uint64]; +} + +table TimestampStatistic { + minimum: long; + maximum: long; +} + +table DateStatistic { + minimum: int; + maximum: int; +} + +table TimeStatistic { + minimum: int; + maximum: int; +} + +table BinaryStatistic { + sum: long; +} + +table ColumnStatistic { + numberOfValues: uint64; + intStatistics: IntegerStatistic; + doubleStatistics: DoubleStatistic; + stringStatistics: StringStatistic; + bucketStatistics: BucketStatistic; + binaryStatistics: BinaryStatistic; + timestampStatistics: TimestampStatistic; + dateStatistics: DateStatistic; + timeStatistics: TimeStatistic; + int128Statistics: Integer128Statistic; + hasNull: bool = false; +} + +table PixelStatistic { + statistic: ColumnStatistic; +} + +// --- Schema and Metadata --- + +table Type { + kind: TypeKind = BOOLEAN; + name: string; + subtypes: [uint]; + maximumLength: uint; + precision: uint; + scale: uint; + dimension: uint; +} + +table PartitionInformation { + columnIds: [uint]; + hashValue: int; +} + +table RowGroupInformation { + footerOffset: uint64; + dataLength: uint; + footerLength: uint; + numberOfRows: uint; + partitionInfo: PartitionInformation; +} + +table RowGroupStatistic { + columnChunkStats: [ColumnStatistic]; + hiddenColumnChunkStats: ColumnStatistic; +} + +// --- Index and Encoding --- + +table ColumnChunkIndex { + chunkOffset: uint64; + chunkLength: uint; + isNullOffset: uint; + pixelPositions: [uint]; + pixelStatistics: [PixelStatistic]; + littleEndian: bool = true; + nullsPadding: bool = false; + isNullAlignment: uint; +} + +table ColumnEncoding { + kind: EncodingKind = NONE; + dictionarySize: uint; + cascadeEncoding: ColumnEncoding; +} + +table RowGroupIndex { + columnChunkIndexEntries: [ColumnChunkIndex]; + hiddenColumnChunkIndexEntry: ColumnChunkIndex; +} + +table RowGroupEncoding { + columnChunkEncodings: [ColumnEncoding]; + hiddenColumnChunkEncoding: ColumnEncoding; +} + +table RowGroupFooter { + rowGroupIndexEntry: 
RowGroupIndex; + rowGroupEncoding: RowGroupEncoding; +} + +// --- File Tail Components --- + +table PostScript { + version: uint; + contentLength: uint64; + numberOfRows: uint; + compression: CompressionKind = NONE; + compressionBlockSize: uint; + pixelStride: uint; + writerTimezone: string; + partitioned: bool = false; + columnChunkAlignment: uint; + hasHiddenColumn: bool = false; + magic: string; // "PIXELS" +} + +table Footer { + types: [Type]; + columnStats: [ColumnStatistic]; + rowGroupInfos: [RowGroupInformation]; + rowGroupStats: [RowGroupStatistic]; + hiddenType: Type; + hiddenColumnStats: ColumnStatistic; +} + +table FileTail { + footer: Footer; + postscript: PostScript; + footerLength: uint; + postscriptLength: uint; +} + +root_type FileTail; diff --git a/cpp/testcase/README-zh.md b/cpp/testcase/README-zh.md new file mode 100644 index 0000000000..6354ea3450 --- /dev/null +++ b/cpp/testcase/README-zh.md @@ -0,0 +1,40 @@ +# 测试 +本目录存放了所有测试 + +## 运行脚本 +`process_sqls.py` 运行查询,需要传入benchmark参数指定要运行的benchmark,也需要指定要运行的查询 +```bash +usage: process_sqls.py [-h] [--runs RUNS] [--duckdb-bin DUCKDB_BIN] [--sql-dir SQL_DIR] + [--output-csv OUTPUT_CSV] [--wait-after-run WAIT_AFTER_RUN] + [--threads THREADS] [--benchmark BENCHMARK] [--benchmark-json BENCHMARK_JSON] + +DuckDB ClickBench Batch Test Script (Multi-column CSV, ensures resource release) + +options: + -h, --help show this help message and exit + --runs RUNS Number of runs per SQL file (default: 3) + --duckdb-bin DUCKDB_BIN + Path to duckdb executable + --sql-dir SQL_DIR Directory containing SQL files (only processes .sql files starting with 'q') + --output-csv OUTPUT_CSV + Path to output result CSV + --wait-after-run WAIT_AFTER_RUN + Seconds to wait after each run (ensures resource release, default: 2s) + --threads THREADS Number of threads to use in DuckDB (default: 96) + --benchmark BENCHMARK + Name of benchmark to use (must exist in benchmark JSON, e.g. 
clickbench- + pixels-e0) + --benchmark-json BENCHMARK_JSON + Path to benchmark configuration JSON file (default: ./benchmark.json) + +``` +## + +## I/O粒度测试 +`blk_stat.py`在执行`process_sqls.py`的同时,调用blktrace和blkprase读取底层块设备的I/O粒度,同时也需要注意运行的查询由`process_sql.py`内置 + +## 单/双buffer性能测试 +`single_doublebuffer_async_sync_test.py` 设置运行参数,执行单双buffer测试 + +## perf实验 + diff --git a/cpp/testcase/benchmark.json b/cpp/testcase/benchmark.json new file mode 100644 index 0000000000..86f8577dbe --- /dev/null +++ b/cpp/testcase/benchmark.json @@ -0,0 +1,15 @@ +{ + "tpch-pixels-e0":"", + "tpch-pixels-e1":"", + "tpch-pixels-e2":"", + "tpch-parquet-e0":"", + "tpch-parquet-e2":"", + "clickbench-parquet-e2":"", + "clickbench-parquet-e0":"CREATE VIEW hits AS SELECT * FROM parquet_scan([\n \"/data/9a3-01/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-02/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-03/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-04/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-05/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-06/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-07/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-08/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-09/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-10/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-11/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-12/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-13/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-14/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-15/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-16/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-17/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-18/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-19/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-20/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-21/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-22/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-23/clickbench/parquet-e0/hits/*\",\n \"/data/9a3-24/clickbench/parquet-e0/hits/*\"\n ]\n);", + "clickbench-pixels-e2":"", + 
"clickbench-pixels-e0-24ssd":"CREATE VIEW hits AS SELECT * FROM pixels_scan([\n \"/data/9a3-01/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-02/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-03/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-04/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-05/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-06/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-07/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-08/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-09/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-10/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-11/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-12/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-13/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-14/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-15/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-16/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-17/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-18/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-19/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-20/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-21/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-22/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-23/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-24/clickbench/pixels-e0-fb/*\"\n ]\n);", + "clickbench-pixels-e1":"", + "clickbench-pixels-e0-1ssd": "CREATE VIEW hits AS SELECT * FROM pixels_scan([\"/data/9a3-01/clickbench/pixels-e0-fb/*\"]);\n", + "clickbench-pixels-e0-6ssd": "CREATE VIEW hits AS SELECT * FROM pixels_scan([\n \"/data/9a3-01/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-02/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-03/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-04/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-05/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-06/clickbench/pixels-e0-fb/*\"\n ]\n);", + "clickbench-pixels-e0-12ssd": "CREATE VIEW hits AS SELECT * FROM pixels_scan([\n \"/data/9a3-01/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-02/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-03/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-04/clickbench/pixels-e0-fb/*\",\n 
\"/data/9a3-05/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-06/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-07/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-08/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-09/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-10/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-11/clickbench/pixels-e0-fb/*\",\n \"/data/9a3-12/clickbench/pixels-e0-fb/*\"\n ]\n);" +} \ No newline at end of file diff --git a/cpp/testcase/blk_stat.py b/cpp/testcase/blk_stat.py new file mode 100644 index 0000000000..cc53621d17 --- /dev/null +++ b/cpp/testcase/blk_stat.py @@ -0,0 +1,96 @@ +import subprocess +import time +import re +import csv +import argparse +from collections import Counter +import os # <-- 导入 os 模块 + + +def clear_page_cache(): + """Clear Linux page cache to ensure fair benchmarking""" + try: + print("🧹 Clearing Linux page cache...") + # Synchronize filesystem caches + subprocess.run(["sync"], check=True) + # Drop caches (3 clears pagecache, dentries, and inodes) + subprocess.run(["sudo", "bash", "-c", "echo 3 > /proc/sys/vm/drop_caches"], check=True) + print("✅ Page cache cleared successfully") + except subprocess.CalledProcessError as e: + print(f"⚠️ Failed to clear page cache: {e}") + + +# -------------------- 1️⃣ Parse Command Line Arguments -------------------- +parser = argparse.ArgumentParser(description="Monitor I/O granularity using blktrace and blkparse") +parser.add_argument("--benchmark", required=True, help="Benchmark name, used as output file prefix") +args = parser.parse_args() +benchmark_name = args.benchmark + +# -------------------- 2️⃣ Define Regex Pattern -------------------- +# Pattern for capturing I/O size (in sectors) and the process name +# The current pattern targets 'G' (Get request) operations. 
+pattern = re.compile(r"\sG\s+RA?\s+\d+\s+\+\s+(\d+)\s+\[(duckdb|iou-sqp-\d+)\]") + +# -------------------- 3️⃣ Start blktrace and blkparse Pipeline -------------------- +# blktrace monitors block device I/O on nvme0n1 and outputs raw data to stdout +blktrace_cmd = ["sudo", "blktrace", "-d", "/dev/nvme0n1","-o", "-"] +# blkparse reads raw data from stdin ('-') +blkparse_cmd = ["blkparse", "-i", "-"] + +p1 = subprocess.Popen(blktrace_cmd, stdout=subprocess.PIPE) +p2 = subprocess.Popen(blkparse_cmd, stdin=p1.stdout, stdout=subprocess.PIPE, text=True) + +# -------------------- 4️⃣ Clear Page Cache -------------------- +clear_page_cache() + +# -------------------- 5️⃣ Start Benchmark Script (process_sqls.py) -------------------- +proc = subprocess.Popen(["python3", "process_sqls.py", "--runs", "1", "--benchmark", benchmark_name]) + +# -------------------- 6️⃣ Real-time I/O Granularity Collection -------------------- +counter = Counter() +print(f"📊 Collecting I/O traces while benchmark '{benchmark_name}' is running...") + +try: + # Read blkparse output line by line + for line in p2.stdout: + # Search for I/O size and process name using the defined pattern + match = pattern.search(line) + + if match: + # Group 1 is the I/O size in sectors + size = int(match.group(1)) + counter[size] += 1 + + # Check if the benchmark process (process_sqls) has finished + if proc.poll() is not None: + break +except KeyboardInterrupt: + print("⏹️ Stopped manually") + +# -------------------- 7️⃣ Terminate blktrace/blkparse -------------------- +p1.terminate() +p2.terminate() + +# -------------------- 8️⃣ Create Output Directory and Save Results -------------------- +output_dir = "io_results" +output_filename = os.path.join(output_dir, f"io_granularity_stats-{benchmark_name}.csv") # 使用 os.path.join 组合路径 + +# --- 检查并创建目录 --- +if not os.path.exists(output_dir): + print(f"📁 Output directory '{output_dir}' not found. 
Creating it...") + # recursively create directories if they don't exist + os.makedirs(output_dir) +# ---------------------- + +with open(output_filename, "w", newline="") as f: + writer = csv.writer(f) + # Write header: IO size in sectors, count of requests, and IO size converted to bytes (512 bytes/sector) + writer.writerow(["IO_Size_Sectors", "Count", "IO_Size_Bytes"]) + # Write sorted results + for s, c in sorted(counter.items()): + writer.writerow([s, c, s * 512]) + +print(f"✅ Results saved to {output_filename}") + + + diff --git a/cpp/testcase/clickbench-gen/clickbench-gen.sh b/cpp/testcase/clickbench-gen/clickbench-gen.sh new file mode 100755 index 0000000000..8054bd88ba --- /dev/null +++ b/cpp/testcase/clickbench-gen/clickbench-gen.sh @@ -0,0 +1,11 @@ +CUR_DIR=$(pwd) +/home/whz/test/pixels/cpp/build/release/extension/pixels/pixels-cli/pixels-cli < \ No newline at end of file diff --git a/cpp/testcase/generate_flamegraphs.sh b/cpp/testcase/generate_flamegraphs.sh new file mode 100755 index 0000000000..4676d81e4f --- /dev/null +++ b/cpp/testcase/generate_flamegraphs.sh @@ -0,0 +1,135 @@ +#!/bin/bash + +# ==================================================== +# Script Function: Batch generation of CPU/I/O/Scheduling related Flame Graphs (Perf + FlameGraph) +# Analysis Events: 1. cpu-clock, 2. page-faults, 3. branch-misses, 4. sched_switch/sched_stat_wait +# Distinction: Each event uses a different color palette (hot, mem, perf, chain) +# Usage: $0 +# Example: $0 test_q01.sql ../../build/release/duckdb ./results +# ==================================================== + +# --- Configure FlameGraph Path --- +FLAMEGRAPH_DIR="$HOME/FlameGraph" +STACKCOLLAPSE="${FLAMEGRAPH_DIR}/stackcollapse-perf.pl" +FLAMEGRAPH_PL="${FLAMEGRAPH_DIR}/flamegraph.pl" + +# ---------------------------------------------------- +# 1. Check Environment and Arguments +# ---------------------------------------------------- + +# Check FlameGraph dependencies +if [ ! -x "$STACKCOLLAPSE" ] || [ ! 
-x "$FLAMEGRAPH_PL" ]; then + echo "Error: Cannot find or execute FlameGraph tools (stackcollapse-perf.pl or flamegraph.pl)." + echo "Please ensure the FlameGraph repository is cloned to $HOME/FlameGraph." + exit 1 +fi + +# Check argument count +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " + echo "Example: $0 test_q01.sql ../build/release/duckdb ./results" + exit 1 +fi + +# Receive arguments +SQL_FILE=$1 +DUCKDB_BINARY=$2 +OUTPUT_DIR=$3 + +# Check if DuckDB executable exists +if [ ! -x "$DUCKDB_BINARY" ]; then + echo "Error: Cannot find or execute ${DUCKDB_BINARY}" + exit 1 +fi + +# Create output directory (if it doesn't exist) +mkdir -p "$OUTPUT_DIR" +if [ ! -d "$OUTPUT_DIR" ]; then + echo "Error: Cannot create output directory ${OUTPUT_DIR}" + exit 1 +fi + +# Extract filename as prefix (e.g., test_q01) +FILENAME_PREFIX=$(basename "$SQL_FILE" .sql) + +echo "--- Starting Query Analysis: ${FILENAME_PREFIX} ---" +echo "Using executable: ${DUCKDB_BINARY}" +echo "Output directory: ${OUTPUT_DIR}" + +# ---------------------------------------------------- +# 2. 
Core Function Definition +# ---------------------------------------------------- + +# Function: Generate Flame Graph for specified event +# $1: Event Name (perf event name, e.g., cpu-clock, page-faults) +# $2: Friendly Event Name (e.g., CPU Time) +# $3: Output File Suffix (e.g., cpu_time, page_faults) +# $4: Color Palette Name (e.g., hot, mem, perf, chain) +function generate_flamegraph { + local EVENT_NAME="$1" + local FRIENDLY_NAME="$2" + local SUFFIX="$3" + local COLOR_PALETTE="$4" + + echo "" + echo "----------------------------------------------------" + echo "Start recording event: ${FRIENDLY_NAME} (${EVENT_NAME}) - Color: ${COLOR_PALETTE}" + echo "----------------------------------------------------" + + local DATA_FILE="${OUTPUT_DIR}/${FILENAME_PREFIX}_${SUFFIX}.data" + local PERF_TXT="${OUTPUT_DIR}/${FILENAME_PREFIX}_${SUFFIX}.perf.txt" + local FOLDED_FILE="${OUTPUT_DIR}/${FILENAME_PREFIX}_${SUFFIX}.folded" + local SVG_FILE="${OUTPUT_DIR}/${FILENAME_PREFIX}_${SUFFIX}.svg" + + # --- Record Data --- + # -e specifies event, --call-graph=dwarf captures call stack, -g enables call graph + sudo -E perf record --call-graph=dwarf -e "$EVENT_NAME" -g -o "$DATA_FILE" -F 10\ + -- "$DUCKDB_BINARY" < "$SQL_FILE" + + if [ $? -ne 0 ]; then + echo "[ERROR] perf record failed. Check permissions or perf installation." + return 1 + fi + + # --- Convert Data and Generate Flame Graph --- + echo "Converting ${FRIENDLY_NAME} data and generating Flame Graph..." + sudo perf script -i "$DATA_FILE" > "$PERF_TXT" + "$STACKCOLLAPSE" "$PERF_TXT" > "$FOLDED_FILE" + + # Add --color argument to specify color palette + "$FLAMEGRAPH_PL" --title="${FILENAME_PREFIX} ${FRIENDLY_NAME} Hotspots" --countname="$FRIENDLY_NAME" \ + --color="$COLOR_PALETTE" \ + "$FOLDED_FILE" > "$SVG_FILE" + echo "✅ ${FRIENDLY_NAME} Flame Graph generated: ${SVG_FILE}" + + # --- Cleanup Intermediate Files --- + echo "Cleaning up ${FRIENDLY_NAME} intermediate files..." 
+ rm -f "$PERF_TXT" "$FOLDED_FILE" + sudo rm -f "$DATA_FILE" +} + +# ---------------------------------------------------- +# 3. Run Analysis (4 Flame Graphs total) +# ---------------------------------------------------- + +# 1. CPU Time Analysis (Standard CPU bottleneck analysis) +generate_flamegraph "cpu-clock" "CPU Time" "cpu_time" "hot" + +# 2. I/O Bottleneck Analysis (Related to memory access) +generate_flamegraph "page-faults" "Page Faults" "page_faults" "mem" + +# 3. Computational Efficiency Analysis (Related to pipeline) +generate_flamegraph "branch-misses" "Branch Misses" "branch_misses" "hot" + +# 4. Scheduling/Wait Bottleneck Analysis (Related to lock contention, context switching) +# Note: This event requires two perf event names +generate_flamegraph "sched:sched_switch,sched:sched_stat_wait" "Thread Scheduling" "sched" "chain" + + +# ---------------------------------------------------- +# 4. Task Summary +# ---------------------------------------------------- +echo "" +echo "--- Task Complete ---" +echo "Final result files (SVG/HTML) are in the ${OUTPUT_DIR} directory:" +find "$OUTPUT_DIR" -name "${FILENAME_PREFIX}_*.svg" \ No newline at end of file diff --git a/cpp/testcase/pixels-cli-tests/Test.sh b/cpp/testcase/pixels-cli-tests/Test.sh new file mode 100755 index 0000000000..bf13674cd6 --- /dev/null +++ b/cpp/testcase/pixels-cli-tests/Test.sh @@ -0,0 +1,4 @@ +python gen_complex_data.py +../../build/release/extension/pixels/pixels-cli/pixels-cli << EOF +LOAD -o /home/whz/test/pixels/cpp/testcase/pixels-cli-tests/test_origin -t /home/whz/test/pixels/cpp/testcase/pixels-cli-tests/test_target -s /home/whz/test/pixels/cpp/testcase/pixels-cli-tests/complex_test.schema -n 1000 -r , +EOF \ No newline at end of file diff --git a/cpp/testcase/pixels-cli-tests/complex_test.schema b/cpp/testcase/pixels-cli-tests/complex_test.schema new file mode 100644 index 0000000000..89fbcfed9e --- /dev/null +++ b/cpp/testcase/pixels-cli-tests/complex_test.schema @@ -0,0 +1 @@ 
# ==== file: cpp/testcase/pixels-cli-tests/complex_test.schema ====
# NOTE(review): the schema body was truncated in this chunk — only "struct"
# survives (the angle-bracketed column list was stripped).  The generator
# below emits 6 columns (id, index, name, birth, updated, price); confirm the
# full "struct<...>" definition against the original repository file.
struct

# ==== file: cpp/testcase/pixels-cli-tests/gen_complex_data.py ====
import os
import random
from datetime import datetime, timedelta

# Create output directory
os.makedirs("test_origin", exist_ok=True)

def random_date(start_year=1970, end_year=2023):
    # Uniformly random calendar date between Jan 1 of start_year and
    # Dec 31 of end_year (inclusive of both endpoints by day count).
    start = datetime(start_year, 1, 1)
    end = datetime(end_year, 12, 31)
    return start + timedelta(days=random.randint(0, (end - start).days))

def maybe_null(value, null_probability=0.1):
    """Return empty string (represents null) with given probability, otherwise return value"""
    if random.random() < null_probability:
        return "\\N"  # Common CSV representation for null
    return str(value)

# Configuration
NUM_ROWS = 100
NULL_PROBABILITY = 0.15  # 15% chance of null for each nullable column
OUTPUT_FILE = "test_origin/complex_data_with_nulls.csv"

with open(OUTPUT_FILE, "w") as f:
    for i in range(NUM_ROWS):
        id_val = i  # Keep id non-null as it's often a primary key
        index_val = 1000000 + i
        name_val = f"user_{i}"
        birth_val = random_date().strftime("%Y-%m-%d")
        updated_val = (datetime.now() - timedelta(minutes=i)).strftime("%Y-%m-%d %H:%M:%S")
        price_val = round(19.99 + i, 2)

        # Apply null probability to each column (except id)
        row = [
            str(id_val),                                # id: keep non-null
            maybe_null(index_val, NULL_PROBABILITY),    # index: nullable
            maybe_null(name_val, NULL_PROBABILITY),     # name: nullable
            maybe_null(birth_val, NULL_PROBABILITY),    # birth: nullable
            maybe_null(updated_val, NULL_PROBABILITY),  # updated: nullable
            maybe_null(price_val, NULL_PROBABILITY)     # price: nullable
        ]
        f.write(",".join(row) + "\n")

print(f"Complex test data with nulls generated in {OUTPUT_FILE}")
print(f"Total rows: {NUM_ROWS}, Null probability: {NULL_PROBABILITY*100}%")

# Also generate a version without nulls for comparison
OUTPUT_FILE_NO_NULLS = "test_origin/complex_data2.csv"
with open(OUTPUT_FILE_NO_NULLS, "w") as f:
    for i in range(NUM_ROWS):
        id_val = i
        index_val = 1000000 + i
        name_val = f"user_{i}"
        birth_val = random_date().strftime("%Y-%m-%d")
        updated_val = (datetime.now() - timedelta(minutes=i)).strftime("%Y-%m-%d %H:%M:%S")
        price_val = round(19.99 + i, 2)

        row = [
            str(id_val), str(index_val),
            name_val, birth_val, updated_val, str(price_val)
        ]
        f.write(",".join(row) + "\n")

print(f"Complex test data (no nulls) generated in {OUTPUT_FILE_NO_NULLS}")

# ==== file: cpp/testcase/process_sqls.py (top of file) ====
import os
import re
import subprocess
import csv
import time
import psutil
import json  # Added: For parsing benchmark configuration files
from typing import List
import argparse

# -------------------------- 1. Basic Configuration (Added default benchmark JSON path) --------------------------
# Default path to benchmark configuration file (can be overridden via CLI parameter)
DEFAULT_BENCHMARK_JSON = "./benchmark.json"


def clear_page_cache():
    """Clear Linux page cache to ensure fair benchmarking"""
    # sync flushes dirty pages first; "echo 3 > drop_caches" then evicts page
    # cache, dentries and inodes.  Requires sudo; failure is logged, not fatal.
    try:
        print("🧹 Clearing Linux page cache...")
        subprocess.run(["sync"], check=True)
        subprocess.run(["sudo", "bash", "-c", "echo 3 > /proc/sys/vm/drop_caches"], check=True)
        print("✅ Page cache cleared successfully")
    except subprocess.CalledProcessError as e:
        print(f"⚠️ Failed to clear page cache: {e}")
# -------------------------- 2. CLI Argument Parsing (Added benchmark-related parameters) --------------------------
def parse_args():
    """Build and parse the command-line interface for the batch benchmark run."""
    parser = argparse.ArgumentParser(description="DuckDB ClickBench Batch Test Script (Multi-column CSV, ensures resource release)")
    parser.add_argument(
        "--runs",
        type=int,
        default=3,
        help="Number of runs per SQL file (default: 3)"
    )
    parser.add_argument(
        "--duckdb-bin",
        type=str,
        default="/home/whz/test/pixels/cpp/build/release/duckdb",
        help="Path to duckdb executable"
    )
    parser.add_argument(
        "--sql-dir",
        type=str,
        default="/home/whz/test/pixels/cpp/pixels-duckdb/duckdb/benchmark/clickbench/queries-test",
        help="Directory containing SQL files (only processes .sql files starting with 'q')"
    )
    parser.add_argument(
        "--output-csv",
        type=str,
        default="/home/whz/test/pixels/cpp/duckdb_benchmark_result.csv",
        help="Path to output result CSV"
    )
    parser.add_argument(
        "--wait-after-run",
        type=float,
        default=2.0,
        help="Seconds to wait after each run (ensures resource release, default: 2s)"
    )
    parser.add_argument(
        "--threads",
        type=int,
        default=96,
        help="Number of threads to use in DuckDB (default: 96)"
    )
    parser.add_argument(
        "--benchmark",
        type=str,
        default="clickbench-pixels-e0-1ssd",
        help="Name of benchmark to use (must exist in benchmark JSON, e.g. clickbench-pixels-e0)"
    )
    parser.add_argument(
        "--benchmark-json",
        type=str,
        default=DEFAULT_BENCHMARK_JSON,
        help=f"Path to benchmark configuration JSON file (default: {DEFAULT_BENCHMARK_JSON})"
    )
    return parser.parse_args()


# -------------------------- 3. Core Utility Functions --------------------------
def get_sql_files(sql_dir: str) -> List[str]:
    # Collect all q*.sql files in sql_dir, sorted by name; raises ValueError
    # if none are found so the caller can abort with a clear message.
    sql_files = []
    for filename in os.listdir(sql_dir):
        if filename.endswith(".sql") and filename.startswith("q"):
            sql_files.append(os.path.join(sql_dir, filename))
    sql_files.sort()
    if not sql_files:
        raise ValueError(f"No .sql files starting with 'q' found in {sql_dir}!")
    return sql_files


def extract_real_time(duckdb_output: str) -> float:
    # Parse the wall-clock time from DuckDB's ".timer on" output line,
    # e.g. "Run Time (s): real 1.234 ...".  Raises ValueError if absent.
    pattern = r"Run Time \(s\): real (\d+\.\d+)"
    match = re.search(pattern, duckdb_output, re.MULTILINE)
    if not match:
        raise ValueError(f"Failed to extract real time! Partial output:\n{duckdb_output[:500]}...")
    return round(float(match.group(1)), 3)


def kill_remaining_duckdb(duckdb_bin: str):
    # Terminate any leftover duckdb processes (matched by process name or by
    # the binary path appearing in the command line); escalate to kill() after
    # a 1-second grace period.
    duckdb_name = os.path.basename(duckdb_bin)
    for proc in psutil.process_iter(['name', 'cmdline']):
        try:
            if (proc.info['name'] == duckdb_name) or (duckdb_bin in ' '.join(proc.info['cmdline'] or [])):
                print(f"⚠️ Found residual {duckdb_name} process (PID: {proc.pid}), killing...")
                proc.terminate()
                try:
                    proc.wait(timeout=1)
                except psutil.TimeoutExpired:
                    proc.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue


def load_benchmark_create_view(benchmark_json_path: str, benchmark_name: str) -> str:
    # Load the CREATE VIEW SQL for `benchmark_name` from the JSON config.
    # Raises FileNotFoundError / ValueError / KeyError with actionable messages.
    if not os.path.exists(benchmark_json_path):
        raise FileNotFoundError(f"Benchmark JSON file not found: {benchmark_json_path}")

    with open(benchmark_json_path, "r", encoding="utf-8") as f:
        try:
            benchmark_config = json.load(f)
        except json.JSONDecodeError as e:
            raise ValueError(f"Failed to parse benchmark JSON: {str(e)}")

    if benchmark_name not in benchmark_config:
        available_benchmarks = ", ".join(benchmark_config.keys())
        raise KeyError(f"Benchmark '{benchmark_name}' not found. Available benchmarks: {available_benchmarks}")

    create_view_sql = benchmark_config[benchmark_name].strip()
    if not create_view_sql:
        raise ValueError(f"CREATE VIEW SQL for benchmark '{benchmark_name}' is empty in JSON")

    return create_view_sql


def run_single_sql(duckdb_bin: str, create_view_sql: str, sql_content: str, wait_after_run: float, threads: int) -> float:
    # Run one query inside a fresh duckdb process: create the view, set the
    # thread count, run "explain analyze" with the timer on, then exit.
    # Returns the parsed real time; always tears the process down afterwards.
    duckdb_commands = f"{create_view_sql}\nset threads={threads};\n\n.timer on\nexplain analyze {sql_content.strip()}\n.exit"
    process = None

    try:
        process = subprocess.Popen(
            [duckdb_bin],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT
        )

        input_data = duckdb_commands.encode("utf-8")
        # 1-hour hard timeout per run
        stdout, _ = process.communicate(input=input_data, timeout=3600)

        output = stdout.decode("utf-8", errors="ignore")

        if process.returncode != 0:
            raise RuntimeError(f"duckdb execution failed (code {process.returncode}):\n{output[:1000]}...")
        print(output)
        real_time = extract_real_time(output)
        time.sleep(wait_after_run)
        kill_remaining_duckdb(duckdb_bin)
        return real_time

    except subprocess.TimeoutExpired:
        if process:
            process.kill()
        raise RuntimeError("duckdb execution timed out (exceeded 1 hour)") from None
    finally:
        if process and process.poll() is None:
            process.kill()
            print("⚠️ Forcibly terminated non-exiting duckdb process")


def init_csv(output_csv: str, runs: int):
    # Create/truncate the result CSV and write the header row:
    # one filename column plus one time column per run.
    headers = ["SQL File Name"] + [f"Run {idx} Time (s)" for idx in range(1, runs + 1)]
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
    print(f"✅ Initialized multi-column CSV with headers: {','.join(headers)}")


def write_single_row(output_csv: str, sql_filename: str, run_times: List[float], runs: int):
    # Append one result row; missing runs (failures) are written as empty cells.
    row_data = {"SQL File Name": sql_filename}
    for idx in range(1, runs + 1):
        time_val = run_times[idx - 1] if (idx - 1) < len(run_times) else ""
        row_data[f"Run {idx} Time (s)"] = time_val
    with open(output_csv, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=row_data.keys())
        writer.writerow(row_data)


# -------------------------- 4. Main Logic --------------------------
def main():
    """Drive the whole benchmark: parse args, load the view SQL, run every
    q*.sql file args.runs times, and stream results into the CSV."""
    args = parse_args()
    print("=" * 70)
    print("DuckDB ClickBench Batch Test Script (Resource Release Ensured)")
    print(f"Config: {args.runs} runs per SQL, {args.wait_after_run}s wait after each run")
    print(f"Benchmark: {args.benchmark} (from {args.benchmark_json})")
    print(f"DuckDB path: {args.duckdb_bin}")
    print(f"Threads: {args.threads}")
    print(f"SQL directory: {args.sql_dir}")
    print(f"Output CSV: {args.output_csv}")
    print("=" * 70)

    # clear_page_cache()

    kill_remaining_duckdb(args.duckdb_bin)
    try:
        create_view_sql = load_benchmark_create_view(args.benchmark_json, args.benchmark)
        print(f"✅ Loaded CREATE VIEW SQL for benchmark '{args.benchmark}'")
    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"\n❌ Benchmark initialization failed: {str(e)}")
        return

    init_csv(args.output_csv, args.runs)
    try:
        sql_files = get_sql_files(args.sql_dir)
        print(f"\n✅ Found {len(sql_files)} eligible SQL files:")
        for i, f in enumerate(sql_files, 1):
            print(f"  {i:2d}. {os.path.basename(f)}")
    except ValueError as e:
        print(f"\n❌ Error: {e}")
        return

    for sql_file in sql_files:
        sql_filename = os.path.basename(sql_file).replace(".sql", "")
        print(f"\n{'=' * 60}")
        print(f"Processing: {sql_filename}.sql")
        print(f"{'=' * 60}")

        try:
            with open(sql_file, "r", encoding="utf-8") as f:
                sql_content = f.read()
            print(f"✅ Successfully read SQL file (content length: {len(sql_content)} chars)")
        except Exception as e:
            print(f"❌ Failed to read SQL file: {e}")
            # Still emit a (blank) row so the CSV keeps one row per query
            write_single_row(args.output_csv, sql_filename, [], args.runs)
            continue

        run_times = []
        for run_idx in range(1, args.runs + 1):
            print(f"\n--- Run {run_idx:2d}/{args.runs} ---")
            # Cache is dropped before every run for fair cold-cache timings
            clear_page_cache()
            try:
                real_time = run_single_sql(args.duckdb_bin, create_view_sql, sql_content, args.wait_after_run, args.threads)
                run_times.append(real_time)
                print(f"✅ Run successful, time: {real_time}s")
            except (RuntimeError, ValueError) as e:
                print(f"❌ Run failed: {e}")
                continue

        write_single_row(args.output_csv, sql_filename, run_times, args.runs)
        print(f"\n✅ Written to CSV: {sql_filename}.sql → Valid runs: {len(run_times)}/{args.runs}")

    kill_remaining_duckdb(args.duckdb_bin)
    print(f"\n{'=' * 70}")
    print("All SQL files processed!")
    print(f"Multi-column CSV: {args.output_csv}")
    print("=" * 70)


if __name__ == "__main__":
    main()

# ==== file: cpp/testcase/run_perf.py (top of file) ====
import subprocess
import os
import re
import argparse  # Import argparse for command-line arguments

# --- Configuration (Defaults & Constants) ---
# Default lists (can be overridden by command-line arguments)
# THREADS = [1, 2, 4, 8, 16, 24, 32, 48, 64, 96]
# THREADS=[24]
# QUERIES = ["q01", "q24", "q33"]
# SSD_MODES = ["1ssd", "24ssd"]
# SSD_MODES=["1ssd"]
# THREADS = [32, 48]
# # THREADS = [24]
# # QUERIES = ["q01"]
#
# QUERIES = ["q01"]
# SSD_MODES = ["1ssd"]
# THREADS = [ 96]
# THREADS=[24]
# QUERIES = ["q24"]
# SSD_MODES = ["24ssd"]
THREADS = [24]
QUERIES = ["q01", "q24", "q33"]
SSD_MODES = ["24ssd"]

# Base 'perf stat' command: focusing on CPU and scheduling metrics
PERF_CMD_BASE = [
    "sudo", "-E", "perf", "stat",
    "-e", "cycles,instructions,cache-references,cache-misses,branches,branch-misses",
    "-e", "page-faults,minor-faults,major-faults",
    "-e", "task-clock,context-switches"
]

# Path to the DuckDB binary
# NOTE(review): relative path — presumably the script must be run from
# cpp/testcase for this to resolve; confirm the intended working directory.
DUCKDB_BINARY = "../build/release/duckdb"

# FlameGraph Bash script path (updated for scheduling events)
FLAMEGRAPH_SCRIPT = "./generate_flamegraphs.sh"


def ensure_result_dir(result_dir):
    """Create the results output directory"""
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
        print(f"Created directory: {result_dir}")


def update_sql_thread(sql_dir, sql_filename, thread_value, result_dir):
    """
    Replaces 'set threads=x;' in the SQL file with the specified value
    and writes the content to a temporary file in the result directory.
    """
    # Construct the full path to the base SQL file
    sql_path = os.path.join(sql_dir, sql_filename)

    with open(sql_path, "r") as f:
        content = f.read()

    # Replace or add set threads=x;
    # Try to replace existing one, if not found, assume it goes at the start
    if re.search(r"set\s+threads\s*=\s*\d+;", content):
        new_content = re.sub(r"set\s+threads\s*=\s*\d+;", f"set threads={thread_value};", content)
    else:
        # If no 'set threads' line is found, add it to the beginning
        new_content = f"set threads={thread_value};\n{content}"

    # Temporary file name uses the result directory
    tmp_path = os.path.join(result_dir, f"{os.path.basename(sql_filename)}.tmp_threads_{thread_value}.sql")
    with open(tmp_path, "w") as f:
        f.write(new_content)

    return tmp_path


def run_perf_stat_switches(query_file, query_name, ssd_mode, thread_value, result_dir):
    """
    Executes perf stat, collects context switch metrics, and outputs the result
    to a file in the results directory.
    """
    output_name = f"{query_name}-{ssd_mode}-threads{thread_value}-context-stat.txt"
    output_path = os.path.join(result_dir, output_name)

    # PERF_CMD_BASE is defined at the top of the script
    cmd = PERF_CMD_BASE + ["-o", output_path, DUCKDB_BINARY]

    print(f"\n--- 1. Running perf stat for context switches: {output_name} ---")

    try:
        with open(query_file, "r") as sql_f:
            # The SQL file is fed to duckdb via stdin
            subprocess.run(cmd, stdin=sql_f, check=True)  # Use check=True to ensure command execution success
        print(f"==> perf stat output saved to: {output_path}")
    except subprocess.CalledProcessError as e:
        print(f"[ERROR] perf stat failed for {output_name}: {e}")
        return False
    except FileNotFoundError:
        print(f"[ERROR] DuckDB binary not found at {DUCKDB_BINARY}")
        return False

    return True


def run_sched_flamegraph(tmp_sql_path, query_name, ssd_mode, thread_value, result_dir):
    """
    Calls an external Bash script to generate a flame graph focused on scheduling events.
    """
    if not os.path.exists(FLAMEGRAPH_SCRIPT):
        print(f"[WARN] Schedule FlameGraph script not found: {FLAMEGRAPH_SCRIPT}. Skipping analysis.")
        return

    print(f"\n--- 2. Running Schedule FlameGraph analysis ---")

    # Ensure the Bash script has execute permissions
    if not os.access(FLAMEGRAPH_SCRIPT, os.X_OK):
        print(f"[WARN] Adding execute permission to {FLAMEGRAPH_SCRIPT}")
        os.chmod(FLAMEGRAPH_SCRIPT, 0o755)

    output_html_name = f"{query_name}-{ssd_mode}-threads{thread_value}-sched.svg"
    output_html_path = os.path.join(result_dir, output_html_name)

    # Call the Bash script, passing the temporary SQL file, DuckDB binary path, and output HTML path
    # NOTE(review): generate_flamegraphs.sh treats its third argument as an
    # output *directory* (it mkdir -p's it and derives SVG names itself), but an
    # .svg file path is passed here — verify which contract is intended.
    flamegraph_cmd = [FLAMEGRAPH_SCRIPT, tmp_sql_path, DUCKDB_BINARY, output_html_path]

    try:
        subprocess.run(flamegraph_cmd, check=True)
        print(f"==> Schedule FlameGraph saved to: {output_html_path}")
    except subprocess.CalledProcessError as e:
        print(f"[ERROR] Schedule FlameGraph script failed for {tmp_sql_path}: {e}")
    except FileNotFoundError:
        print(f"[ERROR] FlameGraph script file not found at {FLAMEGRAPH_SCRIPT}")


def main(THREADS=THREADS, QUERIES=QUERIES, SSD_MODES=SSD_MODES):
    """Parse CLI options, then for every (query, ssd_mode, thread-count)
    combination: rewrite the thread setting, run perf stat, and (on success)
    the scheduling FlameGraph, cleaning up the temporary SQL each time."""
    parser = argparse.ArgumentParser(description="Run DuckDB benchmarks with perf stat and FlameGraph analysis.")

    # --- New required arguments for directories ---
    parser.add_argument("--sql-dir", required=False, default="perf-pixels", help="Directory containing the base SQL query files (e.g., test-q01-1ssd.sql).")
    parser.add_argument("--result-dir", required=False, default="perf-pixels", help="Directory where temporary files and final results (perf stats, SVG) will be saved.")
    # --- Optional arguments with defaults ---
    parser.add_argument("--threads", type=int, nargs='+', default=THREADS, help=f"List of thread counts to test (default: {THREADS}).")
    parser.add_argument("--queries", nargs='+', default=QUERIES, help=f"List of query IDs to test (default: {QUERIES}).")
    parser.add_argument("--ssd-modes", nargs='+', default=SSD_MODES, help=f"List of SSD configurations (default: {SSD_MODES}).")

    args = parser.parse_args()

    # Assign parsed arguments to variables
    RESULT_DIR = args.result_dir
    SQL_DIR = args.sql_dir
    THREADS = args.threads
    QUERIES = args.queries
    SSD_MODES = args.ssd_modes

    # Ensure the result directory exists
    ensure_result_dir(RESULT_DIR)

    for q in QUERIES:
        for mode in SSD_MODES:

            sql_file_base = f"test-{q}-{mode}.sql"
            # Check for the file in the specified SQL_DIR
            sql_file_path_check = os.path.join(SQL_DIR, sql_file_base)

            if not os.path.exists(sql_file_path_check):
                print(f"[WARN] Base SQL file not found: {sql_file_path_check}, skipping")
                continue

            for t in THREADS:
                print("=" * 50)
                print(f"Starting analysis for Q={q}, Mode={mode}, Threads={t}")

                # 1. Update threads and create temporary SQL file in RESULT_DIR
                tmp_sql = update_sql_thread(
                    SQL_DIR, sql_file_base, t, RESULT_DIR
                )

                # 2. Run perf stat to collect context switch metrics
                success = run_perf_stat_switches(tmp_sql, q, mode, t, RESULT_DIR)

                # 3. If perf stat succeeded, run scheduling FlameGraph analysis
                if success:
                    run_sched_flamegraph(tmp_sql, q, mode, t, RESULT_DIR)

                # 4. Clean up temporary SQL file
                os.remove(tmp_sql)
                print(f"Cleaned up temporary SQL file: {tmp_sql}")
                print("=" * 50)


if __name__ == "__main__":
    main()

# ==== file: cpp/testcase/single_doublebuffer_async_sync_test.py (top of file) ====
import subprocess
import os
import re
import shutil
# -------------------------------------
# 1. Configuration Parameters
# -------------------------------------
# threads_list = [1, 2, 4, 8, 16, 24, 48, 64, 96]
threads_list = [16, 24]
benchmarks = [
    "clickbench-pixels-e0-1ssd",
    # "clickbench-pixels-e0-6ssd",
    # "clickbench-pixels-e0-12ssd",
    # "clickbench-pixels-e0-24ssd"
]

runs = 1
# Define all Buffer Modes to be tested
buffer_modes = ["doublebuffer", "singlebuffer"]
properties_path = os.path.expanduser("~/opt/pixels/etc/pixels-cpp.properties")
process_script = "process_sqls.py"

# Root directory for saving results
output_root = "single_double_buffer_results"

# -------------------------------------
# 2. Core Modification: Buffer Mode Switching Function
# -------------------------------------
def set_buffer_mode(mode):
    """Modify the 'pixels.doublebuffer' parameter in pixels-cpp.properties"""
    assert mode in ("doublebuffer", "singlebuffer")

    if not os.path.exists(properties_path):
        raise FileNotFoundError(f"Configuration file not found: {properties_path}")

    with open(properties_path, "r") as f:
        lines = f.readlines()

    new_lines = []
    changed = False

    # Determine the value to set
    new_value = "true" if mode == "doublebuffer" else "false"

    for line in lines:
        # NOTE(review): this matcher uses line.strip().startswith(...) while
        # set_io_mode below uses a bare line.startswith(...) — presumably they
        # should behave the same; confirm whether leading whitespace can occur.
        if line.strip().startswith("pixels.doublebuffer"):
            # Find and replace this line
            new_lines.append(f"pixels.doublebuffer={new_value}\n")
            changed = True
        else:
            new_lines.append(line)

    # If the line was not found in the file, append it at the end
    if not changed:
        new_lines.append(f"pixels.doublebuffer={new_value}\n")

    with open(properties_path, "w") as f:
        f.writelines(new_lines)

    print(f"🔄 Buffer mode switched to: {mode.upper()}")


# -------------------------------------
# 3. IO Mode Switching Function (Unchanged logic)
# -------------------------------------
def set_io_mode(mode):
    """Modify the 'localfs.enable.async.io' parameter in pixels-cpp.properties"""
    assert mode in ("async", "sync")

    if not os.path.exists(properties_path):
        raise FileNotFoundError(f"Configuration file not found: {properties_path}")

    with open(properties_path, "r") as f:
        lines = f.readlines()

    new_lines = []
    changed = False
    for line in lines:
        if line.startswith("localfs.enable.async.io"):
            new_value = "true" if mode == "async" else "false"
            new_lines.append(f"localfs.enable.async.io={new_value}\n")
            changed = True
        else:
            new_lines.append(line)

    if not changed:
        new_value = "true" if mode == "async" else "false"
        new_lines.append(f"localfs.enable.async.io={new_value}\n")

    with open(properties_path, "w") as f:
        f.writelines(new_lines)

    print(f"🔧 IO mode switched to: {mode.upper()}")

# -------------------------------------
# 4. Nested Loop for Test Execution
# -------------------------------------
# Sweep order: buffer_mode -> io_mode -> benchmark -> thread count, so each
# properties-file change is made once per outer iteration before the child
# benchmark runs are launched.
for buffer_mode in buffer_modes:
    print(f"\n===========================================")
    print(f"🚀 Starting Test for Buffer Mode: {buffer_mode.upper()}")
    print(f"===========================================")
    set_buffer_mode(buffer_mode)  # <-- Set the current Buffer Mode

    for io_mode in ["sync", "async"]:
        print(f"\n======= Switching to {io_mode.upper()} mode =======")
        set_io_mode(io_mode)

        for benchmark in benchmarks:
            print(f"\n===== Benchmark: {benchmark} ({buffer_mode}/{io_mode}) =====\n")

            # Create an isolated directory: output_root/benchmark/buffer_mode/io_mode/
            benchmark_dir = os.path.join(output_root, benchmark, buffer_mode, io_mode)
            os.makedirs(benchmark_dir, exist_ok=True)
            print(f"📁 Directory created: {benchmark_dir}")

            for t in threads_list:
                output_csv = os.path.join(
                    benchmark_dir,
                    f"duckdb_benchmark_result-{buffer_mode}-{io_mode}-{t}threads.csv"
                )

                cmd = [
                    "python",
                    process_script,
                    "--benchmark", benchmark,
                    "--runs", str(runs),
                    "--output-csv", output_csv,
                    "--threads", str(t),
                ]

                print(f"\n▶ Executing: {benchmark}, Buffer={buffer_mode}, IO={io_mode}, {t} threads")
                print("Command:", " ".join(cmd))

                subprocess.run(cmd, check=True)

                print(f"✔ Completed: {output_csv}\n")

print("\n🎉 All tasks (doublebuffer/singlebuffer, sync/async) completed successfully!")

# ==== remaining diff fragments in this line range (CMake, context-bearing hunks) ====
# cpp/tests/CMakeLists.txt: comment-only churn in the disabled googletest
# FetchContent block (trailing "#" removed from blank commented lines).
# cpp/tests/writer/CMakeLists.txt: target_link_libraries for IntegerWriterTest
# and PixelsWriterTest gain the PRIVATE keyword, drop pixels-common, and link
# duckdb before pixels-core; followed by:
# set(GTEST_DIR "${PROJECT_SOURCE_DIR}/third-party/googletest")
@@ -28,6 +28,7 @@ #include "reader/LongColumnReader.h" #include "writer/IntColumnWriter.h" #include "writer/LongColumnWriter.h" +#include "pixels_generated.h" #include "gtest/gtest.h" @@ -36,7 +37,6 @@ TEST(IntWriterTest, WriteRunLengthEncodeIntWithoutNull) { int len = 10; int pixel_stride = 5; -// bool is_long = false; bool encoding = true; auto integer_column_vector = std::make_shared(len, encoding); @@ -54,16 +54,17 @@ TEST(IntWriterTest, WriteRunLengthEncodeIntWithoutNull) { option->setNullsPadding(false); option->setEncodingLevel(EncodingLevel(EncodingLevel::EL2)); + flatbuffers::FlatBufferBuilder fbb(1024); + auto integer_column_writer = std::make_unique( TypeDescription::createInt(), option); - auto write_size = integer_column_writer->write(integer_column_vector, len); + auto write_size = integer_column_writer->write( integer_column_vector, len); EXPECT_NE(write_size, 0); integer_column_writer->flush(); auto content = integer_column_writer->getColumnChunkContent(); EXPECT_GT(content.size(), 0); std::cerr << "[DEBUG] content size: " << content.size() << std::endl; - integer_column_writer->close(); /**---------------------- ** Write End. 
Use Reader to check @@ -72,7 +73,27 @@ TEST(IntWriterTest, WriteRunLengthEncodeIntWithoutNull) { std::make_unique(TypeDescription::createInt()); auto buffer = std::make_shared(content.size()); buffer->putBytes(content.data(), content.size()); - auto column_chunk_encoding = integer_column_writer->getColumnChunkEncoding(); + + auto column_chunk_encoding_offset = integer_column_writer->getColumnChunkEncoding(fbb); + auto column_chunk_index_offset = integer_column_writer->buildColumnChunkIndex(fbb, 0, content.size(), true); + + std::vector> indexVec; + indexVec.push_back(column_chunk_index_offset); + auto indicesOffset = fbb.CreateVector(indexVec); + auto rowGroupIndexOffset = pixels::fb::CreateRowGroupIndex(fbb, indicesOffset); + + std::vector> encodingVec; + encodingVec.push_back(column_chunk_encoding_offset); + auto encodingsOffset = fbb.CreateVector(encodingVec); + auto rowGroupEncodingOffset = pixels::fb::CreateRowGroupEncoding(fbb, encodingsOffset); + + auto footerOffset = pixels::fb::CreateRowGroupFooter(fbb, rowGroupIndexOffset, rowGroupEncodingOffset); + fbb.Finish(footerOffset); + + auto footer = flatbuffers::GetRoot((fbb.GetBufferPointer())); + auto column_chunk_encoding = footer->rowGroupEncoding()->columnChunkEncodings()->Get(0); + auto column_chunk_index = footer->rowGroupIndexEntry()->columnChunkIndexEntries()->Get(0); + auto int_result_vector = std::make_shared(len, encoding); auto bit_mask = std::make_shared(len); @@ -85,7 +106,7 @@ TEST(IntWriterTest, WriteRunLengthEncodeIntWithoutNull) { integer_column_reader->read( buffer, column_chunk_encoding, pixel_offset, size, pixel_stride, vector_index, int_result_vector, - *integer_column_writer->getColumnChunkIndexPtr(), bit_mask); + column_chunk_index, bit_mask); for (int i = vector_index; i < vector_index + size; i++) { std::cerr << "[DEBUG READ CASE1] " << int_result_vector->intVector[i] << std::endl; @@ -96,12 +117,12 @@ TEST(IntWriterTest, WriteRunLengthEncodeIntWithoutNull) { vector_index += size; 
num_to_read -= size; } + integer_column_writer->close(); } -TEST(IntWriterTest, DISABLED_WriteIntWithoutNull) { +TEST(IntWriterTest, WriteIntWithoutNull) { int len = 10; int pixel_stride = 5; - bool is_long = false; bool encoding = false; auto integer_column_vector = std::make_shared(len, encoding); @@ -120,6 +141,8 @@ TEST(IntWriterTest, DISABLED_WriteIntWithoutNull) { option->setByteOrder(ByteOrder::PIXELS_LITTLE_ENDIAN); option->setEncodingLevel(EncodingLevel(EncodingLevel::EL0)); + flatbuffers::FlatBufferBuilder fbb(1024); + auto integer_column_writer = std::make_unique( TypeDescription::createInt(), option); auto write_size = integer_column_writer->write(integer_column_vector, len); @@ -129,7 +152,6 @@ TEST(IntWriterTest, DISABLED_WriteIntWithoutNull) { EXPECT_GT(content.size(), 0); std::cerr << "[DEBUG] content size: " << content.size() << std::endl; - integer_column_writer->close(); /**---------------------- ** Write End. Use Reader to check @@ -138,7 +160,27 @@ TEST(IntWriterTest, DISABLED_WriteIntWithoutNull) { std::make_unique(TypeDescription::createInt()); auto buffer = std::make_shared(content.size()); buffer->putBytes(content.data(), content.size()); - auto column_chunk_encoding = integer_column_writer->getColumnChunkEncoding(); + + auto column_chunk_encoding_offset = integer_column_writer->getColumnChunkEncoding(fbb); + auto column_chunk_index_offset = integer_column_writer->buildColumnChunkIndex(fbb, 0, content.size(), true); + + std::vector> indexVec; + indexVec.push_back(column_chunk_index_offset); + auto indicesOffset = fbb.CreateVector(indexVec); + auto rowGroupIndexOffset = pixels::fb::CreateRowGroupIndex(fbb, indicesOffset); + + std::vector> encodingVec; + encodingVec.push_back(column_chunk_encoding_offset); + auto encodingsOffset = fbb.CreateVector(encodingVec); + auto rowGroupEncodingOffset = pixels::fb::CreateRowGroupEncoding(fbb, encodingsOffset); + + auto footerOffset = pixels::fb::CreateRowGroupFooter(fbb, rowGroupIndexOffset, 
rowGroupEncodingOffset); + fbb.Finish(footerOffset); + + auto footer = flatbuffers::GetRoot((fbb.GetBufferPointer())); + auto column_chunk_encoding = footer->rowGroupEncoding()->columnChunkEncodings()->Get(0); + auto column_chunk_index = footer->rowGroupIndexEntry()->columnChunkIndexEntries()->Get(0); + auto int_result_vector = std::make_shared(len, encoding); auto bit_mask = std::make_shared(len); @@ -151,46 +193,26 @@ TEST(IntWriterTest, DISABLED_WriteIntWithoutNull) { integer_column_reader->read( buffer, column_chunk_encoding, pixel_offset, size, pixel_stride, vector_index, int_result_vector, - *integer_column_writer->getColumnChunkIndexPtr(), bit_mask); + column_chunk_index, bit_mask); for (int i = vector_index; i < vector_index + size; i++) { std::cerr << "[DEBUG READ CASE1] " - << reinterpret_cast(int_result_vector->intVector)[i] + << int_result_vector->intVector[i] << std::endl; - EXPECT_EQ(reinterpret_cast(int_result_vector->intVector)[i], + EXPECT_EQ(int_result_vector->intVector[i], integer_column_vector->intVector[i]); } pixel_offset += size; vector_index += size; num_to_read -= size; } + integer_column_writer->close(); } -TEST(EncodeTest, DISABLED_EncodeLong) { - constexpr size_t len = 10; - std::array data; - for (size_t i = 0; i < len; i++) { - data[i] = INT64_MAX - i; - } - auto encode_buffer = std::make_shared(); - auto encoder = std::make_unique(); - - int res_len{0}; - encoder->encode(data.data(), encode_buffer->getPointer(), len, res_len); - - EXPECT_GT(res_len, 0); - - bool is_signed = true; - auto decoder = std::make_unique(encode_buffer, is_signed); - for (size_t i = 0; i < len; i++) { - EXPECT_EQ(decoder->next(), data[i]); - } -} - -TEST(IntWriterTest, DISABLED_WriteRunLengthEncodeLongWithoutNull) { +TEST(IntWriterTest, WriteRunLengthEncodeLongWithoutNull) { int len = 23; int pixel_stride = 5; - bool is_long = true; bool encoding = true; + bool is_long = true; auto long_column_vector = std::make_shared(len, encoding, is_long); 
ASSERT_TRUE(long_column_vector); @@ -203,7 +225,9 @@ TEST(IntWriterTest, DISABLED_WriteRunLengthEncodeLongWithoutNull) { option->setNullsPadding(false); option->setEncodingLevel(EncodingLevel(EncodingLevel::EL2)); - auto long_column_writer = std::make_unique( + flatbuffers::FlatBufferBuilder fbb(1024); + + auto long_column_writer = std::make_unique( TypeDescription::createLong(), option); auto write_size = long_column_writer->write(long_column_vector, len); EXPECT_NE(write_size, 0); @@ -216,10 +240,30 @@ TEST(IntWriterTest, DISABLED_WriteRunLengthEncodeLongWithoutNull) { ** Write End. Use Reader to check *------------------------**/ auto long_column_reader = - std::make_unique(TypeDescription::createLong()); + std::make_unique(TypeDescription::createLong()); auto buffer = std::make_shared(content.size()); buffer->putBytes(content.data(), content.size()); - auto column_chunk_encoding = long_column_writer->getColumnChunkEncoding(); + + auto column_chunk_encoding_offset = long_column_writer->getColumnChunkEncoding(fbb); + auto column_chunk_index_offset = long_column_writer->buildColumnChunkIndex(fbb, 0, content.size(), true); + + std::vector> indexVec; + indexVec.push_back(column_chunk_index_offset); + auto indicesOffset = fbb.CreateVector(indexVec); + auto rowGroupIndexOffset = pixels::fb::CreateRowGroupIndex(fbb, indicesOffset); + + std::vector> encodingVec; + encodingVec.push_back(column_chunk_encoding_offset); + auto encodingsOffset = fbb.CreateVector(encodingVec); + auto rowGroupEncodingOffset = pixels::fb::CreateRowGroupEncoding(fbb, encodingsOffset); + + auto footerOffset = pixels::fb::CreateRowGroupFooter(fbb, rowGroupIndexOffset, rowGroupEncodingOffset); + fbb.Finish(footerOffset); + + auto footer = flatbuffers::GetRoot((fbb.GetBufferPointer())); + auto column_chunk_encoding = footer->rowGroupEncoding()->columnChunkEncodings()->Get(0); + auto column_chunk_index = footer->rowGroupIndexEntry()->columnChunkIndexEntries()->Get(0); + auto long_result_vector = 
std::make_shared(len, encoding, is_long); auto bit_mask = std::make_shared(len); @@ -230,7 +274,7 @@ TEST(IntWriterTest, DISABLED_WriteRunLengthEncodeLongWithoutNull) { auto size = std::min(pixel_stride, num_to_read); long_column_reader->read(buffer, column_chunk_encoding, pixel_offset, size, pixel_stride, vector_index, long_result_vector, - *(long_column_writer->getColumnChunkIndexPtr()), + column_chunk_index, bit_mask); for (int i = vector_index; i < vector_index + size; i++) { std::cerr << "[DEBUG READ CASE1] " << long_result_vector->longVector[i] @@ -246,13 +290,12 @@ TEST(IntWriterTest, DISABLED_WriteRunLengthEncodeLongWithoutNull) { long_column_writer->close(); } -TEST(IntWriterTest, DISABLED_WriteRunLengthEncodeIntWithNull) { +TEST(IntWriterTest, WriteRunLengthEncodeIntWithNull) { int len = 23; int pixel_stride = 5; - bool is_long = false; bool encoding = true; auto integer_column_vector = - std::make_shared(len, encoding, is_long); + std::make_shared(len, encoding); ASSERT_TRUE(integer_column_vector); for (int i = 0; i < len; ++i) { if (i % 2) { @@ -271,16 +314,17 @@ TEST(IntWriterTest, DISABLED_WriteRunLengthEncodeIntWithNull) { option->setNullsPadding(false); option->setEncodingLevel(EncodingLevel(EncodingLevel::EL2)); + flatbuffers::FlatBufferBuilder fbb(1024); + auto integer_column_writer = std::make_unique( TypeDescription::createInt(), option); - auto write_size = integer_column_writer->write(integer_column_vector, len); + auto write_size = integer_column_writer->write( integer_column_vector, len); EXPECT_NE(write_size, 0); integer_column_writer->flush(); auto content = integer_column_writer->getColumnChunkContent(); EXPECT_GT(content.size(), 0); std::cerr << "[DEBUG] content size: " << content.size() << std::endl; - integer_column_writer->close(); /**---------------------- ** Write End. 
Use Reader to check @@ -289,7 +333,27 @@ TEST(IntWriterTest, DISABLED_WriteRunLengthEncodeIntWithNull) { std::make_unique(TypeDescription::createInt()); auto buffer = std::make_shared(content.size()); buffer->putBytes(content.data(), content.size()); - auto column_chunk_encoding = integer_column_writer->getColumnChunkEncoding(); + + auto column_chunk_encoding_offset = integer_column_writer->getColumnChunkEncoding(fbb); + auto column_chunk_index_offset = integer_column_writer->buildColumnChunkIndex(fbb, 0, content.size(), true); + + std::vector> indexVec; + indexVec.push_back(column_chunk_index_offset); + auto indicesOffset = fbb.CreateVector(indexVec); + auto rowGroupIndexOffset = pixels::fb::CreateRowGroupIndex(fbb, indicesOffset); + + std::vector> encodingVec; + encodingVec.push_back(column_chunk_encoding_offset); + auto encodingsOffset = fbb.CreateVector(encodingVec); + auto rowGroupEncodingOffset = pixels::fb::CreateRowGroupEncoding(fbb, encodingsOffset); + + auto footerOffset = pixels::fb::CreateRowGroupFooter(fbb, rowGroupIndexOffset, rowGroupEncodingOffset); + fbb.Finish(footerOffset); + + auto footer = flatbuffers::GetRoot((fbb.GetBufferPointer())); + auto column_chunk_encoding = footer->rowGroupEncoding()->columnChunkEncodings()->Get(0); + auto column_chunk_index = footer->rowGroupIndexEntry()->columnChunkIndexEntries()->Get(0); + auto int_result_vector = std::make_shared(len, encoding); auto bit_mask = std::make_shared(len); @@ -302,15 +366,14 @@ TEST(IntWriterTest, DISABLED_WriteRunLengthEncodeIntWithNull) { integer_column_reader->read( buffer, column_chunk_encoding, pixel_offset, size, pixel_stride, vector_index, int_result_vector, - *integer_column_writer->getColumnChunkIndexPtr(), bit_mask); + column_chunk_index, bit_mask); for (int i = vector_index; i < vector_index + size; i++) { std::cerr << "[DEBUG READ CASE1] " << int_result_vector->intVector[i] << std::endl; - // EXPECT_EQ(int_result_vector->intVector[i], - // 
integer_column_vector->intVector[i]); } pixel_offset += size; vector_index += size; num_to_read -= size; } -} \ No newline at end of file + integer_column_writer->close(); +} diff --git a/cpp/tests/writer/PixelsWriterTest.cpp b/cpp/tests/writer/PixelsWriterTest.cpp index 5c60037848..9ea27eabce 100644 --- a/cpp/tests/writer/PixelsWriterTest.cpp +++ b/cpp/tests/writer/PixelsWriterTest.cpp @@ -27,22 +27,43 @@ #include "physical/PhysicalReaderUtil.h" #include "PixelsReaderBuilder.h" #include "gtest/gtest.h" +#include +#include + class PIXELS_WRITER_TEST : public ::testing::Test { protected: void SetUp() override { - target_file_path_ = ConfigFactory::Instance().getPixelsSourceDirectory() + - "cpp/tests/data/example.pxl"; + base_path = ConfigFactory::Instance().getPixelsSourceDirectory() + "cpp/tests/data/"; + if(!std::filesystem::exists(base_path)) { + std::filesystem::create_directories(base_path); + } + target_file_path_ = base_path + "example.pxl"; + + // if exist, delete it first + if (std::filesystem::exists(target_file_path_)) { + std::filesystem::remove(target_file_path_); + std::cout << "[INFO] Removed existing test file: " << target_file_path_ << std::endl; + } } + + void cleanup(const std::string& path) { + if (std::filesystem::exists(path)) { + std::filesystem::remove(path); + } + } + protected: - const int pixels_stride_ = 16; + std::string base_path; + // `pixels_stride` must be divisible by `row_batch` + const int pixels_stride_ = 20; std::string target_file_path_; const int block_size_ = 1024; const int compression_block_size_ = 16; bool block_padding_ = true; int row_num = 10; - const int row_group_size_ = 10; + const int row_group_size_ = 100; }; TEST_F(PIXELS_WRITER_TEST, DISABLED_SINGLE_INT) @@ -50,258 +71,499 @@ TEST_F(PIXELS_WRITER_TEST, DISABLED_SINGLE_INT) auto schema = TypeDescription::fromString("struct"); EXPECT_TRUE(schema); std::vector encode_vector(1, true); - auto row_batch = schema->createRowBatch(row_group_size_, encode_vector); + auto 
row_batch = schema->createRowBatch(row_num, encode_vector); - EncodingLevel encoding_level{EncodingLevel::EL2}; + EncodingLevel encoding_level{EncodingLevel::EL0}; bool nulls_padding = true; bool partitioned = true; auto pixels_writer = std::make_unique(schema, pixels_stride_, row_group_size_, target_file_path_, block_size_, block_padding_, encoding_level, nulls_padding, partitioned, compression_block_size_); - /**======================= - * * INFO - * Write Row Batch - * - *========================**/ + auto va = std::dynamic_pointer_cast(row_batch->cols[0]); + ASSERT_TRUE(va); + for (int i = 0; i < row_num; ++i) { - auto va = std::dynamic_pointer_cast(row_batch->cols[0]); - ASSERT_TRUE(va); - auto start_time_ts = std::chrono::high_resolution_clock::now(); + va->add(i); + row_batch->rowCount++; + if (row_batch->rowCount == row_batch->getMaxSize()) { - for (int i = 0; i < row_num; ++i) - { - va->add(i); - if (row_batch->rowCount == row_batch->getMaxSize()) - { - pixels_writer->addRowBatch(row_batch); - row_batch->reset(); - } - } - if(row_batch->rowCount!=0) { - pixels_writer->addRowBatch(row_batch); - row_batch->reset(); - } - pixels_writer->close(); + pixels_writer->addRowBatch(row_batch); + row_batch->reset(); + } + } + if(row_batch->rowCount != 0) { + pixels_writer->addRowBatch(row_batch); + row_batch->reset(); + } + pixels_writer->close(); + + // Read back and verify + auto footerCache = std::make_shared(); + auto builder = std::make_shared(); + std::shared_ptr<::Storage> storage = StorageFactory::getInstance()->getStorage(::Storage::file); + std::shared_ptr pixels_reader = builder + ->setPath(target_file_path_) + ->setStorage(storage) + ->setPixelsFooterCache(footerCache) + ->build(); + + PixelsReaderOption option; + option.setSkipCorruptRecords(false); + option.setTolerantSchemaEvolution(true); + option.setEnableEncodedColumnVector(true); + option.setIncludeCols({"a"}); + option.setBatchSize(10); + option.setRGRange(0,1); + auto recordReader = 
pixels_reader->read(option); + int count = 0; + while(true) { + auto rb = recordReader->readBatch(true); + if(rb == nullptr || rb->rowCount == 0) break; + auto v = std::static_pointer_cast(rb->cols[0]); + for(int i = 0; i < rb->rowCount; i++) { + EXPECT_EQ(((int*)v->intVector)[i], count); + count++; } - auto end_time_ts = std::chrono::high_resolution_clock::now(); - auto duration = end_time_ts - start_time_ts; - std::cerr << "[DEBUG] Time: " << duration.count() << std::endl; } + EXPECT_EQ(count, row_num); + ::BufferPool::Reset(); + } -TEST_F(PIXELS_WRITER_TEST, DISABLED_WRITE_AND_READ) +TEST_F(PIXELS_WRITER_TEST, SINGLE_LONG) { - auto schema = TypeDescription::fromString("struct"); + auto schema = TypeDescription::fromString("struct"); EXPECT_TRUE(schema); std::vector encode_vector(1, true); - auto row_batch = schema->createRowBatch(row_group_size_, encode_vector); - - EncodingLevel encoding_level{EncodingLevel::EL2}; + auto row_batch = schema->createRowBatch(row_num, encode_vector); + + EncodingLevel encoding_level{EncodingLevel::EL0}; bool nulls_padding = true; bool partitioned = true; - + auto pixels_writer = std::make_unique(schema, pixels_stride_, row_group_size_, target_file_path_, block_size_, block_padding_, encoding_level, nulls_padding, partitioned, compression_block_size_); - /**======================= - * * INFO - * Write Row Batch - * - *========================**/ + auto va = std::dynamic_pointer_cast(row_batch->cols[0]); + ASSERT_TRUE(va); + for (long i = 0; i < row_num; ++i) { - auto va = std::dynamic_pointer_cast(row_batch->cols[0]); - ASSERT_TRUE(va); - auto start_time_ts = std::chrono::high_resolution_clock::now(); + va->add(i*1000000L); + // va->add(9110818468285196899L); + row_batch->rowCount++; + if (row_batch->rowCount == row_batch->getMaxSize()) { - for (int i = 0; i < row_num; ++i) - { - va->add(i); - if (row_batch->rowCount == row_batch->getMaxSize()) - { - pixels_writer->addRowBatch(row_batch); - row_batch->reset(); - } - } - 
if(row_batch->rowCount!=0) { - pixels_writer->addRowBatch(row_batch); - row_batch->reset(); - } - pixels_writer->close(); - } - auto end_time_ts = std::chrono::high_resolution_clock::now(); - auto duration = end_time_ts - start_time_ts; - std::cerr << "[DEBUG] Time: " << duration.count() << std::endl; + pixels_writer->addRowBatch(row_batch); + row_batch->reset(); + } } + if(row_batch->rowCount != 0) { + pixels_writer->addRowBatch(row_batch); + row_batch->reset(); + } + pixels_writer->close(); - { - auto footerCache = std::make_shared(); - auto builder = std::make_shared(); - // test read - std::shared_ptr<::Storage> storage = StorageFactory::getInstance()->getStorage(::Storage::file); - std::shared_ptr pixels_reader = builder - ->setPath(target_file_path_) - ->setStorage(storage) - ->setPixelsFooterCache(footerCache) - ->build(); - - std::cerr<< "[DEBUG] row group num:" << pixels_reader->getRowGroupNum() << std::endl; - PixelsReaderOption option; - option.setSkipCorruptRecords(false); - option.setTolerantSchemaEvolution(true); - option.setEnableEncodedColumnVector(true); - option.setIncludeCols({"a"}); - option.setBatchSize(10); - option.setRGRange(0,1); - auto recordReader = pixels_reader->read(option); - auto rowBatch = recordReader->readBatch(true); - auto vector = std::static_pointer_cast(rowBatch->cols[0]); - { - // check read result - for(int i = 0; i < row_num; i++) { - EXPECT_EQ(*( (int*)vector->intVector + i ), i); - } + // Read back and verify + auto footerCache = std::make_shared(); + auto builder = std::make_shared(); + std::shared_ptr<::Storage> storage = StorageFactory::getInstance()->getStorage(::Storage::file); + std::shared_ptr pixels_reader = builder + ->setPath(target_file_path_) + ->setStorage(storage) + ->setPixelsFooterCache(footerCache) + ->build(); + + PixelsReaderOption option; + option.setSkipCorruptRecords(false); + option.setTolerantSchemaEvolution(true); + option.setEnableEncodedColumnVector(true); + option.setIncludeCols({"a"}); + 
option.setBatchSize(10); + option.setRGRange(0,1); + auto recordReader = pixels_reader->read(option); + int count = 0; + while(true) { + auto rb = recordReader->readBatch(true); + if(rb == nullptr || rb->rowCount == 0) break; + auto v = std::static_pointer_cast(rb->cols[0]); + for(int i = 0; i < rb->rowCount; i++) { + EXPECT_EQ(v->longVector[i], (long)count * 1000000L); + count++; } } + EXPECT_EQ(count, row_num); + ::BufferPool::Reset(); } -TEST_F(PIXELS_WRITER_TEST, DISABLED_WRITE_TWO_COLUMN) +TEST_F(PIXELS_WRITER_TEST, DISABLED_SINGLE_STRING) { - auto schema = TypeDescription::fromString("struct"); + auto schema = TypeDescription::fromString("struct"); EXPECT_TRUE(schema); - std::vector encode_vector(2, true); - auto row_batch = schema->createRowBatch(row_group_size_, encode_vector); - + std::vector encode_vector(1, true); + auto row_batch = schema->createRowBatch(row_num, encode_vector); + EncodingLevel encoding_level{EncodingLevel::EL2}; bool nulls_padding = true; bool partitioned = true; - + auto pixels_writer = std::make_unique(schema, pixels_stride_, row_group_size_, target_file_path_, block_size_, block_padding_, encoding_level, nulls_padding, partitioned, compression_block_size_); - /**======================= - * * INFO - * Write Row Batch - * - *========================**/ + auto va = std::dynamic_pointer_cast(row_batch->cols[0]); + ASSERT_TRUE(va); + for (int i = 0; i < row_num; ++i) { - auto va = std::dynamic_pointer_cast(row_batch->cols[0]); - auto vb = std::dynamic_pointer_cast(row_batch->cols[1]); - ASSERT_TRUE(va); - auto start_time_ts = std::chrono::high_resolution_clock::now(); + std::string s = "string_" + std::to_string(i); + va->add(s); + row_batch->rowCount++; + if (row_batch->rowCount == row_batch->getMaxSize()) { - for (int i = 0; i < row_num; ++i) - { - va->add(i); - vb->add(i * i); - if (row_batch->rowCount == row_batch->getMaxSize()) - { - pixels_writer->addRowBatch(row_batch); - row_batch->reset(); - } - } - if(row_batch->rowCount!=0) { 
- pixels_writer->addRowBatch(row_batch); - row_batch->reset(); - } - pixels_writer->close(); + pixels_writer->addRowBatch(row_batch); + row_batch->reset(); + } + } + if(row_batch->rowCount != 0) { + pixels_writer->addRowBatch(row_batch); + row_batch->reset(); + } + pixels_writer->close(); + + // Read back and verify + auto footerCache = std::make_shared(); + auto builder = std::make_shared(); + std::shared_ptr<::Storage> storage = StorageFactory::getInstance()->getStorage(::Storage::file); + std::shared_ptr pixels_reader = builder + ->setPath(target_file_path_) + ->setStorage(storage) + ->setPixelsFooterCache(footerCache) + ->build(); + + PixelsReaderOption option; + option.setSkipCorruptRecords(false); + option.setTolerantSchemaEvolution(true); + option.setEnableEncodedColumnVector(true); + option.setIncludeCols({"a"}); + option.setBatchSize(10); + option.setRGRange(0,1); + auto recordReader = pixels_reader->read(option); + int count = 0; + while(true) { + auto rb = recordReader->readBatch(true); + if(rb == nullptr || rb->rowCount == 0) break; + auto v = std::static_pointer_cast(rb->cols[0]); + for(int i = 0; i < rb->rowCount; i++) { + std::string expected = "string_" + std::to_string(count); + std::string actual(v->vector[i].GetString()); + EXPECT_EQ(actual, expected); + count++; } - auto end_time_ts = std::chrono::high_resolution_clock::now(); - auto duration = end_time_ts - start_time_ts; - std::cerr << "[DEBUG] Time: " << duration.count() << std::endl; } + EXPECT_EQ(count, row_num); + ::BufferPool::Reset(); } -TEST_F(PIXELS_WRITER_TEST, DISABLED_SINGLE_INT_WITHOUT_RUNLENENCODE) +TEST_F(PIXELS_WRITER_TEST, DISABLED_MULTI_COLUMN) { + auto schema = TypeDescription::fromString("struct"); + EXPECT_TRUE(schema); + std::vector encode_vector(3, true); + auto row_batch = schema->createRowBatch(row_num, encode_vector); + EncodingLevel encoding_level{EncodingLevel::EL2}; + auto pixels_writer = std::make_unique(schema, pixels_stride_, row_group_size_, target_file_path_, + 
block_size_, block_padding_, encoding_level, true, true, compression_block_size_); + + auto va = std::dynamic_pointer_cast(row_batch->cols[0]); + auto vb = std::dynamic_pointer_cast(row_batch->cols[1]); + auto vc = std::dynamic_pointer_cast(row_batch->cols[2]); + + for (int i = 0; i < row_num; ++i) + { + va->add(i); + vb->add((long)i * 10); + std::string s = "val_" + std::to_string(i); + vc->add(s); + row_batch->rowCount++; + if (row_batch->rowCount == row_batch->getMaxSize()) + { + pixels_writer->addRowBatch(row_batch); + row_batch->reset(); + } + } + if(row_batch->rowCount != 0) { + pixels_writer->addRowBatch(row_batch); + row_batch->reset(); + } + pixels_writer->close(); + + // Read back and verify + auto footerCache = std::make_shared(); + auto builder = std::make_shared(); + std::shared_ptr<::Storage> storage = StorageFactory::getInstance()->getStorage(::Storage::file); + std::shared_ptr pixels_reader = builder->setPath(target_file_path_)->setStorage(storage)->setPixelsFooterCache(footerCache)->build(); + + PixelsReaderOption option; + option.setSkipCorruptRecords(false); + option.setTolerantSchemaEvolution(true); + option.setEnableEncodedColumnVector(true); + option.setBatchSize(10); + option.setRGRange(0,1); + option.setIncludeCols({"a", "b", "c"}); + auto recordReader = pixels_reader->read(option); + int count = 0; + while(true) { + auto rb = recordReader->readBatch(true); + if(rb == nullptr || rb->rowCount == 0) break; + auto ra = std::static_pointer_cast(rb->cols[0]); + auto rb_col = std::static_pointer_cast(rb->cols[1]); + auto rc = std::static_pointer_cast(rb->cols[2]); + for(int i = 0; i < rb->rowCount; i++) { + EXPECT_EQ(((int*)ra->intVector)[i], count); + EXPECT_EQ(rb_col->longVector[i], (long)count * 10); + std::string expected = "val_" + std::to_string(count); + std::string actual(rc->vector[i].GetString()); + EXPECT_EQ(actual, expected); + count++; + } + } + EXPECT_EQ(count, row_num); + ::BufferPool::Reset(); + +} + +TEST_F(PIXELS_WRITER_TEST, 
DISABLED_MULTI_ROWGROUP) +{ + // Write 100 rows with row_group_size = 10 -> should result in 10 row groups + int total_rows = 100; auto schema = TypeDescription::fromString("struct"); EXPECT_TRUE(schema); std::vector encode_vector(1, true); - auto row_batch = schema->createRowBatch(row_group_size_, encode_vector); - + auto row_batch = schema->createRowBatch(row_num, encode_vector); EncodingLevel encoding_level{EncodingLevel::EL0}; - bool nulls_padding = true; - bool partitioned = true; - auto pixels_writer = std::make_unique(schema, pixels_stride_, row_group_size_, target_file_path_, - block_size_, block_padding_, encoding_level, nulls_padding, partitioned, compression_block_size_); + block_size_, block_padding_, encoding_level, true, true, compression_block_size_); - /**======================= - * * INFO - * Write Row Batch - * - *========================**/ + auto va = std::dynamic_pointer_cast(row_batch->cols[0]); + for (int i = 0; i < total_rows; ++i) { - auto va = std::dynamic_pointer_cast(row_batch->cols[0]); - ASSERT_TRUE(va); - auto start_time_ts = std::chrono::high_resolution_clock::now(); + va->add(i); + row_batch->rowCount++; + if (row_batch->rowCount == row_batch->getMaxSize()) { - for (int i = 0; i < row_num; ++i) - { - va->add(i); - if (row_batch->rowCount == row_batch->getMaxSize()) - { - pixels_writer->addRowBatch(row_batch); - row_batch->reset(); - } - } - if(row_batch->rowCount!=0) { + pixels_writer->addRowBatch(row_batch); + row_batch->reset(); + } + } + if(row_batch->rowCount != 0) { + pixels_writer->addRowBatch(row_batch); + row_batch->reset(); + } + pixels_writer->close(); + + // Read back and verify row group count and data + auto footerCache = std::make_shared(); + auto builder = std::make_shared(); + std::shared_ptr<::Storage> storage = StorageFactory::getInstance()->getStorage(::Storage::file); + std::shared_ptr pixels_reader = builder->setPath(target_file_path_)->setStorage(storage)->setPixelsFooterCache(footerCache)->build(); + + 
EXPECT_GE(pixels_reader->getRowGroupNum(), 3); + + PixelsReaderOption option; + option.setSkipCorruptRecords(false); + option.setTolerantSchemaEvolution(true); + option.setEnableEncodedColumnVector(true); + option.setBatchSize(10); + option.setRGRange(0,3); + option.setIncludeCols({"a"}); + auto recordReader = pixels_reader->read(option); + int count = 0; + while(true) { + auto rb = recordReader->readBatch(true); + if(rb == nullptr || rb->rowCount == 0) break; + auto v = std::static_pointer_cast(rb->cols[0]); + for(int i = 0; i < rb->rowCount; i++) { + EXPECT_EQ(((int*)v->intVector)[i], count); + count++; + } + } + EXPECT_EQ(count, total_rows); + ::BufferPool::Reset(); + +} + +TEST_F(PIXELS_WRITER_TEST, DISABLED_MULTI_FILE) +{ + auto schema = TypeDescription::fromString("struct"); + EXPECT_TRUE(schema); + std::vector encode_vector(1, true); + + std::string file1 = base_path + "example_1.pxl"; + std::string file2 = base_path + "example_2.pxl"; + + auto write_file = [&](const std::string& path, int start_val) { + auto row_batch = schema->createRowBatch(row_num, encode_vector); + EncodingLevel encoding_level{EncodingLevel::EL2}; + auto pixels_writer = std::make_unique(schema, pixels_stride_, row_group_size_, path, + block_size_, block_padding_, encoding_level, true, true, compression_block_size_); + auto va = std::dynamic_pointer_cast(row_batch->cols[0]); + for (int i = 0; i < row_num; ++i) { + va->add(start_val + i); + row_batch->rowCount++; + if (row_batch->rowCount == row_batch->getMaxSize()) { pixels_writer->addRowBatch(row_batch); row_batch->reset(); } - pixels_writer->close(); } - auto end_time_ts = std::chrono::high_resolution_clock::now(); - auto duration = end_time_ts - start_time_ts; - std::cerr << "[DEBUG] Time: " << duration.count() << std::endl; - } + if(row_batch->rowCount != 0) { + pixels_writer->addRowBatch(row_batch); + } + pixels_writer->close(); + }; + + write_file(file1, 0); + write_file(file2, 100); + + auto verify_file = [&](const std::string& 
path, int expected_start) { + auto footerCache = std::make_shared(); + auto builder = std::make_shared(); + std::shared_ptr<::Storage> storage = StorageFactory::getInstance()->getStorage(::Storage::file); + std::shared_ptr pixels_reader = builder->setPath(path)->setStorage(storage)->setPixelsFooterCache(footerCache)->build(); + + PixelsReaderOption option; + option.setSkipCorruptRecords(false); + option.setTolerantSchemaEvolution(true); + option.setEnableEncodedColumnVector(true); + option.setBatchSize(10); + option.setRGRange(0,1); + option.setIncludeCols({"a"}); + auto recordReader = pixels_reader->read(option); + int count = 0; + while(true) { + auto rb = recordReader->readBatch(true); + if(rb == nullptr || rb->rowCount == 0) break; + auto v = std::static_pointer_cast(rb->cols[0]); + for(int i = 0; i < rb->rowCount; i++) { + EXPECT_EQ(((int*)v->intVector)[i], expected_start + count); + count++; + } + } + EXPECT_EQ(count, row_num); + }; + + verify_file(file1, 0); + verify_file(file2, 100); + + cleanup(file1); + cleanup(file2); + ::BufferPool::Reset(); + } -TEST_F(PIXELS_WRITER_TEST, WRITE_TWO_COLUMN_WITHOUT_RUNLENENCODE) +TEST_F(PIXELS_WRITER_TEST, DISABLED_COMPREHENSIVE_TEST) { - auto schema = TypeDescription::fromString("struct"); + // 测试多种数据类型写入多个rowgroup 一次测试写入多个文件 + int total_rows = 50; + int rg_size = 10; + auto schema = TypeDescription::fromString("struct"); EXPECT_TRUE(schema); - std::vector encode_vector(2, true); - auto row_batch = schema->createRowBatch(row_group_size_, encode_vector); + std::vector encode_vector(6, true); + + std::string file1 = base_path + "example_comp1.pxl"; + std::string file2 = base_path+ "example_comp2.pxl"; - EncodingLevel encoding_level{EncodingLevel::EL0}; - bool nulls_padding = true; - bool partitioned = true; - auto pixels_writer = std::make_unique(schema, pixels_stride_, row_group_size_, target_file_path_, - block_size_, block_padding_, encoding_level, nulls_padding, partitioned, compression_block_size_); + auto 
write_comp_file = [&](const std::string& path, int start_val) { + auto row_batch = schema->createRowBatch(rg_size, encode_vector); + EncodingLevel encoding_level{EncodingLevel::EL2}; + auto pixels_writer = std::make_unique(schema, pixels_stride_, rg_size, path, + block_size_, block_padding_, encoding_level, true, true, compression_block_size_); + + auto v1 = std::dynamic_pointer_cast(row_batch->cols[0]); + auto v2 = std::dynamic_pointer_cast(row_batch->cols[1]); + auto v3 = std::dynamic_pointer_cast(row_batch->cols[2]); + auto v4 = std::dynamic_pointer_cast(row_batch->cols[3]); + auto v5 = std::dynamic_pointer_cast(row_batch->cols[4]); + auto v6 = std::dynamic_pointer_cast(row_batch->cols[5]); - /**======================= - * * INFO - * Write Row Batch - * - *========================**/ - { - auto va = std::dynamic_pointer_cast(row_batch->cols[0]); - auto vb = std::dynamic_pointer_cast(row_batch->cols[1]); - ASSERT_TRUE(va); - auto start_time_ts = std::chrono::high_resolution_clock::now(); - { - for (int i = 0; i < row_num; ++i) - { - va->add(i); - vb->add(i * i); - if (row_batch->rowCount == row_batch->getMaxSize()) - { - pixels_writer->addRowBatch(row_batch); - row_batch->reset(); - } - } - if(row_batch->rowCount!=0) { + for (int i = 0; i < total_rows; ++i) { + int val = start_val + i; + v1->add(val); + v2->add((long)val * 100); + std::string tmp = "str_" + std::to_string(val); + v3->add(tmp); + v4->add(val); // days from epoch + v5->add((long)val * 1000); // timestamp + v6->add((long)val); // decimal value + + row_batch->rowCount++; + if (row_batch->rowCount == row_batch->getMaxSize()) { pixels_writer->addRowBatch(row_batch); row_batch->reset(); } - pixels_writer->close(); } - auto end_time_ts = std::chrono::high_resolution_clock::now(); - auto duration = end_time_ts - start_time_ts; - std::cerr << "[DEBUG] Time: " << duration.count() << std::endl; - } -} \ No newline at end of file + if(row_batch->rowCount != 0) { + pixels_writer->addRowBatch(row_batch); + } + 
pixels_writer->close(); + }; + + write_comp_file(file1, 0); + write_comp_file(file2, 1000); + + auto verify_comp_file = [&](const std::string& path, int expected_start) { + auto footerCache = std::make_shared(); + auto builder = std::make_shared(); + std::shared_ptr<::Storage> storage = StorageFactory::getInstance()->getStorage(::Storage::file); + std::shared_ptr pixels_reader = builder->setPath(path)->setStorage(storage)->setPixelsFooterCache(footerCache)->build(); + + EXPECT_EQ(pixels_reader->getRowGroupNum(), total_rows / rg_size); + + PixelsReaderOption option; + option.setSkipCorruptRecords(false); + option.setTolerantSchemaEvolution(true); + option.setEnableEncodedColumnVector(true); + option.setBatchSize(10); + option.setRGRange(0,5); + option.setIncludeCols({"c1", "c2", "c3", "c4", "c5", "c6"}); + auto recordReader = pixels_reader->read(option); + int count = 0; + while(true) { + auto rb = recordReader->readBatch(true); + if(rb == nullptr || rb->rowCount == 0) break; + + auto rv1 = std::static_pointer_cast(rb->cols[0]); + auto rv2 = std::static_pointer_cast(rb->cols[1]); + auto rv3 = std::static_pointer_cast(rb->cols[2]); + auto rv4 = std::static_pointer_cast(rb->cols[3]); + auto rv5 = std::static_pointer_cast(rb->cols[4]); + auto rv6 = std::static_pointer_cast(rb->cols[5]); + + for(int i = 0; i < rb->rowCount; i++) { + int expected_val = expected_start + count; + EXPECT_EQ(((int*)rv1->intVector)[i], expected_val); + EXPECT_EQ(rv2->longVector[i], (long)expected_val * 100); + + std::string expected_str = "str_" + std::to_string(expected_val); + std::string actual_str(rv3->vector[i].GetString()); + EXPECT_EQ(actual_str, expected_str); + + EXPECT_EQ(rv4->dates[i], expected_val); + EXPECT_EQ(rv5->times[i], (long)expected_val * 1000); + EXPECT_EQ(rv6->vector[i], (long)expected_val); + + count++; + } + } + EXPECT_EQ(count, total_rows); + }; + + verify_comp_file(file1, 0); + verify_comp_file(file2, 1000); + + cleanup(file1); + cleanup(file2); + 
::BufferPool::Reset(); + +} diff --git a/cpp/third-party/flatbuffers b/cpp/third-party/flatbuffers new file mode 160000 index 0000000000..8914d06ab7 --- /dev/null +++ b/cpp/third-party/flatbuffers @@ -0,0 +1 @@ +Subproject commit 8914d06ab7123167424438cf293bb349833bcb7d diff --git a/proto/pixels.fbs b/proto/pixels.fbs new file mode 100644 index 0000000000..c23d59f1a9 --- /dev/null +++ b/proto/pixels.fbs @@ -0,0 +1,207 @@ +// File format definition of Pixels (FlatBuffers version) + +namespace pixels.fb; + +enum CompressionKind : byte { + NONE = 0, + ZLIB = 1, + SNAPPY = 2, + LZO = 3, + LZ4 = 4, + ZSTD = 5 +} + +enum TypeKind : byte { + BOOLEAN = 0, + BYTE = 1, + SHORT = 2, + INT = 3, + LONG = 4, + FLOAT = 5, + DOUBLE = 6, + STRING = 7, + BINARY = 8, + TIMESTAMP = 9, + ARRAY = 10, + MAP = 11, + STRUCT = 12, + VARBINARY = 13, + DECIMAL = 14, + DATE = 15, + VARCHAR = 16, + CHAR = 17, + TIME = 18, + VECTOR = 19 +} + +enum EncodingKind : byte { + NONE = 0, + RUNLENGTH = 1, + DICTIONARY = 2 +} + +// --- Statistics Tables --- + +table IntegerStatistic { + minimum: long; + maximum: long; + sum: long; +} + +table Integer128Statistic { + minimum_high: uint64; + minimum_low: uint64; + maximum_high: long; + maximum_low: long; +} + +table DoubleStatistic { + minimum: double; + maximum: double; + sum: double; +} + +table StringStatistic { + minimum: string; + maximum: string; + sum: long; +} + +table BucketStatistic { + count: [uint64]; +} + +table TimestampStatistic { + minimum: long; + maximum: long; +} + +table DateStatistic { + minimum: int; + maximum: int; +} + +table TimeStatistic { + minimum: int; + maximum: int; +} + +table BinaryStatistic { + sum: long; +} + +table ColumnStatistic { + numberOfValues: uint64; + intStatistics: IntegerStatistic; + doubleStatistics: DoubleStatistic; + stringStatistics: StringStatistic; + bucketStatistics: BucketStatistic; + binaryStatistics: BinaryStatistic; + timestampStatistics: TimestampStatistic; + dateStatistics: DateStatistic; + 
timeStatistics: TimeStatistic; + int128Statistics: Integer128Statistic; + hasNull: bool = false; +} + +table PixelStatistic { + statistic: ColumnStatistic; +} + +// --- Schema and Metadata --- + +table Type { + kind: TypeKind = BOOLEAN; + name: string; + subtypes: [uint]; + maximumLength: uint; + precision: uint; + scale: uint; + dimension: uint; +} + +table PartitionInformation { + columnIds: [uint]; + hashValue: int; +} + +table RowGroupInformation { + footerOffset: uint64; + dataLength: uint; + footerLength: uint; + numberOfRows: uint; + partitionInfo: PartitionInformation; +} + +table RowGroupStatistic { + columnChunkStats: [ColumnStatistic]; + hiddenColumnChunkStats: ColumnStatistic; +} + +// --- Index and Encoding --- + +table ColumnChunkIndex { + chunkOffset: uint64; + chunkLength: uint; + isNullOffset: uint; + pixelPositions: [uint]; + pixelStatistics: [PixelStatistic]; + littleEndian: bool = true; + nullsPadding: bool = false; + isNullAlignment: uint; +} + +table ColumnEncoding { + kind: EncodingKind = NONE; + dictionarySize: uint; + cascadeEncoding: ColumnEncoding; +} + +table RowGroupIndex { + columnChunkIndexEntries: [ColumnChunkIndex]; + hiddenColumnChunkIndexEntry: ColumnChunkIndex; +} + +table RowGroupEncoding { + columnChunkEncodings: [ColumnEncoding]; + hiddenColumnChunkEncoding: ColumnEncoding; +} + +table RowGroupFooter { + rowGroupIndexEntry: RowGroupIndex; + rowGroupEncoding: RowGroupEncoding; +} + +// --- File Tail Components --- + +table PostScript { + version: uint; + contentLength: uint64; + numberOfRows: uint; + compression: CompressionKind = NONE; + compressionBlockSize: uint; + pixelStride: uint; + writerTimezone: string; + partitioned: bool = false; + columnChunkAlignment: uint; + hasHiddenColumn: bool = false; + magic: string; // "PIXELS" +} + +table Footer { + types: [Type]; + columnStats: [ColumnStatistic]; + rowGroupInfos: [RowGroupInformation]; + rowGroupStats: [RowGroupStatistic]; + hiddenType: Type; + hiddenColumnStats: 
ColumnStatistic; +} + +table FileTail { + footer: Footer; + postscript: PostScript; + footerLength: uint; + postscriptLength: uint; +} + +root_type FileTail; diff --git a/proto/pixels.proto b/proto/pixels.proto index 3f4a0450d4..9732211c21 100644 --- a/proto/pixels.proto +++ b/proto/pixels.proto @@ -285,4 +285,4 @@ message ColumnEncoding { optional uint32 dictionarySize = 2; // the explicit cascade encoding scheme specified by pixels writer optional ColumnEncoding cascadeEncoding = 3; -} \ No newline at end of file +}