diff --git a/Project2-Character-Recognition/CMakeLists.txt b/Project2-Character-Recognition/CMakeLists.txt index 09e9198..f30cae9 100644 --- a/Project2-Character-Recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/CMakeLists.txt @@ -22,6 +22,8 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") endif() include_directories(.) +link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64) + add_subdirectory(character_recognition) cuda_add_executable(${CMAKE_PROJECT_NAME} @@ -30,6 +32,8 @@ cuda_add_executable(${CMAKE_PROJECT_NAME} ) target_link_libraries(${CMAKE_PROJECT_NAME} + curand + cublas character_recognition ${CORELIBS} ) diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md index 4503fac..8042e2a 100644 --- a/Project2-Character-Recognition/README.md +++ b/Project2-Character-Recognition/README.md @@ -3,12 +3,23 @@ CUDA Character Recognition **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Gangzheng Tong + * www.gtong.me +* Tested on: Windows 10, i7-8th Gen @ 2.2GHz 16GB, RTX 2070 8GB (Personal Laptop) -### (TODO: Your README) +![Screenshot](img/output.png) +![Screenshot](img/time_neurons.PNG) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Features Implemented +In this project I implemented the following features: +1. Loading data from files +2. Forward and backward propagation implemented on GPU +3. Wrap cuBLAS, thrust and my custom kernel into a Matrix struct and make it easy to use; could also be useful for future projects +4. Use C++ smart pointers to manage memory and avoid memory leak all at once +5. Test the time on different number of neurons +However, I'm not able to predict the character given training samples. 
The cost fluctuates between 0.2 and 0.3 and does not seem to drop within 40 iterations. I did unit tests on every kernel and didn't find anything wrong. Maybe it's due to the limited number of training samples or inappropriate initial weights and bias. + +### A Few Observations +1. GPU is capable of handling large throughput. With the increasing # of hidden neurons, the data becomes huge (10212 * 2048 floats for a weight matrix) but my RTX 2070 was able to complete one iteration under 2 seconds. That's 52 samples and a dozen big matrix operations. +2. C-Style matrix is typically row-major, but CUDA matrix is column-major. I spent a lot of time debugging the matrix dot product, being unaware of this. diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt index 7446175..c5e28b0 100644 --- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt @@ -7,5 +7,5 @@ set(SOURCE_FILES cuda_add_library(character_recognition ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_75 ) diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index 5a3ed7f..806142d 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -1,27 +1,354 @@ #include #include +#include +#include +#include +#include +#include #include "common.h" #include "mlp.h" +#include + +#define THREADS_PER_BLOCK 256 + namespace CharacterRecognition { - using Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - // TODO: __global__ - - /** - * Example of use case (follow how you did it in stream compaction) - */ - /*void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - 
timer().endGpuTimer(); - } - */ - - // TODO: implement required elements for MLP sections 1 and 2 here + using namespace std; + using Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + double learningRate; + shared_ptr W1, W2, B1, B2, IN, H, Y, X;; + int inputN, hiddenN, outputN; + + // Fill the array A(nr_rows_A, nr_cols_A) with random numbers on GPU + void GPU_fill_rand(float* A, int nr_rows_A, int nr_cols_A) { + curandGenerator_t prng; + curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_XORWOW); + curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock()); + // Fill the array with random numbers on the device + curandGenerateUniform(prng, A, nr_rows_A * nr_cols_A); + cudaDeviceSynchronize(); + checkCUDAError("GPU_fill_rand FAILED"); + } + + void GPU_fill_rand(Matrix* p_matrix) { + GPU_fill_rand(p_matrix->dev_data, p_matrix->numRow, p_matrix->numCol); + } + + __global__ void kernAdd(float* A, float* B, int n) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= n) return; + A[idx] += B[idx]; + } + + __global__ void kernSubtract(float* A, const float* B, int n) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= n) return; + A[idx] -= B[idx]; + } + + __global__ void kernSubtract(const float* A, const float* B, float*C, int n) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= n) return; + C[idx] = A[idx] - B[idx]; + } + + __global__ void kernDiffSquare(const float* A, const float* B, float* C, int n) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= n) return; + const float tmp = A[idx] - B[idx]; + C[idx] = tmp * tmp; + } + + __global__ void generate_in_a_b(float* A, float a, float b, int N) { + + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= N) return; + A[idx] = (b - a) * A[idx] + a; + } + + __global__ void generate_random_numbers(float* numbers, int N) { + + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= 
N) return; + curandState state; + curand_init(clock64(), i, 0, &state); + numbers[i] = curand_uniform(&state); + } + + __global__ void kernSigmoid(float* numbers, int N) { + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= N) return; + numbers[i] = 1.0 / (1.0 + std::exp(-numbers[i])); + } + + __global__ void kernSigmoidPrime(float* numbers, int N) { + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= N) return; + float tmp = std::exp(-numbers[i]); + numbers[i] = tmp / ( (1.f + tmp) * (1.f + tmp) ); + } + + __global__ void kernMultiply(float* numbers, float constant, int N) { + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= N) return; + numbers[i] *= constant; + } + + __global__ void kernMultiplyMatrix(float* numbers, const float* other, int N) { + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= N) return; + numbers[i] *= other[i]; + } + + + void Matrix::initWithRandom() { + GPU_fill_rand(dev_data, numRow, numCol); + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + float scale = 10.0f/std::sqrtf(n); // TODO: find a better initialization value!! + generate_in_a_b << > > (dev_data, -scale, scale, n); + } + + void Matrix::initWithZero() { + memset(data, 0, dataSize); + cudaMemset(dev_data, 0, dataSize); + } + + void Matrix::initWithTest() { + for (size_t row = 0; row < numRow; row++) { + for (size_t col = 0; col < numCol; col++) { + data[row * numCol + col] = (float)(row * numCol + col + 1.0f) / 10.f; + } + } + copyToDevice(); + } + + Matrix* Matrix::dot(const Matrix* other) const { + // C(m, n) = A(m, k) * B(k, n) + int m = numRow; + int k = numCol; + if (k != other->numRow) { + throw "Matrices not match"; + } + int n = other->numCol; + + Matrix* product = new Matrix(m, n); + // IMPORTANT!! 
+ // cuBLAS uses column-major order, so reverse the order to get the correct result + // C: A*B, cuBLAS: B*A + gpu_blas_mmul(other->dev_data, dev_data, product->dev_data, n, k, m); + return product; + } + + void Matrix::add(const Matrix* other) { + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernAdd<<>>(dev_data, other->dev_data, n); + } + + Matrix* Matrix::subtract(const Matrix* other) const{ + Matrix* output = new Matrix(numRow, numCol); + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernSubtract << > > (dev_data, other->dev_data, output->dev_data, n); + + return output; + } + + + Matrix* Matrix::diffSquare(const Matrix* other) const { + Matrix* output = new Matrix(numRow, numCol); + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernDiffSquare << > > (dev_data, other->dev_data, output->dev_data, n); + + return output; + } + + void Matrix::subtract_inplace(const Matrix* other) { + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernSubtract << > > (dev_data, other->dev_data, n); + } + + void Matrix::sigmoid() { + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernSigmoid << > > (dev_data, n); + } + + void Matrix::sigmoidePrime() { + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernSigmoidPrime << > > (dev_data, n); + } + + Matrix* Matrix::multiply(const float constant) { + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernMultiply<< > > (dev_data, constant, n); + return this; + } + + Matrix* Matrix::multiply(const Matrix* other) { + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernMultiplyMatrix << > > (dev_data, other->dev_data, n); + return this; + } + + Matrix* Matrix::transpose() 
const { + Matrix* trans = new Matrix(numCol , numRow); + int const m = numRow; + int const n = numCol; + float const alpha(1.0); + float const beta(0.0); + cublasHandle_t handle; + cublasCreate(&handle); + cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, &alpha, dev_data, n, &beta, trans->dev_data, m, trans->dev_data, m); + cublasDestroy(handle); + + return trans; + } + + + void init(int inputNeuron, int hiddenNeuron, int outputNeuron, double rate) + { + inputN = inputNeuron, hiddenN = hiddenNeuron, outputN = outputNeuron; + learningRate = rate; + + W1 = make_shared(inputNeuron, hiddenNeuron); + W2 = make_shared(hiddenNeuron, outputNeuron); + B1 = make_shared(1, hiddenNeuron); + B2 = make_shared(1, outputNeuron); + + W1->initWithRandom(); + W2->initWithRandom(); + B1->initWithZero(); + B2->initWithZero(); + } + + Matrix* computeOutput(const vector & input) { + vector> wrapper = { input }; + X = make_shared(wrapper); + X->copyToDevice(); + + H = shared_ptr( X->dot(W1.get()) ); + + H->add(B1.get()); + H->sigmoid(); + + Y = shared_ptr( H->dot(W2.get()) ); + Y->add(B2.get()); + Y->sigmoid(); + + return Y.get(); + } + + float learn(const vector expectedOutput) { + // Compute gradients + //dJdB2 = Y.subtract(Y2).multiply( H.dot(W2).add(B2).applyFunction(sigmoidePrime) ); + vector> wrapper = { expectedOutput }; + auto Y2 = std::make_unique(wrapper); // 1 x numOutput matrix + + auto dJdB2 = unique_ptr( Y->subtract(Y2.get()) ); + auto tmpH_W2 = unique_ptr( H->dot(W2.get()) ); + tmpH_W2->add(B2.get()); + tmpH_W2->sigmoidePrime(); + dJdB2->multiply(tmpH_W2.get()); + dJdB2->copyToHost(); + + // dJdB1 = dJdB2.dot(W2.transpose()).multiply(X.dot(W1).add(B1).applyFunction(sigmoidePrime)); + auto W2_trans = unique_ptr( W2->transpose() ); + auto dJdB1 = unique_ptr( dJdB2->dot(W2_trans.get()) ); + + auto tmpX_W1 = unique_ptr( X->dot(W1.get()) ); + tmpX_W1->add(B1.get()); + tmpX_W1->sigmoidePrime(); // X.dot(W1).add(B1).applyFunction(sigmoidePrime) + dJdB1->multiply(tmpX_W1.get()); 
+ + // dJdW2 = H.transpose().dot(dJdB2); + auto dJdW2_tmp = unique_ptr( H->transpose() ); + auto dJdW2 = unique_ptr( dJdW2_tmp->dot(dJdB2.get()) ); + + // dJdW1 = X.transpose().dot(dJdB1); + auto dJdW1_tmp = unique_ptr( X->transpose() ); + auto dJdW1 = unique_ptr( dJdW1_tmp->dot(dJdB1.get()) ); + + // update weights + W1->subtract_inplace(dJdW1->multiply(learningRate)); + W2->subtract_inplace(dJdW2->multiply(learningRate)); + B1->subtract_inplace(dJdB1->multiply(learningRate)); + B2->subtract_inplace(dJdB2->multiply(learningRate)); + + // Calculate cost + auto diffSqr = unique_ptr(Y2->diffSquare(Y.get())); + thrust::device_ptr thrust_diffSqr(diffSqr->dev_data); + int n = diffSqr->numCol * diffSqr->numCol; + float cost = thrust::reduce(thrust_diffSqr, thrust_diffSqr + n, 0.f, thrust::plus()); + cost = std::sqrtf(cost / n); + + return cost; + } + + void unitTest() + { + // Test transpose + Matrix m(3, 2); + Matrix n(2, 2); + + m.initWithTest(); + n.initWithTest(); + + m.print(); + n.print(); + + //Matrix* pt = m.transpose(); + //pt->copyToHost(); + //pt->print(); + + // Test multiply + //m.multiply(3.0f); + //m.copyToHost(); + //m.print(); + + // Test sigmoid + //m.sigmoid(); + //m.copyToHost(); + //m.print(); + + // Test Dot + Matrix* p = m.dot(&n); + p->copyToHost(); + p->print(); + + } + + // Took from + // https://solarianprogrammer.com/2012/05/31/matrix-multiplication-cuda-cublas-curand-thrust/ + // Multiply the arrays A and B on GPU and save the result in C + // C(m,n) = A(m,k) * B(k,n) + void gpu_blas_mmul(const float* A, const float* B, float* C, const int m, const int k, const int n) { + int lda = m, ldb = k, ldc = m; + const float alf = 1; + const float bet = 0; + const float* alpha = &alf; + const float* beta = &bet; + + // Create a handle for CUBLAS + cublasHandle_t handle; + cublasCreate(&handle); + + // Do the actual multiplication + cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + + // Destroy the handle + 
cublasDestroy(handle); + } } diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h index 2096228..1d422ed 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.h +++ b/Project2-Character-Recognition/character_recognition/mlp.h @@ -1,9 +1,91 @@ #pragma once #include "common.h" +#include +#include namespace CharacterRecognition { + using namespace std; Common::PerformanceTimer& timer(); + struct Matrix { + float* data; + float* dev_data; + int numRow; + int numCol; + size_t dataSize; + Matrix(int rows, int cols) : + numRow(rows), numCol(cols) + { + dataSize = rows * cols * sizeof(float); + data = (float*)malloc(dataSize); + cudaMalloc(&dev_data, dataSize); + } + + Matrix(const vector >& dataArr ) { + numRow = dataArr.size(); + if (!numRow) return; + numCol = dataArr[0].size(); + dataSize = numRow * numCol * sizeof(float); + data = (float*)malloc(dataSize); + cudaMalloc(&dev_data, dataSize); + + for (size_t row = 0; row < numRow; row++) { + for (size_t col = 0; col < numCol; col++) { + data[row * numCol + col] = dataArr[row][col]; + } + } + } + + void copyToHost() { + cudaMemcpy(data, dev_data, dataSize, cudaMemcpyDeviceToHost); + } + + void copyToDevice() { + cudaMemcpy(dev_data, data, dataSize, cudaMemcpyHostToDevice); + } + + void initWithRandom(); + void initWithTest(); + void initWithZero(); + + Matrix* dot(const Matrix* other) const; + Matrix* transpose() const; + Matrix* subtract(const Matrix* other) const; + Matrix* diffSquare(const Matrix* other) const; + + void add(const Matrix* other); + void sigmoid(); + void sigmoidePrime(); + void subtract_inplace(const Matrix* other); + Matrix* multiply(const float constant); + Matrix* multiply(const Matrix* other); + + + void print() { + std::cout << "-------- Matrix " << numRow << " X " << numCol << "-------- \n"; + for (int row = 0; row < numRow; row++) { + for (int col = 0; col < numCol; col++) { + printf("%f ", *(data + 
row*numCol + col)); + } + std::cout << std::endl; + } + } + + + ~Matrix() { + free(data); + cudaFree(dev_data); + } + }; // TODO: implement required elements for MLP sections 1 and 2 here + void init(int inputNeuron, int hiddenNeuron, int outputNeuron, double rate); + void GPU_fill_rand(float* A, int nr_rows_A, int nr_cols_A); + void GPU_fill_rand(Matrix* p_matrix); + void gpu_blas_mmul(const float* A, const float* B, float* C, const int m, const int k, const int n); + Matrix* computeOutput(const vector& input); + float learn(const vector expectedOutput); + + void unitTest(); + } diff --git a/Project2-Character-Recognition/img/output.png b/Project2-Character-Recognition/img/output.png new file mode 100644 index 0000000..0cef2dc Binary files /dev/null and b/Project2-Character-Recognition/img/output.png differ diff --git a/Project2-Character-Recognition/img/time_neurons.PNG b/Project2-Character-Recognition/img/time_neurons.PNG new file mode 100644 index 0000000..d62b13b Binary files /dev/null and b/Project2-Character-Recognition/img/time_neurons.PNG differ diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 11dd534..2830b07 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -11,142 +11,109 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +#include +#include +#include + + +const int INPUT_N = 10201; +const int HIDDEN_N = 64; + +const int trainingSize = 52; // How many samples to train + +using namespace std; + +void loadTrainingData(const string& dir, vector >& input, vector >& output) +{ + + input.resize(trainingSize); // 52 x 10201 + output.resize(trainingSize); // 52 x 52 + + for (size_t i = 1; i <= trainingSize; i++) { + string filename = to_string(i) + "info.txt"; + if 
(i < 10) { + filename = "0" + filename; + } + string filePath = dir + filename; + + ifstream file(filePath); + + if (file.is_open()) { + int tmp; + for (size_t c = 0; c < 2; c++) file >> tmp; + input[i-1].resize(INPUT_N); + for (size_t c = 0; c < INPUT_N; c++) { + file >> input[i-1][c]; + } + file.close(); + } + output[i - 1].resize(trainingSize, 0); + output[i - 1][i - 1] = 1; + } + + // Normalization + for (size_t i = 0; i < trainingSize; i++) { + float mean = 0.f; + float variance = 0; + for (int j = 0; j < INPUT_N; j++) { + mean += input[i][j]; + } + mean /= INPUT_N; + for (int j = 0; j < INPUT_N; j++) { + variance += (input[i][j] - mean) * (input[i][j] - mean); + } + float stdv = std::sqrt(variance / INPUT_N); + for (int j = 0; j < INPUT_N; j++) { + input[i][j] = (input[i][j]) / (stdv + 0.000001f); + } + } +} int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. 
- zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, 
power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
- zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + + //CharacterRecognition::unitTest(); + //system("pause"); // stop Win32 console from closing on exit + + const string dir = "..\\data-set\\"; + vector> input; + vector> output; + + loadTrainingData(dir, input, output); + CharacterRecognition::init(INPUT_N, HIDDEN_N, 
trainingSize, 0.01f); + + // compute output + std::vector inputArr = { 1, 2 }; + + // train on 10 iterations + for (int i = 0; i < 40; i++) + { + CharacterRecognition::timer().startCpuTimer(); + float cost; + for (int j = 0; j < input.size(); j++) // train all 52 samples + { + CharacterRecognition::Matrix* m = CharacterRecognition::computeOutput(input[j]); + cost = CharacterRecognition::learn(output[j]); + } + CharacterRecognition::timer().endCpuTimer(); + float time = CharacterRecognition::timer().getCpuElapsedTimeForPreviousOperation(); + cout << "#" << i + 1 << "/40 Cost: " << cost << " Took time: " << time << endl; + } + + // test + cout << "expected output : actual output" << endl; + for (int i = 0; i < input.size(); i++) // testing on last 10 examples + { + for (int j = 0; j < trainingSize; j++) + { + cout << output[i][j] << " "; + } + cout << endl; + + CharacterRecognition::Matrix* result = CharacterRecognition::computeOutput(input[i]); + result->copyToHost(); + result->print(); + } + system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; } diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md index 0e38ddb..d1faa51 100644 --- a/Project2-Stream-Compaction/README.md +++ b/Project2-Stream-Compaction/README.md @@ -3,12 +3,78 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Gangzheng Tong + * www.gtong.me +* Tested on: Windows 10, i7-8th Gen @ 2.2GHz 16GB, RTX 2070 8GB (Personal Laptop) -### (TODO: Your README) +![Screenshot](img/block_size.PNG) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) 
+![Screenshot](img/N.PNG) + +### Features Implemented +In this project I implemented all features required. Tested with up to `2^26` elements and all passed. +The project will run different scan and stream compaction algorithms and report the timing. +To make it easy to dump the testing, I added some automation code so it can be run with a batch file. + +### Questions +1. By trying different block sizes, I concluded that it performs the best when each block contains 256 threads. While theoretically my GPU can run up to 1024 threads concurrently, the number of threads is limited by other factors such as the number of registers. For this particular algorithm, it's possible that the GPU cannot schedule all 1024 threads to run at once. +2. For the detailed comparison see the chart above. It's obvious to see that the efficient scan outperforms the naive one as the array size gets larger. The thrust implementation is the fastest when the array size is huge. +3. The memory latency is clearly the bottleneck when the array size is small, which explains why even the thrust implementation failed to compete with the CPU one when the number of elements is smaller than `2^16`. +4. As array size grows, the memory bandwidth becomes the bottleneck, especially for the Naive implementation, which suffers greatly from the data incoherency. Thrust probably utilizes the shared memory and cache to overcome the limitation. + +Raw Output +``` +**************** +** SCAN TESTS ** +**************** + [ 17 29 47 28 31 5 24 33 0 16 12 15 20 ... 33 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 4.6704ms (std::chrono Measured) + [ 0 17 46 93 121 152 157 181 214 214 230 242 257 ... 25687915 25687948 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 1.6061ms (std::chrono Measured) + [ 0 17 46 93 121 152 157 181 214 214 230 242 257 ... 
25687815 25687841 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 1.48669ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 1.37837ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.649216ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.658624ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.182272ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.218432ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 1 3 2 3 1 2 1 0 2 2 3 0 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 2.5317ms (std::chrono Measured) + [ 1 1 3 2 3 1 2 1 2 2 3 3 2 ... 2 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 2.4355ms (std::chrono Measured) + [ 1 1 3 2 3 1 2 1 2 2 3 3 2 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 6.1908ms (std::chrono Measured) + [ 1 1 3 2 3 1 2 1 2 2 3 3 2 ... 
2 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 1.02813ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 1.06282ms (CUDA Measured) + passed +``` diff --git a/Project2-Stream-Compaction/img/N.PNG b/Project2-Stream-Compaction/img/N.PNG new file mode 100644 index 0000000..4b2e166 Binary files /dev/null and b/Project2-Stream-Compaction/img/N.PNG differ diff --git a/Project2-Stream-Compaction/img/block_size.PNG b/Project2-Stream-Compaction/img/block_size.PNG new file mode 100644 index 0000000..4f8e6c2 Binary files /dev/null and b/Project2-Stream-Compaction/img/block_size.PNG differ diff --git a/Project2-Stream-Compaction/output_data/BLOCK_SIZE.csv b/Project2-Stream-Compaction/output_data/BLOCK_SIZE.csv new file mode 100644 index 0000000..e69de29 diff --git a/Project2-Stream-Compaction/src/cxxopts.hpp b/Project2-Stream-Compaction/src/cxxopts.hpp new file mode 100644 index 0000000..ed3c6a2 --- /dev/null +++ b/Project2-Stream-Compaction/src/cxxopts.hpp @@ -0,0 +1,2207 @@ +/* + +Copyright (c) 2014, 2015, 2016, 2017 Jarryd Beck + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +*/ + +#ifndef CXXOPTS_HPP_INCLUDED +#define CXXOPTS_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cpp_lib_optional +#include +#define CXXOPTS_HAS_OPTIONAL +#endif + +#ifndef CXXOPTS_VECTOR_DELIMITER +#define CXXOPTS_VECTOR_DELIMITER ',' +#endif + +#define CXXOPTS__VERSION_MAJOR 2 +#define CXXOPTS__VERSION_MINOR 2 +#define CXXOPTS__VERSION_PATCH 0 + +namespace cxxopts +{ + static constexpr struct { + uint8_t major, minor, patch; + } version = { + CXXOPTS__VERSION_MAJOR, + CXXOPTS__VERSION_MINOR, + CXXOPTS__VERSION_PATCH + }; +} + +//when we ask cxxopts to use Unicode, help strings are processed using ICU, +//which results in the correct lengths being computed for strings when they +//are formatted for the help output +//it is necessary to make sure that can be found by the +//compiler, and that icu-uc is linked in to the binary. 
+ +#ifdef CXXOPTS_USE_UNICODE +#include + +namespace cxxopts +{ + typedef icu::UnicodeString String; + + inline + String + toLocalString(std::string s) + { + return icu::UnicodeString::fromUTF8(std::move(s)); + } + + class UnicodeStringIterator : public + std::iterator + { + public: + + UnicodeStringIterator(const icu::UnicodeString* string, int32_t pos) + : s(string) + , i(pos) + { + } + + value_type + operator*() const + { + return s->char32At(i); + } + + bool + operator==(const UnicodeStringIterator& rhs) const + { + return s == rhs.s && i == rhs.i; + } + + bool + operator!=(const UnicodeStringIterator& rhs) const + { + return !(*this == rhs); + } + + UnicodeStringIterator& + operator++() + { + ++i; + return *this; + } + + UnicodeStringIterator + operator+(int32_t v) + { + return UnicodeStringIterator(s, i + v); + } + + private: + const icu::UnicodeString* s; + int32_t i; + }; + + inline + String& + stringAppend(String&s, String a) + { + return s.append(std::move(a)); + } + + inline + String& + stringAppend(String& s, int n, UChar32 c) + { + for (int i = 0; i != n; ++i) + { + s.append(c); + } + + return s; + } + + template + String& + stringAppend(String& s, Iterator begin, Iterator end) + { + while (begin != end) + { + s.append(*begin); + ++begin; + } + + return s; + } + + inline + size_t + stringLength(const String& s) + { + return s.length(); + } + + inline + std::string + toUTF8String(const String& s) + { + std::string result; + s.toUTF8String(result); + + return result; + } + + inline + bool + empty(const String& s) + { + return s.isEmpty(); + } +} + +namespace std +{ + inline + cxxopts::UnicodeStringIterator + begin(const icu::UnicodeString& s) + { + return cxxopts::UnicodeStringIterator(&s, 0); + } + + inline + cxxopts::UnicodeStringIterator + end(const icu::UnicodeString& s) + { + return cxxopts::UnicodeStringIterator(&s, s.length()); + } +} + +//ifdef CXXOPTS_USE_UNICODE +#else + +namespace cxxopts +{ + typedef std::string String; + + template + T + 
toLocalString(T&& t) + { + return std::forward(t); + } + + inline + size_t + stringLength(const String& s) + { + return s.length(); + } + + inline + String& + stringAppend(String&s, String a) + { + return s.append(std::move(a)); + } + + inline + String& + stringAppend(String& s, size_t n, char c) + { + return s.append(n, c); + } + + template + String& + stringAppend(String& s, Iterator begin, Iterator end) + { + return s.append(begin, end); + } + + template + std::string + toUTF8String(T&& t) + { + return std::forward(t); + } + + inline + bool + empty(const std::string& s) + { + return s.empty(); + } +} + +//ifdef CXXOPTS_USE_UNICODE +#endif + +namespace cxxopts +{ + namespace + { +#ifdef _WIN32 + const std::string LQUOTE("\'"); + const std::string RQUOTE("\'"); +#else + const std::string LQUOTE("‘"); + const std::string RQUOTE("’"); +#endif + } + + class Value : public std::enable_shared_from_this + { + public: + + virtual ~Value() = default; + + virtual + std::shared_ptr + clone() const = 0; + + virtual void + parse(const std::string& text) const = 0; + + virtual void + parse() const = 0; + + virtual bool + has_default() const = 0; + + virtual bool + is_container() const = 0; + + virtual bool + has_implicit() const = 0; + + virtual std::string + get_default_value() const = 0; + + virtual std::string + get_implicit_value() const = 0; + + virtual std::shared_ptr + default_value(const std::string& value) = 0; + + virtual std::shared_ptr + implicit_value(const std::string& value) = 0; + + virtual std::shared_ptr + no_implicit_value() = 0; + + virtual bool + is_boolean() const = 0; + }; + + class OptionException : public std::exception + { + public: + OptionException(const std::string& message) + : m_message(message) + { + } + + virtual const char* + what() const noexcept + { + return m_message.c_str(); + } + + private: + std::string m_message; + }; + + class OptionSpecException : public OptionException + { + public: + + OptionSpecException(const std::string& message) 
+ : OptionException(message) + { + } + }; + + class OptionParseException : public OptionException + { + public: + OptionParseException(const std::string& message) + : OptionException(message) + { + } + }; + + class option_exists_error : public OptionSpecException + { + public: + option_exists_error(const std::string& option) + : OptionSpecException("Option " + LQUOTE + option + RQUOTE + " already exists") + { + } + }; + + class invalid_option_format_error : public OptionSpecException + { + public: + invalid_option_format_error(const std::string& format) + : OptionSpecException("Invalid option format " + LQUOTE + format + RQUOTE) + { + } + }; + + class option_syntax_exception : public OptionParseException { + public: + option_syntax_exception(const std::string& text) + : OptionParseException("Argument " + LQUOTE + text + RQUOTE + + " starts with a - but has incorrect syntax") + { + } + }; + + class option_not_exists_exception : public OptionParseException + { + public: + option_not_exists_exception(const std::string& option) + : OptionParseException("Option " + LQUOTE + option + RQUOTE + " does not exist") + { + } + }; + + class missing_argument_exception : public OptionParseException + { + public: + missing_argument_exception(const std::string& option) + : OptionParseException( + "Option " + LQUOTE + option + RQUOTE + " is missing an argument" + ) + { + } + }; + + class option_requires_argument_exception : public OptionParseException + { + public: + option_requires_argument_exception(const std::string& option) + : OptionParseException( + "Option " + LQUOTE + option + RQUOTE + " requires an argument" + ) + { + } + }; + + class option_not_has_argument_exception : public OptionParseException + { + public: + option_not_has_argument_exception + ( + const std::string& option, + const std::string& arg + ) + : OptionParseException( + "Option " + LQUOTE + option + RQUOTE + + " does not take an argument, but argument " + + LQUOTE + arg + RQUOTE + " given" + ) + { + } + }; + 
+ class option_not_present_exception : public OptionParseException + { + public: + option_not_present_exception(const std::string& option) + : OptionParseException("Option " + LQUOTE + option + RQUOTE + " not present") + { + } + }; + + class argument_incorrect_type : public OptionParseException + { + public: + argument_incorrect_type + ( + const std::string& arg + ) + : OptionParseException( + "Argument " + LQUOTE + arg + RQUOTE + " failed to parse" + ) + { + } + }; + + class option_required_exception : public OptionParseException + { + public: + option_required_exception(const std::string& option) + : OptionParseException( + "Option " + LQUOTE + option + RQUOTE + " is required but not present" + ) + { + } + }; + + template + void throw_or_mimic(const std::string& text) + { + static_assert(std::is_base_of::value, + "throw_or_mimic only works on std::exception and " + "deriving classes"); + +#ifndef CXXOPTS_NO_EXCEPTIONS + // If CXXOPTS_NO_EXCEPTIONS is not defined, just throw + throw T{text}; +#else + // Otherwise manually instantiate the exception, print what() to stderr, + // and abort + T exception{text}; + std::cerr << exception.what() << std::endl; + std::cerr << "Aborting (exceptions disabled)..." 
<< std::endl; + std::abort(); +#endif + } + + namespace values + { + namespace + { + std::basic_regex integer_pattern + ("(-)?(0x)?([0-9a-zA-Z]+)|((0x)?0)"); + std::basic_regex truthy_pattern + ("(t|T)(rue)?|1"); + std::basic_regex falsy_pattern + ("(f|F)(alse)?|0"); + } + + namespace detail + { + template + struct SignedCheck; + + template + struct SignedCheck + { + template + void + operator()(bool negative, U u, const std::string& text) + { + if (negative) + { + if (u > static_cast((std::numeric_limits::min)())) + { + throw_or_mimic(text); + } + } + else + { + if (u > static_cast((std::numeric_limits::max)())) + { + throw_or_mimic(text); + } + } + } + }; + + template + struct SignedCheck + { + template + void + operator()(bool, U, const std::string&) {} + }; + + template + void + check_signed_range(bool negative, U value, const std::string& text) + { + SignedCheck::is_signed>()(negative, value, text); + } + } + + template + R + checked_negate(T&& t, const std::string&, std::true_type) + { + // if we got to here, then `t` is a positive number that fits into + // `R`. So to avoid MSVC C4146, we first cast it to `R`. + // See https://github.com/jarro2783/cxxopts/issues/62 for more details. + return -static_cast(t-1)-1; + } + + template + T + checked_negate(T&& t, const std::string& text, std::false_type) + { + throw_or_mimic(text); + return t; + } + + template + void + integer_parser(const std::string& text, T& value) + { + std::smatch match; + std::regex_match(text, match, integer_pattern); + + if (match.length() == 0) + { + throw_or_mimic(text); + } + + if (match.length(4) > 0) + { + value = 0; + return; + } + + using US = typename std::make_unsigned::type; + + constexpr bool is_signed = std::numeric_limits::is_signed; + const bool negative = match.length(1) > 0; + const uint8_t base = match.length(2) > 0 ? 
16 : 10; + + auto value_match = match[3]; + + US result = 0; + + for (auto iter = value_match.first; iter != value_match.second; ++iter) + { + US digit = 0; + + if (*iter >= '0' && *iter <= '9') + { + digit = static_cast(*iter - '0'); + } + else if (base == 16 && *iter >= 'a' && *iter <= 'f') + { + digit = static_cast(*iter - 'a' + 10); + } + else if (base == 16 && *iter >= 'A' && *iter <= 'F') + { + digit = static_cast(*iter - 'A' + 10); + } + else + { + throw_or_mimic(text); + } + + US next = result * base + digit; + if (result > next) + { + throw_or_mimic(text); + } + + result = next; + } + + detail::check_signed_range(negative, result, text); + + if (negative) + { + value = checked_negate(result, + text, + std::integral_constant()); + } + else + { + value = static_cast(result); + } + } + + template + void stringstream_parser(const std::string& text, T& value) + { + std::stringstream in(text); + in >> value; + if (!in) { + throw_or_mimic(text); + } + } + + inline + void + parse_value(const std::string& text, uint8_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, int8_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, uint16_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, int16_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, uint32_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, int32_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, uint64_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, int64_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, bool& value) + { + std::smatch result; + std::regex_match(text, result, 
truthy_pattern); + + if (!result.empty()) + { + value = true; + return; + } + + std::regex_match(text, result, falsy_pattern); + if (!result.empty()) + { + value = false; + return; + } + + throw_or_mimic(text); + } + + inline + void + parse_value(const std::string& text, std::string& value) + { + value = text; + } + + // The fallback parser. It uses the stringstream parser to parse all types + // that have not been overloaded explicitly. It has to be placed in the + // source code before all other more specialized templates. + template + void + parse_value(const std::string& text, T& value) { + stringstream_parser(text, value); + } + + template + void + parse_value(const std::string& text, std::vector& value) + { + std::stringstream in(text); + std::string token; + while(in.eof() == false && std::getline(in, token, CXXOPTS_VECTOR_DELIMITER)) { + T v; + parse_value(token, v); + value.emplace_back(std::move(v)); + } + } + +#ifdef CXXOPTS_HAS_OPTIONAL + template + void + parse_value(const std::string& text, std::optional& value) + { + T result; + parse_value(text, result); + value = std::move(result); + } +#endif + + inline + void parse_value(const std::string& text, char& c) + { + if (text.length() != 1) + { + throw_or_mimic(text); + } + + c = text[0]; + } + + template + struct type_is_container + { + static constexpr bool value = false; + }; + + template + struct type_is_container> + { + static constexpr bool value = true; + }; + + template + class abstract_value : public Value + { + using Self = abstract_value; + + public: + abstract_value() + : m_result(std::make_shared()) + , m_store(m_result.get()) + { + } + + abstract_value(T* t) + : m_store(t) + { + } + + virtual ~abstract_value() = default; + + abstract_value(const abstract_value& rhs) + { + if (rhs.m_result) + { + m_result = std::make_shared(); + m_store = m_result.get(); + } + else + { + m_store = rhs.m_store; + } + + m_default = rhs.m_default; + m_implicit = rhs.m_implicit; + m_default_value = 
rhs.m_default_value; + m_implicit_value = rhs.m_implicit_value; + } + + void + parse(const std::string& text) const + { + parse_value(text, *m_store); + } + + bool + is_container() const + { + return type_is_container::value; + } + + void + parse() const + { + parse_value(m_default_value, *m_store); + } + + bool + has_default() const + { + return m_default; + } + + bool + has_implicit() const + { + return m_implicit; + } + + std::shared_ptr + default_value(const std::string& value) + { + m_default = true; + m_default_value = value; + return shared_from_this(); + } + + std::shared_ptr + implicit_value(const std::string& value) + { + m_implicit = true; + m_implicit_value = value; + return shared_from_this(); + } + + std::shared_ptr + no_implicit_value() + { + m_implicit = false; + return shared_from_this(); + } + + std::string + get_default_value() const + { + return m_default_value; + } + + std::string + get_implicit_value() const + { + return m_implicit_value; + } + + bool + is_boolean() const + { + return std::is_same::value; + } + + const T& + get() const + { + if (m_store == nullptr) + { + return *m_result; + } + else + { + return *m_store; + } + } + + protected: + std::shared_ptr m_result; + T* m_store; + + bool m_default = false; + bool m_implicit = false; + + std::string m_default_value; + std::string m_implicit_value; + }; + + template + class standard_value : public abstract_value + { + public: + using abstract_value::abstract_value; + + std::shared_ptr + clone() const + { + return std::make_shared>(*this); + } + }; + + template <> + class standard_value : public abstract_value + { + public: + ~standard_value() = default; + + standard_value() + { + set_default_and_implicit(); + } + + standard_value(bool* b) + : abstract_value(b) + { + set_default_and_implicit(); + } + + std::shared_ptr + clone() const + { + return std::make_shared>(*this); + } + + private: + + void + set_default_and_implicit() + { + m_default = true; + m_default_value = "false"; + 
m_implicit = true; + m_implicit_value = "true"; + } + }; + } + + template + std::shared_ptr + value() + { + return std::make_shared>(); + } + + template + std::shared_ptr + value(T& t) + { + return std::make_shared>(&t); + } + + class OptionAdder; + + class OptionDetails + { + public: + OptionDetails + ( + const std::string& short_, + const std::string& long_, + const String& desc, + std::shared_ptr val + ) + : m_short(short_) + , m_long(long_) + , m_desc(desc) + , m_value(val) + , m_count(0) + { + } + + OptionDetails(const OptionDetails& rhs) + : m_desc(rhs.m_desc) + , m_count(rhs.m_count) + { + m_value = rhs.m_value->clone(); + } + + OptionDetails(OptionDetails&& rhs) = default; + + const String& + description() const + { + return m_desc; + } + + const Value& value() const { + return *m_value; + } + + std::shared_ptr + make_storage() const + { + return m_value->clone(); + } + + const std::string& + short_name() const + { + return m_short; + } + + const std::string& + long_name() const + { + return m_long; + } + + private: + std::string m_short; + std::string m_long; + String m_desc; + std::shared_ptr m_value; + int m_count; + }; + + struct HelpOptionDetails + { + std::string s; + std::string l; + String desc; + bool has_default; + std::string default_value; + bool has_implicit; + std::string implicit_value; + std::string arg_help; + bool is_container; + bool is_boolean; + }; + + struct HelpGroupDetails + { + std::string name; + std::string description; + std::vector options; + }; + + class OptionValue + { + public: + void + parse + ( + std::shared_ptr details, + const std::string& text + ) + { + ensure_value(details); + ++m_count; + m_value->parse(text); + } + + void + parse_default(std::shared_ptr details) + { + ensure_value(details); + m_default = true; + m_value->parse(); + } + + size_t + count() const noexcept + { + return m_count; + } + + // TODO: maybe default options should count towards the number of arguments + bool + has_default() const noexcept + { + 
return m_default; + } + + template + const T& + as() const + { + if (m_value == nullptr) { + throw_or_mimic("No value"); + } + +#ifdef CXXOPTS_NO_RTTI + return static_cast&>(*m_value).get(); +#else + return dynamic_cast&>(*m_value).get(); +#endif + } + + private: + void + ensure_value(std::shared_ptr details) + { + if (m_value == nullptr) + { + m_value = details->make_storage(); + } + } + + std::shared_ptr m_value; + size_t m_count = 0; + bool m_default = false; + }; + + class KeyValue + { + public: + KeyValue(std::string key_, std::string value_) + : m_key(std::move(key_)) + , m_value(std::move(value_)) + { + } + + const + std::string& + key() const + { + return m_key; + } + + const + std::string& + value() const + { + return m_value; + } + + template + T + as() const + { + T result; + values::parse_value(m_value, result); + return result; + } + + private: + std::string m_key; + std::string m_value; + }; + + class ParseResult + { + public: + + ParseResult( + const std::shared_ptr< + std::unordered_map> + >, + std::vector, + bool allow_unrecognised, + int&, char**&); + + size_t + count(const std::string& o) const + { + auto iter = m_options->find(o); + if (iter == m_options->end()) + { + return 0; + } + + auto riter = m_results.find(iter->second); + + return riter->second.count(); + } + + const OptionValue& + operator[](const std::string& option) const + { + auto iter = m_options->find(option); + + if (iter == m_options->end()) + { + throw_or_mimic(option); + } + + auto riter = m_results.find(iter->second); + + return riter->second; + } + + const std::vector& + arguments() const + { + return m_sequential; + } + + private: + + void + parse(int& argc, char**& argv); + + void + add_to_option(const std::string& option, const std::string& arg); + + bool + consume_positional(std::string a); + + void + parse_option + ( + std::shared_ptr value, + const std::string& name, + const std::string& arg = "" + ); + + void + parse_default(std::shared_ptr details); + + void + 
checked_parse_arg + ( + int argc, + char* argv[], + int& current, + std::shared_ptr value, + const std::string& name + ); + + const std::shared_ptr< + std::unordered_map> + > m_options; + std::vector m_positional; + std::vector::iterator m_next_positional; + std::unordered_set m_positional_set; + std::unordered_map, OptionValue> m_results; + + bool m_allow_unrecognised; + + std::vector m_sequential; + }; + + struct Option + { + Option + ( + const std::string& opts, + const std::string& desc, + const std::shared_ptr& value = ::cxxopts::value(), + const std::string& arg_help = "" + ) + : opts_(opts) + , desc_(desc) + , value_(value) + , arg_help_(arg_help) + { + } + + std::string opts_; + std::string desc_; + std::shared_ptr value_; + std::string arg_help_; + }; + + class Options + { + typedef std::unordered_map> + OptionMap; + public: + + Options(std::string program, std::string help_string = "") + : m_program(std::move(program)) + , m_help_string(toLocalString(std::move(help_string))) + , m_custom_help("[OPTION...]") + , m_positional_help("positional parameters") + , m_show_positional(false) + , m_allow_unrecognised(false) + , m_options(std::make_shared()) + , m_next_positional(m_positional.end()) + { + } + + Options& + positional_help(std::string help_text) + { + m_positional_help = std::move(help_text); + return *this; + } + + Options& + custom_help(std::string help_text) + { + m_custom_help = std::move(help_text); + return *this; + } + + Options& + show_positional_help() + { + m_show_positional = true; + return *this; + } + + Options& + allow_unrecognised_options() + { + m_allow_unrecognised = true; + return *this; + } + + ParseResult + parse(int& argc, char**& argv); + + OptionAdder + add_options(std::string group = ""); + + void + add_options + ( + const std::string& group, + std::initializer_list