diff --git a/Project2-Character-Recognition/CMakeLists.txt b/Project2-Character-Recognition/CMakeLists.txt index 09e9198..f30cae9 100644 --- a/Project2-Character-Recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/CMakeLists.txt @@ -22,6 +22,8 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") endif() include_directories(.) +link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64) + add_subdirectory(character_recognition) cuda_add_executable(${CMAKE_PROJECT_NAME} @@ -30,6 +32,8 @@ cuda_add_executable(${CMAKE_PROJECT_NAME} ) target_link_libraries(${CMAKE_PROJECT_NAME} + curand + cublas character_recognition ${CORELIBS} ) diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md index 4503fac..8042e2a 100644 --- a/Project2-Character-Recognition/README.md +++ b/Project2-Character-Recognition/README.md @@ -3,12 +3,23 @@ CUDA Character Recognition **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Gangzheng Tong + * www.gtong.me +* Tested on: Windows 10, i7-8th Gen @ 2.2GHz 16GB, RTX 2070 8GB (Personal Laptop) -### (TODO: Your README) +![Screenshot](img/output.png) +![Screenshot](img/time_neurons.PNG) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Features Implemented +In this project I implemented the following features: +1. Loading data from files +2. Forward and backward propagation implemented on GPU +3. Wrap cuBLAS, thrust and my custom kernel into a Matrix struct and make it easy to use; could also be useful for future projects +4. Use C++ smart pointers to manage memory and avoid memory leak all at once +5. Test the time on different number of neurons +However, I'm not able to predict the character given training samples. 
The cost fluctuates between 0.2 and 0.3 and does not seem to drop within 40 iterations. I did unit tests on every kernel and didn't find anything wrong. Maybe it's due to the limited number of training samples or inappropriate initial weights and bias. + +### A Few Observations +1. GPU is capable of handling large throughput. With the increasing # of hidden neurons, the data becomes huge (10212 * 2048 floats for a weight matrix) but my RTX 2070 was able to complete one iteration under 2 seconds. That's 52 samples and a dozen big matrix operations. +2. C-Style matrix is typically row-major, but CUDA matrix is column-major. I spent a lot of time debugging the matrix dot product, being unaware of this. diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt index 7446175..c5e28b0 100644 --- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt @@ -7,5 +7,5 @@ set(SOURCE_FILES cuda_add_library(character_recognition ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_75 ) diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index 5a3ed7f..806142d 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -1,27 +1,354 @@ #include #include +#include +#include +#include +#include +#include #include "common.h" #include "mlp.h" +#include + +#define THREADS_PER_BLOCK 256 + namespace CharacterRecognition { - using Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - // TODO: __global__ - - /** - * Example of use case (follow how you did it in stream compaction) - */ - /*void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - 
timer().endGpuTimer(); - } - */ - - // TODO: implement required elements for MLP sections 1 and 2 here + using namespace std; + using Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + double learningRate; + shared_ptr W1, W2, B1, B2, IN, H, Y, X;; + int inputN, hiddenN, outputN; + + // Fill the array A(nr_rows_A, nr_cols_A) with random numbers on GPU + void GPU_fill_rand(float* A, int nr_rows_A, int nr_cols_A) { + curandGenerator_t prng; + curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_XORWOW); + curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock()); + // Fill the array with random numbers on the device + curandGenerateUniform(prng, A, nr_rows_A * nr_cols_A); + cudaDeviceSynchronize(); + checkCUDAError("GPU_fill_rand FAILED"); + } + + void GPU_fill_rand(Matrix* p_matrix) { + GPU_fill_rand(p_matrix->dev_data, p_matrix->numRow, p_matrix->numCol); + } + + __global__ void kernAdd(float* A, float* B, int n) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= n) return; + A[idx] += B[idx]; + } + + __global__ void kernSubtract(float* A, const float* B, int n) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= n) return; + A[idx] -= B[idx]; + } + + __global__ void kernSubtract(const float* A, const float* B, float*C, int n) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= n) return; + C[idx] = A[idx] - B[idx]; + } + + __global__ void kernDiffSquare(const float* A, const float* B, float* C, int n) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= n) return; + const float tmp = A[idx] - B[idx]; + C[idx] = tmp * tmp; + } + + __global__ void generate_in_a_b(float* A, float a, float b, int N) { + + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= N) return; + A[idx] = (b - a) * A[idx] + a; + } + + __global__ void generate_random_numbers(float* numbers, int N) { + + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= 
N) return; + curandState state; + curand_init(clock64(), i, 0, &state); + numbers[i] = curand_uniform(&state); + } + + __global__ void kernSigmoid(float* numbers, int N) { + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= N) return; + numbers[i] = 1.0 / (1.0 + std::exp(-numbers[i])); + } + + __global__ void kernSigmoidPrime(float* numbers, int N) { + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= N) return; + float tmp = std::exp(-numbers[i]); + numbers[i] = tmp / ( (1.f + tmp) * (1.f + tmp) ); + } + + __global__ void kernMultiply(float* numbers, float constant, int N) { + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= N) return; + numbers[i] *= constant; + } + + __global__ void kernMultiplyMatrix(float* numbers, const float* other, int N) { + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= N) return; + numbers[i] *= other[i]; + } + + + void Matrix::initWithRandom() { + GPU_fill_rand(dev_data, numRow, numCol); + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + float scale = 10.0f/std::sqrtf(n); // TODO: find a better initialization value!! + generate_in_a_b << > > (dev_data, -scale, scale, n); + } + + void Matrix::initWithZero() { + memset(data, 0, dataSize); + cudaMemset(dev_data, 0, dataSize); + } + + void Matrix::initWithTest() { + for (size_t row = 0; row < numRow; row++) { + for (size_t col = 0; col < numCol; col++) { + data[row * numCol + col] = (float)(row * numCol + col + 1.0f) / 10.f; + } + } + copyToDevice(); + } + + Matrix* Matrix::dot(const Matrix* other) const { + // C(m, n) = A(m, k) * B(k, n) + int m = numRow; + int k = numCol; + if (k != other->numRow) { + throw "Matrices not match"; + } + int n = other->numCol; + + Matrix* product = new Matrix(m, n); + // IMPORTANT!! 
+ // cuBLAS uses column-major order, so reverse the order to get the correct result + // C: A*B, cuBLAS: B*A + gpu_blas_mmul(other->dev_data, dev_data, product->dev_data, n, k, m); + return product; + } + + void Matrix::add(const Matrix* other) { + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernAdd<<>>(dev_data, other->dev_data, n); + } + + Matrix* Matrix::subtract(const Matrix* other) const{ + Matrix* output = new Matrix(numRow, numCol); + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernSubtract << > > (dev_data, other->dev_data, output->dev_data, n); + + return output; + } + + + Matrix* Matrix::diffSquare(const Matrix* other) const { + Matrix* output = new Matrix(numRow, numCol); + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernDiffSquare << > > (dev_data, other->dev_data, output->dev_data, n); + + return output; + } + + void Matrix::subtract_inplace(const Matrix* other) { + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernSubtract << > > (dev_data, other->dev_data, n); + } + + void Matrix::sigmoid() { + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernSigmoid << > > (dev_data, n); + } + + void Matrix::sigmoidePrime() { + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernSigmoidPrime << > > (dev_data, n); + } + + Matrix* Matrix::multiply(const float constant) { + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernMultiply<< > > (dev_data, constant, n); + return this; + } + + Matrix* Matrix::multiply(const Matrix* other) { + int n = numRow * numCol; + int blockSize = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + kernMultiplyMatrix << > > (dev_data, other->dev_data, n); + return this; + } + + Matrix* Matrix::transpose() 
const { + Matrix* trans = new Matrix(numCol , numRow); + int const m = numRow; + int const n = numCol; + float const alpha(1.0); + float const beta(0.0); + cublasHandle_t handle; + cublasCreate(&handle); + cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, &alpha, dev_data, n, &beta, trans->dev_data, m, trans->dev_data, m); + cublasDestroy(handle); + + return trans; + } + + + void init(int inputNeuron, int hiddenNeuron, int outputNeuron, double rate) + { + inputN = inputNeuron, hiddenN = hiddenNeuron, outputN = outputNeuron; + learningRate = rate; + + W1 = make_shared(inputNeuron, hiddenNeuron); + W2 = make_shared(hiddenNeuron, outputNeuron); + B1 = make_shared(1, hiddenNeuron); + B2 = make_shared(1, outputNeuron); + + W1->initWithRandom(); + W2->initWithRandom(); + B1->initWithZero(); + B2->initWithZero(); + } + + Matrix* computeOutput(const vector & input) { + vector> wrapper = { input }; + X = make_shared(wrapper); + X->copyToDevice(); + + H = shared_ptr( X->dot(W1.get()) ); + + H->add(B1.get()); + H->sigmoid(); + + Y = shared_ptr( H->dot(W2.get()) ); + Y->add(B2.get()); + Y->sigmoid(); + + return Y.get(); + } + + float learn(const vector expectedOutput) { + // Compute gradients + //dJdB2 = Y.subtract(Y2).multiply( H.dot(W2).add(B2).applyFunction(sigmoidePrime) ); + vector> wrapper = { expectedOutput }; + auto Y2 = std::make_unique(wrapper); // 1 x numOutput matrix + + auto dJdB2 = unique_ptr( Y->subtract(Y2.get()) ); + auto tmpH_W2 = unique_ptr( H->dot(W2.get()) ); + tmpH_W2->add(B2.get()); + tmpH_W2->sigmoidePrime(); + dJdB2->multiply(tmpH_W2.get()); + dJdB2->copyToHost(); + + // dJdB1 = dJdB2.dot(W2.transpose()).multiply(X.dot(W1).add(B1).applyFunction(sigmoidePrime)); + auto W2_trans = unique_ptr( W2->transpose() ); + auto dJdB1 = unique_ptr( dJdB2->dot(W2_trans.get()) ); + + auto tmpX_W1 = unique_ptr( X->dot(W1.get()) ); + tmpX_W1->add(B1.get()); + tmpX_W1->sigmoidePrime(); // X.dot(W1).add(B1).applyFunction(sigmoidePrime) + dJdB1->multiply(tmpX_W1.get()); 
+ + // dJdW2 = H.transpose().dot(dJdB2); + auto dJdW2_tmp = unique_ptr( H->transpose() ); + auto dJdW2 = unique_ptr( dJdW2_tmp->dot(dJdB2.get()) ); + + // dJdW1 = X.transpose().dot(dJdB1); + auto dJdW1_tmp = unique_ptr( X->transpose() ); + auto dJdW1 = unique_ptr( dJdW1_tmp->dot(dJdB1.get()) ); + + // update weights + W1->subtract_inplace(dJdW1->multiply(learningRate)); + W2->subtract_inplace(dJdW2->multiply(learningRate)); + B1->subtract_inplace(dJdB1->multiply(learningRate)); + B2->subtract_inplace(dJdB2->multiply(learningRate)); + + // Calculate cost + auto diffSqr = unique_ptr(Y2->diffSquare(Y.get())); + thrust::device_ptr thrust_diffSqr(diffSqr->dev_data); + int n = diffSqr->numCol * diffSqr->numCol; + float cost = thrust::reduce(thrust_diffSqr, thrust_diffSqr + n, 0.f, thrust::plus()); + cost = std::sqrtf(cost / n); + + return cost; + } + + void unitTest() + { + // Test transpose + Matrix m(3, 2); + Matrix n(2, 2); + + m.initWithTest(); + n.initWithTest(); + + m.print(); + n.print(); + + //Matrix* pt = m.transpose(); + //pt->copyToHost(); + //pt->print(); + + // Test multiply + //m.multiply(3.0f); + //m.copyToHost(); + //m.print(); + + // Test sigmoid + //m.sigmoid(); + //m.copyToHost(); + //m.print(); + + // Test Dot + Matrix* p = m.dot(&n); + p->copyToHost(); + p->print(); + + } + + // Took from + // https://solarianprogrammer.com/2012/05/31/matrix-multiplication-cuda-cublas-curand-thrust/ + // Multiply the arrays A and B on GPU and save the result in C + // C(m,n) = A(m,k) * B(k,n) + void gpu_blas_mmul(const float* A, const float* B, float* C, const int m, const int k, const int n) { + int lda = m, ldb = k, ldc = m; + const float alf = 1; + const float bet = 0; + const float* alpha = &alf; + const float* beta = &bet; + + // Create a handle for CUBLAS + cublasHandle_t handle; + cublasCreate(&handle); + + // Do the actual multiplication + cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + + // Destroy the handle + 
cublasDestroy(handle); + } } diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h index 2096228..1d422ed 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.h +++ b/Project2-Character-Recognition/character_recognition/mlp.h @@ -1,9 +1,91 @@ #pragma once #include "common.h" +#include +#include namespace CharacterRecognition { + using namespace std; Common::PerformanceTimer& timer(); + struct Matrix { + float* data; + float* dev_data; + int numRow; + int numCol; + size_t dataSize; + Matrix(int rows, int cols) : + numRow(rows), numCol(cols) + { + dataSize = rows * cols * sizeof(float); + data = (float*)malloc(dataSize); + cudaMalloc(&dev_data, dataSize); + } + + Matrix(const vector >& dataArr ) { + numRow = dataArr.size(); + if (!numRow) return; + numCol = dataArr[0].size(); + dataSize = numRow * numCol * sizeof(float); + data = (float*)malloc(dataSize); + cudaMalloc(&dev_data, dataSize); + + for (size_t row = 0; row < numRow; row++) { + for (size_t col = 0; col < numCol; col++) { + data[row * numCol + col] = dataArr[row][col]; + } + } + } + + void copyToHost() { + cudaMemcpy(data, dev_data, dataSize, cudaMemcpyDeviceToHost); + } + + void copyToDevice() { + cudaMemcpy(dev_data, data, dataSize, cudaMemcpyHostToDevice); + } + + void initWithRandom(); + void initWithTest(); + void initWithZero(); + + Matrix* dot(const Matrix* other) const; + Matrix* transpose() const; + Matrix* subtract(const Matrix* other) const; + Matrix* diffSquare(const Matrix* other) const; + + void add(const Matrix* other); + void sigmoid(); + void sigmoidePrime(); + void subtract_inplace(const Matrix* other); + Matrix* multiply(const float constant); + Matrix* multiply(const Matrix* other); + + + void print() { + std::cout << "-------- Matrix " << numRow << " X " << numCol << "-------- \n"; + for (int row = 0; row < numRow; row++) { + for (int col = 0; col < numCol; col++) { + printf("%f ", *(data + 
row*numCol + col)); + } + std::cout << std::endl; + } + } + + + ~Matrix() { + free(data); + cudaFree(dev_data); + } + }; // TODO: implement required elements for MLP sections 1 and 2 here + void init(int inputNeuron, int hiddenNeuron, int outputNeuron, double rate); + void GPU_fill_rand(float* A, int nr_rows_A, int nr_cols_A); + void GPU_fill_rand(Matrix* p_matrix); + void gpu_blas_mmul(const float* A, const float* B, float* C, const int m, const int k, const int n); + Matrix* computeOutput(const vector& input); + float learn(const vector expectedOutput); + + void unitTest(); + } diff --git a/Project2-Character-Recognition/img/output.png b/Project2-Character-Recognition/img/output.png new file mode 100644 index 0000000..0cef2dc Binary files /dev/null and b/Project2-Character-Recognition/img/output.png differ diff --git a/Project2-Character-Recognition/img/time_neurons.PNG b/Project2-Character-Recognition/img/time_neurons.PNG new file mode 100644 index 0000000..d62b13b Binary files /dev/null and b/Project2-Character-Recognition/img/time_neurons.PNG differ diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 11dd534..2830b07 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -11,142 +11,109 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +#include +#include +#include + + +const int INPUT_N = 10201; +const int HIDDEN_N = 64; + +const int trainingSize = 52; // How many samples to train + +using namespace std; + +void loadTrainingData(const string& dir, vector >& input, vector >& output) +{ + + input.resize(trainingSize); // 52 x 10201 + output.resize(trainingSize); // 52 x 52 + + for (size_t i = 1; i <= trainingSize; i++) { + string filename = to_string(i) + "info.txt"; + if 
(i < 10) { + filename = "0" + filename; + } + string filePath = dir + filename; + + ifstream file(filePath); + + if (file.is_open()) { + int tmp; + for (size_t c = 0; c < 2; c++) file >> tmp; + input[i-1].resize(INPUT_N); + for (size_t c = 0; c < INPUT_N; c++) { + file >> input[i-1][c]; + } + file.close(); + } + output[i - 1].resize(trainingSize, 0); + output[i - 1][i - 1] = 1; + } + + // Normalization + for (size_t i = 0; i < trainingSize; i++) { + float mean = 0.f; + float variance = 0; + for (int j = 0; j < INPUT_N; j++) { + mean += input[i][j]; + } + mean /= INPUT_N; + for (int j = 0; j < INPUT_N; j++) { + variance += (input[i][j] - mean) * (input[i][j] - mean); + } + float stdv = std::sqrt(variance / INPUT_N); + for (int j = 0; j < INPUT_N; j++) { + input[i][j] = (input[i][j]) / (stdv + 0.000001f); + } + } +} int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. 
- zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, 
power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
- zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + + //CharacterRecognition::unitTest(); + //system("pause"); // stop Win32 console from closing on exit + + const string dir = "..\\data-set\\"; + vector> input; + vector> output; + + loadTrainingData(dir, input, output); + CharacterRecognition::init(INPUT_N, HIDDEN_N, 
trainingSize, 0.01f); + + // compute output + std::vector inputArr = { 1, 2 }; + + // train on 10 iterations + for (int i = 0; i < 40; i++) + { + CharacterRecognition::timer().startCpuTimer(); + float cost; + for (int j = 0; j < input.size(); j++) // train all 52 samples + { + CharacterRecognition::Matrix* m = CharacterRecognition::computeOutput(input[j]); + cost = CharacterRecognition::learn(output[j]); + } + CharacterRecognition::timer().endCpuTimer(); + float time = CharacterRecognition::timer().getCpuElapsedTimeForPreviousOperation(); + cout << "#" << i + 1 << "/40 Cost: " << cost << " Took time: " << time << endl; + } + + // test + cout << "expected output : actual output" << endl; + for (int i = 0; i < input.size(); i++) // testing on last 10 examples + { + for (int j = 0; j < trainingSize; j++) + { + cout << output[i][j] << " "; + } + cout << endl; + + CharacterRecognition::Matrix* result = CharacterRecognition::computeOutput(input[i]); + result->copyToHost(); + result->print(); + } + system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; } diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md index 0e38ddb..d1faa51 100644 --- a/Project2-Stream-Compaction/README.md +++ b/Project2-Stream-Compaction/README.md @@ -3,12 +3,78 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Gangzheng Tong + * www.gtong.me +* Tested on: Windows 10, i7-8th Gen @ 2.2GHz 16GB, RTX 2070 8GB (Personal Laptop) -### (TODO: Your README) +![Screenshot](img/block_size.PNG) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) 
+![Screenshot](img/N.PNG) + +### Features Implemented +In this project I implemented all features required. Tested with up to `2^26` elements and all passed. +The project will run different scan and stream compaction algorithms and report the timing. +To make it easy to dump the testing, I added some automation code so it can be run with a batch file. + +### Questions +1. By trying different block sizes, I concluded that it performs the best when each block contains 256 threads. While theoretically my GPU can run up to 1024 threads concurrently, the number of threads is limited by other factors such as the number of registers. For this particular algorithm, it's possible that the GPU cannot schedule all 1024 threads to run at once. +2. For the detailed comparison see the chart above. It's obvious to see that the efficient scan outperforms the naive one as the array size gets larger. The thrust implementation is the fastest when the array size is huge. +3. The memory latency is clearly the bottleneck when the array size is small, which explains why even the thrust implementation failed to compete with the CPU one when the number of elements is smaller than `2^16`. +4. As array size grows, the memory bandwidth becomes the bottleneck, especially for the Naive implementation, which suffers greatly from the data incoherency. Thrust probably utilizes the shared memory and cache to overcome the limitation. + +Raw Output +``` +**************** +** SCAN TESTS ** +**************** + [ 17 29 47 28 31 5 24 33 0 16 12 15 20 ... 33 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 4.6704ms (std::chrono Measured) + [ 0 17 46 93 121 152 157 181 214 214 230 242 257 ... 25687915 25687948 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 1.6061ms (std::chrono Measured) + [ 0 17 46 93 121 152 157 181 214 214 230 242 257 ... 
25687815 25687841 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 1.48669ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 1.37837ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.649216ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.658624ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.182272ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.218432ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 1 3 2 3 1 2 1 0 2 2 3 0 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 2.5317ms (std::chrono Measured) + [ 1 1 3 2 3 1 2 1 2 2 3 3 2 ... 2 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 2.4355ms (std::chrono Measured) + [ 1 1 3 2 3 1 2 1 2 2 3 3 2 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 6.1908ms (std::chrono Measured) + [ 1 1 3 2 3 1 2 1 2 2 3 3 2 ... 
2 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 1.02813ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 1.06282ms (CUDA Measured) + passed +``` diff --git a/Project2-Stream-Compaction/img/N.PNG b/Project2-Stream-Compaction/img/N.PNG new file mode 100644 index 0000000..4b2e166 Binary files /dev/null and b/Project2-Stream-Compaction/img/N.PNG differ diff --git a/Project2-Stream-Compaction/img/block_size.PNG b/Project2-Stream-Compaction/img/block_size.PNG new file mode 100644 index 0000000..4f8e6c2 Binary files /dev/null and b/Project2-Stream-Compaction/img/block_size.PNG differ diff --git a/Project2-Stream-Compaction/output_data/BLOCK_SIZE.csv b/Project2-Stream-Compaction/output_data/BLOCK_SIZE.csv new file mode 100644 index 0000000..e69de29 diff --git a/Project2-Stream-Compaction/src/cxxopts.hpp b/Project2-Stream-Compaction/src/cxxopts.hpp new file mode 100644 index 0000000..ed3c6a2 --- /dev/null +++ b/Project2-Stream-Compaction/src/cxxopts.hpp @@ -0,0 +1,2207 @@ +/* + +Copyright (c) 2014, 2015, 2016, 2017 Jarryd Beck + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +*/ + +#ifndef CXXOPTS_HPP_INCLUDED +#define CXXOPTS_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cpp_lib_optional +#include +#define CXXOPTS_HAS_OPTIONAL +#endif + +#ifndef CXXOPTS_VECTOR_DELIMITER +#define CXXOPTS_VECTOR_DELIMITER ',' +#endif + +#define CXXOPTS__VERSION_MAJOR 2 +#define CXXOPTS__VERSION_MINOR 2 +#define CXXOPTS__VERSION_PATCH 0 + +namespace cxxopts +{ + static constexpr struct { + uint8_t major, minor, patch; + } version = { + CXXOPTS__VERSION_MAJOR, + CXXOPTS__VERSION_MINOR, + CXXOPTS__VERSION_PATCH + }; +} + +//when we ask cxxopts to use Unicode, help strings are processed using ICU, +//which results in the correct lengths being computed for strings when they +//are formatted for the help output +//it is necessary to make sure that can be found by the +//compiler, and that icu-uc is linked in to the binary. 
+ +#ifdef CXXOPTS_USE_UNICODE +#include + +namespace cxxopts +{ + typedef icu::UnicodeString String; + + inline + String + toLocalString(std::string s) + { + return icu::UnicodeString::fromUTF8(std::move(s)); + } + + class UnicodeStringIterator : public + std::iterator + { + public: + + UnicodeStringIterator(const icu::UnicodeString* string, int32_t pos) + : s(string) + , i(pos) + { + } + + value_type + operator*() const + { + return s->char32At(i); + } + + bool + operator==(const UnicodeStringIterator& rhs) const + { + return s == rhs.s && i == rhs.i; + } + + bool + operator!=(const UnicodeStringIterator& rhs) const + { + return !(*this == rhs); + } + + UnicodeStringIterator& + operator++() + { + ++i; + return *this; + } + + UnicodeStringIterator + operator+(int32_t v) + { + return UnicodeStringIterator(s, i + v); + } + + private: + const icu::UnicodeString* s; + int32_t i; + }; + + inline + String& + stringAppend(String&s, String a) + { + return s.append(std::move(a)); + } + + inline + String& + stringAppend(String& s, int n, UChar32 c) + { + for (int i = 0; i != n; ++i) + { + s.append(c); + } + + return s; + } + + template + String& + stringAppend(String& s, Iterator begin, Iterator end) + { + while (begin != end) + { + s.append(*begin); + ++begin; + } + + return s; + } + + inline + size_t + stringLength(const String& s) + { + return s.length(); + } + + inline + std::string + toUTF8String(const String& s) + { + std::string result; + s.toUTF8String(result); + + return result; + } + + inline + bool + empty(const String& s) + { + return s.isEmpty(); + } +} + +namespace std +{ + inline + cxxopts::UnicodeStringIterator + begin(const icu::UnicodeString& s) + { + return cxxopts::UnicodeStringIterator(&s, 0); + } + + inline + cxxopts::UnicodeStringIterator + end(const icu::UnicodeString& s) + { + return cxxopts::UnicodeStringIterator(&s, s.length()); + } +} + +//ifdef CXXOPTS_USE_UNICODE +#else + +namespace cxxopts +{ + typedef std::string String; + + template + T + 
toLocalString(T&& t) + { + return std::forward(t); + } + + inline + size_t + stringLength(const String& s) + { + return s.length(); + } + + inline + String& + stringAppend(String&s, String a) + { + return s.append(std::move(a)); + } + + inline + String& + stringAppend(String& s, size_t n, char c) + { + return s.append(n, c); + } + + template + String& + stringAppend(String& s, Iterator begin, Iterator end) + { + return s.append(begin, end); + } + + template + std::string + toUTF8String(T&& t) + { + return std::forward(t); + } + + inline + bool + empty(const std::string& s) + { + return s.empty(); + } +} + +//ifdef CXXOPTS_USE_UNICODE +#endif + +namespace cxxopts +{ + namespace + { +#ifdef _WIN32 + const std::string LQUOTE("\'"); + const std::string RQUOTE("\'"); +#else + const std::string LQUOTE("‘"); + const std::string RQUOTE("’"); +#endif + } + + class Value : public std::enable_shared_from_this + { + public: + + virtual ~Value() = default; + + virtual + std::shared_ptr + clone() const = 0; + + virtual void + parse(const std::string& text) const = 0; + + virtual void + parse() const = 0; + + virtual bool + has_default() const = 0; + + virtual bool + is_container() const = 0; + + virtual bool + has_implicit() const = 0; + + virtual std::string + get_default_value() const = 0; + + virtual std::string + get_implicit_value() const = 0; + + virtual std::shared_ptr + default_value(const std::string& value) = 0; + + virtual std::shared_ptr + implicit_value(const std::string& value) = 0; + + virtual std::shared_ptr + no_implicit_value() = 0; + + virtual bool + is_boolean() const = 0; + }; + + class OptionException : public std::exception + { + public: + OptionException(const std::string& message) + : m_message(message) + { + } + + virtual const char* + what() const noexcept + { + return m_message.c_str(); + } + + private: + std::string m_message; + }; + + class OptionSpecException : public OptionException + { + public: + + OptionSpecException(const std::string& message) 
+ : OptionException(message) + { + } + }; + + class OptionParseException : public OptionException + { + public: + OptionParseException(const std::string& message) + : OptionException(message) + { + } + }; + + class option_exists_error : public OptionSpecException + { + public: + option_exists_error(const std::string& option) + : OptionSpecException("Option " + LQUOTE + option + RQUOTE + " already exists") + { + } + }; + + class invalid_option_format_error : public OptionSpecException + { + public: + invalid_option_format_error(const std::string& format) + : OptionSpecException("Invalid option format " + LQUOTE + format + RQUOTE) + { + } + }; + + class option_syntax_exception : public OptionParseException { + public: + option_syntax_exception(const std::string& text) + : OptionParseException("Argument " + LQUOTE + text + RQUOTE + + " starts with a - but has incorrect syntax") + { + } + }; + + class option_not_exists_exception : public OptionParseException + { + public: + option_not_exists_exception(const std::string& option) + : OptionParseException("Option " + LQUOTE + option + RQUOTE + " does not exist") + { + } + }; + + class missing_argument_exception : public OptionParseException + { + public: + missing_argument_exception(const std::string& option) + : OptionParseException( + "Option " + LQUOTE + option + RQUOTE + " is missing an argument" + ) + { + } + }; + + class option_requires_argument_exception : public OptionParseException + { + public: + option_requires_argument_exception(const std::string& option) + : OptionParseException( + "Option " + LQUOTE + option + RQUOTE + " requires an argument" + ) + { + } + }; + + class option_not_has_argument_exception : public OptionParseException + { + public: + option_not_has_argument_exception + ( + const std::string& option, + const std::string& arg + ) + : OptionParseException( + "Option " + LQUOTE + option + RQUOTE + + " does not take an argument, but argument " + + LQUOTE + arg + RQUOTE + " given" + ) + { + } + }; + 
+ class option_not_present_exception : public OptionParseException + { + public: + option_not_present_exception(const std::string& option) + : OptionParseException("Option " + LQUOTE + option + RQUOTE + " not present") + { + } + }; + + class argument_incorrect_type : public OptionParseException + { + public: + argument_incorrect_type + ( + const std::string& arg + ) + : OptionParseException( + "Argument " + LQUOTE + arg + RQUOTE + " failed to parse" + ) + { + } + }; + + class option_required_exception : public OptionParseException + { + public: + option_required_exception(const std::string& option) + : OptionParseException( + "Option " + LQUOTE + option + RQUOTE + " is required but not present" + ) + { + } + }; + + template + void throw_or_mimic(const std::string& text) + { + static_assert(std::is_base_of::value, + "throw_or_mimic only works on std::exception and " + "deriving classes"); + +#ifndef CXXOPTS_NO_EXCEPTIONS + // If CXXOPTS_NO_EXCEPTIONS is not defined, just throw + throw T{text}; +#else + // Otherwise manually instantiate the exception, print what() to stderr, + // and abort + T exception{text}; + std::cerr << exception.what() << std::endl; + std::cerr << "Aborting (exceptions disabled)..." 
<< std::endl; + std::abort(); +#endif + } + + namespace values + { + namespace + { + std::basic_regex integer_pattern + ("(-)?(0x)?([0-9a-zA-Z]+)|((0x)?0)"); + std::basic_regex truthy_pattern + ("(t|T)(rue)?|1"); + std::basic_regex falsy_pattern + ("(f|F)(alse)?|0"); + } + + namespace detail + { + template + struct SignedCheck; + + template + struct SignedCheck + { + template + void + operator()(bool negative, U u, const std::string& text) + { + if (negative) + { + if (u > static_cast((std::numeric_limits::min)())) + { + throw_or_mimic(text); + } + } + else + { + if (u > static_cast((std::numeric_limits::max)())) + { + throw_or_mimic(text); + } + } + } + }; + + template + struct SignedCheck + { + template + void + operator()(bool, U, const std::string&) {} + }; + + template + void + check_signed_range(bool negative, U value, const std::string& text) + { + SignedCheck::is_signed>()(negative, value, text); + } + } + + template + R + checked_negate(T&& t, const std::string&, std::true_type) + { + // if we got to here, then `t` is a positive number that fits into + // `R`. So to avoid MSVC C4146, we first cast it to `R`. + // See https://github.com/jarro2783/cxxopts/issues/62 for more details. + return -static_cast(t-1)-1; + } + + template + T + checked_negate(T&& t, const std::string& text, std::false_type) + { + throw_or_mimic(text); + return t; + } + + template + void + integer_parser(const std::string& text, T& value) + { + std::smatch match; + std::regex_match(text, match, integer_pattern); + + if (match.length() == 0) + { + throw_or_mimic(text); + } + + if (match.length(4) > 0) + { + value = 0; + return; + } + + using US = typename std::make_unsigned::type; + + constexpr bool is_signed = std::numeric_limits::is_signed; + const bool negative = match.length(1) > 0; + const uint8_t base = match.length(2) > 0 ? 
16 : 10; + + auto value_match = match[3]; + + US result = 0; + + for (auto iter = value_match.first; iter != value_match.second; ++iter) + { + US digit = 0; + + if (*iter >= '0' && *iter <= '9') + { + digit = static_cast(*iter - '0'); + } + else if (base == 16 && *iter >= 'a' && *iter <= 'f') + { + digit = static_cast(*iter - 'a' + 10); + } + else if (base == 16 && *iter >= 'A' && *iter <= 'F') + { + digit = static_cast(*iter - 'A' + 10); + } + else + { + throw_or_mimic(text); + } + + US next = result * base + digit; + if (result > next) + { + throw_or_mimic(text); + } + + result = next; + } + + detail::check_signed_range(negative, result, text); + + if (negative) + { + value = checked_negate(result, + text, + std::integral_constant()); + } + else + { + value = static_cast(result); + } + } + + template + void stringstream_parser(const std::string& text, T& value) + { + std::stringstream in(text); + in >> value; + if (!in) { + throw_or_mimic(text); + } + } + + inline + void + parse_value(const std::string& text, uint8_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, int8_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, uint16_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, int16_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, uint32_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, int32_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, uint64_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, int64_t& value) + { + integer_parser(text, value); + } + + inline + void + parse_value(const std::string& text, bool& value) + { + std::smatch result; + std::regex_match(text, result, 
truthy_pattern); + + if (!result.empty()) + { + value = true; + return; + } + + std::regex_match(text, result, falsy_pattern); + if (!result.empty()) + { + value = false; + return; + } + + throw_or_mimic(text); + } + + inline + void + parse_value(const std::string& text, std::string& value) + { + value = text; + } + + // The fallback parser. It uses the stringstream parser to parse all types + // that have not been overloaded explicitly. It has to be placed in the + // source code before all other more specialized templates. + template + void + parse_value(const std::string& text, T& value) { + stringstream_parser(text, value); + } + + template + void + parse_value(const std::string& text, std::vector& value) + { + std::stringstream in(text); + std::string token; + while(in.eof() == false && std::getline(in, token, CXXOPTS_VECTOR_DELIMITER)) { + T v; + parse_value(token, v); + value.emplace_back(std::move(v)); + } + } + +#ifdef CXXOPTS_HAS_OPTIONAL + template + void + parse_value(const std::string& text, std::optional& value) + { + T result; + parse_value(text, result); + value = std::move(result); + } +#endif + + inline + void parse_value(const std::string& text, char& c) + { + if (text.length() != 1) + { + throw_or_mimic(text); + } + + c = text[0]; + } + + template + struct type_is_container + { + static constexpr bool value = false; + }; + + template + struct type_is_container> + { + static constexpr bool value = true; + }; + + template + class abstract_value : public Value + { + using Self = abstract_value; + + public: + abstract_value() + : m_result(std::make_shared()) + , m_store(m_result.get()) + { + } + + abstract_value(T* t) + : m_store(t) + { + } + + virtual ~abstract_value() = default; + + abstract_value(const abstract_value& rhs) + { + if (rhs.m_result) + { + m_result = std::make_shared(); + m_store = m_result.get(); + } + else + { + m_store = rhs.m_store; + } + + m_default = rhs.m_default; + m_implicit = rhs.m_implicit; + m_default_value = 
rhs.m_default_value; + m_implicit_value = rhs.m_implicit_value; + } + + void + parse(const std::string& text) const + { + parse_value(text, *m_store); + } + + bool + is_container() const + { + return type_is_container::value; + } + + void + parse() const + { + parse_value(m_default_value, *m_store); + } + + bool + has_default() const + { + return m_default; + } + + bool + has_implicit() const + { + return m_implicit; + } + + std::shared_ptr + default_value(const std::string& value) + { + m_default = true; + m_default_value = value; + return shared_from_this(); + } + + std::shared_ptr + implicit_value(const std::string& value) + { + m_implicit = true; + m_implicit_value = value; + return shared_from_this(); + } + + std::shared_ptr + no_implicit_value() + { + m_implicit = false; + return shared_from_this(); + } + + std::string + get_default_value() const + { + return m_default_value; + } + + std::string + get_implicit_value() const + { + return m_implicit_value; + } + + bool + is_boolean() const + { + return std::is_same::value; + } + + const T& + get() const + { + if (m_store == nullptr) + { + return *m_result; + } + else + { + return *m_store; + } + } + + protected: + std::shared_ptr m_result; + T* m_store; + + bool m_default = false; + bool m_implicit = false; + + std::string m_default_value; + std::string m_implicit_value; + }; + + template + class standard_value : public abstract_value + { + public: + using abstract_value::abstract_value; + + std::shared_ptr + clone() const + { + return std::make_shared>(*this); + } + }; + + template <> + class standard_value : public abstract_value + { + public: + ~standard_value() = default; + + standard_value() + { + set_default_and_implicit(); + } + + standard_value(bool* b) + : abstract_value(b) + { + set_default_and_implicit(); + } + + std::shared_ptr + clone() const + { + return std::make_shared>(*this); + } + + private: + + void + set_default_and_implicit() + { + m_default = true; + m_default_value = "false"; + 
m_implicit = true; + m_implicit_value = "true"; + } + }; + } + + template + std::shared_ptr + value() + { + return std::make_shared>(); + } + + template + std::shared_ptr + value(T& t) + { + return std::make_shared>(&t); + } + + class OptionAdder; + + class OptionDetails + { + public: + OptionDetails + ( + const std::string& short_, + const std::string& long_, + const String& desc, + std::shared_ptr val + ) + : m_short(short_) + , m_long(long_) + , m_desc(desc) + , m_value(val) + , m_count(0) + { + } + + OptionDetails(const OptionDetails& rhs) + : m_desc(rhs.m_desc) + , m_count(rhs.m_count) + { + m_value = rhs.m_value->clone(); + } + + OptionDetails(OptionDetails&& rhs) = default; + + const String& + description() const + { + return m_desc; + } + + const Value& value() const { + return *m_value; + } + + std::shared_ptr + make_storage() const + { + return m_value->clone(); + } + + const std::string& + short_name() const + { + return m_short; + } + + const std::string& + long_name() const + { + return m_long; + } + + private: + std::string m_short; + std::string m_long; + String m_desc; + std::shared_ptr m_value; + int m_count; + }; + + struct HelpOptionDetails + { + std::string s; + std::string l; + String desc; + bool has_default; + std::string default_value; + bool has_implicit; + std::string implicit_value; + std::string arg_help; + bool is_container; + bool is_boolean; + }; + + struct HelpGroupDetails + { + std::string name; + std::string description; + std::vector options; + }; + + class OptionValue + { + public: + void + parse + ( + std::shared_ptr details, + const std::string& text + ) + { + ensure_value(details); + ++m_count; + m_value->parse(text); + } + + void + parse_default(std::shared_ptr details) + { + ensure_value(details); + m_default = true; + m_value->parse(); + } + + size_t + count() const noexcept + { + return m_count; + } + + // TODO: maybe default options should count towards the number of arguments + bool + has_default() const noexcept + { + 
return m_default; + } + + template + const T& + as() const + { + if (m_value == nullptr) { + throw_or_mimic("No value"); + } + +#ifdef CXXOPTS_NO_RTTI + return static_cast&>(*m_value).get(); +#else + return dynamic_cast&>(*m_value).get(); +#endif + } + + private: + void + ensure_value(std::shared_ptr details) + { + if (m_value == nullptr) + { + m_value = details->make_storage(); + } + } + + std::shared_ptr m_value; + size_t m_count = 0; + bool m_default = false; + }; + + class KeyValue + { + public: + KeyValue(std::string key_, std::string value_) + : m_key(std::move(key_)) + , m_value(std::move(value_)) + { + } + + const + std::string& + key() const + { + return m_key; + } + + const + std::string& + value() const + { + return m_value; + } + + template + T + as() const + { + T result; + values::parse_value(m_value, result); + return result; + } + + private: + std::string m_key; + std::string m_value; + }; + + class ParseResult + { + public: + + ParseResult( + const std::shared_ptr< + std::unordered_map> + >, + std::vector, + bool allow_unrecognised, + int&, char**&); + + size_t + count(const std::string& o) const + { + auto iter = m_options->find(o); + if (iter == m_options->end()) + { + return 0; + } + + auto riter = m_results.find(iter->second); + + return riter->second.count(); + } + + const OptionValue& + operator[](const std::string& option) const + { + auto iter = m_options->find(option); + + if (iter == m_options->end()) + { + throw_or_mimic(option); + } + + auto riter = m_results.find(iter->second); + + return riter->second; + } + + const std::vector& + arguments() const + { + return m_sequential; + } + + private: + + void + parse(int& argc, char**& argv); + + void + add_to_option(const std::string& option, const std::string& arg); + + bool + consume_positional(std::string a); + + void + parse_option + ( + std::shared_ptr value, + const std::string& name, + const std::string& arg = "" + ); + + void + parse_default(std::shared_ptr details); + + void + 
checked_parse_arg + ( + int argc, + char* argv[], + int& current, + std::shared_ptr value, + const std::string& name + ); + + const std::shared_ptr< + std::unordered_map> + > m_options; + std::vector m_positional; + std::vector::iterator m_next_positional; + std::unordered_set m_positional_set; + std::unordered_map, OptionValue> m_results; + + bool m_allow_unrecognised; + + std::vector m_sequential; + }; + + struct Option + { + Option + ( + const std::string& opts, + const std::string& desc, + const std::shared_ptr& value = ::cxxopts::value(), + const std::string& arg_help = "" + ) + : opts_(opts) + , desc_(desc) + , value_(value) + , arg_help_(arg_help) + { + } + + std::string opts_; + std::string desc_; + std::shared_ptr value_; + std::string arg_help_; + }; + + class Options + { + typedef std::unordered_map> + OptionMap; + public: + + Options(std::string program, std::string help_string = "") + : m_program(std::move(program)) + , m_help_string(toLocalString(std::move(help_string))) + , m_custom_help("[OPTION...]") + , m_positional_help("positional parameters") + , m_show_positional(false) + , m_allow_unrecognised(false) + , m_options(std::make_shared()) + , m_next_positional(m_positional.end()) + { + } + + Options& + positional_help(std::string help_text) + { + m_positional_help = std::move(help_text); + return *this; + } + + Options& + custom_help(std::string help_text) + { + m_custom_help = std::move(help_text); + return *this; + } + + Options& + show_positional_help() + { + m_show_positional = true; + return *this; + } + + Options& + allow_unrecognised_options() + { + m_allow_unrecognised = true; + return *this; + } + + ParseResult + parse(int& argc, char**& argv); + + OptionAdder + add_options(std::string group = ""); + + void + add_options + ( + const std::string& group, + std::initializer_list