// Parallel sum reduction.
// Each kernel launch reduces the input to one partial sum per block; the host
// relaunches the kernel on the partial sums until a single value remains.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

#define N (1024 * 1024 * 12)   /* number of input elements */
#define BLOCK_SIZE 256         /* threads per block; must be a power of two
                                  because the reduction halves the stride */

/* Abort with file/line context if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Tree reduction within each block using dynamic shared memory.
// Launch requirements: 1-D grid/block, blockDim.x a power of two, and the
// third launch parameter must be blockDim.x * sizeof(int).
// Writes one partial sum per block to blockResults[blockIdx.x].
// devA and blockResults must not alias overlapping regions (blocks would
// race with each other).
__global__ void reductionSumKernel(const int* devA, int* blockResults, int n)
{
    extern __shared__ int sharedData[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Load this block's slice into shared memory; pad the tail with 0
    // (the identity element for addition) so out-of-range threads are inert.
    sharedData[tid] = (i < n) ? devA[i] : 0;
    __syncthreads();

    // Pairwise tree reduction: halve the active range each step.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            sharedData[tid] += sharedData[tid + s];
        __syncthreads();  /* outside the if: all threads must reach it */
    }

    // Thread 0 publishes the block's partial sum.
    if (tid == 0)
        blockResults[blockIdx.x] = sharedData[0];
}

// Sum the n ints in host array A on the GPU; returns the total.
// Repeatedly launches the reduction kernel, ping-ponging between two device
// buffers so a launch never reads and writes the same buffer (the original
// in-place relaunch raced: block b wrote element b while other blocks were
// still reading it).
int sumReduction(int* A, int n)
{
    int size = n * sizeof(int), gpuSum = 0, numBlocks;
    int *devA, *devBlockRes;
    size_t smemBytes = BLOCK_SIZE * sizeof(int);  /* dynamic shared memory */

    CUDA_CHECK(cudaMalloc((void**)&devA, size));
    CUDA_CHECK(cudaMemcpy(devA, A, size, cudaMemcpyHostToDevice));

    // First pass: n elements -> numBlocks partial sums.
    numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    CUDA_CHECK(cudaMalloc((void**)&devBlockRes, numBlocks * sizeof(int)));
    reductionSumKernel<<<numBlocks, BLOCK_SIZE, smemBytes>>>(devA, devBlockRes, n);
    CUDA_CHECK(cudaGetLastError());

    // Further passes ping-pong between devBlockRes and devA; devA is safe to
    // reuse as scratch since it holds n >= numBlocks ints.
    int* in = devBlockRes;
    int* out = devA;
    while (numBlocks > 1) {
        n = numBlocks;
        numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
        reductionSumKernel<<<numBlocks, BLOCK_SIZE, smemBytes>>>(in, out, n);
        CUDA_CHECK(cudaGetLastError());
        int* tmp = in; in = out; out = tmp;  /* result is now in 'in' */
    }

    // The final single-element result lives in whichever buffer 'in' points to.
    CUDA_CHECK(cudaMemcpy(&gpuSum, in, sizeof(int), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(devA));
    CUDA_CHECK(cudaFree(devBlockRes));
    return gpuSum;
}

// Sequential CPU reference: sum of the first n elements of C.
int sumGold(int* C, int n)
{
    int i, sum = 0;
    for (i = 0; i < n; i++)
        sum += C[i];
    return sum;
}

// Fill an array with small random ints, sum it on GPU and CPU, and compare.
int main(int argc, char** argv)
{
    int i, size = N * sizeof(int), refSum, gpuSum;
    int* A;

    // Allocate the host array.
    A = (int*)malloc(size);
    if (A == NULL) {
        fprintf(stderr, "malloc of %d bytes failed\n", size);
        return EXIT_FAILURE;
    }

    // Generate random input. Values are kept small so the 12M-element sum
    // stays within int range (full-range rand() would overflow, which is
    // undefined behavior for signed int).
    srand(time(NULL));
    for (i = 0; i < N; i++) {
        A[i] = rand() % 100;
    }

    // CUDA reduction.
    gpuSum = sumReduction(A, N);

    // Sequential reference.
    refSum = sumGold(A, N);

    // Compare results.
    if (refSum != gpuSum)
        printf("Test FAILED!\n");
    else
        printf("Test PASSED!\n");

    free(A);
    return 0;
}