// Vector Addition Kernel, using shared memory 
// with timing
// without error checking

// Number of vector elements (12 Mi). Parenthesized so the macro expands as a
// single value inside larger expressions such as `x % N` or `x / N`.
#define N (1024*1024*12)
// Threads per block; also the size of the kernel's shared-memory tiles.
#define BLOCK_SIZE (256)

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cutil.h>

// Host reference implementation: writes A[k] + B[k] into C[k] for every
// k in [0, n). Does nothing when n is zero or negative.
void vecAddGold(int* A, int* B, int* C, int n){
	if (n <= 0)
	  return;

	const int* lhs = A;
	const int* rhs = B;
	// Walk the three arrays in lock-step until the output cursor reaches C + n.
	for (int* out = C, *stop = C + n; out != stop; ++out, ++lhs, ++rhs)
	  *out = *lhs + *rhs;
}

// Element-wise vector sum on the device: devC[i] = devA[i] + devB[i], i in [0, n).
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element. blockDim.x
// must not exceed BLOCK_SIZE, because the shared tiles below hold exactly
// BLOCK_SIZE ints each and are indexed by threadIdx.x.
//
// Threads whose global index is >= n (the tail of the last block when n is not
// a multiple of the block size) neither read nor write global memory.
__global__ void vecAddKernel (int* devA, int* devB, int* devC, int n){
  __shared__ int tileA[BLOCK_SIZE], tileB[BLOCK_SIZE];

  int tid = threadIdx.x;
  int idx = blockDim.x * blockIdx.x + tid;
  bool inRange = idx < n;

  // Stage the operands in shared memory; out-of-range threads skip the loads
  // so nothing past the end of devA/devB is ever touched.
  if (inRange) {
    tileA[tid] = devA[idx];
    tileB[tid] = devB[idx];
  }

  // Kept outside the branch so that every thread of the block reaches it.
  // Each thread only reads back the slot it wrote itself.
  __syncthreads();

  if (inRange) {
    devC[idx] = tileA[tid] + tileB[tid];
  }
}

// Host wrapper around vecAddKernel: copies A and B (n ints each) to the
// device, runs the kernel, copies the result back into C and prints how long
// the memory phases and the kernel phase took.
//
// A, B and C are host pointers to at least n ints. Nothing is done (and
// nothing is printed) when n <= 0, since a launch with zero blocks is invalid.
//
// CUDA return codes are intentionally not checked here, in line with the note
// at the top of the file.
void vecAdd(int* A, int* B, int* C, int n){
  if (n <= 0)
    return;

  size_t size = (size_t)n * sizeof(int);
  int *devA, *devB, *devC;

  unsigned int timer = 0;
  float time1, time2, time3;

  // Phase 1: device allocation and host-to-device transfers.
  cutCreateTimer(&timer);
  cutStartTimer(timer);

  cudaMalloc((void **) &devA, size);
  cudaMemcpy(devA, A, size, cudaMemcpyHostToDevice);
  cudaMalloc((void **) &devB, size);
  cudaMemcpy(devB, B, size, cudaMemcpyHostToDevice);
  cudaMalloc((void **) &devC, size);

  cutStopTimer(timer);
  time1 = cutGetTimerValue(timer);

  // Phase 2: kernel. One thread per element, rounded up to whole blocks and
  // derived from the n argument (not the N macro) so the grid matches the
  // buffers allocated above. Written as (n - 1) / B + 1 to avoid overflowing
  // n + B - 1 for very large n.
  int blocks = (n - 1) / BLOCK_SIZE + 1;

  cutStartTimer(timer);

  vecAddKernel<<<blocks, BLOCK_SIZE>>>(devA, devB, devC, n);
  // The launch returns before the kernel has finished; wait for it so the
  // timer covers the execution and not just the launch call.
  cudaThreadSynchronize();

  cutStopTimer(timer);
  time2 = cutGetTimerValue(timer);

  // Phase 3: device-to-host transfer and cleanup.
  cutStartTimer(timer);
  cudaMemcpy(C, devC, size, cudaMemcpyDeviceToHost);
  cudaFree(devA); cudaFree(devB); cudaFree(devC);
  cutStopTimer(timer);
  time3 = cutGetTimerValue(timer);

  cutDeleteTimer(timer);

  // NOTE(review): the arithmetic below treats cutGetTimerValue as a running
  // total over all start/stop intervals (time1 <= time2 <= time3) - confirm
  // against the cutil version in use.
  printf("Memory allocation & transfer time: %2.2f\n", time1 + time3 - time2);
  printf("Kernel run time: %2.2f\n", time2 - time1);
}

// Compares the first n elements of C and D.
// On the first index where they differ, prints that index together with both
// values and returns 1. Returns 0 when all n elements are equal (including
// n <= 0, where nothing is compared).
int compareGold (int* C, int* D, int n) {
	int i;
	for (i = 0; i < n; i++) {
	  if (C[i] != D[i]) {
	    // Arguments follow the conversions in order: index, value, index, value.
	    printf("Elements C[%d] = %d i D[%d] = %d mismatch!", i, C[i], i, D[i]);
	    return 1;
	  }
	}
	return 0;
}

// Test driver: fills two N-element vectors with random values, adds them on
// the GPU (vecAdd) and on the CPU (vecAddGold), prints both timings and
// reports whether the two results agree.
// Returns EXIT_FAILURE if the host buffers cannot be allocated, 0 otherwise.
int main (int argc, char **argv ) {
	int i;
	size_t size = (size_t)(N) * sizeof(int);
	int *A, *B, *C, *D;

	unsigned int timer = 0;

	float time1, time2;

	(void) argc;
	(void) argv;

	// Allocate arrays
	A = (int*) malloc(size);
	B = (int*) malloc(size);
	C = (int*) malloc(size);
	D = (int*) malloc(size);

	if (A == NULL || B == NULL || C == NULL || D == NULL) {
	  fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long) size);
	  // free(NULL) is a no-op, so this releases whichever buffers succeeded.
	  free(A);
	  free(B);
	  free(C);
	  free(D);
	  return EXIT_FAILURE;
	}

	// Load arrays
	// Each operand is halved so that A[i] + B[i] <= RAND_MAX and the host-side
	// signed addition can never overflow.
	srand((unsigned int) time(NULL));
	for (i = 0; i < N; i++) {
	  A[i] = rand() / 2;
	  B[i] = rand() / 2;
	}

	// CUDA

	cutCreateTimer(&timer);
	cutStartTimer(timer);
	vecAdd(A, B, C, N);

	// Make sure all device work has finished before the timer is stopped.
	cudaThreadSynchronize();
	cutStopTimer(timer);
	time1 = cutGetTimerValue(timer);

	// Sequential

	cutStartTimer(timer);
	vecAddGold(A, B, D, N);
	cutStopTimer(timer);
	time2 = cutGetTimerValue(timer);

	// NOTE(review): time2 - time1 assumes cutGetTimerValue reports a running
	// total over both start/stop intervals - confirm against cutil.
	printf("CUDA vector addition time: %2.2f\n", time1);
	printf("CPU vector addition time: %2.2f\n", time2 - time1);

	// Process results

	if (compareGold(C, D, N) != 0)
	  printf("Test FAILED!\n");
	else
	  printf("Test PASSED!\n");

	cutDeleteTimer(timer);

	free(A);
	free(B);
	free(C);
	free(D);

	return 0;
}