// Vector Addition Kernel, using shared memory
// with timing
// without CUDA error checking (CUDA return codes are deliberately ignored
// in this variant of the example)

// NOTE(review): the header names were missing from the reviewed copy of this
// file; the four below are reconstructed from the symbols the code uses
// (printf / malloc, rand / time / cut*Timer) -- confirm against the original.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cutil.h>

// Number of elements per vector. Parenthesized so the macro expands safely
// inside any surrounding expression.
#define N (1024 * 1024 * 12)
// Threads per block; also the size of the kernel's shared-memory tiles.
#define BLOCK_SIZE 256

// CPU reference: compute vector sum C = A + B for n elements.
// The addition is carried out in unsigned arithmetic so that sums exceeding
// INT_MAX wrap around instead of triggering signed-overflow undefined
// behaviour (the inputs in main() are raw rand() values).
void vecAddGold(int* A, int* B, int* C, int n) {
    for (int i = 0; i < n; i++) {
        C[i] = (int)((unsigned int)A[i] + (unsigned int)B[i]);
    }
}

// GPU kernel: devC[idx] = devA[idx] + devB[idx] for every idx < n.
//
// Launch layout: 1D grid of 1D blocks, blockDim.x <= BLOCK_SIZE, with at
// least ceil(n / blockDim.x) blocks. Uses 2 * BLOCK_SIZE ints of static
// shared memory per block.
//
// Each thread stages its two operands in shared memory and reads back only
// the slots it wrote itself.
__global__ void vecAddKernel(int* devA, int* devB, int* devC, int n) {
    __shared__ int tileA[BLOCK_SIZE], tileB[BLOCK_SIZE];

    int tid = threadIdx.x;
    int idx = blockDim.x * blockIdx.x + tid;
    bool inRange = idx < n;

    // Guard the global loads: threads past the end of the vectors (tail of
    // the last block) must not touch devA / devB.
    if (inRange) {
        tileA[tid] = devA[idx];
        tileB[tid] = devB[idx];
    }

    // Kept outside the branch so every thread of the block reaches it.
    __syncthreads();

    // Guard the global store as well; same wrap-around addition as the CPU
    // reference so both sides agree bit-for-bit.
    if (inRange) {
        devC[idx] = (int)((unsigned int)tileA[tid] + (unsigned int)tileB[tid]);
    }
}

// Host wrapper: copies A and B to the device, runs vecAddKernel, copies the
// result back into C, and prints the transfer and kernel timings.
//
//   A, B : host input vectors of n ints
//   C    : host output vector of n ints
//   n    : element count; nothing is done (and nothing printed) when n <= 0
//
// The timing arithmetic below treats cutGetTimerValue() as the accumulated
// time over all start/stop intervals of the timer.
void vecAdd(int* A, int* B, int* C, int n) {
    if (n <= 0) {
        return;  // a zero-block launch would be an invalid configuration
    }

    size_t size = (size_t)n * sizeof(int);
    int *devA, *devB, *devC;
    unsigned int timer = 0;
    float time1, time2, time3;

    // ceil(n / BLOCK_SIZE), written so it cannot overflow for large n.
    int blocks = (n - 1) / BLOCK_SIZE + 1;

    // Phase 1: device allocation + host-to-device transfer.
    cutCreateTimer(&timer);
    cutStartTimer(timer);
    cudaMalloc((void**)&devA, size);
    cudaMemcpy(devA, A, size, cudaMemcpyHostToDevice);
    cudaMalloc((void**)&devB, size);
    cudaMemcpy(devB, B, size, cudaMemcpyHostToDevice);
    cudaMalloc((void**)&devC, size);
    cutStopTimer(timer);
    time1 = cutGetTimerValue(timer);

    // Phase 2: kernel. Run ceil(n/256) blocks of 256 threads each.
    cutStartTimer(timer);
    vecAddKernel<<<blocks, BLOCK_SIZE>>>(devA, devB, devC, n);
    // Kernel launches are asynchronous: wait for completion before stopping
    // the timer, otherwise only the launch overhead is measured and the real
    // kernel time is billed to the copy below.
    cudaThreadSynchronize();
    cutStopTimer(timer);
    time2 = cutGetTimerValue(timer);

    // Phase 3: device-to-host transfer + cleanup.
    cutStartTimer(timer);
    cudaMemcpy(C, devC, size, cudaMemcpyDeviceToHost);
    cudaFree(devA);
    cudaFree(devB);
    cudaFree(devC);
    cutStopTimer(timer);
    time3 = cutGetTimerValue(timer);

    cutDeleteTimer(timer);

    // time1 = phase 1, time2 = phases 1+2, time3 = phases 1+2+3.
    printf("Memory allocation & transfer time: %2.2f\n", time1 + time3 - time2);
    printf("Kernel run time: %2.2f\n", time2 - time1);
}

// Compares the first n elements of C and D.
// Returns 0 when all match; otherwise reports the first mismatch and
// returns 1.
int compareGold(int* C, int* D, int n) {
    for (int i = 0; i < n; i++) {
        if (C[i] != D[i]) {
            // Arguments follow the format string: index, value, index, value.
            printf("Elements C[%d] = %d i D[%d] = %d mismatch!", i, C[i], i, D[i]);
            return 1;
        }
    }
    return 0;
}

// Fills two random vectors of N ints, adds them on the GPU and on the CPU,
// prints both timings, and checks that the results agree.
int main(int argc, char** argv) {
    size_t size = (size_t)N * sizeof(int);
    int *A, *B, *C, *D;
    unsigned int timer = 0;
    float time1, time2;

    // Allocate arrays
    A = (int*)malloc(size);
    B = (int*)malloc(size);
    C = (int*)malloc(size);
    D = (int*)malloc(size);
    if (A == NULL || B == NULL || C == NULL || D == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(A);
        free(B);
        free(C);
        free(D);
        return 1;
    }

    // Load arrays
    srand((unsigned int)time(NULL));
    for (int i = 0; i < N; i++) {
        A[i] = rand();
        B[i] = rand();
    }

    // CUDA
    cutCreateTimer(&timer);
    cutStartTimer(timer);
    vecAdd(A, B, C, N);
    cudaThreadSynchronize();
    cutStopTimer(timer);
    time1 = cutGetTimerValue(timer);

    // Sequential
    cutStartTimer(timer);
    vecAddGold(A, B, D, N);
    cutStopTimer(timer);
    time2 = cutGetTimerValue(timer);

    // The timer accumulates, so the CPU share is the difference.
    printf("CUDA vector addition time: %2.2f\n", time1);
    printf("CPU vector addition time: %2.2f\n", time2 - time1);

    // Process results
    if (compareGold(C, D, N) != 0)
        printf("Test FAILED!\n");
    else
        printf("Test PASSED!\n");

    cutDeleteTimer(timer);
    free(A);
    free(B);
    free(C);
    free(D);

    return 0;
}