// Vector Addition Kernel
//
// Adds two integer vectors on the GPU and validates the result against a
// sequential host implementation.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Number of elements in each test vector. Parenthesized so the macro expands
// safely inside larger expressions (e.g. `x / N`).
#define N (1024 * 1024 * 12)

// Threads per block used when launching vecAddKernel.
#define BLOCK_SIZE 256

// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// if it did not return cudaSuccess. Errors left unchecked would otherwise
// surface later as a confusing "Test FAILED!".
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cudaCheckErr_ = (call);                                   \
        if (cudaCheckErr_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(cudaCheckErr_));                       \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Host reference implementation: computes C[i] = A[i] + B[i] for i in [0, n).
//
// A, B: input arrays of at least n ints.
// C:    output array of at least n ints.
// n:    number of elements to process.
void vecAddGold(int* A, int* B, int* C, int n)
{
    int i;
    for (i = 0; i < n; i++)
        C[i] = A[i] + B[i];
}

// Device kernel: each thread computes one element devC[idx] = devA[idx] + devB[idx].
//
// Expects a 1D grid of 1D blocks that together supply at least n threads;
// threads whose global index is >= n do nothing, so the grid does not have
// to divide n evenly. Uses no shared memory.
__global__ void vecAddKernel(int* devA, int* devB, int* devC, int n)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < n)
        devC[idx] = devA[idx] + devB[idx];
}

// Computes C = A + B on the GPU.
//
// Allocates device buffers, copies A and B to the device, launches
// vecAddKernel, copies the result back into C and releases the device
// buffers. Any CUDA failure prints a diagnostic and exits the program.
//
// A, B: host input arrays of at least n ints.
// C:    host output array of at least n ints.
// n:    number of elements; nothing is done when n <= 0.
void vecAdd(int* A, int* B, int* C, int n)
{
    // A grid with zero blocks is an invalid launch configuration, so an
    // empty input is handled up front.
    if (n <= 0)
        return;

    // size_t avoids overflowing int when n * sizeof(int) exceeds INT_MAX.
    size_t size = (size_t) n * sizeof(int);
    int *devA = NULL, *devB = NULL, *devC = NULL;

    CUDA_CHECK(cudaMalloc((void **) &devA, size));
    CUDA_CHECK(cudaMemcpy(devA, A, size, cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaMalloc((void **) &devB, size));
    CUDA_CHECK(cudaMemcpy(devB, B, size, cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaMalloc((void **) &devC, size));

    // Run ceil(n / BLOCK_SIZE) blocks of BLOCK_SIZE threads each.
    int numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    vecAddKernel<<<numBlocks, BLOCK_SIZE>>>(devA, devB, devC, n);
    // A kernel launch returns no status; launch errors are read back here.
    CUDA_CHECK(cudaGetLastError());

    // The blocking copy also reports errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(C, devC, size, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(devA));
    CUDA_CHECK(cudaFree(devB));
    CUDA_CHECK(cudaFree(devC));
}

// Compares the first n elements of C and D.
//
// Returns 0 when every pair of elements is equal, 1 at the first mismatch.
int compareGold(int* C, int* D, int n)
{
    int i;
    for (i = 0; i < n; i++) {
        if (C[i] != D[i]) {
            return 1;
        }
    }
    return 0;
}

// Fills two random vectors, adds them on the GPU and on the host, and
// reports whether both results agree.
int main(int argc, char **argv)
{
    int i;
    size_t size = (size_t) N * sizeof(int);
    int *A, *B, *C, *D;

    (void) argc;
    (void) argv;

    // Allocate arrays
    A = (int*) malloc(size);
    B = (int*) malloc(size);
    C = (int*) malloc(size);
    D = (int*) malloc(size);
    if (A == NULL || B == NULL || C == NULL || D == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n",
                (unsigned long) size);
        free(A);
        free(B);
        free(C);
        free(D);
        return EXIT_FAILURE;
    }

    // Load arrays
    srand((unsigned) time(NULL));
    for (i = 0; i < N; i++) {
        // Halve each value so A[i] + B[i] cannot overflow int even when
        // RAND_MAX equals INT_MAX (signed overflow is undefined behaviour).
        A[i] = rand() / 2;
        B[i] = rand() / 2;
    }

    // CUDA
    vecAdd(A, B, C, N);

    // Sequential
    vecAddGold(A, B, D, N);

    // Process results
    if (compareGold(C, D, N) != 0)
        printf("Test FAILED!");
    else
        printf("Test PASSED!");

    free(A);
    free(B);
    free(C);
    free(D);

    return 0;
}