// Parallel sum reduction.
// Each kernel launch reduces the input to one partial sum per block; the host
// relaunches the kernel on the partial sums until a single value remains.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

#define N (1024 * 1024 * 12)   /* number of input elements */
#define BLOCK_SIZE 256         /* threads per block; must be a power of two
                                  because the reduction halves the stride */

/* Abort with file/line context if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Tree reduction within each block using dynamic shared memory.
// Launch requirements: 1-D grid/block, blockDim.x a power of two, and the
// third launch parameter must be blockDim.x * sizeof(int).
// Writes one partial sum per block to blockResults[blockIdx.x].
// devA and blockResults must not alias overlapping regions (blocks would
// race with each other).
__global__ void reductionSumKernel(const int* devA, int* blockResults, int n)
{
    extern __shared__ int sharedData[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Load this block's slice into shared memory; pad the tail with 0
    // (the identity element for addition) so out-of-range threads are inert.
    sharedData[tid] = (i < n) ? devA[i] : 0;
    __syncthreads();

    // Pairwise tree reduction: halve the active range each step.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            sharedData[tid] += sharedData[tid + s];
        __syncthreads();  /* outside the if: all threads must reach it */
    }

    // Thread 0 publishes the block's partial sum.
    if (tid == 0)
        blockResults[blockIdx.x] = sharedData[0];
}

// Sum the n ints in host array A on the GPU; returns the total.
// Repeatedly launches the reduction kernel, ping-ponging between two device
// buffers so a launch never reads and writes the same buffer (the original
// in-place relaunch raced: block b wrote element b while other blocks were
// still reading it).
int sumReduction(int* A, int n)
{
    int size = n * sizeof(int), gpuSum = 0, numBlocks;
    int *devA, *devBlockRes;
    size_t smemBytes = BLOCK_SIZE * sizeof(int);  /* dynamic shared memory */

    CUDA_CHECK(cudaMalloc((void**)&devA, size));
    CUDA_CHECK(cudaMemcpy(devA, A, size, cudaMemcpyHostToDevice));

    // First pass: n elements -> numBlocks partial sums.
    numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    CUDA_CHECK(cudaMalloc((void**)&devBlockRes, numBlocks * sizeof(int)));
    reductionSumKernel<<<numBlocks, BLOCK_SIZE, smemBytes>>>(devA, devBlockRes, n);
    CUDA_CHECK(cudaGetLastError());

    // Further passes ping-pong between devBlockRes and devA; devA is safe to
    // reuse as scratch since it holds n >= numBlocks ints.
    int* in = devBlockRes;
    int* out = devA;
    while (numBlocks > 1) {
        n = numBlocks;
        numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
        reductionSumKernel<<<numBlocks, BLOCK_SIZE, smemBytes>>>(in, out, n);
        CUDA_CHECK(cudaGetLastError());
        int* tmp = in; in = out; out = tmp;  /* result is now in 'in' */
    }

    // The final single-element result lives in whichever buffer 'in' points to.
    CUDA_CHECK(cudaMemcpy(&gpuSum, in, sizeof(int), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(devA));
    CUDA_CHECK(cudaFree(devBlockRes));
    return gpuSum;
}

// Sequential CPU reference: sum of the first n elements of C.
int sumGold(int* C, int n)
{
    int i, sum = 0;
    for (i = 0; i < n; i++)
        sum += C[i];
    return sum;
}

// Fill an array with small random ints, sum it on GPU and CPU, and compare.
int main(int argc, char** argv)
{
    int i, size = N * sizeof(int), refSum, gpuSum;
    int* A;

    // Allocate the host array.
    A = (int*)malloc(size);
    if (A == NULL) {
        fprintf(stderr, "malloc of %d bytes failed\n", size);
        return EXIT_FAILURE;
    }

    // Generate random input. Values are kept small so the 12M-element sum
    // stays within int range (full-range rand() would overflow, which is
    // undefined behavior for signed int).
    srand(time(NULL));
    for (i = 0; i < N; i++) {
        A[i] = rand() % 100;
    }

    // CUDA reduction.
    gpuSum = sumReduction(A, N);

    // Sequential reference.
    refSum = sumGold(A, N);

    // Compare results.
    if (refSum != gpuSum)
        printf("Test FAILED!\n");
    else
        printf("Test PASSED!\n");

    free(A);
    return 0;
}