#include #include #include #include # define NPOINTS 1024 # define MAXITER 1000 #define ACCURACY 0.01 #define NUM_THREADS 256 #define BLOCK_DIM 16 struct complex{ float real; float imag; }; float MandlebrotSeq (int npoints, int maxiter) { int i, j, iter, numoutside = 0; float area, ztemp; struct complex z, c; for (i=0; i4.0e0) { numoutside++; break; } } } } area=2.0*2.5*1.125*(float)(npoints*npoints-numoutside)/(float)(npoints*npoints); printf("Numoutside sequential version: %d\n", numoutside); printf("Area of Mandlebrot set = %12.8f\n",area); return area; } __global__ void MandlebrotKernel (int* numOutsideSums, int npoints, int maxiter) { extern __shared__ int numOutside[]; int x, y, iter; float creal, cimag, zreal, zimag, ztemp; x = blockIdx.x * blockDim.x + threadIdx.x; y = blockIdx.y * blockDim.y + threadIdx.y; creal = -2.0+2.5*(float)(y)/(float)(npoints)+1.0e-7; cimag = 1.125*(float)(x)/(float)(npoints)+1.0e-7; zreal = creal; zimag = cimag; numOutside[threadIdx.y * blockDim.x + threadIdx.x] = 0; for (iter = 0; iter < maxiter; iter++) { ztemp = (zreal * zreal) - (zimag * zimag) + creal; zimag = zreal * zimag * 2 + cimag; zreal = ztemp; if ((zreal * zreal + zimag * zimag) > 4.0e0) { numOutside[threadIdx.y * blockDim.x + threadIdx.x] = 1; } } x = threadIdx.y * blockDim.x + threadIdx.x; __syncthreads(); // Do reduction in shared memory for (int iter = (blockDim.x * blockDim.y) / 2; iter > 0; iter >>= 1) { if ( x < iter) { numOutside[x] += numOutside[x + iter]; } __syncthreads(); } if (x == 0) { numOutsideSums[blockIdx.y * gridDim.x + blockIdx.x] = numOutside[0]; } } float MandlebrotGPU (int npoints, int maxiter) { int i, numoutside = 0, grid, size; float area; int *numOutside, *numOutsideGPU; grid = ceil(npoints / BLOCK_DIM); size = grid * grid * sizeof(int); dim3 gridDim(grid, grid); dim3 blockDim(BLOCK_DIM, BLOCK_DIM); numOutside = (int*) malloc(size); cudaMalloc(&numOutsideGPU, size); MandlebrotKernel<<>>(numOutsideGPU, npoints, maxiter); printf("Launching kernel: %s\n", cudaGetErrorString(cudaGetLastError())); cudaMemcpy(numOutside, numOutsideGPU, size, cudaMemcpyDeviceToHost); for (i = 0; i < grid * grid; i++) { numoutside += numOutside[i]; } area = 2.0*2.5*1.125*(float)(npoints*npoints-numoutside)/(float)(npoints*npoints); cudaFree(numOutsideGPU); free(numOutside); printf("Numoutside GPU version: %d\n", numoutside); printf("Area of Mandlebrot set = %12.8f\n", area); return area; } int main (int argc, char* argv[]) { float area1, area2; int npoints = NPOINTS, maxiter = MAXITER; if (argc == 3) { npoints = atoi(argv[1]); maxiter = atoi(argv[2]); } area1 = MandlebrotSeq(npoints, maxiter); area2 = MandlebrotGPU(npoints, maxiter); if (fabs(area1 - area2) < ACCURACY) { printf("Test PASSED!\n"); } else { printf("Test FAILED!\n"); } return 0; }