Recently I am learning the examples in the book CUDA by JASON SANDERS. the example of Juila Set makes a bad performance of 7032ms. Here is the program:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cpu_bitmap.h>
#include <book.h>
#define DIM 1024
struct cuComplex{
float r;
float i;
device cuComplex(float a, float b) : r(a),i(b){
}
__device__ float magnitude2(void){
return r*r+i*i;
}
__device__ cuComplex operator *(const cuComplex& a){
return cuComplex(r*a.r-i*a.i, i*a.r+r*a.i);
}
__device__ cuComplex operator +(const cuComplex& a){
return cuComplex(r+a.r,i+a.i);
}
};
device int julia(int x,int y){
const float scale = 1.5;
float jx = scale * (float)(DIM/2 - x)/(DIM/2);
float jy = scale * (float)(DIM/2 - y)/(DIM/2);
cuComplex c(-0.8,0.156);
cuComplex a(jx,jy);
int i = 0;
for(i = 0; i<200; i++){
a = a*a + c;
if(a.magnitude2() > 1000){
return 0;
}
}
return 1;
}
global void kernel(unsigned char ptr){
int x = blockIdx.x;
int y = blockIdx.y;
int offset = x + ygridDim.x;
int juliaValue = julia(x,y);
ptr[offset*4 + 0] = 255*juliaValue;
ptr[offset*4 + 1] = 0;
ptr[offset*4 + 2] = 1;
ptr[offset*4 + 3] = 255;
}
int main(void){
CPUBitmap bitmap(DIM,DIM);
unsigned char * dev_bitmap;
dim3 grid(DIM,DIM);
dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
dim3 thread(DIM,DIM);
cudaEvent_t start,stop;
cudaEvent_t bitmapCpy_start,bitmapCpy_stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventCreate(&bitmapCpy_start));
HANDLE_ERROR(cudaEventCreate(&bitmapCpy_stop));
HANDLE_ERROR(cudaMalloc((void **)&dev_bitmap,bitmap.image_size()));
HANDLE_ERROR(cudaEventRecord(start,0));
kernel<<<grid,1>>>(dev_bitmap);
HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(),dev_bitmap,bitmap.image_size(),cudaMemcpyDeviceToHost));
//HANDLE_ERROR(cudaEventRecord(bitmapCpy_stop,0));
//HANDLE_ERROR(cudaEventSynchronize(bitmapCpy_stop));
// float copyTime;
// HANDLE_ERROR(cudaEventElapsedTime(©Time,bitmapCpy_start,bitmapCpy_stop));
HANDLE_ERROR(cudaEventRecord(stop,0));
HANDLE_ERROR(cudaEventSynchronize(stop));
float elapsedTime;
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,start,stop));
//printf("Total time is %3.1f ms, time for copying is %3.1f ms \n",elapsedTime,copyTime);
printf("Total time is %3.1f ms\n",elapsedTime);
bitmap.display_and_exit();
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaEventDestroy(bitmapCpy_start));
HANDLE_ERROR(cudaEventDestroy(bitmapCpy_stop));
HANDLE_ERROR(cudaFree(dev_bitmap));
}
I think the main factor that influences the performance is that the program above just run 1 thread in every block:
kernel<<<grid,1>>>(dev_bitmap);
so I change the kernel like the following:
global void kernel(unsigned char *ptr){
int x = threadIdx.x + blockIdx.x*blockDim.x;
int y = threadIdx.y + blockIdx.y*blockDim.y;
int offset = x + y*gridDim.x*blockIdx.x;
int juliaValue = julia(x,y);
ptr[offset*4 + 0] = 255*juliaValue;
ptr[offset*4 + 1] = 0;
ptr[offset*4 + 2] = 1;
ptr[offset*4 + 3] = 255;
}
and call kernel:
dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<blocks,threads>>>(dev_bitmap);
I think this change is not a big deal, but when I ran it, it acted like that it ran into some endless loops, no image appeared and I couldn’t do anything with my screen, just blocked there. anyone can help ? thanks.
toolkit: cuda 5.5
system: ubuntu 12.04