Cuda by example, Julia set performance improved failed

Recently I am learning the examples in the book CUDA by JASON SANDERS. the example of Juila Set makes a bad performance of 7032ms. Here is the program:

#include <cuda.h>
#include <cuda_runtime.h>
#include <cpu_bitmap.h>
#include <book.h>
#define DIM 1024

struct cuComplex{
float r;
float i;
device cuComplex(float a, float b) : r(a),i(b){

}
__device__ float magnitude2(void){
    return r*r+i*i;
}
__device__ cuComplex operator *(const cuComplex& a){
    return cuComplex(r*a.r-i*a.i, i*a.r+r*a.i);
}
__device__ cuComplex operator +(const cuComplex& a){
    return cuComplex(r+a.r,i+a.i);
}

};

device int julia(int x,int y){
const float scale = 1.5;
float jx = scale * (float)(DIM/2 - x)/(DIM/2);
float jy = scale * (float)(DIM/2 - y)/(DIM/2);

cuComplex c(-0.8,0.156);
cuComplex a(jx,jy);

int i = 0;
for(i = 0; i<200; i++){
    a = a*a + c;
    if(a.magnitude2() > 1000){
        return 0;
    }
}
return 1;

}

global void kernel(unsigned char ptr){
int x = blockIdx.x;
int y = blockIdx.y;
int offset = x + y
gridDim.x;

int juliaValue = julia(x,y);
ptr[offset*4 + 0] = 255*juliaValue;
ptr[offset*4 + 1] = 0;
ptr[offset*4 + 2] = 1;
ptr[offset*4 + 3] = 255;

}

int main(void){

CPUBitmap bitmap(DIM,DIM);
unsigned char * dev_bitmap;

dim3 grid(DIM,DIM);
dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
dim3 thread(DIM,DIM);
cudaEvent_t start,stop;
cudaEvent_t bitmapCpy_start,bitmapCpy_stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventCreate(&bitmapCpy_start));
HANDLE_ERROR(cudaEventCreate(&bitmapCpy_stop));

HANDLE_ERROR(cudaMalloc((void **)&dev_bitmap,bitmap.image_size()));

HANDLE_ERROR(cudaEventRecord(start,0));


kernel<<<grid,1>>>(dev_bitmap);

HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(),dev_bitmap,bitmap.image_size(),cudaMemcpyDeviceToHost));
//HANDLE_ERROR(cudaEventRecord(bitmapCpy_stop,0));
//HANDLE_ERROR(cudaEventSynchronize(bitmapCpy_stop));

// float copyTime;
// HANDLE_ERROR(cudaEventElapsedTime(&copyTime,bitmapCpy_start,bitmapCpy_stop));

HANDLE_ERROR(cudaEventRecord(stop,0));
HANDLE_ERROR(cudaEventSynchronize(stop));
float elapsedTime;
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,start,stop));



//printf("Total time is %3.1f ms, time for copying is %3.1f ms \n",elapsedTime,copyTime);
printf("Total time is %3.1f ms\n",elapsedTime);

bitmap.display_and_exit();
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaEventDestroy(bitmapCpy_start));
HANDLE_ERROR(cudaEventDestroy(bitmapCpy_stop));
HANDLE_ERROR(cudaFree(dev_bitmap));

}

I think the main factor that influences the performance is that the program above just run 1 thread in every block:

kernel<<<grid,1>>>(dev_bitmap);


so I change the kernel like the following:

global void kernel(unsigned char *ptr){

int x = threadIdx.x + blockIdx.x*blockDim.x;
int y = threadIdx.y + blockIdx.y*blockDim.y;

int offset = x + y*gridDim.x*blockIdx.x;

int juliaValue = julia(x,y);
ptr[offset*4 + 0] = 255*juliaValue;
ptr[offset*4 + 1] = 0;
ptr[offset*4 + 2] = 1;
ptr[offset*4 + 3] = 255;

}


and call kernel:

dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<blocks,threads>>>(dev_bitmap);


I think this change is not a big deal, but when I ran it, it acted like that it ran into some endless loops, no image appeared and I couldn’t do anything with my screen, just blocked there. anyone can help ? thanks.

toolkit: cuda 5.5

system: ubuntu 12.04

I have found the mistake, in my new kernel, the offset should be:

offset = x+ygridDim.xblockDim.x;