Tips on speeding up a CUDA kernel on the C1060 and C2050

Currently this kernel takes 172 us for ni = 438.

const int threads3 = 32;
const int nblks = (ni + (threads3 - 1)) / threads3;   // round up so all ni elements are covered

xy_field<<<nblks, threads3>>>(out1_d, out2_d, x_val, y_val, index, npix, ni);

__global__ void xy_field(float *in1, float *max_val, float *x_val, float *y_val, int *index, int ny, int ni)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// nblks is rounded up, so the last block can contain threads with
	// tid >= ni; bail out before they index past the arrays
	if (tid >= ni) return;

	// recover the 2D position (x_sh, y_sh) of this tile's maximum from
	// the flat index stored in index[]
	int index1 = index[tid] - (ny*ny*tid);
	int y_sh = index1 / ny;
	int x_sh = index1 - (y_sh*ny);

	// flat indices of the left/right and upper/lower neighbours of the peak
	int x_indx1 = (tid*ny*ny) + y_sh*ny + (x_sh-1);
	int x_indx2 = (tid*ny*ny) + y_sh*ny + (x_sh+1);
	int y_indx1 = (tid*ny*ny) + (y_sh-1)*ny + x_sh;
	int y_indx2 = (tid*ny*ny) + (y_sh+1)*ny + x_sh;

	float numx = max_val[tid] - in1[x_indx1];
	float denx = 2*max_val[tid] - in1[x_indx1] - in1[x_indx2];
	float numy = max_val[tid] - in1[y_indx1];
	float deny = 2*max_val[tid] - in1[y_indx1] - in1[y_indx2];

	x_val[tid] = (x_sh - 0.5f) + __fdividef(numx, denx);
	y_val[tid] = (y_sh - 0.5f) + __fdividef(numy, deny);
}
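In case it helps to know what the kernel is doing: each thread takes the flat position of its tile's maximum from index[], converts it to 2D coordinates (x_sh, y_sh), and refines the peak location with the standard three-point parabolic sub-pixel fit, x_peak = (x_sh - 0.5) + (m - left) / (2*m - left - right), where m = max_val[tid] and left/right are the peak's horizontal neighbours (and the same along y).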

Any suggestions for decreasing the execution time? I looked at using shared memory, but the shared memory usage would grow linearly with ni.

Thanks in advance :)


Some easy changes (they reduce memory operations and arithmetic), but check again whether it is still correct.

memory ops: 14 vs 7

operations: 44 vs 30

__global__ void xy_field(float *in1, float *max_val, float *x_val, float *y_val, int *index, int ny, int ni)
{
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid >= ni) return;

        int tidnyny = ny*ny*tid;          // compute the common tid*ny*ny term once
        int index1 = index[tid] - tidnyny;
        int y_sh = index1 / ny;
        int x_sh = index1 - (y_sh*ny);

        int x_indx1 = tidnyny + y_sh*ny + (x_sh-1);
        int y_indx1 = tidnyny + (y_sh-1)*ny + x_sh;
        int x_indx2 = x_indx1 + 2;        // right neighbour: two columns over
        int y_indx2 = y_indx1 + 2*ny;     // lower neighbour: two rows down

        // load each reused value from global memory only once
        float in1_x1 = in1[x_indx1];
        float in1_y1 = in1[y_indx1];
        float max_val_tid = max_val[tid];

        float numx = max_val_tid - in1_x1;
        float denx = 2*max_val_tid - in1_x1 - in1[x_indx2];
        float numy = max_val_tid - in1_y1;
        float deny = 2*max_val_tid - in1_y1 - in1[y_indx2];

        x_val[tid] = (x_sh - 0.5f) + __fdividef(numx, denx);
        y_val[tid] = (y_sh - 0.5f) + __fdividef(numy, deny);
}

I don't think shared memory will help, since I don't see any reuse of data. If each value only has to be loaded once, by one thread (and no other thread of the same block uses it), there is no point in using shared memory.

You might have more luck with the texture cache, but I am not sure about that.
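For what it's worth, here is an untested sketch of what that could look like with the texture-reference API (supported on both the C1060 and the C2050); the name tex_in1 and the kernel name xy_field_tex are just illustrative:

#include <cuda_runtime.h>

// 1D texture reference over the in1 array; reads go through the texture cache
texture<float, 1, cudaReadModeElementType> tex_in1;

__global__ void xy_field_tex(float *max_val, float *x_val, float *y_val,
                             int *index, int ny, int ni)
{
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid >= ni) return;

        int tidnyny = ny*ny*tid;
        int index1 = index[tid] - tidnyny;
        int y_sh = index1 / ny;
        int x_sh = index1 - (y_sh*ny);

        int x_indx1 = tidnyny + y_sh*ny + (x_sh-1);
        int y_indx1 = tidnyny + (y_sh-1)*ny + x_sh;

        // tex1Dfetch replaces the plain global loads of in1
        float in1_x1 = tex1Dfetch(tex_in1, x_indx1);
        float in1_x2 = tex1Dfetch(tex_in1, x_indx1 + 2);
        float in1_y1 = tex1Dfetch(tex_in1, y_indx1);
        float in1_y2 = tex1Dfetch(tex_in1, y_indx1 + 2*ny);
        float m = max_val[tid];

        x_val[tid] = (x_sh - 0.5f) + __fdividef(m - in1_x1, 2*m - in1_x1 - in1_x2);
        y_val[tid] = (y_sh - 0.5f) + __fdividef(m - in1_y1, 2*m - in1_y1 - in1_y2);
}

Host side, bind once before the launch and unbind afterwards:

cudaBindTexture(NULL, tex_in1, out1_d, ni * npix * npix * sizeof(float));
xy_field_tex<<<nblks, threads3>>>(out2_d, x_val, y_val, index, npix, ni);
cudaUnbindTexture(tex_in1);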

Cheers

Ceearem


Ceearem: unless I’m missing something, your changes are things that any decent optimizing compiler would do anyway.
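One way to check: compile both versions with nvcc -ptx and diff the output (or run cuobjdump --dump-sass on the compiled binaries). If the compiler already hoists the common tid*ny*ny subexpression and keeps max_val[tid] in a register, the generated code for the two kernels should come out essentially identical.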
