How to Optimize This CUDA Code?

int num_ees  = 12;
int num_step = 16;
int num_gg   = 1024*1024;
int num_pp   = 1024*8;
int num_dims = 1024*2;
int num_segments = num_dims/num_step;

char   *d_data_gg, *d_data_pp;
double *d_measure, *d_gg_x, *d_pp_x;
long   *d_dot_products;

cudaMalloc((void **) &d_data_gg,      num_ees * num_step * num_gg * sizeof(char));
cudaMalloc((void **) &d_data_pp,      num_ees * num_dims * sizeof(char));
cudaMalloc((void **) &d_measure,      num_gg * sizeof(double));
cudaMalloc((void **) &d_dot_products, num_ees * num_gg * sizeof(long));
cudaMalloc((void **) &d_gg_x,         num_ees * num_gg * sizeof(double));
cudaMalloc((void **) &d_pp_x,         num_ees * num_segments * sizeof(double));

............................................

int threadsPerBlock = 512;
int blocksPerGrid = (num_gg + threadsPerBlock - 1) / threadsPerBlock;

..............................................
for (int p=0; p<num_pp; p++)
{
    .........
    for (int g=0; g<num_gg; g++)
    {
        ..................
        for (int s=0; s<num_segments; s++)
        {
            .....................................................
            GetMeasureGPU<<<blocksPerGrid, threadsPerBlock>>>(..................);
            .................
        }
        .................................
    }
}
............................................

__global__ void GetMeasureGPU(char* d_data_gg, char* d_data_pp, double* d_gg_x, double* d_pp_x,
                                long* d_dot_products, double *d_measure,
                                long num_ees, long num_dims, long ff, long num_segments, long num_gg, long num_step)
{
    long g = blockDim.x * blockIdx.x + threadIdx.x;
    if (g < num_gg)
    {
        for (long e=0; e<num_ees; e++)
        {
            for (long j=0; j<num_step; j++)
            {
                 d_dot_products[e*num_gg + g] +=
                         (long)d_data_gg[e * num_step * num_gg + g*num_step + j]
                       * (long)d_data_pp[e * num_dims + ff + j];
            }
            d_measure[g] += d_dot_products[e*num_gg + g] * d_pp_x[e * num_segments + ff/num_step] * d_gg_x[e * num_gg + g];
        }
    }
}

I tried to optimize it like this and got a performance improvement of about 30%, but the result was wrong.

..............................................
for (int p=0; p<num_pp; p++)
{
    .........
    for (int g=0; g<num_gg; g++)
    {
        ..................
        for (int s=0; s<num_segments; s++)
        {
            .....................................................
            dim3 threadsPerBlock(16, 12);
            int blocksPerGrid = num_gg;
            GetCorrelationsStepwise<<<blocksPerGrid, threadsPerBlock>>>(d_data_gg, d_data_pp, d_gg_x, d_pp_x, d_dot_products, d_measure, num_segments, s);
            .................
        }
        .................................
    }
}
............................................


__global__ void GetCorrelationsStepwise(char* d_data_gg, char* d_data_pp, double* d_gg_x, double* d_pp_x,
	long* d_dot_products, double *d_measure, long num_segments, long s)  // blockDim.x = 16; blockDim.y = 12; gridDim.x = num_gg = 1,000,000
{
	unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
	unsigned int ix = blockDim.x * blockDim.y * blockIdx.x + tid;

	__shared__ long s_products[12][16];
	__shared__ long s_dot_products[12];
	
	s_products[threadIdx.y][threadIdx.x] = (long)d_data_gg[ix] * (long)d_data_pp[tid];
	__syncthreads();
	
	if (threadIdx.x < 8)
	{
		s_products[threadIdx.y][threadIdx.x] += s_products[threadIdx.y][threadIdx.x + 8];
		s_products[threadIdx.y][threadIdx.x] += s_products[threadIdx.y][threadIdx.x + 4];
		s_products[threadIdx.y][threadIdx.x] += s_products[threadIdx.y][threadIdx.x + 2];
		s_products[threadIdx.y][threadIdx.x] += s_products[threadIdx.y][threadIdx.x + 1];
	}
	if (threadIdx.x == 0)		 
	{
		int gp = threadIdx.y * gridDim.x + blockIdx.x;
		d_dot_products[gp] += s_products[threadIdx.y][0];
		s_dot_products[threadIdx.y] = d_dot_products[gp] * d_pp_x[threadIdx.y * num_segments + s]
		                            * d_gg_x[gp];
		__syncthreads();

		if (threadIdx.y < 4)
		{
			s_dot_products[threadIdx.y] += s_dot_products[threadIdx.y + 4] + s_dot_products[threadIdx.y + 8];
			s_dot_products[threadIdx.y] += s_dot_products[threadIdx.y + 2];
			s_dot_products[threadIdx.y] += s_dot_products[threadIdx.y + 1];
		}
		if (threadIdx.y == 0)
		{
			d_measure[blockIdx.x] += s_dot_products[0];
		}
	}
}

Can anyone help me? This has been torturing me for several weeks.
Thanks a billion!

I would like to try to help you, but I do not have the will to read so many lines of code. Please describe your problem first, and then I will be able to make some sense of your code.

Thanks a lot for your time.
I think the kernel code is quite simple:

1. In the inner loop, the dot products of two char arrays are accumulated in steps of 16 elements.
2. In the outer loop, each dot product is multiplied by two double values from two double arrays, and the results are summed into d_measure[g].

__global__ void GetMeasureGPU(char* d_data_gg, char* d_data_pp, double* d_gg_x, double* d_pp_x,
                                long* d_dot_products, double *d_measure,
                                long num_ees, long num_dims, long ff, long num_segments, long num_gg, long num_step)
{
    long g = blockDim.x * blockIdx.x + threadIdx.x;
    if (g < num_gg)
    {
        for (long e=0; e<num_ees; e++) // the outer loop
        {
            for (long j=0; j<num_step; j++)  // the inner loop 
            {
                 d_dot_products[e*num_gg + g]  += 
                         (long)d_data_gg[e * num_step * num_gg + g*num_step + j] 
                       * (long)d_data_pp[e * num_dims + ff + j];
            }
            d_measure[g] += d_dot_products[e*num_gg + g]
                          * d_pp_x[e * num_segments + ff/num_step]
                          * d_gg_x[e * num_gg + g];
        }
    }
}

Is “long” a 64-bit type on your platform? If so, you should look into converting as many variables to “int” as possible. GPUs have a 32-bit architecture and 64-bit integer arithmetic must be emulated. Variables g, e, and j in particular look like they could easily be of type “int”. You can also get rid of the type casts applied to the “char” data. Under C/C++ rules, any operands with types narrower than “int” will automatically be converted to “int” when used in an expression.
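For example, a minimal sketch of the kernel above with 32-bit indices and the casts removed (the 64-bit accumulator is kept, since d_dot_products is declared long; this assumes all index products fit in 32 bits, which they do for the sizes shown above):

__global__ void GetMeasureGPU(char* d_data_gg, char* d_data_pp, double* d_gg_x, double* d_pp_x,
                              long* d_dot_products, double* d_measure,
                              int num_ees, int num_dims, int ff, int num_segments, int num_gg, int num_step)
{
    int g = blockDim.x * blockIdx.x + threadIdx.x;   // 32-bit index math throughout
    if (g < num_gg)
    {
        for (int e = 0; e < num_ees; e++)
        {
            for (int j = 0; j < num_step; j++)
            {
                // char operands are promoted to int automatically; no casts needed
                d_dot_products[e * num_gg + g] +=
                      d_data_gg[e * num_step * num_gg + g * num_step + j]
                    * d_data_pp[e * num_dims + ff + j];
            }
            d_measure[g] += d_dot_products[e * num_gg + g]
                          * d_pp_x[e * num_segments + ff / num_step]
                          * d_gg_x[e * num_gg + g];
        }
    }
}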

What are typical values of “num_step”? Consider unrolling this loop with #pragma unroll and check which unrolling factor works best.
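For instance, with num_step = 16 as in the code above, the inner loop could be sketched like this; as a side benefit, accumulating into a register first avoids the repeated read-modify-write of d_dot_products in global memory (the unroll factor is something to experiment with):

        for (int e = 0; e < num_ees; e++)
        {
            long acc = 0;          // accumulate in a register, not in global memory
            #pragma unroll 16      // try different unroll factors here
            for (int j = 0; j < num_step; j++)
            {
                acc += d_data_gg[e * num_step * num_gg + g * num_step + j]
                     * d_data_pp[e * num_dims + ff + j];
            }
            d_dot_products[e * num_gg + g] += acc;
            d_measure[g] += d_dot_products[e * num_gg + g]
                          * d_pp_x[e * num_segments + ff / num_step]
                          * d_gg_x[e * num_gg + g];
        }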

Use the qualifiers “const” and “__restrict__” as appropriate on the pointer arguments of both __global__ and __device__ functions to allow the compiler to arrange loads for best performance.
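For this kernel, assuming none of the pointer arguments alias one another, the signature might look like:

__global__ void GetMeasureGPU(const char*   __restrict__ d_data_gg,
                              const char*   __restrict__ d_data_pp,
                              const double* __restrict__ d_gg_x,
                              const double* __restrict__ d_pp_x,
                              long*         __restrict__ d_dot_products,
                              double*       __restrict__ d_measure,
                              int num_ees, int num_dims, int ff,
                              int num_segments, int num_gg, int num_step);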

How many different values of the various num_* have to be supported? If there are only a few, consider converting the function into a template with the num_* values as template arguments, instantiating the required combinations, and invoking each instantiation via a table of function pointers. The template arguments are compile-time constants, allowing the compiler to save instructions.
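A sketch of that approach, assuming num_ees and num_step are the parameters with few distinct values (the rest stay as run-time arguments); kernels can be launched through a host-side function pointer with the usual <<<>>> syntax, so a small dispatch table works:

template <int NUM_EES, int NUM_STEP>
__global__ void GetMeasureGPU(const char* __restrict__ d_data_gg,
                              const char* __restrict__ d_data_pp,
                              const double* __restrict__ d_gg_x,
                              const double* __restrict__ d_pp_x,
                              long* __restrict__ d_dot_products,
                              double* __restrict__ d_measure,
                              int num_dims, int ff, int num_segments, int num_gg)
{
    int g = blockDim.x * blockIdx.x + threadIdx.x;
    if (g < num_gg)
    {
        for (int e = 0; e < NUM_EES; e++)
        {
            long acc = 0;
            #pragma unroll                  // trip count is now a compile-time constant
            for (int j = 0; j < NUM_STEP; j++)
                acc += d_data_gg[e * NUM_STEP * num_gg + g * NUM_STEP + j]
                     * d_data_pp[e * num_dims + ff + j];
            d_dot_products[e * num_gg + g] += acc;
            d_measure[g] += d_dot_products[e * num_gg + g]
                          * d_pp_x[e * num_segments + ff / NUM_STEP]
                          * d_gg_x[e * num_gg + g];
        }
    }
}

// host side: pick the instantiation that matches the run-time sizes
typedef void (*MeasureKernel)(const char*, const char*, const double*, const double*,
                              long*, double*, int, int, int, int);
MeasureKernel kernel = GetMeasureGPU<12, 16>;   // <num_ees, num_step>
kernel<<<blocksPerGrid, threadsPerBlock>>>(d_data_gg, d_data_pp, d_gg_x, d_pp_x,
                                           d_dot_products, d_measure,
                                           num_dims, ff, num_segments, num_gg);
                                           // ff: segment offset into d_data_pp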

Consider arranging your “char” data such that you can access it as “char4”, cutting down on the number of load instructions. Note: the signedness of plain char differs between platforms. Since conversion to wider types occurs and you are doing arithmetic, declare the data either signed char or unsigned char as needed, and use char4 or uchar4 accordingly.
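For instance, a sketch of the loop body of GetMeasureGPU rewritten with signed char4 loads; this assumes the data really is signed and that ff is a multiple of num_step (the ff/num_step segment indexing suggests it is), so all starting offsets are multiples of 4:

        for (int e = 0; e < num_ees; e++)
        {
            // view each 16-element row as four char4 vectors; one load fetches four terms
            const char4* gg4 = reinterpret_cast<const char4*>
                                   (d_data_gg + e * num_step * num_gg + g * num_step);
            const char4* pp4 = reinterpret_cast<const char4*>
                                   (d_data_pp + e * num_dims + ff);
            long acc = 0;
            #pragma unroll
            for (int j = 0; j < num_step / 4; j++)
            {
                char4 a = gg4[j];
                char4 b = pp4[j];
                acc += a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
            }
            d_dot_products[e * num_gg + g] += acc;
            d_measure[g] += d_dot_products[e * num_gg + g]
                          * d_pp_x[e * num_segments + ff / num_step]
                          * d_gg_x[e * num_gg + g];
        }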

Thanks so much for your comments. However, according to my experiments, they do not improve the performance significantly.

I would suggest using the profiler to guide your optimization efforts. It is difficult to assess performance and provide more than generic tuning advice from just reading code.