Slow kernel

// dimBlock.x = NUMBER_OF_PARAMETERS; dimBlock.y = NUMBER_OF_PARAMETERS; dimBlock.z = 1;

// dimGrid.x = DATA_D; dimGrid.y = DATA_H;

__global__ void Calculate_A_matrix_2D_values(float* A_matrix_2D_values, float* Certainties, float* Phase_Gradients, int dim)

{

	int y = blockIdx.y;

	int z = blockIdx.x;

	int i = threadIdx.x;

	int j = threadIdx.y;

	int idx, A_matrix_element, matrix_element_idx, i_index, j_index; 

	float B_matrix[36];   

	float phase_gradient_value, certainty_value, B_matrix_value_1, B_matrix_value_2, B_matrix_value;

	float matrix_2D_value = 0.0f;

	float xf, yf, zf;

	if (y < (DATA_H - PADDING_Y) && z < (DATA_D - PADDING_Z) && i < NUMBER_OF_PARAMETERS && j < NUMBER_OF_PARAMETERS)

	{

		//		(1 0 0 x y z 0 0 0 0 0 0)

		// B =  (0 1 0 0 0 0 x y z 0 0 0)

		//		(0 0 1 0 0 0 0 0 0 x y z)

		// Change to coordinate system with origo in the middle  (sx-1)/2, (sy-1)/2, (sz-1)/2

		xf = (float)0 - ((float)DATA_W - 1.0f)/2.0f;		

		yf = (float)y - ((float)DATA_H - 1.0f)/2.0f;

		zf = (float)z - ((float)DATA_D - 1.0f)/2.0f;

		B_matrix[0] =1;  B_matrix[1] =0;  B_matrix[2] =0; B_matrix[3] =0;  B_matrix[4] =1; B_matrix[5] =0;  B_matrix[6] =0; B_matrix[7] =0;  B_matrix[8] =1;  B_matrix[9] =xf; B_matrix[10]=0;  B_matrix[11]=0; 

		B_matrix[12]=yf; B_matrix[13]=0;  B_matrix[14]=0; B_matrix[15]=zf; B_matrix[16]=0; B_matrix[17]=0;  B_matrix[18]=0; B_matrix[19]=xf; B_matrix[20]=0;  B_matrix[21]=0;  B_matrix[22]=yf; B_matrix[23]=0; 

		B_matrix[24]=0;  B_matrix[25]=zf; B_matrix[26]=0; B_matrix[27]=0;  B_matrix[28]=0; B_matrix[29]=xf; B_matrix[30]=0; B_matrix[31]=0;  B_matrix[32]=yf; B_matrix[33]=0;  B_matrix[34]=0;  B_matrix[35]=zf; 			

		A_matrix_element = i*NUMBER_OF_PARAMETERS + j; // max 144 (12 x 12 matrix)		

		

		i_index = i*NUMBER_OF_DIMENSIONS + dim;

		j_index = j*NUMBER_OF_DIMENSIONS + dim;

			

		for (int x = 0; x < DATA_W; x++)

   		{

			idx = x + y * DATA_W + z * DATA_W * DATA_H;

	

			xf = (float)x - ((float)DATA_W - 1.0f)/2.0f;	

					

			B_matrix[9] =xf; B_matrix[19]=xf; B_matrix[29]=xf; 	

	

			B_matrix_value_1 = B_matrix[j*NUMBER_OF_DIMENSIONS + dim];

					  B_matrix_value_2 = B_matrix[i*NUMBER_OF_DIMENSIONS + dim];

			phase_gradient_value = Phase_Gradients[idx];

		 	certainty_value = Certainties[idx]; 

			matrix_2D_value += certainty_value*certainty_value*B_matrix_value_1*phase_gradi

ent_value*phase_gradient_value*B_matrix_value_2;

		}		

		int offset_index = y * NUMBER_OF_PARAMETERS * NUMBER_OF_PARAMETERS + z * NUMBER_OF_PARAMETERS * NUMBER_OF_PARAMETERS * DATA_H;

		matrix_element_idx = A_matrix_element + offset_index;

		if (dim == 2)

		{

			A_matrix_2D_values[matrix_element_idx] = matrix_2D_value;

		}

		else

		{

			A_matrix_2D_values[matrix_element_idx] += matrix_2D_value;

		}	

	}	

}

If I run the current kernel it takes 12,2 seconds. If I comment the reading of the B_matrix (which is local for each thread) it drops to 2.8 seconds. How can it take such a long time to read from the parameters in the thread? It also affects the time of the other kernels, how is that possible?

If I also comment the reading of phase gradients and certainties it drops to 0.1 s, does this mean that I’m using the memory in a stupid way?

NUMBER_OF_PARAMETERS = 12

DATA_W = DATA_H = DATA_D = 128

Since you are accessing the B_matrix as an array, it gets spilled into local memory which is a part of the “slow” global device memory. Consider using shared memory - I don’t know how many threads your are starting, but with 16KB of shared memory and 4 * 36 bytes per thread, you should be able to run around 112 threads.

I start 128 * 128 blocks with 144 threads each…

The problem is that the values of the B_matrix depends on the spatial positions x, y and z so it should be unique for each thread, is that possible with the shared memory?

Why does it get spilled into the local memory? If I don’t use an array and use B_matrix_value_1, … , B_matrix_value_36 instead, is there any difference?