// dimBlock.x = NUMBER_OF_PARAMETERS; dimBlock.y = NUMBER_OF_PARAMETERS; dimBlock.z = 1;
// dimGrid.x = DATA_D; dimGrid.y = DATA_H;
__global__ void Calculate_A_matrix_2D_values(float* A_matrix_2D_values, float* Certainties, float* Phase_Gradients, int dim)
{
    int y = blockIdx.y;
    int z = blockIdx.x;
    int i = threadIdx.x;
    int j = threadIdx.y;

    int idx, A_matrix_element, matrix_element_idx, i_index, j_index;
    float B_matrix[36];
    float phase_gradient_value, certainty_value, B_matrix_value_1, B_matrix_value_2, B_matrix_value;
    float matrix_2D_value = 0.0f;
    float xf, yf, zf;

    if (y < (DATA_H - PADDING_Y) && z < (DATA_D - PADDING_Z) && i < NUMBER_OF_PARAMETERS && j < NUMBER_OF_PARAMETERS)
    {
        //     (1 0 0 x y z 0 0 0 0 0 0)
        // B = (0 1 0 0 0 0 x y z 0 0 0)
        //     (0 0 1 0 0 0 0 0 0 x y z)

        // Change to a coordinate system with the origin in the middle: (sx-1)/2, (sy-1)/2, (sz-1)/2
        xf = (float)0 - ((float)DATA_W - 1.0f)/2.0f;
        yf = (float)y - ((float)DATA_H - 1.0f)/2.0f;
        zf = (float)z - ((float)DATA_D - 1.0f)/2.0f;

        // B is stored row-major as [parameter][dimension]; each group of three below is one column of the comment above
        B_matrix[0]  = 1;  B_matrix[1]  = 0;  B_matrix[2]  = 0;
        B_matrix[3]  = 0;  B_matrix[4]  = 1;  B_matrix[5]  = 0;
        B_matrix[6]  = 0;  B_matrix[7]  = 0;  B_matrix[8]  = 1;
        B_matrix[9]  = xf; B_matrix[10] = 0;  B_matrix[11] = 0;
        B_matrix[12] = yf; B_matrix[13] = 0;  B_matrix[14] = 0;
        B_matrix[15] = zf; B_matrix[16] = 0;  B_matrix[17] = 0;
        B_matrix[18] = 0;  B_matrix[19] = xf; B_matrix[20] = 0;
        B_matrix[21] = 0;  B_matrix[22] = yf; B_matrix[23] = 0;
        B_matrix[24] = 0;  B_matrix[25] = zf; B_matrix[26] = 0;
        B_matrix[27] = 0;  B_matrix[28] = 0;  B_matrix[29] = xf;
        B_matrix[30] = 0;  B_matrix[31] = 0;  B_matrix[32] = yf;
        B_matrix[33] = 0;  B_matrix[34] = 0;  B_matrix[35] = zf;

        A_matrix_element = i*NUMBER_OF_PARAMETERS + j; // max 144 (12 x 12 matrix)
        i_index = i*NUMBER_OF_DIMENSIONS + dim;
        j_index = j*NUMBER_OF_DIMENSIONS + dim;

        // Accumulate the (i,j) element along the x direction: c^2 * B_j * g^2 * B_i
        for (int x = 0; x < DATA_W; x++)
        {
            idx = x + y * DATA_W + z * DATA_W * DATA_H;
            xf = (float)x - ((float)DATA_W - 1.0f)/2.0f;
            B_matrix[9] = xf; B_matrix[19] = xf; B_matrix[29] = xf;

            B_matrix_value_1 = B_matrix[j*NUMBER_OF_DIMENSIONS + dim];
            B_matrix_value_2 = B_matrix[i*NUMBER_OF_DIMENSIONS + dim];
            phase_gradient_value = Phase_Gradients[idx];
            certainty_value = Certainties[idx];

            matrix_2D_value += certainty_value*certainty_value*B_matrix_value_1*phase_gradient_value*phase_gradient_value*B_matrix_value_2;
        }

        int offset_index = y * NUMBER_OF_PARAMETERS * NUMBER_OF_PARAMETERS + z * NUMBER_OF_PARAMETERS * NUMBER_OF_PARAMETERS * DATA_H;
        matrix_element_idx = A_matrix_element + offset_index;

        // dim == 2 initializes the element; the other dimensions accumulate into it
        if (dim == 2)
        {
            A_matrix_2D_values[matrix_element_idx] = matrix_2D_value;
        }
        else
        {
            A_matrix_2D_values[matrix_element_idx] += matrix_2D_value;
        }
    }
}
If I run the current kernel it takes 12.2 seconds. If I comment out the reads of B_matrix (which is local to each thread), it drops to 2.8 seconds. How can reading a thread-local array take so long? It also affects the timing of the other kernels; how is that possible?
If I also comment out the reads of the phase gradients and certainties, it drops to 0.1 s. Does this mean that I'm using the memory in a stupid way?
Since you index B_matrix with values that are only known at run time, the compiler spills it into local memory, which is part of the "slow" global device memory. Consider using shared memory instead. I don't know how many threads you are starting, but with 16 KB of shared memory and 4 * 36 bytes per thread, you should be able to run around 112 threads.
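For illustration, a minimal sketch of that idea, assuming a 1-D block (the kernel above launches 12 x 12 = 144 threads per block, so the block shape or thread count would have to change to fit in 16 KB); the kernel name and THREADS_PER_BLOCK are placeholders:

#define THREADS_PER_BLOCK 112 // 112 threads * 36 floats * 4 bytes = 16128 bytes, just under 16 KB

// Sketch only: each thread owns a 36-float slice of shared memory, so
// indexing with run-time values hits fast on-chip memory instead of a
// local-memory spill.
__global__ void Shared_B_matrix_sketch(float* Output, int dim)
{
    __shared__ float B_matrix[THREADS_PER_BLOCK][36];
    int t = threadIdx.x;

    // Each thread fills and reads only its own slice, so the values can
    // still differ between threads.
    for (int k = 0; k < 36; k++)
    {
        B_matrix[t][k] = 0.0f;
    }
    B_matrix[t][0] = 1.0f;

    // dim is assumed to be in [0, 36) here
    Output[blockIdx.x * blockDim.x + t] = B_matrix[t][dim];
}

Shared memory is shared per block, not per thread, which is why each thread indexes its own slice here.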
The problem is that the values of B_matrix depend on the spatial positions x, y and z, so they have to be unique for each thread. Is that possible with shared memory?
Why does it get spilled into local memory? If I don't use an array and instead use 36 separate scalars, B_matrix_value_1, …, B_matrix_value_36, is there any difference?
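For reference, what decides the spill is whether the array indices are compile-time constants; a hedged sketch (Indexing_sketch is a made-up name):

__global__ void Indexing_sketch(float* out, int j)
{
    // All indices are compile-time constants: the compiler can keep
    // this array entirely in registers.
    float regs[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float from_registers = regs[0] + regs[3];

    // One index is only known at run time: registers are not addressable,
    // so the compiler typically spills this whole array to off-chip local
    // memory to make spilled[j] possible.
    float spilled[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float from_local = spilled[j];

    out[threadIdx.x] = from_registers + from_local;
}

So 36 separate scalars read only with constant indices can stay in registers (register pressure permitting), while an array indexed with i, j and dim generally cannot.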