Varying results in different runs under the same conditions

Dear community,

I recently started programming in CUDA, and the strangest problem has come up. The following kernel:

__global__
void calc_m_alpha_fill_shared(float4 * coords4, int coord_total,
		int coord_offset, M_alpha * m_alphas) {
	extern __shared__ M_alpha shared[];
	int tid = threadIdx.x;
	int coord_index_zero_block = coord_offset + tid;
	//(blockIdx.x +1) because first frame is zero frame
	int coord_address = (blockIdx.x + 1) * coord_total + coord_offset
			+ threadIdx.x;

	if (coord_index_zero_block >= coord_total) {
		return;
	}
	float4 p0 = coords4[coord_index_zero_block];
	float4 p = coords4[coord_address];
	M_alpha M;

	float d = p.x * p.x + p.y * p.y + p.z * p.z + p0.x * p0.x + p0.y * p0.y
			+ p0.z * p0.z;
	M.m00 = d - 2 * p.x * p0.x - 2 * p.y * p0.y - 2 * p.z * p0.z;
	M.m01 = 2 * (p.y * p0.z - p.z * p0.y);
	M.m02 = 2 * (-p.x * p0.z + p.z * p0.x);
	M.m03 = 2 * (p.x * p0.y - p.y * p0.x);
	M.m11 = d - 2 * p.x * p0.x + 2 * p.y * p0.y + 2 * p.z * p0.z;
	M.m12 = -2 * (p.x * p0.y + p.y * p0.x);
	M.m13 = -2 * (p.x * p0.z + p.z * p0.x);
	M.m22 = d + 2 * p.x * p0.x - 2 * p.y * p0.y + 2 * p.z * p0.z;
	M.m23 = -2 * (p.y * p0.z + p.z * p0.y);
	M.m33 = d + 2 * p.x * p0.x + 2 * p.y * p0.y - 2 * p.z * p0.z;

	shared[threadIdx.x] = M;

	__syncthreads();

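	// Tree reduction over the block's 1024 M_alpha entries in shared memory:
	// each step folds the upper half of the active range into the lower half.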
	if (tid < 512) {
		shared[tid].m00 += shared[tid + 512].m00;
		shared[tid].m01 += shared[tid + 512].m01;
		shared[tid].m02 += shared[tid + 512].m02;
		shared[tid].m03 += shared[tid + 512].m03;
		shared[tid].m11 += shared[tid + 512].m11;
		shared[tid].m12 += shared[tid + 512].m12;
		shared[tid].m13 += shared[tid + 512].m13;
		shared[tid].m22 += shared[tid + 512].m22;
		shared[tid].m23 += shared[tid + 512].m23;
		shared[tid].m33 += shared[tid + 512].m33;
	}
	__syncthreads();

	if (tid < 256) {
		shared[tid].m00 += shared[tid + 256].m00;
		shared[tid].m01 += shared[tid + 256].m01;
		shared[tid].m02 += shared[tid + 256].m02;
		shared[tid].m03 += shared[tid + 256].m03;
		shared[tid].m11 += shared[tid + 256].m11;
		shared[tid].m12 += shared[tid + 256].m12;
		shared[tid].m13 += shared[tid + 256].m13;
		shared[tid].m22 += shared[tid + 256].m22;
		shared[tid].m23 += shared[tid + 256].m23;
		shared[tid].m33 += shared[tid + 256].m33;
	}
	__syncthreads();

	if (tid < 128) {
		shared[tid].m00 += shared[tid + 128].m00;
		shared[tid].m01 += shared[tid + 128].m01;
		shared[tid].m02 += shared[tid + 128].m02;
		shared[tid].m03 += shared[tid + 128].m03;
		shared[tid].m11 += shared[tid + 128].m11;
		shared[tid].m12 += shared[tid + 128].m12;
		shared[tid].m13 += shared[tid + 128].m13;
		shared[tid].m22 += shared[tid + 128].m22;
		shared[tid].m23 += shared[tid + 128].m23;
		shared[tid].m33 += shared[tid + 128].m33;
	}
	__syncthreads();

	if (tid < 64) {
		shared[tid].m00 += shared[tid + 64].m00;
		shared[tid].m01 += shared[tid + 64].m01;
		shared[tid].m02 += shared[tid + 64].m02;
		shared[tid].m03 += shared[tid + 64].m03;
		shared[tid].m11 += shared[tid + 64].m11;
		shared[tid].m12 += shared[tid + 64].m12;
		shared[tid].m13 += shared[tid + 64].m13;
		shared[tid].m22 += shared[tid + 64].m22;
		shared[tid].m23 += shared[tid + 64].m23;
		shared[tid].m33 += shared[tid + 64].m33;
	}
	__syncthreads();
	//TODO: from the next if-block on, less than a full warp does useful work
	if (tid < 32) {
		shared[tid].m00 += shared[tid + 32].m00;
		shared[tid].m01 += shared[tid + 32].m01;
		shared[tid].m02 += shared[tid + 32].m02;
		shared[tid].m03 += shared[tid + 32].m03;
		shared[tid].m11 += shared[tid + 32].m11;
		shared[tid].m12 += shared[tid + 32].m12;
		shared[tid].m13 += shared[tid + 32].m13;
		shared[tid].m22 += shared[tid + 32].m22;
		shared[tid].m23 += shared[tid + 32].m23;
		shared[tid].m33 += shared[tid + 32].m33;
	}
	__syncthreads();
	if (tid < 16) {
		shared[tid].m00 += shared[tid + 16].m00;
		shared[tid].m01 += shared[tid + 16].m01;
		shared[tid].m02 += shared[tid + 16].m02;
		shared[tid].m03 += shared[tid + 16].m03;
		shared[tid].m11 += shared[tid + 16].m11;
		shared[tid].m12 += shared[tid + 16].m12;
		shared[tid].m13 += shared[tid + 16].m13;
		shared[tid].m22 += shared[tid + 16].m22;
		shared[tid].m23 += shared[tid + 16].m23;
		shared[tid].m33 += shared[tid + 16].m33;
//		warpReduce(shared, tid);
	}
	__syncthreads();
	if (tid < 8) {
		shared[tid].m00 += shared[tid + 8].m00;
		shared[tid].m01 += shared[tid + 8].m01;
		shared[tid].m02 += shared[tid + 8].m02;
		shared[tid].m03 += shared[tid + 8].m03;
		shared[tid].m11 += shared[tid + 8].m11;
		shared[tid].m12 += shared[tid + 8].m12;
		shared[tid].m13 += shared[tid + 8].m13;
		shared[tid].m22 += shared[tid + 8].m22;
		shared[tid].m23 += shared[tid + 8].m23;
		shared[tid].m33 += shared[tid + 8].m33;
	}
	__syncthreads();
	if (tid < 4) {
		shared[tid].m00 += shared[tid + 4].m00;
		shared[tid].m01 += shared[tid + 4].m01;
		shared[tid].m02 += shared[tid + 4].m02;
		shared[tid].m03 += shared[tid + 4].m03;
		shared[tid].m11 += shared[tid + 4].m11;
		shared[tid].m12 += shared[tid + 4].m12;
		shared[tid].m13 += shared[tid + 4].m13;
		shared[tid].m22 += shared[tid + 4].m22;
		shared[tid].m23 += shared[tid + 4].m23;
		shared[tid].m33 += shared[tid + 4].m33;
	}
	__syncthreads();
	if (tid < 2) {
		shared[tid].m00 += shared[tid + 2].m00;
		shared[tid].m01 += shared[tid + 2].m01;
		shared[tid].m02 += shared[tid + 2].m02;
		shared[tid].m03 += shared[tid + 2].m03;
		shared[tid].m11 += shared[tid + 2].m11;
		shared[tid].m12 += shared[tid + 2].m12;
		shared[tid].m13 += shared[tid + 2].m13;
		shared[tid].m22 += shared[tid + 2].m22;
		shared[tid].m23 += shared[tid + 2].m23;
		shared[tid].m33 += shared[tid + 2].m33;
	}
	__syncthreads();
	if (tid < 1) {
		shared[tid].m00 += shared[tid + 1].m00;
		shared[tid].m01 += shared[tid + 1].m01;
		shared[tid].m02 += shared[tid + 1].m02;
		shared[tid].m03 += shared[tid + 1].m03;
		shared[tid].m11 += shared[tid + 1].m11;
		shared[tid].m12 += shared[tid + 1].m12;
		shared[tid].m13 += shared[tid + 1].m13;
		shared[tid].m22 += shared[tid + 1].m22;
		shared[tid].m23 += shared[tid + 1].m23;
		shared[tid].m33 += shared[tid + 1].m33;
	}
	__syncthreads();

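	// Thread 0 adds this block's partial sum into the per-frame result in global
	// memory; since the kernel is launched once per 1024-coordinate slice, the
	// values in m_alphas accumulate across launches.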
	if (tid == 0) {
		m_alphas[blockIdx.x].m00 += shared[0].m00;
		m_alphas[blockIdx.x].m01 += shared[0].m01;
		m_alphas[blockIdx.x].m02 += shared[0].m02;
		m_alphas[blockIdx.x].m03 += shared[0].m03;
		m_alphas[blockIdx.x].m11 += shared[0].m11;
		m_alphas[blockIdx.x].m12 += shared[0].m12;
		m_alphas[blockIdx.x].m13 += shared[0].m13;
		m_alphas[blockIdx.x].m22 += shared[0].m22;
		m_alphas[blockIdx.x].m23 += shared[0].m23;
		m_alphas[blockIdx.x].m33 += shared[0].m33;
	}
}

is called from

	float * device;
	int zero_frame_volume = 4 * sizeof(float) * num_coords;
	// align to coords_per_block
	int frames_volume = 4 * sizeof(float) * (num_coords / coords_per_block + 1)
			* coords_per_block * (num_frames - 1);
	cudaMalloc(&device, zero_frame_volume + frames_volume);
	cudaMemset(device, 0, zero_frame_volume + frames_volume);
	for (int i = 0; i < num_coords; i++) {
		cudaMemcpy(device + 4 * i, frames + 3 * i, 3 * sizeof(float),
				cudaMemcpyHostToDevice);
	}
	for (int frame_count = 1; frame_count < num_frames; frame_count++) {
		for (int coord_count = 0; coord_count < num_coords; coord_count++) {
			int coord_index = frame_count * num_coords + coord_count;
			CudaSafeCall(
					cudaMemcpy(device + 4 * coord_index,
							frames + 3 * coord_index, 3 * sizeof(float),
							cudaMemcpyHostToDevice));
		}
	}
	float4 * coords4 = reinterpret_cast<float4*>(device);

	M_alpha * d_m_alphas;
	cudaMalloc(&d_m_alphas, sizeof(M_alpha) * num_frames);
	cudaMemset(d_m_alphas, 0, sizeof(M_alpha) * num_frames);
	int M_alpha_volume = 1024;
	int block_num = num_frames - 1;

	int threads_per_block = 1024;
	printf("Kernel launched with %i blocks.\n", block_num);
	printf("Kernel launched with %i threads per block.\n", threads_per_block);
	printf("Volume of M_alpha struct: %zu.\n", sizeof(M_alpha));
	printf("Allocated %zu bytes shared memory on multiprocessors.\n",
			M_alpha_volume * sizeof(M_alpha));
	int iterations = num_coords / threads_per_block + 1;
	printf("Initial kernel iterations: %i.\n", iterations);

	M_alpha * h_m_alphas = (M_alpha *) malloc(num_frames * sizeof(M_alpha));

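	// Each iteration handles one slice of threads_per_block coordinates
	// (offset i * threads_per_block); the kernel adds its per-block result
	// to d_m_alphas, so the per-frame sums build up across launches.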
	for (int i = 0; i < iterations; i++) {
		calc_m_alpha_fill_shared<<<block_num, threads_per_block,
				M_alpha_volume * sizeof(M_alpha)>>>(coords4, num_coords,
				i * threads_per_block, d_m_alphas);
		cudaDeviceSynchronize();
	}
	cudaMemcpy(h_m_alphas, d_m_alphas, num_frames * sizeof(M_alpha),
			cudaMemcpyDeviceToHost);
	printf("%f\n", h_m_alphas[0].m00);
	printf("%f\n", h_m_alphas[0].m01);
	printf("%f\n", h_m_alphas[0].m02);
	printf("%f\n", h_m_alphas[0].m03);
	cudaDeviceSynchronize();
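
As an aside, CudaSafeCall above is just the usual error-checking wrapper, roughly like the sketch below (the exact macro I use may differ in details), together with the checks one would normally add right after a kernel launch:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Report any CUDA runtime error immediately, with file and line.
#define CudaSafeCall(call)                                              \
	do {                                                                \
		cudaError_t err = (call);                                       \
		if (err != cudaSuccess) {                                       \
			fprintf(stderr, "CUDA error '%s' at %s:%d\n",               \
					cudaGetErrorString(err), __FILE__, __LINE__);       \
			exit(EXIT_FAILURE);                                         \
		}                                                               \
	} while (0)

// After a kernel launch, both the launch itself and the execution can be checked:
//   calc_m_alpha_fill_shared<<<blocks, threads, shmem>>>(...);
//   CudaSafeCall(cudaGetLastError());        // invalid launch configuration etc.
//   CudaSafeCall(cudaDeviceSynchronize());   // errors raised while the kernel runs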

M_alpha itself is a simple struct:

#pragma pack(1)
struct M_alpha {
	float m00;
	float m01;
	float m02;
	float m03;
	float m11;
	float m12;
	float m13;
	float m22;
	float m23;
	float m33;
	char padding;
};

What it is supposed to do is calculate the entity M from pairs of (x, y, z) coordinates for a set number of frames and reduce it per frame. I know the parallelization is probably clumsy and the reduction can be improved, but my problem is a different one: in different runs the program gives me different results. The output is always either the correct result or one of roughly five small variations of it.
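
For checking the device output, a plain sequential reference on the host can be used. Below is a sketch (the function name is arbitrary; it assumes the M_alpha struct defined above and coordinates stored as x, y, z triples, frame after frame, as in the copy loop above):

#include <cstring>   // memset

// Accumulate M for frame `frame_idx` (>= 1) against frame 0, sequentially.
M_alpha cpu_reference_m_alpha(const float *frames, int num_coords, int frame_idx) {
	M_alpha acc;
	memset(&acc, 0, sizeof(acc));                                     // start from zero
	for (int c = 0; c < num_coords; c++) {
		const float *p0 = frames + 3 * c;                             // frame 0
		const float *p  = frames + 3 * (frame_idx * num_coords + c);  // frame frame_idx
		float d = p[0] * p[0] + p[1] * p[1] + p[2] * p[2]
				+ p0[0] * p0[0] + p0[1] * p0[1] + p0[2] * p0[2];
		acc.m00 += d - 2 * p[0] * p0[0] - 2 * p[1] * p0[1] - 2 * p[2] * p0[2];
		acc.m01 += 2 * (p[1] * p0[2] - p[2] * p0[1]);
		acc.m02 += 2 * (-p[0] * p0[2] + p[2] * p0[0]);
		acc.m03 += 2 * (p[0] * p0[1] - p[1] * p0[0]);
		acc.m11 += d - 2 * p[0] * p0[0] + 2 * p[1] * p0[1] + 2 * p[2] * p0[2];
		acc.m12 += -2 * (p[0] * p0[1] + p[1] * p0[0]);
		acc.m13 += -2 * (p[0] * p0[2] + p[2] * p0[0]);
		acc.m22 += d + 2 * p[0] * p0[0] - 2 * p[1] * p0[1] + 2 * p[2] * p0[2];
		acc.m23 += -2 * (p[1] * p0[2] + p[2] * p0[1]);
		acc.m33 += d + 2 * p[0] * p0[0] + 2 * p[1] * p0[1] - 2 * p[2] * p0[2];
	}
	return acc;
}

Floating-point addition is not associative, so this sequential sum and the tree reduction on the GPU may legitimately differ in the last bits; a deterministic kernel should nevertheless give bit-identical output from run to run.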

I ran all the cuda-memcheck tools, in particular racecheck, which does not report any hazards. What is interesting, though, is that when I run the program under plain memcheck (i.e. "cuda-memcheck ./program") the result is always the correct one! Obviously memcheck changes something compared to a normal program run; unfortunately, I was unable to find the source code for memcheck.

I would appreciate help or hints.

All the best
Twerp

Okay, here it is: because the kernel is called multiple times and the input varies (sometimes 1024 elements, sometimes fewer), some old values survived in shared memory and had to be set to 0 before continuing:

	M_alpha M;
	M.m00 = 0;
	M.m01 = 0;
	M.m02 = 0;
	M.m03 = 0;
	M.m11 = 0;
	M.m12 = 0;
	M.m13 = 0;
	M.m22 = 0;
	M.m23 = 0;
	M.m33 = 0;
	shared[tid] = M;
	if (coord_index_zero_block >= coord_total) {
		return;
	}
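
A variant of this prologue that additionally avoids returning before the later __syncthreads() calls (threads that exit early never reach the barriers, which is undefined behavior) could look like the following sketch; the reduction part stays unchanged:

	bool in_range = (coord_index_zero_block < coord_total);

	M_alpha M;
	// zero all ten members of M here, exactly as above

	if (in_range) {
		float4 p0 = coords4[coord_index_zero_block];
		float4 p = coords4[coord_address];
		// ... compute M.m00 .. M.m33 as in the original kernel ...
	}
	shared[tid] = M;          // out-of-range threads contribute zeros
	__syncthreads();          // reached by every thread of the block
	// ... reduction as before ...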

Maybe my stupidity helps someone!
Best,
Twerp

Can you write:

M_alpha M = { 0 };

? That is a simpler way to initialize a structure. You can also use in-line initializers like this:

struct M_alpha 
{
	float m00 = { 0 };
	float m01 = { 0 };
	float m02 = { 0 };
	float m03 = { 0 };
	float m11 = { 0 };
	float m12 = { 0 };
	float m13 = { 0 };
	float m22 = { 0 };
	float m23 = { 0 };
	float m33 = { 0 };
};

Then every instance of it is always initialized to zeros.
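
One caveat, if I remember the standard correctly: once the members have in-class initializers, the aggregate form M_alpha M = { 0 }; needs C++14 rules (i.e. nvcc with -std=c++14 or newer), while plain value-initialization works either way:

M_alpha M{};        // every member takes its in-class initializer (0.0f)
M_alpha N = { 0 };  // aggregate initialization; with in-class initializers
                    // present this form requires C++14 or later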

Do you really need the alignment pragma and padding variable? I don’t think it actually does any good in this case. The compiler is going to align the items on a four-byte boundary anyway.
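
To illustrate (a sketch with the ten floats written as an array for brevity; exact sizes are implementation defined, but this is what typical compilers with 4-byte float alignment produce):

#pragma pack(1)
struct M_alpha_packed { float m[10]; char padding; };    // sizeof == 41, alignment 1
#pragma pack()

struct M_alpha_natural { float m[10]; char padding; };   // sizeof == 44, alignment 4
struct M_alpha_plain { float m[10]; };                    // sizeof == 40, alignment 4

// The ten floats sit back to back in every case; the pragma only changes how the
// trailing char is handled and the alignment of the struct as a whole. A 41-byte
// element also means consecutive array elements are no longer 4-byte aligned.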

Thank you, ryork, for your comments! I will apply your suggestions to my codebase. Regarding the padding: I added it to prevent possible bank conflicts; whether there is an actual performance gain remains to be seen. I am curious about your remark regarding compiler alignment, though. I thought the #pragma directive prevents exactly that? If I query

sizeof(M_alpha)

the result is indeed 41.

All the best
Twerp