Varying results in different runs under the same conditions

Dear community,

I recently started programming in CUDA, and the strangest problem has come up. The following kernel:

__global__
void calc_m_alpha_fill_shared(float4 * coords4, int coord_total,
		int coord_offset, M_alpha * m_alphas) {
	extern __shared__ M_alpha shared[];
	int tid = threadIdx.x;
	int coord_index_zero_block = coord_offset + tid;
	//(blockIdx.x +1) because first frame is zero frame
	int coord_address = (blockIdx.x + 1) * coord_total + coord_offset
			+ threadIdx.x;

	if (coord_index_zero_block >= coord_total) {
		return;
	}
	float4 p0 = coords4[coord_index_zero_block];
	float4 p = coords4[coord_address];
	M_alpha M;

	float d = p.x * p.x + p.y * p.y + p.z * p.z + p0.x * p0.x + p0.y * p0.y
			+ p0.z * p0.z;
	M.m00 = d - 2 * p.x * p0.x - 2 * p.y * p0.y - 2 * p.z * p0.z;
	M.m01 = 2 * (p.y * p0.z - p.z * p0.y);
	M.m02 = 2 * (-p.x * p0.z + p.z * p0.x);
	M.m03 = 2 * (p.x * p0.y - p.y * p0.x);
	M.m11 = d - 2 * p.x * p0.x + 2 * p.y * p0.y + 2 * p.z * p0.z;
	M.m12 = -2 * (p.x * p0.y + p.y * p0.x);
	M.m13 = -2 * (p.x * p0.z + p.z * p0.x);
	M.m22 = d + 2 * p.x * p0.x - 2 * p.y * p0.y + 2 * p.z * p0.z;
	M.m23 = -2 * (p.y * p0.z + p.z * p0.y);
	M.m33 = d + 2 * p.x * p0.x + 2 * p.y * p0.y - 2 * p.z * p0.z;

	shared[threadIdx.x] = M;

	__syncthreads();

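	// Tree reduction over the block's 1024 M_alpha entries in shared memory:
	// each step folds the upper half of the active range into the lower half.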
	if (tid < 512) {
		shared[tid].m00 += shared[tid + 512].m00;
		shared[tid].m01 += shared[tid + 512].m01;
		shared[tid].m02 += shared[tid + 512].m02;
		shared[tid].m03 += shared[tid + 512].m03;
		shared[tid].m11 += shared[tid + 512].m11;
		shared[tid].m12 += shared[tid + 512].m12;
		shared[tid].m13 += shared[tid + 512].m13;
		shared[tid].m22 += shared[tid + 512].m22;
		shared[tid].m23 += shared[tid + 512].m23;
		shared[tid].m33 += shared[tid + 512].m33;
	}
	__syncthreads();

	if (tid < 256) {
		shared[tid].m00 += shared[tid + 256].m00;
		shared[tid].m01 += shared[tid + 256].m01;
		shared[tid].m02 += shared[tid + 256].m02;
		shared[tid].m03 += shared[tid + 256].m03;
		shared[tid].m11 += shared[tid + 256].m11;
		shared[tid].m12 += shared[tid + 256].m12;
		shared[tid].m13 += shared[tid + 256].m13;
		shared[tid].m22 += shared[tid + 256].m22;
		shared[tid].m23 += shared[tid + 256].m23;
		shared[tid].m33 += shared[tid + 256].m33;
	}
	__syncthreads();

	if (tid < 128) {
		shared[tid].m00 += shared[tid + 128].m00;
		shared[tid].m01 += shared[tid + 128].m01;
		shared[tid].m02 += shared[tid + 128].m02;
		shared[tid].m03 += shared[tid + 128].m03;
		shared[tid].m11 += shared[tid + 128].m11;
		shared[tid].m12 += shared[tid + 128].m12;
		shared[tid].m13 += shared[tid + 128].m13;
		shared[tid].m22 += shared[tid + 128].m22;
		shared[tid].m23 += shared[tid + 128].m23;
		shared[tid].m33 += shared[tid + 128].m33;
	}
	__syncthreads();

	if (tid < 64) {
		shared[tid].m00 += shared[tid + 64].m00;
		shared[tid].m01 += shared[tid + 64].m01;
		shared[tid].m02 += shared[tid + 64].m02;
		shared[tid].m03 += shared[tid + 64].m03;
		shared[tid].m11 += shared[tid + 64].m11;
		shared[tid].m12 += shared[tid + 64].m12;
		shared[tid].m13 += shared[tid + 64].m13;
		shared[tid].m22 += shared[tid + 64].m22;
		shared[tid].m23 += shared[tid + 64].m23;
		shared[tid].m33 += shared[tid + 64].m33;
	}
	__syncthreads();
	//TODO: from the next if-block on, less than a full warp does useful work
	if (tid < 32) {
		shared[tid].m00 += shared[tid + 32].m00;
		shared[tid].m01 += shared[tid + 32].m01;
		shared[tid].m02 += shared[tid + 32].m02;
		shared[tid].m03 += shared[tid + 32].m03;
		shared[tid].m11 += shared[tid + 32].m11;
		shared[tid].m12 += shared[tid + 32].m12;
		shared[tid].m13 += shared[tid + 32].m13;
		shared[tid].m22 += shared[tid + 32].m22;
		shared[tid].m23 += shared[tid + 32].m23;
		shared[tid].m33 += shared[tid + 32].m33;
	}
	__syncthreads();
	if (tid < 16) {
		shared[tid].m00 += shared[tid + 16].m00;
		shared[tid].m01 += shared[tid + 16].m01;
		shared[tid].m02 += shared[tid + 16].m02;
		shared[tid].m03 += shared[tid + 16].m03;
		shared[tid].m11 += shared[tid + 16].m11;
		shared[tid].m12 += shared[tid + 16].m12;
		shared[tid].m13 += shared[tid + 16].m13;
		shared[tid].m22 += shared[tid + 16].m22;
		shared[tid].m23 += shared[tid + 16].m23;
		shared[tid].m33 += shared[tid + 16].m33;
//		warpReduce(shared, tid);
	}
	__syncthreads();
	if (tid < 8) {
		shared[tid].m00 += shared[tid + 8].m00;
		shared[tid].m01 += shared[tid + 8].m01;
		shared[tid].m02 += shared[tid + 8].m02;
		shared[tid].m03 += shared[tid + 8].m03;
		shared[tid].m11 += shared[tid + 8].m11;
		shared[tid].m12 += shared[tid + 8].m12;
		shared[tid].m13 += shared[tid + 8].m13;
		shared[tid].m22 += shared[tid + 8].m22;
		shared[tid].m23 += shared[tid + 8].m23;
		shared[tid].m33 += shared[tid + 8].m33;
	}
	__syncthreads();
	if (tid < 4) {
		shared[tid].m00 += shared[tid + 4].m00;
		shared[tid].m01 += shared[tid + 4].m01;
		shared[tid].m02 += shared[tid + 4].m02;
		shared[tid].m03 += shared[tid + 4].m03;
		shared[tid].m11 += shared[tid + 4].m11;
		shared[tid].m12 += shared[tid + 4].m12;
		shared[tid].m13 += shared[tid + 4].m13;
		shared[tid].m22 += shared[tid + 4].m22;
		shared[tid].m23 += shared[tid + 4].m23;
		shared[tid].m33 += shared[tid + 4].m33;
	}
	__syncthreads();
	if (tid < 2) {
		shared[tid].m00 += shared[tid + 2].m00;
		shared[tid].m01 += shared[tid + 2].m01;
		shared[tid].m02 += shared[tid + 2].m02;
		shared[tid].m03 += shared[tid + 2].m03;
		shared[tid].m11 += shared[tid + 2].m11;
		shared[tid].m12 += shared[tid + 2].m12;
		shared[tid].m13 += shared[tid + 2].m13;
		shared[tid].m22 += shared[tid + 2].m22;
		shared[tid].m23 += shared[tid + 2].m23;
		shared[tid].m33 += shared[tid + 2].m33;
	}
	__syncthreads();
	if (tid < 1) {
		shared[tid].m00 += shared[tid + 1].m00;
		shared[tid].m01 += shared[tid + 1].m01;
		shared[tid].m02 += shared[tid + 1].m02;
		shared[tid].m03 += shared[tid + 1].m03;
		shared[tid].m11 += shared[tid + 1].m11;
		shared[tid].m12 += shared[tid + 1].m12;
		shared[tid].m13 += shared[tid + 1].m13;
		shared[tid].m22 += shared[tid + 1].m22;
		shared[tid].m23 += shared[tid + 1].m23;
		shared[tid].m33 += shared[tid + 1].m33;
	}
	__syncthreads();

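	// Thread 0 adds this block's partial sum into the per-frame result in global
	// memory; since the kernel is launched once per 1024-coordinate slice, the
	// values in m_alphas accumulate across launches.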
	if (tid == 0) {
		m_alphas[blockIdx.x].m00 += shared[0].m00;
		m_alphas[blockIdx.x].m01 += shared[0].m01;
		m_alphas[blockIdx.x].m02 += shared[0].m02;
		m_alphas[blockIdx.x].m03 += shared[0].m03;
		m_alphas[blockIdx.x].m11 += shared[0].m11;
		m_alphas[blockIdx.x].m12 += shared[0].m12;
		m_alphas[blockIdx.x].m13 += shared[0].m13;
		m_alphas[blockIdx.x].m22 += shared[0].m22;
		m_alphas[blockIdx.x].m23 += shared[0].m23;
		m_alphas[blockIdx.x].m33 += shared[0].m33;
	}
}

is called from

	float * device;
	int zero_frame_volume = 4 * sizeof(float) * num_coords;
	// align to coords_per_block
	int frames_volume = 4 * sizeof(float) * (num_coords / coords_per_block + 1)
			* coords_per_block * (num_frames - 1);
	cudaMalloc(&device, zero_frame_volume + frames_volume);
	cudaMemset(device, 0, zero_frame_volume + frames_volume);
	for (int i = 0; i < num_coords; i++) {
		cudaMemcpy(device + 4 * i, frames + 3 * i, 3 * sizeof(float),
				cudaMemcpyHostToDevice);
	}
	for (int frame_count = 1; frame_count < num_frames; frame_count++) {
		for (int coord_count = 0; coord_count < num_coords; coord_count++) {
			int coord_index = frame_count * num_coords + coord_count;
			CudaSafeCall(
					cudaMemcpy(device + 4 * coord_index,
							frames + 3 * coord_index, 3 * sizeof(float),
							cudaMemcpyHostToDevice));
		}
	}
	float4 * coords4 = reinterpret_cast<float4*>(device);

	M_alpha * d_m_alphas;
	cudaMalloc(&d_m_alphas, sizeof(M_alpha) * num_frames);
	cudaMemset(d_m_alphas, 0, sizeof(M_alpha) * num_frames);
	int M_alpha_volume = 1024;
	int block_num = num_frames - 1;

	int threads_per_block = 1024;
	printf("Kernel launched with %i blocks.\n", block_num);
	printf("Kernel launched with %i threads per block.\n", threads_per_block);
	printf("Volume of M_alpha struct: %zu.\n", sizeof(M_alpha));
	printf("Allocated %zu bytes shared memory on multiprocessors.\n",
			M_alpha_volume * sizeof(M_alpha));
	int iterations = num_coords / threads_per_block + 1;
	printf("Initial kernel iterations: %i.\n", iterations);

	M_alpha * h_m_alphas = (M_alpha *) malloc(num_frames * sizeof(M_alpha));

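	// Each iteration handles one slice of threads_per_block coordinates
	// (offset i * threads_per_block); the kernel adds its per-block result
	// to d_m_alphas, so the per-frame sums build up across launches.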
	for (int i = 0; i < iterations; i++) {
		calc_m_alpha_fill_shared<<<block_num, threads_per_block,
				M_alpha_volume * sizeof(M_alpha)>>>(coords4, num_coords,
				i * threads_per_block, d_m_alphas);
		cudaDeviceSynchronize();
	}
	cudaMemcpy(h_m_alphas, d_m_alphas, num_frames * sizeof(M_alpha),
			cudaMemcpyDeviceToHost);
	printf("%f\n", h_m_alphas[0].m00);
	printf("%f\n", h_m_alphas[0].m01);
	printf("%f\n", h_m_alphas[0].m02);
	printf("%f\n", h_m_alphas[0].m03);
	cudaDeviceSynchronize();
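
As an aside, CudaSafeCall above is just the usual error-checking wrapper, roughly like the sketch below (the exact macro I use may differ in details), together with the checks one would normally add right after a kernel launch:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Report any CUDA runtime error immediately, with file and line.
#define CudaSafeCall(call)                                              \
	do {                                                                \
		cudaError_t err = (call);                                       \
		if (err != cudaSuccess) {                                       \
			fprintf(stderr, "CUDA error '%s' at %s:%d\n",               \
					cudaGetErrorString(err), __FILE__, __LINE__);       \
			exit(EXIT_FAILURE);                                         \
		}                                                               \
	} while (0)

// After a kernel launch, both the launch itself and the execution can be checked:
//   calc_m_alpha_fill_shared<<<blocks, threads, shmem>>>(...);
//   CudaSafeCall(cudaGetLastError());        // invalid launch configuration etc.
//   CudaSafeCall(cudaDeviceSynchronize());   // errors raised while the kernel runs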

M_alpha itself is a simple struct:

#pragma pack(1)
struct M_alpha {
	float m00;
	float m01;
	float m02;
	float m03;
	float m11;
	float m12;
	float m13;
	float m22;
	float m23;
	float m33;
	char padding;
};

What it is supposed to do is calculate the entity M from pairs of (x, y, z) coordinates for a set number of frames and reduce it per frame. I know the parallelization is probably clumsy and the reduction can be improved, but my problem is a different one: in different runs the program gives me different results. The output is always either the correct result or one of roughly five small variations of it.
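
For checking the device output, a plain sequential reference on the host can be used. Below is a sketch (the function name is arbitrary; it assumes the M_alpha struct defined above and coordinates stored as x, y, z triples, frame after frame, as in the copy loop above):

#include <cstring>   // memset

// Accumulate M for frame `frame_idx` (>= 1) against frame 0, sequentially.
M_alpha cpu_reference_m_alpha(const float *frames, int num_coords, int frame_idx) {
	M_alpha acc;
	memset(&acc, 0, sizeof(acc));                                     // start from zero
	for (int c = 0; c < num_coords; c++) {
		const float *p0 = frames + 3 * c;                             // frame 0
		const float *p  = frames + 3 * (frame_idx * num_coords + c);  // frame frame_idx
		float d = p[0] * p[0] + p[1] * p[1] + p[2] * p[2]
				+ p0[0] * p0[0] + p0[1] * p0[1] + p0[2] * p0[2];
		acc.m00 += d - 2 * p[0] * p0[0] - 2 * p[1] * p0[1] - 2 * p[2] * p0[2];
		acc.m01 += 2 * (p[1] * p0[2] - p[2] * p0[1]);
		acc.m02 += 2 * (-p[0] * p0[2] + p[2] * p0[0]);
		acc.m03 += 2 * (p[0] * p0[1] - p[1] * p0[0]);
		acc.m11 += d - 2 * p[0] * p0[0] + 2 * p[1] * p0[1] + 2 * p[2] * p0[2];
		acc.m12 += -2 * (p[0] * p0[1] + p[1] * p0[0]);
		acc.m13 += -2 * (p[0] * p0[2] + p[2] * p0[0]);
		acc.m22 += d + 2 * p[0] * p0[0] - 2 * p[1] * p0[1] + 2 * p[2] * p0[2];
		acc.m23 += -2 * (p[1] * p0[2] + p[2] * p0[1]);
		acc.m33 += d + 2 * p[0] * p0[0] + 2 * p[1] * p0[1] - 2 * p[2] * p0[2];
	}
	return acc;
}

Floating-point addition is not associative, so this sequential sum and the tree reduction on the GPU may legitimately differ in the last bits; a deterministic kernel should nevertheless give bit-identical output from run to run.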

I ran all the cuda-memcheck tools, in particular racecheck, which does not report any hazards. What is interesting, though, is that when I run the program under plain memcheck (i.e. "cuda-memcheck ./program") the result is always the correct one! Obviously memcheck changes something compared to a normal program run; unfortunately, I was unable to find the source code for memcheck.

I would appreciate help or hints.

All the best
Twerp

Okay, here it is: because the kernel is called multiple times and the input varies (sometimes 1024 elements, sometimes fewer), some old values survived in shared memory and had to be set to 0 before continuing:

	M_alpha M;
	M.m00 = 0;
	M.m01 = 0;
	M.m02 = 0;
	M.m03 = 0;
	M.m11 = 0;
	M.m12 = 0;
	M.m13 = 0;
	M.m22 = 0;
	M.m23 = 0;
	M.m33 = 0;
	shared[tid] = M;
	if (coord_index_zero_block >= coord_total) {
		return;
	}
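
A variant of this prologue that additionally avoids returning before the later __syncthreads() calls (threads that exit early never reach the barriers, which is undefined behavior) could look like the following sketch; the reduction part stays unchanged:

	bool in_range = (coord_index_zero_block < coord_total);

	M_alpha M;
	// zero all ten members of M here, exactly as above

	if (in_range) {
		float4 p0 = coords4[coord_index_zero_block];
		float4 p = coords4[coord_address];
		// ... compute M.m00 .. M.m33 as in the original kernel ...
	}
	shared[tid] = M;          // out-of-range threads contribute zeros
	__syncthreads();          // reached by every thread of the block
	// ... reduction as before ...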

Maybe my stupidity helps someone!
Best,
Twerp

Can you write:

M_alpha M = { 0 };

? That is a simpler way to initialize a structure. You can also use in-line initializers like this:

struct M_alpha 
{
	float m00 = { 0 };
	float m01 = { 0 };
	float m02 = { 0 };
	float m03 = { 0 };
	float m11 = { 0 };
	float m12 = { 0 };
	float m13 = { 0 };
	float m22 = { 0 };
	float m23 = { 0 };
	float m33 = { 0 };
};

Then every instance of it is always initialized to zeros.
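
One caveat, if I remember the standard correctly: once the members have in-class initializers, the aggregate form M_alpha M = { 0 }; needs C++14 rules (i.e. nvcc with -std=c++14 or newer), while plain value-initialization works either way:

M_alpha M{};        // every member takes its in-class initializer (0.0f)
M_alpha N = { 0 };  // aggregate initialization; with in-class initializers
                    // present this form requires C++14 or later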

Do you really need the alignment pragma and padding variable? I don’t think it actually does any good in this case. The compiler is going to align the items on a four-byte boundary anyway.
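
To illustrate (a sketch with the ten floats written as an array for brevity; exact sizes are implementation defined, but this is what typical compilers with 4-byte float alignment produce):

#pragma pack(1)
struct M_alpha_packed { float m[10]; char padding; };    // sizeof == 41, alignment 1
#pragma pack()

struct M_alpha_natural { float m[10]; char padding; };   // sizeof == 44, alignment 4
struct M_alpha_plain { float m[10]; };                    // sizeof == 40, alignment 4

// The ten floats sit back to back in every case; the pragma only changes how the
// trailing char is handled and the alignment of the struct as a whole. A 41-byte
// element also means consecutive array elements are no longer 4-byte aligned.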

Thank you, ryork, for your comments! I will apply your suggestions to my codebase. Regarding the padding: I added it to prevent possible bank conflicts; whether there is an actual performance gain remains to be seen. I am curious about your remark regarding compiler alignment, though. I thought the #pragma directive prevents exactly that? If I query

sizeof(M_alpha)

the result is indeed 41.

All the best
Twerp