Dear community,
I recently started programming in CUDA and have run into a very strange problem. The following kernel:
__global__
void calc_m_alpha_fill_shared(float4 * coords4, int coord_total,
                              int coord_offset, M_alpha * m_alphas) {
    extern __shared__ M_alpha shared[];
    int tid = threadIdx.x;
    int coord_index_zero_block = coord_offset + tid;
    // (blockIdx.x + 1) because first frame is zero frame
    int coord_address = (blockIdx.x + 1) * coord_total + coord_offset
                        + threadIdx.x;
    if (coord_index_zero_block >= coord_total) {
        return;
    }
    float4 p0 = coords4[coord_index_zero_block];
    float4 p = coords4[coord_address];
    M_alpha M;
    float d = p.x * p.x + p.y * p.y + p.z * p.z + p0.x * p0.x + p0.y * p0.y
              + p0.z * p0.z;
    M.m00 = d - 2 * p.x * p0.x - 2 * p.y * p0.y - 2 * p.z * p0.z;
    M.m01 = 2 * (p.y * p0.z - p.z * p0.y);
    M.m02 = 2 * (-p.x * p0.z + p.z * p0.x);
    M.m03 = 2 * (p.x * p0.y - p.y * p0.x);
    M.m11 = d - 2 * p.x * p0.x + 2 * p.y * p0.y + 2 * p.z * p0.z;
    M.m12 = -2 * (p.x * p0.y + p.y * p0.x);
    M.m13 = -2 * (p.x * p0.z + p.z * p0.x);
    M.m22 = d + 2 * p.x * p0.x - 2 * p.y * p0.y + 2 * p.z * p0.z;
    M.m23 = -2 * (p.y * p0.z + p.z * p0.y);
    M.m33 = d + 2 * p.x * p0.x + 2 * p.y * p0.y - 2 * p.z * p0.z;
    shared[threadIdx.x] = M;
    __syncthreads();
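    // Unrolled tree reduction of the per-thread matrices in shared memory
    // (blockDim.x == 1024 in the launch below, hence the starting stride of 512).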
    if (tid < 512) {
        shared[tid].m00 += shared[tid + 512].m00;
        shared[tid].m01 += shared[tid + 512].m01;
        shared[tid].m02 += shared[tid + 512].m02;
        shared[tid].m03 += shared[tid + 512].m03;
        shared[tid].m11 += shared[tid + 512].m11;
        shared[tid].m12 += shared[tid + 512].m12;
        shared[tid].m13 += shared[tid + 512].m13;
        shared[tid].m22 += shared[tid + 512].m22;
        shared[tid].m23 += shared[tid + 512].m23;
        shared[tid].m33 += shared[tid + 512].m33;
    }
    __syncthreads();
    if (tid < 256) {
        shared[tid].m00 += shared[tid + 256].m00;
        shared[tid].m01 += shared[tid + 256].m01;
        shared[tid].m02 += shared[tid + 256].m02;
        shared[tid].m03 += shared[tid + 256].m03;
        shared[tid].m11 += shared[tid + 256].m11;
        shared[tid].m12 += shared[tid + 256].m12;
        shared[tid].m13 += shared[tid + 256].m13;
        shared[tid].m22 += shared[tid + 256].m22;
        shared[tid].m23 += shared[tid + 256].m23;
        shared[tid].m33 += shared[tid + 256].m33;
    }
    __syncthreads();
    if (tid < 128) {
        shared[tid].m00 += shared[tid + 128].m00;
        shared[tid].m01 += shared[tid + 128].m01;
        shared[tid].m02 += shared[tid + 128].m02;
        shared[tid].m03 += shared[tid + 128].m03;
        shared[tid].m11 += shared[tid + 128].m11;
        shared[tid].m12 += shared[tid + 128].m12;
        shared[tid].m13 += shared[tid + 128].m13;
        shared[tid].m22 += shared[tid + 128].m22;
        shared[tid].m23 += shared[tid + 128].m23;
        shared[tid].m33 += shared[tid + 128].m33;
    }
    __syncthreads();
    if (tid < 64) {
        shared[tid].m00 += shared[tid + 64].m00;
        shared[tid].m01 += shared[tid + 64].m01;
        shared[tid].m02 += shared[tid + 64].m02;
        shared[tid].m03 += shared[tid + 64].m03;
        shared[tid].m11 += shared[tid + 64].m11;
        shared[tid].m12 += shared[tid + 64].m12;
        shared[tid].m13 += shared[tid + 64].m13;
        shared[tid].m22 += shared[tid + 64].m22;
        shared[tid].m23 += shared[tid + 64].m23;
        shared[tid].m33 += shared[tid + 64].m33;
    }
    __syncthreads();
    // TODO: consider this: from the next block on not the whole warp is used
    if (tid < 32) {
        shared[tid].m00 += shared[tid + 32].m00;
        shared[tid].m01 += shared[tid + 32].m01;
        shared[tid].m02 += shared[tid + 32].m02;
        shared[tid].m03 += shared[tid + 32].m03;
        shared[tid].m11 += shared[tid + 32].m11;
        shared[tid].m12 += shared[tid + 32].m12;
        shared[tid].m13 += shared[tid + 32].m13;
        shared[tid].m22 += shared[tid + 32].m22;
        shared[tid].m23 += shared[tid + 32].m23;
        shared[tid].m33 += shared[tid + 32].m33;
    }
    __syncthreads();
    if (tid < 16) {
        shared[tid].m00 += shared[tid + 16].m00;
        shared[tid].m01 += shared[tid + 16].m01;
        shared[tid].m02 += shared[tid + 16].m02;
        shared[tid].m03 += shared[tid + 16].m03;
        shared[tid].m11 += shared[tid + 16].m11;
        shared[tid].m12 += shared[tid + 16].m12;
        shared[tid].m13 += shared[tid + 16].m13;
        shared[tid].m22 += shared[tid + 16].m22;
        shared[tid].m23 += shared[tid + 16].m23;
        shared[tid].m33 += shared[tid + 16].m33;
        // warpReduce(shared, tid);
    }
    __syncthreads();
    if (tid < 8) {
        shared[tid].m00 += shared[tid + 8].m00;
        shared[tid].m01 += shared[tid + 8].m01;
        shared[tid].m02 += shared[tid + 8].m02;
        shared[tid].m03 += shared[tid + 8].m03;
        shared[tid].m11 += shared[tid + 8].m11;
        shared[tid].m12 += shared[tid + 8].m12;
        shared[tid].m13 += shared[tid + 8].m13;
        shared[tid].m22 += shared[tid + 8].m22;
        shared[tid].m23 += shared[tid + 8].m23;
        shared[tid].m33 += shared[tid + 8].m33;
    }
    __syncthreads();
    if (tid < 4) {
        shared[tid].m00 += shared[tid + 4].m00;
        shared[tid].m01 += shared[tid + 4].m01;
        shared[tid].m02 += shared[tid + 4].m02;
        shared[tid].m03 += shared[tid + 4].m03;
        shared[tid].m11 += shared[tid + 4].m11;
        shared[tid].m12 += shared[tid + 4].m12;
        shared[tid].m13 += shared[tid + 4].m13;
        shared[tid].m22 += shared[tid + 4].m22;
        shared[tid].m23 += shared[tid + 4].m23;
        shared[tid].m33 += shared[tid + 4].m33;
    }
    __syncthreads();
    if (tid < 2) {
        shared[tid].m00 += shared[tid + 2].m00;
        shared[tid].m01 += shared[tid + 2].m01;
        shared[tid].m02 += shared[tid + 2].m02;
        shared[tid].m03 += shared[tid + 2].m03;
        shared[tid].m11 += shared[tid + 2].m11;
        shared[tid].m12 += shared[tid + 2].m12;
        shared[tid].m13 += shared[tid + 2].m13;
        shared[tid].m22 += shared[tid + 2].m22;
        shared[tid].m23 += shared[tid + 2].m23;
        shared[tid].m33 += shared[tid + 2].m33;
    }
    __syncthreads();
    if (tid < 1) {
        shared[tid].m00 += shared[tid + 1].m00;
        shared[tid].m01 += shared[tid + 1].m01;
        shared[tid].m02 += shared[tid + 1].m02;
        shared[tid].m03 += shared[tid + 1].m03;
        shared[tid].m11 += shared[tid + 1].m11;
        shared[tid].m12 += shared[tid + 1].m12;
        shared[tid].m13 += shared[tid + 1].m13;
        shared[tid].m22 += shared[tid + 1].m22;
        shared[tid].m23 += shared[tid + 1].m23;
        shared[tid].m33 += shared[tid + 1].m33;
    }
    __syncthreads();
    if (tid == 0) {
        m_alphas[blockIdx.x].m00 += shared[0].m00;
        m_alphas[blockIdx.x].m01 += shared[0].m01;
        m_alphas[blockIdx.x].m02 += shared[0].m02;
        m_alphas[blockIdx.x].m03 += shared[0].m03;
        m_alphas[blockIdx.x].m11 += shared[0].m11;
        m_alphas[blockIdx.x].m12 += shared[0].m12;
        m_alphas[blockIdx.x].m13 += shared[0].m13;
        m_alphas[blockIdx.x].m22 += shared[0].m22;
        m_alphas[blockIdx.x].m23 += shared[0].m23;
        m_alphas[blockIdx.x].m33 += shared[0].m33;
    }
}
is called from
float * device;
int zero_frame_volume = 4 * sizeof(float) * num_coords;
// align to coords_per_block
int frames_volume = 4 * sizeof(float) * (num_coords / coords_per_block + 1)
                    * coords_per_block * (num_frames - 1);
cudaMalloc(&device, zero_frame_volume + frames_volume);
cudaMemset(device, 0, zero_frame_volume + frames_volume);
for (int i = 0; i < num_coords; i++) {
    cudaMemcpy(device + 4 * i, frames + 3 * i, 3 * sizeof(float),
               cudaMemcpyHostToDevice);
}
for (int frame_count = 1; frame_count < num_frames; frame_count++) {
    for (int coord_count = 0; coord_count < num_coords; coord_count++) {
        int coord_index = frame_count * num_coords + coord_count;
        CudaSafeCall(
            cudaMemcpy(device + 4 * coord_index,
                       frames + 3 * coord_index, 3 * sizeof(float),
                       cudaMemcpyHostToDevice));
    }
}
float4 * coords4 = reinterpret_cast<float4*>(device);
M_alpha * d_m_alphas;
cudaMalloc(&d_m_alphas, sizeof(M_alpha) * num_frames);
cudaMemset(d_m_alphas, 0, sizeof(M_alpha) * num_frames);
int M_alpha_volume = 1024;
int block_num = num_frames - 1;
int threads_per_block = 1024;
printf("Kernel launched with %i blocks.\n", block_num);
printf("Kernel launched with %i threads per block.\n", threads_per_block);
printf("Volume of M_alpha struct: %i.\n", (int) sizeof(M_alpha));
printf("Allocated %i bytes shared memory on multiprocessors.\n",
       (int) (M_alpha_volume * sizeof(M_alpha)));
int iterations = num_coords / threads_per_block + 1;
printf("Initial kernel iterations: %i.\n", iterations);
M_alpha * h_m_alphas = (M_alpha *) malloc(num_frames * sizeof(M_alpha));
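// Each launch processes one chunk of threads_per_block coordinates per frame;
// coord_offset (= i * threads_per_block) selects the chunk.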
for (int i = 0; i < iterations; i++) {
    calc_m_alpha_fill_shared<<<block_num, threads_per_block,
                               M_alpha_volume * sizeof(M_alpha)>>>(
        coords4, num_coords, i * threads_per_block, d_m_alphas);
    cudaDeviceSynchronize();
}
cudaMemcpy(h_m_alphas, d_m_alphas, num_frames * sizeof(M_alpha),
           cudaMemcpyDeviceToHost);
printf("%f\n", h_m_alphas[0].m00);
printf("%f\n", h_m_alphas[0].m01);
printf("%f\n", h_m_alphas[0].m02);
printf("%f\n", h_m_alphas[0].m03);
cudaDeviceSynchronize();
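CudaSafeCall is not shown above; it is just the usual error-checking wrapper around CUDA runtime calls. A minimal sketch of what I mean (my actual macro may differ in detail):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with a message if a CUDA runtime call does not return cudaSuccess.
#define CudaSafeCall(call)                                            \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                    cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)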
M_alpha itself is a simple struct:
#pragma pack(1)
struct M_alpha {
    float m00;
    float m01;
    float m02;
    float m03;
    float m11;
    float m12;
    float m13;
    float m22;
    float m23;
    float m33;
    char padding;
};
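Written out, the quantity each thread assembles from a coordinate pair (p, p0) is the following symmetric 4x4 matrix (only the upper triangle is stored in M_alpha), exactly as computed in the kernel above:

d = p_x^2 + p_y^2 + p_z^2 + p_{0x}^2 + p_{0y}^2 + p_{0z}^2

M = \begin{pmatrix}
d - 2(p_x p_{0x} + p_y p_{0y} + p_z p_{0z}) & 2(p_y p_{0z} - p_z p_{0y}) & 2(p_z p_{0x} - p_x p_{0z}) & 2(p_x p_{0y} - p_y p_{0x}) \\
\cdot & d - 2 p_x p_{0x} + 2 p_y p_{0y} + 2 p_z p_{0z} & -2(p_x p_{0y} + p_y p_{0x}) & -2(p_x p_{0z} + p_z p_{0x}) \\
\cdot & \cdot & d + 2 p_x p_{0x} - 2 p_y p_{0y} + 2 p_z p_{0z} & -2(p_y p_{0z} + p_z p_{0y}) \\
\cdot & \cdot & \cdot & d + 2 p_x p_{0x} + 2 p_y p_{0y} - 2 p_z p_{0z}
\end{pmatrix}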
What this is supposed to do is compute the matrix M above for every pair of corresponding (x, y, z) coordinates (each frame paired with the zero frame) and reduce it, i.e. sum the matrices belonging to one frame. I know the parallelization is probably clumsy and the reduction can be improved, but my problem is a different one: different runs of the program give me different results. The output is always either the correct result or one of roughly five small variations of it.
I ran all the cuda-memcheck tools, in particular racecheck, which reports no hazards. What is interesting, though, is that when I run the program solely under memcheck (i.e. cuda-memcheck ./program), the result is always the correct one! Obviously memcheck does something that a normal program run does not; unfortunately I was unable to find the source code for memcheck.
I would appreciate help or hints.
All the best
Twerp