Simple 1-D arrays do not update.

I cannot get the output array (h_out) to update in this simple piece of code. I have seen this problem described elsewhere, but the solutions that worked there have not worked for me. I'm an experienced programmer, and a little worried that I'm missing something crucial here.

Compile with:

nvcc -arch=sm_35 voxelization_kernel.cu
#include <stdio.h>
#include <assert.h>

#include <cuda_runtime.h>

// NOTE: errors are only checked and reported when DEBUG or _DEBUG is defined at compile time
inline
cudaError_t checkCudaErrors(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
#endif
  return result;
}

//integral of the normal distribution over the unit interval [x-0.5, x+0.5]; kernel_width plays the role of sigma*sqrt(2)
__device__ float get_prob( float x,  float kernel_width) {
  return 0.5 * (erf((x + 0.5) / kernel_width) - erf((x - 0.5) / kernel_width));
}

__global__ void blurElementsToPlanes(const int n_elements, const int n_channels, const float kernel_width, const int radius, const int subvoxel, const int grid_size_x, const int grid_size_y, const int grid_size_z, float * in, float * element_planes){

    /// Thread blur kernel here
    /// For each element (atom) I of the input array
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int idy = threadIdx.y + blockIdx.y * blockDim.y;

    /// we constrain the x index to be less than n_elements (like an if, but without branching behavior)
    idx = max(idx, 0);
    idx = min(idx, n_elements - 1);

    idy = max(idy, 0);
    idy = min(idy, n_channels - 1);

    /// flat indexing is x + y * WIDTH
    /// cubic indexing is x + WIDTH * (y + DEPTH * z)

    /// get the coordinates of this element in XYZT and calculate the centerpoint voxel
    int vector_length = 4; /// should almost certainly be a configurable parameter, currently XYZT
    auto this_element = vector_length * idx;
    auto vx = (int)round(in[this_element]);
    auto vy = (int)round(in[this_element + 1]);
    auto vz = (int)round(in[this_element + 2]);
    auto vt = (int)round(in[this_element + 3]);

    //printf("hello from thread/element: %d! my coords are %f %f %f %f and my voxel indices are %d %d %d %d\n", idx, in[this_element], in[this_element+1], in[this_element+2], in[this_element+3], vx, vy, vz, vt);

    /// we have to pay for an if here, unfortunately, to catch only nonzero atoms (padding atoms return early)
    if ((vt != 0) && (idy == vt)){
        /// compute the voxel sphere around this point;
        /// we first must compute the bounds of the cubic operation
        /// (min/max clamps against the sphere radius probably belong here, still to add)
        auto lower_bound_x = 0;
        auto upper_bound_x = grid_size_x;
        auto lower_bound_y = 0;
        auto upper_bound_y = grid_size_y;
        auto lower_bound_z = 0;
        auto upper_bound_z = grid_size_z;

        ///printf("acceptable bounds: %d %d %d %d %d %d\n", lower_bound_x, upper_bound_x, lower_bound_y, upper_bound_y, lower_bound_z, upper_bound_z);

        for (int lx = lower_bound_x; lx < upper_bound_x; lx++){
            for (int ly = lower_bound_y; ly < upper_bound_y; ly++){
                for (int lz = lower_bound_z; lz < upper_bound_z; lz++){

                    const auto dx = lx - vx;
                    const auto dy = ly - vy;
                    const auto dz = lz - vz;

                    if ((pow(dx, 2) + pow(dy, 2) + pow(dz, 2)) <= pow(radius, 2)){
                        /// if the subvoxel flag is set, we would substitute vx, vy, vz with the actual (fractional) values here

                        const auto plane_number = idx;
                        const auto channel = idy;
                        const auto plidx = plane_number + (n_channels * (channel + grid_size_x * (lx + grid_size_y * (ly + grid_size_z * lz))));

                        //printf("setting plindex: pl: %d, plane_number: %d n_channels: %d channel: %d gsx: %d lx: %d gsy: %d, ly: %d gsz: %d lz: %d\n", plidx, plane_number, n_channels, channel, grid_size_x, lx, grid_size_y, ly, grid_size_z, lz);

                        /// store each contribution to the element planes at plane I, channel J, voxel X, Y, Z
                        element_planes[plidx] += get_prob(dx, kernel_width) * get_prob(dy, kernel_width) * get_prob(dz, kernel_width);

                        ///printf("blurring to x: %d y: %d z: %d vt: %d pl_idx: %d on channel %d: %f\n", lx, ly, lz, vt, plidx, channel, element_planes[plidx]);
                    }
                }
            }
        }

        return;
    }
    else{
        return; /// we don't execute the blur on padding atoms, just leave the plane alone
    }

    /// We don't __syncthreads() because the element planes are separate
}
                
		

__global__ void combineElementPlanes( const int n_elements, const int n_channels, const int radius, const int grid_size_x, const int grid_size_y, const int grid_size_z, float * element_planes, float * out){

    ///recombine kernel here
    /// For each element of element planes I, channel J
    /// for each element X, Y, Z in the given plane I
    /// sum to channel J, X, Y, Z in output cube

    int idx = threadIdx.x + blockIdx.x * blockDim.x; //thread over plane I
    int idy = threadIdx.y + blockIdx.y * blockDim.y; //thread over channel J

    idx = max(idx, 0);
    idx = min(idx, n_elements - 1);

    idy = max(idy, 0);
    idy = min(idy, n_channels - 1);

    ///printf("combine threads; index: %d\n", idy);

    for (int x = 0; x < grid_size_x; x++){
        for (int y = 0; y < grid_size_y; y++){
            for (int z = 0; z < grid_size_z; z++){

                const auto plane_number = idx;
                const auto channel = idy;
                const auto plidx = plane_number + n_channels * (channel + grid_size_x * (x + grid_size_y * (y + grid_size_z * z)));
                const auto outidx = channel + grid_size_x * (x + grid_size_y * (y + grid_size_z * z));

                float rhs = element_planes[plidx];
                out[outidx] += rhs;

                if (rhs > 0.){
                    //printf("pch: %d gsx: %d x: %d gsy: %d y: %d gsz: %d z: %d plidx: %d\n", channel, grid_size_x, x, grid_size_y, y, grid_size_z, z, plidx);
                    //printf("ch: %d gsx: %d x: %d gsy: %d y: %d gsz: %d z: %d multiplies to %d\n", channel, grid_size_x, x, grid_size_y, y, grid_size_z, z, channel + grid_size_x * (x + grid_size_y * (y + grid_size_z * z)));
                    printf("summing plane %d, channel %d, x: %d y: %d z: %d plidx: %d out[%d]: %f-->%f\n", plane_number, idy, x, y, z, plidx, outidx, rhs, out[outidx]);
                }
            }
        }
    }
    return;
}

///base operation
void voxelizeTensorGPU( const int n_elements, const int n_channels, const float kernel_width, const int radius, const int subvoxel, const int grid_size_x, const int grid_size_y, const int grid_size_z, float * h_in, float * h_out){

    int vector_length = 4; /// should almost certainly be a configurable parameter, currently XYZT

    /// build and zero out a 5D element-planes tensor of n_elements, n_channels, x, y, z
    size_t planeSize = n_elements*n_channels*grid_size_x*grid_size_y*grid_size_z*sizeof(float);
    float * d_element_planes;
    checkCudaErrors(cudaMalloc((void **) &d_element_planes, planeSize)); /// planeSize is already a byte count
    checkCudaErrors(cudaMemset(d_element_planes, 0, planeSize));

    /// auto element_planes = at::zeros(in.type(), {n_elements*n_channels*grid_size_x*grid_size_y*grid_size_z}).to(at::kCuda);

    /// allocate input and output tensors
    size_t inputSize = n_elements*vector_length*sizeof(float); /// n_atoms * XYZT
    size_t outputSize = n_channels*grid_size_x*grid_size_y*grid_size_z*sizeof(float);
    float * d_in, * d_out;
    checkCudaErrors(cudaMallocManaged((void **) &d_in, inputSize));
    checkCudaErrors(cudaMallocManaged((void **) &d_out, outputSize));

    /// transfer the input tensor to the device and zero the output tensor
    checkCudaErrors(cudaMemcpy(d_in, h_in, inputSize, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemset(d_out, 0, outputSize));

    /// launch blur kernel with as many dim3 threads as atoms, channels
    int blockLen = 3; /// these should be job-dependent configurable parameters...
    int blockHeight = 3;

    dim3 block1(blockLen, blockHeight);
    dim3 grid1((n_elements+block1.x-1)/block1.x, (n_channels+block1.y-1)/block1.y);

    blurElementsToPlanes<<<grid1, block1>>>(n_elements, n_channels, kernel_width, radius, subvoxel, grid_size_x, grid_size_y, grid_size_z, d_in, d_element_planes);

    /// launch recombine kernel with as many dim3 threads as gridx, gridy, gridz, sum over i, j into channel j
    checkCudaErrors( cudaDeviceSynchronize() ); /// cudaThreadSynchronize() is deprecated

    int c_blockLen = 3; /// again, job-dependent parameters - set to the test configuration here
    int c_blockHeight = 3;

    dim3 block2(c_blockLen, c_blockHeight);
    dim3 grid2((n_elements+block2.x-1)/block2.x, (n_channels+block2.y-1)/block2.y);
    combineElementPlanes<<<grid2, block2>>>(n_elements, n_channels, radius, grid_size_x, grid_size_y, grid_size_z, d_element_planes, d_out);

    checkCudaErrors( cudaDeviceSynchronize() );

    checkCudaErrors(cudaMemcpy(h_out, d_out, outputSize, cudaMemcpyDeviceToHost));

    for (int channel=0; channel<n_channels; channel++){
        for (int x=0; x< grid_size_x; x++){
            for (int y=0; y< grid_size_y; y++){
                for (int z=0; z< grid_size_z; z++){
                    int outidx = channel+grid_size_x*(x + grid_size_y * (y + grid_size_z * z));
                    printf("inside cuda output[ch:%d x:%d y:%d z:%d]: out[%d]:%f\n", channel, x, y, z, outidx, h_out[outidx]);
                }
            }
        }
    }

    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));
    checkCudaErrors(cudaFree(d_element_planes));
}

int main(void){
	

	const int n_elements = 3;
	const int n_channels = 3;
	const int atom_vector_dim = 4;
	const float kernel_width = 2.;
	const int radius = 2;
	const int subvoxel = 0;
	const int grid_size_x = 3;
        const int grid_size_y = 3;
        const int grid_size_z = 3;

    /// build an example host input: 3 elements (atoms) x XYZT, with a single nonzero atom, for 3 channels
    float input[n_elements*atom_vector_dim] = { 0., 0., 0., 0.,   1., 1., 1., 1.,   0., 0., 0., 0. };

	///build a zeroed 3x3x3 output matrix here

	float output[n_channels*grid_size_x*grid_size_y*grid_size_z] = {0.};
							     

	/// store expected 3x3x3 output here
	
	/// float expected_output[3][3][3] = something; 

	///flatten input and output matrix to the proper indexing

	/// launch kernel
	voxelizeTensorGPU( n_elements, n_channels, kernel_width, radius, subvoxel, grid_size_x, grid_size_y, grid_size_z,input, output);

    for (int channel=0; channel<n_channels; channel++){
        for (int x=0; x< grid_size_x; x++){
            for (int y=0; y< grid_size_y; y++){
                for (int z=0; z< grid_size_z; z++){
                    int outidx = channel+grid_size_x*(x + grid_size_y * (y + grid_size_z * z));
                    printf("output[ch:%d x:%d y:%d z:%d]: %f\n", channel, x, y, z, output[outidx]);
                }
            }
        }
    }
	/// check error on output
	return 0;

}

I should also note that setting line 154 (the accumulation statement in combineElementPlanes) to be

out[outidx] = 1.;

results in the correct output. Why this is so baffles me completely.

Partial solution: if I change lines 154 through 165 (the accumulation and the debug printout block) to be

//              out[outidx] = rhs + out[outidx];
                if (rhs > 0.){
                    //printf("before adding out[%d]: %f\n", outidx, out[outidx]);
                    out[outidx] += rhs;
                    //printf("after adding out[%d]: %f\n", outidx, out[outidx]);

                    //printf("pch: %d gsx: %d x: %d gsy: %d y: %d gsz: %d z: %d plidx: %d\n", channel, grid_size_x, x, grid_size_y, y, grid_size_z, z, plidx);
                    //printf("ch: %d gsx: %d x: %d gsy: %d y: %d gsz: %d z: %d multiplies to %d\n", channel, grid_size_x, x, grid_size_y, y, grid_size_z, z, channel + grid_size_x * (x + grid_size_y * (y + grid_size_z * z)));
                    ///printf("summing plane %d, channel %d, x: %d y: %d z: %d plidx: %d out[%d]: %f-->%f\n", plane_number, idy, x, y, z, plidx, outidx, rhs, out[outidx]);
                }

This works completely correctly. However, the first implementation should also work - we should just be adding nonzero values into zeroed entries. It doesn't explain why I was getting all zeroes before. I would greatly appreciate someone of greater wisdom stepping in here.

Have you run your code under cuda-memcheck?
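For reference, the basic invocation just wraps your binary (a.out here, since the nvcc command above doesn't name an output file):

cuda-memcheck ./a.out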

You have multiple threads writing to the same location at approximately the same time. CUDA does not automatically sort out these race conditions for you.

You can prove this to yourself by replacing (line 154):

out[outidx] += rhs;

in your first posting with:

atomicAdd(out+outidx, rhs);

whereupon you will see non-zero results in the output, approximately duplicating the result you see when you move the add operation inside the if-statement. The possibility for this hazard is evident simply by studying the construction of outidx in your triple-nested for loops.
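As a sketch of that change in context (using the variable names from your combineElementPlanes, with only the accumulation statement replaced):

                const auto plidx = plane_number + n_channels * (channel + grid_size_x * (x + grid_size_y * (y + grid_size_z * z)));
                const auto outidx = channel + grid_size_x * (x + grid_size_y * (y + grid_size_z * z));

                float rhs = element_planes[plidx];
                /// atomicAdd serializes concurrent updates to out[outidx], so contributions
                /// from different planes (threads) are not lost to the read-modify-write race
                atomicAdd(out + outidx, rhs);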

When you move the add operation inside the if statement, you are drastically reducing the number of thread updates, and therefore the race conditions do not occur or do not occur as often.
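If it helps to see the hazard in isolation, here is a minimal standalone sketch (my own toy example, not code from the posting above, compiled the same way as the program in the question): many threads update a single float with a plain += and then with atomicAdd; only the atomic version reliably produces the expected count.

#include <stdio.h>
#include <cuda_runtime.h>

__global__ void racyAdd(float *sum)         { sum[0] += 1.0f; }       // unsynchronized read-modify-write: updates get lost
__global__ void atomicAddKernel(float *sum) { atomicAdd(sum, 1.0f); } // serialized update: every increment lands

int main(void){
    const int blocks = 1024, threads = 256;  // 262144 increments, exactly representable in a float
    float *d_sum, h_sum;
    cudaMalloc(&d_sum, sizeof(float));

    cudaMemset(d_sum, 0, sizeof(float));
    racyAdd<<<blocks, threads>>>(d_sum);
    cudaMemcpy(&h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost);
    printf("plain +=  : %f (expected %d)\n", h_sum, blocks*threads);

    cudaMemset(d_sum, 0, sizeof(float));
    atomicAddKernel<<<blocks, threads>>>(d_sum);
    cudaMemcpy(&h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost);
    printf("atomicAdd : %f (expected %d)\n", h_sum, blocks*threads);

    cudaFree(d_sum);
    return 0;
}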