I cannot get the output array (h_out) to update in this simple piece of code. I’ve seen this problem reported elsewhere, but the solutions that worked for others have not worked for me. I’m an experienced programmer, and a little worried that I’m missing something crucial here.
compile with
nvcc -arch=sm_35 voxelization_kernel.cu
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
inline
cudaError_t checkCudaErrors(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
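///Note: compiled as above (no -DDEBUG), the #if strips the body of the check and
///every CUDA error is silently swallowed. A minimal sketch of an always-on variant:
inline
cudaError_t checkCudaErrorsAlways(cudaError_t result)
{
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    exit(EXIT_FAILURE);
  }
  return result;
}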
//mass of a zero-mean Gaussian (sigma = kernel_width/sqrt(2)) over the unit voxel [x-0.5, x+0.5]
__device__ float get_prob( float x, float kernel_width) {
return 0.5f * (erff((x + 0.5f) / kernel_width) - erff((x - 0.5f) / kernel_width));
}
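///e.g. with kernel_width = 2, the center voxel of an on-grid atom receives
///get_prob(0, 2) = erf(0.25) ~= 0.2763 of the mass along each axis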
__global__ void blurElementsToPlanes(const int n_elements, const int n_channels, const float kernel_width, const int radius, const int subvoxel, const int grid_size_x, const int grid_size_y, const int grid_size_z, float * in, float * element_planes){
///Thread blur kernel here
///For each element(atom) I of the input array
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int idy = threadIdx.y + blockIdx.y * blockDim.y;
/// guard the extra threads of a partially filled block; clamping them to the last
/// valid index instead would make duplicate threads double-count in the += below
if (idx >= n_elements || idy >= n_channels) return;
///flat indexing is x + y * WIDTH
///cubic indexing is x + WIDTH * (y + HEIGHT * z)
///get the coordinates of this element in XYZT and round to its centerpoint voxel
int vector_length = 4; ///should almost certainly be a configurable parameter, currently XYZT
auto this_element = vector_length*idx;
auto vx = (int)round(in[this_element]);
auto vy = (int)round(in[this_element+1]);
auto vz = (int)round(in[this_element+2]);
auto vt = (int)round(in[this_element+3]);
//printf("hello from thread/element: %d! my coords are %f %f %f %f and my voxel indices are %d %d %d %d\n", idx,in[this_element],in[this_element+1] ,in[this_element+2],in[this_element+3], vx, vy, vz, vt);
/// we pay for a branch here to process only real atoms (vt==0 marks padding), and only on the atom's own channel
if ((vt!=0) && (idy==vt)){
/// compute the voxelsphere around this point
///for now we sweep a cube covering the whole grid
///TODO: tighten these bounds to the sphere's bounding box with min/max
auto lower_bound_x = 0;
auto upper_bound_x = grid_size_x;
auto lower_bound_y = 0;
auto upper_bound_y = grid_size_y;
auto lower_bound_z = 0;
auto upper_bound_z = grid_size_z;
///printf("acceptable bounds: %d %d %d %d %d %d\n", lower_bound_x, upper_bound_x, lower_bound_y, upper_bound_y, lower_bound_z, upper_bound_z);
for (int lx=lower_bound_x; lx< upper_bound_x; lx++){
for (int ly=lower_bound_y; ly< upper_bound_y; ly++){
for (int lz=lower_bound_z; lz< upper_bound_z; lz++){
const auto dx = lx - vx;
const auto dy = ly - vy;
const auto dz = lz - vz;
if ((dx*dx + dy*dy + dz*dz) <= radius*radius){
/// if the subvoxel flag is tripped here, then we substitute vx, vy, vz with the actual values.
const auto plane_number = idx;
const auto channel = idy;
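///flat layout [element][channel][x][y][z] with element varying fastest:
///index = e + E*(ch + C*(x + X*(y + Y*z)))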
const auto plidx = plane_number + n_elements*(channel + n_channels*(lx + grid_size_x*(ly + grid_size_y*lz)));
//printf("setting plidx: %d (plane %d, channel %d, lx %d, ly %d, lz %d)\n", plidx, plane_number, channel, lx, ly, lz);
/// store each to the element planes at plane I , channel J, element X, Y, Z
element_planes[plidx] += get_prob(dx, kernel_width) * get_prob(dy, kernel_width) * get_prob(dz, kernel_width);
///printf("blurring to x: %d y: %d z: %d vt: %d pl_idx: %d on channel %d: %f\n", lx, ly, lz, vt, plidx, channel,element_planes[plidx]);
}
}
}
}
}
///padding atoms fall through without writing, leaving their planes zero
/// no __syncthreads() needed: each (element, channel) thread writes only its own plane
}
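///Hypothetical debugging aid, not part of the original flow: a trivial kernel plus a
///round trip. If the sentinel below also fails to come back, the launch itself is
///failing (e.g. a binary built for sm_35 on an incompatible card), not the
///voxelization logic. A minimal sketch:
__global__ void pokeFirstElement(float * p){
  p[0] = 42.f;
}
void sanityProbe(void){
  float h = 0.f;
  float * d;
  checkCudaErrors(cudaMalloc((void **) &d, sizeof(float)));
  pokeFirstElement<<<1, 1>>>(d);
  fprintf(stderr, "launch status: %s\n", cudaGetErrorString(cudaGetLastError()));
  checkCudaErrors(cudaMemcpy(&h, d, sizeof(float), cudaMemcpyDeviceToHost));
  fprintf(stderr, "sanity probe read back %f (expect 42.0)\n", h);
  checkCudaErrors(cudaFree(d));
}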
__global__ void combineElementPlanes( const int n_elements, const int n_channels, const int radius, const int grid_size_x, const int grid_size_y, const int grid_size_z, float * element_planes, float * out){
///recombine kernel here
/// For each element of element planes I, channel J
/// for each element X, Y, Z in the given plane I
/// sum to channel J, X, Y, Z in output cube
int idx = threadIdx.x + blockIdx.x * blockDim.x; //thread over plane I
int idy = threadIdx.y + blockIdx.y * blockDim.y; //thread over channel J
/// guard out-of-range threads rather than clamping them onto valid indices
if (idx >= n_elements || idy >= n_channels) return;
///printf("combine threads; index: %d\n", idy);
for (int x=0; x< grid_size_x; x++){
for (int y=0; y< grid_size_y; y++){
for (int z=0; z< grid_size_z; z++){
const auto plane_number = idx;
const auto channel = idy;
const auto plidx = plane_number + n_elements*(channel + n_channels*(x + grid_size_x*(y + grid_size_y*z)));
const auto outidx = channel + n_channels*(x + grid_size_x*(y + grid_size_y*z));
const float rhs = element_planes[plidx];
///threads with different plane_number write the same outidx concurrently, so the sum must be atomic
atomicAdd(&out[outidx], rhs);
if (rhs>0.f){
printf("summing plane %d, channel %d, x: %d y: %d z: %d plidx: %d into out[%d]: +%f\n", plane_number, channel, x, y, z, plidx, outidx, rhs);
}
}
}
}
return;
}
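///Hedged alternative to the atomics above (hypothetical, not in the original): one
///thread per output cell, looping over planes, so no two threads share a write.
///Uses the same [element][channel][x][y][z] layout, where plidx = e + n_elements*outidx:
__global__ void combineElementPlanesPerCell( const int n_elements, const int n_channels, const int grid_size_x, const int grid_size_y, const int grid_size_z, const float * element_planes, float * out){
  int cell = threadIdx.x + blockIdx.x * blockDim.x; ///flat (channel,x,y,z) index
  int n_cells = n_channels*grid_size_x*grid_size_y*grid_size_z;
  if (cell >= n_cells) return;
  float acc = 0.f;
  for (int e = 0; e < n_elements; e++){
    acc += element_planes[e + n_elements*cell];
  }
  out[cell] += acc;
}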
///base operation
void voxelizeTensorGPU( const int n_elements, const int n_channels, const float kernel_width, const int radius, const int subvoxel, const int grid_size_x, const int grid_size_y, const int grid_size_z, float * h_in, float * h_out){
int vector_length = 4; ///should almost certainly be a configurable parameter, currently XYZT
/// build and zero out a 5D element planes tensor of n_elements, n_channels, x,y,z
size_t planeSize = n_elements*n_channels*grid_size_x*grid_size_y*grid_size_z*sizeof(float);
float * d_element_planes;
checkCudaErrors(cudaMalloc((void **) &d_element_planes, planeSize)); ///planeSize is already in bytes
checkCudaErrors(cudaMemset(d_element_planes, 0, planeSize));
/// auto element_planes = at::zeros(in.type(), {n_elements*n_channels*grid_size_x*grid_size_y*grid_size_z}).to(at::kCuda);
///allocate input and output tensors
size_t inputSize = n_elements*vector_length*sizeof(float); ///n_atoms * XYZT
size_t outputSize = n_channels*grid_size_x*grid_size_y*grid_size_z*sizeof(float);
float * d_in, * d_out;
checkCudaErrors(cudaMallocManaged((void **) &d_in, inputSize)); ///sizes are already in bytes
checkCudaErrors(cudaMallocManaged((void **) &d_out, outputSize));
///transfer input and output tensors to device
checkCudaErrors(cudaMemcpy(d_in,h_in, inputSize, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemset(d_out, 0, outputSize));
/// launch blur kernel with as many dim3 threads as atoms,channels
int blockLen = 3; ///these should be job dependent configurable parameters...
int blockHeight = 3;
dim3 block1(blockLen, blockHeight);
dim3 grid1((n_elements+block1.x-1)/block1.x, (n_channels+block1.y-1)/block1.y);
blurElementsToPlanes<<<grid1, block1>>>( n_elements, n_channels, kernel_width, radius, subvoxel, grid_size_x, grid_size_y, grid_size_z,d_in,d_element_planes);
checkCudaErrors(cudaGetLastError()); ///surface launch failures (e.g. wrong -arch) immediately
checkCudaErrors(cudaDeviceSynchronize()); ///cudaThreadSynchronize() is deprecated
/// launch recombine kernel with one thread per (plane i, channel j) pair, summing planes into channel j
int c_blockLen = 3;///again, job dependent parameters - set to test configuration here
int c_blockHeight = 3;
dim3 block2(c_blockLen, c_blockHeight);
dim3 grid2((n_elements+block2.x-1)/block2.x, (n_channels+block2.y-1)/block2.y);
combineElementPlanes<<<grid2, block2>>>(n_elements, n_channels, radius, grid_size_x, grid_size_y, grid_size_z, d_element_planes, d_out);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaMemcpy(h_out, d_out, outputSize, cudaMemcpyDeviceToHost));
for (int channel=0; channel<n_channels; channel++){
for (int x=0; x< grid_size_x; x++){
for (int y=0; y< grid_size_y; y++){
for (int z=0; z< grid_size_z; z++){
int outidx = channel + n_channels*(x + grid_size_x*(y + grid_size_y*z));
printf("inside cuda output[ch:%d x:%d y:%d z:%d]: out[%d]:%f\n", channel, x, y, z,outidx,h_out[outidx]);
}
}
}
}
checkCudaErrors(cudaFree(d_in));
checkCudaErrors(cudaFree(d_out));
checkCudaErrors(cudaFree(d_element_planes));
}
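///Hypothetical helper, not in the original: the compile line above hardcodes
///-arch=sm_35, and a card older than sm_35 cannot run this binary at all, so every
///launch fails with "invalid device function" and h_out stays zero. A minimal
///sketch to rule that out:
void printDeviceArch(void){
  cudaDeviceProp prop;
  checkCudaErrors(cudaGetDeviceProperties(&prop, 0));
  printf("device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
}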
int main(void){
const int n_elements = 3;
const int n_channels = 3;
const int atom_vector_dim = 4;
const float kernel_width = 2.;
const int radius = 2;
const int subvoxel = 0;
const int grid_size_x = 3;
const int grid_size_y = 3;
const int grid_size_z = 3;
///host input, flattened XYZT per element: one real atom of type 1 at (1,1,1), padded by two zero atoms
float input[n_elements*atom_vector_dim] = { 0., 0., 0., 0.,
1., 1., 1., 1.,
0., 0., 0., 0. };
///build a zeroed 3x3x3 output matrix here
float output[n_channels*grid_size_x*grid_size_y*grid_size_z] = {0.};
/// store expected 3x3x3 output here
/// float expected_output[3][3][3] = something;
///flatten input and output matrix to the proper indexing
/// launch kernel
voxelizeTensorGPU( n_elements, n_channels, kernel_width, radius, subvoxel, grid_size_x, grid_size_y, grid_size_z,input, output);
for (int channel=0; channel<n_channels; channel++){
for (int x=0; x< grid_size_x; x++){
for (int y=0; y< grid_size_y; y++){
for (int z=0; z< grid_size_z; z++){
int outidx = channel + n_channels*(x + grid_size_x*(y + grid_size_y*z));
printf("output[ch:%d x:%d y:%d z:%d]: %f\n", channel, x, y, z,output[outidx]);
}
}
}
}
/// check error on output
return 0;
}