NaN returns from recreated nbody integration code.

Background I’m working on recreating a simpler version of the Nbody example code provided with CUDA for my own use and learning. I’ve simplified it quite a lot; removing the variable data types, ignoring softening parameters for now (collisions are extremely rare for my application), and just working with the integration part of the code, from which i’m transferring data back to the cpu to be saved to file every time-step. I plan on doing that bit asynchronously in the future as described here: https://devblogs.nvidia.com/parallelforall/how-overlap-data-transfers-cuda-cc/ but for now the kernel calls and CPU parts are sequential.

The issue is that my recreated CUDA integration algorithm is outputting NaN values, and i’m trying to diagnose the cause. This is my first time using gpu computing, and I don’t know the tools typically used to probe my code as it runs - as things are device-side and to the best of my knowledge cannot be accessed mid-kernel call.

I’ve ran the code through cuda-memcheck which reports no errors, leading me to believe that at least i’m not addressing memory incorrectly. It’s possible I’ve misunderstood/misapplied the integration algorithm.

Here is my code,
“main.cu”

  • reads pre-generated data into cuda-allocated (cudamallochost) host memory
  • copies that into device memory
  • calls "leap_step" once for now, which is the kernel method
  • test on line 91/92 that the memory is properly transferred before the integration step (works)
  • “nBodyCuda.cu”

  • recreated nbody integration algorithm, using the tile calculation methods from the "GPU Gems 3" chapter 31, cross referenced with the nbody example provided with the CUDA package
  • “main.cu”

    #include "nBodyCuda.cu"
    
    
    int main(int argc, char *argv[]){
    
    	unsigned int numPoints = 16384;			
    
    			///*** CUDA Memory Parameters ***///
    
    	const int threadsPerBlock = 256;		// blockSize from NVDA_nbody
    	const int numTiles = (numPoints + threadsPerBlock -1) / threadsPerBlock;
    	const int sharedMemSize = threadsPerBlock * sizeof(float4);
    
    			///*** Host Memory Objects ***///
    
    	int steps = 1;
    	float dt = 0.00335175;					// time-steps (in variable time units)
    	size_t size = numPoints * sizeof(float4);
    	float4* host_points;					// array of float4 point data {m, x, y, z}
    	float4* host_velocities;				// initial velocities to be loaded into dev memory
    
    			///*** Device Memory Objects ***///
    	
    	float4* dev_points0;
    	float4* dev_points1;		//require two arrays of points for time-stepping.
    	float4* dev_velocities;		//examples use velocity float4s 
    
    			///*** File handling ***///
    
    	char filein[20] = {0};
    	char fileout[20] = {0};
    	char * temp = argv[1];
    	strcat(filein, temp);
    	strcat(filein, ".data");
    	strcat(fileout, temp);
    	strcat(fileout, "Run.data");
    
    	FILE *savefile;
    	ifstream datafile(filein, ifstream::in);
        savefile = fopen(fileout, "w");
    	if(datafile.is_open())
    		cout <<"succesfully opened initial and output files"<< endl;
    
    			///*** Load point data from file into _Pinned_ host memory ***///
    
    	std::string line;
    	int i = 0; 
    	checkCuda( cudaMallocHost((void**)&host_points, size) ); 	
    	checkCuda( cudaMallocHost((void**)&host_velocities, size)); 
    	while(getline(datafile, line)){
    		std::stringstream linestream(line);
    		if(i==0)
    			std::cout << "N = " << numPoints << endl;				 
    		else{	
    			linestream >> host_points[i-1].w 
    					   >> host_points[i-1].x 
    					   >> host_points[i-1].y
    					   >> host_points[i-1].z 						 //load mass,x,y,z data
    					   >> host_velocities[i-1].x
    					   >> host_velocities[i-1].y
    					   >> host_velocities[i-1].z;			 			 //load vx,vy,vz data
    
    			host_velocities[i-1].w = 0.0;
    		}
    	i++;
    	}
    		
    	checkCuda( cudaMalloc((void**)&dev_points0, size));
    	checkCuda( cudaMalloc((void**)&dev_points1, size));
    	checkCuda( cudaMalloc((void**)&dev_velocities, size));
    
    			///*** Load point data from _Pinned_ into (1 for now) Device memory ***///
    	
    	checkCuda( cudaMemcpy(dev_points0, 
    			      host_points, 
    			      size, 
    			      cudaMemcpyHostToDevice));					 // points0
    
    	checkCuda( cudaMemcpy(dev_points1,
    			      host_points,
    			      size,
    			      cudaMemcpyHostToDevice));					 // points1
    
    	checkCuda( cudaMemcpy(dev_velocities, 
    			      host_velocities,
    			      size,
    			      cudaMemcpyHostToDevice));					 // velocities
    
    			///*** Perform N-Body Integration on GPU, saving data on CPU ***///
    
    	checkCuda( cudaMemcpy(host_points, dev_points0, size, cudaMemcpyDeviceToHost));
    	cout << " first point (x) back and forth: " << host_points[0].x << endl;
    
    	SaveState(host_points, numPoints, savefile); //initial save
    	int nstep;
    	for (nstep = 0; nstep < steps; nstep++){
    
    		leap_step <<< numTiles, threadsPerBlock, sharedMemSize >>>
    							   (dev_points1, 
    							    dev_points0, 
    							    dev_velocities, 
    							    numPoints, 
    							    dt, 
    							    numTiles);
    
    		checkCuda( cudaMemcpy(host_points,
    				      dev_points0,
    				      size,
    				      cudaMemcpyDeviceToHost));
    
    		cout << "zero: " << host_points[0].x << endl;
    
    		checkCuda( cudaMemcpy(host_points,
    				      dev_points1,
    				      size,
    				      cudaMemcpyDeviceToHost));
    
    		cout << "zero: " << host_points[0].x << endl;
    
    		SaveState(host_points, numPoints, savefile); 
    
    		cout << "happened " << nstep << endl;
    		}
    
    			///*** Free Memory ***///
    	cudaFree(dev_points0);
    	cudaFree(dev_points1);
    	cudaFree(dev_velocities);
    	cudaFreeHost(host_points);
    	cudaFreeHost(host_velocities);
    }
    

    “nBodyCuda.cu”

    #include "nBodyCuda.h" 
    
    void SaveState(float4* bodies, int n, FILE* savefile){
    		for(int i = 0; i < n; i++)
    			fprintf(savefile,"%12.6f%s"
    					 "%12.6f%s%12.6f%s",
    					 bodies[i].x,"\t",
    					 bodies[i].y,"\t",
    					 bodies[i].z,"\n");
    }
    
    __device__ float3
    bodyBodyInteraction(float4 bi, float4 bj, float3 ai) {
    
    	float3 r;
    
    	r.x = bj.x - bi.x;
    	r.y = bj.y - bi.y;
    	r.z = bj.z - bi.z;
    
    	float distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
    	float invDist = rsqrt(distSqr);
    	float invDistCube = invDist * invDist * invDist;
    
    	float s = bj.w * invDistCube;
    
    	ai.x += r.x * s;
    	ai.y += r.y * s;
    	ai.z += r.z * s;
    
    	return ai;
    }
    
    __device__ float3
    tile_accel(float4 threadPos,
    		   float4 *PosMirror,
    		   int numTiles) {
    
    	extern __shared__ float4 sharedPos[];
    	float3 accel = {0.0f, 0.0f, 0.0f};
    
    	for (int tile = 0; tile < numTiles; tile++){
    		sharedPos[threadIdx.x] = PosMirror[tile * blockDim.x + threadIdx.x];
    		__syncthreads();
    
    		for ( int i = 0; i < blockDim.x; i++ ) {
    			accel = bodyBodyInteraction(threadPos, sharedPos[i], accel);
    		}
    		__syncthreads();
    	}
    	return accel;
    }
    
    __global__ void 
    leap_step( float4 *__restrict__ newPos,
    		   float4 *__restrict__ oldPos, 
    		   float4 *deviceVel, 
    		   unsigned int numBodies,
    		   float dt, int numTiles ) {
    
    	int index = blockIdx.x * blockDim.x + threadIdx.x;
    
    	if (index > numBodies) { return; };
    
    	// The body for which to calculate the accel, in parallel
    	float4 position = oldPos[index];
    
    	float3 accel = tile_accel(position, oldPos, numTiles);
    
    	//Verlet - Leapfrog Integration
    	float4 velocity = deviceVel[index];
    	velocity.x += accel.x * dt;
    	velocity.y += accel.y * dt;
    	velocity.z += accel.z * dt;
    
    	position.x += velocity.x * dt;
    	position.y += velocity.y * dt;
    	position.z += velocity.z * dt;
    
    	newPos[index] = position;
    	oldPos[index] = position;
    	deviceVel[index] = velocity;
    }
    

    header is just includes and the checkCuda wrapping method for allocation prescribed elsewhere:

    #pragma once 
    #include <fstream>
    #include <iostream>
    #include <sstream>
    #include <math.h>
    #include "vector_types.h"
    #include <cuda.h>
    #include <stdio.h>
    #include <device_functions.h>
    #include <assert.h>
    #include <cuda_runtime.h>
    #include <cuda_profiler_api.h>
    // probably some unecessary includes here
    
    inline 
    cudaError_t checkCuda(cudaError_t result){
    	if (result != cudaSuccess){
    		fprintf(stderr, "CUDA Runtime Error: %sn", cudaGetErrorString(result));
    		assert(result==cudaSuccess);
    	}
    	return result;
    }
    

    My recommendation: apply standard debugging techniques. E.g. use the CUDA debugger, or add some strategic printf() calls to log intermediate data.

    Consider how NaNs can come about. Either the code picks up uninitialized data that happens to correspond to a NaN encoding, or NaNs are created as the result of invalid operations, such as dividing zero by zero, subtracting infinity from infinity, taking the square root or logarithm of a negative number.

    Got it! I was calculating acceleration from a body on itself and, as you suggested, getting 0 and/or inf values as a result.

    Thank you for the suggestion of (the special) printf function, I had thought device thread data was a black box before :)