Hello , i wrote a heat transfer program based on the example in "cuda by an example " book.
I can’t figure why some times I am getting nan results ,sometimes not and sometimes too big numbers (10^16 values).
In the beginning the program creates the random matrix which will be the initial grid.
Then , the first kernel just copies the initial grid , in order for the second kernel to update the grid.
Then , using the cublas Swap function , we take the output of the updated grid and supply it as an input to the first kernel and so on…
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>
#include <cublas_v2.h>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <iomanip>
using namespace std;
const int NbCells = 16;
const int threads = 1;
const int blocks = 1;//NbCells / threads;
const int speed = 0.25;
__device__ float generate( curandState * globalState, int ind )
{
ind = threadIdx.x + blockIdx.x * blockDim.x;
curandState localState = globalState[ ind ];
float RANDOM = curand_uniform( &localState );
globalState[ ind ] = localState;
return RANDOM;
}
__global__ void setup_kernel( curandState * state, unsigned long seed )
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
curand_init( seed, id, 0, &state[ id ] );
return;
}
__global__ void kernel( float * grid, curandState * globalState, int gridsize )
{
// generate random numbers
for ( int i = 0; i < gridsize; i++ )
{
float k = generate( globalState, i );
grid[ i ] = k;
}
return;
}
//copy temperatures from inTemps to constTemps
__global__ void copyTemp( float * inTemps , float * constTemps){
int RowIdx = threadIdx.y + (blockIdx.y * blockDim.y);
int ColIdx = threadIdx.x + (blockIdx.x * blockDim.x);
int offset = ( RowIdx * blockDim.x * gridDim.x ) + ColIdx;
constTemps[offset] = inTemps[offset];
return;
}
__global__ void updateTemp(float * inTemps , float * outTemps){
int RowIdx = threadIdx.y + (blockIdx.y * blockDim.y);
int ColIdx = threadIdx.x + (blockIdx.x * blockDim.x);
int offset = ( RowIdx * blockDim.x * gridDim.x ) + ColIdx;
int Rows = blockDim.y * gridDim.y;
int Cols = blockDim.x * gridDim.x;
int left = offset - 1;
int right = offset + 1;
int top = offset + Cols;
int bottom = offset - Cols;
//boundary conditions
if ( ColIdx == 0 ) left += Cols;
if ( ColIdx == ( Cols - 1 ) ) right -= Cols;
if ( RowIdx == ( Rows - 1 ) ) top -= Rows * Cols;
if ( RowIdx == 0 ) bottom += Rows * Cols;
outTemps[offset] = inTemps[offset] + speed * ( inTemps[top] + inTemps[bottom] + inTemps + inTemps - 4 * inTemps[offset] );
return;
}
int main(){
int N = 100; // random numbers
// allocate host memory
float * grid;
grid = (float*) malloc (NbCells * sizeof(float));
assert( NULL != grid );
float * newgrid;
newgrid = (float*) malloc (NbCells * sizeof(float));
assert( NULL != newgrid );
//allocate device memory
float *inTemps;
cudaMalloc( (void **) &inTemps, NbCells * sizeof(float) );
float * outTemps , * constTemps;
cudaMalloc( (void **) &outTemps, NbCells * sizeof(float) );
cudaMalloc( (void **) &constTemps, NbCells * sizeof(float) );
/* Generating random numbers for filling grids (inTemps and constTemps) */
curandState * devStates;
cudaMalloc( &devStates, N * sizeof(curandState) );
// setup seeds
setup_kernel<<<1,N>>>( devStates, unsigned( time(NULL)) );
cudaPeekAtLastError();
cudaDeviceSynchronize();
// generate random numbers for filling inTemps
kernel<<<1,1>>>( inTemps, devStates, NbCells );
cudaPeekAtLastError();
cudaDeviceSynchronize();
//------------------------------------------------------------------------------------------------
/* Get handle to the CUBLAS context */
cublasHandle_t cublasHandle = 0;
cublasStatus_t cublasStatus;
cublasStatus = cublasCreate(&cublasHandle);
// define threads and blocks
dim3 dimGrid( blocks , blocks );
dim3 dimBlock( threads , threads );
cudaMemcpy( grid, inTemps, NbCells * sizeof(float), cudaMemcpyDeviceToHost );
cout << "\nTemperatures before : "<<endl;
for (int i = 0; i < NbCells; i++) {
if (i % 4 == 0 ) cout <<endl;
cout << setprecision(5) << showpoint << setw(17) << grid[i] << "\t";
}
cout <<endl;
for (int i = 0; i < 2; i++ ) {
copyTemp<<<dimGrid,dimBlock>>>(inTemps , constTemps);
cudaPeekAtLastError();
cudaDeviceSynchronize();
updateTemp<<<dimGrid,dimBlock>>>(constTemps , outTemps);
cudaPeekAtLastError();
cudaDeviceSynchronize();
//swap input and output to supply first kernel
cublasSswap(cublasHandle , NbCells , outTemps , 1 , inTemps , 1 );
}
cudaMemcpy( newgrid, outTemps, NbCells * sizeof(float), cudaMemcpyDeviceToHost );
cout << "\nTemperatures after : "<<endl;
for (int i = 0; i < NbCells; i++){
if (i % 4 == 0 ) cout <<endl;
cout << setprecision(5) << showpoint << setw(17) << newgrid[i] << "\t";
}
cout <<endl;
//clean host memory
free ( grid );
free ( newgrid );
// clean device memory
cudaFree( inTemps );
cudaFree( outTemps );
cudaFree( constTemps );
return 0;
}