Complete freeze using CUDA

Hello Community.

I am writing because I am facing a strange problem with my code. I have to simulate a process, and
I managed to write it in CUDA. The threads do not need to talk to each other:
each one runs independently, and at the end I collect the results.

The problem is the following. With a “time step” of a certain value, which means “N” cycles in the kernel,
the program returns the correct results.
With a smaller time step, which means “M > N” cycles, the entire computer freezes completely and I have to
reboot it manually. The difference in time step is really small, and nothing changes even after waiting a very
long time (the run time is linear in the time step, so a time step five times smaller should only take roughly
five times longer; a “long waiting time” should be excluded).

I am using a GTS 220 with Ubuntu 10.04; drivers and tools were updated as of January 2012.

Any idea about what the problem could be? Any suggestions? Could it be a hardware problem of the GTS?
Are there any known bugs with this card?
I could also post my code if needed.

Thanks in advance.

Maybe for M > N, you enter into an infinite loop? Please post code, so that we could take a look.
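If it helps while you prepare the code: as a quick test you could put an iteration cap in the device loop, so a runaway case flags itself instead of hanging the machine. This is only a minimal sketch with hypothetical names (guarded_loop, status), not your actual code:

#include <cstdio>
#include <cuda.h>

// Hypothetical kernel, for illustration only: bound the device-side loop so a runaway
// case reports itself to the host instead of spinning forever.
__global__ void guarded_loop(int *status, float timestep, float t_end)
{
    const long long maxIter = 200000000LL;   // well above the expected number of cycles
    long long iter = 0;
    float t = 0.0f;
    while (t < t_end) {                      // stand-in for the real termination condition
        t += timestep;                       // stand-in for one simulation step
        if (++iter > maxIter) {              // the loop ran far longer than it ever should
            status[blockIdx.x] = -1;         // flag the runaway for the host
            return;
        }
    }
    status[blockIdx.x] = 0;                  // normal exit
}

int main()
{
    int *devStatus, hostStatus[4];
    cudaMalloc((void **)&devStatus, 4*sizeof(int));
    guarded_loop<<<4, 1>>>(devStatus, 1e-6f, 1.0f);
    cudaMemcpy(hostStatus, devStatus, 4*sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 4; ++i)
        printf("block %d: %s\n", i, hostStatus[i] == 0 ? "finished" : "hit iteration cap");
    cudaFree(devStatus);
    return 0;
}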

Thank you for the fast reply.

Here is the code; I don’t have a programming background, so it may look ugly.

Well… without any explanation the code could be difficult to understand.

The polynomial that I solve is monotonic in time, so its solution increases at each timestep.
“prob_of_t” also increases, depending on the solution of the aforementioned polynomial, so
the condition that ends the loop should eventually be satisfied.

On my computer it works with timestep=(0.5e-12)/vel, but it fails with timestep=(0.1e-12)/vel.

#include <cuda.h>
#include <stdlib.h>
#include <ctime>
#include <iostream>
#include <fstream>
#include <cmath>
#include <curand_kernel.h>

using namespace std;

__global__ void setup_kernel(curandState *state, int nummod)
{
    const int id = blockIdx.x*nummod;
    curand_init(id, id, 0, &state[id]);
}

// our function is a cubic equation U^3 + bU^2 + cU + d = f(x)
__device__ float f(float b, float c, float d, float x) {
    return d + x*(c + x*(b + x));
}

// first derivative of the function: 3U^2 + 2bU + c
__device__ float fprime(float b, float c, float x) {
    return c + x*(2*b + 3*x);
}

// Newton-Raphson solver: finds a root of the cubic starting from x0
__device__ bool nr_pol3(float b, float c, float d, float x0, float epsilon, float& x) {
    const int maxIteration = 500;
    x = x0;
    float fx = f(b,c,d,x);
    int numIterations = 0;
    while (fabs(fx) > epsilon) {
        float fpx = fprime(b,c,x);
        ++numIterations;
        if (numIterations > maxIteration || fabs(fpx) < epsilon) {
            return false;
        }
        x -= fx / fpx;
        fx = f(b,c,d,x);
    }
    //cout << "Number of iterations: " << numIterations << endl;
    return true;
}

// here we simply calculate the coefficients b, c, d of the equation U^3 + bU^2 + cU + d = 0
// from the physical parameters
__device__ void coeff(float v, float time, float temp, float kc, float cl, float pl, float *sol)
{
    float kbol, s, vt, SL, den, svt, b, c, d;
    kbol = 1.3806e-23;
    s   = kc*pl/(kbol*temp);
    vt  = v*time;
    SL  = s*cl;
    den = -4*SL - 4;
    svt = s*vt;
    b = (9.0 + 4.0*svt + 8.0*SL)/den;
    c = (-6.0 - 8.0*svt - 4.0*SL)/den;
    d = 4.0*svt/den;
    sol[0] = b;
    sol[1] = c;
    sol[2] = d;
}

__device__ float getFt(float a, float v, float time, float cl, float kc)
{
    return (v*time - a*cl)*kc;
}

__device__ float prob_of_t(float koff, float deltax, float temp, float timestep, float mod, float F)
{
    float kbol = 1.3806e-23;
    return mod*timestep*koff*exp(F*deltax/(kbol*temp));
}

__global__ void sim_unf(float vel, float kel, float modclength, float plength, float deltax, float koff, float temp, const int nummod, float *dev_a, curandState *state)
{
    const int Idx = blockIdx.x*nummod;

    // here we assign the initial contour length, given by the length of the FOLDED molecule
    float firstclength, clength, timestep, sim_time, phys_sol_U, abc[3]; // unfolding_max_forces[nummod];
    firstclength = 0.5*(modclength);
    clength = firstclength;
    timestep = (0.5e-11)/vel;
    sim_time = 0.0;
    phys_sol_U = 0.0;
    curandState localState = state[Idx];
    float last_ft;
    int numbmod = nummod;

    while (numbmod > 0)
    {
        // find the coefficients b, c, d of the equation U^3 + bU^2 + cU + d = 0, which are functions of the parameters
        coeff(vel, sim_time, temp, kel, clength, plength, abc);
        // find the solution with the Newton solver
        nr_pol3(abc[0], abc[1], abc[2], phys_sol_U, 1e-6, phys_sol_U);

        last_ft = getFt(phys_sol_U, vel, sim_time, clength, kel);
        if (curand_uniform(&localState) <= prob_of_t(koff, deltax, temp, timestep, numbmod, last_ft))
        {
            clength += modclength;
            numbmod -= 1;
            dev_a[Idx+numbmod] = last_ft;
        }
        sim_time += timestep;
    }

    // no idea why people do that...
    state[Idx] = localState;
}

	

int main(){
    clock_t t1, t2;
    t1 = clock();

    const int num_modules = 8, num_curves = 512;
    const int numtotal = num_modules*num_curves;
    float *unf_max_forces = (float*)malloc(numtotal*sizeof(float));
    float *dev_a;

    float vel = 0.3e-6;        // it is in m/s
    float kel = 0.05;          // it is in N/m
    float modclength = 28e-9;  // it is in m
    float plength = 0.3e-9;    // it is in m
    float temp = 300.0;        // it is in K
    float dxinit = 1.5e-10;    // it is in m
    float koff = 0.37;         // it is in s^-1

    curandState *devStates;
    cudaMalloc((void **)&devStates, num_curves*sizeof(curandState));
    cudaMalloc((void **)&dev_a, numtotal*sizeof(float));

    /* Setup prng states */
    setup_kernel<<<num_curves, 1>>>(devStates, num_modules);

    sim_unf<<<num_curves, 1>>>(vel, kel, modclength, plength, dxinit, koff, temp, num_modules, dev_a, devStates);

    cudaMemcpy(unf_max_forces, dev_a, numtotal*sizeof(float), cudaMemcpyDeviceToHost);

    t2 = clock();
    float diff = ((float)t2 - (float)t1)/(float)CLOCKS_PER_SEC;
    cout << "Execution time for: " << num_curves << " curves: " << diff << " seconds" << endl;

    ofstream file;
    file.open("dataout.txt");
    for (int i = 0; i < num_curves*num_modules; ++i){
        int num_semicol = i % num_modules;
        for (int k = 0; k < num_semicol; ++k)
            file << ";";
        file << 1e12*unf_max_forces[i] << endl;
    }
    file.close();
}

Just ran your code; it runs just fine for both cases:

#timestep=(0.5e-12)/vel.

$ ./a.out 

Execution time for: 512 curves: 11.82 seconds

#timestep=(0.1e-12)/vel.

$ ./a.out

Execution time for: 512 curves: 53.81 seconds

-- Attached output file generated for this case.

The code looks good. It looks like you spent a lot of time implementing this. You could try using ArrayFire (a free CUDA GPU library) for your CUDA projects. It has a lot of useful functions: random data generation on GPUs, FFTs, convolutions, etc., and could save a lot of your development time.
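For example, random data can be generated directly on the GPU in a couple of lines. This is only a rough sketch to give the flavour; I am assuming the randu call here, so check the documentation of your ArrayFire version for the exact API:

#include <arrayfire.h>

int main(){
    // Rough sketch (assumed API, check your ArrayFire version's docs):
    // generate 512 uniform random values on the GPU, then rescale them to [-1, 1].
    af::array r = af::randu(512);
    r = 2.0f*r - 1.0f;
    return 0;
}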

Here are my system specifications (I used ArrayFire's info() :) )

#include <arrayfire.h>

int main(){
    af::info(); // Gives info about CUDA driver/toolkit, CUDA-capable GPUs in system
}

Output:

ArrayFire v1.0 (build 90a2495) by AccelerEyes (64-bit Linux)

License Type: Designated Computer (/home/short/.arrayfire.lic)

Addons: MGL4, DLA, SLA

CUDA toolkit 4.0, driver 290.10

GPU0 GeForce GT 540M, 2048 MB, Compute 2.1 (single,double)

Memory Usage: 998 MB free (2048 MB total)

Attachment: dataout.tar (50 KB)

Thank you for the tests. I was never able to reach 11 or 60 seconds of simulation; I am really starting to think that there is a subtle hardware bug or some other problem with my card. Well, I think the community has done everything possible. Thanks again.