[Beginner] Memory is reset in the kernel

Hi,

I am a beginner in CUDA development.

I am trying a very simple program : a 1D thermal diffusion.

But I cannot get more than 1 iteration to work, because the memory seems to be reset and I don’t know why.

Here is the program (see further down in the post for a simpler test that doesn’t work either):

// Intended to advance a 1D diffusion stencil by alternating updates between
// T0_d and T1_d (two updates per loop iteration).
//   T0_d, T1_d : device arrays indexed by the global thread index
//   Cd         : coefficient multiplying the second difference
//   N          : element count used in the bounds test
//
// NOTE(review): as written the loop body never executes (see the comment on
// the for statement), so the kernel leaves both arrays unchanged.
__global__ void kernel(float* T0_d,float* T1_d, float Cd,int N){

int index = threadIdx.x + blockIdx.x * blockDim.x;

  int j;

   // NOTE(review): j starts at 1 and the condition is `j==1000`, which is
   // false on entry, so the body runs zero times. A condition such as
   // `j<=1000` would be needed for the loop to iterate.
   for (j=1; j==1000; j++) 

   {

  // Without braces, this `if` guards only the T1_d assignment that follows.
  if (index < N-1 && index > 0)

  T1_d[index] = T0_d[index] + Cd*(T0_d[index+1]-2.0*T0_d[index]+T0_d[index-1]);

  // Barrier for the threads of one block; it does not order threads that
  // belong to different blocks.
  __syncthreads();

  // NOTE(review): this assignment is not covered by the bounds test above.
  // If the loop ran, every launched thread would execute it, including
  // index 0 (reads T1_d[index-1]) and any index >= N-1 (reads T1_d[index+1]).
  T0_d[index] = T1_d[index] + Cd*(T1_d[index+1]-2.0*T1_d[index]+T1_d[index-1]);

  __syncthreads();

 }

}

// export CC=/usr/bin/gcc-4.3

#include <stdio.h>

#include <assert.h>

#include <cuda.h>

// Prints a failed CUDA runtime call to stderr.
// Returns 1 when `err` is an error code, 0 when it is cudaSuccess.
static int cudaFailed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return 0;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 1;
}

// Host driver for the diffusion kernel.
// Builds an N-element profile (0 everywhere, 20 on indices 50..60), uploads it
// to T0_d with T1_d zeroed, launches `kernel` once, then prints T0_d followed
// by T1_d. Returns 0 on success and 1 if any allocation or CUDA call fails.
int main(void)
{
    float *T_h = NULL;                 // host buffer, reused for upload and download
    float *T0_d = NULL, *T1_d = NULL;  // device buffers
    int N = 100;
    int i;
    int failed = 0;                    // becomes 1 at the first failing CUDA call
    float L=1.0;
    float Dx,Dt,Cd;

    Dx = L/((N-1)*1.0);
    Dt = 0.4*Dx*Dx;
    Cd = Dt/(Dx*Dx);

    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
        printf("cudaGetDeviceCount FAILED CUDA Driver and Runtime version may be mismatched.\n");
        printf("\nFAILED\n");
        return 1;  // nothing below can work without a usable runtime
    }

    const size_t bytes = sizeof(float)*N;

    // allocate and zero the host array
    T_h = (float *)malloc(bytes);
    if (T_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    for (i=0; i<N; i++)
    {
        T_h[i] = 0.0f;
    }

    // allocate arrays on device; `||` short-circuits, so once a call has
    // failed every later CUDA call in the chain is skipped
    failed = failed || cudaFailed(cudaMalloc((void **) &T0_d, bytes), "cudaMalloc(T0_d)");
    failed = failed || cudaFailed(cudaMalloc((void **) &T1_d, bytes), "cudaMalloc(T1_d)");

    // T1_d starts as all zeros
    failed = failed || cudaFailed(cudaMemcpy(T1_d, T_h, bytes, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(T_h -> T1_d)");

    // initial condition for T0: a plateau of 20 on indices 50..60
    for (i=50; i<=60; i++)
    {
        T_h[i] = 20.0f;
    }

    // send data from host to device: T_h to T0_d
    failed = failed || cudaFailed(cudaMemcpy(T0_d, T_h, bytes, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(T_h -> T0_d)");

    if (!failed) {
        // CUDA grid setting
        dim3 block(256);
        int n1 = (N+block.x-1)/(block.x); // number of blocks on the x dimension
        dim3 grid(n1,1);

        kernel<<<grid,block>>>(T0_d,T1_d,Cd,N);

        // A launch returns no status itself; configuration errors are
        // reported through cudaGetLastError().
        failed = cudaFailed(cudaGetLastError(), "kernel launch");
    }

    // retrieve data from device: T0_d to T_h. This blocking copy runs after
    // the kernel, so an error raised while the kernel executed surfaces here.
    failed = failed || cudaFailed(cudaMemcpy(T_h, T0_d, bytes, cudaMemcpyDeviceToHost),
                                  "cudaMemcpy(T0_d -> T_h)");
    if (!failed) {
        for (i=0; i<N; i++)
        {
            printf("Host : i %d T %f\n",i,T_h[i]);
        }
    }

    // retrieve data from device: T1_d to T_h
    failed = failed || cudaFailed(cudaMemcpy(T_h, T1_d, bytes, cudaMemcpyDeviceToHost),
                                  "cudaMemcpy(T1_d -> T_h)");
    if (!failed) {
        for (i=0; i<N; i++)
        {
            printf("Host : i %d T %f\n",i,T_h[i]);
        }
    }

    free(T_h);
    cudaFree(T0_d);
    cudaFree(T1_d);

    return failed;
}

So I then tried a very simple kernel:

// Reduced test case: apparently meant to add 1.0 to T1_d[index] repeatedly for
// every index below N. T0_d and Cd are not used in this version.
//
// NOTE(review): as written the loop body never executes (see the comment on
// the for statement), so T1_d keeps whatever values it had before the launch.
__global__ void kernel(float* T0_d,float* T1_d, float Cd,int N){

  int index = threadIdx.x + blockIdx.x * blockDim.x;

  int j;

  // Bounds test; it guards the whole for statement below, including the
  // barrier inside the loop body.
  if (index < N && index >= 0)

   // NOTE(review): j starts at 1 and the condition is `j==10`, which is false
   // on entry, so the body runs zero times. With `j==1` the body would run
   // exactly once; a condition such as `j<=10` is needed for ten iterations.
   for (j=1; j==10; j++) 

   {

  T1_d[index] = T1_d[index] + 1.0;

  // NOTE(review): this barrier is inside the `if` above. If the loop ran,
  // threads with index >= N would skip it while other threads of the same
  // block wait on it.
  __syncthreads();

 }

}

And I don’t understand why, if the “for” loop does 1 iteration, T1 goes from 0 to 1, but if I ask for more than 1 iteration, T1 is set to 0.

Could someone help me understand why ? :)

Hi,

I am a beginner in CUDA development.

I am trying a very simple program : a 1D thermal diffusion.

But I cannot get more than 1 iteration to work, because the memory seems to be reset and I don’t know why.

Here is the program (see further down in the post for a simpler test that doesn’t work either):

// Intended to advance a 1D diffusion stencil by alternating updates between
// T0_d and T1_d (two updates per loop iteration).
//   T0_d, T1_d : device arrays indexed by the global thread index
//   Cd         : coefficient multiplying the second difference
//   N          : element count used in the bounds test
//
// NOTE(review): as written the loop body never executes (see the comment on
// the for statement), so the kernel leaves both arrays unchanged.
__global__ void kernel(float* T0_d,float* T1_d, float Cd,int N){

int index = threadIdx.x + blockIdx.x * blockDim.x;

  int j;

   // NOTE(review): j starts at 1 and the condition is `j==1000`, which is
   // false on entry, so the body runs zero times. A condition such as
   // `j<=1000` would be needed for the loop to iterate.
   for (j=1; j==1000; j++) 

   {

  // Without braces, this `if` guards only the T1_d assignment that follows.
  if (index < N-1 && index > 0)

  T1_d[index] = T0_d[index] + Cd*(T0_d[index+1]-2.0*T0_d[index]+T0_d[index-1]);

  // Barrier for the threads of one block; it does not order threads that
  // belong to different blocks.
  __syncthreads();

  // NOTE(review): this assignment is not covered by the bounds test above.
  // If the loop ran, every launched thread would execute it, including
  // index 0 (reads T1_d[index-1]) and any index >= N-1 (reads T1_d[index+1]).
  T0_d[index] = T1_d[index] + Cd*(T1_d[index+1]-2.0*T1_d[index]+T1_d[index-1]);

  __syncthreads();

 }

}

// export CC=/usr/bin/gcc-4.3

#include <stdio.h>

#include <assert.h>

#include <cuda.h>

// Prints a failed CUDA runtime call to stderr.
// Returns 1 when `err` is an error code, 0 when it is cudaSuccess.
static int cudaFailed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return 0;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 1;
}

// Host driver for the diffusion kernel.
// Builds an N-element profile (0 everywhere, 20 on indices 50..60), uploads it
// to T0_d with T1_d zeroed, launches `kernel` once, then prints T0_d followed
// by T1_d. Returns 0 on success and 1 if any allocation or CUDA call fails.
int main(void)
{
    float *T_h = NULL;                 // host buffer, reused for upload and download
    float *T0_d = NULL, *T1_d = NULL;  // device buffers
    int N = 100;
    int i;
    int failed = 0;                    // becomes 1 at the first failing CUDA call
    float L=1.0;
    float Dx,Dt,Cd;

    Dx = L/((N-1)*1.0);
    Dt = 0.4*Dx*Dx;
    Cd = Dt/(Dx*Dx);

    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
        printf("cudaGetDeviceCount FAILED CUDA Driver and Runtime version may be mismatched.\n");
        printf("\nFAILED\n");
        return 1;  // nothing below can work without a usable runtime
    }

    const size_t bytes = sizeof(float)*N;

    // allocate and zero the host array
    T_h = (float *)malloc(bytes);
    if (T_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    for (i=0; i<N; i++)
    {
        T_h[i] = 0.0f;
    }

    // allocate arrays on device; `||` short-circuits, so once a call has
    // failed every later CUDA call in the chain is skipped
    failed = failed || cudaFailed(cudaMalloc((void **) &T0_d, bytes), "cudaMalloc(T0_d)");
    failed = failed || cudaFailed(cudaMalloc((void **) &T1_d, bytes), "cudaMalloc(T1_d)");

    // T1_d starts as all zeros
    failed = failed || cudaFailed(cudaMemcpy(T1_d, T_h, bytes, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(T_h -> T1_d)");

    // initial condition for T0: a plateau of 20 on indices 50..60
    for (i=50; i<=60; i++)
    {
        T_h[i] = 20.0f;
    }

    // send data from host to device: T_h to T0_d
    failed = failed || cudaFailed(cudaMemcpy(T0_d, T_h, bytes, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(T_h -> T0_d)");

    if (!failed) {
        // CUDA grid setting
        dim3 block(256);
        int n1 = (N+block.x-1)/(block.x); // number of blocks on the x dimension
        dim3 grid(n1,1);

        kernel<<<grid,block>>>(T0_d,T1_d,Cd,N);

        // A launch returns no status itself; configuration errors are
        // reported through cudaGetLastError().
        failed = cudaFailed(cudaGetLastError(), "kernel launch");
    }

    // retrieve data from device: T0_d to T_h. This blocking copy runs after
    // the kernel, so an error raised while the kernel executed surfaces here.
    failed = failed || cudaFailed(cudaMemcpy(T_h, T0_d, bytes, cudaMemcpyDeviceToHost),
                                  "cudaMemcpy(T0_d -> T_h)");
    if (!failed) {
        for (i=0; i<N; i++)
        {
            printf("Host : i %d T %f\n",i,T_h[i]);
        }
    }

    // retrieve data from device: T1_d to T_h
    failed = failed || cudaFailed(cudaMemcpy(T_h, T1_d, bytes, cudaMemcpyDeviceToHost),
                                  "cudaMemcpy(T1_d -> T_h)");
    if (!failed) {
        for (i=0; i<N; i++)
        {
            printf("Host : i %d T %f\n",i,T_h[i]);
        }
    }

    free(T_h);
    cudaFree(T0_d);
    cudaFree(T1_d);

    return failed;
}

So I then tried a very simple kernel:

// Reduced test case: apparently meant to add 1.0 to T1_d[index] repeatedly for
// every index below N. T0_d and Cd are not used in this version.
//
// NOTE(review): as written the loop body never executes (see the comment on
// the for statement), so T1_d keeps whatever values it had before the launch.
__global__ void kernel(float* T0_d,float* T1_d, float Cd,int N){

  int index = threadIdx.x + blockIdx.x * blockDim.x;

  int j;

  // Bounds test; it guards the whole for statement below, including the
  // barrier inside the loop body.
  if (index < N && index >= 0)

   // NOTE(review): j starts at 1 and the condition is `j==10`, which is false
   // on entry, so the body runs zero times. With `j==1` the body would run
   // exactly once; a condition such as `j<=10` is needed for ten iterations.
   for (j=1; j==10; j++) 

   {

  T1_d[index] = T1_d[index] + 1.0;

  // NOTE(review): this barrier is inside the `if` above. If the loop ran,
  // threads with index >= N would skip it while other threads of the same
  // block wait on it.
  __syncthreads();

 }

}

And I don’t understand why, if the “for” loop does 1 iteration, T1 goes from 0 to 1, but if I ask for more than 1 iteration, T1 is set to 0.

Could someone help me understand why ? :)

The for loop above doesn’t execute even once, j == 10 is never true.

Also having that __syncthreads() inside the if() is dangerous if the number of threads > N (and N is not a multiple of the block size), since threads with index < N (within a block) will get to that sync point and the remaining won’t, and the gpu will likely hang.

The for loop above doesn’t execute even once, j == 10 is never true.

Also having that __syncthreads() inside the if() is dangerous if the number of threads > N (and N is not a multiple of the block size), since threads with index < N (within a block) will get to that sync point and the remaining won’t, and the gpu will likely hang.

Thank you for this reply.

I read more documentation and I got it. :)

But now, there is something I don’t understand. I made a simple program which sometimes runs fine, but can also return 0 or NaN/Inf.

Here is the program :

// export CC=/usr/bin/gcc-4.3

#include <stdio.h>

#include <assert.h>

#include <cuda.h>

// For the element owned by each thread, adds exp2f(T_d[index] / Ta_d[k]) to
// Ar_d[index] for every k in 0 .. ns-1.
//
//   T_d  : input values, one per thread
//   Ta_d : divisors; must hold at least ns elements
//   Ar_d : accumulators, one per thread, updated in place
//   ns   : number of entries of Ta_d to use
//
// Launch layout: 1D grid, one thread per element of T_d / Ar_d. The kernel has
// no element-count parameter, so the launch must not create more threads than
// those arrays hold.
__global__ void TestKernel(float *T_d,float *Ta_d,float *Ar_d,int ns)
{
    int index = blockIdx.x*blockDim.x+threadIdx.x;

    // Valid indices into Ta_d are 0 .. ns-1. Iterating 1 .. ns instead skips
    // entry 0 and reads Ta_d[ns], one element past the end; that value is
    // undefined, so the sum can come out as Inf/NaN or differ between runs.
    for (int nsp = 0; nsp < ns; nsp++)
    {
        Ar_d[index] += exp2f(T_d[index]/Ta_d[nsp]);
    }
}

// Prints a failed CUDA runtime call to stderr.
// Returns 1 when `err` is an error code, 0 when it is cudaSuccess.
static int cudaFailed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return 0;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 1;
}

// Host driver for TestKernel.
// Fills T with N values of 0.01 and Ta with ns values of 1, zeroes Ar, runs
// TestKernel with a single block of N threads, and prints the resulting Ar.
// Returns 0 on success and 1 if any allocation or CUDA call fails.
int main(void)
{
    float *T_h = NULL, *Ta_h = NULL, *Ar_h = NULL;  // pointers to host memory
    float *T_d = NULL, *Ta_d = NULL, *Ar_d = NULL;  // pointers to device memory
    const int N = 8;    // elements in T and Ar; also the number of threads launched
    const int ns = 10;  // elements in Ta
    int i;
    int failed = 0;     // becomes 1 at the first failing CUDA call

    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
        printf("cudaGetDeviceCount FAILED CUDA Driver and Runtime version may be mismatched.\n");
        printf("\nFAILED\n");
        return 1;  // nothing below can work without a usable runtime
    }

    // allocate arrays on host
    Ar_h = (float *)malloc(sizeof(float)*N);
    T_h = (float *)malloc(sizeof(float)*N);
    Ta_h = (float *)malloc(sizeof(float)*ns);
    if (Ar_h == NULL || T_h == NULL || Ta_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(T_h); free(Ar_h); free(Ta_h);
        return 1;
    }

    // initialize host data
    for (i=0; i<N; i++)
    {
        Ar_h[i] = 0.f;
        T_h[i] = 0.01f;
    }
    for (i=0; i<ns; i++)
    {
        Ta_h[i] = 1.f;
    }

    // allocate arrays on device; `||` short-circuits, so once a call has
    // failed every later CUDA call in the chain is skipped
    failed = failed || cudaFailed(cudaMalloc((void **) &Ar_d, sizeof(float)*N), "cudaMalloc(Ar_d)");
    failed = failed || cudaFailed(cudaMalloc((void **) &T_d, sizeof(float)*N), "cudaMalloc(T_d)");
    failed = failed || cudaFailed(cudaMalloc((void **) &Ta_d, sizeof(float)*ns), "cudaMalloc(Ta_d)");

    // upload inputs and the zeroed accumulator
    failed = failed || cudaFailed(cudaMemcpy(Ar_d, Ar_h, sizeof(float)*N, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(Ar_h -> Ar_d)");
    failed = failed || cudaFailed(cudaMemcpy(T_d, T_h, sizeof(float)*N, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(T_h -> T_d)");
    failed = failed || cudaFailed(cudaMemcpy(Ta_d, Ta_h, sizeof(float)*ns, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(Ta_h -> Ta_d)");

    if (!failed) {
        // One block of exactly N threads: TestKernel has no bounds check, so
        // the thread count must match the length of T_d / Ar_d.
        dim3 DimGrid(1,1);
        dim3 DimBlock(N,1,1);

        TestKernel<<<DimGrid,DimBlock>>>(T_d,Ta_d,Ar_d,ns);

        // A launch returns no status itself: configuration errors come from
        // cudaGetLastError(), execution errors from the synchronizing call.
        // (cudaThreadSynchronize is kept from the original program; newer
        // toolkits provide the same operation as cudaDeviceSynchronize.)
        failed = cudaFailed(cudaGetLastError(), "TestKernel launch")
              || cudaFailed(cudaThreadSynchronize(), "TestKernel execution");
    }

    failed = failed || cudaFailed(cudaMemcpy(Ar_h, Ar_d, sizeof(float)*N, cudaMemcpyDeviceToHost),
                                  "cudaMemcpy(Ar_d -> Ar_h)");

    if (!failed) {
        printf("final value");
        for (i=0; i<N; i++)
        {
            printf("Host : a %d => %f\n",i,Ar_h[i]);
        }
    }

    free(T_h); free(Ar_h); free(Ta_h);
    cudaFree(T_d); cudaFree(Ar_d); cudaFree(Ta_d);

    return failed;
}

There must be a memory mistake somewhere, but I can’t find it :/

Thank you for this reply.

I read more documentation and I got it. :)

But now, there is something I don’t understand. I made a simple program which sometimes runs fine, but can also return 0 or NaN/Inf.

Here is the program :

// export CC=/usr/bin/gcc-4.3

#include <stdio.h>

#include <assert.h>

#include <cuda.h>

// For the element owned by each thread, adds exp2f(T_d[index] / Ta_d[k]) to
// Ar_d[index] for every k in 0 .. ns-1.
//
//   T_d  : input values, one per thread
//   Ta_d : divisors; must hold at least ns elements
//   Ar_d : accumulators, one per thread, updated in place
//   ns   : number of entries of Ta_d to use
//
// Launch layout: 1D grid, one thread per element of T_d / Ar_d. The kernel has
// no element-count parameter, so the launch must not create more threads than
// those arrays hold.
__global__ void TestKernel(float *T_d,float *Ta_d,float *Ar_d,int ns)
{
    int index = blockIdx.x*blockDim.x+threadIdx.x;

    // Valid indices into Ta_d are 0 .. ns-1. Iterating 1 .. ns instead skips
    // entry 0 and reads Ta_d[ns], one element past the end; that value is
    // undefined, so the sum can come out as Inf/NaN or differ between runs.
    for (int nsp = 0; nsp < ns; nsp++)
    {
        Ar_d[index] += exp2f(T_d[index]/Ta_d[nsp]);
    }
}

// Prints a failed CUDA runtime call to stderr.
// Returns 1 when `err` is an error code, 0 when it is cudaSuccess.
static int cudaFailed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return 0;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 1;
}

// Host driver for TestKernel.
// Fills T with N values of 0.01 and Ta with ns values of 1, zeroes Ar, runs
// TestKernel with a single block of N threads, and prints the resulting Ar.
// Returns 0 on success and 1 if any allocation or CUDA call fails.
int main(void)
{
    float *T_h = NULL, *Ta_h = NULL, *Ar_h = NULL;  // pointers to host memory
    float *T_d = NULL, *Ta_d = NULL, *Ar_d = NULL;  // pointers to device memory
    const int N = 8;    // elements in T and Ar; also the number of threads launched
    const int ns = 10;  // elements in Ta
    int i;
    int failed = 0;     // becomes 1 at the first failing CUDA call

    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
        printf("cudaGetDeviceCount FAILED CUDA Driver and Runtime version may be mismatched.\n");
        printf("\nFAILED\n");
        return 1;  // nothing below can work without a usable runtime
    }

    // allocate arrays on host
    Ar_h = (float *)malloc(sizeof(float)*N);
    T_h = (float *)malloc(sizeof(float)*N);
    Ta_h = (float *)malloc(sizeof(float)*ns);
    if (Ar_h == NULL || T_h == NULL || Ta_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(T_h); free(Ar_h); free(Ta_h);
        return 1;
    }

    // initialize host data
    for (i=0; i<N; i++)
    {
        Ar_h[i] = 0.f;
        T_h[i] = 0.01f;
    }
    for (i=0; i<ns; i++)
    {
        Ta_h[i] = 1.f;
    }

    // allocate arrays on device; `||` short-circuits, so once a call has
    // failed every later CUDA call in the chain is skipped
    failed = failed || cudaFailed(cudaMalloc((void **) &Ar_d, sizeof(float)*N), "cudaMalloc(Ar_d)");
    failed = failed || cudaFailed(cudaMalloc((void **) &T_d, sizeof(float)*N), "cudaMalloc(T_d)");
    failed = failed || cudaFailed(cudaMalloc((void **) &Ta_d, sizeof(float)*ns), "cudaMalloc(Ta_d)");

    // upload inputs and the zeroed accumulator
    failed = failed || cudaFailed(cudaMemcpy(Ar_d, Ar_h, sizeof(float)*N, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(Ar_h -> Ar_d)");
    failed = failed || cudaFailed(cudaMemcpy(T_d, T_h, sizeof(float)*N, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(T_h -> T_d)");
    failed = failed || cudaFailed(cudaMemcpy(Ta_d, Ta_h, sizeof(float)*ns, cudaMemcpyHostToDevice),
                                  "cudaMemcpy(Ta_h -> Ta_d)");

    if (!failed) {
        // One block of exactly N threads: TestKernel has no bounds check, so
        // the thread count must match the length of T_d / Ar_d.
        dim3 DimGrid(1,1);
        dim3 DimBlock(N,1,1);

        TestKernel<<<DimGrid,DimBlock>>>(T_d,Ta_d,Ar_d,ns);

        // A launch returns no status itself: configuration errors come from
        // cudaGetLastError(), execution errors from the synchronizing call.
        // (cudaThreadSynchronize is kept from the original program; newer
        // toolkits provide the same operation as cudaDeviceSynchronize.)
        failed = cudaFailed(cudaGetLastError(), "TestKernel launch")
              || cudaFailed(cudaThreadSynchronize(), "TestKernel execution");
    }

    failed = failed || cudaFailed(cudaMemcpy(Ar_h, Ar_d, sizeof(float)*N, cudaMemcpyDeviceToHost),
                                  "cudaMemcpy(Ar_d -> Ar_h)");

    if (!failed) {
        printf("final value");
        for (i=0; i<N; i++)
        {
            printf("Host : a %d => %f\n",i,Ar_h[i]);
        }
    }

    free(T_h); free(Ar_h); free(Ta_h);
    cudaFree(T_d); cudaFree(Ar_d); cudaFree(Ta_d);

    return failed;
}

There must be a memory mistake somewhere, but I can’t find it :/