Hi,
I am a beginner in CUDA development.
I am trying a very simple program: a 1D thermal diffusion.
But I can't get it to run more than one iteration, because the memory seems to be reset and I don't know why.
Here is the program (see further down the post for a simpler test that doesn't work either):
__global__ void kernel(float* T0_d, float* T1_d, float Cd, int N){
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int j;
    for (j=1; j==1000; j++)
    {
        if (index < N-1 && index > 0)
            T1_d[index] = T0_d[index] + Cd*(T0_d[index+1]-2.0*T0_d[index]+T0_d[index-1]);
        __syncthreads();
        T0_d[index] = T1_d[index] + Cd*(T1_d[index+1]-2.0*T1_d[index]+T1_d[index-1]);
        __syncthreads();
    }
}
// export CC=/usr/bin/gcc-4.3
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
int main(void)
{
    float *T_h;          // pointers to host memory
    float *T0_d, *T1_d;  // pointers to device memory
    int N = 100;
    int i;
    float L = 1.0;
    float Dx, Dt, Cd;
    Dx = L/((N-1)*1.0);
    Dt = 0.4*Dx*Dx;
    Cd = Dt/(Dx*Dx);

    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
        printf("cudaGetDeviceCount FAILED CUDA Driver and Runtime version may be mismatched.\n");
        printf("\nFAILED\n");
    }

    // allocate arrays on host
    T_h = (float *)malloc(sizeof(float)*N);
    // allocate arrays on device
    cudaMalloc((void **) &T0_d, sizeof(float)*N);
    cudaMalloc((void **) &T1_d, sizeof(float)*N);

    // initialize host data
    for (i=0; i<N; i++)
    {
        T_h[i] = 0.0;
    }
    // zero T1 on the device (T_h is still all zeros at this point)
    cudaMemcpy(T1_d, T_h, sizeof(float)*N, cudaMemcpyHostToDevice);

    // hot spot in the middle of the rod
    for (i=50; i<=60; i++)
    {
        T_h[i] = 20.0;
    }

    // send data from host to device: T_h to T0_d
    cudaMemcpy(T0_d, T_h, sizeof(float)*N, cudaMemcpyHostToDevice);

    // CUDA grid setting
    dim3 block(256);
    int n1 = (N+block.x-1)/(block.x); // number of blocks in the x dimension
    dim3 grid(n1,1);
    kernel<<<grid,block>>>(T0_d,T1_d,Cd,N);

    // retrieve data from device: T0_d to T_h
    cudaMemcpy(T_h, T0_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
    for (i=0; i<N; i++)
    {
        printf("Host : i %d T %f\n",i,T_h[i]);
    }

    // retrieve T1_d as well
    cudaMemcpy(T_h, T1_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
    for (i=0; i<N; i++)
    {
        printf("Host : i %d T %f\n",i,T_h[i]);
    }

    free(T_h);
    cudaFree(T0_d);
    cudaFree(T1_d);
}
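For reference, this is the serial CPU version of the ping-pong update I am trying to reproduce on the GPU (my own reference code, just to make the intended scheme explicit; diffusion_step_cpu is not part of the program above):

// One pair of explicit diffusion steps on the CPU, ping-ponging between T0 and T1.
// The boundary values T[0] and T[N-1] are left untouched.
void diffusion_step_cpu(float* T0, float* T1, float Cd, int N)
{
    int i;
    for (i = 1; i < N-1; i++)
        T1[i] = T0[i] + Cd*(T0[i+1] - 2.0f*T0[i] + T0[i-1]);
    for (i = 1; i < N-1; i++)
        T0[i] = T1[i] + Cd*(T1[i+1] - 2.0f*T1[i] + T1[i-1]);
}

Doing two sub-steps per call is just so that T0 holds the result again at the end, like in the kernel.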
So I then tried a very simple kernel:
__global__ void kernel(float* T0_d, float* T1_d, float Cd, int N){
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int j;
    if (index < N && index >= 0)
        for (j=1; j==10; j++)
        {
            T1_d[index] = T1_d[index] + 1.0;
            __syncthreads();
        }
}
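The host code I use for this test is basically a stripped-down version of the main above (sketched here, not copied verbatim from what I ran; it lives in the same .cu file as the kernel):

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

int main(void)
{
    float *T_h;
    float *T0_d, *T1_d;
    int N = 100;
    int i;

    // host array, all zeros
    T_h = (float *)malloc(sizeof(float)*N);
    for (i = 0; i < N; i++)
        T_h[i] = 0.0;

    // device arrays; T1_d is zeroed from T_h, T0_d is not used by this kernel
    cudaMalloc((void **) &T0_d, sizeof(float)*N);
    cudaMalloc((void **) &T1_d, sizeof(float)*N);
    cudaMemcpy(T1_d, T_h, sizeof(float)*N, cudaMemcpyHostToDevice);

    // one block of 256 threads covers the N = 100 elements
    dim3 block(256);
    dim3 grid((N+block.x-1)/block.x, 1);
    kernel<<<grid,block>>>(T0_d, T1_d, 0.0, N);   // Cd is not used by this test kernel

    // copy T1 back and print it
    cudaMemcpy(T_h, T1_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
    for (i = 0; i < N; i++)
        printf("Host : i %d T %f\n", i, T_h[i]);

    free(T_h);
    cudaFree(T0_d);
    cudaFree(T1_d);
    return 0;
}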
And I don't understand why, when the "for" loop does one iteration, T1 goes from 0 to 1, but when I do more than one iteration, T1 is set to 0.
Could someone help me understand why? :)
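P.S. In case the problem is actually an error I am not catching, this is the kind of check I could wrap around the CUDA calls and the kernel launch (the CUDA_CHECK macro is just my own helper, not something from the toolkit):

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// Print a readable message and abort if a runtime call returned an error.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            printf("CUDA error: %s at %s:%d\n",                       \
                   cudaGetErrorString(err), __FILE__, __LINE__);      \
            exit(1);                                                  \
        }                                                             \
    } while (0)

// usage:
//   CUDA_CHECK(cudaMalloc((void **) &T0_d, sizeof(float)*N));
//   kernel<<<grid,block>>>(T0_d, T1_d, Cd, N);
//   CUDA_CHECK(cudaGetLastError());        // reports launch errors
//   CUDA_CHECK(cudaThreadSynchronize());   // reports errors during kernel execution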