Hello everyone,
I was having trouble with one of the kernels in my CFD code, so I wrote this small program that reproduces the problem.
[codebox]
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <string.h>
/*
 * kernel1 - write 1.0f into g_array_out at four strided positions for each
 * look-up-table entry handled by the calling thread.
 *
 * Parameters:
 *   g_array_out  output array (device pointer). For every processed entry the
 *                kernel writes g_array_out[g_table[g_pid] + j*dummy2] for
 *                j = 0..3, so the array must be large enough for the largest
 *                such index.
 *   g_table      look-up table (device pointer) with at least dummy2 entries;
 *                entry g_pid gives the base output index for element g_pid.
 *   dummy1       number of elements each thread processes (outer loop count).
 *   dummy2       number of valid table entries; also the stride between the
 *                four writes made for one entry.
 *
 * Indexing: each block covers dummy1*blockDim.x consecutive element ids;
 * within a block, iteration k of a thread handles element k*blockDim.x + tid.
 * Only the x dimension of the launch configuration is used.
 */
__global__ void kernel1(float *g_array_out, int *g_table, int dummy1, int dummy2 )
{
    const int tid = threadIdx.x;

    for (int k = 0; k < dummy1; k++)
    {
        // Element id inside this block's slice, then the global element id.
        const int b_pid = k * blockDim.x + tid;
        const int g_pid = blockIdx.x * dummy1 * blockDim.x + b_pid;

        // Guard the tail: threads past the last table entry do nothing.
        if (g_pid < dummy2)
        {
            int index_l = g_table[g_pid];

            for (int j = 0; j < 4; j++)
            {
                // Float literal: avoids a double constant in a float store.
                g_array_out[index_l] = 1.0f;
                index_l += dummy2;
            }
        }
    }
}
/* Abort with a readable message when a CUDA runtime call did not succeed. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

/*
 * Test driver: builds an identity look-up table of 16 entries on the device,
 * launches kernel1 with one block of 16 threads, copies the 4*16 output
 * floats back to the host and prints them one per line.
 */
int main()
{
    const int n_entries = 16;   // look-up table length, passed to the kernel as dummy2
    const int n_copies  = 4;    // writes per entry; must match the kernel's inner loop count
    const size_t table_bytes = n_entries * sizeof(int);
    const size_t out_bytes   = (size_t)n_copies * n_entries * sizeof(float);

    float *h_array_out = 0;
    float *d_array_out = 0;
    int *d_table = 0;
    int *h_table = 0;
    int block_size = 16;
    int grid_size = 1;

    // allocate look_up table
    check_cuda(cudaMalloc((void**) &d_table, table_bytes), "cudaMalloc(d_table)");
    check_cuda(cudaMemset(d_table, 0, table_bytes), "cudaMemset(d_table)");

    h_table = (int*)malloc(table_bytes);
    if (h_table == 0)
    {
        fprintf(stderr, "couldn't allocate host memory\n");
        exit(1);
    }
    for (int g_pid = 0; g_pid < n_entries; g_pid++)
    {
        h_table[g_pid] = g_pid;   // identity mapping: entry i points at output index i
    }
    check_cuda(cudaMemcpy(d_table, h_table, table_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(d_table)");
    free(h_table);

    // allocate array_out; verify the allocation BEFORE the pointer is used
    if (cudaMalloc((void**) &d_array_out, out_bytes) != cudaSuccess || d_array_out == 0)
    {
        printf("couldn't allocate device memory\n");
        exit(1);
    }
    check_cuda(cudaMemset(d_array_out, 0, out_bytes), "cudaMemset(d_array_out)");

    kernel1<<<grid_size,block_size>>>(d_array_out, d_table, 1, n_entries);
    // A launch does not return a status; bad configurations show up here.
    check_cuda(cudaGetLastError(), "kernel1 launch");

    h_array_out = (float*)malloc(out_bytes);
    if (h_array_out == 0)
    {
        fprintf(stderr, "couldn't allocate host memory\n");
        exit(1);
    }
    // Blocking copy: waits for the kernel and reports any error it raised.
    check_cuda(cudaMemcpy(h_array_out, d_array_out, out_bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy(h_array_out)");

    for (int i = 0; i < n_copies * n_entries; i++)
    {
        printf("%f\n", h_array_out[i]);
    }

    free(h_array_out);
    cudaFree(d_array_out);
    cudaFree(d_table);
    return 0;
}
[/codebox]
I would expect the output array (array_out) to be an array full of ones. However, when I compile with:
nvcc -arch=sm_13 -I/usr/local/cuda/include ../src/cuda_methods.cu -o cuda_exec
and execute the program, the “array_out” array has zeros in positions [16:47]. What is also really strange is that if I comment out the outer loop in the kernel (since dummy1 is equal to 1 anyway), then I obtain what I expect: array_out full of ones. Also, if I replace the line “index_l = g_table[g_pid]” with “index_l = g_pid”, I also get the correct answer, even though g_table[g_pid] == g_pid for every entry. It seems like it’s something related to integer arrays not being dereferenced properly. I heard about the “-malign-double” option, but that doesn’t seem to work when used with nvcc.
Do you guys have any idea what’s going on?
I would really appreciate your help.