Hello, my code is behaving as though the kernel calls are being skipped entirely. To illustrate this, I've written a very simple program that should fill every element of a float array with the value 100:
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>
#include <cutil.h>
// Kernel: each thread writes 100.0 into the element of Pd selected by its own
// threadIdx.x.
//
// Pd - pointer dereferenced in device code, so it must be device-accessible.
//
// Only threadIdx.x is used as the index (blockIdx is not consulted) and there
// is no bounds check, so the caller must make sure Pd has room for one float
// per thread of a block.
__global__ void fill(float* Pd)
{
    const float fillValue = 100.0F;
    Pd[threadIdx.x] = fillValue;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Allocates a `width`-element float array on the device, launches `fill` on it
// with one block of `width` threads, copies the result back to the host and
// prints every element.
//
// Returns EXIT_SUCCESS when every step succeeded; otherwise prints the failing
// step to stderr and returns EXIT_FAILURE. Without these checks a failed
// launch or copy goes unnoticed and the program prints whatever happens to be
// in the uninitialised host buffer.
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    const int width = 10;
    const size_t size = width * sizeof(float);

    float* P = (float*)malloc(size);
    if (P == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)size);
        return EXIT_FAILURE;
    }

    int exitCode = EXIT_FAILURE;
    float* Pd = NULL;
    if (cudaSucceeded(cudaMalloc((void**)&Pd, size), "cudaMalloc")) {
        fill<<<1, width>>>(Pd);

        // A kernel launch returns nothing; configuration/launch failures are
        // only visible through cudaGetLastError(). Errors raised while the
        // kernel runs are reported by the blocking cudaMemcpy that follows.
        if (cudaSucceeded(cudaGetLastError(), "fill kernel launch") &&
            cudaSucceeded(cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost),
                          "cudaMemcpy (device to host)")) {
            for (int i = 0; i < width; i++)
                printf("%f, ", P[i]);
            exitCode = EXIT_SUCCESS;
        }

        // Release the device buffer on both the success and the failure path.
        cudaSucceeded(cudaFree(Pd), "cudaFree");
    }

    free(P);
    return exitCode;
}
If you take a look at the code, the expected output would be:
100.000000, 100.000000, 100.000000, 100.000000, 100.000000, 100.000000, 100.000000, 100.000000, 100.000000, 100.000000,
but instead I’m getting all zeros as though the kernel is not doing anything.
Can anyone see why this might be happening? Also, would someone mind compiling and running this on their own machine, so I can tell whether the problem is in the code or in my setup?
Thanks in advance!