This is surely elementary to many but I am stuck.
How does one pass the address of an array allocated with cudaMalloc to a function on the host?
For example, the first template of code works perfectly.
The second does not; the only change is that the cudaMemcpy call has been moved into the genNums function.
I have convinced myself that I am not passing gpu_odata (an address/pointer) to the function correctly, but everything I have tried fails.
I hope someone can be kind enough to tell me the simple error I am making. Thanks
(In case it is relevant, genNums is in fact recursive)
// Recursively builds up the host array `all` (the recursion details are
// elided in this template).
//
// all: host buffer to fill; main passes an array of MAXALL unsigned shorts,
//      so the parameter has to be a pointer rather than a single value.
// NOTE(review): the base case and the returned value are not shown in the
// post; main stores the result in `ret`.
int genNums(unsigned short *all)
{
    genNums(all);
    // ...array `all` is fully built up once the recursive calls finally return
}
// Working template: build the data on the host with genNums, copy it to the
// device from main, then launch get_sums on the device buffer.
// NOTE(review): ret, cnt, blockSize and nBlocks are presumably declared
// outside this excerpt; they are used here exactly as in the original.
int main()
{
    unsigned short all[MAXALL];
    int nBytes = MAXALL*sizeof(unsigned short);

    // Must be a pointer: cudaMalloc writes the device address into it.
    unsigned short *gpu_odata;

    // cudaMalloc takes the address of the pointer, hence the void** cast.
    if (cudaMalloc( (void**) &gpu_odata, nBytes) != cudaSuccess)
        return 1;

    ret = genNums(all);

    // Host -> device copy of the finished array; stop if it fails rather than
    // launching the kernel on a buffer that was never filled.
    if (cudaMemcpy(gpu_odata, all, nBytes, cudaMemcpyHostToDevice) != cudaSuccess) {
        cudaFree(gpu_odata);
        return 1;
    }

    // Round up so that a partial final block is still launched.
    nBlocks = cnt/blockSize + (cnt%blockSize == 0?0:1);
    get_sums <<< nBlocks, blockSize >>> (gpu_odata);
    // ...everything fine
}
But this code does not work:
// Recursive variant that also uploads the host array to the device.
//
// all:       host buffer being filled; main passes an array of MAXALL
//            unsigned shorts, so this has to be a pointer.
// gpu_odata: device buffer obtained from cudaMalloc; it is passed by value,
//            which is sufficient because only the memory it points to is
//            written.
// NOTE(review): the base case and the returned value are not shown in the
// post; main stores the result in `ret`.
int genNums(unsigned short *all, unsigned short* gpu_odata)
{
    genNums(all, gpu_odata);

    // nBytes is a local variable of main and is not visible here, so the
    // size is recomputed the same way main computes it.
    const int nBytes = MAXALL*sizeof(unsigned short);

    // NOTE(review): placed after the recursive call, this copy runs once per
    // recursion level as the calls unwind; if a single upload is intended,
    // do it only in the outermost call (or back in main).
    // The cudaError_t returned here should be inspected (for example with
    // cudaGetErrorString) to find out why a copy fails.
    cudaMemcpy(gpu_odata, all, nBytes, cudaMemcpyHostToDevice);
}
// Failing template: identical to the working one except that the
// host -> device copy is performed inside genNums.
// NOTE(review): ret, cnt, blockSize and nBlocks are presumably declared
// outside this excerpt; they are used here exactly as in the original.
int main()
{
    unsigned short all[MAXALL];
    int nBytes = MAXALL*sizeof(unsigned short);

    // Must be a pointer: cudaMalloc writes the device address into it.
    unsigned short *gpu_odata;

    // cudaMalloc takes the address of the pointer, hence the void** cast.
    if (cudaMalloc( (void**) &gpu_odata, nBytes) != cudaSuccess)
        return 1;

    // Passing the device pointer by value is correct here: genNums only
    // needs the address so that cudaMemcpy can write to the device buffer.
    // (Original question: is gpu_odata being passed incorrectly, even though
    // the code compiles?)
    ret = genNums(all, gpu_odata);

    // Round up so that a partial final block is still launched.
    nBlocks = cnt/blockSize + (cnt%blockSize == 0?0:1);
    get_sums <<< nBlocks, blockSize >>> (gpu_odata);
    // ...does not work, cudaMemcpy in function fails
}