I have a class Object, and one of its members is a pointer, arrayOfFloats, intended to point to an array of floats.
In the kernel, each thread defines a variable of class Object. However, since the content of arrayOfFloats is read from a file, I want to read the file on the host side, allocate device memory for it, and then pass the device pointer into the kernel so that each Object's arrayOfFloats can be set from it, like this:
/*
 * Device-side object that gives a thread access to an array of floats.
 *
 * The array can be attached in two ways:
 *   - initArrayOfFloatsPtr: store the pointer that was passed in (no copy);
 *   - initArrayOfFloatsCpy: allocate a private buffer with in-kernel malloc
 *     and copy the elements into it.
 *
 * NOTE(review): nothing visible in this class releases the buffer created by
 * initArrayOfFloatsCpy; unless one of the members elided below frees it, the
 * allocation stays on the device heap after the kernel returns.
 */
class Object{
private:
    // Address of the float data; left unset until one of the init* methods runs.
    float *arrayOfFloats;
public:
    /*
     * Attach an existing array by address. No elements are copied; the object
     * simply remembers devPtr, which must be dereferenceable from device code.
     */
    __device__ void initArrayOfFloatsPtr(float *devPtr){
        arrayOfFloats=devPtr;
    }
    /*
     * Attach a private copy of the first arraySize elements of devPtr.
     *
     * In-kernel malloc draws from the (limited) device heap and returns NULL
     * when the request cannot be satisfied, which is easy to hit when many
     * threads each allocate. On failure arrayOfFloats is left NULL and nothing
     * is copied, instead of writing through a null pointer.
     */
    __device__ void initArrayOfFloatsCpy(float *devPtr,int arraySize){
        arrayOfFloats=(float*)malloc(arraySize*sizeof(float));
        if(arrayOfFloats==0)
            return;
        for(int i=0;i<arraySize;i++)
            arrayOfFloats[i]=devPtr[i];
    }
    /*
     * Print some elements of arrayOfFloats (body elided in the original).
     * The return type was missing; a member function must declare one.
     */
    __device__ void printfArrayOfFloats(){
        /* calls a simple printf to some elements of arrayOfFloats */
    }
    /* the remaining public members */
    /* et cetera, et cetera */
};  // a class definition must be terminated with ';'
/*
 * Kernel in which every thread builds its own Object holding a private copy
 * of the arraySize floats at devPtr, then prints from that copy.
 *
 * devPtr    - device-accessible pointer to the source floats.
 * arraySize - number of floats to copy per thread.
 *
 * Each thread performs its own allocation and element-by-element copy inside
 * initArrayOfFloatsCpy, so the cost grows with the number of launched threads.
 */
__global__ void kernelCopy(float *devPtr,int arraySize){
    // "Object var();" would declare a function named var returning Object
    // (the "most vexing parse"); omit the parentheses to define an object.
    Object var;
    var.initArrayOfFloatsCpy(devPtr,arraySize);
    var.printfArrayOfFloats();
}
/*
 * Kernel in which every thread builds its own Object that refers to the
 * floats at devPtr by address (no copy), then prints from them.
 *
 * devPtr - device-accessible pointer to the floats; it is passed to the
 *          kernel by value, so every thread's Object ends up pointing at the
 *          same data.
 */
__global__ void kernelPtr(float *devPtr){
    // "Object var();" would declare a function named var returning Object
    // (the "most vexing parse"); omit the parentheses to define an object.
    Object var;
    var.initArrayOfFloatsPtr(devPtr);
    var.printfArrayOfFloats();
}  // closing brace was missing, which swallowed the following definition
/*
 * Host driver: reads the floats on the host, uploads them to the device once,
 * and runs both kernels against the same device buffer.
 *
 * arraySize, someBlocks and someThreads are placeholders that are not defined
 * in the visible code. Returns 0 on success and 1 if the input is missing or
 * any CUDA call/launch fails.
 */
int main(){
    float *arrayOfFloats_host=readFloatsFromFile("some_file.csv");
    // NOTE(review): assumes readFloatsFromFile signals failure with a null
    // pointer -- confirm against its definition.
    if(arrayOfFloats_host==0)
        return 1;

    // Allocate the device buffer. The original passed the undeclared name
    // "arrayOfFloats" here, leaving arrayOfFloats_device uninitialised.
    float *arrayOfFloats_device=0;
    if(cudaMalloc((void**)&arrayOfFloats_device,arraySize*sizeof(float))!=cudaSuccess)
        return 1;

    // cudaMalloc only reserves memory; the file contents must be uploaded
    // explicitly or the kernels read uninitialised data.
    if(cudaMemcpy(arrayOfFloats_device,arrayOfFloats_host,arraySize*sizeof(float),cudaMemcpyHostToDevice)!=cudaSuccess){
        cudaFree(arrayOfFloats_device);
        return 1;
    }

    // The device pointer is passed by value; this is all that is needed for a
    // kernel to use the buffer "by address". The kernel is defined as
    // kernelCopy (the original launch misspelled it as kernelCpy).
    kernelCopy<<<someBlocks,someThreads>>>(arrayOfFloats_device,arraySize);
    cudaError_t status=cudaGetLastError();   // launch-configuration errors
    if(status==cudaSuccess){
        kernelPtr<<<someBlocks,someThreads>>>(arrayOfFloats_device);
        status=cudaGetLastError();
    }

    // Launches are asynchronous: wait for completion so in-kernel faults are
    // reported and the device printf buffer is flushed before the process exits.
    if(status==cudaSuccess)
        status=cudaDeviceSynchronize();

    cudaFree(arrayOfFloats_device);
    // NOTE(review): arrayOfFloats_host is not released here because how
    // readFloatsFromFile allocates it is not visible.
    return status==cudaSuccess?0:1;
}
[Sorry for the verbosity of the code; I tried to keep it brief.] The arrayOfFloats is read-only, so no write-after-write, read-after-write, or similar memory hazards can occur. kernelCopy prints the content of the file; kernelPtr does not.
Is this related to arrayOfFloats_device being a host-side pointer variable that holds an address in the device memory space? How can I “copy” arrayOfFloats_device by address, saving the time that would otherwise be taken by an explicit copy?