Hello,
I have a std::vector-like class (limited to structures that can be allocated with a plain malloc) that can upload a copy of itself to the CUDA device and download the data back after any modification (on demand). My problem is that whenever the class has to reallocate its array, cudaMemcpy fails to download the data back.
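For reference, the relevant members of the class look roughly like this (a simplified sketch; exact types and access specifiers are guesses, and only the parts the two functions below touch are shown):

template<typename Type, unsigned int sizeOnStack>
class CuVector {
public:
    __device__ __host__ Type& push(const Type& value);
    bool updateFromClone();
    // ... constructors, upload, etc. omitted ...
private:
    Type stackData[sizeOnStack]; // small in-place buffer used before the first reallocation
    Type *data;                  // points either to stackData or to a heap allocation
    unsigned int usedSize;       // number of stored elements
    unsigned int allocSize;      // current capacity of data
    CuVector *clone;             // device-side copy of this object
};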
Here’s the implementation of both functions:
template<typename Type, unsigned int sizeOnStack>
// Adds an element to the CuVector
__device__ __host__ inline Type& CuVector<Type, sizeOnStack>::push(const Type& value){
    if (usedSize >= allocSize){
        allocSize *= 2;
        Type *newData = new Type[allocSize];
        if (newData == NULL) return(data[usedSize - 1]);
        for (int i = 0; i < usedSize; i++)
            newData[i] = data[i];
        // Free the old buffer only if it was heap-allocated (not the in-place stack buffer)
        if ((data != stackData) && data != NULL) delete[] data;
        data = newData;
        printf("allocated\n"); // just to confirm the kernel reached this line
    }
    data[usedSize] = value;
    usedSize++;
    return(data[usedSize - 1]);
}
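The reallocation branch gets hit from device code roughly like this (the kernel below is only an illustration, not my actual one):

__global__ void pushKernel(CuVector<int, 4> *vec, int count){
    // Single-threaded on purpose; pushing past the initial capacity
    // forces the device-side new in push()
    if (threadIdx.x == 0 && blockIdx.x == 0){
        for (int i = 0; i < count; i++)
            vec->push(i);
    }
}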
template<typename Type, unsigned int sizeOnStack>
// Loads data from the GPU clone (returns true if successful)
inline bool CuVector<Type, sizeOnStack>::updateFromClone(){
    if (clone == NULL) return(false);
    // Raw byte buffer on the host, large enough to hold the object header
    char cln[sizeof(CuVector<Type, sizeOnStack>)];
    CuVector *cpuClone = (CuVector*)cln;
    if (cudaMemcpy(cpuClone, clone, sizeof(CuVector<Type, sizeOnStack>), cudaMemcpyDeviceToHost) != cudaSuccess){
        std::cout << "Failed to load clone..." << std::endl;
        return(false);
    }
    Type *newData = NULL;
    // clone->stackData is only pointer arithmetic on the device address (no dereference),
    // so this checks whether the clone's data still lives in its in-place buffer
    if (cpuClone->data != clone->stackData){
        newData = new Type[cpuClone->allocSize];
        if (newData == NULL) return(false);
        if (cudaMemcpy(newData, cpuClone->data, sizeof(Type)*cpuClone->allocSize, cudaMemcpyDeviceToHost) != cudaSuccess){
            std::cout << "Failed to load clone data..." << std::endl; // I see this message if and only if push() above has reallocated the data
            std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
            return(false);
        }
    }
    if ((data != stackData) && data != NULL) delete[] data;
    usedSize = cpuClone->usedSize;
    allocSize = cpuClone->allocSize;
    if (newData == NULL){
        data = stackData;
        for (int i = 0; i < usedSize; i++)
            data[i] = cpuClone->stackData[i];
    }
    else data = newData;
    return(true);
}
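On the host side, the sequence that reproduces the failure is roughly this (simplified; uploadToClone() and getClone() are just placeholder names for whatever creates and exposes the device-side copy):

CuVector<int, 4> vec;                      // small on-stack capacity so push() has to reallocate
vec.uploadToClone();                       // create/refresh the device-side clone
pushKernel<<<1, 1>>>(vec.getClone(), 16);  // push well past sizeOnStack -> device-side new
cudaDeviceSynchronize();
if (!vec.updateFromClone())                // fails with "Failed to load clone data..."
    std::cout << "download failed" << std::endl;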
What might be the reason?
Do device-side “new” (called inside a kernel) and host-side “cudaMalloc” operate on different regions of GPU memory?