Hi all,
I am facing a strange problem, possible due to lack of heap storage. I have tried freeing memory and increase heap commit size in Visual Studio 8.0, but to no avail. The last few lines of the code try to copy data from split_dList to a host array. When I do that with a stack allocated array, it works just fine. But new doesn’t seem to work and the program crashes. Any help ? I think I am doing something wrong.
Thank You.
float ** sorted1stD, ** sorted2ndD, ** sorted3rdD;
float ** sorted1stDd, ** sorted2ndDd, ** sorted3rdDd;
float * bounds_dList;
int * numElements, *numElementsd;
CUDA_SAFE_CALL(cudaMalloc((void**)&sorted1stDd, numActivelist*sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void**)&sorted2ndDd, numActivelist*sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void**)&sorted3rdDd, numActivelist*sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void**)&numElementsd, numActivelist*sizeof(int)));
sorted1stD = new float*[numActivelist];
sorted2ndD = new float*[numActivelist];
sorted3rdD = new float*[numActivelist];
numElements = new int[numActivelist];
CUDA_SAFE_CALL(cudaMalloc((void**)&bounds_dList,numActivelist*6*sizeof(float)));
for(int i = 0; i < numActivelist; i++)
{
sorted1stD[i] = activeList[i].sortedvertexArray1stDd;
sorted2ndD[i] = activeList[i].sortedvertexArray2ndDd;
sorted3rdD[i] = activeList[i].sortedvertexArray3rdDd;
numElements[i] = activeList[i].numElements;
CUDA_SAFE_CALL(cudaMalloc((void**)&activeList[i].bounds_d,6*sizeof(float)));
}
CUDA_SAFE_CALL(cudaMemcpy(sorted1stDd, sorted1stD, numActivelist*sizeof(float),
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(sorted2ndDd, sorted2ndD, numActivelist*sizeof(float),
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(sorted3rdDd, sorted3rdD, numActivelist*sizeof(float),
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(numElementsd, numElements, numActivelist*sizeof(float),
cudaMemcpyHostToDevice));
const int maxThreads = 128;
const int maxBlocks = 128;
const int blockSize = 128;
int threads = 1, blocks = 1;
blocks += numActivelist / blockSize;
threads = (numActivelist < maxThreads ? numActivelist : maxThreads);
// Find the number of elements to be processed by each thread
uint num = findNumElementsPerThread(numActivelist);
dim3 dimBlock(blocks,1,1);
dim3 dimGrid(threads,1,1);
#ifdef _PRINT
float *testResultList;
testResultList = new float[6];
CUDA_SAFE_CALL(cudaMemcpy(testResultList, bounds_dList, 6*sizeof(float),
cudaMemcpyDeviceToHost));
printf("numActiveList : %d\n",numActivelist);
printf("numElements in node 0 : %d\n",activeList[0].numElements);
printf("Initially the bounds are :\n");
for(int i = 0; i < 6; i++)
{
printf("%f ",testResultList[i]);
}
printf("\n");
#endif //_PRINT
// Calculate bounds for all nodes in the activelist ------------------------------
computeBoundingBoxParallelKernel<<<dimGrid, dimBlock>>>(sorted1stDd,
sorted2ndDd, sorted3rdDd,
bounds_dList,
numActivelist, numElementsd, blockSize, num
);
float *oldBounds_dList;
CUDA_SAFE_CALL(cudaMalloc((void**)&oldBounds_dList,numActivelist*6*sizeof(float)));
#ifdef _PRINT
CUDA_SAFE_CALL(cudaMemcpy(testResultList, bounds_dList, 6*sizeof(float),
cudaMemcpyDeviceToHost));
printf("Finally the bounds are :\n");
for(int i = 0; i < 6; i++)
{
printf("%f ",testResultList[i]);
}
printf("\n");
#endif // _PRINT
// Split Large Nodes -------------------------------------------------------------
// We copy the old bounds of the node which were used to split it in the last split
// into oldBounds_List.
float *splitPos_dList;
uint totalNumElements = 0;
printf("numActivelist : %d\n",numActivelist);
CUDA_SAFE_CALL(cudaMalloc((void**)&splitPos_dList, numActivelist*2*sizeof(float)));
for(int i = 0; i<numActivelist; i++)
{
CUDA_SAFE_CALL(cudaMemcpy(&oldBounds_dList[6*i], activeList[i].bounds_d,
6*sizeof(float),cudaMemcpyDeviceToDevice));
totalNumElements += activeList[i].numElements;
}
splitNodeKernel<<<dimGrid,dimBlock>>>(bounds_dList, oldBounds_dList, splitPos_dList,
numActivelist, blockSize, num);
float *testResultList = new float[2];
CUDA_SAFE_CALL(cudaMemcpy((void**)&testResultList, splitPos_dList, 2*sizeof(float)
,cudaMemcpyDeviceToHost));
printf("split_dList[0] : %f split_dList[1] : %f\n",*testResultList, *(testResultList+1));