Problem copying data from device to host

Hi all,

I am facing a strange problem, possibly due to a lack of heap storage. I have tried freeing memory and increasing the heap commit size in Visual Studio 8.0, but to no avail. The last few lines of the code try to copy data from splitPos_dList to a host array. When I do that with a stack-allocated array, it works just fine, but with an array allocated with new the program crashes. Any help? I think I am doing something wrong.

Thank You.

// Host staging tables (suffix D) and their device mirrors (suffix Dd): one entry per active node.
	float ** sorted1stD, ** sorted2ndD, ** sorted3rdD;
	float ** sorted1stDd, ** sorted2ndDd, ** sorted3rdDd;
	float * bounds_dList;
	int * numElements, *numElementsd;

	// The three device tables store numActivelist *pointers* (float*), not floats,
	// so they must be sized with sizeof(float*). Using sizeof(float) only happens to
	// work where pointers are 4 bytes; on a 64-bit build the tables would be half the
	// required size and the copies below would truncate them.
	CUDA_SAFE_CALL(cudaMalloc((void**)&sorted1stDd, numActivelist*sizeof(float*)));
	CUDA_SAFE_CALL(cudaMalloc((void**)&sorted2ndDd, numActivelist*sizeof(float*)));
	CUDA_SAFE_CALL(cudaMalloc((void**)&sorted3rdDd, numActivelist*sizeof(float*)));
	CUDA_SAFE_CALL(cudaMalloc((void**)&numElementsd, numActivelist*sizeof(int)));

	// Host-side staging arrays that are filled below and then uploaded.
	// NOTE(review): nothing in the visible code releases these with delete[] (or the
	// device buffers with cudaFree) — confirm cleanup happens later in the function.
	sorted1stD = new float*[numActivelist];
	sorted2ndD = new float*[numActivelist];
	sorted3rdD = new float*[numActivelist];
	numElements = new int[numActivelist];

	// Six floats of bounds per active node, written by the bounding-box kernel.
	CUDA_SAFE_CALL(cudaMalloc((void**)&bounds_dList,numActivelist*6*sizeof(float)));

	for(int i = 0; i < numActivelist; i++)
	{
		// Gather each node's per-dimension sorted vertex array pointer.
		// NOTE(review): presumably these are device pointers (trailing "d") — confirm.
		sorted1stD[i] = activeList[i].sortedvertexArray1stDd;
		sorted2ndD[i] = activeList[i].sortedvertexArray2ndDd;
		sorted3rdD[i] = activeList[i].sortedvertexArray3rdDd;

		numElements[i] = activeList[i].numElements;

		// Per-node device buffer for six bounds values; it is only allocated here,
		// not initialised.
		CUDA_SAFE_CALL(cudaMalloc((void**)&activeList[i].bounds_d,6*sizeof(float)));
	}

	// Upload the pointer tables (element type float*) and the element counts (int).
	CUDA_SAFE_CALL(cudaMemcpy(sorted1stDd, sorted1stD, numActivelist*sizeof(float*),
								cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMemcpy(sorted2ndDd, sorted2ndD, numActivelist*sizeof(float*),
								cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMemcpy(sorted3rdDd, sorted3rdD, numActivelist*sizeof(float*),
								cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMemcpy(numElementsd, numElements, numActivelist*sizeof(int),
								cudaMemcpyHostToDevice));

	// Fixed limits used to derive the launch shape.
	const int maxThreads = 128;
	const int maxBlocks = 128;
	const int blockSize = 128;

	// blocks  = 1 + numActivelist / blockSize
	// threads = min(numActivelist, maxThreads)
	int blocks = 1 + numActivelist / blockSize;
	int threads = (numActivelist < maxThreads) ? numActivelist : maxThreads;

	// Number of elements each thread has to process.
	uint num = findNumElementsPerThread(numActivelist);

	// NOTE(review): "blocks" is used as the block dimension and "threads" as the
	// grid dimension, which reads as if the two were swapped. The kernels are not
	// visible here, so confirm against their indexing before changing this.
	dim3 dimBlock(blocks, 1, 1);
	dim3 dimGrid(threads, 1, 1);

#ifdef _PRINT
	// Debug only: read back the first six bounds values before the kernel runs.
	// NOTE(review): no visible code has written bounds_dList at this point, so the
	// values printed here are whatever the fresh allocation contains.
	float *testResultList = new float[6];
	CUDA_SAFE_CALL(cudaMemcpy(testResultList, bounds_dList, 6*sizeof(float),
								cudaMemcpyDeviceToHost));

	printf("numActiveList : %d\n",numActivelist);
	printf("numElements in node 0 : %d\n",activeList[0].numElements);
	printf("Initially the bounds are :\n");

	for(const float *value = testResultList; value != testResultList + 6; ++value)
		printf("%f ",*value);

	printf("\n");
#endif //_PRINT

	// Calculate bounds for all nodes in the activelist ------------------------------
	// The kernel receives the three device pointer tables, the output bounds buffer
	// (6 floats per node), the node count, the per-node element counts, and the
	// blockSize / per-thread element count computed on the host.
	computeBoundingBoxParallelKernel<<<dimGrid, dimBlock>>>(sorted1stDd,
											sorted2ndDd, sorted3rdDd,
											bounds_dList,
											numActivelist, numElementsd, blockSize, num
											);
	// A kernel launch returns no status itself; an invalid launch configuration is
	// only reported through cudaGetLastError(), so check it explicitly.
	CUDA_SAFE_CALL(cudaGetLastError());

	// Device buffer that later receives each node's previous bounds (6 floats per node).
	float *oldBounds_dList;
	CUDA_SAFE_CALL(cudaMalloc((void**)&oldBounds_dList,numActivelist*6*sizeof(float)));

	

#ifdef _PRINT
	// Debug only: read back the first six bounds values after the bounding-box
	// kernel. The blocking cudaMemcpy also waits for the kernel to finish.
	CUDA_SAFE_CALL(cudaMemcpy(testResultList, bounds_dList, 6*sizeof(float),
								cudaMemcpyDeviceToHost));

	printf("Finally the bounds are :\n");

	for(const float *value = testResultList; value != testResultList + 6; ++value)
		printf("%f ",*value);

	printf("\n");
#endif // _PRINT

	// Split Large Nodes -------------------------------------------------------------
	// We copy the old bounds of the node which were used to split it in the last split
	// into oldBounds_dList.
	float *splitPos_dList;
	uint totalNumElements = 0;

	printf("numActivelist : %d\n",numActivelist);

	// Two floats of split output per active node.
	CUDA_SAFE_CALL(cudaMalloc((void**)&splitPos_dList, numActivelist*2*sizeof(float)));

	for(int i = 0; i<numActivelist; i++)
	{
		// Device-to-device copy of node i's six bounds into slot i of the flat list.
		// Pointer arithmetic only: the device pointer is never dereferenced on the host.
		CUDA_SAFE_CALL(cudaMemcpy(oldBounds_dList + 6*i, activeList[i].bounds_d,
										6*sizeof(float),cudaMemcpyDeviceToDevice));
		totalNumElements += activeList[i].numElements;
	}

	splitNodeKernel<<<dimGrid,dimBlock>>>(bounds_dList, oldBounds_dList, splitPos_dList,
											numActivelist, blockSize, num);
	// Kernel launches report configuration errors only via cudaGetLastError().
	CUDA_SAFE_CALL(cudaGetLastError());

	// Host buffer for the first two split values.
	// With _PRINT defined, testResultList is already declared in this scope and
	// points at a 6-float buffer, which is large enough to reuse; declaring it a
	// second time would not compile.
#ifndef _PRINT
	float *testResultList = new float[2];
#endif

	// The destination must be the buffer itself. Passing (void**)&testResultList
	// made cudaMemcpy write 2 floats over the pointer variable (and whatever
	// follows it on the stack) instead of into the allocated array.
	CUDA_SAFE_CALL(cudaMemcpy(testResultList, splitPos_dList, 2*sizeof(float),
								cudaMemcpyDeviceToHost));

	printf("split_dList[0] : %f split_dList[1] : %f\n",testResultList[0], testResultList[1]);

Oh, stupid me. Maybe I should sleep a little more. I was using the wrong cast for the destination pointer passed to cudaMemcpy: I passed (void**)&testResultList, the address of the pointer variable, instead of testResultList itself.
Works now!