Ok, so I got it to work…
It occurred to me that I was thinking too much like an object-oriented programmer.
So I simplified it to a huge array in global memory to hold all the results of all the threads.
Each thread writes into this array at an offset equal to the maximum result size times its own thread number.
// Byte sizes of the device buffers.
// Held in size_t, the type cudaMalloc/cudaMemcpy take for byte counts; in
// unsigned int a total above UINT_MAX bytes would silently wrap and the
// buffers would be allocated too small.
size_t memoryFor1Result = sizeof(int) * mapSize;          // one thread's result slice (mapSize ints)
size_t memoryForResults = memoryFor1Result * NUMTHREADS;  // NUMTHREADS slices, back to back
size_t memoryForLenghts = sizeof(int) * NUMTHREADS;       // one int per thread
/* ---------------------------------------------------------- */
/* Device buffers                                             */
/* ---------------------------------------------------------- */
// Declare the three device pointers first, then allocate each one.
// checkCudaErrors wraps every call so a failed allocation is reported.
int* MapWalkable;
int* resultsArray;
int* lenghtsArray;

// memoryForWalkable bytes (presumably the map the kernel reads -- confirm at the launch site).
checkCudaErrors(cudaMalloc(reinterpret_cast<void**>(&MapWalkable), memoryForWalkable));
// memoryForResults bytes: the one big array holding all threads' results.
checkCudaErrors(cudaMalloc(reinterpret_cast<void**>(&resultsArray), memoryForResults));
// memoryForLenghts bytes: the per-thread lengths array.
checkCudaErrors(cudaMalloc(reinterpret_cast<void**>(&lenghtsArray), memoryForLenghts));
Kernel
// Fills this thread's private slice of OUTresultsArray and its entry in
// OUTlenghtsArray.
//
//   mapWalkable      input array; the first 100*100 ints are read
//   OUTresultsArray  output; thread t writes ints [t*mapSize, (t+1)*mapSize)
//   OUTlenghtsArray  output; thread t writes element t
//
// The slice is selected by threadIdx.x alone, so threads with the same
// threadIdx.x in different blocks would write the same locations.
// NOTE(review): presumably launched with a single block -- confirm at the launch site.
__global__ void kernel( int* mapWalkable, int* OUTresultsArray, int* OUTlenghtsArray )
{
    const unsigned int mapSize = 100 * 100;
    const unsigned int IamThreadNumber = threadIdx.x;

    // Start of this thread's slice inside the shared results buffer.
    int* const myResultsArray = OUTresultsArray + mapSize * IamThreadNumber;

    OUTlenghtsArray[IamThreadNumber] = IamThreadNumber;

    for( unsigned int j = 0; j != mapSize; ++j )
    {
        myResultsArray[j] = mapWalkable[j] * IamThreadNumber;
    }
}
And then I copy the results back to the host:
// Copy every thread's output from the device buffers into the host-side
// OUTresults entries, one entry per thread.
// The loop must advance j, its own counter; incrementing any other variable
// leaves j at 0 and the loop never finishes.
for( unsigned int j = 0; j < NUMTHREADS; ++j )
{
    // Thread j's slice begins mapSize * j ints into the device results buffer.
    // This is address arithmetic only; the device pointer is never dereferenced on the host.
    int* resultsArrayAtOffset = resultsArray + mapSize * j;
    // NOTE(review): assumes adressToResults points to at least memoryFor1Result
    // bytes of host memory -- confirm where OUTresults is set up.
    checkCudaErrors(cudaMemcpy( OUTresults[j].adressToResults, resultsArrayAtOffset, memoryFor1Result,
                                cudaMemcpyDeviceToHost));

    // Thread j's single entry in the device lengths array.
    // NOTE(review): copies sizeof(int) bytes, so pathLength is assumed to be an int -- confirm.
    int* lenghtsArrayAtOffset = lenghtsArray + j;
    checkCudaErrors(cudaMemcpy( &(OUTresults[j].pathLength), lenghtsArrayAtOffset, sizeof(int),
                                cudaMemcpyDeviceToHost));
}
And that finally gets all the correct data out of the kernel and into host memory.
Thanks for the help with the original question, even though I eventually did it in a different way.
For those wondering, this code:
// Stand-in workload: each of the mapSize entries of this thread's result slice
// gets the matching mapWalkable entry multiplied by the thread number.
for( int j = 0; j < mapSize; ++j )
{
myResultsArray[j] = mapWalkable[j] * IamThreadNumber;
}
// This thread's entry in the lengths array is simply set to its thread number.
OUTlenghtsArray[IamThreadNumber] = IamThreadNumber;
is not the actual pathing calculation. The actual kernel is way longer and terribly slow, so my next order of business will be to copy the structs generated from the info in mapWalkable into shared memory, so the kernel can loop over them and change them faster. Wish me luck…