Getting Each Kernel to write to a separate list.

Hello, I have a problem with the CUDA implementation of a path finding algorithm I need to optimize.

The problem is that I input one array with pathnode data, and I need each thread to output a separate array with ‘baked’ data that I can copy back to the host.

  • If I allocate global memory for each 'result' array before launching the kernel, the kernel crashes (with no error for me to read).
  • If I make the kernel allocate room for its 'result' array in global memory, CUDA throws an "Access violation" when I try to copy it back to the host.

Here is a simplified version of what I am trying to do.
(For some reason, the forum will not allow me to edit a ‘code’ block in this post; see the post below for the fixed example code.)

I would really appreciate any help you can give me. I’ve been trying to figure it out on my own, but I am not very experienced with CUDA…

Oops, there are formatting mistakes in the main function, but the forum gives me an ‘unknown exception’ when I try to edit it…

Where is resultsArray declared and initialized?
You pass it as a parameter to the kernel and later use it in a cudaMemcpy: resultsArray[j].adressToResults.
I guess all the arrays inside that struct are initialized, right?

Sorry, I made a mistake when simplifying the code. resultsArray is allocated in device global memory; it is uninitialized when passed to the kernel, because the kernel is supposed to fill it.

This is the proper simplified code:

#include <cstdlib>        // malloc, rand
#include <cuda_runtime.h>
#include <helper_cuda.h>  // checkCudaErrors, findCudaDevice, getLastCudaError (CUDA samples)

#define NUMTHREADS 32     // example value, chosen arbitrarily for this post

typedef struct
{
    int pathLength;
    int* adressToResults;
} cPathResult;

__global__ void kernel( int* mapWalkable, cPathResult* OUTresultsArray )
{
    unsigned int mapSize = 100 * 100;
    unsigned int IamThreadNumber = threadIdx.x;

    // device-side malloc draws from the device heap, which lives in global memory
    int* result = (int*) malloc(sizeof(int) * mapSize);

    for( unsigned int i = 0; i < mapSize; ++i )
    {
        result[i] = mapWalkable[i] * IamThreadNumber;
    }

    // the result should stay stored there after the kernel finishes
    OUTresultsArray[IamThreadNumber].adressToResults = result;
    OUTresultsArray[IamThreadNumber].pathLength = IamThreadNumber;
}

extern "C" int main()
{
    int* INmap;
    cPathResult* OUTresults;

    unsigned int mapSize = 100 * 100;
    INmap = (int*) malloc(sizeof(int) * mapSize);
    OUTresults = (cPathResult*) malloc(sizeof(cPathResult) * NUMTHREADS);

    // actually loaded from a file by host, but this is for the example
    for( int i = 0; i < mapSize ; ++i )
    {
        INmap[i] = rand();
    }

    for( int j = 0; j < NUMTHREADS; ++j )
    {
        // allocate room on the host where each thread's results will go
        OUTresults[j].adressToResults = (int*) malloc(sizeof(int) * mapSize);
    }

    const char* noArgs = "nothing";
    findCudaDevice(0, &noArgs);

    // allocate room on the device for pointer to the result of each thread.
    cPathResult* resultsArray;
    checkCudaErrors(cudaMalloc((void **) &resultsArray, sizeof(cPathResult) * NUMTHREADS));

    int* MapWalkable;
    checkCudaErrors(cudaMalloc((void **) &MapWalkable, sizeof(int) * mapSize));
    checkCudaErrors(cudaMemcpy(MapWalkable, INmap, sizeof(int) * mapSize, 
        cudaMemcpyHostToDevice));
        
    dim3 grid(1, 1, 1);
    dim3 threads(NUMTHREADS, 1, 1);
    // execute the kernel
    kernel<<< grid, threads >>>( MapWalkable, resultsArray);
    getLastCudaError("Kernel execution failed");

    for( int j = 0; j < NUMTHREADS; ++j )
    {
        // this line throws the "Access violation"; is that because the kernel's memory is automagically freed when the kernel finishes or something?
        checkCudaErrors(cudaMemcpy( (OUTresults[j].adressToResults), (resultsArray[j].adressToResults), sizeof(int) * mapSize,
                                     cudaMemcpyDeviceToHost));
        checkCudaErrors(cudaMemcpy( &(OUTresults[j].pathLength), &(resultsArray[j].pathLength), sizeof(int),
                                    cudaMemcpyDeviceToHost));
        checkCudaErrors(cudaFree(resultsArray[j].adressToResults));
    }
    checkCudaErrors(cudaMemcpy(OUTresults, resultsArray, sizeof(cPathResult) * NUMTHREADS,
                               cudaMemcpyDeviceToHost));

    checkCudaErrors(cudaFree(MapWalkable));
    checkCudaErrors(cudaFree(resultsArray));

    return 0;
}
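
One caveat with the in-kernel malloc in the kernel above: device-side malloc allocates from a device heap with a fixed default size (8 MB), and it returns NULL on failure, which the kernel never checks. With NUMTHREADS threads each allocating sizeof(int) * mapSize bytes, that heap can run out as NUMTHREADS grows. If the in-kernel malloc is kept, the heap can be enlarged before the first kernel launch, along these lines (an untested sketch, sized to this example's own numbers):

// enlarge the device heap so NUMTHREADS in-kernel mallocs of
// sizeof(int) * mapSize bytes each have room to succeed
// (factor of 2 is arbitrary slack for allocator overhead)
checkCudaErrors(cudaDeviceSetLimit(cudaLimitMallocHeapSize,
                                   (size_t) NUMTHREADS * mapSize * sizeof(int) * 2));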

I’ve not worked with malloc inside a kernel, but I’ll try to help you.
You allocated resultsArray in device memory. Then each thread allocates another array of ints.
So both levels of pointers are in device memory, and you are trying to dereference device memory from the host:

cudaMemcpy( (OUTresults[j].adressToResults), (resultsArray[j].adressToResults), sizeof(int) * mapSize,
                                     cudaMemcpyDeviceToHost)

You can’t dereference resultsArray[j] on the host if resultsArray was allocated on the device.
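
I haven’t tried this myself, but one way to keep both pointer levels visible to the host might be to build the table on the host with cudaMalloc and copy it down before the launch; a rough sketch, reusing your names (untested):

cPathResult* hostTable = (cPathResult*) malloc(sizeof(cPathResult) * NUMTHREADS);
for( int j = 0; j < NUMTHREADS; ++j )
{
    // allocate each per-thread result buffer from the host side,
    // so the host keeps its own copy of every device pointer
    checkCudaErrors(cudaMalloc((void**) &(hostTable[j].adressToResults),
                               sizeof(int) * mapSize));
    hostTable[j].pathLength = 0;
}
// copy the table, device pointers included, into resultsArray on the device
checkCudaErrors(cudaMemcpy(resultsArray, hostTable, sizeof(cPathResult) * NUMTHREADS,
                           cudaMemcpyHostToDevice));

Then the kernel just fills the buffers instead of calling malloc, and after the launch hostTable[j].adressToResults is still a valid device pointer to pass to cudaMemcpy and cudaFree.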

Ah, thank you.
I feel stupid for not noticing that.

But if I copy resultsArray into host memory first and then read the pointers from that host copy, the host should be able to find the correct data on the device. I’ll try that now.
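
Something like this is what I mean (an untested sketch):

// first copy the whole table of structs back to the host...
cPathResult* hostTable = (cPathResult*) malloc(sizeof(cPathResult) * NUMTHREADS);
checkCudaErrors(cudaMemcpy(hostTable, resultsArray, sizeof(cPathResult) * NUMTHREADS,
                           cudaMemcpyDeviceToHost));
// ...then read the device pointers from the host copy instead of
// dereferencing resultsArray[j] directly
for( int j = 0; j < NUMTHREADS; ++j )
{
    checkCudaErrors(cudaMemcpy(OUTresults[j].adressToResults, hostTable[j].adressToResults,
                               sizeof(int) * mapSize, cudaMemcpyDeviceToHost));
    OUTresults[j].pathLength = hostTable[j].pathLength;
}
free(hostTable);

(Though if I read the programming guide right, memory from an in-kernel malloc cannot be passed to runtime calls like cudaMemcpy at all, so the buffers would also have to come from cudaMalloc, as in the sketch above.)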

Ok, so I got it to work…

It occurred to me that I was thinking too much as an Object Oriented programmer.

So I simplified it to one huge array in global memory that holds the results of all the threads.
Each thread writes into that array at an offset of the maximum result size times its own thread number.

unsigned int mapSize = 100 * 100;
unsigned int memoryForWalkable = sizeof(int) * mapSize;
unsigned int memoryFor1Result = sizeof(int) * mapSize;
unsigned int memoryForResults = memoryFor1Result * NUMTHREADS;
unsigned int memoryForLenghts = sizeof(int) * NUMTHREADS;

/************************************************************/
/* allocate device memory                                   */
/************************************************************/
int* MapWalkable;
checkCudaErrors(cudaMalloc((void **) &MapWalkable, memoryForWalkable));

int* resultsArray;
checkCudaErrors(cudaMalloc((void**) &resultsArray, memoryForResults));

int* lenghtsArray;
checkCudaErrors(cudaMalloc((void**) &lenghtsArray, memoryForLenghts));

Kernel

__global__ void kernel( int* mapWalkable, int* OUTresultsArray, int* OUTlenghtsArray )
{
    unsigned int mapSize = 100 * 100;
    unsigned int IamThreadNumber = threadIdx.x;

    // point each thread at its own slice of the big results array
    int* myResultsArray = OUTresultsArray;
    myResultsArray += mapSize * IamThreadNumber;

    for( int j = 0; j < mapSize; ++j )
    {
        myResultsArray[j] = mapWalkable[j] * IamThreadNumber;
    }
    OUTlenghtsArray[IamThreadNumber] = IamThreadNumber;
}

And then copying to the host

for( unsigned int j = 0; j < NUMTHREADS; ++j )
{
    int* resultsArrayAtOffset = &(resultsArray[mapSize * j]);
    checkCudaErrors(cudaMemcpy( OUTresults[j].adressToResults, resultsArrayAtOffset, memoryFor1Result, 
                                cudaMemcpyDeviceToHost));
    int* lenghtsArrayAtOffset = &(lenghtsArray[j]);
    checkCudaErrors(cudaMemcpy( &(OUTresults[j].pathLength), lenghtsArrayAtOffset, sizeof(int), 
                                cudaMemcpyDeviceToHost));
}

And that finally gets all the correct data out of the kernel and into host memory.
Thanks for the help with the original question, even though I eventually did it in a different way.

For those wondering, this code:

for( int j = 0; j < mapSize; ++j )
{
    myResultsArray[j] = mapWalkable[j] * IamThreadNumber;
}
OUTlenghtsArray[IamThreadNumber] = IamThreadNumber;

is not the actual pathing calculation. The actual kernel is far longer and terribly slow, so my next order of business will be to copy the structs generated from the info in mapWalkable into shared memory, so the kernel can loop over them and modify them faster. Wish me luck…
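
In case it is useful to anyone later, here is a minimal sketch of the staging I have in mind (not the real pathing kernel; TILE is a made-up chunk size). The block cooperatively loads a chunk of mapWalkable into shared memory, and every thread then reads the chunk from there instead of from global memory:

#define TILE 1024 // hypothetical chunk size; TILE * sizeof(int) must fit in shared memory

__global__ void kernelShared( int* mapWalkable, int* OUTresultsArray, int* OUTlenghtsArray )
{
    unsigned int mapSize = 100 * 100;
    unsigned int IamThreadNumber = threadIdx.x;

    __shared__ int chunk[TILE];

    int* myResultsArray = OUTresultsArray;
    myResultsArray += mapSize * IamThreadNumber;

    for( unsigned int base = 0; base < mapSize; base += TILE )
    {
        // every thread loads a strided slice of the current chunk
        for( unsigned int i = IamThreadNumber; i < TILE && base + i < mapSize; i += blockDim.x )
        {
            chunk[i] = mapWalkable[base + i];
        }
        __syncthreads(); // wait until the whole chunk is in shared memory

        // each thread now works on the chunk from fast shared memory
        for( unsigned int i = 0; i < TILE && base + i < mapSize; ++i )
        {
            myResultsArray[base + i] = chunk[i] * IamThreadNumber;
        }
        __syncthreads(); // make sure nobody overwrites the chunk early
    }
    OUTlenghtsArray[IamThreadNumber] = IamThreadNumber;
}

The whole 100 × 100 map is 40 KB of ints, which is close to the 48 KB of shared memory available per block on many GPUs, hence the chunking.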