Mapping OpenAcc code to CUDA kernels

Hi everyone,
I would like to know what exactly the CUDA kernel looks like for the following simple OpenACC lines. There are some discussions regarding this topic (for example [1]), but none of them explains it using a simple example.

// A train record: the first `num_stations` entries of `stations` are valid.
// Per the thread's problem statement, each station value appears twice within
// the first num_stations entries and the remainder of the buffer is -1.
struct train
{
    int num_stations;
    int *stations = nullptr;
};

//#pragma acc routine vector
// Copy elements B[M-1 .. N-1] into A[M-1 .. N-1], i.e. the 1-based
// inclusive range [M, N].
void copy(int *A, int *B, int M, int N)
{
    //#pragma acc loop vector
    int idx = M - 1;
    while (idx <= N - 1)
    {
        A[idx] = B[idx];
        ++idx;
    }
}

// OpenACC version of the per-train removal: the outer loop is distributed
// across 5 gangs (CUDA blocks) and the inner `loop vector` across 4 vector
// lanes (CUDA threads).  private(...) gives each gang its own copies of
// remove/train1/length1, which the compiler reports it placed in CUDA
// shared memory.
void parallel(train *trains)
{
    int remove[2], train1[40], length1;

    // NOTE(review): OpenACC array sections are [start:length], so
    // trains[1:10] names elements 1..10 while the loop indexes 0..9 —
    // likely intended to be trains[0:10]; confirm.
    #pragma acc parallel loop gang num_gangs(5) vector vector_length(4) private(remove, train1, length1) copy(trains[1:10])
    for(int i = 0; i< 10; i++)
    {
        // Stage this train's stations into the gang-private buffer.
        copy(train1, trains[i].stations, 1, 40);

        length1 = trains[i].num_stations;

        // Value to eliminate: the last valid station; blank it out first.
        remove[0] = train1[length1-1];

        train1[length1 -1] = -1;

        // Search for the duplicate across the vector lanes.  The `break`
        // present in the serial version is commented out so the loop stays
        // parallelizable.
        #pragma acc loop vector
        for(int j = 0; j < length1-1; j++)
        {
            if(train1[j] == remove[0])
            {
                remove[1] = j;
                train1[j] = -1;
                //break;
            }
        }

        length1 = length1 - 1;

        // Write the modified buffer back to the train.
        copy(trains[i].stations, train1, 1, 40);
    }
}

OpenACC compiler output says it launched

  1. 5 gangs(blockIdx.x), 4 vectors(threadIdx.x) (total 20 threads).
  2. Used shared memory for train1 and remove

I am unable to come up with an algorithm which satisfies the above 2 constraints. Can someone please show me how it is possible…?

Algorithm: I have an array of objects(train), each train contains an array(stations) filled with integers repeating twice till length(num_of_stations) and the rest of the positions are filled with -1, our task is to make the element at num_of_stations and its duplicate -1.

compilation command and its output:

nvc++ -acc -Minfo=accel removing_an_order.cu -o removing_an_order
copy(int *, int *, int, int):
     18, Generating implicit acc routine seq
         Generating acc routine seq
         Generating NVIDIA GPU code
NVC++-W-1042-acc loop vector clause ignored because of inner vector loop. (removing_an_order.cu: 58)
parallel(train *):
     58, Generating copy(trains[1:10]) [if not already present]
         Generating NVIDIA GPU code
         61, #pragma acc loop gang(5) /* blockIdx.x */
         72, #pragma acc loop vector(4) /* threadIdx.x */
     58, CUDA shared memory used for remove,train1
     72, Loop is parallelizable
NVC++/x86-64 Linux 22.9-0: compilation completed with warnings

Complete code comparing the openacc output with serial output simultaneously:

#include <bits/stdc++.h>
#include <random>
#include <cuda.h>

#include "openacc.h"

using namespace std;

#define cMM cudaMallocManaged
// A train record: the first `num_stations` entries of `stations` are valid;
// the rest of the buffer is padded with -1.
struct train
{
    int num_stations;
    int *stations = nullptr;
};

//#pragma acc routine vector
// Copy elements B[M-1 .. N-1] into A[M-1 .. N-1] (1-based range [M, N]).
void copy(int *A, int *B, int M, int N)
{
    //#pragma acc loop vector
    int pos = M - 1;
    while (pos <= N - 1)
    {
        A[pos] = B[pos];
        ++pos;
    }
}

// CPU reference implementation: for each of the 10 trains, blank out the
// last valid station value and its earlier duplicate (both become -1).
void serial(train *trains)
{
    int remove[2], train1[40], length1;

    for (int i = 0; i < 10; ++i)
    {
        // Stage this train's stations into a local working buffer.
        copy(train1, trains[i].stations, 1, 40);
        length1 = trains[i].num_stations;

        // Target value is the last valid station; blank it immediately.
        remove[0] = train1[length1 - 1];
        train1[length1 - 1] = -1;

        // Scan for the target and blank the first occurrence found.
        for (int j = 0; j < length1 - 1; ++j)
        {
            if (train1[j] == remove[0])
            {
                remove[1] = j;
                train1[j] = -1;
                break;
            }
        }

        length1 -= 1;

        // Write the modified buffer back to the train.
        copy(trains[i].stations, train1, 1, 40);
    }
}

// OpenACC version of serial(): the outer loop runs across 5 gangs (CUDA
// blocks), the inner `loop vector` across 4 vector lanes (CUDA threads).
// private(...) makes remove/train1/length1 per-gang; the compiler output
// above reports it placed them in CUDA shared memory.
void parallel(train *trains)
{
    int remove[2], train1[40], length1;

    // NOTE(review): OpenACC array sections are [start:length], so
    // trains[1:10] names elements 1..10 while the loop indexes 0..9 —
    // likely intended to be trains[0:10]; confirm.
    #pragma acc parallel loop gang num_gangs(5) vector vector_length(4) private(remove, train1, length1) copy(trains[1:10])
    for(int i = 0; i< 10; i++)
    {
        // Stage this train's stations into the gang-private buffer.
        copy(train1, trains[i].stations, 1, 40);

        length1 = trains[i].num_stations;

        // Target value is the last valid station; blank it first.
        remove[0] = train1[length1-1];

        train1[length1 -1] = -1;

        // The serial version's `break` is commented out here so the loop
        // remains parallelizable across the vector lanes.
        #pragma acc loop vector
        for(int j = 0; j < length1-1; j++)
        {
            if(train1[j] == remove[0])
            {
                remove[1] = j;
                train1[j] = -1;
                //break;
            }
        }

        length1 = length1 - 1;

        // Write the modified buffer back.
        copy(trains[i].stations, train1, 1, 40);
    }
}

// Builds two identical sets of trains, runs the serial and OpenACC versions
// on one each, and reports any mismatching station values.
int main()
{
    // const so `temp` below is a proper constant-size array, not a VLA.
    const int N = 16, num_of_trains = 10;

    struct train *trains, *trains2;

    cMM(&trains, num_of_trains*sizeof(train));

    cMM(&trains2, num_of_trains*sizeof(train));

    // 1..N twice, back to back; shuffled per-train below.
    int temp[2*N];

    iota(temp, temp + N, 1);
    iota(temp+N, temp + 2*N, 1);

    for(int i = 0; i<num_of_trains; i++)
    {
        trains[i].num_stations = 2*N;
        // BUG FIX: allocate the full 4*N-int buffer.  copy() writes indices
        // [0, 2*N) and fill() writes [2*N, 4*N); the original sizeof(int)
        // allocation was a single-int buffer and every write past it was out
        // of bounds (masked only by managed-memory page granularity).
        cMM(&trains[i].stations, 4*N*sizeof(int));
        copy(trains[i].stations, temp, 1, 2*N);
        fill(trains[i].stations+2*N, trains[i].stations+4*N , -1);
        shuffle(trains[i].stations, trains[i].stations + 2*N, default_random_engine(i));

        trains2[i].num_stations = 2*N;
        cMM(&trains2[i].stations, 4*N*sizeof(int));
        copy(trains2[i].stations, temp, 1, 2*N);
        fill(trains2[i].stations+2*N, trains2[i].stations+4*N , -1);
        // Same seed as trains[i] so both sets start identical.
        shuffle(trains2[i].stations, trains2[i].stations + 2*N, default_random_engine(i));
    }

    serial(trains);

    cudaError_t err = cudaGetLastError();

    cout << "error =\t" << err << ",\t" << cudaGetErrorName(err) << ",\t" << cudaGetErrorString(err) << "\n";

    parallel(trains2);

    err = cudaGetLastError();

    cout << "error =\t" << err << ",\t" << cudaGetErrorName(err) << ",\t" << cudaGetErrorString(err) << "\n";

    // Element-wise comparison of serial vs OpenACC results.
    for(int i = 0; i < num_of_trains; i++)
    {
        for(int j = 0; j<2*N; j++)
        {
            if(trains[i].stations[j]!=trains2[i].stations[j])
            {
                cout << "not equal\t";
            }
        }
    }
    return 0;
}

Hi nmnethaji8,

Apologies, but I’m not quite clear what you’re asking. Are you wanting to rewrite the OpenACC compute region into a CUDA kernel? Or do you just want to understand how the code gets translated into CUDA?

We translate the code into LLVM which then gets fed into NVVM to generate the device code. You can look at this code by adding the flag “-gpu=keep” which will save a “.gpu” file which you can inspect.

While not supported any longer, we do still ship our older CUDA device code generation as well which you can access via the flag “-gpu=keep,nollvm”. Now the generated CUDA is very low level so may be hard to read, (though it’s easier than reading LLVM), but might give you some idea how the code is translated.

  1. 5 gangs(blockIdx.x), 4 vectors(threadIdx.x) (total 20 threads).

This would be the CUDA kernel’s launch configuration, something like:

cudaKernel<<<5,4>>>(args)

  1. Used shared memory for train1 and remove

The compiler will generate a kernel using dynamic shared memory (which size is set as the third argument in the chevron syntax), but you should be able to simply declare a fixed size shared array in the kernel to get the same thing.

-Mat

Dear @MatColgrove,
I want to rewrite the OpenACC compute region into a CUDA kernel. This is not the first time someone asked a similar question. It would be nice to see the CUDA kernel definition for the corresponding OpenACC code. Yes, I know the kernel config for the kernel.

If you feel my request makes sense and post the kernel definition, my follow-up question would be what does the device function for the copy function look like if we uncomment the compiler directives…?

// Copy B[M-1 .. N-1] into A (the 1-based range [M, N]).
// `acc routine vector` makes this callable from device code, with the
// `loop vector` below spread across the caller's vector lanes (threads).
#pragma acc routine vector
void copy(int *A, int *B, int M, int N)
{
    #pragma acc loop vector
    for(int i=M-1; i<=N-1; i++)
    {
        A[i]=B[i];
    }
}

I hope my example clears some ambiguity regarding OpenACC to CUDA mapping.

Again, you can use the compiler flag “-gpu=nollvm,keep” to see the compiler generated CUDA C code, but this is highly optimized and low level so not the best reference if you’re starting out to learn CUDA.

Better to start with your CPU code (i.e. no OpenACC) with the first step being to create a basic kernel from the body of the “i” loop. Simplest to then assign the number of blocks equal to the loop trip count (i.e. 10) and the number of threads to 1. Be sure to move the declaration of length1, remove and train1 into the body of the kernel. You’ll also need to use cudaMalloc and cudaMemCopy to create and copy the train array on the device.

To create a device callable version of “copy”, simply add the "__device__ " attribute to the definition.

Once you get this working, the next step would be to add multiple threads. Add the “shared” attribute to the “train1” array and guard the calls copy so only one thread calls it, and add a call to syncthreads after the guard so the other threads wait. Set the inner for loop’s initial “j” to the threadIdx.x and have each iteration stride by the blockDim.x.

The final iteration would be to make the for loop in “copy” run across threads as well. Remove the single thread guard and add the same method as the “j” loop to have each thread process only some of the iterations. In your OpenACC code, this would be equivalent to making “copy” a vector routine and adding a “loop vector” on the for loop.

-Mat

Dear @MatColgrove, sorry for the delayed response. Here’s the code doing what you suggested. I am stuck at parallelising the copy function.

#include <bits/stdc++.h>
#include <random>
#include <cuda.h>

#include "openacc.h"

using namespace std;

#define cMM cudaMallocManaged
// A train record: the first `num_stations` entries of `stations` are valid;
// the rest of the buffer is padded with -1.
struct train
{
    int num_stations;
    int *stations = nullptr;
};

//#pragma acc routine vector
// Copy B[M-1 .. N-1] into A (the 1-based range [M, N]).  Compiled for both
// host (used by serial() and main()) and device (called from kernel(),
// there behind a threadIdx.x == 0 guard, so it runs single-threaded).
__device__ __host__ void copy(int *A, int *B, int M, int N)
{
    //#pragma acc loop vector
    for (int i = M - 1; i <= N - 1; i++)
    {
        A[i] = B[i];
    }
}

// CPU reference implementation: for each of the 10 trains, blank out the
// last valid station value and its earlier duplicate (both become -1).
void serial(train *trains)
{
    int remove[2], train1[40], length1;

    for (int i = 0; i < 10; ++i)
    {
        // Stage this train's stations into a local working buffer.
        copy(train1, trains[i].stations, 1, 40);
        length1 = trains[i].num_stations;

        // Target value is the last valid station; blank it immediately.
        remove[0] = train1[length1 - 1];
        train1[length1 - 1] = -1;

        // Scan for the target and blank the first occurrence found.
        for (int j = 0; j < length1 - 1; ++j)
        {
            if (train1[j] == remove[0])
            {
                remove[1] = j;
                train1[j] = -1;
                break;
            }
        }

        length1 -= 1;

        // Write the modified buffer back to the train.
        copy(trains[i].stations, train1, 1, 40);
    }
}

// Hand-written CUDA equivalent of the OpenACC parallel region, launched as
// kernel<<<5, 4>>>: blockIdx.x plays the gang role, threadIdx.x the vector
// role.
__global__ void kernel(train *trains)
{
    // The OpenACC gang-private arrays become block-shared memory here,
    // matching the compiler message "CUDA shared memory used for
    // remove,train1".
    __shared__ int remove[2], train1[40], length1;

    // Block-stride over the 10 trains: block b handles i = b, b+gridDim.x, ...
    for (int i = blockIdx.x; i < 10; i += gridDim.x)
    {
        // Sequential prologue: one thread per block stages the data.
        if (threadIdx.x == 0)
        {
            copy(train1, trains[i].stations, 1, 40);

            length1 = trains[i].num_stations;

            // Target value is the last valid station; blank it first.
            remove[0] = train1[length1 - 1];

            train1[length1 - 1] = -1;
        }

        // Make thread 0's shared-memory writes visible to the whole block
        // before any thread reads length1/remove/train1.
        __syncthreads();

        // Parallel search: thread t scans j = t, t+blockDim.x, ...
        for (int j = threadIdx.x; j < length1 - 1; j += blockDim.x)
        {
            if (train1[j] == remove[0])
            {
                remove[1] = j;
                train1[j] = -1;
                break; // exits only this thread's strided loop
            }
        }

        // Wait for every thread to finish scanning before writing back.
        __syncthreads();

        // Sequential epilogue by thread 0.
        if (threadIdx.x == 0)
        {
            length1 = length1 - 1;

            copy(trains[i].stations, train1, 1, 40);
        }
        // NOTE(review): no barrier here; the __syncthreads() at the top of
        // the next iteration keeps other threads from reading shared data
        // early, and only thread 0 writes it in between — confirm.
    }
}

// Builds two identical sets of trains, runs the serial version on one and
// the hand-written CUDA kernel on the other, then reports any mismatches.
int main()
{
    // const so `temp` below is a proper constant-size array, not a VLA.
    const int N = 16, num_of_trains = 10;

    struct train *trains, *trains2;

    cMM(&trains, num_of_trains * sizeof(train));

    cMM(&trains2, num_of_trains * sizeof(train));

    // 1..N twice, back to back; shuffled per-train below.
    int temp[2 * N];

    iota(temp, temp + N, 1);
    iota(temp + N, temp + 2 * N, 1);

    for (int i = 0; i < num_of_trains; i++)
    {
        trains[i].num_stations = 2 * N;
        // BUG FIX: allocate the full 4*N-int buffer.  copy() writes indices
        // [0, 2*N) and fill() writes [2*N, 4*N); the original sizeof(int)
        // allocation was a single-int buffer and every write past it was out
        // of bounds (masked only by managed-memory page granularity).
        cMM(&trains[i].stations, 4 * N * sizeof(int));
        copy(trains[i].stations, temp, 1, 2 * N);
        fill(trains[i].stations + 2 * N, trains[i].stations + 4 * N, -1);
        shuffle(trains[i].stations, trains[i].stations + 2 * N, default_random_engine(i));

        trains2[i].num_stations = 2 * N;
        cMM(&trains2[i].stations, 4 * N * sizeof(int));
        copy(trains2[i].stations, temp, 1, 2 * N);
        fill(trains2[i].stations + 2 * N, trains2[i].stations + 4 * N, -1);
        // Same seed as trains[i] so both sets start identical.
        shuffle(trains2[i].stations, trains2[i].stations + 2 * N, default_random_engine(i));
    }

    serial(trains);

    cudaError_t err = cudaGetLastError();

    cout << "error =\t" << err << ",\t" << cudaGetErrorName(err) << ",\t" << cudaGetErrorString(err) << "\n";

    kernel<<<5, 4>>>(trains2);

    // BUG FIX: the sync result was previously stored in an unused `int i`;
    // synchronize so kernel execution errors surface in cudaGetLastError()
    // and managed memory is safe to read from the host below.
    cudaDeviceSynchronize();

    err = cudaGetLastError();

    cout << "error =\t" << err << ",\t" << cudaGetErrorName(err) << ",\t" << cudaGetErrorString(err) << "\n";

    // Element-wise comparison of serial vs CUDA results.
    for (int i = 0; i < num_of_trains; i++)
    {
        for (int j = 0; j < 2 * N; j++)
        {
            if (trains[i].stations[j] != trains2[i].stations[j])
            {
                cout << "not equal\t";
            }
        }
    }
    return 0;
}

I couldn’t parallelize the loop in the copy function, if we remove the thread guard(threadIdx.x == 0) before calling copy function, isn’t the copy function called multiple times instead of parallelizing the loop inside…?

It would be call by every thread, hence you need to divide copy’s for loop iterations across the threads. Something like:

// Device-side copy with the loop divided across the block's threads:
// thread t handles i = t + M - 1, then strides by blockDim.x.  This is the
// CUDA equivalent of an OpenACC vector routine.  All threads of the block
// should call it together, with a __syncthreads() afterwards before any
// thread reads the copied data.
__device__ void copy(int *A, int *B, int M, int N)
{
    //#pragma acc loop vector
    for (int i = threadIdx.x + M - 1; i <= N - 1; i+=blockDim.x)
    {
        A[i] = B[i];
    }
}

Then in the kernel:

    for (int i = blockIdx.x; i < 10; i += gridDim.x)
    {
        copy(train1, trains[i].stations, 1, 40);
        __syncthreads();
        if (threadIdx.x == 0)
        {
            length1 = trains[i].num_stations;
            remove[0] = train1[length1 - 1];
            train1[length1 - 1] = -1;
        }
        __syncthreads();

-Mat

Dear @MatColgrove, according to this copy definition, threadIdx.x and blockDim.x are visible inside the copy function even though we’re not passing them as arguments — can you please confirm…?

Correct. The threads store this info in a set of special registers that’s accessible even within device function.

1 Like

Thank you, @MatColgrove, for your patience in answering my questions…