Hi everyone,
I would like to know what the generated CUDA kernel looks like for the simple OpenACC code below. There are some discussions on this topic (for example [1]), but none of them walks through a simple example.
struct train
{
    int num_stations;
    int *stations = nullptr;
};

//#pragma acc routine vector
void copy(int *A, int *B, int M, int N)
{
    //#pragma acc loop vector
    for(int i = M - 1; i <= N - 1; i++)
    {
        A[i] = B[i];
    }
}
void parallel(train *trains)
{
    int remove[2], train1[40], length1;
    #pragma acc parallel loop gang num_gangs(5) vector vector_length(4) private(remove, train1, length1) copy(trains[1:10])
    for(int i = 0; i < 10; i++)
    {
        copy(train1, trains[i].stations, 1, 40);
        length1 = trains[i].num_stations;
        remove[0] = train1[length1 - 1];
        train1[length1 - 1] = -1;
        #pragma acc loop vector
        for(int j = 0; j < length1 - 1; j++)
        {
            if(train1[j] == remove[0])
            {
                remove[1] = j;
                train1[j] = -1;
                //break;
            }
        }
        length1 = length1 - 1;
        copy(trains[i].stations, train1, 1, 40);
    }
}
The OpenACC compiler output says it launched:
- 5 gangs (blockIdx.x) with 4 vectors (threadIdx.x) each, i.e. 20 threads in total.
- CUDA shared memory for train1 and remove.
I am unable to come up with an algorithm that satisfies these two constraints. Can someone please show me how it is possible?
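This is how I currently read those two constraints in CUDA terms; the kernel name and the launch below are my own guesses, not anything the compiler printed:

// My interpretation of the clauses (parallel_kernel is a hypothetical name):
// num_gangs(5)     -> gridDim.x  = 5  (blockIdx.x  = 0..4)
// vector_length(4) -> blockDim.x = 4  (threadIdx.x = 0..3)
// 5 * 4 = 20 threads for 10 trains, so each block would handle 2 trains,
// and each 31-iteration j loop would be split across 4 threads (~8 each).
// private(remove, train1) at gang level -> one copy per block, e.g.
//     __shared__ int remove[2];
//     __shared__ int train1[40];
// parallel_kernel<<<5, 4>>>(trains);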
Algorithm: I have an array of train objects. Each train holds an array (stations) whose first num_stations entries contain integers that each appear exactly twice, while the remaining entries are -1. For every train, the task is to set the last valid station (the element at index num_stations - 1) and its earlier duplicate to -1.
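A tiny example of the intended transformation (values chosen by me just for illustration):

// num_stations = 6, stations = {3, 1, 2, 3, 1, 2, -1, -1, ...}
// last valid station is stations[5] = 2; its duplicate sits at index 2
// after the update:      {3, 1, -1, 3, 1, -1, -1, -1, ...}
// and the logical length drops from 6 to 5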
Compilation command and its output:
nvc++ -acc -Minfo=accel removing_an_order.cu -o removing_an_order
copy(int *, int *, int, int):
18, Generating implicit acc routine seq
Generating acc routine seq
Generating NVIDIA GPU code
NVC++-W-1042-acc loop vector clause ignored because of inner vector loop. (removing_an_order.cu: 58)
parallel(train *):
58, Generating copy(trains[1:10]) [if not already present]
Generating NVIDIA GPU code
61, #pragma acc loop gang(5) /* blockIdx.x */
72, #pragma acc loop vector(4) /* threadIdx.x */
58, CUDA shared memory used for remove,train1
72, Loop is parallelizable
NVC++/x86-64 Linux 22.9-0: compilation completed with warnings
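To make the question concrete, here is my rough hand-written guess at what the generated kernel might look like. It is only a sketch to check my understanding: the names (parallel_kernel, copy_seq), the way the gang loop is chunked over blocks, and the __syncthreads() placement are all my assumptions, not actual compiler output.

// Sketch only: my guess at a CUDA equivalent of parallel(), not what nvc++
// actually emits. parallel_kernel and copy_seq are names I made up.

struct train          // same struct as above, repeated so the sketch is self-contained
{
    int num_stations;
    int *stations;
};

// copy() compiled as an implicit "acc routine seq": every thread that reaches
// the call runs the whole loop redundantly.
__device__ void copy_seq(int *A, int *B, int M, int N)
{
    for (int i = M - 1; i <= N - 1; i++)
        A[i] = B[i];
}

__global__ void parallel_kernel(train *trains)
{
    // private(remove, train1) at gang level -> one copy per gang,
    // placed in CUDA shared memory as reported by -Minfo.
    __shared__ int remove[2];
    __shared__ int train1[40];
    int length1;  // private scalar -> just a register per thread

    // gang loop: the 10 iterations are spread over gridDim.x = 5 blocks,
    // e.g. block b handles i = b and i = b + 5.
    for (int i = blockIdx.x; i < 10; i += gridDim.x)
    {
        // Code outside the vector loop is executed redundantly by all
        // 4 threads of the block; they all write the same values, so the
        // stores to the shared arrays do not conflict.
        copy_seq(train1, trains[i].stations, 1, 40);
        length1 = trains[i].num_stations;
        remove[0] = train1[length1 - 1];
        train1[length1 - 1] = -1;
        __syncthreads();

        // vector loop: the j iterations are split across threadIdx.x = 0..3.
        for (int j = threadIdx.x; j < length1 - 1; j += blockDim.x)
        {
            if (train1[j] == remove[0])
            {
                remove[1] = j;
                train1[j] = -1;
                // no break: with the iterations strided across threads,
                // a break would not match the serial loop anyway
            }
        }
        __syncthreads();

        length1 = length1 - 1;
        copy_seq(trains[i].stations, train1, 1, 40);
        __syncthreads();  // finish writing back before train1 is reused
    }
}

// Launch matching num_gangs(5) / vector_length(4):
// parallel_kernel<<<5, 4>>>(trains);

In particular I am not sure whether the code outside the vector loop really runs redundantly on all four threads, and whether the removed break is handled roughly this way; that is exactly what I would like someone to confirm or correct.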
Complete code, comparing the OpenACC output against the serial output:
#include <bits/stdc++.h>
#include <random>
#include <cuda.h>
#include "openacc.h"
using namespace std;

#define cMM cudaMallocManaged

struct train
{
    int num_stations;
    int *stations = nullptr;
};

//#pragma acc routine vector
void copy(int *A, int *B, int M, int N)
{
    //#pragma acc loop vector
    for(int i = M - 1; i <= N - 1; i++)
    {
        A[i] = B[i];
    }
}

void serial(train *trains)
{
    int remove[2], train1[40], length1;
    for(int i = 0; i < 10; i++)
    {
        copy(train1, trains[i].stations, 1, 40);
        length1 = trains[i].num_stations;
        remove[0] = train1[length1 - 1];
        train1[length1 - 1] = -1;
        for(int j = 0; j < length1 - 1; j++)
        {
            if(train1[j] == remove[0])
            {
                remove[1] = j;
                train1[j] = -1;
                break;
            }
        }
        length1 = length1 - 1;
        copy(trains[i].stations, train1, 1, 40);
    }
}
void parallel(train *trains)
{
    int remove[2], train1[40], length1;
    #pragma acc parallel loop gang num_gangs(5) vector vector_length(4) private(remove, train1, length1) copy(trains[1:10])
    for(int i = 0; i < 10; i++)
    {
        copy(train1, trains[i].stations, 1, 40);
        length1 = trains[i].num_stations;
        remove[0] = train1[length1 - 1];
        train1[length1 - 1] = -1;
        #pragma acc loop vector
        for(int j = 0; j < length1 - 1; j++)
        {
            if(train1[j] == remove[0])
            {
                remove[1] = j;
                train1[j] = -1;
                //break;
            }
        }
        length1 = length1 - 1;
        copy(trains[i].stations, train1, 1, 40);
    }
}
int main()
{
    const int N = 16, num_of_trains = 10;
    struct train *trains, *trains2;
    cMM(&trains, num_of_trains*sizeof(train));
    cMM(&trains2, num_of_trains*sizeof(train));
    int temp[2*N];
    iota(temp, temp + N, 1);
    iota(temp + N, temp + 2*N, 1);
    for(int i = 0; i < 10; i++)
    {
        trains[i].num_stations = 2*N;
        // each stations array is written up to index 4*N-1 (2*N shuffled
        // station values followed by 2*N entries of -1), so allocate 4*N ints
        cMM(&trains[i].stations, 4*N*sizeof(int));
        copy(trains[i].stations, temp, 1, 2*N);
        fill(trains[i].stations + 2*N, trains[i].stations + 4*N, -1);
        shuffle(trains[i].stations, trains[i].stations + 2*N, default_random_engine(i));
        trains2[i].num_stations = 2*N;
        cMM(&trains2[i].stations, 4*N*sizeof(int));
        copy(trains2[i].stations, temp, 1, 2*N);
        fill(trains2[i].stations + 2*N, trains2[i].stations + 4*N, -1);
        shuffle(trains2[i].stations, trains2[i].stations + 2*N, default_random_engine(i));
    }
    serial(trains);
    cudaError_t err = cudaGetLastError();
    cout << "error =\t" << err << ",\t" << cudaGetErrorName(err) << ",\t" << cudaGetErrorString(err) << "\n";
    parallel(trains2);
    err = cudaGetLastError();
    cout << "error =\t" << err << ",\t" << cudaGetErrorName(err) << ",\t" << cudaGetErrorString(err) << "\n";
    for(int i = 0; i < 10; i++)
    {
        for(int j = 0; j < 2*N; j++)
        {
            if(trains[i].stations[j] != trains2[i].stations[j])
            {
                cout << "not equal\t";
            }
        }
    }
    return 0;
}