Performance variation between Hawaii and Tesla

I have two GPUs in a machine.

1. AMD Radeon R9 390x
2. NVIDIA GTX Titan

The problem is that performance differs greatly between the two devices when running the same program, shown below.

``````#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <math.h>

/* Print the command-line usage text for the three positional
 * arguments, then terminate the program with exit status 0. */
void PrintUsage(const char* progName) {
    printf("Usage: %s <n> <gangs> <vectors>\n"
           "      <n>      : the number of elements.\n"
           "      <gangs>  : the number of teams.\n"
           "      <vectors>: the number of vectors.\n",
           progName);
    exit(0);
}

/* Current wall-clock time in seconds, with microsecond resolution,
 * obtained from gettimeofday(). Used to bracket the kernel region. */
double get_time(void) {
    struct timeval now;

    gettimeofday(&now, NULL);
    return (double)now.tv_sec + (double)now.tv_usec / 1.0e6;
}

/* Fill both n x n matrices with the constant 0.01 so the product has a
 * predictable value. Iterates the backing arrays as one flat range of
 * n*n elements, which stores exactly the same values as a row/column
 * double loop would. */
void init(float* M_A, float* M_B, int n) {
    int total = n * n;

    for (int idx = 0; idx < total; ++idx) {
        M_A[idx] = 0.01;
        M_B[idx] = 0.01;
    }
}
/*
 * Naive O(n^3) matrix multiply (M_C = M_A * M_B, all n x n, row-major)
 * offloaded with OpenACC, timed on the host around the parallel region.
 * NOTE(review): the timing below includes kernel launch/completion but
 * not the data transfers, which happen at the data-region boundaries.
 */
void run_acc(float* restrict M_A, float* restrict M_B, float* restrict M_C, int n,
int gangs, int vectors) {
int i, j, k;
double start_time, end_time;
float sum;

/* Copy inputs to the device once, copy the result back at region exit. */
#pragma acc data copyin(M_A[0:n*n], M_B[0:n*n]) copyout(M_C[0:n*n])
{
start_time = get_time();
/* One kernel over the collapsed i/j iteration space.
 * NOTE(review): num_gangs/vector_length hard-code the launch shape to
 * user-supplied values, which tunes for one device at the expense of
 * others (this is the cause of the slowdown discussed in this thread).
 * Putting the scalars i/j/k/sum in private() also forces them into a
 * per-thread array in global memory instead of registers. */
#pragma acc parallel loop \
private(i, j, k, sum) \
firstprivate(n) \
present(M_A[0:n*n], M_B[0:n*n], M_C[0:n*n]) \
collapse(2)\
independent\
gang vector\
num_gangs(gangs) vector_length(vectors)
for( i = 0; i < n; ++i ) {
for( j = 0; j < n; ++j ) {
sum = 0;
/* Inner dot product of row i of M_A with column j of M_B. */
for( k = 0; k < n; ++k ) {
sum += M_A[i*n+k] * M_B[k*n+j];
}
M_C[i*n+j] = sum;
}
}
end_time = get_time();
}

printf("%d|%d|Elapsed time (ACC) | %f\n",
gangs, vectors, end_time - start_time);
}

/*
 * Entry point: parses <n> <gangs> <vectors> from the command line,
 * builds two n x n input matrices, and runs the timed OpenACC multiply.
 * Returns 0 on success, 1 on allocation failure; invalid arguments
 * cause PrintUsage() to exit(0).
 */
int main(int argc, const char* argv[]) {

    if( argc != 4 ) {
        PrintUsage(argv[0]);
    }

    int n = atoi(argv[1]);
    int gangs = atoi(argv[2]);
    int vectors = atoi(argv[3]);

    if( n <= 0 || gangs <= 0 || vectors <= 0 ) {
        PrintUsage(argv[0]);
    }

    printf("Calculating %dx%d elements...\n", n, n);
#ifdef COLLAPSE
    printf("Using collapse(2)...\n");
#endif // COLLAPSE
    printf("gangs = %d, vectors = %d\n", gangs, vectors);

    /* Compute the element count in size_t so n*n cannot overflow int
     * before the allocation size is formed. */
    size_t nn = (size_t)n * (size_t)n;

    /* Fix: the original left all mallocs unchecked (NULL dereference on
     * OOM), allocated an unused M_C2, and never freed anything. */
    float* M_A = (float*)malloc(nn * sizeof *M_A);
    float* M_B = (float*)malloc(nn * sizeof *M_B);
    float* M_C = (float*)malloc(nn * sizeof *M_C);
    if( !M_A || !M_B || !M_C ) {
        fprintf(stderr, "Error: out of memory allocating %dx%d matrices\n", n, n);
        free(M_A);
        free(M_B);
        free(M_C);
        return 1;
    }

    init(M_A, M_B, n);

    run_acc(M_A, M_B, M_C, n, gangs, vectors);

    free(M_A);
    free(M_B);
    free(M_C);
    return 0;
}
``````

Using radeon

pgc++ -acc -ta=radeon:hawaii,nollvm -Minfo -c -o matMul.o matMul.cpp
pgc++ -acc -ta=radeon:hawaii,nollvm -Minfo -o matMul matMul.o

./matMul 4096 32 256

Calculating 4096x4096 elements…
Using collapse(2)…
gangs = 32, vectors = 256
32|256|Elapsed time (ACC) | 204.874061

Using titan

pgc++ -acc -ta=tesla -Minfo -c -o matMul.o matMul.cpp
pgc++ -acc -ta=tesla -Minfo -o matMul matMul.o

./matMul 4096 32 256

Calculating 4096x4096 elements…
Using collapse(2)…
gangs = 32, vectors = 256
32|256|Elapsed time (ACC) | 2.448218

That is roughly a 100x difference. What is wrong with the source code?

Hi Junghyun,

I recommend users avoid explicitly setting the number of gangs and the vector width. Anytime you add these you are essentially tuning to a particular device and that’s why you’re getting such poor performance on the AMD device. Removing “num_gangs(gangs) vector_length(vectors)” and letting the compiler decide the appropriate schedule for the target device, will greatly reduce your time. (on my Radeon the time reduces from 146 to 1.1 seconds).

I’d also recommend removing the “independent”, “firstprivate” and “private” clauses. Loops in a “parallel loop” construct are implicitly independent, so adding “independent” is extraneous. Also, scalars are private by default, so there’s no need to add them to a private clause. Worse, when you do put scalars in a private clause, an array of scalars is created in global memory, one for each thread, thus reducing performance. Otherwise the compiler can declare them as local kernel variables and increase the likelihood that they will be stored in a register.

Hope this helps,
Mat

``````% cat test_09_03_15.a.c
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <math.h>

/* Prints the usage text for the three positional arguments and exits
 * with status 0. (Quoted verbatim from Mat's corrected listing.) */
void PrintUsage(const char* progName) {
printf("Usage: %s <n> <gangs> <vectors>\n", progName);
printf("      <n>      : the number of elements.\n");
printf("      <gangs>  : the number of teams.\n");
printf("      <vectors>: the number of vectors.\n");
exit(0);
}

/* Wall-clock time in seconds (gettimeofday), microsecond resolution. */
double get_time(void) {
struct timeval tv;
gettimeofday(&tv, NULL);

return tv.tv_sec + tv.tv_usec/1.0e6;
}

/* Fills both n x n input matrices with the constant 0.01. */
void init(float* M_A, float* M_B, int n) {
int x, y;

for( y = 0; y < n; ++y ) {
for( x = 0; x < n; ++x ) {
M_A[y*n+x] = 0.01;
M_B[y*n+x] = 0.01;
}
}
}
/*
 * Mat's corrected version of the O(n^3) OpenACC matrix multiply: the
 * num_gangs/vector_length, independent, firstprivate and private
 * clauses have been removed so the compiler picks the launch schedule
 * for the target device and keeps the scalars in registers.
 * NOTE(review): the -Minfo output quoted after this listing refers to
 * this file's original line numbers.
 */
void run_acc(float* restrict M_A, float* restrict M_B, float* restrict
M_C, int n,
int gangs, int vectors) {
int i, j, k;
double start_time, end_time;
float sum;

/* Inputs copied in once; result copied back when the region exits. */
#pragma acc data copyin(M_A[0:n*n], M_B[0:n*n]) copyout(M_C[0:n*n])
{
start_time = get_time();
/* Collapsed i/j loops form one kernel; schedule left to the compiler. */
#pragma acc parallel loop \
present(M_A[0:n*n], M_B[0:n*n], M_C[0:n*n]) \
collapse(2)\
gang vector
for( i = 0; i < n; ++i ) {
for( j = 0; j < n; ++j ) {
sum = 0;
for( k = 0; k < n; ++k ) {
sum += M_A[i*n+k] * M_B[k*n+j];
}
M_C[i*n+j] = sum;
}
}
end_time = get_time();
}

printf("%d|%d|Elapsed time (ACC) | %f\n",
gangs, vectors, end_time - start_time);
}

/*
 * Entry point: parses <n> <gangs> <vectors>, builds the inputs, and
 * runs the timed multiply.
 * NOTE(review): the mallocs are unchecked, M_C2 is allocated but never
 * used, and none of the buffers are freed before return — acceptable
 * for a short benchmark but worth fixing in real code.
 */
int main(int argc, const char* argv[]) {

if( argc != 4 ) {
PrintUsage(argv[0]);
}

int n = atoi(argv[1]);
int gangs = atoi(argv[2]);
int vectors = atoi(argv[3]);

if( n <= 0 || gangs <= 0 || vectors <= 0 ) {
PrintUsage(argv[0]);
}

printf("Calculating %dx%d elements...\n", n, n);
#ifdef COLLAPSE
printf("Using collapse(2)...\n");
#endif // COLLAPSE
printf("gangs = %d, vectors = %d\n", gangs, vectors);

float* M_A = (float*)malloc(n*n*sizeof(float));
float* M_B = (float*)malloc(n*n*sizeof(float));
float* M_C = (float*)malloc(n*n*sizeof(float));
float* M_C2 = (float*)malloc(n*n*sizeof(float));

init(M_A, M_B, n);

run_acc(M_A, M_B, M_C, n, gangs, vectors);

return 0;
}
% pgcc -acc -fast -ta=radeon:nollvm -Minfo=accel test_09_03_15.a.c
run_acc:
38, Generating copyin(M_A[:n*n],M_B[:n*n])
Generating copyout(M_C[:n*n])
41, Generating present(M_A[:n*n],M_B[:n*n],M_C[:n*n])
Accelerator kernel generated
Generating Radeon code
45, #pragma acc loop gang, vector(256) collapse(2) /* global dim(0) local dim(0) */
46,   /* global dim(0) local dim(0) collapsed */
48, Loop is parallelizable

% a.out 4096 32 256
Calculating 4096x4096 elements...
gangs = 32, vectors = 256
``````