OpenACC code much slower than CUDA on trivial copy/transpose

I made a benchmark to compare how a matrix transpose code performs in OpenACC versus CUDA.

I’m doing this since there’s a lot of tweaking to be done in CUDA, and I’d like to see how OpenACC handles it.

I’m really surprised by the poor performance and would like some hints to understand what’s happening:

These are my timings for a 2048 × 2048 matrix (averaged over 100 runs), on my GTX 480, PGI 12.9, 64-bit Linux:

OpenACC
Copy: 0.377050
Transpose: 0.802850

CUDA
Copy: 0.252929
Transpose naive: 0.507955
Transpose coalesced: 0.344594
Transpose optimized: 0.307754

The simple copy in OpenACC is slower than the coalesced transpose in CUDA, and the OpenACC transpose is WAY slower than even the naive CUDA transpose.
I’m quite puzzled by these results, as they seem very bad (especially the simple copy, which is the simplest kernel I can think of).
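
To put a rough number on it (the times are accumulated into the *_ms variables below, so I treat them as milliseconds): with 4-byte floats, the effective bandwidth of the simple copy works out to only about 89 GB/s with OpenACC versus about 133 GB/s with CUDA. A minimal sketch of that back-of-the-envelope calculation:

/* Back-of-the-envelope effective-bandwidth estimate for the copy kernels.
   Assumes the measured times are in milliseconds and the elements are
   4-byte floats; one read plus one write per element. */
#include <stdio.h>

int main(void) {
  const double n          = 2048.0;
  const double bytes      = 2.0 * n * n * sizeof(float);    /* read + write */
  const double times_ms[] = { 0.377050, 0.252929 };          /* OpenACC copy, CUDA copy */
  const char  *labels[]   = { "OpenACC copy", "CUDA copy" };

  for (int i = 0; i < 2; i++)
    printf("%-13s %6.1f GB/s\n", labels[i], bytes / (times_ms[i] * 1e-3) / 1e9);

  return 0;
}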

Any ideas on this subject?


Below is the code. Note that in both the CUDA and OpenACC measurements I exclude the data transfer times.

These are my OpenACC functions:

void trans(const ff* restrict in, ff* restrict out, const int n) {
#pragma acc data present(in[0:n*n], out[0:n*n])
{

#pragma acc kernels loop independent
  for (int i = 0; i < n; i++) {
#pragma acc loop independent
    for (int j = 0; j < n; j++) {
      out[j + i * n] = in[i + j * n];
    }
  }

}
}

void copy_k(const ff* restrict in, ff* restrict out, const int n) {
#pragma acc data present(in[0:n*n], out[0:n*n])
{

#pragma acc kernels loop independent
  for (int i = 0; i < n*n; i++) {
    out[i] = in[i];
  }

}
}

And this is how I call the kernels:

  acc_init(0);

#pragma acc data copyin(in[0:n*n]) copy(out[0:n*n])
{

  // Warm up
  copy_k(in, out, n);
  for (int i = 0; i < num_tests; i++) {
    StartTimer();
    copy_k(in, out, n);
    copy_time_ms += GetTimer();
  }

}

#pragma acc data copyin(in[0:n*n]) copy(out[0:n*n])
{

  // Warm up
  trans(in, out, n);
  for (int i = 0; i < num_tests; i++) {
    StartTimer();
    trans(in, out, n);
    trans_time_ms += GetTimer();
  }

}

Now the code for the CUDA benchmark; it is adapted (barely changed) from the NVIDIA transpose example:

#define TILE_DIM    16
#define BLOCK_ROWS  16



__global__ void copy(float *odata, float *idata, int n) {
  int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
  int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;

  int index  = xIndex + n*yIndex;

  for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS) {
    odata[index+i*n] = idata[index+i*n];
  }
}



__global__ void trans_naive(float *odata, float *idata, int n) {
  int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
  int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;

  int index_in  = xIndex + n * yIndex;
  int index_out = yIndex + n * xIndex;

  for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS) {
    odata[index_out+i] = idata[index_in+i*n];
  }
}



__global__ void trans_coalesced(float *odata, float *idata, int n) {
  __shared__ float tile[TILE_DIM][TILE_DIM];

  int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
  int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
  int index_in = xIndex + (yIndex)*n;

  xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
  yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
  int index_out = xIndex + (yIndex)*n;

  for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS) {
    tile[threadIdx.y+i][threadIdx.x] = idata[index_in+i*n];
  }

  __syncthreads();

  for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS) {
    odata[index_out+i*n] = tile[threadIdx.x][threadIdx.y+i];
  }
}



__global__ void trans_no_bank_conflicts(float *odata, float *idata, int n) {
  __shared__ float tile[TILE_DIM][TILE_DIM+1];

  int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
  int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
  int index_in = xIndex + (yIndex)*n;

  xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
  yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
  int index_out = xIndex + (yIndex)*n;

  for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS) {
    tile[threadIdx.y+i][threadIdx.x] = idata[index_in+i*n];
  }

  __syncthreads();

  for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS) {
    odata[index_out+i*n] = tile[threadIdx.x][threadIdx.y+i];
  }
}

Thanks!

Hi lechar,

Any ideas on this subject?

I’d try adjusting the loop schedule clauses. Review the output from “-Minfo=accel” and then adjust accordingly. Also review the basic profile information (i.e., set the environment variable PGI_ACC_TIME=1) to see where the time is coming from.
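
For example, one schedule to experiment with could look like the sketch below. The gang vector(16) choices are only a guess meant to mirror the 16x16 thread blocks of your CUDA version, not a recommendation; start from whatever schedule the compiler picked (shown by -Minfo=accel) and adjust from there.

/* Sketch only: trans() with an explicit loop schedule to experiment with.
   The vector(16) values are guesses mirroring the 16x16 CUDA blocks;
   tune them based on the -Minfo=accel output and the PGI_ACC_TIME profile. */
void trans(const ff* restrict in, ff* restrict out, const int n) {
#pragma acc data present(in[0:n*n], out[0:n*n])
{
#pragma acc kernels loop independent gang vector(16)
  for (int i = 0; i < n; i++) {
#pragma acc loop independent gang vector(16)
    for (int j = 0; j < n; j++) {
      out[j + i * n] = in[i + j * n];
    }
  }
}
}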

  • Mat