I made a benchmark to evaluate how OpenACC translates a matrix transpose code, compared with CUDA.
I'm doing this because transpose takes a lot of manual tuning in CUDA, and I'd like to see how OpenACC copes with the same problem.
I'm really surprised by the poor performance, and would like some hints to understand what's happening.
These are my timings for a 2048 × 2048 matrix (averaged over 100 runs, in ms), on my GTX 480, PGI 12.9, Linux 64:
OpenACC:
  Copy:                0.377050
  Transpose:           0.802850

CUDA:
  Copy:                0.252929
  Transpose naive:     0.507955
  Transpose coalesced: 0.344594
  Transpose optimized: 0.307754
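To put these numbers in perspective: each kernel reads and writes every element once, so effective bandwidth is easy to estimate. A quick helper, assuming the times above are in milliseconds (the _ms suffix in my timing code suggests they are):

/* Effective bandwidth estimate for an n x n float matrix:
   each element is read once and written once. */
double effective_bw_gbs(int n, double avg_time_ms) {
    double bytes = 2.0 * (double)n * (double)n * sizeof(float);
    return bytes / (avg_time_ms * 1.0e-3) / 1.0e9;
}

With n = 2048 that's about 33.5 MB moved per kernel, so the CUDA copy reaches roughly 133 GB/s and the OpenACC copy roughly 89 GB/s, against the GTX 480's theoretical peak of about 177 GB/s.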
The simple copy on OpenACC is slower than the coalesced transpose in CUDA, and the OpenACC transpose is WAY slower than any of the CUDA transposes.
I'm quite puzzled by these results; they seem very bad (especially for the simple copy, which is about the simplest kernel I can think of).
Any ideas on this subject?
Below is the code. Note that in both the CUDA and OpenACC measurements I leave out the host-device transfer times.
These are my OpenACC functions:
void trans(const ff* restrict in, ff* restrict out, const int n) {
#pragma acc data present(in[0:n*n], out[0:n*n])
    {
#pragma acc kernels loop independent
        for (int i = 0; i < n; i++) {
#pragma acc loop independent
            for (int j = 0; j < n; j++) {
                out[j + i * n] = in[i + j * n];
            }
        }
    }
}

void copy_k(const ff* restrict in, ff* restrict out, const int n) {
#pragma acc data present(in[0:n*n], out[0:n*n])
    {
#pragma acc kernels loop independent
        for (int i = 0; i < n*n; i++) {
            out[i] = in[i];
        }
    }
}
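(The snippets above don't show ff; a minimal assumption that makes them self-contained, matching the float type the CUDA kernels use:)

/* Assumed element type so the snippets compile standalone;
   float matches the CUDA kernels below. */
typedef float ff;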
And this is how I call the kernels:
acc_init(0);

#pragma acc data copyin(in[0:n*n]) copy(out[0:n*n])
{
    // Warm up
    copy_k(in, out, n);
    for (int i = 0; i < num_tests; i++) {
        StartTimer();
        copy_k(in, out, n);
        copy_time_ms += GetTimer();
    }
}

#pragma acc data copyin(in[0:n*n]) copy(out[0:n*n])
{
    // Warm up
    trans(in, out, n);
    for (int i = 0; i < num_tests; i++) {
        StartTimer();
        trans(in, out, n);
        trans_time_ms += GetTimer();
    }
}
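For reference, I build the OpenACC version with PGI along these lines (not my exact command line; cc20 targets the GTX 480's Fermi architecture, and -c99 is needed for restrict):

pgcc -acc -ta=nvidia,cc20 -Minfo=accel -c99 -fast transpose_acc.c -o transpose_acc

The -Minfo=accel output is worth looking at, since it shows the loop schedule the compiler actually picked.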
Now the code from the CUDA benchmarks; it is adapted (barely changed) from the NVIDIA transpose example:
#define TILE_DIM 16
#define BLOCK_ROWS 16

__global__ void copy(float *odata, float *idata, int n) {
    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;

    int index = xIndex + n * yIndex;

    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
        odata[index + i*n] = idata[index + i*n];
    }
}

__global__ void trans_naive(float *odata, float *idata, int n) {
    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;

    int index_in  = xIndex + n * yIndex;
    int index_out = yIndex + n * xIndex;

    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
        odata[index_out + i] = idata[index_in + i*n];
    }
}

__global__ void trans_coalesced(float *odata, float *idata, int n) {
    __shared__ float tile[TILE_DIM][TILE_DIM];

    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + yIndex * n;

    xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + yIndex * n;

    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
        tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i*n];
    }

    __syncthreads();

    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
        odata[index_out + i*n] = tile[threadIdx.x][threadIdx.y + i];
    }
}

__global__ void trans_no_bank_conflicts(float *odata, float *idata, int n) {
    __shared__ float tile[TILE_DIM][TILE_DIM+1];  // +1 padding avoids shared-memory bank conflicts

    int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + yIndex * n;

    xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + yIndex * n;

    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
        tile[threadIdx.y + i][threadIdx.x] = idata[index_in + i*n];
    }

    __syncthreads();

    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
        odata[index_out + i*n] = tile[threadIdx.x][threadIdx.y + i];
    }
}
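Since TILE_DIM == BLOCK_ROWS == 16, the inner loop in each kernel executes exactly once per thread, and the kernels run on an (n/TILE_DIM) × (n/TILE_DIM) grid of 16 × 16 blocks. A sketch of the launch and the event-based timing with transfers excluded (d_idata, d_odata are illustrative device-pointer names, not my exact harness):

// Launch configuration: one 16x16 block per tile; n is a multiple of TILE_DIM.
dim3 grid(n / TILE_DIM, n / TILE_DIM);
dim3 threads(TILE_DIM, BLOCK_ROWS);

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

trans_coalesced<<<grid, threads>>>(d_odata, d_idata, n);  // warm up

cudaEventRecord(start, 0);
for (int i = 0; i < num_tests; i++) {
    trans_coalesced<<<grid, threads>>>(d_odata, d_idata, n);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

float total_ms;
cudaEventElapsedTime(&total_ms, start, stop);  // total over num_tests launches
float avg_ms = total_ms / num_tests;           // average per launch, no transfers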
Thanks!