Hi Tapasya Patki,
Here are some examples using the PGI Accelerator Model. I’ll post the OpenACC versions next. The major differences are that OpenACC only allows 1-D arrays in data clauses, so “float **” arguments need to be translated to “float *”, and that OpenACC’s “copy” clauses take a starting bound plus the number of elements to copy, whereas the PGI model’s take inclusive lower and upper bounds.
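To make that second difference concrete, here’s a sketch of how MatrixMultiplication4 from the listing below might read in OpenACC. The [start:count] bounds are the point of the example; the loop-mapping clauses are just my first guess at a direct translation, not tuned or compiler-verified:

/* Sketch only: a direct OpenACC rendering of MatrixMultiplication4.
   Bounds are [start:count] here, not the PGI model's [low:high]. */
void
MatrixMultiplication4_acc(float * restrict a, float * restrict b, float * restrict c, int m, int n, int p)
{
    int i, j, k ;
    #pragma acc data copyout(a[0:m*n]) copyin(b[0:m*p], c[0:p*n])
    {
        #pragma acc kernels loop gang vector(8)
        for (i=0; i<m; i++) {
            #pragma acc loop gang vector(8)
            for (j=0; j<n; j++) {
                float sum = 0.0 ;
                #pragma acc loop seq
                for (k=0; k<p; k++)
                    sum += b[i*p+k]*c[k*n+j] ;
                a[i*n+j] = sum ;
            }
        }
    }
}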
$ cat MatrixMultiplication.c
#define M 1024
#define N 2048
#define P 4096
/* multiply an M x P matrix by a P x N matrix to produce an M x N matrix */
void
MatrixMultiplication1(float a[restrict M][N], float b[restrict M][P], float c[restrict P][N])
{
    int i, j, k ;
    #pragma acc region for parallel, vector(8)
    for (i=0; i<M; i++) {
        #pragma acc for parallel, vector(8)
        for (j=0; j<N; j++) {
            #pragma acc for seq
            for (k=0; k<P; k++)
                a[i][j] += b[i][k]*c[k][j] ;
        }
    }
}
void
MatrixMultiplication2(float a[restrict M][N], float b[restrict M][P], float c[restrict P][N])
{
    int i, j, k ;
    #pragma acc region for parallel, vector(8)
    for (i=0; i<M; i++) {
        #pragma acc for parallel, vector(8)
        for (j=0; j<N; j++) {
            float sum = 0.0 ;
            #pragma acc for seq
            for (k=0; k<P; k++)
                sum += b[i][k]*c[k][j] ;
            a[i][j] = sum ;
        }
    }
}
void
MatrixMultiplication3(float * restrict a, float * restrict b, float * restrict c, int m, int n, int p)
{
    int i, j, k ;
    #pragma acc data region copyout(a[0:(m*n)-1]), copyin(b[0:(m*p)-1],c[0:(p*n)-1])
    {
        #pragma acc region for parallel, vector(8)
        for (i=0; i<m; i++) {
            #pragma acc for parallel, vector(8)
            for (j=0; j<n; j++) {
                #pragma acc for seq
                for (k=0; k<p; k++)
                    a[i*n+j] += b[i*p+k]*c[k*n+j] ;
            }
        }
    }
}
void
MatrixMultiplication4(float * restrict a, float * restrict b, float * restrict c, int m, int n, int p)
{
    int i, j, k ;
    #pragma acc data region copyout(a[0:(m*n)-1]), copyin(b[0:(m*p)-1],c[0:(p*n)-1])
    {
        #pragma acc region for parallel, vector(8)
        for (i=0; i<m; i++) {
            #pragma acc for parallel, vector(8)
            for (j=0; j<n; j++) {
                float sum = 0.0 ;
                #pragma acc for seq
                for (k=0; k<p; k++)
                    sum += b[i*p+k]*c[k*n+j] ;
                a[i*n+j] = sum ;
            }
        }
    }
}
$ pgcc -ta=nvidia -c -Minfo=accel MatrixMultiplication.c -Msafeptr -V12.3
MatrixMultiplication1:
     12, Generating copy(a[:1023][:])
         Generating copyin(b[:1023][:])
         Generating copyin(c[:4095][:])
         Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
     13, Loop is parallelizable
     15, Loop is parallelizable
     17, Complex loop carried dependence of '*(a)' prevents parallelization
         Loop carried dependence of '*(a)' prevents parallelization
         Loop carried backward dependence of '*(a)' prevents vectorization
         Accelerator kernel generated
         13, #pragma acc for parallel, vector(8) /* blockIdx.y threadIdx.y */
         15, #pragma acc for parallel, vector(8) /* blockIdx.x threadIdx.x */
         17, #pragma acc for seq(8)
         Cached references to size [8x8] block of 'b'
         Cached references to size [8x8] block of 'c'
         CC 1.0 : 9 registers; 560 shared, 16 constant, 0 local memory bytes; 66% occupancy
         CC 2.0 : 17 registers; 520 shared, 56 constant, 0 local memory bytes; 33% occupancy
MatrixMultiplication2:
     28, Generating copyout(a[:1023][:])
         Generating copyin(b[:1023][:])
         Generating copyin(c[:4095][:])
         Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
     29, Loop is parallelizable
     31, Loop is parallelizable
         Accelerator kernel generated
         29, #pragma acc for parallel, vector(8) /* blockIdx.y threadIdx.y */
         31, #pragma acc for parallel, vector(8) /* blockIdx.x threadIdx.x */
         CC 1.0 : 12 registers; 48 shared, 8 constant, 0 local memory bytes; 66% occupancy
         CC 2.0 : 20 registers; 8 shared, 56 constant, 0 local memory bytes; 33% occupancy
     34, Loop is parallelizable
MatrixMultiplication3:
     46, Generating copyout(a[:m*n-1])
         Generating copyin(c[:p*n-1])
         Generating copyin(b[:p*m-1])
     48, Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
     49, Loop carried dependence of '*(a)' prevents parallelization
         Loop carried backward dependence of '*(a)' prevents vectorization
     51, Loop is parallelizable
     53, Complex loop carried dependence of '*(a)' prevents parallelization
         Loop carried dependence of '*(a)' prevents parallelization
         Loop carried backward dependence of '*(a)' prevents vectorization
         Accelerator kernel generated
         49, #pragma acc for parallel, vector(8) /* blockIdx.y threadIdx.y */
         51, #pragma acc for parallel, vector(8) /* blockIdx.x threadIdx.x */
         53, #pragma acc for seq
         CC 1.0 : 19 registers; 84 shared, 16 constant, 0 local memory bytes; 50% occupancy
         CC 2.0 : 25 registers; 8 shared, 92 constant, 0 local memory bytes; 33% occupancy
MatrixMultiplication4:
     65, Generating copyout(a[:m*n-1])
         Generating copyin(c[:p*n-1])
         Generating copyin(b[:p*m-1])
     67, Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
     68, Loop carried dependence of '*(a)' prevents parallelization
         Loop carried backward dependence of '*(a)' prevents vectorization
     70, Loop is parallelizable
         Accelerator kernel generated
         68, #pragma acc for parallel, vector(8) /* blockIdx.y threadIdx.y */
         70, #pragma acc for parallel, vector(8) /* blockIdx.x threadIdx.x */
         CC 1.0 : 17 registers; 84 shared, 12 constant, 0 local memory bytes; 50% occupancy
         CC 2.0 : 24 registers; 8 shared, 92 constant, 0 local memory bytes; 33% occupancy
     74, Loop is parallelizable
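For completeness, here’s a minimal host driver sketch for exercising these routines. It’s my addition, not part of the original listing; the fill values and the single-element check are illustrative only:

/* Hypothetical test driver for MatrixMultiplication4 (illustrative only). */
#include <stdio.h>
#include <stdlib.h>

void MatrixMultiplication4(float *a, float *b, float *c, int m, int n, int p) ;

int main(void)
{
    int m = 1024, n = 2048, p = 4096 ;
    float *a = malloc(sizeof(float)*m*n) ;   /* result, m x n */
    float *b = malloc(sizeof(float)*m*p) ;   /* m x p */
    float *c = malloc(sizeof(float)*p*n) ;   /* p x n */
    int i ;
    for (i = 0; i < m*p; i++) b[i] = 1.0f ;
    for (i = 0; i < p*n; i++) c[i] = 2.0f ;
    MatrixMultiplication4(a, b, c, m, n, p) ;
    /* with these fills every element of a should equal p * 1.0 * 2.0 */
    printf("a[0] = %f (expect %f)\n", a[0], 2.0f*p) ;
    free(a) ; free(b) ; free(c) ;
    return 0 ;
}

Built, for example, with: pgcc -ta=nvidia -Minfo=accel -Msafeptr MatrixMultiplication.c driver.c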