Hello,
I was trying to improve the performance of my code using OpenACC. However, the modified code turned out to be slower than the sequential version.
Can anyone tell me what I did wrong and how I can fix it?
This is the code (a simple Convolution with a fixed kernel):
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>
#define N 4096
//Methods
// Seeds rand() with a fixed value and fills the N*N floats behind the pointer
// with pseudo-random values scaled by 1000000.
void fill(float*);
// Weighted sum of the first 25 arguments divided by the 26th argument.
// Declared as a sequential OpenACC routine because main calls it from inside
// its accelerator loop nest.
#pragma acc routine seq
float sf(float, float, float, float, float,
float, float, float, float, float,
float, float, float, float, float,
float, float, float, float, float,
float, float, float, float, float, float);
//global
//const float sum = 159.0f;
//Main
// Allocates an N x N input and output image, fills the input with
// pseudo-random data, applies the 5x5 kernel implemented by sf() to every
// interior pixel inside an OpenACC parallel region, and prints the measured
// clock() time followed by the error count.
//
// Returns 0 on success, or EXIT_FAILURE if either buffer cannot be allocated.
int main(int argc, char **argv)
{
    // The command line is not used.
    (void)argc;
    (void)argv;

    const size_t count = (size_t)N * N;
    float *in = malloc(count * sizeof *in);
    float *out = malloc(count * sizeof *out);
    if (in == NULL || out == NULL) {
        // Without this check fill() and the kernel would write through NULL.
        fprintf(stderr, "Failed to allocate the image buffers\n");
        free(in);
        free(out);
        return EXIT_FAILURE;
    }

    //Fill input
    fill(in);

    //Time
    // The measured interval brackets the whole accelerator region, so it also
    // covers any host/device data movement generated for in and out, not just
    // the arithmetic.
    clock_t start = clock();

    //Computation
    // i and j address the centre of the 5x5 window, so both stay two pixels
    // away from every border. The result for window centre (i, j) is stored
    // at (i-2, j-2) in out.
    int i,j;
    #pragma acc parallel loop private(i,j)
    for(i=2; i<N-2; i++){
        #pragma acc loop independent
        for(j=2; j<N-2; j++){
            out[(i-2)*N + j-2] = sf( in[(i-2)*N + j-2], in[(i-2)*N + j-1],
                                     in[(i-2)*N + j],   in[(i-2)*N + j+1],
                                     in[(i-2)*N + j+2], in[(i-1)*N + j-2],
                                     in[(i-1)*N + j-1], in[(i-1)*N + j],
                                     in[(i-1)*N + j+1], in[(i-1)*N + j+2],
                                     in[(i)*N + j-2],   in[(i)*N + j-1],
                                     in[(i)*N + j],     in[(i)*N + j+1],
                                     in[(i)*N + j+2],   in[(i+1)*N + j-2],
                                     in[(i+1)*N + j-1], in[(i+1)*N + j],
                                     in[(i+1)*N + j+1], in[(i+1)*N + j+2],
                                     in[(i+2)*N + j-2], in[(i+2)*N + j-1],
                                     in[(i+2)*N + j],   in[(i+2)*N + j+1],
                                     in[(i+2)*N + j+2], 159.0f );
        }
    }

    //Time
    clock_t end = clock();
    printf("Runtime: %fs\n", (end-start)/(double)CLOCKS_PER_SEC);

    //Validating Result
    int err_num = 0;
    //...
    printf("%d Errors!\n", err_num);

    //---
    free(in);
    free(out);
    return 0;
}
// Populates the N*N floats starting at t with reproducible pseudo-random
// values: rand() is seeded with the constant 6, and every sample is
// normalised by RAND_MAX and then scaled by 1000000.
void fill(float* t)
{
    const unsigned int seed = 6;
    const int range = 1000000;
    float *cursor = t;
    float *const stop = t + N*N;

    srand(seed);
    // Walk the buffer front to back so rand() is consumed in element order.
    while (cursor != stop) {
        *cursor = ((float)rand() / (float)RAND_MAX)*range;
        ++cursor;
    }
}
// Evaluates one output sample of the fixed 5x5 filter: f1..f25 are weighted
// with
//      2  4  5  4  2
//      4  9 12  9  4
//      5 12 15 12  5
//      4  9 12  9  4
//      2  4  5  4  2
// (f1..f5 take the first row of weights, f21..f25 the last), and the weighted
// total is divided by sum. Compiled as a sequential OpenACC routine.
#pragma acc routine seq
float sf(float f1, float f2, float f3, float f4, float f5,
         float f6, float f7, float f8, float f9, float f10,
         float f11, float f12, float f13, float f14, float f15,
         float f16, float f17, float f18, float f19, float f20,
         float f21, float f22, float f23, float f24, float f25, float sum)
{
    // Terms are added strictly left to right, one weight row at a time.
    float acc = 2.0f*f1;
    acc += 4.0f*f2;
    acc += 5.0f*f3;
    acc += 4.0f*f4;
    acc += 2.0f*f5;

    acc += 4.0f*f6;
    acc += 9.0f*f7;
    acc += 12.0f*f8;
    acc += 9.0f*f9;
    acc += 4.0f*f10;

    acc += 5.0f*f11;
    acc += 12.0f*f12;
    acc += 15.0f*f13;
    acc += 12.0f*f14;
    acc += 5.0f*f15;

    acc += 4.0f*f16;
    acc += 9.0f*f17;
    acc += 12.0f*f18;
    acc += 9.0f*f19;
    acc += 4.0f*f20;

    acc += 2.0f*f21;
    acc += 4.0f*f22;
    acc += 5.0f*f23;
    acc += 4.0f*f24;
    acc += 2.0f*f25;

    return acc/sum;
}
I compiled it using this command:
pgcc -acc -Minfo -ta=nvidia OpenACC_Gauss.c
The output was:
main:
35, Accelerator kernel generated
Generating Tesla code
36, #pragma acc loop gang /* blockIdx.x */
38, #pragma acc loop vector(128) /* threadIdx.x */
35, Generating implicit copyin(in[:16777216])
Generating implicit copy(out[:16760828])
38, Loop is parallelizable
sf:
91, Generating acc routine seq
Generating Tesla code
The program took 0.418s to run. When I commented out the pragmas and recompiled the sequential program, it took only 0.375s (the hardware is an i3-3220 CPU and a GTX 1060 6GB graphics card).
I’m new to OpenACC, so any feedback on what mistakes I made would be greatly appreciated.