Poor Performance when using OpenACC pragmas

Hello,

I was trying to improve the performance of my code using OpenACC, but the modified code actually runs slower than the sequential version.
Can anyone tell me what I did wrong and how I can fix it?

This is the code (a simple convolution with a fixed 5x5 kernel):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

#define N 4096

//Methods
	void fill(float*);
	#pragma acc routine seq
	float sf(float, float, float, float, float,
		float, float, float, float, float,
		float, float, float, float, float,
		float, float, float, float, float,
		float, float, float, float, float, float);

//global
	//const float sum = 159.0f;

//Main
int main(int argc, char **argv)
{
		float *in, *out;
		in = (float*) malloc(sizeof(float)*N*N);
		out = (float*) malloc(sizeof(float)*N*N);

	//Fill input
		fill(in);
	
	//Time
		clock_t start = clock();
	//Computation
		int i,j;
		
		#pragma acc parallel loop private(i,j)
		for(i=2; i<N-2; i++){
		    #pragma acc loop independent
			for(j=2; j<N-2; j++){
				out[(i-2)*N + j-2] = sf( in[(i-2)*N + j-2], in[(i-2)*N + j-1],
								in[(i-2)*N + j], in[(i-2)*N + j+1],
								in[(i-2)*N + j+2], in[(i-1)*N + j-2],
								in[(i-1)*N + j-1], in[(i-1)*N + j],
								in[(i-1)*N + j+1], in[(i-1)*N + j+2],
								in[(i)*N + j-2], in[(i)*N + j-1],
								in[(i)*N + j], in[(i)*N + j+1],
								in[(i)*N + j+2], in[(i+1)*N + j-2],
								in[(i+1)*N + j-1], in[(i+1)*N + j],
								in[(i+1)*N + j+1], in[(i+1)*N + j+2],
								in[(i+2)*N + j-2], in[(i+2)*N + j-1],
								in[(i+2)*N + j], in[(i+2)*N + j+1],
								in[(i+2)*N + j+2], 159.0f );
			}
		}
	//Time
		clock_t end = clock();
		printf("Runtime: %fs\n", (end-start)/(double)CLOCKS_PER_SEC);

	//Validating Result
		int err_num = 0;

		//...

		printf("%d Errors!\n", err_num);

	//---
		free(in);
		free(out);

	return 0;
}

void fill(float* t)
{
	int i,range;

	unsigned int seed = 6;
	srand(seed);
	range = 1000000;

	for(i=0; i<N*N; i++){
		t[i] = ((float)rand() / (float)RAND_MAX)*range;
	}
}

#pragma acc routine seq
float sf(float f1, float f2, float f3, float f4, float f5,
	float f6, float f7, float f8, float f9, float f10,
	float f11, float f12, float f13, float f14, float f15,
	float f16, float f17, float f18, float f19, float f20,
	float f21, float f22, float f23, float f24, float f25, float sum)
{
	return (2.0f*f1 + 4.0f*f2 + 5.0f*f3 + 4.0f*f4 + 2.0f*f5 +
		4.0f*f6 + 9.0f*f7 + 12.0f*f8 + 9.0f*f9 + 4.0f*f10 +
		5.0f*f11 + 12.0f*f12 + 15.0f*f13 + 12.0f*f14 + 5.0f*f15 +
		4.0f*f16 + 9.0f*f17 + 12.0f*f18 + 9.0f*f19 + 4.0f*f20 +
		2.0f*f21 + 4.0f*f22 + 5.0f*f23 + 4.0f*f24 + 2.0f*f25)/sum;
}

I compiled it using this command:

pgcc -acc -Minfo -ta=nvidia OpenACC_Gauss.c

The output was:

main:
     35, Accelerator kernel generated
         Generating Tesla code
         36, #pragma acc loop gang /* blockIdx.x */
         38, #pragma acc loop vector(128) /* threadIdx.x */
     35, Generating implicit copyin(in[:16777216])
         Generating implicit copy(out[:16760828])
     38, Loop is parallelizable
sf:
     91, Generating acc routine seq
         Generating Tesla code

The program took 0.418s to run. When I commented out the pragmas and recompiled the sequential version, it took only 0.375s (hardware is an i3-3220 CPU and a GTX 1060 6GB graphics card).
I’m new to OpenACC, so any feedback on what mistakes I made would be greatly appreciated.

I don’t think you made any mistakes. The runtime of the OpenACC version is most likely dominated by copying the data to and from the GPU, which the implicit copyin/copy messages in your -Minfo output already hint at. A quick run under a profiler should show that to you. On Linux, for instance, just
% nvprof ./a.out
should give you the time spent copying the right-hand-side (in) array to the device, the kernel time, and the time spent copying the left-hand-side (out) array back.
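
If the copies do turn out to dominate, here is a minimal sketch (untested, reusing in, out, N and sf() from the code above) of one way to keep the transfers out of the section you time on the host: wrap the loop in an explicit data region, so the implicit copyin/copy from the -Minfo output moves to the data construct.

	//Drop-in replacement for the timed block in main(), after fill(in)
	int i, j;
	#pragma acc data copyin(in[0:N*N]) copyout(out[0:N*N])
	{
		//Transfers happen at the data region boundaries, so only the kernel is timed here.
		//Note that clock() measures CPU time; nvprof still gives the authoritative device-side numbers.
		clock_t start = clock();

		#pragma acc parallel loop
		for(i=2; i<N-2; i++){
			#pragma acc loop independent
			for(j=2; j<N-2; j++){
				out[(i-2)*N + j-2] = sf( in[(i-2)*N + j-2], in[(i-2)*N + j-1],
								in[(i-2)*N + j], in[(i-2)*N + j+1],
								in[(i-2)*N + j+2], in[(i-1)*N + j-2],
								in[(i-1)*N + j-1], in[(i-1)*N + j],
								in[(i-1)*N + j+1], in[(i-1)*N + j+2],
								in[(i)*N + j-2], in[(i)*N + j-1],
								in[(i)*N + j], in[(i)*N + j+1],
								in[(i)*N + j+2], in[(i+1)*N + j-2],
								in[(i+1)*N + j-1], in[(i+1)*N + j],
								in[(i+1)*N + j+1], in[(i+1)*N + j+2],
								in[(i+2)*N + j-2], in[(i+2)*N + j-1],
								in[(i+2)*N + j], in[(i+2)*N + j+1],
								in[(i+2)*N + j+2], 159.0f );
			}
		}

		clock_t end = clock();
		printf("Kernel-only runtime: %fs\n", (end-start)/(double)CLOCKS_PER_SEC);
	}

The copies still happen, of course; they just occur once at the edges of the data region, which is exactly what the copyin/copyout rows in the profiler output will show.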