Dear Mat,
I just want to understand the difference between the kernels and parallel constructs, so I wrote the code below:
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#include<sys/time.h>
#include<accel.h>
#define N 1024
/* Returns the wall-clock seconds between two gettimeofday() samples.
 * The arithmetic is done in double so the seconds-to-microseconds scaling
 * cannot overflow an integer type. */
static double elapsed_seconds(const struct timeval *begin, const struct timeval *finish)
{
    return (double)(finish->tv_sec - begin->tv_sec)
         + (double)(finish->tv_usec - begin->tv_usec) / 1000000.0;
}

/*
 * Runs the same pair of 3-point stencil sweeps twice -- once inside an
 * OpenACC `kernels` region and once inside a `parallel` region -- and prints
 * the wall-clock time of each region (including its copy-in/copy-out).
 *
 * Returns 0.
 */
int main(void)
{
    struct timeval start, end;
    int i;
    int n = N;
    int x[N], y[N];

    /* Fill both arrays with pseudo-random digits 0..9. */
    for (i = 0; i < N; i++)
    {
        x[i] = rand() % 10;
        y[i] = rand() % 10;
    }

    /* NOTE(review): the first accelerator region executed by a process
     * presumably also pays one-time device start-up cost, which would make
     * the `kernels` timing look worse than `parallel` -- confirm, and consider
     * initialising the device before the first timer starts. */
    gettimeofday(&start, NULL);
    /* Array sections are [start:length]. The loops below read index n-1
     * (via y[i+1] / x[i+1] at i == n-2), so all N elements must be copied;
     * the former [0:1023] left the last element out of the transfer. */
#pragma acc kernels copy(x[0:N],y[0:N])
    {
        /* x and y are int, so the double-valued stencil result is truncated
         * on assignment. */
        for (i = 1; i < n - 1; i++)
        {
            x[i] = 0.5 * y[i] + 0.25 * (y[i - 1] + y[i + 1]);
        }
        for (i = 1; i < n - 1; i++)
        {
            y[i] = 0.5 * x[i] + 0.25 * (x[i - 1] + x[i + 1]);
        }
    }
    gettimeofday(&end, NULL);
    printf("kernels time used:%f\n", elapsed_seconds(&start, &end));

    gettimeofday(&start, NULL);
    /* Same two sweeps, this time with explicit `loop` directives inside a
     * single `parallel` region. Full-length sections for the same reason
     * as above. */
#pragma acc parallel copy(x[0:N],y[0:N])
    {
#pragma acc loop
        for (i = 1; i < n - 1; i++)
        {
            x[i] = 0.5 * y[i] + 0.25 * (y[i - 1] + y[i + 1]);
        }
#pragma acc loop
        for (i = 1; i < n - 1; i++)
        {
            y[i] = 0.5 * x[i] + 0.25 * (x[i - 1] + x[i + 1]);
        }
    }
    gettimeofday(&end, NULL);
    printf("parallel time used:%f\n", elapsed_seconds(&start, &end));

    return 0;
}
The compiler feedback is shown below:
main:
19, Generating copy(y[0:1023])
Generating copy(x[0:1023])
Generating compute capability 2.0 binary
21, Loop is parallelizable
Accelerator kernel generated
21, #pragma acc loop gang, vector(256) /* blockIdx.x threadIdx.x */
CC 2.0 : 10 registers; 4 shared, 44 constant, 0 local memory bytes
25, Loop is parallelizable
Accelerator kernel generated
25, #pragma acc loop gang, vector(256) /* blockIdx.x threadIdx.x */
CC 2.0 : 10 registers; 4 shared, 44 constant, 0 local memory bytes
36, Accelerator kernel generated
39, #pragma acc loop gang, vector(256) /* blockIdx.x threadIdx.x */
CC 2.0 : 12 registers; 0 shared, 48 constant, 0 local memory bytes
44, #pragma acc loop gang, vector(256) /* blockIdx.x threadIdx.x */
36, Generating copy(y[0:1023])
Generating copy(x[0:1023])
Generating compute capability 2.0 binary
39, Loop is parallelizable
44, Loop is parallelizable
For the kernels construct: CC 2.0 : 10 registers; 4 shared, 44 constant, 0 local memory bytes
For the parallel construct: CC 2.0 : 12 registers; 0 shared, 48 constant, 0 local memory bytes
I still can't figure out the difference between them.
Please kindly help me.