Dear Mat,

I just want to understand the difference between the `kernels` and `parallel` constructs, so I wrote the code below:

```
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#include<sys/time.h>
#include<accel.h>
#define N 1024
/*
 * Returns the wall-clock time between two gettimeofday() samples, in seconds.
 * The subtraction is carried out in double so the microsecond scaling cannot
 * overflow an integer type.
 */
static double elapsed_seconds(const struct timeval *begin, const struct timeval *finish)
{
    return (double)(finish->tv_sec - begin->tv_sec)
         + (double)(finish->tv_usec - begin->tv_usec) / 1000000.0;
}

/*
 * Times the same two-pass 3-point smoothing of x and y, first inside an
 * OpenACC "kernels" region and then with OpenACC "parallel" regions, and
 * prints the wall-clock time of each variant.
 *
 * NOTE(review): the first accelerator region probably also pays one-time
 * device start-up cost, so the two timings may not be directly comparable --
 * confirm by timing a warm-up region first.
 */
int main(void)
{
    struct timeval start, end;
    double timeuse;
    int i;
    int n = N;
    int x[N], y[N];

    /* Fill both arrays with pseudo-random digits 0..9. */
    for (i = 0; i < N; i++) {
        x[i] = rand() % 10;
        y[i] = rand() % 10;
    }

    /* ---- Variant 1: kernels ------------------------------------------- */
    gettimeofday(&start, NULL);
    /*
     * OpenACC sub-arrays are written [start:length]. The loops below read
     * element n-1 (index i+1 with i == n-2), so all n elements must be
     * transferred, not n-1.
     */
    #pragma acc kernels copy(x[0:n], y[0:n])
    {
        /* The double result is truncated on assignment: x and y are int arrays. */
        for (i = 1; i < n - 1; i++) {
            x[i] = 0.5 * y[i] + 0.25 * (y[i - 1] + y[i + 1]);
        }
        for (i = 1; i < n - 1; i++) {
            y[i] = 0.5 * x[i] + 0.25 * (x[i - 1] + x[i + 1]);
        }
    }
    gettimeofday(&end, NULL);
    timeuse = elapsed_seconds(&start, &end);
    printf("kernels time used:%f\n", timeuse);

    /* ---- Variant 2: parallel ------------------------------------------ */
    gettimeofday(&start, NULL);
    /*
     * The second loop reads the x values written by the first one. Inside a
     * single "parallel" region there is no barrier between gangs from one
     * work-shared loop to the next, so each loop gets its own parallel region;
     * the end of the first region is the synchronisation point. The enclosing
     * data region keeps x and y on the device across both regions so they are
     * still transferred only once in each direction.
     */
    #pragma acc data copy(x[0:n], y[0:n])
    {
        #pragma acc parallel loop present(x[0:n], y[0:n])
        for (i = 1; i < n - 1; i++) {
            x[i] = 0.5 * y[i] + 0.25 * (y[i - 1] + y[i + 1]);
        }

        #pragma acc parallel loop present(x[0:n], y[0:n])
        for (i = 1; i < n - 1; i++) {
            y[i] = 0.5 * x[i] + 0.25 * (x[i - 1] + x[i + 1]);
        }
    }
    gettimeofday(&end, NULL);
    timeuse = elapsed_seconds(&start, &end);
    printf("parallel time used:%f\n", timeuse);

    return 0;
}
```

The result is as below:

main:

19, Generating copy(y[0:1023])

Generating copy(x[0:1023])

Generating compute capability 2.0 binary

21, Loop is parallelizable

Accelerator kernel generated

21, #pragma acc loop gang, vector(256) /* blockIdx.x threadIdx.x */
CC 2.0 : 10 registers; 4 shared, 44 constant, 0 local memory bytes
25, Loop is parallelizable
Accelerator kernel generated
25, #pragma acc loop gang, vector(256) /* blockIdx.x threadIdx.x */

CC 2.0 : 10 registers; 4 shared, 44 constant, 0 local memory bytes

36, Accelerator kernel generated

39, #pragma acc loop gang, vector(256) /* blockIdx.x threadIdx.x */

CC 2.0 : 10 registers; 4 shared, 44 constant, 0 local memory bytes

36, Accelerator kernel generated

39, #pragma acc loop gang, vector(256) /

*/*

CC 2.0 : 12 registers; 0 shared, 48 constant, 0 local memory bytes

44, #pragma acc loop gang, vector(256) /* blockIdx.x threadIdx.x */

CC 2.0 : 12 registers; 0 shared, 48 constant, 0 local memory bytes

44, #pragma acc loop gang, vector(256) /

36, Generating copy(y[0:1023])

Generating copy(x[0:1023])

Generating compute capability 2.0 binary

39, Loop is parallelizable

44, Loop is parallelizable

For the `kernels` construct: CC 2.0 : 10 registers; 4 shared, 44 constant, 0 local memory bytes

For the `parallel` construct: CC 2.0 : 12 registers; 0 shared, 48 constant, 0 local memory bytes

I still can't figure out the difference between them.

Could you please help me?