the difference between kernels and parallel instructions

Dear Mat,

I just want to know the difference with kernels and parallel ,so I worte a code as belows:

#include<stdio.h>
#include<stdlib.h>
#include<accel.h>
#include<time.h>
#define N 1024
int main()
{
    struct timeval start,end;
    double timeuse;
    int i;
    int n=N;
    int x[N],y[N];
    for(i=0;i<N;i++)
    {
        x[i]=rand()%10;
        y[i]=rand()%10;
    }
    gettimeofday(&start,NULL);
    #pragma acc kernels copy(x[0:1023],y[0:1023])
    {
        for(i=1;i<n-1;i++)
        {
            x[i]=0.5*y[i]+0.25*(y[i-1]+y[i+1]);
        }
        for(i=1;i<n-1;i++)
        {
            y[i]=0.5*x[i]+0.25*(x[i-1]+x[i+1]);
        }
    }
    gettimeofday(&end,NULL);
    timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
    timeuse /= 1000000;
    printf("kernels time used:%f\n",timeuse);

    gettimeofday(&start,NULL);
    #pragma acc parallel copy(x[0:1023],y[0:1023])
    {
        #pragma acc loop
        for(i=1;i<n-1;i++)
        {
            x[i]=0.5*y[i]+0.25*(y[i-1]+y[i+1]);
        }
        #pragma acc loop
        for(i=1;i<n-1;i++)
        {
            y[i]=0.5*x[i]+0.25*(x[i-1]+x[i+1]);
        }
    }
    gettimeofday(&end,NULL);
    timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
    timeuse /= 1000000;
    printf("parallel time used:%f\n",timeuse);



    return 0;
}

the result is as below:

main:
19, Generating copy(y[0:1023])
Generating copy(x[0:1023])
Generating compute capability 2.0 binary
21, Loop is parallelizable
Accelerator kernel generated
21, #pragma acc loop gang, vector(256) /* blockIdx.x threadIdx.x /
CC 2.0 : 10 registers; 4 shared, 44 constant, 0 local memory bytes
25, Loop is parallelizable
Accelerator kernel generated
25, #pragma acc loop gang, vector(256) /
blockIdx.x threadIdx.x /
CC 2.0 : 10 registers; 4 shared, 44 constant, 0 local memory bytes
36, Accelerator kernel generated
39, #pragma acc loop gang, vector(256) /
blockIdx.x threadIdx.x /
CC 2.0 : 12 registers; 0 shared, 48 constant, 0 local memory bytes
44, #pragma acc loop gang, vector(256) /
blockIdx.x threadIdx.x */
36, Generating copy(y[0:1023])
Generating copy(x[0:1023])
Generating compute capability 2.0 binary
39, Loop is parallelizable
44, Loop is parallelizable

for kernel instruction CC 2.0 : 10 registers; 4 shared, 44 constant, 0 local memory bytes
for parallel indstuction CC 2.0 : 12 registers; 0 shared, 48 constant, 0 local memory bytes
I still couldn’t figure out their difference.

Please kindly help me