Hello, I have written some code to demonstrate OpenACC-CUDA interoperability, but I am having problems compiling it. Could you please help me find the problem?

```
#include <stdio.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas.h>
#include <curand_kernel.h>
/* Element count shared by every vector in this example: it sizes the device
 * allocations in main() and bounds the loops/guards in vecAdd() and the kernels. */
static const int N=100;
/*
 * Element-wise vector addition offloaded with OpenACC: sum[i] = a[i] + b[i]
 * for i in [0, N).
 *
 * a, b  - input vectors of N floats
 * sum   - output vector of N floats
 *
 * All three arguments are treated as raw device addresses (the caller in this
 * file obtains them from cudaMalloc), which is why the directive uses
 * `deviceptr` rather than a data clause such as `present`/`copy`: the OpenACC
 * runtime has no present-table entry for memory it did not allocate itself.
 */
void vecAdd (float *restrict a, float *restrict b, float *restrict sum)
{
    /* No trailing ';' on the directive - it is not a C statement. */
    #pragma acc kernels loop deviceptr(a, b, sum)
    for (int i = 0; i < N; i++)
        sum[i] = a[i] + b[i];
}
/*
 * Initialise one cuRAND generator state per element.
 *
 * state - array of at least N curandState entries
 * seed  - seed shared by all threads; each thread uses its own global index
 *         as the sequence number passed to curand_init
 *
 * Launch layout: 1D grid of 1D blocks covering at least N threads; threads
 * whose global index is >= N do nothing.
 *
 * NOTE: this is device code (__global__); the compiler error quoted below the
 * listing is raised by the CUDA headers when they are seen by a compiler they
 * do not recognise, so this kernel has to be built with nvcc.
 */
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
    /* Use the real block width instead of a hard-coded 64 so the index stays
     * correct for whatever block size the launch actually uses. */
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < N) {
        /* arguments: seed, sequence, offset, state */
        curand_init(seed, id, 0, &state[id]);
    }
}
/*
 * Fill three arrays from one uniform random draw per element.
 *
 * globalState - per-element cuRAND states (read, advanced, written back)
 * array1      - receives r
 * array2      - receives (r + 5) / 123
 * array3      - receives r + 2
 * where r is the value returned by curand_uniform for that element.
 *
 * Launch layout: 1D grid of 1D blocks covering at least N threads; threads
 * whose global index is >= N do nothing.
 */
__global__ void generate( curandState* globalState, float * array1, float * array2, float * array3 )
{
    /* Global index (not just threadIdx.x) so multi-block launches touch
     * distinct elements, matching the indexing used in setup_kernel. */
    int ind = blockIdx.x * blockDim.x + threadIdx.x;
    if (ind < N) {
        /* Work on a local copy of the state and store it back afterwards. */
        curandState localState = globalState[ind];
        float r = curand_uniform(&localState);
        array1[ind] = r;
        array2[ind] = (r + 5.0f) / 123.0f;
        array3[ind] = r + 2.0f;
        globalState[ind] = localState;
    }
}
/* Report a failed CUDA runtime call on stderr; returns non-zero when err is cudaSuccess. */
static int cudaOk(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        return 0;
    }
    return 1;
}

/*
 * Driver for the OpenACC/CUDA interoperability example:
 *   1. allocate a, b, sum, vec and the RNG states on the device,
 *   2. seed the RNG states and fill a, b and vec with the CUDA kernels,
 *   3. compute sum = a + b through the OpenACC routine vecAdd,
 *   4. compute vec = 2*sum + vec with legacy cuBLAS SAXPY,
 *   5. copy sum back to the host and print it, one value per line.
 *
 * Returns 0 on success and 1 if any CUDA/cuBLAS call reports an error.
 */
int main()
{
    /* All locals are declared before the first `goto cleanup`. */
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;   /* ceil(N / threads) */
    const size_t bytes = N * sizeof(float);
    curandState *devStates = NULL;
    float *a = NULL, *b = NULL, *sum = NULL, *vec = NULL;
    float h_sum[N];                                    /* host copy of sum */
    cublasStatus blasStatus;
    int rc = 1;

    if (!cudaOk(cudaMalloc((void **)&a, bytes), "cudaMalloc(a)")) goto cleanup;
    if (!cudaOk(cudaMalloc((void **)&b, bytes), "cudaMalloc(b)")) goto cleanup;
    if (!cudaOk(cudaMalloc((void **)&sum, bytes), "cudaMalloc(sum)")) goto cleanup;
    if (!cudaOk(cudaMalloc((void **)&vec, bytes), "cudaMalloc(vec)")) goto cleanup;
    if (!cudaOk(cudaMalloc((void **)&devStates, N * sizeof(curandState)),
                "cudaMalloc(devStates)")) goto cleanup;

    /* The RNG kernel must receive the curandState buffer, not a float array. */
    setup_kernel <<< blocks, threads >>> (devStates, (unsigned long)time(NULL));
    if (!cudaOk(cudaGetLastError(), "setup_kernel launch")) goto cleanup;

    /* generate() fills three arrays; vec is initialised here because it is
     * later read as the y operand of SAXPY. */
    generate <<< blocks, threads >>> (devStates, a, b, vec);
    if (!cudaOk(cudaGetLastError(), "generate launch")) goto cleanup;

    /* Make sure the inputs are complete before the OpenACC region reads them. */
    if (!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) goto cleanup;

    /* a, b and sum are raw device pointers; vecAdd is called with them directly. */
    vecAdd (a, b, sum);

    if (cublasInit() != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasInit failed\n");
        goto cleanup;
    }
    /* Unit stride on both vectors: a stride of 2 over N elements would read
     * past the end of the N-element sum buffer. */
    cublasSaxpy(N, 2.0f, sum, 1, vec, 1);
    blasStatus = cublasGetError();
    cublasShutdown();
    if (blasStatus != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasSaxpy failed\n");
        goto cleanup;
    }

    /* Device memory cannot be dereferenced on the host; copy the result back
     * first (this blocking copy also waits for preceding device work). */
    if (!cudaOk(cudaMemcpy(h_sum, sum, bytes, cudaMemcpyDeviceToHost),
                "cudaMemcpy(sum)")) goto cleanup;

    for (int i = 0; i < N; i++)
        printf("%f\n", h_sum[i]);

    rc = 0;

cleanup:
    /* cudaFree accepts NULL, so this is safe after a partial allocation. */
    cudaFree(devStates);
    cudaFree(vec);
    cudaFree(sum);
    cudaFree(b);
    cudaFree(a);
    return rc;
}
```

I compile it with:

```
pgcc -acc -I/opt/pgi/linux86-64/2012/cuda/4.2/include -Minfo=accel -L /opt/pgi/linux86-64/2012/cuda/4.2/lib64 -lcurand -ta=nvidia interop.c
```

And I receive the following error:

```
PGC-F-0249-#error -- --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! --- (/opt/pgi/linux86-64/2012/cuda/4.2/include/host_defines.h: 128)
PGC/x86-64 Linux 12.10-0: compilation aborted
```

Thanks a lot!