Hello,

I am still new to CUDA, and I have a kernel that takes up most of my computing time on the GPU. The NVIDIA Profiler reports 33% occupancy with a register count of 50.

I can see why the register count is so high, but I have no idea how to reduce the number of registers used. Is there anything I can do?

```
// Computes the fields F and G from the fields U and V on a 2D grid using
// finite-difference stencils (second differences plus fabs-weighted
// difference terms blended in by alpha).
//
// Layout: every array is indexed as A[i + j*offset], i.e. `offset` is the
// row stride. Each thread handles one (i, j) with i, j starting at 1.
//
// Parameters:
//   U, V        input fields (read only by this kernel)
//   F, G        output fields
//   imax, jmax  largest i / j index handled by this kernel
//   offset      row stride of all four arrays, in elements
//   delx, dely  grid spacings used as divisors in the stencils
//   delt        factor applied to the bracketed update term
//   alpha       weight of the fabs-weighted difference terms
//   Re          1.0/Re scales the sum of the second differences
//   GY, GX      constants added to the G / F update terms respectively
//
// Behaviour per thread:
//   i <  imax : F[i,j] is computed from the stencil
//   i == imax : F[0,j] and F[imax,j] are copied from U
//   j <  jmax : G[i,j] is computed from the stencil
//   j == jmax : G[i,0] and G[i,jmax] are copied from V
//
// Preconditions:
//   - The stencils read neighbours at i-1, i+1, j-1, j+1, so all arrays must
//     be addressable for i in [0, imax+1] and j in [0, jmax+1].
//   - Assumes the launch uses BLOCK_SIZE x BLOCK_SIZE thread blocks, since the
//     index is built from BLOCK_SIZE rather than blockDim — TODO confirm at
//     the call site.
//   - NOTE(review): U/V are presumably distinct from F/G; if so, declaring the
//     inputs `const double* __restrict__` would let the compiler reuse loads
//     across the two halves of the kernel.
__global__ void cuda_COMP_FG(double *U, double *V, double *F, double* G,int imax, int jmax, int offset, double delx,double dely, double delt, double alpha, double Re, double GY, double GX ){
    int i = blockIdx.x * BLOCK_SIZE + threadIdx.x+1;
    int j = blockIdx.y * BLOCK_SIZE + threadIdx.y+1;

    // Threads of a grid that overshoots the domain must not touch memory:
    // without this guard they would run the stencils / boundary copies on
    // rows and columns beyond imax / jmax.
    if (i > imax || j > jmax) {
        return;
    }

    // Flat index of the centre cell; neighbours are idx +/- 1 (i direction)
    // and idx +/- offset (j direction).
    const int idx = i + j*offset;

    if (i < imax) {
        // Load every stencil operand once instead of re-indexing it in
        // each sub-expression.
        const double uC  = U[idx];
        const double uW  = U[idx - 1];
        const double uE  = U[idx + 1];
        const double uN  = U[idx + offset];
        const double uS  = U[idx - offset];
        const double vC  = V[idx];
        const double vE  = V[idx + 1];
        const double vS  = V[idx - offset];
        const double vSE = V[idx - offset + 1];

        // Second differences of U in i and j.
        double uu_xx = -2.0*uC;
        uu_xx += uW;
        uu_xx += uE;
        uu_xx /= delx*delx;

        double uu_yy = -2.0*uC;
        uu_yy += uN + uS;
        uu_yy /= dely*dely;

        // Pairwise sums that appear both squared / multiplied and inside fabs.
        const double uCE  = uC + uE;
        const double uWC  = uW + uC;
        const double vCE  = vC + vE;
        const double vSSE = vS + vSE;

        const double uu_x =
            (uCE*uCE*0.25 - uWC*uWC*0.25)/delx
            + (alpha/delx)*((fabs(uCE)*(uC - uE)*0.25) - fabs(uWC)*(uW - uC)*0.25);

        const double uv_y =
            (vCE*(uC + uN)*0.25 - vSSE*(uS + uC)*0.25)/dely
            + (alpha/dely)*(fabs(vCE)*(uC - uN)*0.25 - fabs(vSSE)*(uS - uC)*0.25);

        F[idx] = uC + delt*((1.0/Re)*(uu_xx + uu_yy) - uu_x - uv_y + GX);
    }
    else {
        // i == imax: copy U to F on the first and last column of this row.
        F[j*offset] = U[j*offset];
        F[imax + j*offset] = U[imax + j*offset];
    }

    if (j < jmax) {
        // Operands are loaded after the F store above, exactly as in the
        // straightforward formulation, so results do not depend on whether
        // the compiler can prove F does not alias U or V.
        const double vC  = V[idx];
        const double vE  = V[idx + 1];
        const double vW  = V[idx - 1];
        const double vN  = V[idx + offset];
        const double vS  = V[idx - offset];
        const double uC  = U[idx];
        const double uN  = U[idx + offset];
        const double uW  = U[idx - 1];
        const double uNW = U[idx + offset - 1];

        // Second differences of V in i and j.
        double vv_xx = -2.0*vC;
        vv_xx += vE;
        vv_xx += vW;
        vv_xx /= delx*delx;

        double vv_yy = -2.0*vC;
        vv_yy += vN;
        vv_yy += vS;
        vv_yy /= dely*dely;

        const double vCN  = vC + vN;
        const double vSC  = vS + vC;
        const double uCN  = uC + uN;
        const double uWNW = uW + uNW;

        double vv_y = (vCN*vCN*0.25 - vSC*vSC*0.25)/dely;
        vv_y += (alpha/dely)*(fabs(vCN)*(vC - vN)*0.25 - fabs(vSC)*(vS - vC)*0.25);

        const double uv_x =
            (uCN*(vC + vE)*0.25 - uWNW*(vW + vC)*0.25)/delx
            + (alpha/delx)*(fabs(uCN)*(vC - vE)*0.25 - fabs(uWNW)*(vW - vC)*0.25);

        G[idx] = vC + delt*((1.0/Re)*(vv_xx + vv_yy) - uv_x - vv_y + GY);
    }
    else {
        // j == jmax: copy V to G on the first and last row of this column.
        G[i] = V[i];
        G[i + jmax*offset] = V[i + jmax*offset];
    }
}
```