I am trying to use dynamic allocate shared memory for variable “etan” like this:
__global__ void PT1(double* dvxdx, double* dvydy, double* dvxdy, double* dvydx, double* vx, double* vy, double* alpha, double* beta, int* index, double* kvx, double* kvy, double* etan, double* Helem, double* areas, bool* isice, double* Eta_nbe, double* rheology_B, double n_glen, double eta_0, double rele,int nbe){
extern __shared__ double ss[];
int ii = threadIdx.x;
int ix = threadIdx.x+blockIdx.x * blockDim.x;
ss[ii] = etan[nbe-ii-1];
__syncthreads();
etan[ii] = ss[nbe-ii-1];
for(int ix = threadIdx.x+blockIdx.x * blockDim.x; ix<nbe; ix += blockDim.x * gridDim.x) {
if (isice[ix]) ss[ix] = min(exp(rele*log(eta_it) + (1-rele)*log(ss[ix])),eta_0*1e5);
Eta_nbe[ix] = ss[ix]*areas[ix];
}
}
int main()
{
double *d_etan;
cudaMalloc((void**)&d_etan, nbe*sizeof(double));
cudaMemcpy(d_etan, etan, nbe*sizeof(double), cudaMemcpyHostToDevice);
PT1<<<gride, blocke,nbe*sizeof(double)>>>(dvxdx, dvydy, dvxdy, dvydx, d_vx, d_vy, d_alpha, d_beta, d_index, kvx, kvy, d_etan, d_Helem, d_areas, d_isice, Eta_nbe, d_rheology_B, n_glen, eta_0, rele, nbe); cudaDeviceSynchronize();
}
but we face ERROR launching GPU C-CUDA program: invalid argument
and the result is not the same as the original code without shared memory.