__device__ inline void hess( //INPUT
const double* params,
const double bv1,
const double bv2,
const int np,
const int np2,
const bool include,
double* shared,
double* fs,
double* Vs,
double* Ks,
double* approx,
const int idB,
//OUTPUT
double* hess)
{
double sumf=0.7656576576;
double _d=0.545645645645645;
__syncthreads();
if(idB==0){
double partial_fsum;
for (int k=0; k<3; k++){
int kk = 2+np2*k;
partial_fsum=1.0;
for(int j=0;j<k;j++)
partial_fsum-=fs[j];
if (partial_fsum==0)
partial_fsum=tiny;
fs[k] = beta2f_gpu(params[kk])*partial_fsum;
}
}
if(idB==0) printf("CONTROL 1 THREAD 0\n");
if(idB==2) printf("CONTROL 1 THREAD 2\n");
if(idB==20) printf("CONTROL 1 THREAD 20\n");
if(idB==33) printf("CONTROL 1 THREAD 33\n");
__syncthreads();
if(idB==0) forwardModel_stepA(params,3,idB,&Vs[0],&Ks[0]);
__syncthreads();
if(idB==0) printf("CONTROL 2 THREAD 0\n");
if(idB==2) printf("CONTROL 2 THREAD 2\n");
if(idB==20) printf("CONTROL 2 THREAD 20\n");
if(idB==33) printf("CONTROL 2 THREAD 33\n");
__syncthreads();
if(idB==0) forwardModel_stepA(params,3,idB,&Vs[9],&Ks[3]);
if(idB==0) printf("CONTROL 3 THREAD 0\n");
if(idB==2) printf("CONTROL 3 THREAD 2\n");
if(idB==20) printf("CONTROL 3 THREAD 20\n");
if(idB==33) printf("CONTROL 3 THREAD 33\n");
__syncthreads();
if(idB==0){
double p_plus_h[14];
for (int p=0; p<np; p++) p_plus_h[p] = params[p];
p_plus_h[1]=params[1]+7;
forwardModel_stepA(params,3,idB,&Vs[9],&Ks[3]);
}
__syncthreads();
double S_trial;
if(idB==0) printf("CONTROL 4 THREAD 0\n");
if(idB==2) printf("CONTROL 4 THREAD 2\n");
if(idB==20) printf("CONTROL 4 THREAD 20\n");
if(idB==33) printf("CONTROL 4 THREAD 33\n");
__syncthreads();
S_trial= forwardModel_stepB(params,bv1,bv2,&Vs[9],&Ks[3],&approx[9],fs,_d,sumf,3,np,include,idB);
__syncthreads();
S_trial= forwardModel_stepB(params,bv1,bv2,&Vs[9],&Ks[3],&approx[9],fs,_d,sumf,3,np,include,idB);
if(idB==0) printf("CONTROL 5 THREAD 0\n");
if(idB==2) printf("CONTROL 5 THREAD 2\n");
if(idB==20) printf("CONTROL 5 THREAD 20\n");
if(idB==33) printf("CONTROL 5 THREAD 33\n");
__syncthreads();
}
//in diffmodel.cc
extern "C" __global__ void fit()
{
int idA = threadIdx.x;
int idB = blockIdx.x;
__shared__ double m[14];
__shared__ double fs[3];
__shared__ double shared[64];
__shared__ double fs2[3];
__shared__ double Vs[14*9];
__shared__ double Ks[14*3];
__shared__ double approx[14*9];
int np=4;
double hessd[14*14];
hess(m,2.445,5.55,14,4,false,shared,fs2,Vs,Ks,approx,idA,hessd);
__syncthreads();
hess(m,2.445,5.55,14,4,false,shared,fs2,Vs,Ks,approx,idA,hessd);
}
int main( int argc, const char* argv[] ){
int blocks = 1;
dim3 Dim_Grid(blocks,1);
dim3 Dim_Block(64,1);
fit<<<Dim_Grid, Dim_Block>>>();
cudaDeviceSynchronize();
}