After my kernel launch there was a call to cudaThreadSynchronize, and I was measuring the elapsed time after that call. I then measured the time before cudaThreadSynchronize was called instead, and it was very small… However, in either case the program still takes a long time after that point to finish executing…

I am posting my code below:

[codebox]

#include<cuda.h>

#include<stdio.h>

#include<cutil.h>

#include<cutil_inline_runtime.h>

#define complex_double double2

#define ANG_RES 100

/*
 * test_dev: timing-test kernel.
 *
 * Every thread independently walks a 10 x 10 x 10 lattice of offsets
 * (-4.5 .. 4.5 in steps of 1 along each axis) and, for each lattice point,
 * sweeps ANG_RES polar angles times ANG_RES azimuthal angles, evaluating the
 * unit-sphere direction (sph_x, sph_y, sph_z) in double precision.  That is
 * 1000 * ANG_RES * ANG_RES inner iterations per thread.
 *
 * diag_terms is accepted but neither read nor written here, and the computed
 * direction is not stored anywhere.
 *
 * NOTE(review): the direction depends only on theta_count and phi_count, not
 * on ii_self/jj_self/kk_self, so the same ANG_RES x ANG_RES values are
 * recomputed for every lattice point.
 *
 * NOTE(review): the host code in this listing launches 128 threads over a
 * 125-element buffer; any future write indexed by tid needs a bounds check.
 */
__global__ void test_dev(complex_double *diag_terms)
{
    int theta_count, phi_count;
    int tid;
    double sph_x, sph_y, sph_z;
    double theta_self, phi_self;
    float ii_self, jj_self, kk_self;

    /* Flat global thread index; currently only the disabled block below uses it. */
    tid = blockIdx.x * blockDim.x + threadIdx.x;

    /* Disabled voxel-index computation, kept from the original listing.
       It needs i, j, k, r_ind, TIND_TO_VOX_XYZ and info_stat, none of which
       are defined in this listing.

    TIND_TO_VOX_XYZ(tid,i,j,k,info_stat.nX, info_stat.nY, info_stat.nZ);

    i = i + info_stat.bounX;
    j = j + info_stat.bounY;
    k = k + info_stat.bounZ;
    r_ind = i* (info_stat.nY + 2*info_stat.bounY )* (info_stat.nZ + 2*info_stat.bounZ) + j* (info_stat.nZ + 2*info_stat.bounZ) + k;

    */

    /* Float literals keep the loop counters in single precision; -4.5 and 4.5
       are exactly representable, so each loop runs exactly 10 times. */
    for (ii_self = -4.5f; ii_self <= 4.5f; ii_self++) {
        for (jj_self = -4.5f; jj_self <= 4.5f; jj_self++) {
            for (kk_self = -4.5f; kk_self <= 4.5f; kk_self++) {
                for (theta_count = 0; theta_count < ANG_RES; theta_count++) {
                    theta_self = theta_count * M_PI / ANG_RES;

                    /* The theta-dependent terms do not change inside the phi
                       loop, so evaluate them once per theta step. */
                    const double sin_theta = sin(theta_self);
                    sph_z = cos(theta_self);

                    for (phi_count = 0; phi_count < ANG_RES; phi_count++) {
                        phi_self = phi_count * 2.0 * M_PI / ANG_RES;
                        sph_x = cos(phi_self) * sin_theta;
                        sph_y = sin(phi_self) * sin_theta;
                    }
                }
            }
        }
    }
}

/*
 * Host driver: allocates and zeroes a 125-element complex_double buffer on
 * device 3, launches test_dev with one block of 128 threads, and prints the
 * wall-clock time of the kernel in milliseconds.
 *
 * A kernel launch returns to the host before the kernel has finished, so the
 * timer must be stopped only after the device has been synchronized;
 * otherwise it measures launch overhead alone and the real kernel time is
 * spent inside the later synchronize call.
 */
int main(int argc, char **argv)
{
    const int size = 125;
    complex_double *diag_terms_dev;
    unsigned int timer = 0;

    /* Report a failure to select the device instead of silently continuing. */
    cutilSafeCall(cudaSetDevice(3));

    cutilSafeCall(cudaMalloc(&diag_terms_dev, sizeof(complex_double) * size));
    cutilSafeCall(cudaMemset(diag_terms_dev, 0, sizeof(complex_double) * size));

    cutCreateTimer(&timer);
    cutResetTimer(timer);
    cutStartTimer(timer);

    test_dev<<<1, 128>>>(diag_terms_dev);

    /* Catch launch-configuration errors, which the launch itself cannot return. */
    cutilSafeCall(cudaGetLastError());

    /* Block until the kernel has actually finished; this also surfaces any
       error raised while it was running.  Must happen before the timer stops. */
    cutilSafeCall(cudaThreadSynchronize());

    cutStopTimer(timer);

    /* timer is only a handle; the elapsed milliseconds come from cutGetTimerValue. */
    printf("Time taken on the kernel is %f ms \n", cutGetTimerValue(timer));

    cutDeleteTimer(timer);
    cutilSafeCall(cudaFree(diag_terms_dev));
    return 0;
}

[/codebox]

I think it is still the kernel itself that is taking very long… Any input on why this is the case, and on how I could reduce this time?