After my kernel, there was a cudaThreadSynchronize function, and I was measuring the time taken after that. I instead measured the time before cudaThreadSynchronize was called, and it was very less… However, the program still takes a lot of time after that to execute completely in any case…
I am posting my code below:
[codebox]
#include<cuda.h>
#include<stdio.h>
#include<cutil.h>
#include<cutil_inline_runtime.h>
#define complex_double double2
#define ANG_RES 100
global void test_dev(complex_double *diag_terms)
{
int theta_count, phi_count;
int face_calc;
int face = 1;
int l,m,lp,mp,cnt,cntp;
int r_ind;
int tid;
int i,j,k;
double sph_x, sph_y, sph_z;
double face_x, face_y, face_z;
double dist_self;
double del_phi, del_theta;
double theta_self, phi_self;
float ii_self, jj_self, kk_self;
float cube_x, cube_y, cube_z;
tid = blockIdx.x * blockDim.x + threadIdx.x;
/* TIND_TO_VOX_XYZ(tid,i,j,k,info_stat.nX, info_stat.nY, info_stat.nZ);
i = i + info_stat.bounX;
j = j + info_stat.bounY;
k = k + info_stat.bounZ;
r_ind = i* (info_stat.nY + 2*info_stat.bounY )* (info_stat.nZ + 2*info_stat.bounZ) + j* (info_stat.nZ + 2*info_stat.bounZ) + k;
*/
for (ii_self = -4.5; ii_self <= 4.5; ii_self++) {
for (jj_self = -4.5; jj_self <= 4.5; jj_self++) {
for (kk_self = -4.5; kk_self <= 4.5; kk_self++) {
for ( theta_count = 0; theta_count < ANG_RES; theta_count++){
theta_self = theta_count * M_PI / ANG_RES;
for ( phi_count = 0; phi_count < ANG_RES; phi_count++){
phi_self = phi_count * 2.0*M_PI / ANG_RES;
sph_x = cos(phi_self) * sin(theta_self);
sph_y = sin(phi_self) * sin(theta_self);
sph_z = cos(theta_self);
}
}
}
}
}
}
int main(int argc, char **argv){
cudaSetDevice(3);
int size;
size = 125;
complex_double *diag_terms_dev;
cutilSafeCall(cudaMalloc(&diag_terms_dev,sizeof(complex_double)*size ));
cutilSafeCall(cudaMemset(diag_terms_dev,0, sizeof(complex_double)*size));
unsigned int timer=0;
cutCreateTimer(&timer);
cutResetTimer(timer);
cutStartTimer(timer);
test_dev<<<1,128>>>(diag_terms_dev);
cutStopTimer(timer);
printf(“Time taken on the kernel is %f ms \n”, timer);
cudaThreadSynchronize(); // This part takes a lot of time
cudaFree(diag_terms_dev);
return(0);
}
[/codebox]
I think it is still the kernel that is taking very long… Any inputs on why this is the case and how I could reduce this time ?