I have a CUDA file, kernel.cu, and a C++ file, program.cpp. Here is the relevant portion of the code:
kernel.cu:
/**
 * Runs the `krak` kernel over a BLOBSIZE-element buffer and writes the
 * result back into the caller's buffer.
 *
 * Steps: allocate a device buffer, copy `buff` to it, launch
 * `krak<<<blocks,threads>>>`, zero `buff` on the host, then copy the device
 * buffer back into `buff`. The device buffer is released on every exit path.
 *
 * `BLOBSIZE`, `blocks`, `threads` and `krak` are defined elsewhere in the
 * project.
 *
 * @param buff  Host buffer of BLOBSIZE 64-bit words; read as input and
 *              overwritten with the output.
 * @return 0x00 on success; 0x01 if any CUDA call fails (the CUDA error name
 *         is printed to stdout as "error: <name>").
 */
int8_t compute(uint64_t *buff){
    /* Every local is declared before the first `goto` so that no jump
       bypasses an initialization. */
    const size_t nbytes = BLOBSIZE*sizeof(ulong);
    ulong *d_buf = NULL;      /* NULL so cleanup may call cudaFree unconditionally */
    int8_t status = 0x01;     /* pessimistic; flipped to 0x00 only on full success */
    cudaError_t rtnval;
    cudaError_t freeval;

    rtnval = cudaMalloc((void**)&d_buf, nbytes);
    if(rtnval!=cudaSuccess){
        goto label;
    }

    rtnval = cudaMemcpy(d_buf, (ulong*) buff, nbytes, cudaMemcpyHostToDevice);
    if(rtnval!=cudaSuccess){
        goto label;
    }

    krak<<<blocks,threads>>>(d_buf); //kernel

    /* Host-side clear of the output buffer; the launch above does not block
       the host, so this can proceed while the kernel is running. */
    memset(buff, 0x00, sizeof(uint64_t)*BLOBSIZE);

    /* A kernel launch returns no status of its own: a bad launch
       configuration is only visible through cudaGetLastError(). */
    rtnval = cudaGetLastError();
    if(rtnval!=cudaSuccess){
        goto label;
    }

    /* Blocking copy: waits for the kernel to finish and also reports any
       fault raised while it was executing. */
    rtnval = cudaMemcpy(buff, (uint64_t*) d_buf, nbytes, cudaMemcpyDeviceToHost);
    if(rtnval!=cudaSuccess){
        goto label;
    }

    status = 0x00;

label:
    if(rtnval!=cudaSuccess){
        printf("error: %s\n", cudaGetErrorName(rtnval));
    }

    /* Release the device allocation on success and failure alike;
       cudaFree(NULL) is a no-op, so this is safe when cudaMalloc failed. */
    freeval = cudaFree(d_buf);
    if(freeval!=cudaSuccess && rtnval==cudaSuccess){
        /* Results are already in `buff`; report the failed release without
           changing the return code. */
        printf("error: %s\n", cudaGetErrorName(freeval));
    }

    return status;
}
When I call the compute() function in kernel.cu from the main() function in program.cpp, it takes n seconds to complete.
When I instead write a main() function inside kernel.cu itself and call compute() directly from there, it takes m seconds to complete.
Unexpectedly, n turns out to be far greater than m: n is about 25 seconds, while m is about 0.18 seconds.
What could be the reason for this?