I’ve written this code and although I can’t find any bug it doesn’t give the correct answer for precalculated test in debug mode. If i run it in emudebug however, it does work.
/* DIVISOR KERNEL */
__global__ void divk(long long *NUM, long long *RES)
{
long long DIV = (blockIdx.x * BLOCK_SIZE + threadIdx.x) * 8388608 + 3;
long long LNUM = *NUM;
long long additive;
for (additive = 0; additive<8388608; additive++)
{
if (LNUM%(DIV+additive) == 0 && DIV+additive < LNUM)
{
*RES = DIV+additive;
}
}
}
/* DIVISOR HOST */
long long div(long long NUM)
{
long long hostReturn;
long long *deviceReturn;
long long *DNUM;
long long NVALUE = 0;
int grid_size = (sqrt((float)NUM) / BLOCK_SIZE) / 8388608 + 1;
if (grid_size == 0) grid_size = (sqrt((float)NUM) / BLOCK_SIZE) / 8388608;
if (NUM%2 == 0)
{
return 2;
}
cudaMalloc((void**)&deviceReturn, sizeof(long long));
cudaMemcpy(deviceReturn, &NVALUE, sizeof(long long), cudaMemcpyHostToDevice);
cudaMalloc((void**)&DNUM, sizeof(long long));
cudaMemcpy(DNUM, &NUM, sizeof(long long), cudaMemcpyHostToDevice);
divk<<<grid_size, BLOCK_SIZE>>>(DNUM, deviceReturn);
cudaMemcpy(&hostReturn, deviceReturn, sizeof(long long), cudaMemcpyDeviceToHost);
cudaFree(deviceReturn);
cudaFree(DNUM);
return hostReturn;
}
Can anybody tell me what my problem is?