CUDA emudebug working but not debug

I’ve written this code and although I can’t find any bug it doesn’t give the correct answer for precalculated test in debug mode. If i run it in emudebug however, it does work.

/* DIVISOR KERNEL */

__global__ void divk(long long *NUM, long long *RES)

{

	long long DIV = (blockIdx.x * BLOCK_SIZE + threadIdx.x) * 8388608  + 3;

	long long LNUM = *NUM;

	long long additive;

	for (additive = 0; additive<8388608; additive++)

	{

  if (LNUM%(DIV+additive) == 0 && DIV+additive < LNUM)

  {

  	*RES = DIV+additive;

  }

	}

}

/* DIVISOR HOST */

long long div(long long NUM)

{

    long long hostReturn;

    long long *deviceReturn;

    long long *DNUM;

    long long NVALUE = 0;

    int grid_size = (sqrt((float)NUM) / BLOCK_SIZE) / 8388608 + 1;

    if (grid_size == 0) grid_size = (sqrt((float)NUM) / BLOCK_SIZE) / 8388608;

	if (NUM%2 == 0)

	{

  return 2;

	}

	

	cudaMalloc((void**)&deviceReturn, sizeof(long long));

	cudaMemcpy(deviceReturn, &NVALUE, sizeof(long long), cudaMemcpyHostToDevice);

	cudaMalloc((void**)&DNUM, sizeof(long long));

	cudaMemcpy(DNUM, &NUM, sizeof(long long), cudaMemcpyHostToDevice);

	divk<<<grid_size, BLOCK_SIZE>>>(DNUM, deviceReturn);

	cudaMemcpy(&hostReturn, deviceReturn, sizeof(long long), cudaMemcpyDeviceToHost);

	cudaFree(deviceReturn);

	cudaFree(DNUM);

	return hostReturn;

}

Can anybody tell me what my problem is?