I think I have found a bug in the compiler. When the following code is compiled with nvcc on arch sm_52 the bug occurres.
This is the code used to produce the bug.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuComplex.h>
#include <stdio.h>
#include <cstdlib>
__global__ void generate(cuDoubleComplex *data, int N, int width, int height)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i > N) return;
int x = i % width;
int y = i / width;
int value = 255 - x * 10;
printf("value = 255 - %i * 10 = %d\n", x, value);
printf("%i > 255 = %i\n", value, value > 255);
printf("%i == 255 = %i\n", value, value == 255);
printf("%i < 0 = %i\n", value, value < 0);
while (value > 255)
value = value - 255;
while (value < 0)
value = value + 255;
printf("value = %i\n", value);
auto complex = make_cuDoubleComplex(value, 0);
data[y * width + x] = complex;
printf("[%i, %i] = [%f, %f]\n\n", x, y, complex.x, complex.y);
}
int main()
{
int width = 1;
int height = 1;
int N = width * height;
cuDoubleComplex *result;
cudaMallocHost(&result, N * sizeof(cuDoubleComplex));
for (int i = 0; i < 5; i++)
{
printf("\n\nLoop %i\n\n", i);
generate<<<width, height>>>(result, N, width, height);
cudaDeviceSynchronize();
}
cudaDeviceReset();
system("pause");
return 0;
}
The command I used to compile it is:
nvcc -arch sm_52 .\kernel.cu -o ./kernel.exe
and then execute the kernel I get the following output
Loop 0
value = 255 - 0 * 10 = 255
255 > 255 = 0
255 == 255 = 1
255 < 0 = 0
value = 255
[0, 0] = [0.000000, 0.000000]
Loop 1
value = 255 - 0 * 10 = 255
255 > 255 = 0
255 == 255 = 1
255 < 0 = 0
value = 255
[0, 0] = [0.000000, 0.000000]
Loop 2
value = 255 - 0 * 10 = 255
255 > 255 = 0
255 == 255 = 1
255 < 0 = 0
value = 1065353471
[0, 0] = [0.000000, 0.000000]
Loop 3
value = 255 - 0 * 10 = 255
255 > 255 = 0
255 == 255 = 1
255 < 0 = 0
value = 255
[0, 0] = [0.000000, 0.000000]
Loop 4
value = 255 - 0 * 10 = 255
255 > 255 = 0
255 == 255 = 1
255 < 0 = 0
value = 255
[0, 0] = [0.000000, 0.000000]
if I compile the exact same code but with the flag sm_20
nvcc -arch sm_20 .\kernel.cu -o ./kernel.exe
and run the code I get the following output.
Loop 0
value = 255 - 0 * 10 = 255
255 > 255 = 0
255 == 255 = 1
255 < 0 = 0
value = 255
[0, 0] = [255.000000, 0.000000]
Loop 1
value = 255 - 0 * 10 = 255
255 > 255 = 0
255 == 255 = 1
255 < 0 = 0
value = 255
[0, 0] = [255.000000, 0.000000]
Loop 2
value = 255 - 0 * 10 = 255
255 > 255 = 0
255 == 255 = 1
255 < 0 = 0
value = 255
[0, 0] = [255.000000, 0.000000]
Loop 3
value = 255 - 0 * 10 = 255
255 > 255 = 0
255 == 255 = 1
255 < 0 = 0
value = 255
[0, 0] = [255.000000, 0.000000]
Loop 4
value = 255 - 0 * 10 = 255
255 > 255 = 0
255 == 255 = 1
255 < 0 = 0
value = 255
[0, 0] = [255.000000, 0.000000]
Notice the difference between the following lines of the correct and incorrect program output.
Incorrect:
value = 1065353471
[0, 0] = [0.000000, 0.000000]
Correct:
value = 255
[0, 0] = [255.000000, 0.000000]
The code produces different values for the variable “value” but the complex created by the function “make_cuDoubleComplex” always seems to contain 0,0 when compiled with sm_52.
When the width and height are increased to 2 you will also notice that it only occurres when the variable value is 255. You will get an output like this for a single loop.
value = 255 - 0 * 10 = 255
value = 255 - 1 * 10 = 245
value = 255 - 0 * 10 = 255
value = 255 - 1 * 10 = 245
255 > 255 = 0
245 > 255 = 0
255 > 255 = 0
245 > 255 = 0
255 == 255 = 1
245 == 255 = 0
255 == 255 = 1
245 == 255 = 0
255 < 0 = 0
245 < 0 = 0
255 < 0 = 0
245 < 0 = 0
value = 255
value = 245
value = 1065353471
value = 245
[0, 0] = [0.000000, 0.000000]
[1, 0] = [245.000000, 0.000000]
[0, 1] = [0.000000, 0.000000]
[1, 1] = [245.000000, 0.000000]
ver
OS: Microsoft Windows [Version 10.0.16226.1000]
nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2016 NVIDIA Corporation
Built on Mon_Jan__9_17:32:33_CST_2017
Cuda compilation tools, release 8.0, V8.0.60