1080 does not support doubles?

hello,
I am trying to use nlopt with CUDA kernels.
since nlopt uses double precision variables, I has to parse arguments back and forth double to float between mey earlier float-based cud code and nlopt optimization.
I wanted to get rid of this performance hit, so I decided to change all relevant variables to doubles.

after doing that the code compiled, but kernel started returning 0 instead of proper results.

code for kernel

__global__ void mulKernel(int *processedList, int *processedCount, double *processedValues, const int *vox, const int *beam, const double *depos, const double *settings, const int *voxPerChunk, const int *chunksize, int *lastVoxel)
{
	int i = threadIdx.x;
	int bx = blockIdx.x;
	int startindex = (bx * 1024 + i);
	int current = 0;
	long voxelordernumber;

	long kUpperLimit = (startindex + 1)*(*chunksize);
	if (kUpperLimit > *lastVoxel)
	{
		kUpperLimit = *lastVoxel; //czy last Voxel może być < startindex?
	} 
	//#pragma unroll
	__syncthreads();
	for (size_t k = startindex * (*chunksize); k < kUpperLimit; k++)
	{
		for (size_t j = 0; j < *voxPerChunk; j++)
		{
			current = startindex * *voxPerChunk + j;
			if (processedList[current] == vox[k])
			{
				voxelordernumber = j;
				break;
			}
			if (processedList[current] == -1)
			{
				processedList[current] = vox[k];
				voxelordernumber = j;
				processedCount[startindex] = processedCount[startindex] + 1;
				break;
			}
		}
		processedValues[startindex* *voxPerChunk + voxelordernumber] += depos[k] * settings[beam[k]];
	}
	__syncthreads();
}

the important bit is

processedValues[startindex* *voxPerChunk + voxelordernumber] += depos[k] * settings[beam[k]];

processedValues =0
depos =/= 0
settings =1
the result of the operation however is 0.

I am using Visual with Nsight under CUDA 10 and NVidia GTX 1080
Please Help :-)

I tried to pin down the problem, so I used CUDA example code and changed all relevant variables to doubles

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(double *c, const double *a, const double *b, unsigned int size);

__global__ void addKernel(double *c, const double *a, const double *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

int main()
{
    const int arraySize = 5;
    const double a[arraySize] = { 1, 2, 3, 4, 5 };
    const double b[arraySize] = { 10, 20, 30, 40, 50 };
	double c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(double *c, const double *a, const double *b, unsigned int size)
{
	double *dev_a = 0;
	double *dev_b = 0;
	double *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(double), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    
    return cudaStatus;
}

and the code returns

{1,2,3,4,5} + {10,20,30,40,50} = {0,0,0,0,0}

so there is for sure something wrong with doubles under CUDA,
please not that compute capability of this card is >>1.3 (6.0 I believe)

using printf on a double quantity with the %d format specifier will give you undefined behavior.

This is a C/C++ coding error and has nothing to do with CUDA.

Regarding the question about the kernel, my suggestion would be to start by making sure you are using proper CUDA error checking, and run your compiled executable code from the command line with cuda-memcheck. If any errors are reported from either of those 2 activities, those should be tracked down first.

As for consumer cards based on the Pascal architecture, their throughput of double-precision operations is 1/32 of their throughput of single-precision operations, which comes out to about 260 GFLOPS for a GTX 1080. So don’t expect it to break any speed records for code that is strictly double precision floating-point computation. Depending on what CPU you are using and how you are coding for it, it might not even be faster than the CPU.

cuda-memcheck returns no erros,
changing %d to %f made no improvement.
Do you want to see the full code? (~200 lines of relevant code out of 1200)

Then my suspicion would be that you have something fundamentally wrong with your setup. Here is your full test case, with the only change being %d -> %f and it works fine for me:

$ cat t311.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(double *c, const double *a, const double *b, unsigned int size);

__global__ void addKernel(double *c, const double *a, const double *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

int main()
{
    const int arraySize = 5;
    const double a[arraySize] = { 1, 2, 3, 4, 5 };
    const double b[arraySize] = { 10, 20, 30, 40, 50 };
        double c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%f,%f,%f,%f,%f}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(double *c, const double *a, const double *b, unsigned int size)
{
        double *dev_a = 0;
        double *dev_b = 0;
        double *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(double), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
$ nvcc -o t311 t311.cu
$ ./t311
{1,2,3,4,5} + {10,20,30,40,50} = {11.000000,22.000000,33.000000,44.000000,55.000000}
$