1080 does not support doubles?

Luxex · October 29, 2018, 3:46pm

hello,
I am trying to use nlopt with CUDA kernels.
since nlopt uses double precision variables, I has to parse arguments back and forth double to float between mey earlier float-based cud code and nlopt optimization.
I wanted to get rid of this performance hit, so I decided to change all relevant variables to doubles.

after doing that the code compiled, but kernel started returning 0 instead of proper results.

code for kernel

__global__ void mulKernel(int *processedList, int *processedCount, double *processedValues, const int *vox, const int *beam, const double *depos, const double *settings, const int *voxPerChunk, const int *chunksize, int *lastVoxel)
{
	int i = threadIdx.x;
	int bx = blockIdx.x;
	int startindex = (bx * 1024 + i);
	int current = 0;
	long voxelordernumber;

	long kUpperLimit = (startindex + 1)*(*chunksize);
	if (kUpperLimit > *lastVoxel)
	{
		kUpperLimit = *lastVoxel; //czy last Voxel może być < startindex?
	} 
	//#pragma unroll
	__syncthreads();
	for (size_t k = startindex * (*chunksize); k < kUpperLimit; k++)
	{
		for (size_t j = 0; j < *voxPerChunk; j++)
		{
			current = startindex * *voxPerChunk + j;
			if (processedList[current] == vox[k])
			{
				voxelordernumber = j;
				break;
			}
			if (processedList[current] == -1)
			{
				processedList[current] = vox[k];
				voxelordernumber = j;
				processedCount[startindex] = processedCount[startindex] + 1;
				break;
			}
		}
		processedValues[startindex* *voxPerChunk + voxelordernumber] += depos[k] * settings[beam[k]];
	}
	__syncthreads();
}

the important bit is

processedValues[startindex* *voxPerChunk + voxelordernumber] += depos[k] * settings[beam[k]];

processedValues =0
depos =/= 0
settings =1
the result of the operation however is 0.

I am using Visual with Nsight under CUDA 10 and NVidia GTX 1080
Please Help :-)

Luxex · October 29, 2018, 3:50pm

I tried to pin down the problem, so I used CUDA example code and changed all relevant variables to doubles

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(double *c, const double *a, const double *b, unsigned int size);

__global__ void addKernel(double *c, const double *a, const double *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

int main()
{
    const int arraySize = 5;
    const double a[arraySize] = { 1, 2, 3, 4, 5 };
    const double b[arraySize] = { 10, 20, 30, 40, 50 };
	double c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(double *c, const double *a, const double *b, unsigned int size)
{
	double *dev_a = 0;
	double *dev_b = 0;
	double *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(double), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    
    return cudaStatus;
}

and the code returns

{1,2,3,4,5} + {10,20,30,40,50} = {0,0,0,0,0}

so there is for sure something wrong with doubles under CUDA,
please not that compute capability of this card is >>1.3 (6.0 I believe)

Robert_Crovella · October 29, 2018, 4:06pm

using printf on a double quantity with the %d format specifier will give you undefined behavior.

This is a C/C++ coding error and has nothing to do with CUDA.

Regarding the question about the kernel, my suggestion would be to start by making sure you are using proper CUDA error checking, and run your compiled executable code from the command line with cuda-memcheck. If any errors are reported from either of those 2 activities, those should be tracked down first.

njuffa · October 29, 2018, 10:12pm

As for consumer cards based on the Pascal architecture, their throughput of double-precision operations is 1/32 of their throughput of single-precision operations, which comes out to about 260 GFLOPS for a GTX 1080. So don’t expect it to break any speed records for code that is strictly double precision floating-point computation. Depending on what CPU you are using and how you are coding for it, it might not even be faster than the CPU.

Luxex · October 30, 2018, 9:21am

cuda-memcheck returns no erros,
changing %d to %f made no improvement.
Do you want to see the full code? (~200 lines of relevant code out of 1200)

Robert_Crovella · October 30, 2018, 2:19pm

Then my suspicion would be that you have something fundamentally wrong with your setup. Here is your full test case, with the only change being %d → %f and it works fine for me:

$ cat t311.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(double *c, const double *a, const double *b, unsigned int size);

__global__ void addKernel(double *c, const double *a, const double *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

int main()
{
    const int arraySize = 5;
    const double a[arraySize] = { 1, 2, 3, 4, 5 };
    const double b[arraySize] = { 10, 20, 30, 40, 50 };
        double c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%f,%f,%f,%f,%f}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(double *c, const double *a, const double *b, unsigned int size)
{
        double *dev_a = 0;
        double *dev_b = 0;
        double *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(double), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
$ nvcc -o t311 t311.cu
$ ./t311
{1,2,3,4,5} + {10,20,30,40,50} = {11.000000,22.000000,33.000000,44.000000,55.000000}
$

Topic		Replies	Views
Cuda code performance CUDA Programming and Performance	14	3147	December 16, 2014
Error 719 (failure to launch) for JCUDA and PyCUDA; How to run GPU consecutive times for 'large' data blocks CUDA Programming and Performance	0	2331	December 13, 2016
cudaSynchronizeDevice() returns error code 6 CUDA Programming and Performance	7	8601	June 16, 2011
The kernel isn't working CUDA Programming and Performance	9	1050	January 19, 2011
cudaMemcpy Failing To Copy Variable From Device To Host Correctly CUDA Programming and Performance	3	2795	April 26, 2021
including cuda_16fp.h breaks Visual Studio 2015 compilation CUDA Setup and Installation	5	1603	September 15, 2017
Poor half performance CUDA Programming and Performance	9	2313	February 16, 2020
How to choose how many threads/blocks to have? CUDA Programming and Performance	43	52168	June 7, 2022
Getting started with CUDA ... cannot add simple vectors CUDA Programming and Performance	9	20930	January 31, 2011
This code doesn't work maybe too much threads assigned? CUDA Programming and Performance	8	1089	February 2, 2014

1080 does not support doubles?

Related topics