Example MS VStudio code fails to run

robert.beattie · November 9, 2022, 4:00pm

Hello all,

I am new to CUDA and have just installed the following:
PCIe slot #1 is a GeForce RTX 3050 for my CAD work, connected to two monitors.
PCIe slot #2 is a Quadro P400 that I intend to play with.

The PC runs Windows 10 Pro. I installed CUDA 11.8.0 along with MS Visual Studio 2022.
This is the example code generated by VS, which I have expanded a little bit (such as printing out the GPU cards in my machine and some of their properties).
The example code, however, fails to allocate memory on either of the GPUs.

I am stuck as to why. Any pointers are gladly welcomed! Thanks, Bob.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise vector add. Each thread uses its threadIdx.x as the element
// index and stores a[idx] + b[idx] into c[idx].
// Only threadIdx.x is consulted and no bounds check is performed, so the
// caller must launch no more threads per block than the arrays have elements.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int idx = threadIdx.x;
    const int sum = a[idx] + b[idx];
    c[idx] = sum;
}

// Program entry point.
// Lists every CUDA device the runtime reports (index, name, compute
// capability, total global memory) on stderr, then adds two fixed 5-element
// vectors on the GPU through addWithCuda and prints the sums on stdout.
// Returns 0 on success and 1 if a required CUDA call fails.
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };
    // Start at 0 so the value is well defined even if the count query fails.
    int cuda_count = 0;

    cudaDeviceProp prop;
    cudaError_t cudaStatus = cudaGetDeviceCount(&cuda_count);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %d, %s\n",
            (int)cudaStatus, cudaGetErrorString(cudaStatus));
        return 1;
    }
    fprintf(stderr, "Qty of CUDA GPU installed = %d\n", cuda_count);
    for (int device = 0; device < cuda_count; device++) {
        cudaStatus = cudaGetDeviceProperties(&prop, device);
        if (cudaStatus != cudaSuccess) {
            // The listing is purely diagnostic: report the failure and move on
            // rather than printing the contents of an unfilled struct.
            fprintf(stderr, "cudaGetDeviceProperties failed for device %d: %d, %s\n",
                device, (int)cudaStatus, cudaGetErrorString(cudaStatus));
            continue;
        }
        fprintf(stderr, "Device number %d\n", device);
        fprintf(stderr, "Device name %s\n", prop.name);
        fprintf(stderr, "Compute %d.%d\n", prop.major, prop.minor);
        fprintf(stderr, "Device total memory %zuMB\n", prop.totalGlobalMem/(1024*1024));
    }

    // Add vectors in parallel.
    cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// Parameters:
//   c    - host output array, receives a[i] + b[i] for i in [0, size)
//   a, b - host input arrays of `size` ints each
//   size - number of elements; the kernel is launched as a single block of
//          `size` threads, so it must not exceed the device's per-block
//          thread limit (a larger value is reported as a launch failure)
//
// Returns cudaSuccess on success, otherwise the error code of the first CUDA
// call that failed (a message is also written to stderr). Device buffers are
// released on every path.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;

    cudaError_t cudaStatus;

    // A launch with zero threads per block is an invalid configuration, so an
    // empty vector is treated as a successful no-op instead of a launch error.
    if (size == 0) {
        return cudaSuccess;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(1);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed for dev_c !\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed for dev_a !\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed for dev_b !\n");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed host a -> GPU !\n");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed host b -> GPU !\n");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        // Report the readable error string as well as the numeric code.
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d (%s) after launching addKernel!\n",
            (int)cudaStatus, cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed GPU c -> host !\n");
        goto Error;
    }

Error:
    // Reached on success as well as on failure; pointers that were never
    // allocated are still 0 here.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

When compiled and executed, outputs :-

Qty of CUDA GPU installed = 2
Device number 0
Device name NVIDIA GeForce RTX 3050
Compute 8.6
Device total memory 8191MB
Device number 1
Device name Quadro P400
Compute 6.1
Device total memory 2047MB
cudaMalloc failed for dev_c !
addWithCuda failed!

robert.beattie · November 9, 2022, 4:54pm

Since my first post, I have downloaded the example CUDA samples from GitHub and they work OK.

:?

Robert_Crovella · November 10, 2022, 3:10am

There is nothing obviously wrong with that code that I can see.

What sort of output do you get if you modify this line:

    fprintf(stderr, "cudaMalloc failed for dev_c !\n");

to this:

    fprintf(stderr, "cudaMalloc failed for dev_c: %d, %s\n", (int)cudaStatus, cudaGetErrorString(cudaStatus));

robert.beattie · November 10, 2022, 10:11am

cudaMalloc failed for dev_c ! 222, the provided PTX was compiled with an unsupported toolchain.

Robert_Crovella · November 10, 2022, 4:21pm

So you have a mismatch between the GPU driver you have installed, and the CUDA toolkit you are trying to use. The usual suggestion here is to update to the latest driver available.

robert.beattie · November 10, 2022, 6:18pm

Updated the GPU driver from 516.94 to 526.86 and all is well.
Many thanks,
Bob.

Qty of CUDA GPU installed = 2
Device number 0
Device name NVIDIA GeForce RTX 3050
Compute 8.6
Device total memory 8191MB
Device number 1
Device name Quadro P400
Compute 6.1
Device total memory 2047MB
{1,2,3,4,5} + {10,20,30,40,50} = {11,22,33,44,55}

Topic		Replies	Views
including cuda_16fp.h breaks Visual Studio 2015 compilation CUDA Setup and Installation	5	1604	September 15, 2017
cudaMallocManaged() not working CUDA Programming and Performance	1	2354	November 18, 2018
cudaMemcpy Failing To Copy Variable From Device To Host Correctly CUDA Programming and Performance	3	2831	April 26, 2021
deviceQuery passes but other demos fail CUDA Programming and Performance	7	2516	January 22, 2009
Unable to run several CUDA samples. CUDA Programming and Performance	2	824	April 1, 2019
Error 719 (failure to launch) for JCUDA and PyCUDA; How to run GPU consecutive times for 'large' data blocks CUDA Programming and Performance	0	2333	December 13, 2016
I am new to cuda programming. In this code, c matric return by GPU is Zero matrix. I tried different... CUDA Programming and Performance	0	445	July 3, 2018
cuda Error at memory location CUDA Programming and Performance	1	2583	April 5, 2012
Cuda cannot find my graphic card? CUDA Setup and Installation	5	2412	April 9, 2019
GeForce 335M + Visual Studio 2012 CUDA Setup and Installation	7	1411	November 30, 2015

Example MS VStudio code fails to run

Related topics