Example MS VStudio code fails to run

Hello all,

I am new to CUDA and have just installed the following:
PCIe slot #1 is an RTX 3050 for my CAD work, connected to two monitors.
PCIe slot #2 is a Quadro P400 that I intend to play with.

The PC runs Win10 Pro. I installed CUDA 11.8.0 along with MS Visual Studio 2022.
This is the example code generated by VS, which I have expanded a little bit (such as printing out the GPU cards in my machine and some of their properties).
The example code, however, fails to allocate memory on either of the GPUs.

I am stuck as to why. Any pointers gladly welcomed! Thanks, Bob.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise vector addition kernel: each thread stores a[t] + b[t] into
// c[t], where t is the thread's x-index within its block.
//
// Only threadIdx.x is used for indexing, so the kernel is written for a
// single-block launch. There is no bounds check: the launch must not use more
// threads per block than the three arrays have elements.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int tid = threadIdx.x;
    const int sum = a[tid] + b[tid];
    c[tid] = sum;
}

// Program entry point: lists the CUDA devices visible to the runtime (name,
// compute capability, total memory), then adds two fixed 5-element vectors on
// the GPU via addWithCuda() and prints the result.
//
// Returns 0 on success, 1 if any of the checked CUDA calls fails.
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };
    int cuda_count = 0;

    cudaDeviceProp prop;

    // Check the device queries: an unchecked failure here would leave
    // cuda_count/prop unset and would also be picked up later by
    // cudaGetLastError() and misreported as a kernel launch failure.
    cudaError_t cudaStatus = cudaGetDeviceCount(&cuda_count);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %d, %s\n", (int)cudaStatus, cudaGetErrorString(cudaStatus));
        return 1;
    }
    fprintf(stderr, "Qty of CUDA GPU installed = %d\n", cuda_count);
    for (int device = 0; device < cuda_count; device++) {
        cudaStatus = cudaGetDeviceProperties(&prop, device);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties failed for device %d: %d, %s\n",
                device, (int)cudaStatus, cudaGetErrorString(cudaStatus));
            return 1;
        }
        fprintf(stderr, "Device number %d\n", device);
        fprintf(stderr, "Device name %s\n", prop.name);
        fprintf(stderr, "Compute %d.%d\n", prop.major, prop.minor);
        fprintf(stderr, "Device total memory %zuMB\n", prop.totalGlobalMem/(1024*1024));
    }

    // Add vectors in parallel.
    cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// Computes c[i] = a[i] + b[i] for i in [0, size) on GPU device 1, using
// temporary device buffers that are freed before returning.
//
//   c    - host output array with room for `size` ints
//   a, b - host input arrays of `size` ints each
//   size - number of elements; it is also the thread count of the single
//          block that is launched
//
// Returns cudaSuccess, or the status of the first CUDA call that failed (a
// message is written to stderr as well). A size of 0 returns cudaSuccess
// without making any CUDA call.
//
// Note: apart from the kernel launch check, the messages do not include the
// CUDA error text; pass the returned status to cudaGetErrorString() to get it.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    // Nothing to add. Launching a block of zero threads would be rejected as
    // an invalid configuration, so treat the empty case as a successful no-op.
    if (size == 0) {
        return cudaSuccess;
    }

    // All locals are declared before the first goto so that no jump to the
    // Error label crosses an initialization.
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    const size_t bytes = size * sizeof(int);

    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(1);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed for dev_c !\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed for dev_a !\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed for dev_b !\n");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed host a -> GPU !\n");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed host b -> GPU !\n");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed GPU c -> host !\n");
        goto Error;
    }

    // Reached on success as well as on failure: release whatever was
    // allocated (pointers that were never allocated are still null).
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

When compiled and executed, outputs :-

Qty of CUDA GPU installed = 2
Device number 0
Device name NVIDIA GeForce RTX 3050
Compute 8.6
Device total memory 8191MB
Device number 1
Device name Quadro P400
Compute 6.1
Device total memory 2047MB
cudaMalloc failed for dev_c !
addWithCuda failed!

Since my first post, I have downloaded the example CUDA samples from GitHub and they work OK.

:?

There is nothing obviously wrong with that code that I can see.

What sort of output do you get if you modify this line:

    fprintf(stderr, "cudaMalloc failed for dev_c !\n");

to this:

    fprintf(stderr, "cudaMalloc failed for dev_c: %d, %s\n", (int)cudaStatus, cudaGetErrorString(cudaStatus));

With that change, the output is:

    cudaMalloc failed for dev_c ! 222, the provided PTX was compiled with an unsupported toolchain.

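Error 222 is cudaErrorUnsupportedPtxVersion: the PTX generated by your CUDA toolkit is newer than what the installed GPU driver is able to compile. So you have a mismatch between the GPU driver you have installed and the CUDA toolkit you are trying to use. The usual suggestion here is to update to the latest driver available.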

Updated the GPU driver from 516.94 to 526.86 and all is well.
Many thanks,
Bob.

Qty of CUDA GPU installed = 2
Device number 0
Device name NVIDIA GeForce RTX 3050
Compute 8.6
Device total memory 8191MB
Device number 1
Device name Quadro P400
Compute 6.1
Device total memory 2047MB
{1,2,3,4,5} + {10,20,30,40,50} = {11,22,33,44,55}