CUDA and OpenACC cudaLaunchKernel returned status 98: invalid device function

Hi,

I’m trying to run a slightly modified version of the conjugateGradient example from the CUDA samples directory. I replaced the cusparseScsrmv call with an OpenACC "acc parallel loop" kernel. When I execute the program, I get the error message below. What is the reason for this error?
I’m using the PGI compiler 19.10 and the CUDA toolkit 10.2.89.

Thank you for your help

GPU Device 0: "Tesla V100-SXM2-16GB" with compute capability 7.0
> GPU device has 80 Multi-Processors, SM 7.0 compute capabilities
cudaLaunchKernel returned status 98: invalid device function

Makefile

PROG=conjugateGradient
CC=pgc++
CCU=nvcc -ccbin=${CC}

OBJS=main.o
OPTS=-ta=tesla:cc70 -acc -Minfo=accel -Minfo
INCLUDES=-I../common/inc
LIBS=-Mcudalib=cublas,cusparse

%.o: %.cu
	${CCU} -arch=compute_70 -code=sm_70 -rdc=true -Xcompiler "${OPTS}" ${INCLUDES} -c $<
all:${PROG}
${PROG}:${OBJS}
	${CC} -Mcuda ${OPTS} ${LIBS}  -o $@ ${OBJS}

clean:
	rm -f conjugateGradient main.o

main.cu

/*
 * This sample implements a conjugate gradient solver on GPU
 * using CUBLAS and CUSPARSE
 *
 */

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>   // for sqrt() and fabs() used below

/* Using updated (v2) interfaces to cublas */
#include <cuda_runtime.h>
#include <cusparse.h>
#include <cublas_v2.h>

// Utilities and system includes
#include <helper_functions.h>  // helper for shared functions common to CUDA Samples
#include <helper_cuda.h>       // helper function CUDA error checking and initialization

/* Select the SpMV path at compile time: 1 = cuSPARSE, 0 = OpenACC kernel.
   Defined here so the code also builds without -DCUDA_SPMV on the command line. */
#ifndef CUDA_SPMV
#define CUDA_SPMV 0
#endif

const char *sSDKname     = "conjugateGradient";

/* genTridiag: generate a random tridiagonal symmetric matrix */
void genTridiag(int *I, int *J, float *val, int N, int nz)
{
    I[0] = 0, J[0] = 0, J[1] = 1;
    val[0] = (float)rand()/RAND_MAX + 10.0f;
    val[1] = (float)rand()/RAND_MAX;
    int start;

    for (int i = 1; i < N; i++)
    {
        if (i > 1)
        {
            I[i] = I[i-1]+3;
        }
        else
        {
            I[1] = 2;
        }

        start = (i-1)*3 + 2;
        J[start] = i - 1;
        J[start+1] = i;

        if (i < N-1)
        {
            J[start+2] = i + 1;
        }

        val[start] = val[start-1];
        val[start+1] = (float)rand()/RAND_MAX + 10.0f;

        if (i < N-1)
        {
            val[start+2] = (float)rand()/RAND_MAX;
        }
    }

    I[N] = nz;
}


int main(int argc, char **argv)
{
    int M = 0, N = 0, nz = 0, *I = NULL, *J = NULL;
    float *val = NULL;
    const float tol = 1e-5f;
    const int max_iter = 10000;
    float *x;
    float *rhs;
    float a, b, na, r0, r1;
    int *d_col, *d_row;
    float *d_val, *d_x, dot;
    float *d_r, *d_p, *d_Ax;
    int k;
    float alpha, beta, alpham1;

    // This will pick the best possible CUDA capable device
    cudaDeviceProp deviceProp;
    int devID = findCudaDevice(argc, (const char **)argv);

    if (devID < 0)
    {
        printf("exiting...\n");
        exit(EXIT_SUCCESS);
    }

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    /* Generate a random tridiagonal symmetric matrix in CSR format */
    M = N = 1048576;
    nz = (N-2)*3 + 4;
    I = (int *)malloc(sizeof(int)*(N+1));
    J = (int *)malloc(sizeof(int)*nz);
    val = (float *)malloc(sizeof(float)*nz);
    genTridiag(I, J, val, N, nz);

    x = (float *)malloc(sizeof(float)*N);
    rhs = (float *)malloc(sizeof(float)*N);

    for (int i = 0; i < N; i++)
    {
        rhs[i] = 1.0;
        x[i] = 0.0;
    }

    /* Get handle to the CUBLAS context */
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);

    checkCudaErrors(cublasStatus);

    /* Get handle to the CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);

    checkCudaErrors(cusparseStatus);

    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);

    checkCudaErrors(cusparseStatus);

    cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);

    checkCudaErrors(cudaMalloc((void **)&d_col, nz*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_row, (N+1)*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_val, nz*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_x, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_r, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_p, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_Ax, N*sizeof(float)));

    cudaMemcpy(d_col, J, nz*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_row, I, (N+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_val, val, nz*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_r, rhs, N*sizeof(float), cudaMemcpyHostToDevice);

    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.;

    cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax);

    cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1);
    cublasStatus = cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);

    k = 1;
    
    cudaEvent_t start;
    cudaEvent_t end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    float millis = 0.0;

    while (r1 > tol*tol && k <= max_iter)
    {
        if (k > 1)
        {
            b = r1 / r0;
            cublasStatus = cublasSscal(cublasHandle, N, &b, d_p, 1);
            cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1);
        }
        else
        {
            cublasStatus = cublasScopy(cublasHandle, N, d_r, 1, d_p, 1);
        }

        cudaEventRecord(start);
        if(CUDA_SPMV){
           // Reference path: cuSPARSE CSR SpMV (d_Ax = alpha*A*d_p + beta*d_Ax)
           cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                          N, N, nz, &alpha, descr, d_val,
                          d_row, d_col, d_p, &beta, d_Ax);
        }else{
           // OpenACC replacement: one parallel iteration per CSR row;
           // deviceptr() tells the runtime these are already device pointers.
           #pragma acc parallel loop deviceptr(d_val, d_row, d_col, d_p, d_Ax)
           for(int i=0;i<N;i++)
           {
              double dot = 0.0;
              int row_start = d_row[i];
              int row_end   = d_row[i+1];
              #pragma acc loop seq
              for(int elem = row_start; elem < row_end; elem++)
              {
                 dot += d_val[elem] * d_p[d_col[elem]];
              }
              d_Ax[i] = alpha*dot + beta*d_Ax[i];
           }
        }
        cudaEventRecord(end);
        cudaEventSynchronize(end);
        cudaEventElapsedTime(&millis, start, end);

        cublasStatus = cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, &dot);
        a = r1 / dot;

        cublasStatus = cublasSaxpy(cublasHandle, N, &a, d_p, 1, d_x, 1);
        na = -a;
        cublasStatus = cublasSaxpy(cublasHandle, N, &na, d_Ax, 1, d_r, 1);

        r0 = r1;
        cublasStatus = cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);
        cudaDeviceSynchronize();
        printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
        k++;
    }
    
    cudaMemcpy(x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost);

    float rsum, diff, err = 0.0;

    for (int i = 0; i < N; i++)
    {
        rsum = 0.0;

        for (int j = I[i]; j < I[i+1]; j++)
        {
            rsum += val[j]*x[J[j]];
        }

        diff = fabs(rsum - rhs[i]);

        if (diff > err)
        {
            err = diff;
        }
    }

    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);

    free(I);
    free(J);
    free(val);
    free(x);
    free(rhs);
    cudaFree(d_col);
    cudaFree(d_row);
    cudaFree(d_val);
    cudaFree(d_x);
    cudaFree(d_r);
    cudaFree(d_p);
    cudaFree(d_Ax);

    printf("Test Summary:  Error amount = %f\n", err);
    if (CUDA_SPMV)
        printf("CUDA-SpMV time: %f\n", millis*1000);   // cudaEventElapsedTime returns ms; *1000 prints microseconds
    else
        printf("SpMV time: %f\n", millis*1000);
    exit((k <= max_iter) ? 0 : 1);
    
}

Hi Peter,

Try changing the nvcc flag “-rdc=true” to “-rdc=false”.
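
With that change, the pattern rule from your Makefile becomes (everything else unchanged):

%.o: %.cu
	${CCU} -arch=compute_70 -code=sm_70 -rdc=false -Xcompiler "${OPTS}" ${INCLUDES} -c $<

Here's the build and run on my side: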

% make
nvcc -ccbin=pgc++ -w -arch=compute_70 -code=sm_70 -rdc=false -Xcompiler "-ta=tesla:cc70 -ta=tesla:cuda10.1  -acc -Minfo=accel -Minfo -w -DCUDA_SPMV=0 -I../common/inc -I/opt/cuda-10.1/samples/common/inc/ " -I../common/inc -I/opt/cuda-10.1/samples/common/inc/  -c main.cu
main:
      1, include "main.cu"
         106, FMA (fused multiply-add) instruction(s) generated
      1, include "main.cu"
         184, Generating Tesla code
             186, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             192, #pragma acc loop seq
pgc++ -Mcuda -ta=tesla:cc70 -ta=tesla:cuda10.1  -acc -Minfo=accel -Minfo -w -DCUDA_SPMV=0 -Mcudalib=cublas,cusparse  -o conjugateGradient main.o
% ./conjugateGradient
GPU Device 0: "Tesla V100-PCIE-16GB" with compute capability 7.0

> GPU device has 80 Multi-Processors, SM 7.0 compute capabilities

iteration =   1, residual = 4.449882e+01
iteration =   2, residual = 3.245218e+00
iteration =   3, residual = 2.690220e-01
iteration =   4, residual = 2.307639e-02
iteration =   5, residual = 1.993140e-03
iteration =   6, residual = 1.846192e-04
iteration =   7, residual = 1.693379e-05
iteration =   8, residual = 1.600115e-06
Test Summary:  Error amount = 0.000000
SpMV time: 68.832001

-Mat

Thank you very much! It worked!
One question regarding rdc.
I thought the PGI compiler by default generates relocatable device code (rdc=true) and NVCC by default generates non-relocatable device code (rdc=false).
Why was my initial makefile incorrect? (PGI rdc=true, NVCC rdc=true)

I thought the PGI compiler by default generates relocatable device code (rdc=true) and NVCC by default generates non-relocatable device code (rdc=false).

Correct.
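
For reference, a sketch of the flags that control RDC on each side (the nordc suboption spelling is from the PGI docs; check pgc++ -help on your version):

# nvcc: non-relocatable device code is the default
nvcc -rdc=false ...               # default
nvcc -rdc=true  ...               # enable RDC

# pgc++ OpenACC: relocatable device code is the default
pgc++ -ta=tesla:cc70 ...          # RDC on (default)
pgc++ -ta=tesla:cc70,nordc ...    # RDC off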

Why was my initial makefile incorrect? (PGI rdc=true, NVCC rdc=true)

I believe the problem is that neither side has visibility into the kernels the other side creates. When RDC is enabled, extra initialization code needs to be generated to register the kernels with the CUDA runtime. So with RDC=true on both sides, the side that's doing the linking doesn't know it needs to register the kernels from the other side.

With RDC=false, the CUDA JIT compiler recompiles the PTX code at runtime and does the registration.

So you can have both sides set to false, or set true only on the side that's doing the linking, but both can't be true.
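
In Makefile terms, a sketch of the two combinations that should work for your layout (pgc++ doing the final link):

# Option 1: RDC off on both sides
nvcc  -ccbin=pgc++ -rdc=false ... -c main.cu
pgc++ -ta=tesla:cc70,nordc -Mcuda ... -o conjugateGradient main.o

# Option 2: RDC only on the linking side -- this is what the fix above does,
# since pgc++ defaults to RDC on
nvcc  -ccbin=pgc++ -rdc=false ... -c main.cu
pgc++ -ta=tesla:cc70 -Mcuda ... -o conjugateGradient main.o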

I’ll ask our GPU team at our next meeting whether we can do anything about registering nvcc-compiled kernels so RDC could be enabled on both sides.

-Mat

Thank you for the information.