CUDA and OpenACC cudaLaunchKernel returned status 98: invalid device function

Hi,

I’m trying to run a slightly modified version of the conjugateGradient example from the CUDA samples directory. I replaced the cusparseScsrmv call with an OpenACC "acc parallel loop" kernel. When I execute the program, I get the error message below. What is the reason for this error?
I’m using the PGI compiler 19.10 and the CUDA toolkit 10.2.89.

Thank you for your help

GPU Device 0: "Tesla V100-SXM2-16GB" with compute capability 7.0
> GPU device has 80 Multi-Processors, SM 7.0 compute capabilities
cudaLaunchKernel returned status 98: invalid device function

Makefile

PROG=conjugateGradient
CC=pgc++
CCU=nvcc -ccbin=${CC}

OBJS=main.o
OPTS=-ta=tesla:cc70 -acc -Minfo=accel -Minfo
INCLUDES=-I../common/inc
LIBS=-Mcudalib=cublas,cusparse

%.o: %.cu
	${CCU} -arch=compute_70 -code=sm_70 -rdc=true -Xcompiler "${OPTS}" ${INCLUDES} -c $<
all:${PROG}
${PROG}:${OBJS}
	${CC} -Mcuda ${OPTS} ${LIBS}  -o $@ ${OBJS}

clean:
	rm -f conjugateGradient main.o

main.cu

/*
 * This sample implements a conjugate gradient solver on GPU
 * using CUBLAS and CUSPARSE
 *
 */

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>   // for sqrt() and fabs() used below

/* Using updated (v2) interfaces to cublas */
#include <cuda_runtime.h>
#include <cusparse.h>
#include <cublas_v2.h>

// Utilities and system includes
#include <helper_functions.h>  // helper for shared functions common to CUDA Samples
#include <helper_cuda.h>       // helper function CUDA error checking and initialization

/* Select the SpMV path at compile time: 1 = cuSPARSE, 0 = OpenACC kernel.
   Defined here so the code also builds without -DCUDA_SPMV on the command line. */
#ifndef CUDA_SPMV
#define CUDA_SPMV 0
#endif

const char *sSDKname     = "conjugateGradient";

/* genTridiag: generate a random tridiagonal symmetric matrix */
void genTridiag(int *I, int *J, float *val, int N, int nz)
{
    I[0] = 0, J[0] = 0, J[1] = 1;
    val[0] = (float)rand()/RAND_MAX + 10.0f;
    val[1] = (float)rand()/RAND_MAX;
    int start;

    for (int i = 1; i < N; i++)
    {
        if (i > 1)
        {
            I[i] = I[i-1]+3;
        }
        else
        {
            I[1] = 2;
        }

        start = (i-1)*3 + 2;
        J[start] = i - 1;
        J[start+1] = i;

        if (i < N-1)
        {
            J[start+2] = i + 1;
        }

        val[start] = val[start-1];
        val[start+1] = (float)rand()/RAND_MAX + 10.0f;

        if (i < N-1)
        {
            val[start+2] = (float)rand()/RAND_MAX;
        }
    }

    I[N] = nz;
}


int main(int argc, char **argv)
{
    int M = 0, N = 0, nz = 0, *I = NULL, *J = NULL;
    float *val = NULL;
    const float tol = 1e-5f;
    const int max_iter = 10000;
    float *x;
    float *rhs;
    float a, b, na, r0, r1;
    int *d_col, *d_row;
    float *d_val, *d_x, dot;
    float *d_r, *d_p, *d_Ax;
    int k;
    float alpha, beta, alpham1;

    // This will pick the best possible CUDA capable device
    cudaDeviceProp deviceProp;
    int devID = findCudaDevice(argc, (const char **)argv);

    if (devID < 0)
    {
        printf("exiting...\n");
        exit(EXIT_SUCCESS);
    }

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    /* Generate a random tridiagonal symmetric matrix in CSR format */
    M = N = 1048576;
    nz = (N-2)*3 + 4;
    I = (int *)malloc(sizeof(int)*(N+1));
    J = (int *)malloc(sizeof(int)*nz);
    val = (float *)malloc(sizeof(float)*nz);
    genTridiag(I, J, val, N, nz);

    x = (float *)malloc(sizeof(float)*N);
    rhs = (float *)malloc(sizeof(float)*N);

    for (int i = 0; i < N; i++)
    {
        rhs[i] = 1.0;
        x[i] = 0.0;
    }

    /* Get handle to the CUBLAS context */
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);

    checkCudaErrors(cublasStatus);

    /* Get handle to the CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);

    checkCudaErrors(cusparseStatus);

    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);

    checkCudaErrors(cusparseStatus);

    cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);

    checkCudaErrors(cudaMalloc((void **)&d_col, nz*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_row, (N+1)*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_val, nz*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_x, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_r, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_p, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_Ax, N*sizeof(float)));

    cudaMemcpy(d_col, J, nz*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_row, I, (N+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_val, val, nz*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_r, rhs, N*sizeof(float), cudaMemcpyHostToDevice);

    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.;

    cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax);

    cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1);
    cublasStatus = cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);

    k = 1;
    
    cudaEvent_t start;
    cudaEvent_t end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    float millis = 0.0;

    while (r1 > tol*tol && k <= max_iter)
    {
        if (k > 1)
        {
            b = r1 / r0;
            cublasStatus = cublasSscal(cublasHandle, N, &b, d_p, 1);
            cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1);
        }
        else
        {
            cublasStatus = cublasScopy(cublasHandle, N, d_r, 1, d_p, 1);
        }

        cudaEventRecord(start);
        if(CUDA_SPMV){
           // Reference path: cuSPARSE CSR SpMV (d_Ax = alpha*A*d_p + beta*d_Ax)
           cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                          N, N, nz, &alpha, descr, d_val,
                          d_row, d_col, d_p, &beta, d_Ax);
        }else{
           // OpenACC replacement: one parallel iteration per CSR row;
           // deviceptr() tells the runtime these are already device pointers.
           #pragma acc parallel loop deviceptr(d_val, d_row, d_col, d_p, d_Ax)
           for(int i=0;i<N;i++)
           {
              double dot = 0.0;
              int row_start = d_row[i];
              int row_end   = d_row[i+1];
              #pragma acc loop seq
              for(int elem = row_start; elem < row_end; elem++)
              {
                 dot += d_val[elem] * d_p[d_col[elem]];
              }
              d_Ax[i] = alpha*dot + beta*d_Ax[i];
           }
        }
        cudaEventRecord(end);
        cudaEventSynchronize(end);
        cudaEventElapsedTime(&millis, start, end);

        cublasStatus = cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, &dot);
        a = r1 / dot;

        cublasStatus = cublasSaxpy(cublasHandle, N, &a, d_p, 1, d_x, 1);
        na = -a;
        cublasStatus = cublasSaxpy(cublasHandle, N, &na, d_Ax, 1, d_r, 1);

        r0 = r1;
        cublasStatus = cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);
        cudaDeviceSynchronize();
        printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
        k++;
    }
    
    cudaMemcpy(x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost);

    float rsum, diff, err = 0.0;

    for (int i = 0; i < N; i++)
    {
        rsum = 0.0;

        for (int j = I[i]; j < I[i+1]; j++)
        {
            rsum += val[j]*x[J[j]];
        }

        diff = fabs(rsum - rhs[i]);

        if (diff > err)
        {
            err = diff;
        }
    }

    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);

    free(I);
    free(J);
    free(val);
    free(x);
    free(rhs);
    cudaFree(d_col);
    cudaFree(d_row);
    cudaFree(d_val);
    cudaFree(d_x);
    cudaFree(d_r);
    cudaFree(d_p);
    cudaFree(d_Ax);

    printf("Test Summary:  Error amount = %f\n", err);
    if (CUDA_SPMV)
        printf("CUDA-SpMV time: %f\n", millis*1000);   // cudaEventElapsedTime returns ms; *1000 prints microseconds
    else
        printf("SpMV time: %f\n", millis*1000);
    exit((k <= max_iter) ? 0 : 1);
    
}

Hi Peter,

Try changing the nvcc flag “-rdc=true” to “-rdc=false”.
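
With that change, the pattern rule from your Makefile becomes (everything else unchanged):

%.o: %.cu
	${CCU} -arch=compute_70 -code=sm_70 -rdc=false -Xcompiler "${OPTS}" ${INCLUDES} -c $<

Here's the build and run on my side: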

% make
nvcc -ccbin=pgc++ -w -arch=compute_70 -code=sm_70 -rdc=false -Xcompiler "-ta=tesla:cc70 -ta=tesla:cuda10.1  -acc -Minfo=accel -Minfo -w -DCUDA_SPMV=0 -I../common/inc -I/opt/cuda-10.1/samples/common/inc/ " -I../common/inc -I/opt/cuda-10.1/samples/common/inc/  -c main.cu
main:
      1, include "main.cu"
         106, FMA (fused multiply-add) instruction(s) generated
      1, include "main.cu"
         184, Generating Tesla code
             186, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
             192, #pragma acc loop seq
pgc++ -Mcuda -ta=tesla:cc70 -ta=tesla:cuda10.1  -acc -Minfo=accel -Minfo -w -DCUDA_SPMV=0 -Mcudalib=cublas,cusparse  -o conjugateGradient main.o
% ./conjugateGradient
GPU Device 0: "Tesla V100-PCIE-16GB" with compute capability 7.0

> GPU device has 80 Multi-Processors, SM 7.0 compute capabilities

iteration =   1, residual = 4.449882e+01
iteration =   2, residual = 3.245218e+00
iteration =   3, residual = 2.690220e-01
iteration =   4, residual = 2.307639e-02
iteration =   5, residual = 1.993140e-03
iteration =   6, residual = 1.846192e-04
iteration =   7, residual = 1.693379e-05
iteration =   8, residual = 1.600115e-06
Test Summary:  Error amount = 0.000000
SpMV time: 68.832001

-Mat

Thank you very much! It worked!
One question regarding rdc.
I thought the PGI compiler by default generates relocatable device code (rdc=true) and NVCC by default generates non-relocatable device code (rdc=false).
Why was my initial makefile incorrect? (PGI rdc=true, NVCC rdc=true)

I thought the PGI compiler by default generates relocatable device code (rdc=true) and NVCC by default generates non-relocatable device code (rdc=false).

Correct.
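
For reference, a sketch of the flags that control RDC on each side (the nordc suboption spelling is from the PGI docs; check pgc++ -help on your version):

# nvcc: non-relocatable device code is the default
nvcc -rdc=false ...               # default
nvcc -rdc=true  ...               # enable RDC

# pgc++ OpenACC: relocatable device code is the default
pgc++ -ta=tesla:cc70 ...          # RDC on (default)
pgc++ -ta=tesla:cc70,nordc ...    # RDC off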

Why was my initial makefile incorrect? (PGI rdc=true, NVCC rdc=true)

I believe the problem is that neither side has visibility into the kernels the other side creates. When RDC is enabled, extra initialization code needs to be generated to register the kernels with the CUDA runtime. So with RDC=true on both sides, the side that's doing the linking doesn't know it needs to register the kernels from the other side.

With RDC=false, the CUDA JIT compiler recompiles the PTX code at runtime and does the registration.

So you can have both sides set to false, or set true only on the side that's doing the linking, but both can't be true.
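
In Makefile terms, a sketch of the two combinations that should work for your layout (pgc++ doing the final link):

# Option 1: RDC off on both sides
nvcc  -ccbin=pgc++ -rdc=false ... -c main.cu
pgc++ -ta=tesla:cc70,nordc -Mcuda ... -o conjugateGradient main.o

# Option 2: RDC only on the linking side -- this is what the fix above does,
# since pgc++ defaults to RDC on
nvcc  -ccbin=pgc++ -rdc=false ... -c main.cu
pgc++ -ta=tesla:cc70 -Mcuda ... -o conjugateGradient main.o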

I’ll ask our GPU team at our next meeting whether we can do anything about registering nvcc-compiled kernels so RDC could be enabled on both sides.

-Mat

Thank you for the information.