nvprof: Internal profiling error 4277:5 on Tesla P100, but not on GTX 1070

Thanks for the hint that i can simply import multiple files. Haven’t thought of that.

As a reduced version of the code, gluing together what i posted above to a complete source file nvproferr.cu

#include <stdint.h>
#include <stdio.h>

#define LIMBS 4 /* as an example */

typedef uint32_t mp_limb;
typedef mp_limb mp_t[LIMBS];

#define ASM asm __volatile__
#define __ASM_SIZE "32"
#define __ASM_CONSTRAINT "r"

#define __sub_cc(r, a, b) ASM ("sub.cc.u" __ASM_SIZE " %0, %1, %2;": "=" __ASM_CONSTRAINT (r): __ASM_CONSTRAINT (a), __ASM_CONSTRAINT (b))
#define __subc_cc(r, a, b) ASM ("subc.cc.u" __ASM_SIZE " %0, %1, %2;": "=" __ASM_CONSTRAINT (r): __ASM_CONSTRAINT (a), __ASM_CONSTRAINT (b))

#define __addcy(carry) ASM ("addc.u" __ASM_SIZE " %0, 0, 0;": "=" __ASM_CONSTRAINT (carry))

__device__
mp_limb mp_sub(mp_t r, const mp_t a, const mp_t b) {
        mp_limb carry = 0;
#ifdef __CUDA_ARCH__
        __sub_cc(r[0], a[0], b[0]);
#pragma unroll
        for (size_t i = 1; i < LIMBS; i++) {
                __subc_cc(r[i], a[i], b[i]);
        }

        __addcy(carry);
#else
#pragma unroll
        for (size_t i = 0; i < LIMBS; i++) {
                r[i] = a[i] - b[i] - carry;
                carry = r[i] > a[i];
        }
#endif
        return carry;
}

__global__
void cuda_mp_sub(mp_t r, const mp_t a, const mp_t b) {
        mp_sub(r, a, b);
        return;
}

int main(int arc, char *argv[]){
    mp_t r;
    mp_limb *dev_a, *dev_b, *dev_r;
    cudaMalloc((void **) &dev_a, LIMBS * sizeof(mp_limb));
    cudaMalloc((void **) &dev_b, LIMBS * sizeof(mp_limb));
    cudaMalloc((void **) &dev_r, LIMBS * sizeof(mp_limb));

    cuda_mp_sub<<< 1, 1 >>> (dev_r, dev_a, dev_b);

    cudaMemcpy(r, dev_r, LIMBS * sizeof(mp_limb), cudaMemcpyDeviceToHost);

    for(int i = 0; i < LIMBS; i++)
	    printf(" 0x%x", r[i]);
    printf("\n");

    return 0;
}

reproduces the problem for me, as can be seen from the following session

$ nvcc ./nvproferr.cu -o ./nvproferr

$ ./nvproferr 
 0x0 0x0 0x0 0x0

$ cuda-memcheck ./nvproferr
========= CUDA-MEMCHECK
 0x0 0x0 0x0 0x0
========= ERROR SUMMARY: 0 errors

$ nvprof ./nvproferr
==20529== NVPROF is profiling process 20529, command: ./nvproferr
 0x0 0x0 0x0 0x0
==20529== Profiling application: ./nvproferr
==20529== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   77.46%  4.2880us         1  4.2880us  4.2880us  4.2880us  cuda_mp_sub(unsigned int*, unsigned int const *, unsigned int const *)
                   22.54%  1.2480us         1  1.2480us  1.2480us  1.2480us  [CUDA memcpy DtoH]
      API calls:   99.68%  204.51ms         3  68.169ms  6.1100us  204.49ms  cudaMalloc
                    0.16%  334.11us        94  3.5540us     359ns  125.91us  cuDeviceGetAttribute
                    0.11%  225.90us         1  225.90us  225.90us  225.90us  cuDeviceTotalMem
                    0.02%  31.336us         1  31.336us  31.336us  31.336us  cuDeviceGetName
                    0.01%  30.009us         1  30.009us  30.009us  30.009us  cudaLaunch
                    0.01%  22.073us         1  22.073us  22.073us  22.073us  cudaMemcpy
                    0.00%  2.9890us         3     996ns     351ns  1.8310us  cuDeviceGetCount
                    0.00%  1.3630us         2     681ns     457ns     906ns  cuDeviceGet
                    0.00%  1.1160us         3     372ns     169ns     614ns  cudaSetupArgument
                    0.00%     840ns         1     840ns     840ns     840ns  cudaConfigureCall

$ nvprof --metrics all ./nvproferr
==20554== NVPROF is profiling process 20554, command: ./nvproferr
==20554== Error:  0x0 0x0 0x0 0x0
Internal profiling error 4277:5.
======== Error: CUDA profiling error.

Can you also reproduce this on a Tesla P100?