Thanks for the hint that I can simply import multiple files. I hadn't thought of that.
As a reduced version of the code, the following complete source file nvproferr.cu, glued together from what I posted above,
#include <stdint.h>
#include <stdio.h>
#define LIMBS 4 /* as an example */
typedef uint32_t mp_limb;
typedef mp_limb mp_t[LIMBS];
#define ASM asm __volatile__
#define __ASM_SIZE "32"
#define __ASM_CONSTRAINT "r"
#define __sub_cc(r, a, b) ASM ("sub.cc.u" __ASM_SIZE " %0, %1, %2;": "=" __ASM_CONSTRAINT (r): __ASM_CONSTRAINT (a), __ASM_CONSTRAINT (b))
#define __subc_cc(r, a, b) ASM ("subc.cc.u" __ASM_SIZE " %0, %1, %2;": "=" __ASM_CONSTRAINT (r): __ASM_CONSTRAINT (a), __ASM_CONSTRAINT (b))
#define __addcy(carry) ASM ("addc.u" __ASM_SIZE " %0, 0, 0;": "=" __ASM_CONSTRAINT (carry))
__device__
mp_limb mp_sub(mp_t r, const mp_t a, const mp_t b) {
mp_limb carry = 0;
#ifdef __CUDA_ARCH__
__sub_cc(r[0], a[0], b[0]);
#pragma unroll
for (size_t i = 1; i < LIMBS; i++) {
__subc_cc(r[i], a[i], b[i]);
}
__addcy(carry);
#else
#pragma unroll
for (size_t i = 0; i < LIMBS; i++) {
r[i] = a[i] - b[i] - carry;
carry = r[i] > a[i];
}
#endif
return carry;
}
__global__
void cuda_mp_sub(mp_t r, const mp_t a, const mp_t b) {
mp_sub(r, a, b);
return;
}
int main(int arc, char *argv[]){
mp_t r;
mp_limb *dev_a, *dev_b, *dev_r;
cudaMalloc((void **) &dev_a, LIMBS * sizeof(mp_limb));
cudaMalloc((void **) &dev_b, LIMBS * sizeof(mp_limb));
cudaMalloc((void **) &dev_r, LIMBS * sizeof(mp_limb));
cuda_mp_sub<<< 1, 1 >>> (dev_r, dev_a, dev_b);
cudaMemcpy(r, dev_r, LIMBS * sizeof(mp_limb), cudaMemcpyDeviceToHost);
for(int i = 0; i < LIMBS; i++)
printf(" 0x%x", r[i]);
printf("\n");
return 0;
}
reproduces the problem for me, as can be seen from the following session:
$ nvcc ./nvproferr.cu -o ./nvproferr
$ ./nvproferr
0x0 0x0 0x0 0x0
$ cuda-memcheck ./nvproferr
========= CUDA-MEMCHECK
0x0 0x0 0x0 0x0
========= ERROR SUMMARY: 0 errors
$ nvprof ./nvproferr
==20529== NVPROF is profiling process 20529, command: ./nvproferr
0x0 0x0 0x0 0x0
==20529== Profiling application: ./nvproferr
==20529== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 77.46% 4.2880us 1 4.2880us 4.2880us 4.2880us cuda_mp_sub(unsigned int*, unsigned int const *, unsigned int const *)
22.54% 1.2480us 1 1.2480us 1.2480us 1.2480us [CUDA memcpy DtoH]
API calls: 99.68% 204.51ms 3 68.169ms 6.1100us 204.49ms cudaMalloc
0.16% 334.11us 94 3.5540us 359ns 125.91us cuDeviceGetAttribute
0.11% 225.90us 1 225.90us 225.90us 225.90us cuDeviceTotalMem
0.02% 31.336us 1 31.336us 31.336us 31.336us cuDeviceGetName
0.01% 30.009us 1 30.009us 30.009us 30.009us cudaLaunch
0.01% 22.073us 1 22.073us 22.073us 22.073us cudaMemcpy
0.00% 2.9890us 3 996ns 351ns 1.8310us cuDeviceGetCount
0.00% 1.3630us 2 681ns 457ns 906ns cuDeviceGet
0.00% 1.1160us 3 372ns 169ns 614ns cudaSetupArgument
0.00% 840ns 1 840ns 840ns 840ns cudaConfigureCall
$ nvprof --metrics all ./nvproferr
==20554== NVPROF is profiling process 20554, command: ./nvproferr
==20554== Error: 0x0 0x0 0x0 0x0
Internal profiling error 4277:5.
======== Error: CUDA profiling error.
Can you also reproduce this on a Tesla P100?