Does nvprof support cudaTextureObject_t?

My CUDA code hangs when a lookup table is passed to the kernel through a cudaTextureObject_t and the program is profiled with nvprof. When I printf the contents of the lookup table inside the kernel, every entry shows zero except the first one. Is there any way to fix this, or is there another profiling tool I could use for my CUDA code? Thanks.

Yes, nvprof works fine with that.

Debug your code or provide a short, self-contained reproducer.

My CUDA code is shown below. It was run on a TITAN V and compiled with nvcc run.cu -G -g -arch=sm_70 -rdc=true -lcudadevrt -lcuda -lcudart -o run

After compiling, running without nvprof (./run) gives:
daRow0s = 11057008
daRow1s = 11057200
DAC[0] = 0xa8b770n
DAC[1] = 0xa8b830n
dacsPitch = 48

das0=0.000000
das1=334.276978
das2=1002.830933
das3=1671.385010
das4=2339.938965
das5=3008.492920

With nvprof (nvprof ./run) the result becomes:
daRow0s = 35388704
daRow1s = 35326672
DAC[0] = 0x21bfd20n
DAC[1] = 0x21b0ad0n
dacsPitch = -15508

das0=0.000000
das1=0.000000
das2=0.000000
das3=0.000000
das4=0.000000
das5=0.000000

This happens even though I set dacsPitch to 48 manually in the code.

Thank you very much to anyone who can help.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>

#define REAL float

#define DACS_ENTRIES 44
#define E_ENTRIES 81

#define ZERO 1.0e-20
#define SZERO 1.0e-6

#define NPART 131072 //	number of particles simulated simultaneously
#define DACS_OFFSET 43
#define BINDING_ITEMS 5

#define ODDS 100
#define INTERACTION_TYPES 12
#define MCC 510998.9461 // rest mass energy ~511keV
#define TWOMCC 1021997.8922 //2*MC^2
#define M2C4 261119922915.31070521 //M^2C^4
#define PI 3.14159265359
#define C 299792458

#define GPUMEM_BASE_SIZE 300 // memory size in MB estimated for base needed
#define GPUMEM_PER_INC 4 // memory size in MB estimated per added particle

#define INC_2NDPARTICLES_100KEV 2500
#define INC_RADICALS_100KEV 12000

#define K_THREADS 128

typedef struct
{ 
    REAL x, y, z;
    REAL ux, uy, uz;
    REAL e;
	int h2oState;
    int dead;
    REAL path;
	REAL elape_time;
	int id;
	int parentId;
} eStruct;

__global__ 
void run_kernel(int N, cudaTextureObject_t DACSTable)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < N)
	{
        float das0, das1, das2, das3, das4, das5;
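        // Note: with unnormalized coordinates and cudaFilterModeLinear, texel centers sit at
        // half-integer coordinates, so y = 1.5 reads row 1 and an integer x coordinate
        // interpolates between two adjacent entries of that row.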
        das0 = tex2D<REAL>(DACSTable, 0, 1.5);
        printf(" das0=%f\n", das0);
        das1 = tex2D<REAL>(DACSTable, 1, 1.5);
        printf(" das1=%f\n", das1);
        das2 = tex2D<REAL>(DACSTable, 2, 1.5);
        printf(" das2=%f\n", das2);
        das3 = tex2D<REAL>(DACSTable, 3, 1.5);
        printf(" das3=%f\n", das3);
        das4 = tex2D<REAL>(DACSTable, 4, 1.5);
        printf(" das4=%f\n", das4);
        das5 = tex2D<REAL>(DACSTable, 5, 1.5);
        printf(" das5=%f\n", das5);
        asm("trap;");  
        
	}
}

void iniElectron(float *max_e, eStruct *eQueue, int *id)
{
    eQueue[0].x = 0;
    eQueue[0].y = 0;
    eQueue[0].z = 5.5e-4;        
    eQueue[0].ux = 3.673770e-01;
    eQueue[0].uy = 4.772163e-01;
    eQueue[0].uz = 7.983099e-01;        
    eQueue[0].e = 50000;
    eQueue[0].h2oState = 99;
    eQueue[0].dead = 0;
    eQueue[0].path = 0;		
    eQueue[0].elape_time = 0.0;		
    eQueue[0].id = *id;		
    eQueue[0].parentId = 0;
}

void rd_dacs(REAL **DACSTable)
{
    DACSTable[0][0] = 4.300000 ;
    DACSTable[0][1] = 4.500000 ;
    DACSTable[0][2] = 4.700000 ;
    DACSTable[0][3] = 4.900000 ;
    DACSTable[0][4] = 5.100000 ;
    DACSTable[0][5] = 5.300000 ;
    DACSTable[0][6] = 5.500000 ;
    DACSTable[0][7] = 5.700000 ;
    DACSTable[0][8] = 5.900000 ;
    DACSTable[0][9] = 6.100000 ;
    DACSTable[0][10] = 6.300000 ;
    DACSTable[0][11] = 6.500000 ;
    DACSTable[0][12] = 6.700000 ;
    DACSTable[0][13] = 6.900000 ;
    DACSTable[0][14] = 7.100000 ;
    DACSTable[0][15] = 7.300000 ;
    DACSTable[0][16] = 7.500000 ;
    DACSTable[0][17] = 7.700000 ;
    DACSTable[0][18] = 7.900000 ;
    DACSTable[0][19] = 8.100000 ;
    DACSTable[0][20] = 8.300000 ;
    DACSTable[0][21] = 8.500000 ;
    DACSTable[0][22] = 8.700000 ;
    DACSTable[0][23] = 8.900000 ;
    DACSTable[0][24] = 9.100000 ;
    DACSTable[0][25] = 9.300000 ;
    DACSTable[0][26] = 9.500000 ;
    DACSTable[0][27] = 9.700000 ;
    DACSTable[0][28] = 9.900000 ;
    DACSTable[0][29] = 10.100000;
    DACSTable[0][30] = 10.300000;
    DACSTable[0][31] = 10.500000;
    DACSTable[0][32] = 10.700000;
    DACSTable[0][33] = 10.900000;
    DACSTable[0][34] = 11.100000;
    DACSTable[0][35] = 11.300000;
    DACSTable[0][36] = 11.500000;
    DACSTable[0][37] = 11.700000;
    DACSTable[0][38] = 11.900000;
    DACSTable[0][39] = 12.100000;
    DACSTable[0][40] = 12.300000;
    DACSTable[0][41] = 12.500000;
    DACSTable[0][42] = 12.700000;
    DACSTable[0][43] = 12.900000;

    DACSTable[1][0] = 0.000000      ;
    DACSTable[1][1] = 668.553983    ;
    DACSTable[1][2] = 1337.107966   ;
    DACSTable[1][3] = 2005.661948   ;
    DACSTable[1][4] = 2674.215931   ;
    DACSTable[1][5] = 3342.769914   ;
    DACSTable[1][6] = 5014.154871   ;
    DACSTable[1][7] = 6685.539828   ;
    DACSTable[1][8] = 26742.159312  ;
    DACSTable[1][9] = 93597.557591  ;
    DACSTable[1][10] = 203908.964752 ;
    DACSTable[1][11] = 213937.274493 ;
    DACSTable[1][12] = 183852.345268 ;
    DACSTable[1][13] = 127025.256731 ;
    DACSTable[1][14] = 96940.327505  ;
    DACSTable[1][15] = 63512.628365  ;
    DACSTable[1][16] = 30084.929226  ;
    DACSTable[1][17] = 26742.159312  ;
    DACSTable[1][18] = 30084.929226  ;
    DACSTable[1][19] = 43456.008881  ;
    DACSTable[1][20] = 46798.778795  ;
    DACSTable[1][21] = 33427.699140  ;
    DACSTable[1][22] = 30084.929226  ;
    DACSTable[1][23] = 26742.159312  ;
    DACSTable[1][24] = 20056.619484  ;
    DACSTable[1][25] = 16713.849570  ;
    DACSTable[1][26] = 10028.309742  ;
    DACSTable[1][27] = 6685.539828   ;
    DACSTable[1][28] = 3342.769914   ;
    DACSTable[1][29] =  3342.769914  ;
    DACSTable[1][30] =  5014.154871  ;
    DACSTable[1][31] =  6685.539828  ;
    DACSTable[1][32] =  10028.309742 ;
    DACSTable[1][33] =  13371.079656 ;
    DACSTable[1][34] =  16713.849570 ;
    DACSTable[1][35] =  20056.619484 ;
    DACSTable[1][36] =  16713.849570 ;
    DACSTable[1][37] =  10028.309742 ;
    DACSTable[1][38] =  6685.539828  ;
    DACSTable[1][39] =  3342.769914  ;
    DACSTable[1][40] =  2339.938940  ;
    DACSTable[1][41] =  1337.107966  ;
    DACSTable[1][42] =  334.276991   ;
    DACSTable[1][43] =  0.000000     ;    
}

int getSPcores(cudaDeviceProp devProp)
{  
    int cores = 0;
    int mp = devProp.multiProcessorCount;
    switch (devProp.major){
     case 2: // Fermi
      if (devProp.minor == 1) cores = mp * 48;
      else cores = mp * 32;
      break;
     case 3: // Kepler
      cores = mp * 192;
      break;
     case 5: // Maxwell
      cores = mp * 128;
      break;
     case 6: // Pascal
      if (devProp.minor == 1) cores = mp * 128;
      else if (devProp.minor == 0) cores = mp * 64;
      else printf("Unknown device type\n");
      break;
     default:
      printf("Unknown device type\n"); 
      break;
      }
    return cores;
}

int main(int argc, char *argv[]) 
{

    REAL **DACSTable = NULL;
    REAL *BindE_array = NULL;
    REAL **elastDCSTable = NULL;
    REAL **ieeCSTable = NULL;
    int i;
    long long where_all = 0;
    long long where = 0;
    int second_num = 0;
    int e_num = 1;
    int gEid = 0;
    int currentID;
    time_t t;
    srand((unsigned)time(&t));
    float acc_kerneltime = 0;
    
    int *dev_where, *dev_second_num, *dev_gEid;
    eStruct *dev_eQueue, *dev_e2Queue;
    int *dev_stepQ,  *dev_cntQ;
    FILE *report = NULL;
    int GPUDeviceNo;
    REAL Ecutoff;
        
    GPUDeviceNo = 0;
    Ecutoff = 7.4;
    
    cudaSetDevice(GPUDeviceNo);
    
    size_t total_memory;
    size_t free_memory;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, GPUDeviceNo);
    int cudacores = getSPcores(deviceProp);
    printf("cuda core = %d\n", cudacores);
    int globalMB = deviceProp.totalGlobalMem >> 20;
    printf("Total global memory = %d (MB)\n", globalMB);
    size_t sharedB = deviceProp.sharedMemPerBlock;
    printf("Shared memory per block = %zd (Bytes)\n", sharedB);

    ieeCSTable = (REAL**)malloc(sizeof(ieeCSTable)*(INTERACTION_TYPES-1));
    for (i=0; i<INTERACTION_TYPES-1; i++)
        ieeCSTable[i]=(REAL*)malloc(sizeof(REAL)*E_ENTRIES);
    
    long int row0s = (long int)ieeCSTable[0];
    long int row1s = (long int)ieeCSTable[1];
    int ieecsPitch = (row1s - row0s)/sizeof(REAL);
    
    DACSTable = (REAL**)malloc(sizeof(DACSTable)*2);
    for (i=0; i<2; i++)
        DACSTable[i]=(REAL*)malloc(sizeof(REAL)*DACS_ENTRIES);	
        
    long int daRow0s = (long int)DACSTable[0];
    long int daRow1s = (long int)DACSTable[1];
    int dacsPitch = (daRow1s - daRow0s)/sizeof(REAL);
    printf("daRow0s = %ld\n", daRow0s);
    printf("daRow1s = %ld\n", daRow1s);
    printf("DAC[0] = %pn\n", &DACSTable[0][0]);
    printf("DAC[1] = %pn\n", &DACSTable[1][0]);
    printf("dacsPitch = %d\n\n", dacsPitch);
    
    dacsPitch = 48;
    ieecsPitch = 84;
    
    elastDCSTable = (REAL**)malloc(sizeof(elastDCSTable)*ODDS);
    for (i=0; i<ODDS; i++)
        elastDCSTable[i]=(REAL*)malloc(sizeof(REAL)*E_ENTRIES);

    rd_dacs(DACSTable);
        
    float max_e = 0;
    
    eStruct *eQueue_ini = (eStruct *)malloc(e_num * sizeof(eStruct));
    iniElectron(&max_e, eQueue_ini, &currentID);	
    
    int memory_rqMB = GPUMEM_BASE_SIZE + e_num * GPUMEM_PER_INC;
    printf("Estimated GPU memory usage = %d (MB)\n", memory_rqMB);
    int batch = 1 + memory_rqMB/globalMB;
    
    int MaxN;
    int enumPerBatch = e_num/batch;
    int scale = 1;
    if (max_e > 100e3)
        scale = 1 + max_e/100e3;
    
    MaxN = enumPerBatch * scale * INC_2NDPARTICLES_100KEV;
    long long contsize = e_num * scale * INC_RADICALS_100KEV;
    
    printf("Total incident particles = %d\n", e_num);
    printf("Simulation in batchs = %d\n", batch);
    printf("Estimated Max. batch 2nd particles = %d\n", MaxN);
    
    float *e2ndQueue_test = (float *)malloc(MaxN * sizeof(float));
    memset(e2ndQueue_test, 0.0, sizeof(MaxN * sizeof(float)));
    int *host_stepQ = (int *)malloc(MaxN * sizeof(int));
    int *host_cntQ = (int *)malloc(MaxN * sizeof(int));
    cudaMalloc(&dev_stepQ, MaxN * sizeof(int));
    cudaMalloc(&dev_cntQ, MaxN * sizeof(int));
    cudaMalloc(&dev_e2Queue, MaxN * sizeof(eStruct));

    /********************************/
    /****   < TEXTURE MEMORY >   ****/
    /********************************/
    
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *dev_DACSTable, *dev_BindE_array, *dev_ieeCSTable, *dev_elastDCSTable;
    // cuda memory
    cudaMallocArray(&dev_DACSTable, &channelDesc, dacsPitch, 2);
    
    // cuda memory
    cudaMemcpyToArray(dev_DACSTable    , 0, 0, &DACSTable[0][0], 2 * dacsPitch * sizeof(REAL), cudaMemcpyHostToDevice);
    
    // resource description -> from cuda memory
    struct cudaResourceDesc resD_DACSTable;
        memset(&resD_DACSTable, 0, sizeof(resD_DACSTable));
        resD_DACSTable.resType = cudaResourceTypeArray;
        resD_DACSTable.res.array.array = dev_DACSTable;
    
    // texture description -> from cuda memory
    struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeClamp;
        texDesc.addressMode[1]   = cudaAddressModeClamp;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.filterMode       = cudaFilterModeLinear;
        
    cudaTextureObject_t texObj_DACSTable    = 0;
    cudaCreateTextureObject(&texObj_DACSTable, &resD_DACSTable,   &texDesc, NULL);
    
    int N = e_num;
    run_kernel<<<(N+K_THREADS-1)/K_THREADS, K_THREADS>>>(N, texObj_DACSTable);
        
    cudaFree(dev_e2Queue);
    cudaFree(dev_where);
    cudaFree(dev_second_num);
    cudaFree(dev_DACSTable);
    cudaFree(dev_BindE_array);
    cudaFree(dev_ieeCSTable);
    cudaFree(dev_elastDCSTable);

    free(DACSTable);
    free(BindE_array);
    free(elastDCSTable);
    free(ieeCSTable);
	

}
  1. Why are you using an asm trap instruction in your kernel? Unless you have a specific and good reason for doing so, you shouldn’t be using it. The tools don’t react well to that instruction, because it corrupts the CUDA context by definition.

  2. After launching a kernel, for proper profiling operation you should at a minimum have some sort of synchronizing call, such as cudaDeviceSynchronize() (a minimal sketch follows this list).
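
For illustration, here is a minimal sketch of what is meant; the error check is illustrative, not something taken from the posted code:

    run_kernel<<<(N + K_THREADS - 1) / K_THREADS, K_THREADS>>>(N, texObj_DACSTable);
    cudaError_t err = cudaDeviceSynchronize();  // wait for the kernel (and its printf output) to complete
    if (err != cudaSuccess)
        printf("kernel error: %s\n", cudaGetErrorString(err));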

When I remove the asm trap and add cudaDeviceSynchronize(), I get the same output with or without nvprof.

Sorry, I forgot to remove the asm trap; that was just for debugging.
Where do you add cudaDeviceSynchronize()? I removed the trap and added cudaDeviceSynchronize() at the end of the kernel function, but the results were still different.

Thanks.

Add cudaDeviceSynchronize() at the end of main, before the cudaFree calls.

I tried on two machines:
CentOS 7.6, TITAN V, nvcc 10.1
Ubuntu 16.04, Quadro P4000, nvcc 9.2

On both, the results were still different with and without nvprof.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>

#define REAL float

#define DACS_ENTRIES 44
#define E_ENTRIES 81

#define ZERO 1.0e-20
#define SZERO 1.0e-6

#define NPART 131072 //	number of particles simulated simultaneously
#define DACS_OFFSET 43
#define BINDING_ITEMS 5

#define ODDS 100
#define INTERACTION_TYPES 12
#define MCC 510998.9461 // rest mass energy ~511keV
#define TWOMCC 1021997.8922 //2*MC^2
#define M2C4 261119922915.31070521 //M^2C^4
#define PI 3.14159265359
#define C 299792458

#define GPUMEM_BASE_SIZE 300 // memory size in MB estimated for base needed
#define GPUMEM_PER_INC 4 // memory size in MB estimated per added particle

#define INC_2NDPARTICLES_100KEV 2500
#define INC_RADICALS_100KEV 12000

#define K_THREADS 128

typedef struct
{ 
    REAL x, y, z;
    REAL ux, uy, uz;
    REAL e;
	int h2oState;
    int dead;
    REAL path;
	REAL elape_time;
	int id;
	int parentId;
} eStruct;


__global__ 
void run_kernel(int N, cudaTextureObject_t DACSTable)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < N)
	{
        float das0, das1, das2, das3, das4, das5;
        das0 = tex2D<REAL>(DACSTable, 0, 1.5);
        printf(" das0=%f\n", das0);
        das1 = tex2D<REAL>(DACSTable, 1, 1.5);
        printf(" das1=%f\n", das1);
        das2 = tex2D<REAL>(DACSTable, 2, 1.5);
        printf(" das2=%f\n", das2);
        das3 = tex2D<REAL>(DACSTable, 3, 1.5);
        printf(" das3=%f\n", das3);
        das4 = tex2D<REAL>(DACSTable, 4, 1.5);
        printf(" das4=%f\n", das4);
        das5 = tex2D<REAL>(DACSTable, 5, 1.5);
        printf(" das5=%f\n", das5);
	}
}


void iniElectron(float *max_e, eStruct *eQueue, int *id)
{
    eQueue[0].x = 0;
    eQueue[0].y = 0;
    eQueue[0].z = 5.5e-4;        
    eQueue[0].ux = 3.673770e-01;
    eQueue[0].uy = 4.772163e-01;
    eQueue[0].uz = 7.983099e-01;        
    eQueue[0].e = 50000;
    eQueue[0].h2oState = 99;
    eQueue[0].dead = 0;
    eQueue[0].path = 0;		
    eQueue[0].elape_time = 0.0;		
    eQueue[0].id = *id;		
    eQueue[0].parentId = 0;
}

void rd_dacs(REAL **DACSTable)
{
    DACSTable[0][0] = 4.300000 ;
    DACSTable[0][1] = 4.500000 ;
    DACSTable[0][2] = 4.700000 ;
    DACSTable[0][3] = 4.900000 ;
    DACSTable[0][4] = 5.100000 ;
    DACSTable[0][5] = 5.300000 ;
    DACSTable[0][6] = 5.500000 ;
    DACSTable[0][7] = 5.700000 ;
    DACSTable[0][8] = 5.900000 ;
    DACSTable[0][9] = 6.100000 ;
    DACSTable[0][10] = 6.300000 ;
    DACSTable[0][11] = 6.500000 ;
    DACSTable[0][12] = 6.700000 ;
    DACSTable[0][13] = 6.900000 ;
    DACSTable[0][14] = 7.100000 ;
    DACSTable[0][15] = 7.300000 ;
    DACSTable[0][16] = 7.500000 ;
    DACSTable[0][17] = 7.700000 ;
    DACSTable[0][18] = 7.900000 ;
    DACSTable[0][19] = 8.100000 ;
    DACSTable[0][20] = 8.300000 ;
    DACSTable[0][21] = 8.500000 ;
    DACSTable[0][22] = 8.700000 ;
    DACSTable[0][23] = 8.900000 ;
    DACSTable[0][24] = 9.100000 ;
    DACSTable[0][25] = 9.300000 ;
    DACSTable[0][26] = 9.500000 ;
    DACSTable[0][27] = 9.700000 ;
    DACSTable[0][28] = 9.900000 ;
    DACSTable[0][29] = 10.100000;
    DACSTable[0][30] = 10.300000;
    DACSTable[0][31] = 10.500000;
    DACSTable[0][32] = 10.700000;
    DACSTable[0][33] = 10.900000;
    DACSTable[0][34] = 11.100000;
    DACSTable[0][35] = 11.300000;
    DACSTable[0][36] = 11.500000;
    DACSTable[0][37] = 11.700000;
    DACSTable[0][38] = 11.900000;
    DACSTable[0][39] = 12.100000;
    DACSTable[0][40] = 12.300000;
    DACSTable[0][41] = 12.500000;
    DACSTable[0][42] = 12.700000;
    DACSTable[0][43] = 12.900000;


    DACSTable[1][0] = 0.000000      ;
    DACSTable[1][1] = 668.553983    ;
    DACSTable[1][2] = 1337.107966   ;
    DACSTable[1][3] = 2005.661948   ;
    DACSTable[1][4] = 2674.215931   ;
    DACSTable[1][5] = 3342.769914   ;
    DACSTable[1][6] = 5014.154871   ;
    DACSTable[1][7] = 6685.539828   ;
    DACSTable[1][8] = 26742.159312  ;
    DACSTable[1][9] = 93597.557591  ;
    DACSTable[1][10] = 203908.964752 ;
    DACSTable[1][11] = 213937.274493 ;
    DACSTable[1][12] = 183852.345268 ;
    DACSTable[1][13] = 127025.256731 ;
    DACSTable[1][14] = 96940.327505  ;
    DACSTable[1][15] = 63512.628365  ;
    DACSTable[1][16] = 30084.929226  ;
    DACSTable[1][17] = 26742.159312  ;
    DACSTable[1][18] = 30084.929226  ;
    DACSTable[1][19] = 43456.008881  ;
    DACSTable[1][20] = 46798.778795  ;
    DACSTable[1][21] = 33427.699140  ;
    DACSTable[1][22] = 30084.929226  ;
    DACSTable[1][23] = 26742.159312  ;
    DACSTable[1][24] = 20056.619484  ;
    DACSTable[1][25] = 16713.849570  ;
    DACSTable[1][26] = 10028.309742  ;
    DACSTable[1][27] = 6685.539828   ;
    DACSTable[1][28] = 3342.769914   ;
    DACSTable[1][29] =  3342.769914  ;
    DACSTable[1][30] =  5014.154871  ;
    DACSTable[1][31] =  6685.539828  ;
    DACSTable[1][32] =  10028.309742 ;
    DACSTable[1][33] =  13371.079656 ;
    DACSTable[1][34] =  16713.849570 ;
    DACSTable[1][35] =  20056.619484 ;
    DACSTable[1][36] =  16713.849570 ;
    DACSTable[1][37] =  10028.309742 ;
    DACSTable[1][38] =  6685.539828  ;
    DACSTable[1][39] =  3342.769914  ;
    DACSTable[1][40] =  2339.938940  ;
    DACSTable[1][41] =  1337.107966  ;
    DACSTable[1][42] =  334.276991   ;
    DACSTable[1][43] =  0.000000     ;    
}

int getSPcores(cudaDeviceProp devProp)
{  
    int cores = 0;
    int mp = devProp.multiProcessorCount;
    switch (devProp.major){
     case 2: // Fermi
      if (devProp.minor == 1) cores = mp * 48;
      else cores = mp * 32;
      break;
     case 3: // Kepler
      cores = mp * 192;
      break;
     case 5: // Maxwell
      cores = mp * 128;
      break;
     case 6: // Pascal
      if (devProp.minor == 1) cores = mp * 128;
      else if (devProp.minor == 0) cores = mp * 64;
      else printf("Unknown device type\n");
      break;
     default:
      printf("Unknown device type\n"); 
      break;
      }
    return cores;
}


int main(int argc, char *argv[]) 
{
    
    
    REAL **DACSTable = NULL;
    REAL *BindE_array = NULL;
    REAL **elastDCSTable = NULL;
    REAL **ieeCSTable = NULL;
    int i;
    long long where_all = 0;
    long long where = 0;
    int second_num = 0;
    int e_num = 1;
    int gEid = 0;
    int currentID;
    time_t t;
    srand((unsigned)time(&t));
    float acc_kerneltime = 0;
    
    int *dev_where, *dev_second_num, *dev_gEid;
    eStruct *dev_eQueue, *dev_e2Queue;
    int *dev_stepQ,  *dev_cntQ;
    FILE *report = NULL;
    int GPUDeviceNo;
    REAL Ecutoff;
        
    GPUDeviceNo = 0;
    Ecutoff = 7.4;
    
    cudaSetDevice(GPUDeviceNo);
    
    size_t total_memory;
    size_t free_memory;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, GPUDeviceNo);
    int cudacores = getSPcores(deviceProp);
    printf("cuda core = %d\n", cudacores);
    int globalMB = deviceProp.totalGlobalMem >> 20;
    printf("Total global memory = %d (MB)\n", globalMB);
    size_t sharedB = deviceProp.sharedMemPerBlock;
    printf("Shared memory per block = %zd (Bytes)\n", sharedB);
    
    
    ieeCSTable = (REAL**)malloc(sizeof(ieeCSTable)*(INTERACTION_TYPES-1));
    for (i=0; i<INTERACTION_TYPES-1; i++)
        ieeCSTable[i]=(REAL*)malloc(sizeof(REAL)*E_ENTRIES);
    
    long int row0s = (long int)ieeCSTable[0];
    long int row1s = (long int)ieeCSTable[1];
    int ieecsPitch = (row1s - row0s)/sizeof(REAL);
    
    DACSTable = (REAL**)malloc(sizeof(DACSTable)*2);
    for (i=0; i<2; i++)
        DACSTable[i]=(REAL*)malloc(sizeof(REAL)*DACS_ENTRIES);	
        
    long int daRow0s = (long int)DACSTable[0];
    long int daRow1s = (long int)DACSTable[1];
    int dacsPitch = (daRow1s - daRow0s)/sizeof(REAL);
    printf("daRow0s = %ld\n", daRow0s);
    printf("daRow1s = %ld\n", daRow1s);
    printf("DAC[0] = %pn\n", &DACSTable[0][0]);
    printf("DAC[1] = %pn\n", &DACSTable[1][0]);
    printf("dacsPitch = %d\n\n", dacsPitch);
    
    dacsPitch = 48;
    ieecsPitch = 84;
    
    elastDCSTable = (REAL**)malloc(sizeof(elastDCSTable)*ODDS);
    for (i=0; i<ODDS; i++)
        elastDCSTable[i]=(REAL*)malloc(sizeof(REAL)*E_ENTRIES);
    
        
    rd_dacs(DACSTable);
        
    float max_e = 0;
    
    eStruct *eQueue_ini = (eStruct *)malloc(e_num * sizeof(eStruct));
    iniElectron(&max_e, eQueue_ini, &currentID);	
    
    int memory_rqMB = GPUMEM_BASE_SIZE + e_num * GPUMEM_PER_INC;
    printf("Estimated GPU memory usage = %d (MB)\n", memory_rqMB);
    int batch = 1 + memory_rqMB/globalMB;
    
    int MaxN;
    int enumPerBatch = e_num/batch;
    int scale = 1;
    if (max_e > 100e3)
        scale = 1 + max_e/100e3;
    
    MaxN = enumPerBatch * scale * INC_2NDPARTICLES_100KEV;
    long long contsize = e_num * scale * INC_RADICALS_100KEV;
    
    printf("Total incident particles = %d\n", e_num);
    printf("Simulation in batchs = %d\n", batch);
    printf("Estimated Max. batch 2nd particles = %d\n", MaxN);
    
    float *e2ndQueue_test = (float *)malloc(MaxN * sizeof(float));
    memset(e2ndQueue_test, 0.0, sizeof(MaxN * sizeof(float)));
    int *host_stepQ = (int *)malloc(MaxN * sizeof(int));
    int *host_cntQ = (int *)malloc(MaxN * sizeof(int));
    cudaMalloc(&dev_stepQ, MaxN * sizeof(int));
    cudaMalloc(&dev_cntQ, MaxN * sizeof(int));
    cudaMalloc(&dev_e2Queue, MaxN * sizeof(eStruct));
    
    
    /********************************/
    /****   < TEXTURE MEMORY >   ****/
    /********************************/
    
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *dev_DACSTable, *dev_BindE_array, *dev_ieeCSTable, *dev_elastDCSTable;
    // cuda memory
    cudaMallocArray(&dev_DACSTable, &channelDesc, dacsPitch, 2);
    
    // cuda memory
    cudaMemcpyToArray(dev_DACSTable    , 0, 0, &DACSTable[0][0], 2 * dacsPitch * sizeof(REAL), cudaMemcpyHostToDevice);
    
    // resource description -> from cuda memory
    struct cudaResourceDesc resD_DACSTable;
        memset(&resD_DACSTable, 0, sizeof(resD_DACSTable));
        resD_DACSTable.resType = cudaResourceTypeArray;
        resD_DACSTable.res.array.array = dev_DACSTable;
    
    // texture description -> from cuda memory
    struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeClamp;
        texDesc.addressMode[1]   = cudaAddressModeClamp;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.filterMode       = cudaFilterModeLinear;
        
    cudaTextureObject_t texObj_DACSTable    = 0;
    cudaCreateTextureObject(&texObj_DACSTable, &resD_DACSTable,   &texDesc, NULL);
    
    int N = e_num;
    run_kernel<<<(N+K_THREADS-1)/K_THREADS, K_THREADS>>>(N, texObj_DACSTable);
    
    cudaDeviceSynchronize();
   
    cudaFree(dev_e2Queue);
    cudaFree(dev_where);
    cudaFree(dev_second_num);
    cudaFree(dev_DACSTable);
    cudaFree(dev_BindE_array);
    cudaFree(dev_ieeCSTable);
    cudaFree(dev_elastDCSTable);
    
    
    free(DACSTable);
    free(BindE_array);
    free(elastDCSTable);
    free(ieeCSTable);
	

}

When I compile the code you have posted, I get warnings like this:

t1541.cu(353): warning: variable "dev_where" is used before its value is set

t1541.cu(354): warning: variable "dev_second_num" is used before its value is set

t1541.cu(356): warning: variable "dev_BindE_array" is used before its value is set

t1541.cu(357): warning: variable "dev_ieeCSTable" is used before its value is set

t1541.cu(358): warning: variable "dev_elastDCSTable" is used before its value is set

You should not ignore those (ever, in my opinion). In this case you may think they are benign because they come from your cudaFree statements. If you cudaFree a pointer you have not previously allocated, you will get a CUDA runtime error. You generally want to make sure your application is free of any runtime errors (a minimal error-checking sketch follows the list below):

  1. Before asking others for help with it
  2. Before attempting to use any profiling tools on it.
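
As a minimal sketch of that kind of checking (the CUDA_CHECK name is my own illustrative choice, not something from the posted code):

    #define CUDA_CHECK(call) do {                                        \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            printf("CUDA error %s at %s:%d\n",                           \
                   cudaGetErrorString(err_), __FILE__, __LINE__);        \
            exit(1);                                                     \
        } } while (0)

    CUDA_CHECK(cudaMalloc(&dev_stepQ, MaxN * sizeof(int)));
    CUDA_CHECK(cudaFree(dev_stepQ));  // freeing a pointer that was never allocated is reported here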

However, on my setup with CUDA 10.1.243 on CentOS7 and a tesla V100, I get this output:

$ ./t1541
Unknown device type
cuda core = 0
Total global memory = 32480 (MB)
Shared memory per block = 49152 (Bytes)
daRow0s = 13494592
daRow1s = 13494784
DAC[0] = 0xcde940n
DAC[1] = 0xcdea00n
dacsPitch = 48

Estimated GPU memory usage = 304 (MB)
Total incident particles = 1
Simulation in batchs = 1
Estimated Max. batch 2nd particles = 2500
 das0=0.000000
 das1=334.276978
 das2=1002.830933
 das3=1671.385010
 das4=2339.938965
 das5=3008.492920
$ nvprof ./t1541
==25002== NVPROF is profiling process 25002, command: ./t1541
Unknown device type
cuda core = 0
Total global memory = 32480 (MB)
Shared memory per block = 49152 (Bytes)
daRow0s = 22016160
daRow1s = 22016352
DAC[0] = 0x14ff0a0n
DAC[1] = 0x14ff160n
dacsPitch = 48

Estimated GPU memory usage = 304 (MB)
Total incident particles = 1
Simulation in batchs = 1
Estimated Max. batch 2nd particles = 2500
 das0=0.000000
 das1=334.276978
 das2=1002.830933
 das3=1671.385010
 das4=2339.938965
 das5=3008.492920
==25002== Profiling application: ./t1541
==25002== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   98.40%  222.63us         1  222.63us  222.63us  222.63us  run_kernel(int, __int64)
                    1.60%  3.6160us         1  3.6160us  3.6160us  3.6160us  [CUDA memcpy HtoA]
      API calls:   95.11%  312.77ms         3  104.26ms  8.0240us  312.74ms  cudaMalloc
                    2.01%  6.5942ms       388  16.995us     344ns  1.8024ms  cuDeviceGetAttribute
                    1.63%  5.3679ms         4  1.3420ms  641.00us  3.3724ms  cuDeviceTotalMem
                    0.36%  1.1805ms         1  1.1805ms  1.1805ms  1.1805ms  cudaGetDeviceProperties
                    0.35%  1.1478ms         1  1.1478ms  1.1478ms  1.1478ms  cudaDeviceSynchronize
                    0.31%  1.0030ms         1  1.0030ms  1.0030ms  1.0030ms  cudaMallocArray
                    0.17%  547.86us         4  136.96us  98.376us  237.71us  cuDeviceGetName
                    0.03%  103.16us         1  103.16us  103.16us  103.16us  cudaLaunchKernel
                    0.01%  38.904us         1  38.904us  38.904us  38.904us  cudaMemcpyToArray
                    0.01%  27.949us         1  27.949us  27.949us  27.949us  cudaCreateTextureObject
                    0.01%  27.162us         4  6.7900us  3.5150us  12.000us  cuDeviceGetPCIBusId
                    0.01%  23.897us         7  3.4130us     449ns  20.008us  cudaFree
                    0.01%  16.745us         1  16.745us  16.745us  16.745us  cudaSetDevice
                    0.00%  8.1250us         8  1.0150us     405ns  1.8000us  cuDeviceGet
                    0.00%  4.1880us         3  1.3960us     342ns  2.4380us  cuDeviceGetCount
                    0.00%  2.8470us         4     711ns     629ns     884ns  cuDeviceGetUuid
                    0.00%     463ns         1     463ns     463ns     463ns  cudaCreateChannelDesc
$

With respect to the kernel output (das0 - das5), the results appear to be identical. Note that if you are compiling with -arch=sm_70, that won’t work on your P4000.
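
For example, the Quadro P4000 is compute capability 6.1, so a compile line along these lines (illustrative, mirroring the flags used earlier in the thread) would be appropriate:

    nvcc run.cu -G -g -arch=sm_61 -rdc=true -lcudadevrt -lcuda -lcudart -o run

Alternatively, -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 builds a fat binary that covers both GPUs.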

Before posting further, run your code with cuda-memcheck. Keep modifying your code until cuda-memcheck reports no errors. Then try your profiling again.
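
For example (assuming the executable is named ./run, as in the earlier posts):

    cuda-memcheck ./run

Among other things, cuda-memcheck reports out-of-bounds and misaligned device memory accesses; adding --leak-check full also reports device allocations that were never freed.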

I compiled with -arch=sm_61 on the P4000; thank you for the reminder.

In the code below I have removed all the warnings, and cuda-memcheck reports no errors.

But the results are still different with and without nvprof.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>

#define REAL float

#define DACS_ENTRIES 44
#define E_ENTRIES 81

#define ZERO 1.0e-20
#define SZERO 1.0e-6

#define NPART 131072 //	number of particles simulated simultaneously
#define DACS_OFFSET 43
#define BINDING_ITEMS 5

#define ODDS 100
#define INTERACTION_TYPES 12
#define MCC 510998.9461 // rest mass energy ~511keV
#define TWOMCC 1021997.8922 //2*MC^2
#define M2C4 261119922915.31070521 //M^2C^4
#define PI 3.14159265359
#define C 299792458

#define GPUMEM_BASE_SIZE 300 // memory size in MB estimated for base needed
#define GPUMEM_PER_INC 4 // memory size in MB estimated per added particle

#define INC_2NDPARTICLES_100KEV 2500
#define INC_RADICALS_100KEV 12000

#define K_THREADS 128

typedef struct
{ 
    REAL x, y, z;
    REAL ux, uy, uz;
    REAL e;
	int h2oState;
    int dead;
    REAL path;
	REAL elape_time;
	int id;
	int parentId;
} eStruct;

__global__ 
void run_kernel(int N, cudaTextureObject_t DACSTable)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < N)
	{
        float das0, das1, das2, das3, das4, das5;
        das0 = tex2D<REAL>(DACSTable, 0, 1.5);
        printf(" das0=%f\n", das0);
        das1 = tex2D<REAL>(DACSTable, 1, 1.5);
        printf(" das1=%f\n", das1);
        das2 = tex2D<REAL>(DACSTable, 2, 1.5);
        printf(" das2=%f\n", das2);
        das3 = tex2D<REAL>(DACSTable, 3, 1.5);
        printf(" das3=%f\n", das3);
        das4 = tex2D<REAL>(DACSTable, 4, 1.5);
        printf(" das4=%f\n", das4);
        das5 = tex2D<REAL>(DACSTable, 5, 1.5);
        printf(" das5=%f\n", das5);
	}
}

void iniElectron(float *max_e, eStruct *eQueue, int *id)
{
    eQueue[0].x = 0;
    eQueue[0].y = 0;
    eQueue[0].z = 5.5e-4;        
    eQueue[0].ux = 3.673770e-01;
    eQueue[0].uy = 4.772163e-01;
    eQueue[0].uz = 7.983099e-01;        
    eQueue[0].e = 50000;
    eQueue[0].h2oState = 99;
    eQueue[0].dead = 0;
    eQueue[0].path = 0;		
    eQueue[0].elape_time = 0.0;		
    eQueue[0].id = *id;		
    eQueue[0].parentId = 0;
}

void rd_dacs(REAL **DACSTable)
{
    DACSTable[0][0] = 4.300000 ;
    DACSTable[0][1] = 4.500000 ;
    DACSTable[0][2] = 4.700000 ;
    DACSTable[0][3] = 4.900000 ;
    DACSTable[0][4] = 5.100000 ;
    DACSTable[0][5] = 5.300000 ;
    DACSTable[0][6] = 5.500000 ;
    DACSTable[0][7] = 5.700000 ;
    DACSTable[0][8] = 5.900000 ;
    DACSTable[0][9] = 6.100000 ;
    DACSTable[0][10] = 6.300000 ;
    DACSTable[0][11] = 6.500000 ;
    DACSTable[0][12] = 6.700000 ;
    DACSTable[0][13] = 6.900000 ;
    DACSTable[0][14] = 7.100000 ;
    DACSTable[0][15] = 7.300000 ;
    DACSTable[0][16] = 7.500000 ;
    DACSTable[0][17] = 7.700000 ;
    DACSTable[0][18] = 7.900000 ;
    DACSTable[0][19] = 8.100000 ;
    DACSTable[0][20] = 8.300000 ;
    DACSTable[0][21] = 8.500000 ;
    DACSTable[0][22] = 8.700000 ;
    DACSTable[0][23] = 8.900000 ;
    DACSTable[0][24] = 9.100000 ;
    DACSTable[0][25] = 9.300000 ;
    DACSTable[0][26] = 9.500000 ;
    DACSTable[0][27] = 9.700000 ;
    DACSTable[0][28] = 9.900000 ;
    DACSTable[0][29] = 10.100000;
    DACSTable[0][30] = 10.300000;
    DACSTable[0][31] = 10.500000;
    DACSTable[0][32] = 10.700000;
    DACSTable[0][33] = 10.900000;
    DACSTable[0][34] = 11.100000;
    DACSTable[0][35] = 11.300000;
    DACSTable[0][36] = 11.500000;
    DACSTable[0][37] = 11.700000;
    DACSTable[0][38] = 11.900000;
    DACSTable[0][39] = 12.100000;
    DACSTable[0][40] = 12.300000;
    DACSTable[0][41] = 12.500000;
    DACSTable[0][42] = 12.700000;
    DACSTable[0][43] = 12.900000;

    DACSTable[1][0] = 0.000000      ;
    DACSTable[1][1] = 668.553983    ;
    DACSTable[1][2] = 1337.107966   ;
    DACSTable[1][3] = 2005.661948   ;
    DACSTable[1][4] = 2674.215931   ;
    DACSTable[1][5] = 3342.769914   ;
    DACSTable[1][6] = 5014.154871   ;
    DACSTable[1][7] = 6685.539828   ;
    DACSTable[1][8] = 26742.159312  ;
    DACSTable[1][9] = 93597.557591  ;
    DACSTable[1][10] = 203908.964752 ;
    DACSTable[1][11] = 213937.274493 ;
    DACSTable[1][12] = 183852.345268 ;
    DACSTable[1][13] = 127025.256731 ;
    DACSTable[1][14] = 96940.327505  ;
    DACSTable[1][15] = 63512.628365  ;
    DACSTable[1][16] = 30084.929226  ;
    DACSTable[1][17] = 26742.159312  ;
    DACSTable[1][18] = 30084.929226  ;
    DACSTable[1][19] = 43456.008881  ;
    DACSTable[1][20] = 46798.778795  ;
    DACSTable[1][21] = 33427.699140  ;
    DACSTable[1][22] = 30084.929226  ;
    DACSTable[1][23] = 26742.159312  ;
    DACSTable[1][24] = 20056.619484  ;
    DACSTable[1][25] = 16713.849570  ;
    DACSTable[1][26] = 10028.309742  ;
    DACSTable[1][27] = 6685.539828   ;
    DACSTable[1][28] = 3342.769914   ;
    DACSTable[1][29] =  3342.769914  ;
    DACSTable[1][30] =  5014.154871  ;
    DACSTable[1][31] =  6685.539828  ;
    DACSTable[1][32] =  10028.309742 ;
    DACSTable[1][33] =  13371.079656 ;
    DACSTable[1][34] =  16713.849570 ;
    DACSTable[1][35] =  20056.619484 ;
    DACSTable[1][36] =  16713.849570 ;
    DACSTable[1][37] =  10028.309742 ;
    DACSTable[1][38] =  6685.539828  ;
    DACSTable[1][39] =  3342.769914  ;
    DACSTable[1][40] =  2339.938940  ;
    DACSTable[1][41] =  1337.107966  ;
    DACSTable[1][42] =  334.276991   ;
    DACSTable[1][43] =  0.000000     ;    
}

int getSPcores(cudaDeviceProp devProp)
{  
    int cores = 0;
    int mp = devProp.multiProcessorCount;
    switch (devProp.major){
     case 2: // Fermi
      if (devProp.minor == 1) cores = mp * 48;
      else cores = mp * 32;
      break;
     case 3: // Kepler
      cores = mp * 192;
      break;
     case 5: // Maxwell
      cores = mp * 128;
      break;
     case 6: // Pascal
      if (devProp.minor == 1) cores = mp * 128;
      else if (devProp.minor == 0) cores = mp * 64;
      else printf("Unknown device type\n");
      break;
     default:
      printf("Unknown device type\n"); 
      break;
      }
    return cores;
}

int main(int argc, char *argv[]) 
{

    REAL **DACSTable = NULL;
    REAL **elastDCSTable = NULL;
    REAL **ieeCSTable = NULL;
    int i;
    int e_num = 1;
    int currentID;
    time_t t;
    srand((unsigned)time(&t));
    
    int GPUDeviceNo;
        
    GPUDeviceNo = 0;
    
    cudaSetDevice(GPUDeviceNo);
    
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, GPUDeviceNo);
    int cudacores = getSPcores(deviceProp);
    printf("cuda core = %d\n", cudacores);
    int globalMB = deviceProp.totalGlobalMem >> 20;
    printf("Total global memory = %d (MB)\n", globalMB);
    size_t sharedB = deviceProp.sharedMemPerBlock;
    printf("Shared memory per block = %zd (Bytes)\n", sharedB);

    ieeCSTable = (REAL**)malloc(sizeof(ieeCSTable)*(INTERACTION_TYPES-1));
    for (i=0; i<INTERACTION_TYPES-1; i++)
        ieeCSTable[i]=(REAL*)malloc(sizeof(REAL)*E_ENTRIES);
    
    DACSTable = (REAL**)malloc(sizeof(DACSTable)*2);
    for (i=0; i<2; i++)
        DACSTable[i]=(REAL*)malloc(sizeof(REAL)*DACS_ENTRIES);	
        
    long int daRow0s = (long int)DACSTable[0];
    long int daRow1s = (long int)DACSTable[1];
    int dacsPitch = (daRow1s - daRow0s)/sizeof(REAL);
    printf("daRow0s = %ld\n", daRow0s);
    printf("daRow1s = %ld\n", daRow1s);
    printf("DAC[0] = %pn\n", &DACSTable[0][0]);
    printf("DAC[1] = %pn\n", &DACSTable[1][0]);
    printf("dacsPitch = %d\n\n", dacsPitch);
    
    dacsPitch = 48;
    
    elastDCSTable = (REAL**)malloc(sizeof(elastDCSTable)*ODDS);
    for (i=0; i<ODDS; i++)
        elastDCSTable[i]=(REAL*)malloc(sizeof(REAL)*E_ENTRIES);

    rd_dacs(DACSTable);
        
    float max_e = 0;
    
    eStruct *eQueue_ini = (eStruct *)malloc(e_num * sizeof(eStruct));
    iniElectron(&max_e, eQueue_ini, &currentID);	
    
    int memory_rqMB = GPUMEM_BASE_SIZE + e_num * GPUMEM_PER_INC;
    printf("Estimated GPU memory usage = %d (MB)\n", memory_rqMB);
    int batch = 1 + memory_rqMB/globalMB;
    
    int MaxN;
    int enumPerBatch = e_num/batch;
    int scale = 1;
    if (max_e > 100e3)
        scale = 1 + max_e/100e3;
    
    MaxN = enumPerBatch * scale * INC_2NDPARTICLES_100KEV;
    printf("Total incident particles = %d\n", e_num);
    printf("Simulation in batchs = %d\n", batch);
    printf("Estimated Max. batch 2nd particles = %d\n", MaxN);
    
    float *e2ndQueue_test = (float *)malloc(MaxN * sizeof(float));
    memset(e2ndQueue_test, 0.0, sizeof(MaxN * sizeof(float)));
    int *host_stepQ = (int *)malloc(MaxN * sizeof(int));
    int *host_cntQ = (int *)malloc(MaxN * sizeof(int));    
    
    /********************************/
    /****   < TEXTURE MEMORY >   ****/
    /********************************/
    
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *dev_DACSTable;
    // cuda memory
    cudaMallocArray(&dev_DACSTable, &channelDesc, dacsPitch, 2);
    
    // cuda memory
    cudaMemcpyToArray(dev_DACSTable    , 0, 0, &DACSTable[0][0], 2 * dacsPitch * sizeof(REAL), cudaMemcpyHostToDevice);
    
    // resource description -> from cuda memory
    struct cudaResourceDesc resD_DACSTable;
        memset(&resD_DACSTable, 0, sizeof(resD_DACSTable));
        resD_DACSTable.resType = cudaResourceTypeArray;
        resD_DACSTable.res.array.array = dev_DACSTable;
    
    // texture description -> from cuda memory
    struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeClamp;
        texDesc.addressMode[1]   = cudaAddressModeClamp;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.filterMode       = cudaFilterModeLinear;
        
    cudaTextureObject_t texObj_DACSTable    = 0;
    cudaCreateTextureObject(&texObj_DACSTable, &resD_DACSTable,   &texDesc, NULL);
    
    int N = e_num;
    run_kernel<<<(N+K_THREADS-1)/K_THREADS, K_THREADS>>>(N, texObj_DACSTable);
    
    cudaDeviceSynchronize();
   
    //cudaFree(dev_DACSTable);

    free(DACSTable);
    free(elastDCSTable);
    free(ieeCSTable);
	

}

I got the same result with and without nvprof for das0 - das5.
This was using nvcc with -arch=sm_62 on the newest JetPack + TX2 (Ubuntu 18.04 with CUDA 10.0).

nvidia@tegra-ubuntu:~$ ./run 
Unknown device type
cuda core = 0
Total global memory = 7851 (MB)
Shared memory per block = 49152 (Bytes)
daRow0s = 367665359520
daRow1s = 367665359712
DAC[0] = 0x559a902aa0n
DAC[1] = 0x559a902b60n
dacsPitch = 48

Estimated GPU memory usage = 304 (MB)
Total incident particles = 1
Simulation in batchs = 1
Estimated Max. batch 2nd particles = 2500
 das0=0.000000
 das1=334.276978
 das2=1002.830933
 das3=1671.385010
 das4=2339.938965
 das5=3008.492920

nvidia@tegra-ubuntu:~$ sudo /usr/local/cuda-10.0/bin/nvprof ./run
[sudo] password for nvidia: 
==4740== NVPROF is profiling process 4740, command: ./run
==4740== Warning: Unified Memory Profiling is not supported on the underlying platform. System requirements for unified memory can be found at: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
Unknown device type
cuda core = 0
Total global memory = 7851 (MB)
Shared memory per block = 49152 (Bytes)
daRow0s = 367366285344
daRow1s = 367366285536
DAC[0] = 0x5588bca820n
DAC[1] = 0x5588bca8e0n
dacsPitch = 48

Estimated GPU memory usage = 304 (MB)
Total incident particles = 1
Simulation in batchs = 1
Estimated Max. batch 2nd particles = 2500
 das0=0.000000
 das1=334.276978
 das2=1002.830933
 das3=1671.385010
 das4=2339.938965
 das5=3008.492920
==4740== Profiling application: ./run
==4740== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.60%  2.5681ms         1  2.5681ms  2.5681ms  2.5681ms  run_kernel(int, __int64)
                    0.40%  10.371us         1  10.371us  10.371us  10.371us  [CUDA memcpy HtoA]
      API calls:   98.86%  423.11ms         1  423.11ms  423.11ms  423.11ms  cudaMallocArray
                    0.86%  3.6693ms         1  3.6693ms  3.6693ms  3.6693ms  cudaDeviceSynchronize
                    0.12%  514.15us        96  5.3550us  4.2240us  64.640us  cuDeviceGetAttribute
                    0.06%  267.71us         1  267.71us  267.71us  267.71us  cudaLaunchKernel
                    0.03%  145.79us         1  145.79us  145.79us  145.79us  cudaMemcpyToArray
                    0.03%  126.56us         1  126.56us  126.56us  126.56us  cudaGetDeviceProperties
                    0.01%  61.153us         1  61.153us  61.153us  61.153us  cudaCreateTextureObject
                    0.01%  34.272us         1  34.272us  34.272us  34.272us  cudaSetDevice
                    0.00%  18.048us         3  6.0160us  4.6720us  7.8080us  cuDeviceGetCount
                    0.00%  17.792us         1  17.792us  17.792us  17.792us  cuDeviceTotalMem
                    0.00%  10.208us         2  5.1040us  4.7680us  5.4400us  cuDeviceGet
                    0.00%  5.1200us         1  5.1200us  5.1200us  5.1200us  cuDeviceGetName
                    0.00%  4.5120us         1  4.5120us  4.5120us  4.5120us  cuDeviceGetUuid
                    0.00%  2.0480us         1  2.0480us  2.0480us  2.0480us  cudaCreateChannelDesc