My CUDA code hanged when my lookup table passed to CUDA kernel by cudaTextureObject_t and profiled by command nvprof. I printf the content of the lookup table in the kernel, all entries showed zero besides the first entry. Is there any way to fix it or is there any other profiling tools to profile my CUDA code? Thanks.
Yes,nvprof works fine with that.
Debug your code or provide a short, self-contained reproducer.
My CUDA code shows below. This code executed on Titan-V compiled with nvcc run.cu -G -g -arch=sm_70 -rdc=true -lcudadevrt -lcuda -lcudart -o run
After compiling then run without nvprof (./run) the result is
daRow0s = 11057008
daRow1s = 11057200
DAC[0] = 0xa8b770n
DAC[1] = 0xa8b830n
dacsPitch = 48
das0=0.000000
das1=334.276978
das2=1002.830933
das3=1671.385010
das4=2339.938965
das5=3008.492920
With nvprof (nvprof ./run) the result becomes
daRow0s = 35388704
daRow1s = 35326672
DAC[0] = 0x21bfd20n
DAC[1] = 0x21b0ad0n
dacsPitch = -15508
das0=0.000000
das1=0.000000
das2=0.000000
das3=0.000000
das4=0.000000
das5=0.000000
Even I changed dacsPitch to 48 manually.
Thank you very much for anyone helps.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#define REAL float
#define DACS_ENTRIES 44
#define E_ENTRIES 81
#define ZERO 1.0e-20
#define SZERO 1.0e-6
#define NPART 131072 // number of particles simulated simultaneously
#define DACS_OFFSET 43
#define BINDING_ITEMS 5
#define ODDS 100
#define INTERACTION_TYPES 12
#define MCC 510998.9461 // rest mass energy ~511keV
#define TWOMCC 1021997.8922 //2*MC^2
#define M2C4 261119922915.31070521 //M^2C^4
#define PI 3.14159265359
#define C 299792458
#define GPUMEM_BASE_SIZE 300 // memory size in MB estimated for base needed
#define GPUMEM_PER_INC 4 // memory size in MB estimated per added particle
#define INC_2NDPARTICLES_100KEV 2500
#define INC_RADICALS_100KEV 12000
#define K_THREADS 128
typedef struct
{
REAL x, y, z;
REAL ux, uy, uz;
REAL e;
int h2oState;
int dead;
REAL path;
REAL elape_time;
int id;
int parentId;
} eStruct;
__global__
void run_kernel(int N, cudaTextureObject_t DACSTable)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
{
float das0, das1, das2, das3, das4, das5;
das0 = tex2D<REAL>(DACSTable, 0, 1.5);
printf(" das0=%f\n", das0);
das1 = tex2D<REAL>(DACSTable, 1, 1.5);
printf(" das1=%f\n", das1);
das2 = tex2D<REAL>(DACSTable, 2, 1.5);
printf(" das2=%f\n", das2);
das3 = tex2D<REAL>(DACSTable, 3, 1.5);
printf(" das3=%f\n", das3);
das4 = tex2D<REAL>(DACSTable, 4, 1.5);
printf(" das4=%f\n", das4);
das5 = tex2D<REAL>(DACSTable, 5, 1.5);
printf(" das5=%f\n", das5);
asm("trap;");
}
}
void iniElectron(float *max_e, eStruct *eQueue, int *id)
{
eQueue[0].x = 0;
eQueue[0].y = 0;
eQueue[0].z = 5.5e-4;
eQueue[0].ux = 3.673770e-01;
eQueue[0].uy = 4.772163e-01;
eQueue[0].uz = 7.983099e-01;
eQueue[0].e = 50000;
eQueue[0].h2oState = 99;
eQueue[0].dead = 0;
eQueue[0].path = 0;
eQueue[0].elape_time = 0.0;
eQueue[0].id = *id;
eQueue[0].parentId = 0;
}
void rd_dacs(REAL **DACSTable)
{
DACSTable[0][0] = 4.300000 ;
DACSTable[0][1] = 4.500000 ;
DACSTable[0][2] = 4.700000 ;
DACSTable[0][3] = 4.900000 ;
DACSTable[0][4] = 5.100000 ;
DACSTable[0][5] = 5.300000 ;
DACSTable[0][6] = 5.500000 ;
DACSTable[0][7] = 5.700000 ;
DACSTable[0][8] = 5.900000 ;
DACSTable[0][9] = 6.100000 ;
DACSTable[0][10] = 6.300000 ;
DACSTable[0][11] = 6.500000 ;
DACSTable[0][12] = 6.700000 ;
DACSTable[0][13] = 6.900000 ;
DACSTable[0][14] = 7.100000 ;
DACSTable[0][15] = 7.300000 ;
DACSTable[0][16] = 7.500000 ;
DACSTable[0][17] = 7.700000 ;
DACSTable[0][18] = 7.900000 ;
DACSTable[0][19] = 8.100000 ;
DACSTable[0][20] = 8.300000 ;
DACSTable[0][21] = 8.500000 ;
DACSTable[0][22] = 8.700000 ;
DACSTable[0][23] = 8.900000 ;
DACSTable[0][24] = 9.100000 ;
DACSTable[0][25] = 9.300000 ;
DACSTable[0][26] = 9.500000 ;
DACSTable[0][27] = 9.700000 ;
DACSTable[0][28] = 9.900000 ;
DACSTable[0][29] = 10.100000;
DACSTable[0][30] = 10.300000;
DACSTable[0][31] = 10.500000;
DACSTable[0][32] = 10.700000;
DACSTable[0][33] = 10.900000;
DACSTable[0][34] = 11.100000;
DACSTable[0][35] = 11.300000;
DACSTable[0][36] = 11.500000;
DACSTable[0][37] = 11.700000;
DACSTable[0][38] = 11.900000;
DACSTable[0][39] = 12.100000;
DACSTable[0][40] = 12.300000;
DACSTable[0][41] = 12.500000;
DACSTable[0][42] = 12.700000;
DACSTable[0][43] = 12.900000;
DACSTable[1][0] = 0.000000 ;
DACSTable[1][1] = 668.553983 ;
DACSTable[1][2] = 1337.107966 ;
DACSTable[1][3] = 2005.661948 ;
DACSTable[1][4] = 2674.215931 ;
DACSTable[1][5] = 3342.769914 ;
DACSTable[1][6] = 5014.154871 ;
DACSTable[1][7] = 6685.539828 ;
DACSTable[1][8] = 26742.159312 ;
DACSTable[1][9] = 93597.557591 ;
DACSTable[1][10] = 203908.964752 ;
DACSTable[1][11] = 213937.274493 ;
DACSTable[1][12] = 183852.345268 ;
DACSTable[1][13] = 127025.256731 ;
DACSTable[1][14] = 96940.327505 ;
DACSTable[1][15] = 63512.628365 ;
DACSTable[1][16] = 30084.929226 ;
DACSTable[1][17] = 26742.159312 ;
DACSTable[1][18] = 30084.929226 ;
DACSTable[1][19] = 43456.008881 ;
DACSTable[1][20] = 46798.778795 ;
DACSTable[1][21] = 33427.699140 ;
DACSTable[1][22] = 30084.929226 ;
DACSTable[1][23] = 26742.159312 ;
DACSTable[1][24] = 20056.619484 ;
DACSTable[1][25] = 16713.849570 ;
DACSTable[1][26] = 10028.309742 ;
DACSTable[1][27] = 6685.539828 ;
DACSTable[1][28] = 3342.769914 ;
DACSTable[1][29] = 3342.769914 ;
DACSTable[1][30] = 5014.154871 ;
DACSTable[1][31] = 6685.539828 ;
DACSTable[1][32] = 10028.309742 ;
DACSTable[1][33] = 13371.079656 ;
DACSTable[1][34] = 16713.849570 ;
DACSTable[1][35] = 20056.619484 ;
DACSTable[1][36] = 16713.849570 ;
DACSTable[1][37] = 10028.309742 ;
DACSTable[1][38] = 6685.539828 ;
DACSTable[1][39] = 3342.769914 ;
DACSTable[1][40] = 2339.938940 ;
DACSTable[1][41] = 1337.107966 ;
DACSTable[1][42] = 334.276991 ;
DACSTable[1][43] = 0.000000 ;
}
int getSPcores(cudaDeviceProp devProp)
{
int cores = 0;
int mp = devProp.multiProcessorCount;
switch (devProp.major){
case 2: // Fermi
if (devProp.minor == 1) cores = mp * 48;
else cores = mp * 32;
break;
case 3: // Kepler
cores = mp * 192;
break;
case 5: // Maxwell
cores = mp * 128;
break;
case 6: // Pascal
if (devProp.minor == 1) cores = mp * 128;
else if (devProp.minor == 0) cores = mp * 64;
else printf("Unknown device type\n");
break;
default:
printf("Unknown device type\n");
break;
}
return cores;
}
int main(int argc, char *argv[])
{
REAL **DACSTable = NULL;
REAL *BindE_array = NULL;
REAL **elastDCSTable = NULL;
REAL **ieeCSTable = NULL;
int i;
long long where_all = 0;
long long where = 0;
int second_num = 0;
int e_num = 1;
int gEid = 0;
int currentID;
time_t t;
srand((unsigned)time(&t));
float acc_kerneltime = 0;
int *dev_where, *dev_second_num, *dev_gEid;
eStruct *dev_eQueue, *dev_e2Queue;
int *dev_stepQ, *dev_cntQ;
FILE *report = NULL;
int GPUDeviceNo;
REAL Ecutoff;
GPUDeviceNo = 0;
Ecutoff = 7.4;
cudaSetDevice(GPUDeviceNo);
size_t total_memory;
size_t free_memory;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, GPUDeviceNo);
int cudacores = getSPcores(deviceProp);
printf("cuda core = %d\n", cudacores);
int globalMB = deviceProp.totalGlobalMem >> 20;
printf("Total global memory = %d (MB)\n", globalMB);
size_t sharedB = deviceProp.sharedMemPerBlock;
printf("Shared memory per block = %zd (Bytes)\n", sharedB);
ieeCSTable = (REAL**)malloc(sizeof(ieeCSTable)*(INTERACTION_TYPES-1));
for (i=0; i<INTERACTION_TYPES-1; i++)
ieeCSTable[i]=(REAL*)malloc(sizeof(REAL)*E_ENTRIES);
long int row0s = (long int)ieeCSTable[0];
long int row1s = (long int)ieeCSTable[1];
int ieecsPitch = (row1s - row0s)/sizeof(REAL);
DACSTable = (REAL**)malloc(sizeof(DACSTable)*2);
for (i=0; i<2; i++)
DACSTable[i]=(REAL*)malloc(sizeof(REAL)*DACS_ENTRIES);
long int daRow0s = (long int)DACSTable[0];
long int daRow1s = (long int)DACSTable[1];
int dacsPitch = (daRow1s - daRow0s)/sizeof(REAL);
printf("daRow0s = %ld\n", daRow0s);
printf("daRow1s = %ld\n", daRow1s);
printf("DAC[0] = %pn\n", &DACSTable[0][0]);
printf("DAC[1] = %pn\n", &DACSTable[1][0]);
printf("dacsPitch = %d\n\n", dacsPitch);
dacsPitch = 48;
ieecsPitch = 84;
elastDCSTable = (REAL**)malloc(sizeof(elastDCSTable)*ODDS);
for (i=0; i<ODDS; i++)
elastDCSTable[i]=(REAL*)malloc(sizeof(REAL)*E_ENTRIES);
rd_dacs(DACSTable);
float max_e = 0;
eStruct *eQueue_ini = (eStruct *)malloc(e_num * sizeof(eStruct));
iniElectron(&max_e, eQueue_ini, ¤tID);
int memory_rqMB = GPUMEM_BASE_SIZE + e_num * GPUMEM_PER_INC;
printf("Estimated GPU memory usage = %d (MB)\n", memory_rqMB);
int batch = 1 + memory_rqMB/globalMB;
int MaxN;
int enumPerBatch = e_num/batch;
int scale = 1;
if (max_e > 100e3)
scale = 1 + max_e/100e3;
MaxN = enumPerBatch * scale * INC_2NDPARTICLES_100KEV;
long long contsize = e_num * scale * INC_RADICALS_100KEV;
printf("Total incident particles = %d\n", e_num);
printf("Simulation in batchs = %d\n", batch);
printf("Estimated Max. batch 2nd particles = %d\n", MaxN);
float *e2ndQueue_test = (float *)malloc(MaxN * sizeof(float));
memset(e2ndQueue_test, 0.0, sizeof(MaxN * sizeof(float)));
int *host_stepQ = (int *)malloc(MaxN * sizeof(int));
int *host_cntQ = (int *)malloc(MaxN * sizeof(int));
cudaMalloc(&dev_stepQ, MaxN * sizeof(int));
cudaMalloc(&dev_cntQ, MaxN * sizeof(int));
cudaMalloc(&dev_e2Queue, MaxN * sizeof(eStruct));
/********************************/
/**** < TEXTURE MEMORY > ****/
/********************************/
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *dev_DACSTable, *dev_BindE_array, *dev_ieeCSTable, *dev_elastDCSTable;
// cuda memory
cudaMallocArray(&dev_DACSTable, &channelDesc, dacsPitch, 2);
// cuda memory
cudaMemcpyToArray(dev_DACSTable , 0, 0, &DACSTable[0][0], 2 * dacsPitch * sizeof(REAL), cudaMemcpyHostToDevice);
// resource description -> from cuda memory
struct cudaResourceDesc resD_DACSTable;
memset(&resD_DACSTable, 0, sizeof(resD_DACSTable));
resD_DACSTable.resType = cudaResourceTypeArray;
resD_DACSTable.res.array.array = dev_DACSTable;
// texture description -> from cuda memory
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.readMode = cudaReadModeElementType;
texDesc.filterMode = cudaFilterModeLinear;
cudaTextureObject_t texObj_DACSTable = 0;
cudaCreateTextureObject(&texObj_DACSTable, &resD_DACSTable, &texDesc, NULL);
int N = e_num;
run_kernel<<<(N+K_THREADS-1)/K_THREADS, K_THREADS>>>(N, texObj_DACSTable);
cudaFree(dev_e2Queue);
cudaFree(dev_where);
cudaFree(dev_second_num);
cudaFree(dev_DACSTable);
cudaFree(dev_BindE_array);
cudaFree(dev_ieeCSTable);
cudaFree(dev_elastDCSTable);
free(DACSTable);
free(BindE_array);
free(elastDCSTable);
free(ieeCSTable);
}
-
Why are you using a asm trap instruction in your kernel? Unless you have a specific and good reason for doing so, you shouldn’t be using that. The tools don’t react well to that instruction because that instruction by definition corrupts the CUDA context.
-
After launching a kernel call, for proper profiling operation, you should be at a minimum having some sort of synchronizing instruction, such as cudaDeviceSynchronize().
When I remove the asm trap, and add cudaDeviceSynchronize(), I get the same output with or without nvprof.
Sorry, I forgot to remove the asm trap. That’s for debugging.
Where do you add cudaDeviceSynchronize()? I removed the trap and added cudaDeviceSynchronize() at the end of kernel function, but they were still different.
Thanks.
add cudaDeviceSynchronize() at the end of main, before cudaFree calls.
I tried on two machines
CentOS 7.6 TITAN V nvcc 10.1
Ubuntu 16.04 Quadro P4000 nvcc 9.2
Both results were still different with/without nvprof.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#define REAL float
#define DACS_ENTRIES 44
#define E_ENTRIES 81
#define ZERO 1.0e-20
#define SZERO 1.0e-6
#define NPART 131072 // number of particles simulated simultaneously
#define DACS_OFFSET 43
#define BINDING_ITEMS 5
#define ODDS 100
#define INTERACTION_TYPES 12
#define MCC 510998.9461 // rest mass energy ~511keV
#define TWOMCC 1021997.8922 //2*MC^2
#define M2C4 261119922915.31070521 //M^2C^4
#define PI 3.14159265359
#define C 299792458
#define GPUMEM_BASE_SIZE 300 // memory size in MB estimated for base needed
#define GPUMEM_PER_INC 4 // memory size in MB estimated per added particle
#define INC_2NDPARTICLES_100KEV 2500
#define INC_RADICALS_100KEV 12000
#define K_THREADS 128
typedef struct
{
REAL x, y, z;
REAL ux, uy, uz;
REAL e;
int h2oState;
int dead;
REAL path;
REAL elape_time;
int id;
int parentId;
} eStruct;
__global__
void run_kernel(int N, cudaTextureObject_t DACSTable)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
{
float das0, das1, das2, das3, das4, das5;
das0 = tex2D<REAL>(DACSTable, 0, 1.5);
printf(" das0=%f\n", das0);
das1 = tex2D<REAL>(DACSTable, 1, 1.5);
printf(" das1=%f\n", das1);
das2 = tex2D<REAL>(DACSTable, 2, 1.5);
printf(" das2=%f\n", das2);
das3 = tex2D<REAL>(DACSTable, 3, 1.5);
printf(" das3=%f\n", das3);
das4 = tex2D<REAL>(DACSTable, 4, 1.5);
printf(" das4=%f\n", das4);
das5 = tex2D<REAL>(DACSTable, 5, 1.5);
printf(" das5=%f\n", das5);
}
}
void iniElectron(float *max_e, eStruct *eQueue, int *id)
{
eQueue[0].x = 0;
eQueue[0].y = 0;
eQueue[0].z = 5.5e-4;
eQueue[0].ux = 3.673770e-01;
eQueue[0].uy = 4.772163e-01;
eQueue[0].uz = 7.983099e-01;
eQueue[0].e = 50000;
eQueue[0].h2oState = 99;
eQueue[0].dead = 0;
eQueue[0].path = 0;
eQueue[0].elape_time = 0.0;
eQueue[0].id = *id;
eQueue[0].parentId = 0;
}
void rd_dacs(REAL **DACSTable)
{
DACSTable[0][0] = 4.300000 ;
DACSTable[0][1] = 4.500000 ;
DACSTable[0][2] = 4.700000 ;
DACSTable[0][3] = 4.900000 ;
DACSTable[0][4] = 5.100000 ;
DACSTable[0][5] = 5.300000 ;
DACSTable[0][6] = 5.500000 ;
DACSTable[0][7] = 5.700000 ;
DACSTable[0][8] = 5.900000 ;
DACSTable[0][9] = 6.100000 ;
DACSTable[0][10] = 6.300000 ;
DACSTable[0][11] = 6.500000 ;
DACSTable[0][12] = 6.700000 ;
DACSTable[0][13] = 6.900000 ;
DACSTable[0][14] = 7.100000 ;
DACSTable[0][15] = 7.300000 ;
DACSTable[0][16] = 7.500000 ;
DACSTable[0][17] = 7.700000 ;
DACSTable[0][18] = 7.900000 ;
DACSTable[0][19] = 8.100000 ;
DACSTable[0][20] = 8.300000 ;
DACSTable[0][21] = 8.500000 ;
DACSTable[0][22] = 8.700000 ;
DACSTable[0][23] = 8.900000 ;
DACSTable[0][24] = 9.100000 ;
DACSTable[0][25] = 9.300000 ;
DACSTable[0][26] = 9.500000 ;
DACSTable[0][27] = 9.700000 ;
DACSTable[0][28] = 9.900000 ;
DACSTable[0][29] = 10.100000;
DACSTable[0][30] = 10.300000;
DACSTable[0][31] = 10.500000;
DACSTable[0][32] = 10.700000;
DACSTable[0][33] = 10.900000;
DACSTable[0][34] = 11.100000;
DACSTable[0][35] = 11.300000;
DACSTable[0][36] = 11.500000;
DACSTable[0][37] = 11.700000;
DACSTable[0][38] = 11.900000;
DACSTable[0][39] = 12.100000;
DACSTable[0][40] = 12.300000;
DACSTable[0][41] = 12.500000;
DACSTable[0][42] = 12.700000;
DACSTable[0][43] = 12.900000;
DACSTable[1][0] = 0.000000 ;
DACSTable[1][1] = 668.553983 ;
DACSTable[1][2] = 1337.107966 ;
DACSTable[1][3] = 2005.661948 ;
DACSTable[1][4] = 2674.215931 ;
DACSTable[1][5] = 3342.769914 ;
DACSTable[1][6] = 5014.154871 ;
DACSTable[1][7] = 6685.539828 ;
DACSTable[1][8] = 26742.159312 ;
DACSTable[1][9] = 93597.557591 ;
DACSTable[1][10] = 203908.964752 ;
DACSTable[1][11] = 213937.274493 ;
DACSTable[1][12] = 183852.345268 ;
DACSTable[1][13] = 127025.256731 ;
DACSTable[1][14] = 96940.327505 ;
DACSTable[1][15] = 63512.628365 ;
DACSTable[1][16] = 30084.929226 ;
DACSTable[1][17] = 26742.159312 ;
DACSTable[1][18] = 30084.929226 ;
DACSTable[1][19] = 43456.008881 ;
DACSTable[1][20] = 46798.778795 ;
DACSTable[1][21] = 33427.699140 ;
DACSTable[1][22] = 30084.929226 ;
DACSTable[1][23] = 26742.159312 ;
DACSTable[1][24] = 20056.619484 ;
DACSTable[1][25] = 16713.849570 ;
DACSTable[1][26] = 10028.309742 ;
DACSTable[1][27] = 6685.539828 ;
DACSTable[1][28] = 3342.769914 ;
DACSTable[1][29] = 3342.769914 ;
DACSTable[1][30] = 5014.154871 ;
DACSTable[1][31] = 6685.539828 ;
DACSTable[1][32] = 10028.309742 ;
DACSTable[1][33] = 13371.079656 ;
DACSTable[1][34] = 16713.849570 ;
DACSTable[1][35] = 20056.619484 ;
DACSTable[1][36] = 16713.849570 ;
DACSTable[1][37] = 10028.309742 ;
DACSTable[1][38] = 6685.539828 ;
DACSTable[1][39] = 3342.769914 ;
DACSTable[1][40] = 2339.938940 ;
DACSTable[1][41] = 1337.107966 ;
DACSTable[1][42] = 334.276991 ;
DACSTable[1][43] = 0.000000 ;
}
int getSPcores(cudaDeviceProp devProp)
{
int cores = 0;
int mp = devProp.multiProcessorCount;
switch (devProp.major){
case 2: // Fermi
if (devProp.minor == 1) cores = mp * 48;
else cores = mp * 32;
break;
case 3: // Kepler
cores = mp * 192;
break;
case 5: // Maxwell
cores = mp * 128;
break;
case 6: // Pascal
if (devProp.minor == 1) cores = mp * 128;
else if (devProp.minor == 0) cores = mp * 64;
else printf("Unknown device type\n");
break;
default:
printf("Unknown device type\n");
break;
}
return cores;
}
int main(int argc, char *argv[])
{
REAL **DACSTable = NULL;
REAL *BindE_array = NULL;
REAL **elastDCSTable = NULL;
REAL **ieeCSTable = NULL;
int i;
long long where_all = 0;
long long where = 0;
int second_num = 0;
int e_num = 1;
int gEid = 0;
int currentID;
time_t t;
srand((unsigned)time(&t));
float acc_kerneltime = 0;
int *dev_where, *dev_second_num, *dev_gEid;
eStruct *dev_eQueue, *dev_e2Queue;
int *dev_stepQ, *dev_cntQ;
FILE *report = NULL;
int GPUDeviceNo;
REAL Ecutoff;
GPUDeviceNo = 0;
Ecutoff = 7.4;
cudaSetDevice(GPUDeviceNo);
size_t total_memory;
size_t free_memory;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, GPUDeviceNo);
int cudacores = getSPcores(deviceProp);
printf("cuda core = %d\n", cudacores);
int globalMB = deviceProp.totalGlobalMem >> 20;
printf("Total global memory = %d (MB)\n", globalMB);
size_t sharedB = deviceProp.sharedMemPerBlock;
printf("Shared memory per block = %zd (Bytes)\n", sharedB);
ieeCSTable = (REAL**)malloc(sizeof(ieeCSTable)*(INTERACTION_TYPES-1));
for (i=0; i<INTERACTION_TYPES-1; i++)
ieeCSTable[i]=(REAL*)malloc(sizeof(REAL)*E_ENTRIES);
long int row0s = (long int)ieeCSTable[0];
long int row1s = (long int)ieeCSTable[1];
int ieecsPitch = (row1s - row0s)/sizeof(REAL);
DACSTable = (REAL**)malloc(sizeof(DACSTable)*2);
for (i=0; i<2; i++)
DACSTable[i]=(REAL*)malloc(sizeof(REAL)*DACS_ENTRIES);
long int daRow0s = (long int)DACSTable[0];
long int daRow1s = (long int)DACSTable[1];
int dacsPitch = (daRow1s - daRow0s)/sizeof(REAL);
printf("daRow0s = %ld\n", daRow0s);
printf("daRow1s = %ld\n", daRow1s);
printf("DAC[0] = %pn\n", &DACSTable[0][0]);
printf("DAC[1] = %pn\n", &DACSTable[1][0]);
printf("dacsPitch = %d\n\n", dacsPitch);
dacsPitch = 48;
ieecsPitch = 84;
elastDCSTable = (REAL**)malloc(sizeof(elastDCSTable)*ODDS);
for (i=0; i<ODDS; i++)
elastDCSTable[i]=(REAL*)malloc(sizeof(REAL)*E_ENTRIES);
rd_dacs(DACSTable);
float max_e = 0;
eStruct *eQueue_ini = (eStruct *)malloc(e_num * sizeof(eStruct));
iniElectron(&max_e, eQueue_ini, ¤tID);
int memory_rqMB = GPUMEM_BASE_SIZE + e_num * GPUMEM_PER_INC;
printf("Estimated GPU memory usage = %d (MB)\n", memory_rqMB);
int batch = 1 + memory_rqMB/globalMB;
int MaxN;
int enumPerBatch = e_num/batch;
int scale = 1;
if (max_e > 100e3)
scale = 1 + max_e/100e3;
MaxN = enumPerBatch * scale * INC_2NDPARTICLES_100KEV;
long long contsize = e_num * scale * INC_RADICALS_100KEV;
printf("Total incident particles = %d\n", e_num);
printf("Simulation in batchs = %d\n", batch);
printf("Estimated Max. batch 2nd particles = %d\n", MaxN);
float *e2ndQueue_test = (float *)malloc(MaxN * sizeof(float));
memset(e2ndQueue_test, 0.0, sizeof(MaxN * sizeof(float)));
int *host_stepQ = (int *)malloc(MaxN * sizeof(int));
int *host_cntQ = (int *)malloc(MaxN * sizeof(int));
cudaMalloc(&dev_stepQ, MaxN * sizeof(int));
cudaMalloc(&dev_cntQ, MaxN * sizeof(int));
cudaMalloc(&dev_e2Queue, MaxN * sizeof(eStruct));
/********************************/
/**** < TEXTURE MEMORY > ****/
/********************************/
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *dev_DACSTable, *dev_BindE_array, *dev_ieeCSTable, *dev_elastDCSTable;
// cuda memory
cudaMallocArray(&dev_DACSTable, &channelDesc, dacsPitch, 2);
// cuda memory
cudaMemcpyToArray(dev_DACSTable , 0, 0, &DACSTable[0][0], 2 * dacsPitch * sizeof(REAL), cudaMemcpyHostToDevice);
// resource description -> from cuda memory
struct cudaResourceDesc resD_DACSTable;
memset(&resD_DACSTable, 0, sizeof(resD_DACSTable));
resD_DACSTable.resType = cudaResourceTypeArray;
resD_DACSTable.res.array.array = dev_DACSTable;
// texture description -> from cuda memory
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.readMode = cudaReadModeElementType;
texDesc.filterMode = cudaFilterModeLinear;
cudaTextureObject_t texObj_DACSTable = 0;
cudaCreateTextureObject(&texObj_DACSTable, &resD_DACSTable, &texDesc, NULL);
int N = e_num;
run_kernel<<<(N+K_THREADS-1)/K_THREADS, K_THREADS>>>(N, texObj_DACSTable);
cudaDeviceSynchronize();
cudaFree(dev_e2Queue);
cudaFree(dev_where);
cudaFree(dev_second_num);
cudaFree(dev_DACSTable);
cudaFree(dev_BindE_array);
cudaFree(dev_ieeCSTable);
cudaFree(dev_elastDCSTable);
free(DACSTable);
free(BindE_array);
free(elastDCSTable);
free(ieeCSTable);
}
When I compile the code you have posted, I get warnings like this:
t1541.cu(353): warning: variable "dev_where" is used before its value is set
t1541.cu(354): warning: variable "dev_second_num" is used before its value is set
t1541.cu(356): warning: variable "dev_BindE_array" is used before its value is set
t1541.cu(357): warning: variable "dev_ieeCSTable" is used before its value is set
t1541.cu(358): warning: variable "dev_elastDCSTable" is used before its value is set
You should not ignore those (ever, in my opinion). In this case you may think they are benign because they are coming from your cudaFree statements. If you cudaFree a variable you have not previously allocated, you will get a CUDA runtime error. You generally want to make sure your application is free of any runtime errors:
- Before asking others for help with it
- Before attempting to use any profiling tools on it.
However, on my setup with CUDA 10.1.243 on CentOS7 and a tesla V100, I get this output:
$ ./t1541
Unknown device type
cuda core = 0
Total global memory = 32480 (MB)
Shared memory per block = 49152 (Bytes)
daRow0s = 13494592
daRow1s = 13494784
DAC[0] = 0xcde940n
DAC[1] = 0xcdea00n
dacsPitch = 48
Estimated GPU memory usage = 304 (MB)
Total incident particles = 1
Simulation in batchs = 1
Estimated Max. batch 2nd particles = 2500
das0=0.000000
das1=334.276978
das2=1002.830933
das3=1671.385010
das4=2339.938965
das5=3008.492920
$ nvprof ./t1541
==25002== NVPROF is profiling process 25002, command: ./t1541
Unknown device type
cuda core = 0
Total global memory = 32480 (MB)
Shared memory per block = 49152 (Bytes)
daRow0s = 22016160
daRow1s = 22016352
DAC[0] = 0x14ff0a0n
DAC[1] = 0x14ff160n
dacsPitch = 48
Estimated GPU memory usage = 304 (MB)
Total incident particles = 1
Simulation in batchs = 1
Estimated Max. batch 2nd particles = 2500
das0=0.000000
das1=334.276978
das2=1002.830933
das3=1671.385010
das4=2339.938965
das5=3008.492920
==25002== Profiling application: ./t1541
==25002== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 98.40% 222.63us 1 222.63us 222.63us 222.63us run_kernel(int, __int64)
1.60% 3.6160us 1 3.6160us 3.6160us 3.6160us [CUDA memcpy HtoA]
API calls: 95.11% 312.77ms 3 104.26ms 8.0240us 312.74ms cudaMalloc
2.01% 6.5942ms 388 16.995us 344ns 1.8024ms cuDeviceGetAttribute
1.63% 5.3679ms 4 1.3420ms 641.00us 3.3724ms cuDeviceTotalMem
0.36% 1.1805ms 1 1.1805ms 1.1805ms 1.1805ms cudaGetDeviceProperties
0.35% 1.1478ms 1 1.1478ms 1.1478ms 1.1478ms cudaDeviceSynchronize
0.31% 1.0030ms 1 1.0030ms 1.0030ms 1.0030ms cudaMallocArray
0.17% 547.86us 4 136.96us 98.376us 237.71us cuDeviceGetName
0.03% 103.16us 1 103.16us 103.16us 103.16us cudaLaunchKernel
0.01% 38.904us 1 38.904us 38.904us 38.904us cudaMemcpyToArray
0.01% 27.949us 1 27.949us 27.949us 27.949us cudaCreateTextureObject
0.01% 27.162us 4 6.7900us 3.5150us 12.000us cuDeviceGetPCIBusId
0.01% 23.897us 7 3.4130us 449ns 20.008us cudaFree
0.01% 16.745us 1 16.745us 16.745us 16.745us cudaSetDevice
0.00% 8.1250us 8 1.0150us 405ns 1.8000us cuDeviceGet
0.00% 4.1880us 3 1.3960us 342ns 2.4380us cuDeviceGetCount
0.00% 2.8470us 4 711ns 629ns 884ns cuDeviceGetUuid
0.00% 463ns 1 463ns 463ns 463ns cudaCreateChannelDesc
$
With respect to the kernel output (das0 - das5) the output appears to be identical. Note that if you are compiling with -arch=sm_70 on your P4000 that won’t work.
Before posting further, run your code with cuda-memcheck. Keep modifying your code until cuda-memcheck reports no errors. Then try your profiling again.
I compiled with -arch=sm_61 on P4000, thank you for the reminding.
The code below I removed all the warnings and run cuda-memcheck with no error.
But the results are still different with/without nvprof.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#define REAL float
#define DACS_ENTRIES 44
#define E_ENTRIES 81
#define ZERO 1.0e-20
#define SZERO 1.0e-6
#define NPART 131072 // number of particles simulated simultaneously
#define DACS_OFFSET 43
#define BINDING_ITEMS 5
#define ODDS 100
#define INTERACTION_TYPES 12
#define MCC 510998.9461 // rest mass energy ~511keV
#define TWOMCC 1021997.8922 //2*MC^2
#define M2C4 261119922915.31070521 //M^2C^4
#define PI 3.14159265359
#define C 299792458
#define GPUMEM_BASE_SIZE 300 // memory size in MB estimated for base needed
#define GPUMEM_PER_INC 4 // memory size in MB estimated per added particle
#define INC_2NDPARTICLES_100KEV 2500
#define INC_RADICALS_100KEV 12000
#define K_THREADS 128
typedef struct
{
REAL x, y, z;
REAL ux, uy, uz;
REAL e;
int h2oState;
int dead;
REAL path;
REAL elape_time;
int id;
int parentId;
} eStruct;
__global__
void run_kernel(int N, cudaTextureObject_t DACSTable)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
{
float das0, das1, das2, das3, das4, das5;
das0 = tex2D<REAL>(DACSTable, 0, 1.5);
printf(" das0=%f\n", das0);
das1 = tex2D<REAL>(DACSTable, 1, 1.5);
printf(" das1=%f\n", das1);
das2 = tex2D<REAL>(DACSTable, 2, 1.5);
printf(" das2=%f\n", das2);
das3 = tex2D<REAL>(DACSTable, 3, 1.5);
printf(" das3=%f\n", das3);
das4 = tex2D<REAL>(DACSTable, 4, 1.5);
printf(" das4=%f\n", das4);
das5 = tex2D<REAL>(DACSTable, 5, 1.5);
printf(" das5=%f\n", das5);
}
}
void iniElectron(float *max_e, eStruct *eQueue, int *id)
{
eQueue[0].x = 0;
eQueue[0].y = 0;
eQueue[0].z = 5.5e-4;
eQueue[0].ux = 3.673770e-01;
eQueue[0].uy = 4.772163e-01;
eQueue[0].uz = 7.983099e-01;
eQueue[0].e = 50000;
eQueue[0].h2oState = 99;
eQueue[0].dead = 0;
eQueue[0].path = 0;
eQueue[0].elape_time = 0.0;
eQueue[0].id = *id;
eQueue[0].parentId = 0;
}
void rd_dacs(REAL **DACSTable)
{
DACSTable[0][0] = 4.300000 ;
DACSTable[0][1] = 4.500000 ;
DACSTable[0][2] = 4.700000 ;
DACSTable[0][3] = 4.900000 ;
DACSTable[0][4] = 5.100000 ;
DACSTable[0][5] = 5.300000 ;
DACSTable[0][6] = 5.500000 ;
DACSTable[0][7] = 5.700000 ;
DACSTable[0][8] = 5.900000 ;
DACSTable[0][9] = 6.100000 ;
DACSTable[0][10] = 6.300000 ;
DACSTable[0][11] = 6.500000 ;
DACSTable[0][12] = 6.700000 ;
DACSTable[0][13] = 6.900000 ;
DACSTable[0][14] = 7.100000 ;
DACSTable[0][15] = 7.300000 ;
DACSTable[0][16] = 7.500000 ;
DACSTable[0][17] = 7.700000 ;
DACSTable[0][18] = 7.900000 ;
DACSTable[0][19] = 8.100000 ;
DACSTable[0][20] = 8.300000 ;
DACSTable[0][21] = 8.500000 ;
DACSTable[0][22] = 8.700000 ;
DACSTable[0][23] = 8.900000 ;
DACSTable[0][24] = 9.100000 ;
DACSTable[0][25] = 9.300000 ;
DACSTable[0][26] = 9.500000 ;
DACSTable[0][27] = 9.700000 ;
DACSTable[0][28] = 9.900000 ;
DACSTable[0][29] = 10.100000;
DACSTable[0][30] = 10.300000;
DACSTable[0][31] = 10.500000;
DACSTable[0][32] = 10.700000;
DACSTable[0][33] = 10.900000;
DACSTable[0][34] = 11.100000;
DACSTable[0][35] = 11.300000;
DACSTable[0][36] = 11.500000;
DACSTable[0][37] = 11.700000;
DACSTable[0][38] = 11.900000;
DACSTable[0][39] = 12.100000;
DACSTable[0][40] = 12.300000;
DACSTable[0][41] = 12.500000;
DACSTable[0][42] = 12.700000;
DACSTable[0][43] = 12.900000;
DACSTable[1][0] = 0.000000 ;
DACSTable[1][1] = 668.553983 ;
DACSTable[1][2] = 1337.107966 ;
DACSTable[1][3] = 2005.661948 ;
DACSTable[1][4] = 2674.215931 ;
DACSTable[1][5] = 3342.769914 ;
DACSTable[1][6] = 5014.154871 ;
DACSTable[1][7] = 6685.539828 ;
DACSTable[1][8] = 26742.159312 ;
DACSTable[1][9] = 93597.557591 ;
DACSTable[1][10] = 203908.964752 ;
DACSTable[1][11] = 213937.274493 ;
DACSTable[1][12] = 183852.345268 ;
DACSTable[1][13] = 127025.256731 ;
DACSTable[1][14] = 96940.327505 ;
DACSTable[1][15] = 63512.628365 ;
DACSTable[1][16] = 30084.929226 ;
DACSTable[1][17] = 26742.159312 ;
DACSTable[1][18] = 30084.929226 ;
DACSTable[1][19] = 43456.008881 ;
DACSTable[1][20] = 46798.778795 ;
DACSTable[1][21] = 33427.699140 ;
DACSTable[1][22] = 30084.929226 ;
DACSTable[1][23] = 26742.159312 ;
DACSTable[1][24] = 20056.619484 ;
DACSTable[1][25] = 16713.849570 ;
DACSTable[1][26] = 10028.309742 ;
DACSTable[1][27] = 6685.539828 ;
DACSTable[1][28] = 3342.769914 ;
DACSTable[1][29] = 3342.769914 ;
DACSTable[1][30] = 5014.154871 ;
DACSTable[1][31] = 6685.539828 ;
DACSTable[1][32] = 10028.309742 ;
DACSTable[1][33] = 13371.079656 ;
DACSTable[1][34] = 16713.849570 ;
DACSTable[1][35] = 20056.619484 ;
DACSTable[1][36] = 16713.849570 ;
DACSTable[1][37] = 10028.309742 ;
DACSTable[1][38] = 6685.539828 ;
DACSTable[1][39] = 3342.769914 ;
DACSTable[1][40] = 2339.938940 ;
DACSTable[1][41] = 1337.107966 ;
DACSTable[1][42] = 334.276991 ;
DACSTable[1][43] = 0.000000 ;
}
int getSPcores(cudaDeviceProp devProp)
{
int cores = 0;
int mp = devProp.multiProcessorCount;
switch (devProp.major){
case 2: // Fermi
if (devProp.minor == 1) cores = mp * 48;
else cores = mp * 32;
break;
case 3: // Kepler
cores = mp * 192;
break;
case 5: // Maxwell
cores = mp * 128;
break;
case 6: // Pascal
if (devProp.minor == 1) cores = mp * 128;
else if (devProp.minor == 0) cores = mp * 64;
else printf("Unknown device type\n");
break;
default:
printf("Unknown device type\n");
break;
}
return cores;
}
int main(int argc, char *argv[])
{
REAL **DACSTable = NULL;
REAL **elastDCSTable = NULL;
REAL **ieeCSTable = NULL;
int i;
int e_num = 1;
int currentID;
time_t t;
srand((unsigned)time(&t));
int GPUDeviceNo;
GPUDeviceNo = 0;
cudaSetDevice(GPUDeviceNo);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, GPUDeviceNo);
int cudacores = getSPcores(deviceProp);
printf("cuda core = %d\n", cudacores);
int globalMB = deviceProp.totalGlobalMem >> 20;
printf("Total global memory = %d (MB)\n", globalMB);
size_t sharedB = deviceProp.sharedMemPerBlock;
printf("Shared memory per block = %zd (Bytes)\n", sharedB);
ieeCSTable = (REAL**)malloc(sizeof(ieeCSTable)*(INTERACTION_TYPES-1));
for (i=0; i<INTERACTION_TYPES-1; i++)
ieeCSTable[i]=(REAL*)malloc(sizeof(REAL)*E_ENTRIES);
DACSTable = (REAL**)malloc(sizeof(DACSTable)*2);
for (i=0; i<2; i++)
DACSTable[i]=(REAL*)malloc(sizeof(REAL)*DACS_ENTRIES);
long int daRow0s = (long int)DACSTable[0];
long int daRow1s = (long int)DACSTable[1];
int dacsPitch = (daRow1s - daRow0s)/sizeof(REAL);
printf("daRow0s = %ld\n", daRow0s);
printf("daRow1s = %ld\n", daRow1s);
printf("DAC[0] = %pn\n", &DACSTable[0][0]);
printf("DAC[1] = %pn\n", &DACSTable[1][0]);
printf("dacsPitch = %d\n\n", dacsPitch);
dacsPitch = 48;
elastDCSTable = (REAL**)malloc(sizeof(elastDCSTable)*ODDS);
for (i=0; i<ODDS; i++)
elastDCSTable[i]=(REAL*)malloc(sizeof(REAL)*E_ENTRIES);
rd_dacs(DACSTable);
float max_e = 0;
eStruct *eQueue_ini = (eStruct *)malloc(e_num * sizeof(eStruct));
iniElectron(&max_e, eQueue_ini, ¤tID);
int memory_rqMB = GPUMEM_BASE_SIZE + e_num * GPUMEM_PER_INC;
printf("Estimated GPU memory usage = %d (MB)\n", memory_rqMB);
int batch = 1 + memory_rqMB/globalMB;
int MaxN;
int enumPerBatch = e_num/batch;
int scale = 1;
if (max_e > 100e3)
scale = 1 + max_e/100e3;
MaxN = enumPerBatch * scale * INC_2NDPARTICLES_100KEV;
printf("Total incident particles = %d\n", e_num);
printf("Simulation in batchs = %d\n", batch);
printf("Estimated Max. batch 2nd particles = %d\n", MaxN);
float *e2ndQueue_test = (float *)malloc(MaxN * sizeof(float));
memset(e2ndQueue_test, 0.0, sizeof(MaxN * sizeof(float)));
int *host_stepQ = (int *)malloc(MaxN * sizeof(int));
int *host_cntQ = (int *)malloc(MaxN * sizeof(int));
/********************************/
/**** < TEXTURE MEMORY > ****/
/********************************/
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray *dev_DACSTable;
// cuda memory
cudaMallocArray(&dev_DACSTable, &channelDesc, dacsPitch, 2);
// cuda memory
cudaMemcpyToArray(dev_DACSTable , 0, 0, &DACSTable[0][0], 2 * dacsPitch * sizeof(REAL), cudaMemcpyHostToDevice);
// resource description -> from cuda memory
struct cudaResourceDesc resD_DACSTable;
memset(&resD_DACSTable, 0, sizeof(resD_DACSTable));
resD_DACSTable.resType = cudaResourceTypeArray;
resD_DACSTable.res.array.array = dev_DACSTable;
// texture description -> from cuda memory
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.readMode = cudaReadModeElementType;
texDesc.filterMode = cudaFilterModeLinear;
cudaTextureObject_t texObj_DACSTable = 0;
cudaCreateTextureObject(&texObj_DACSTable, &resD_DACSTable, &texDesc, NULL);
int N = e_num;
run_kernel<<<(N+K_THREADS-1)/K_THREADS, K_THREADS>>>(N, texObj_DACSTable);
cudaDeviceSynchronize();
//cudaFree(dev_DACSTable);
free(DACSTable);
free(elastDCSTable);
free(ieeCSTable);
}
Got the same result with/without nvprof on das0 - das5.
Using nvcc command with -arch=sm_62 on the newest Jetpack + TX2 (Ubuntu 18.04 with cuda10.0).
nvidia@tegra-ubuntu:~$ ./run
Unknown device type
cuda core = 0
Total global memory = 7851 (MB)
Shared memory per block = 49152 (Bytes)
daRow0s = 367665359520
daRow1s = 367665359712
DAC[0] = 0x559a902aa0n
DAC[1] = 0x559a902b60n
dacsPitch = 48
Estimated GPU memory usage = 304 (MB)
Total incident particles = 1
Simulation in batchs = 1
Estimated Max. batch 2nd particles = 2500
das0=0.000000
das1=334.276978
das2=1002.830933
das3=1671.385010
das4=2339.938965
das5=3008.492920
nvidia@tegra-ubuntu:~$ sudo /usr/local/cuda-10.0/bin/nvprof ./run
[sudo] password for nvidia:
==4740== NVPROF is profiling process 4740, command: ./run
==4740== Warning: Unified Memory Profiling is not supported on the underlying platform. System requirements for unified memory can be found at: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
Unknown device type
cuda core = 0
Total global memory = 7851 (MB)
Shared memory per block = 49152 (Bytes)
daRow0s = 367366285344
daRow1s = 367366285536
DAC[0] = 0x5588bca820n
DAC[1] = 0x5588bca8e0n
dacsPitch = 48
Estimated GPU memory usage = 304 (MB)
Total incident particles = 1
Simulation in batchs = 1
Estimated Max. batch 2nd particles = 2500
das0=0.000000
das1=334.276978
das2=1002.830933
das3=1671.385010
das4=2339.938965
das5=3008.492920
==4740== Profiling application: ./run
==4740== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 99.60% 2.5681ms 1 2.5681ms 2.5681ms 2.5681ms run_kernel(int, __int64)
0.40% 10.371us 1 10.371us 10.371us 10.371us [CUDA memcpy HtoA]
API calls: 98.86% 423.11ms 1 423.11ms 423.11ms 423.11ms cudaMallocArray
0.86% 3.6693ms 1 3.6693ms 3.6693ms 3.6693ms cudaDeviceSynchronize
0.12% 514.15us 96 5.3550us 4.2240us 64.640us cuDeviceGetAttribute
0.06% 267.71us 1 267.71us 267.71us 267.71us cudaLaunchKernel
0.03% 145.79us 1 145.79us 145.79us 145.79us cudaMemcpyToArray
0.03% 126.56us 1 126.56us 126.56us 126.56us cudaGetDeviceProperties
0.01% 61.153us 1 61.153us 61.153us 61.153us cudaCreateTextureObject
0.01% 34.272us 1 34.272us 34.272us 34.272us cudaSetDevice
0.00% 18.048us 3 6.0160us 4.6720us 7.8080us cuDeviceGetCount
0.00% 17.792us 1 17.792us 17.792us 17.792us cuDeviceTotalMem
0.00% 10.208us 2 5.1040us 4.7680us 5.4400us cuDeviceGet
0.00% 5.1200us 1 5.1200us 5.1200us 5.1200us cuDeviceGetName
0.00% 4.5120us 1 4.5120us 4.5120us 4.5120us cuDeviceGetUuid
0.00% 2.0480us 1 2.0480us 2.0480us 2.0480us cudaCreateChannelDesc