I want to profile GPU calls on an Orin-N device using Nsight Compute, but I am facing some problems.
I wrote a simple CUDA program like this:
#include "cuda_sample.h"
includeint main(int argc, char** argv) {
int a = 3;
int b = 4;
std::cout << "expect output 7, actucal output is : ";
foo(a, b);
return 0;
}
#include "cuda_runtime_api.h"
#include <iostream>

// Adds two single integers on the device: c[0] = a[0] + b[0].
//
// Launch layout: written for a single thread (<<<1, 1>>>). The kernel does not
// use threadIdx/blockIdx, so every launched thread would read the same inputs
// and write the same value to c[0].
// Preconditions: a, b and c are dereferenced in device code, so each must point
// to at least one int that is accessible from the device.
// Shared memory: none.
__global__ void kernel_foo(int* a, int* b, int* c) {
    c[0] = a[0] + b[0];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Computes a + b on the GPU and prints the result to stdout followed by a
// newline.
//
// Steps: allocate three one-int device buffers, copy `a` and `b` to the device,
// launch kernel_foo with one block of one thread, copy the result back and
// print it. Every CUDA runtime call is checked; on the first failure a message
// is written to stderr, the remaining steps are skipped and nothing is printed
// to stdout. Device buffers are released on every path.
void foo(int a, int b) {
    // Null-initialised so the unconditional cudaFree calls below are safe even
    // when an allocation failed or was never attempted.
    int* ad = nullptr;
    int* bd = nullptr;
    int* cd = nullptr;
    int ch = 0;

    // && short-circuits: after the first failure no further call is made.
    bool ok =
        cudaOk(cudaMalloc(&ad, sizeof(int)), "cudaMalloc(ad)") &&
        cudaOk(cudaMalloc(&bd, sizeof(int)), "cudaMalloc(bd)") &&
        cudaOk(cudaMalloc(&cd, sizeof(int)), "cudaMalloc(cd)") &&
        cudaOk(cudaMemcpy(ad, &a, sizeof(int), cudaMemcpyHostToDevice),
               "cudaMemcpy(ad <- a)") &&
        cudaOk(cudaMemcpy(bd, &b, sizeof(int), cudaMemcpyHostToDevice),
               "cudaMemcpy(bd <- b)");

    if (ok) {
        kernel_foo<<<1, 1>>>(ad, bd, cd);
        // A kernel launch returns no status itself; launch-configuration
        // errors are reported through cudaGetLastError().
        ok = cudaOk(cudaGetLastError(), "kernel_foo launch") &&
             // The copy back also reports any error raised while the kernel ran.
             cudaOk(cudaMemcpy(&ch, cd, sizeof(int), cudaMemcpyDeviceToHost),
                    "cudaMemcpy(ch <- cd)");
    }

    if (ok) {
        std::cout << ch << std::endl;
    }

    // Best-effort cleanup; failures are reported but there is nothing to undo.
    cudaOk(cudaFree(ad), "cudaFree(ad)");
    cudaOk(cudaFree(bd), "cudaFree(bd)");
    cudaOk(cudaFree(cd), "cudaFree(cd)");
}
The code executed successfully on the device:
But I failed to profile it using Nsight Compute.
Please let me know how to correctly profile GPU code. Thanks!
NsightCompute version: 2021.2.9.0 build 32380564 from cuda-toolkit-11-4
drive-os version 6.0.7
target board: orin-n