Running Nsight Systems fails to show my kernel in the timeline view when there is dynamic parallelism in my code. Sometimes, providing --gpuctxsw=true --trace-fork-before-exec=true
to the profiler causes all kernel activity to be displayed under “Run” and not by function name, as in the image below.
More often I get no indication from the timeline that my CUDA kernel ran. It’s very similar to the image above except the
dGPU (Tesla V100-PCIE-32GB)
sections are not present.
Here is a minimal example to reproduce this result:
#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>
#include <cuda.h>
__global__ void add_one(float *val)
{
*val = *val + 1;
}
__global__ void vector_add(const float *lhs, const float *rhs, float *result, std::size_t len)
{
int index = threadIdx.x;
int stride = blockDim.x;
for (int i = index; i < len; i += stride)
{
result[i] = lhs[i] + rhs[i];
add_one<<<1,1>>>(result + i); // works fine when this line is commented
}
}
void add(const float *lhs, const float *rhs, float *result, std::size_t len)
{
float *lhs_dev = nullptr, *rhs_dev = nullptr, *result_dev = nullptr;
cudaMalloc(&lhs_dev, len * sizeof(float));
cudaMalloc(&rhs_dev, len * sizeof(float));
cudaMalloc(&result_dev, len * sizeof(float));
cudaMemcpy(lhs_dev, lhs, len * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(rhs_dev, rhs, len * sizeof(float), cudaMemcpyHostToDevice);
dim3 block(32);
dim3 grid((len + block.x - 1) / block.x);
printf("Running kernel with config: <<<%d, %d>>>>\n", grid.x, block.x);
vector_add<<<grid, block>>>(lhs_dev, rhs_dev, result_dev, len);
cudaDeviceSynchronize();
cudaMemcpy(result, result_dev, len * sizeof(float), cudaMemcpyDeviceToHost);
}
int main(void)
{
constexpr int size = 6;
float lhs[size] = {1, 2, 3, 4, 5, 6};
float rhs[size] = {2, 3, 4, 5, 6, 7};
float result[size];
add(lhs, rhs, result, size);
printf("%f %f %f %f %f %f\n", result[0], result[1], result[2], result[3], result[4], result[5]);
return 0;
}
Running the program, I get the correct output as expected:
./example_cuda_exe
Running kernel with config: <<<1, 32>>>>
4.000000 6.000000 8.000000 10.000000 12.000000 14.000000
Why aren’t vector_add
and add_one
showing up on the profiler timeline? Or when they do, why do all kernels show as Run under dGPU?