Nsight Compute: Target process terminated before first instrumented API call

Hi, I am running a CUDA program on WSL2 (Arch Linux). My CUDA version is 11.7 and my cuda-tools version is 12.3. The GPU is an RTX 4070 Ti. I keep getting the error shown in the output below whenever I use the ncu profiler.

rwang2000@RicharddddW:~/eecs583/final-project/examples/rgb|main⚡ ⇒  ncu --verbose --kernel-name rgb_increase_brightness_pass_ready main.out
Skipping first line...
Skipping comments...
Reading width...
2
5
6
Reading height...
2
5
6
Reading max val...
2
5
5
width: 256, height: 256, max_val: 255
Allocating mem...
Reading rgb data...
Read ppm succeeded
==PROF== Connected to process 27311 (/home/rwang2000/eecs583/final-project/examples/rgb/main.out)
==LOG== Target process 27315 terminated before first instrumented API call.
Writing result...
==PROF== Disconnected from process 27311
==WARNING== No kernels were profiled.

I am a bit confused: I am sure the kernel is called, but it seems like ncu didn’t recognize it.
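For reference, this is the kind of check I could drop in right after the <<<...>>> launch in the code below to confirm the launch really happens (a minimal sketch, not something that is in my current code): cudaGetLastError reports launch-time failures, and the return value of cudaDeviceSynchronize reports failures during execution.

// Sketch: place right after the kernel launch in
// test_increase_brightness_pass_ready() below.
cudaError_t launch_err = cudaGetLastError();     // launch-time failures
cudaError_t sync_err = cudaDeviceSynchronize();  // failures while the kernel ran
printf("launch: %s, sync: %s\n",
       cudaGetErrorString(launch_err), cudaGetErrorString(sync_err));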

As a reference, here is main.cu:

#include <cstdio>

#include "ppm_utils.h"
// #include "rgb_pass_ready.cu"

#define TILE_WIDTH 512

__global__ void rgb_increase_brightness_pass_ready(int *pixel_dst,
                                                   int *pixel_src, int size,
                                                   float factor) {
  /*
    Declaration
  */
  int global_tid = blockIdx.x * blockDim.x + threadIdx.x;
  int local_tid = threadIdx.x;
  __shared__ int pixel_smem_src[3 * TILE_WIDTH];
  __shared__ int pixel_smem_dst[3 * TILE_WIDTH];

  /*
    "Gather": Read input pixels to shared memory
  */
  pixel_smem_src[3 * local_tid + 0] = pixel_src[3 * global_tid + 0];  // r
  __syncthreads();
  pixel_smem_src[3 * local_tid + 1] = pixel_src[3 * global_tid + 1];  // g
  __syncthreads();
  pixel_smem_src[3 * local_tid + 2] = pixel_src[3 * global_tid + 2];  // b
  __syncthreads();

  /*
    Computation, very simple here but can be more complicated
  */
  pixel_smem_dst[3 * local_tid + 0] =
      min(255, (int)(factor * (pixel_smem_src[3 * local_tid + 0])));
  pixel_smem_dst[3 * local_tid + 1] =
      min(255, (int)(factor * (pixel_smem_src[3 * local_tid + 1])));
  pixel_smem_dst[3 * local_tid + 2] =
      min(255, (int)(factor * (pixel_smem_src[3 * local_tid + 2])));
  __syncthreads();

  /*
    "Scatter": Write result to destination
  */
  pixel_dst[3 * global_tid + 0] = pixel_smem_dst[3 * local_tid + 0];  // r
  __syncthreads();
  pixel_dst[3 * global_tid + 1] = pixel_smem_dst[3 * local_tid + 1];  // g
  __syncthreads();
  pixel_dst[3 * global_tid + 2] = pixel_smem_dst[3 * local_tid + 2];  // b
}

void test_increase_brightness_pass_ready() {

  /*
    Declaration and configuration
  */
  int width, height;
  int *host_pixel_src = read_ppm("images/1.ppm", width, height);
  long long total_pixel = (long long)width * height;  // widen before multiplying
  int num_block = (total_pixel + TILE_WIDTH - 1) / TILE_WIDTH;
  const dim3 dimGrid = dim3(num_block, 1, 1);
  const dim3 dimBlock = dim3(TILE_WIDTH, 1, 1);
  int num_pixels = width * height;
  // Heap-allocate the result buffer: a variable-length stack array is not
  // standard C++, and 3 * 256 * 256 ints (~768 KB) is large for the stack.
  int *host_pixel_res = new int[3 * num_pixels];
  int *device_pixel_src;
  int *device_pixel_cpy;

  /*
    Allocate host and device memory
  */
  cudaMalloc(&device_pixel_src, 3 * num_pixels * sizeof(int));
  cudaMalloc(&device_pixel_cpy, 3 * num_pixels * sizeof(int));

  /*
    Data movement and kernel launch
  */
  cudaMemcpy(device_pixel_src, host_pixel_src, 3 * num_pixels * sizeof(int),
             cudaMemcpyHostToDevice);
  rgb_increase_brightness_pass_ready<<<dimGrid, dimBlock>>>(
      device_pixel_cpy, device_pixel_src, width * height, 1.8);
  cudaDeviceSynchronize();
  cudaMemcpy(host_pixel_res, device_pixel_cpy, 3 * num_pixels * sizeof(int),
             cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();

  /*
    Write result to output file
  */
  write_ppm("images/1_modified.ppm", host_pixel_res, width, height);


  /*
    Free host and device memory
  */
  delete[] host_pixel_src;
  delete[] host_pixel_res;
  cudaFree(device_pixel_src);
  cudaFree(device_pixel_cpy);
}

int main() {
  test_increase_brightness_pass_ready();
  return 0;
}
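Since run.sh (below) splices in a separately built cubin, one more check I could add at the top of main() is asking the runtime whether the device function is actually loaded (again only a sketch, not in the file above; cudaFuncGetAttributes returns cudaErrorInvalidDeviceFunction when no loaded device image contains the kernel for the running GPU):

// Sketch: place at the top of main(), before calling the test.
cudaFuncAttributes attr;
cudaError_t err = cudaFuncGetAttributes(
    &attr, (const void *)rgb_increase_brightness_pass_ready);
printf("kernel visible to the runtime: %s\n", cudaGetErrorString(err));

If that printed an error, it would point at the build pipeline rather than at ncu.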

And here is run.sh, which compiles the program and applies our own LLVM pass to the device code. I build by running:

sh run.sh main

#!/bin/bash

# TODO: Set the target GPU architecture
GPU_ARCH="sm_75"

# TODO: Set CUDA path
CUDA_PATH="/opt/cuda"

# TODO: Set nvcc path
NVCC="/opt/cuda/bin/nvcc"

# NOTE: If you run into a no-inline problem, try switching to CUDA 11.7


cd ../..
rm -rf build
mkdir build
cd build
cmake ..
make
cd ../examples/rgb


# Separate compilation follows the process below:
# "nvptx64-nvidia-cuda" - "clang", inputs: ["rgb.cu"], output: "/tmp/rgb-c42443/rgb-sm_35.s"
# "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["/tmp/rgb-c42443/rgb-sm_35.s"], output: "/tmp/rgb-d56ddb/rgb-sm_35.o"
# "nvptx64-nvidia-cuda" - "NVPTX::Linker", inputs: ["/tmp/rgb-d56ddb/rgb-sm_35.o", "/tmp/rgb-c42443/rgb-sm_35.s"], output: "/tmp/rgb-25be7e.fatbin"
# "x86_64-pc-linux-gnu" - "clang", inputs: ["rgb.cu", "/tmp/rgb-25be7e.fatbin"], output: "/tmp/rgb-2642c4.o"
# "nvptx64-nvidia-cuda" - "clang", inputs: ["main.cu"], output: "/tmp/main-99237c/main-sm_35.s"
# "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["/tmp/main-99237c/main-sm_35.s"], output: "/tmp/main-148b92/main-sm_35.o"
# "nvptx64-nvidia-cuda" - "NVPTX::Linker", inputs: ["/tmp/main-148b92/main-sm_35.o", "/tmp/main-99237c/main-sm_35.s"], output: "/tmp/main-93db48.fatbin"
# "x86_64-pc-linux-gnu" - "clang", inputs: ["main.cu", "/tmp/main-93db48.fatbin"], output: "/tmp/main-9b99b7.o"
# "x86_64-pc-linux-gnu" - "GNU::Linker", inputs: ["/tmp/rgb-2642c4.o", "/tmp/main-9b99b7.o"], output: "a.out"

# Compile the CUDA device and host code to rgb_device.bc and rgb_host.o
clang++ -stdlib=libc++ --cuda-gpu-arch=${GPU_ARCH} --cuda-path=${CUDA_PATH} --cuda-device-only -emit-llvm -c ${1}.cu -o ${1}_device.bc -Xclang -disable-O0-optnone
clang++ -stdlib=libc++ --cuda-gpu-arch=${GPU_ARCH} --cuda-path=${CUDA_PATH} --cuda-host-only -emit-llvm -c ${1}.cu -o ${1}_host.o -Xclang -disable-O0-optnone

# Apply the pass to the device bc code
opt -load-pass-plugin ../../build/coalpass/CoalPass.so -passes=coal ${1}_device.bc -o ${1}_device.bc

# Convert rgb_device.bc to ptx
llc -march=nvptx64 -mcpu=${GPU_ARCH} ${1}_device.bc -o ${1}_device.ptx

# Convert rgb_device.ptx to cubin
${NVCC} --gpu-architecture=${GPU_ARCH} --cubin ${1}_device.ptx

# Device-link rgb_device.cubin with rgb_host.o and generate an rgb object file
${NVCC} --gpu-architecture=${GPU_ARCH} --device-link ${1}_device.cubin ${1}_host.o -o ${1}.o

# Compile the .cu source and link it with the object file to generate an executable rgb.out
${NVCC} -gencode arch=compute_75,code=sm_75 ${1}.cu ${1}.o -o ${1}.out

# Run the executable
./${1}.out

# Remove redundant files
# rm *.bc *.ptx *.o

Hi, @slytherinheir1214

Thanks for contacting us!

However, I fail to build the sample with only main.cu. Where can I get images/1.ppm, ppm_utils.h, etc.? Can you provide them?

Also, please check whether this could have the same cause as the issue we recently discussed in Ncu does not detect kernels, ==ERROR== The application returned an error code (11) - #6 by felix_dt.

Hi, @slytherinheir1214

Is this issue resolved now?

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.