nvprof never returns

Hello everyone,

I am new to this forum, so please let me know if I am posting in the wrong place.

I recently acquired a DIGITS DevBox with pre-installed software, but the command-line profiler (nvprof) does not seem to be working. For testing purposes, I am trying to profile the following program:

// This is the REAL "hello world" for CUDA!
// It takes the string "Hello ", prints it, then passes it to CUDA with an array
// of offsets. Then the offsets are added in parallel to produce the string "World!"
// By Ingemar Ragnemalm 2010

#include <stdio.h>

const int N = 16;
const int blocksize = 1;

__global__
void hello(char *a, int *b)
{
    a[threadIdx.x] += b[threadIdx.x];
}

int main()
{
    char a[N] = "Hello 

// This is the REAL “hello world” for CUDA!
// It takes the string "Hello ", prints it, then passes it to CUDA with an array
// of offsets. Then the offsets are added in parallel to produce the string “World!”
// By Ingemar Ragnemalm 2010

#include <stdio.h>

const int N = 16;
const int blocksize = 1;

global
void hello(char *a, int *b)
{
a[threadIdx.x] += b[threadIdx.x];
}

int main()
{
char a[N] = “Hello \0\0\0\0\0\0”;
int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

char *ad;
int *bd;
const int csize = N*sizeof(char);
const int isize = N*sizeof(int);

printf("%s", a);

cudaMalloc( (void**)&ad, csize );
cudaMalloc( (void**)&bd, isize );
cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );

dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
hello<<<dimGrid, dimBlock>>>(ad, bd);
cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
cudaFree( ad );
cudaFree( bd );

printf("%s\n", a);
return EXIT_SUCCESS;

}


// This is the REAL “hello world” for CUDA!
// It takes the string "Hello ", prints it, then passes it to CUDA with an array
// of offsets. Then the offsets are added in parallel to produce the string “World!”
// By Ingemar Ragnemalm 2010

#include <stdio.h>

const int N = 16;
const int blocksize = 1;

global
void hello(char *a, int *b)
{
a[threadIdx.x] += b[threadIdx.x];
}

int main()
{
char a[N] = “Hello \0\0\0\0\0\0”;
int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

char *ad;
int *bd;
const int csize = N*sizeof(char);
const int isize = N*sizeof(int);

printf("%s", a);

cudaMalloc( (void**)&ad, csize );
cudaMalloc( (void**)&bd, isize );
cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );

dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
hello<<<dimGrid, dimBlock>>>(ad, bd);
cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
cudaFree( ad );
cudaFree( bd );

printf("%s\n", a);
return EXIT_SUCCESS;

}


// This is the REAL “hello world” for CUDA!
// It takes the string "Hello ", prints it, then passes it to CUDA with an array
// of offsets. Then the offsets are added in parallel to produce the string “World!”
// By Ingemar Ragnemalm 2010

#include <stdio.h>

const int N = 16;
const int blocksize = 1;

global
void hello(char *a, int *b)
{
a[threadIdx.x] += b[threadIdx.x];
}

int main()
{
char a[N] = “Hello \0\0\0\0\0\0”;
int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

char *ad;
int *bd;
const int csize = N*sizeof(char);
const int isize = N*sizeof(int);

printf("%s", a);

cudaMalloc( (void**)&ad, csize );
cudaMalloc( (void**)&bd, isize );
cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );

dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
hello<<<dimGrid, dimBlock>>>(ad, bd);
cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
cudaFree( ad );
cudaFree( bd );

printf("%s\n", a);
return EXIT_SUCCESS;

}


// This is the REAL “hello world” for CUDA!
// It takes the string "Hello ", prints it, then passes it to CUDA with an array
// of offsets. Then the offsets are added in parallel to produce the string “World!”
// By Ingemar Ragnemalm 2010

#include <stdio.h>

const int N = 16;
const int blocksize = 1;

global
void hello(char *a, int *b)
{
a[threadIdx.x] += b[threadIdx.x];
}

int main()
{
char a[N] = “Hello \0\0\0\0\0\0”;
int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

char *ad;
int *bd;
const int csize = N*sizeof(char);
const int isize = N*sizeof(int);

printf("%s", a);

cudaMalloc( (void**)&ad, csize );
cudaMalloc( (void**)&bd, isize );
cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );

dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
hello<<<dimGrid, dimBlock>>>(ad, bd);
cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
cudaFree( ad );
cudaFree( bd );

printf("%s\n", a);
return EXIT_SUCCESS;

}


// This is the REAL “hello world” for CUDA!
// It takes the string "Hello ", prints it, then passes it to CUDA with an array
// of offsets. Then the offsets are added in parallel to produce the string “World!”
// By Ingemar Ragnemalm 2010

#include <stdio.h>

const int N = 16;
const int blocksize = 1;

global
void hello(char *a, int *b)
{
a[threadIdx.x] += b[threadIdx.x];
}

int main()
{
char a[N] = “Hello \0\0\0\0\0\0”;
int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

char *ad;
int *bd;
const int csize = N*sizeof(char);
const int isize = N*sizeof(int);

printf("%s", a);

cudaMalloc( (void**)&ad, csize );
cudaMalloc( (void**)&bd, isize );
cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );

dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
hello<<<dimGrid, dimBlock>>>(ad, bd);
cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
cudaFree( ad );
cudaFree( bd );

printf("%s\n", a);
return EXIT_SUCCESS;

}


// This is the REAL “hello world” for CUDA!
// It takes the string "Hello ", prints it, then passes it to CUDA with an array
// of offsets. Then the offsets are added in parallel to produce the string “World!”
// By Ingemar Ragnemalm 2010

#include <stdio.h>

const int N = 16;
const int blocksize = 1;

global
void hello(char *a, int *b)
{
a[threadIdx.x] += b[threadIdx.x];
}

int main()
{
char a[N] = “Hello \0\0\0\0\0\0”;
int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

char *ad;
int *bd;
const int csize = N*sizeof(char);
const int isize = N*sizeof(int);

printf("%s", a);

cudaMalloc( (void**)&ad, csize );
cudaMalloc( (void**)&bd, isize );
cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );

dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
hello<<<dimGrid, dimBlock>>>(ad, bd);
cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
cudaFree( ad );
cudaFree( bd );

printf("%s\n", a);
return EXIT_SUCCESS;

}

";
    int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    char *ad;
    int *bd;
    const int csize = N*sizeof(char);
    const int isize = N*sizeof(int);

    printf("%s", a);

    cudaMalloc( (void**)&ad, csize );
    cudaMalloc( (void**)&bd, isize );
    cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
    cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );

    dim3 dimBlock( blocksize, 1 );
    dim3 dimGrid( 1, 1 );
    hello<<<dimGrid, dimBlock>>>(ad, bd);
    cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
    cudaFree( ad );
    cudaFree( bd );

    printf("%s\n", a);
    return EXIT_SUCCESS;
}

Which I compile as follows:

nvcc -c hello_world.cu
nvcc hello_world.o -o hello_world

I then try to profile it as:

nvprof /home/joost/code/hello_world/hello_world

This command never returns. After five minutes or so I kill it, after which it gives me the following output:

^C======== Warning: No CUDA application was profiled, exiting
======== Error: Application received signal 2

When I run the visual profiler (nvvp), the program can be profiled just fine, until I try to debug individual kernels. At that point, the visual profiler appears to call nvprof under the hood, and the process never completes.

nvprof gives me the following version information:

~/code/hello_world$ nvprof --version
nvprof: NVIDIA (R) Cuda command line profiler
Copyright (c) 2012 - 2015 NVIDIA Corporation
Release version 7.5.18 (21)

The machine contains four GeForce GTX TITAN X GPUs.
Major revision number: 5
Minor revision number: 2

Any suggestions what I may be doing wrong, or where I should look for more information?

Before you return you should call cudaDeviceReset() or cudaProfilerStop() to “ensure all profile data is collected”.

It’s lightly documented here.
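For reference, a minimal sketch (not your exact program, just the pattern) of where those calls would typically go; cudaProfilerStart()/cudaProfilerStop() come from cuda_profiler_api.h, and cudaDeviceReset() flushes the profiling buffers before the process exits:

#include <stdlib.h>
#include <cuda_runtime.h>        // cudaDeviceReset
#include <cuda_profiler_api.h>   // cudaProfilerStart / cudaProfilerStop

int main()
{
    cudaProfilerStart();      // begin collecting profile data

    // ... allocations, copies, and kernel launches to be profiled ...

    cudaProfilerStop();       // mark the end of the profiled region
    cudaDeviceReset();        // flush all profiling data before exiting
    return EXIT_SUCCESS;
}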

Sorry for responding a bit late; the “send e-mail notification” functionality of the forum doesn’t seem to be working for me.

Thank you for your suggestion. I have changed my code as shown below, but this has no effect on the behavior of the profiler: it still hangs and never returns.

// This is the REAL "hello world" for CUDA!
// It takes the string "Hello ", prints it, then passes it to CUDA with an array
// of offsets. Then the offsets are added in parallel to produce the string "World!"
// By Ingemar Ragnemalm 2010

#include <stdio.h>
#include <cuda_profiler_api.h>

const int N = 16;
const int blocksize = 1;

__global__
void hello(char *a, int *b)
{
    a[threadIdx.x] += b[threadIdx.x];
}

int main()
{
    char a[N] = "Hello \0\0\0\0\0\0";
    int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    char *ad;
    int *bd;
    const int csize = N*sizeof(char);
    const int isize = N*sizeof(int);

    printf("%s", a);

    cudaProfilerStart();
    cudaMalloc( (void**)&ad, csize );
    cudaMalloc( (void**)&bd, isize );
    cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
    cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );

    dim3 dimBlock( blocksize, 1 );
    dim3 dimGrid( 1, 1 );
    hello<<<dimGrid, dimBlock>>>(ad, bd);
    cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
    cudaFree( ad );
    cudaFree( bd );

    cudaProfilerStop();

    printf("%s\n", a);
    return EXIT_SUCCESS;
}

Hmm… it works on Win10/x64 + CUDA 7.5.

Is this on a DIGITS DEVBOX that you purchased directly from NVIDIA?

what is the output from running nvidia-smi on that machine?

what is the output from running:

cuda-memcheck /home/joost/code/hello_world/hello_world

?

Works for me as well (64-bit Win7, CUDA 7.5):

>nvprof profiler_problem.exe
Hello ==2812== NVPROF is profiling process 2812, command: profiler_problem.exe
Wello
==2812== Profiling application: profiler_problem.exe
==2812== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 36.72%  2.4320us         2  1.2160us     640ns  1.7920us  [CUDA memcpy HtoD]
 32.37%  2.1440us         1  2.1440us  2.1440us  2.1440us  hello(char*, int*)
 30.91%  2.0470us         1  2.0470us  2.0470us  2.0470us  [CUDA memcpy DtoH]

==2812== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 99.56%  291.98ms         1  291.98ms  291.98ms  291.98ms  cudaProfilerStart
  0.15%  447.16us        83  5.3870us       0ns  210.24us  cuDeviceGetAttribute
  0.15%  434.85us         2  217.43us  9.0900us  425.76us  cudaMalloc
  0.05%  151.30us         3  50.434us  29.616us  75.652us  cudaMemcpy
  0.04%  104.39us         1  104.39us  104.39us  104.39us  cuDeviceGetName
  0.03%  91.486us         2  45.743us  14.955us  76.531us  cudaFree
  0.02%  44.570us         1  44.570us  44.570us  44.570us  cudaLaunch
  0.00%  4.6920us         1  4.6920us  4.6920us  4.6920us  cuDeviceTotalMem
  0.00%  1.7600us         1  1.7600us  1.7600us  1.7600us  cudaConfigureCall
  0.00%  1.7590us         2     879ns     586ns  1.1730us  cuDeviceGetCount
  0.00%  1.1730us         2     586ns     293ns     880ns  cudaSetupArgument
  0.00%     587ns         2     293ns     293ns     294ns  cuDeviceGet

This problem can occur if the version of nvprof you are using does not match the CUDART library that your application was linked against.

However, this doesn’t seem likely in your case.
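If you want to rule that out, one quick check (just a diagnostic sketch, not part of the original program) is to print the runtime and driver versions from inside a small CUDA program and compare them with what nvprof --version reports:

#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    int runtimeVersion = 0, driverVersion = 0;
    cudaRuntimeGetVersion(&runtimeVersion);   // CUDART version the binary runs against
    cudaDriverGetVersion(&driverVersion);     // CUDA version supported by the installed driver
    // Values are encoded as 1000*major + 10*minor, e.g. 7050 for CUDA 7.5
    printf("runtime: %d, driver: %d\n", runtimeVersion, driverVersion);
    return 0;
}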

Thanks to everyone for the help and suggestions.

It happens on a DIGITS DEVBOX purchased directly from NVIDIA.

The output of nvidia-smi:

Sat Mar 26 15:44:32 2016       
+------------------------------------------------------+                       
| NVIDIA-SMI 352.79     Driver Version: 352.79         |                       
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX TIT...  Off  | 0000:05:00.0     Off |                  N/A |
| 22%   55C    P0    73W / 250W |     23MiB / 12285MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce GTX TIT...  Off  | 0000:06:00.0     Off |                  N/A |
| 22%   58C    P0    75W / 250W |     23MiB / 12287MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce GTX TIT...  Off  | 0000:09:00.0     Off |                  N/A |
| 22%   53C    P0    72W / 250W |     23MiB / 12287MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  GeForce GTX TIT...  Off  | 0000:0A:00.0     Off |                  N/A |
|  0%   44C    P0    54W / 250W |     23MiB / 12287MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID  Type  Process name                               Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

Output of cuda-memcheck /home/joost/code/hello_world/hello_world:

========= CUDA-MEMCHECK
Hello Wello 
========= ERROR SUMMARY: 0 errors

Also, the output of nvcc --version is:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2015 NVIDIA Corporation
Built on Tue_Aug_11_14:27:32_CDT_2015
Cuda compilation tools, release 7.5, V7.5.17

Output of ldd hello_world:

linux-vdso.so.1 =>  (0x00007ffd0cd9c000)
	librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f146dd67000)
	libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f146db49000)
	libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f146d944000)
	libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f146d57f000)
	/lib64/ld-linux-x86-64.so.2 (0x00007f146df92000)

(I guess the CUDA libraries are statically linked?)

Based on the responses, I expect there is some issue with the nvprof installation, one of the libraries it requires, or perhaps even the nvcc compiler, and that reinstalling or updating some of these components may solve it. I’ll probably contact NVIDIA support, and I’ll post back here if we find the issue.

With the help of NVIDIA support, we found the issue.

After receiving the DEVBOX, we updated CUDA with:

sudo dpkg -i cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb
sudo apt-get update
sudo apt-get install cuda

That breaks the profiler.

Reverting to an older installation (in this case cuda-7.0) by changing the symlink to CUDA solved the problem:

cd /usr/local
sudo mv cuda cuda_old
sudo ln -s /usr/local/cuda-7.0 /usr/local/cuda

Note to self and others: the DEVBOX is designed to be updated as described here:
http://docs.nvidia.com/deeplearning/digits-devbox-user-guide/index.html#software-updates