Dump/inspect NVIDIA GPU global memory contents corresponding to arbitrary (but valid) addresses

I was wondering how to inspect or dump the contents of NVIDIA GPU global memory at an arbitrary but valid address (not necessarily one returned by CUDA's memory-management APIs), for example an address pointing to some symbol, through cuda-gdb.

Let us take a small example program to illustrate the situation:

/*********************************post.cu***********************************/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define CUDA_SAFECALL(call)                                                 \
    {                                                                       \
        call;                                                               \
        cudaError err = cudaGetLastError();                                 \
        if (cudaSuccess != err) {                                           \
            fprintf(                                                        \
                stderr,                                                     \
                "Cuda error in function '%s' file '%s' in line %i : %s.\n", \
                #call, __FILE__, __LINE__, cudaGetErrorString(err));        \
            fflush(stderr);                                                 \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    }

// CUDA kernel that adds two vectors, each thread handles one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) c[id] = a[id] + b[id];
}

// CUDA kernel that doubles the elements of a vector
__global__ void vecDouble(double *a, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) a[id] *= 2;
}

// CUDA kernel that halves the elements of a vector
__global__ void vecHalve(double *a, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) a[id] /= 2;
}

typedef void (*fp)(double *, double *, double *, int);
typedef void (*fp1)(double *, int);

__device__ fp kernelPtrvecAdd = vecAdd;
__device__ fp1 kernelPtrvecDouble = vecDouble;
__device__ fp1 kernelPtrvecHalve = vecHalve;

// Parent kernel that launches vecAdd dynamically
__global__ void parentKernel(fp kernelPtr, double *d_a, double *d_b, double *d_c, int n) {
    int blockSize = 1024;
    int gridSize = (int)ceil((float)n / blockSize);
    printf("Parent kernel:: kernelPtr: %p\n", kernelPtr);
    // Launch the vecAdd kernel dynamically from the device
    kernelPtr<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
}

__global__ void breakpointKernel(){
    printf("Breakpoint Kernel\n");
}

void breakpointFunction(){
    printf("Breakpoint Function\n");
}

int main(int argc, char *argv[]) {
    // Size of vectors
    int n = 10000;
    if (argc > 1) n = atoi(argv[1]);
    
    // Host input vectors
    double *h_a, *h_b, *h_c;

    // Device input vectors
    double *d_a, *d_b, *d_c;

    // Size in bytes of each vector
    size_t bytes = n * sizeof(double);

    // Allocate memory for each vector on host
    h_a = (double *)malloc(bytes);
    h_b = (double *)malloc(bytes);
    h_c = (double *)malloc(bytes);

    // Initialize vectors on host
    for (int i = 0; i < n; i++) {
        h_a[i] = sin(i) * sin(i);
        h_b[i] = cos(i) * cos(i);
        h_c[i] = 0;
    }

    fp h_kernelPtrvecAdd;
    fp1 h_kernelPtrvecDouble;
    fp1 h_kernelPtrvecHalve;
    CUDA_SAFECALL(cudaMemcpyFromSymbol(&h_kernelPtrvecAdd, kernelPtrvecAdd, sizeof(fp)));
    CUDA_SAFECALL(cudaMemcpyFromSymbol(&h_kernelPtrvecDouble, kernelPtrvecDouble, sizeof(fp1)));
    CUDA_SAFECALL(cudaMemcpyFromSymbol(&h_kernelPtrvecHalve, kernelPtrvecHalve, sizeof(fp1)));

    printf("Device vecAdd Ptr: %p\n", h_kernelPtrvecAdd);
    printf("Host   vecAdd Ptr: %p\n", vecAdd);

    printf("Device vecDouble Ptr: %p\n", h_kernelPtrvecDouble);
    printf("Host   vecDouble Ptr: %p\n", vecDouble);

    printf("Device vecHalve Ptr: %p\n", h_kernelPtrvecHalve);
    printf("Host   vecHalve Ptr: %p\n", vecHalve);

    // Create CUDA stream
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Allocate memory for each vector on GPU using async memory allocation
    cudaMallocAsync(&d_a, bytes, stream);
    cudaMallocAsync(&d_b, bytes, stream);
    cudaMallocAsync(&d_c, bytes, stream);

    cudaStreamSynchronize(stream);

    printf("d_a: %p\n", d_a);
    printf("d_b: %p\n", d_b);
    printf("d_c: %p\n", d_c);

    // Copy host vectors to device using async memory copy
    cudaMemcpyAsync(d_a, h_a, bytes, cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, stream);
    
    cudaStreamSynchronize(stream);

    // Launch parent kernel that launches vecAdd dynamically
    (parentKernel<<<1, 1, 0, stream>>>(h_kernelPtrvecAdd, d_a, d_b, d_c, n));

    int blockSize, gridSize;
    
    // Number of threads in each thread block
    blockSize = 1024;

    // Number of thread blocks in grid
    gridSize = (int)ceil((float)n / blockSize);

    vecDouble<<<gridSize, blockSize, 0, stream>>>(d_a, n);
    vecDouble<<<gridSize, blockSize, 0, stream>>>(d_b, n);
    vecAdd<<<gridSize, blockSize, 0, stream>>>(d_a, d_b, d_c, n);
    vecHalve<<<gridSize, blockSize, 0, stream>>>(d_c, n);
    
    // Synchronize the stream to ensure everything is done
    cudaStreamSynchronize(stream);
    
    // Copy array back to host using async memory copy
    cudaMemcpyAsync(h_c, d_c, bytes, cudaMemcpyDeviceToHost, stream);

    // Make sure the copy has finished before reading h_c on the host
    cudaStreamSynchronize(stream);

    // Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for (int i = 0; i < n; i++) sum += h_c[i];
    printf("Final sum = %f; sum/n = %f (should be ~1)\n", sum, sum / n);

    breakpointKernel<<<1, 1, 0, stream>>>();
    breakpointFunction();
    
    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);

    // Release device memory using async memory deallocation
    cudaFreeAsync(d_a, stream);
    cudaFreeAsync(d_b, stream);
    cudaFreeAsync(d_c, stream);


    cudaStreamDestroy(stream);
    return 0;
}



$ nvcc -g -G -o post post.cu -rdc=true

Now let us run the program under cuda-gdb:

$ cuda-gdb ./post
(cuda-gdb) break breakpointKernel() 
Breakpoint 1 at 0xcf30: file post.cu, line 53.
(cuda-gdb) break breakpointFunction() 
Breakpoint 2 at 0xbe2f: file post.cu, line 58.
(cuda-gdb) run                        
Starting program: post 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[New Thread 0x7ffff54c0000 (LWP 22501)]
[Detaching after fork from child process 22502]
[New Thread 0x7ffff4993000 (LWP 22511)]
[New Thread 0x7fffe89ff000 (LWP 22512)]
[New Thread 0x7fffe3fff000 (LWP 22513)]
Device vecAdd Ptr: 0x7fffceaff800
Host   vecAdd Ptr: 0x555555560957
Device vecDouble Ptr: 0x7fffceaff300
Host   vecDouble Ptr: 0x555555560ac7
Device vecHalve Ptr: 0x7fffceafee00
Host   vecHalve Ptr: 0x555555560c29
d_a: 0x302000000
d_b: 0x302013a00
d_c: 0x302027400
Parent kernel:: kernelPtr: 0x7fffceaff800
Final sum = 10000.000000; sum/n = 1.000000 (should be ~1)
[Switching focus to CUDA kernel 0, grid 8, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 1, lane 0]

Thread 1 "post" hit Breakpoint 1, breakpointKernel<<<(1,1,1),(1,1,1)>>> () at post.cu:54
54          printf("Breakpoint Kernel\n");

So, we are inside a kernel.
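
As a quick sanity check (not captured in the session above), listing the active kernels should confirm this; something like the following ought to show the running breakpointKernel and the current focus:

(cuda-gdb) info cuda kernels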

Trying to dump the memory contents of array a on the device (d_a):

(cuda-gdb) x /10x (@global void*) 0x302000000
0x302000000:    0x00000000      0x00000000      0x95d4dc81      0x3ff6a889
0x302000010:    0x036d9261      0x3ffa7553      0x687dd0a8      0x3fa4648f
0x302000020:    0xd7ec65f2      0x3ff253f7
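
Since d_a holds doubles, the same bytes can also be viewed directly as floating-point values; a command along these lines should work as well (reconstructed from memory, output not pasted here):

(cuda-gdb) x /5fg (@global void*) 0x302000000

Decoding the 8-byte words of the hex dump above by hand gives 0, ~1.4161, ~1.6536, ~0.0398, ~1.1455, i.e. 2*sin^2(i), which is what d_a should hold after vecDouble has run on it.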

So reading ordinary global memory works. Trying the same at the device-side pointer of vecAdd, however, gives:

(cuda-gdb) x /10x (@global void*) 0x7fffceaff800
0x7fffceaff800: Error: Failed to read 4 bytes of global memory from 0x7fffceaff800
, error=CUDBG_ERROR_INVALID_MEMORY_ACCESS(0x8).

(cuda-gdb) x /10bx (@global void*) 0x7fffceaff800
0x7fffceaff800: Error: Failed to read 1 bytes of global memory from 0x7fffceaff800
, error=CUDBG_ERROR_INVALID_MEMORY_ACCESS(0x8).

(cuda-gdb) x /10i (@global void*) 0x7fffceaff800 
   0x7fffceaff800:
   0x7fffceaff801:
   0x7fffceaff802:
   0x7fffceaff803:
   0x7fffceaff804:
   0x7fffceaff805:
   0x7fffceaff806:
   0x7fffceaff807:
   0x7fffceaff808:
   0x7fffceaff809:

Let us continue and hit the breakpoint in the host function:

(cuda-gdb) continue
Continuing.
Breakpoint Kernel

Thread 1 "post" hit Breakpoint 2, breakpointFunction () at post.cu:58
58          printf("Breakpoint Function\n");
(cuda-gdb) 

Now let us try to access the content at the host pointer of vecAdd:


(cuda-gdb) x /10i 0x555555560957                
   0x555555560957 <_Z6vecAddPdS_S_i>:   endbr64 
   0x55555556095b <_Z6vecAddPdS_S_i+4>: push   %rbp
   0x55555556095c <_Z6vecAddPdS_S_i+5>: mov    %rsp,%rbp
   0x55555556095f <_Z6vecAddPdS_S_i+8>: sub    $0x20,%rsp
   0x555555560963 <_Z6vecAddPdS_S_i+12>:        mov    %rdi,-0x8(%rbp)
   0x555555560967 <_Z6vecAddPdS_S_i+16>:        mov    %rsi,-0x10(%rbp)
   0x55555556096b <_Z6vecAddPdS_S_i+20>:        mov    %rdx,-0x18(%rbp)
   0x55555556096f <_Z6vecAddPdS_S_i+24>:        mov    %ecx,-0x1c(%rbp)
   0x555555560972 <_Z6vecAddPdS_S_i+27>:        mov    -0x1c(%rbp),%ecx
   0x555555560975 <_Z6vecAddPdS_S_i+30>:        mov    -0x18(%rbp),%rdx

The above are valid x86_64 instructions.

Now, from the host-side breakpoint, trying to read the memory at the device pointer of vecAdd:

(cuda-gdb) x /10i (@global void*) 0x7fffceaff800
   0x7fffceaff800:      add    %al,(%rax)
   0x7fffceaff802:      add    %al,(%rax)
   0x7fffceaff804:      add    %al,(%rax)
   0x7fffceaff806:      add    %al,(%rax)
   0x7fffceaff808:      add    %al,(%rax)
   0x7fffceaff80a:      add    %al,(%rax)
   0x7fffceaff80c:      add    %al,(%rax)
   0x7fffceaff80e:      add    %al,(%rax)
   0x7fffceaff810:      add    %al,(%rax)
   0x7fffceaff812:      add    %al,(%rax)

I get a strange stretch of add instructions, and the output is exactly the same for the other kernels such as vecDouble and vecHalve.

Trying the same for vecDouble:

(cuda-gdb) x /10i (@global void*)  0x7fffceaff300        
   0x7fffceaff300:      add    %al,(%rax)
   0x7fffceaff302:      add    %al,(%rax)
   0x7fffceaff304:      add    %al,(%rax)
   0x7fffceaff306:      add    %al,(%rax)
   0x7fffceaff308:      add    %al,(%rax)
   0x7fffceaff30a:      add    %al,(%rax)
   0x7fffceaff30c:      add    %al,(%rax)
   0x7fffceaff30e:      add    %al,(%rax)
   0x7fffceaff310:      add    %al,(%rax)
   0x7fffceaff312:      add    %al,(%rax)

These add instructions look bogus to me; add %al,(%rax) is simply what a run of zero bytes decodes to as x86-64, so this is clearly not the kernel's code.

My question is: how can one dump the memory contents at some arbitrary (but valid) device-side pointer, such as one corresponding to a symbol, or the memory around it?


cuda-gdb version:

NVIDIA (R) CUDA Debugger
12.0 release
Portions Copyright (C) 2007-2022 NVIDIA Corporation
GNU gdb (GDB) 12.1

[Crosspost]

Hi, thanks for reaching out. It looks like you have already had this answered on Stack Overflow.

When we dereference the memory at vecAdd with the cast provided, we are instructing the debugger to read it as data stored in global memory. However, in this example vecAdd resides in code memory. Currently, we do not support casting a pointer to code memory. We can work around this by instead using the disass command:

(cuda-gdb) disass 0x7fffd3291800
Dump of assembler code for function _Z6vecAddPdS_S_i:
   0x00007fffd3291800 <+0>:	MOV R1,c[0x0][0x28]
   0x00007fffd3291810 <+16>:	MOV R10,c[0x0][0x118]
   0x00007fffd3291820 <+32>:	MOV R11,c[0x0][0x11c]
   0x00007fffd3291830 <+48>:	MOV R2,RZ
   0x00007fffd3291840 <+64>:	LDC.64 R2,c[0x0][R2+0x160]
   0x00007fffd3291850 <+80>:	MOV R8,R2
   0x00007fffd3291860 <+96>:	MOV R9,R3
   0x00007fffd3291870 <+112>:	MOV R8,R8
   0x00007fffd3291880 <+128>:	MOV R9,R9

It is important to note that when using function pointers there are certain rules to follow, which we document in the CUDA programming guide. Today, function pointers are not interchangeable between the host and the device, and what a device function pointer points to may not be the actual start of the function body; you can think of it more as an entry in a jump table. For the documented rules, see the CUDA C++ Programming Guide.
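
To make this concrete, here is a minimal sketch reusing the declarations from post.cu above (my illustration, not an excerpt from the guide): the host-side address of a __global__ function can be handed to the host launch APIs, while the device-side value read back from kernelPtrvecAdd cannot.

    // Host-side launch through the host address of vecAdd: supported.
    void *args[] = { &d_a, &d_b, &d_c, &n };
    cudaLaunchKernel((const void *)vecAdd, dim3(gridSize), dim3(blockSize), args, 0, stream);

    // Passing h_kernelPtrvecAdd (the device-side pointer value copied back with
    // cudaMemcpyFromSymbol) to cudaLaunchKernel is not supported; that value is
    // only meaningful for launches made from device code, as parentKernel does.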

I think this is a valid feature request for us to look into in the future. Right now, there is no way to dump the contents of the function that a device function pointer points to. If I try the disassemble command on kernelPtrvecAdd, I get an error today:

(cuda-gdb) disass kernelPtrvecAdd
No function contains specified address.