Shared memory access leads to crash when using cuda-memcheck

Hi!

When I run my program I get no errors (including from cudaError_t), but the data after my kernel processing looks like it didn’t change at all.
I tried to check my kernel’s memory accesses, but using cuda-memcheck makes my program crash and reboots the system.

I was able to isolate the problem (by commenting out sections of the code) and found that it happens when the data is loaded from global memory to shared memory.
I have an array W in global memory of 1024*512*16*8 float elements (representing a 4D array).
I launch the kernel with 8192 blocks and I want each block (1024 threads) to load 32 KB of the array W from global to shared memory.

I’m working on a Jetson AGX Xavier with CUDA 10.2.
Here is my code. Can someone tell me what I am doing wrong?
Thank you.

#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cmath>
#include <complex>

using namespace std;


#define CUDA_RUNTIME_HANDLE(cuda_err, string_API_name)                          \
{                                                                                \
    cudaError_t err = cuda_err;                                                  \
    if (err != cudaSuccess)                                                      \
        cerr << "CUDA error in " << __FILE__ << ":" << __LINE__ << ": "          \
             << string_API_name << ":  " << cudaGetErrorString(err) << endl;     \
}



typedef struct complexIn{
    int16_t real;
    int16_t imag;
} complexIn_t;

typedef struct complexOut{
    float real;
    float imag;
} complex_t;



__global__ void kernel_processing(complexIn_t *dataIn, complex_t *dataOut,
    float *W, int Np, int Nrg, int Nch, int Nb,
    int num_of_W_chunks, int elem_per_thread, int stride){
        

    extern __shared__ float W_shared[];

    // Load W from global to shared 
    for(int i =0; i < elem_per_thread; i++){
        W_shared[threadIdx.x + i*stride] = W[blockIdx.x*blockDim.x*elem_per_thread + threadIdx.x + i*stride];
    }

    __syncthreads();

    // Some processing
}



int main( int argc, char **argv ) {

    cout << "complex float size: " << sizeof(complex_t) << endl;
    cout << "complex int16 size: " << sizeof(complexIn_t) << endl;

    int Np = 1024;
    int Nrg = 512; 
    int Nch = 16;
    int Nb = 8;
    
    complexIn_t *dataHostIn = new complexIn_t[Np*Nrg*Nch];
    complex_t *dataHostOut = new complex_t[Np*Nrg*Nb];
    float *W = new float[Np*Nrg*Nb*Nch];
    
    // Initialize dataHostIn in size Np*Nrg*Nch
    
    for(int i =0; i<Np*Nrg*Nch; i++){
        dataHostIn[i].real = (int16_t)sin(17*i);
        dataHostIn[i].imag = (int16_t)sin(16*i);
    }

    // Initialize dataHostOut in size Np*Nrg*Nb
    
    for(int i =0; i<Np*Nrg*Nb; i++){
        dataHostOut[i].real =0;
        dataHostOut[i].imag =0;
    }

    // Initialize W in size Np*Nrg*Nb*Nch
    
    for(int i =0; i<Np*Nrg*Nb*Nch; i++){
        W[i] = (float)sin(25*i);
    }

    cout << endl;

    complexIn_t *dataDeviceIn;
    CUDA_RUNTIME_HANDLE(cudaMalloc(&dataDeviceIn, Np*Nrg*Nch*sizeof(complexIn_t)), "cudaMalloc");
    CUDA_RUNTIME_HANDLE(cudaMemcpy(dataDeviceIn, dataHostIn, Np*Nrg*Nch*sizeof(complexIn_t), cudaMemcpyDefault) , "cudaMemcpy");
    
    float *W_Device;
    CUDA_RUNTIME_HANDLE(cudaMalloc(&W_Device, Np*Nrg*Nb*Nch*sizeof(float)), "cudaMalloc");
    CUDA_RUNTIME_HANDLE(cudaMemcpy(W_Device, W, Np*Nrg*Nb*Nch*sizeof(float), cudaMemcpyDefault) , "cudaMemcpy");
    
    complex_t *dataDeviceOut;
    CUDA_RUNTIME_HANDLE(cudaMalloc(&dataDeviceOut, Np*Nrg*Nb*sizeof(complex_t)), "cudaMalloc");

    int W_chunk_size = Nch*Nb;
    int num_of_W_chunks = 64;
    int Nelements_in_W = W_chunk_size*num_of_W_chunks;
    int elem_per_thread = Nelements_in_W/1024;
    int stride = Nelements_in_W/elem_per_thread;
    int shared_mem_per_block = W_chunk_size*num_of_W_chunks*sizeof(float);
    int BlockDim = 1024;
    int GridDim = (Np*Nrg)/num_of_W_chunks;


    CUDA_RUNTIME_HANDLE(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    kernel_processing<<<GridDim, BlockDim ,shared_mem_per_block>>>(dataDeviceIn,
     dataDeviceOut, W_Device, Np, Nrg, Nch, Nb, num_of_W_chunks, elem_per_thread, stride);
     
    CUDA_RUNTIME_HANDLE(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    CUDA_RUNTIME_HANDLE(cudaMemcpy(dataHostOut, dataDeviceOut, Np*Nrg*Nb*sizeof(complex_t), cudaMemcpyDefault), "cudaMemcpy");

    CUDA_RUNTIME_HANDLE(cudaFree(dataDeviceIn), "cudaFree");
    CUDA_RUNTIME_HANDLE(cudaFree(dataDeviceOut), "cudaFree");
    CUDA_RUNTIME_HANDLE(cudaFree(W_Device), "cudaFree");

    cout << "----------- Check the output: -----------" << endl;
    cout << "dataOut[555] = " << dataHostOut[555].real << " + " << dataHostOut[555].imag << "i" << endl;


    delete[] dataHostIn;
    delete[] dataHostOut;
    delete[] W;

    return 0;
}

There are no cuda-detectable errors in your code. The reason cuda-memcheck causes the crash you describe is that kernel code may run much more slowly when it is instrumented by cuda-memcheck. As a result, you are hitting a kernel-runtime timeout on your Jetson device. This is tough to work around, but one approach is to make sure you are not running cuda-memcheck on a debug build, because that also runs more slowly. Another is to reduce the data set size so the kernel has less work to do. Perhaps start with a data set size consistent with a single block and get a single block working, then move to 2 blocks, and so on.
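
As a minimal sketch of both suggestions (my code, not from your post): cudaDeviceGetAttribute with cudaDevAttrKernelExecTimeout is the standard runtime query for whether the device enforces a kernel execution timeout, and the single-block launch below simply reuses the variables from your main() with the grid shrunk to 1:

// Check whether the device enforces a kernel execution (watchdog) timeout.
int timeoutEnabled = 0;
CUDA_RUNTIME_HANDLE(cudaDeviceGetAttribute(&timeoutEnabled,
    cudaDevAttrKernelExecTimeout, 0), "cudaDeviceGetAttribute");
cout << "kernel execution timeout enabled: " << timeoutEnabled << endl;

// Debug configuration: launch a single block first. The kernel then touches
// only the first blockDim.x*elem_per_thread elements of W, which is enough
// for cuda-memcheck to verify the shared-memory indexing quickly.
kernel_processing<<<1, BlockDim, shared_mem_per_block>>>(dataDeviceIn,
    dataDeviceOut, W_Device, Np, Nrg, Nch, Nb, num_of_W_chunks,
    elem_per_thread, stride);
CUDA_RUNTIME_HANDLE(cudaDeviceSynchronize(), "cudaDeviceSynchronize");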

When I run your code on a device that has no timeout, there is no crash, and no error reported by cuda-memcheck either.

Beyond that I can’t say much. Your program outputs a result like:

dataOut[555] = 0 + 0i

but of course there is no reason to expect anything sensible there, because the only thing your kernel is doing is loading shared memory.
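
If you want that host-side check to show anything, the kernel has to write an observable result back to global memory. As a sketch (my addition, not part of your posted kernel), you could copy one loaded value per thread into dataOut right after the __syncthreads(); the bounds check is needed because your grid has more threads than dataOut has elements (Np*Nrg*Nb):

    // Sketch only: write one shared value per thread back to global memory
    // so the host can confirm the load. Guard against threads beyond the
    // size of dataOut (Np*Nrg*Nb elements).
    int out_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (out_idx < Np * Nrg * Nb) {
        dataOut[out_idx].real = W_shared[threadIdx.x];
        dataOut[out_idx].imag = 0.0f;
    }

With something like that in place, dataOut[555] would come back nonzero, which gives you a quick end-to-end check before you add the real processing.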

I’m not sure how you arrived at the conclusion that the problem is in loading shared memory, but cuda-memcheck does not agree with you.

Since your kernel doesn’t do anything else, I can’t say anything further.

Thank you!
I will check my program again.