Shared memory access leads to crash when using cuda-memcheck


When I run my program I get no errors (every cudaError_t return value is cudaSuccess), but the data after my kernel processing looks like it didn't change at all.
I tried to check my kernel's memory accesses, but running under cuda-memcheck makes my program crash and reboots the system.

I was able to isolate the problem (by commenting out sections of the code) and found that it happens when the data is loaded from global memory to shared memory.
I have an array W in global memory of 1024*512*16*8 float elements (representing a 4D array).
I launch the kernel with 8192 blocks, and I want each block (1024 threads) to load 32 KB of the array W from global to shared memory.
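
For reference, here is the arithmetic behind those numbers (it matches the constants in the code below):

// W total:     Np*Nrg*Nch*Nb = 1024*512*16*8 = 67,108,864 floats = 256 MiB
// Per block:   Nch*Nb*num_of_W_chunks = 16*8*64 = 8192 floats = 32 KiB
// Block count: (Np*Nrg)/num_of_W_chunks = (1024*512)/64 = 8192 blocks
// So the grid as a whole loads 8192 * 32 KiB = 256 MiB, i.e. every element of W exactly once.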

I’m working on Jetson AGX Xavier with cuda 10.2.
Here is my code. Can someone tell me what I am doing wrong?
Thank you.

#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
#include <complex>
#include <cmath>

using namespace std;

#define CUDA_RUNTIME_HANDLE(cuda_err, string_API_name)  \
{                                                       \
    cudaError_t err = cuda_err;                         \
    if (err != cudaSuccess)                             \
        cerr << "CUDA error in " << __FILE__ << ":" << __LINE__ << ": " << string_API_name << ": " << cudaGetErrorString(err) << endl; \
}

typedef struct complexIn{
    int16_t real;
    int16_t imag;
} complexIn_t;

typedef struct complexOut{
    float real;
    float imag;
} complex_t;

__global__ void kernel_processing(complexIn_t *dataIn, complex_t *dataOut,
    float *W, int Np, int Nrg, int Nch, int Nb,
    int num_of_W_chunks, int elem_per_thread, int stride){

    extern __shared__ float W_shared[];

    // Load W from global memory to shared memory
    for(int i = 0; i < elem_per_thread; i++){
        W_shared[threadIdx.x + i*stride] = W[blockIdx.x*blockDim.x*elem_per_thread + threadIdx.x + i*stride];
    }
    __syncthreads();   // make the loaded tile visible to the whole block

    // Some processing
}

int main( int argc, char **argv ) {

    cout << "complex float size: " << sizeof(complex_t) << endl;
    cout << "complex int16 size: " << sizeof(complexIn_t) << endl;

    int Np = 1024;
    int Nrg = 512; 
    int Nch = 16;
    int Nb = 8;
    complexIn_t *dataHostIn = new complexIn_t[Np*Nrg*Nch];
    complex_t *dataHostOut = new complex_t[Np*Nrg*Nb];
    float *W = new float[Np*Nrg*Nb*Nch];
    // Initialize dataHostIn (size Np*Nrg*Nch)
    for(int i = 0; i < Np*Nrg*Nch; i++){
        dataHostIn[i].real = (int16_t)sin(17*i);
        dataHostIn[i].imag = (int16_t)sin(16*i);
    }

    // Initialize dataHostOut (size Np*Nrg*Nb)
    for(int i = 0; i < Np*Nrg*Nb; i++){
        dataHostOut[i].real = 0;
        dataHostOut[i].imag = 0;
    }

    // Initialize W (size Np*Nrg*Nb*Nch)
    for(int i = 0; i < Np*Nrg*Nb*Nch; i++){
        W[i] = (float)sin(25*i);
    }

    cout << endl;

    complexIn_t *dataDeviceIn;
    CUDA_RUNTIME_HANDLE(cudaMalloc(&dataDeviceIn, Np*Nrg*Nch*sizeof(complexIn_t)), "cudaMalloc");
    CUDA_RUNTIME_HANDLE(cudaMemcpy(dataDeviceIn, dataHostIn, Np*Nrg*Nch*sizeof(complexIn_t), cudaMemcpyDefault) , "cudaMemcpy");
    float *W_Device;
    CUDA_RUNTIME_HANDLE(cudaMalloc(&W_Device, Np*Nrg*Nb*Nch*sizeof(float)), "cudaMalloc");
    CUDA_RUNTIME_HANDLE(cudaMemcpy(W_Device, W, Np*Nrg*Nb*Nch*sizeof(float), cudaMemcpyDefault) , "cudaMemcpy");
    complex_t *dataDeviceOut;
    CUDA_RUNTIME_HANDLE(cudaMalloc(&dataDeviceOut, Np*Nrg*Nb*sizeof(complex_t)), "cudaMalloc");

    int W_chunk_size = Nch*Nb;                              // 16*8 = 128 floats per chunk
    int num_of_W_chunks = 64;                               // chunks held in shared memory per block
    int Nelements_in_W = W_chunk_size*num_of_W_chunks;      // 8192 floats per block
    int elem_per_thread = Nelements_in_W/1024;              // 8 loads per thread
    int stride = Nelements_in_W/elem_per_thread;            // 1024, i.e. one blockDim.x apart
    int shared_mem_per_block = W_chunk_size*num_of_W_chunks*sizeof(float);  // 32 KiB
    int BlockDim = 1024;
    int GridDim = (Np*Nrg)/num_of_W_chunks;                 // 8192 blocks

    CUDA_RUNTIME_HANDLE(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    kernel_processing<<<GridDim, BlockDim ,shared_mem_per_block>>>(dataDeviceIn,
     dataDeviceOut, W_Device, Np, Nrg, Nch, Nb, num_of_W_chunks, elem_per_thread, stride);
    CUDA_RUNTIME_HANDLE(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    CUDA_RUNTIME_HANDLE(cudaMemcpy(dataHostOut, dataDeviceOut, Np*Nrg*Nb*sizeof(complex_t), cudaMemcpyDefault), "cudaMemcpy");

    CUDA_RUNTIME_HANDLE(cudaFree(dataDeviceIn), "cudaFree");
    CUDA_RUNTIME_HANDLE(cudaFree(dataDeviceOut), "cudaFree");
    CUDA_RUNTIME_HANDLE(cudaFree(W_Device), "cudaFree");

    cout << "----------- Check the output: -----------" << endl;
    cout << "dataOut[555] = " << dataHostOut[555].real << " + " << dataHostOut[555].imag << "i" << endl;

    delete[] dataHostIn;
    delete[] dataHostOut;
    delete[] W;

    return 0;
}

There are no cuda-detectable errors in your code. The reason cuda-memcheck causes the crash you describe is that kernel code may run much more slowly when instrumented by cuda-memcheck. As a result, you are hitting a kernel timeout on your Jetson device. This is tough to work around, but one approach is to make sure you are not running cuda-memcheck on a debug build, because debug code also runs more slowly. Another is to reduce the data set size so the kernel has less work to do. Perhaps start by making the data set size consistent with a single block and get that single block working; then move to 2 blocks, and so on.
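
For example, here is one hypothetical scaled-down configuration that keeps your formulas intact but produces exactly one block (the values are illustrative, not from your program):

int Np  = 8;     // shrunk from 1024
int Nrg = 8;     // shrunk from 512
int Nch = 16;    // unchanged
int Nb  = 8;     // unchanged
// With num_of_W_chunks = 64 as before:
//   W holds Np*Nrg*Nb*Nch = 8192 floats = 32 KiB, i.e. one block's load
//   GridDim = (Np*Nrg)/num_of_W_chunks = 64/64 = 1 block
//   elem_per_thread = 8 and stride = 1024, exactly as in the full-size run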

When I run your code on a device that has no timeout, there is no crash, and no error reported by cuda-memcheck either.

Beyond that I can’t say much. Your program outputs a result like:

dataOut[555] = 0 + 0i

but of course there is no reason to expect anything sensible there, because the only thing your kernel is doing is loading shared memory.

I’m not sure how you arrived at the conclusion that the problem is in loading shared memory, but cuda-memcheck does not agree with you.
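
If you want to check the shared-memory load directly, without cuda-memcheck, one option is to round-trip the loaded data back to global memory and compare it on the host. A minimal sketch reusing your indexing (W_check is a hypothetical scratch buffer the same size as W):

// Hypothetical debug kernel: echo the shared-memory tile back to a scratch
// buffer so the host can verify the load byte-for-byte against W.
__global__ void kernel_load_check(const float *W, float *W_check,
                                  int elem_per_thread, int stride){
    extern __shared__ float W_shared[];
    int base = blockIdx.x*blockDim.x*elem_per_thread;
    // Same load as your posted kernel
    for(int i = 0; i < elem_per_thread; i++)
        W_shared[threadIdx.x + i*stride] = W[base + threadIdx.x + i*stride];
    __syncthreads();
    // Write the tile back out for host-side comparison
    for(int i = 0; i < elem_per_thread; i++)
        W_check[base + threadIdx.x + i*stride] = W_shared[threadIdx.x + i*stride];
}
// Launch with the same configuration as kernel_processing, then cudaMemcpy
// W_check back and memcmp it against the original host array W. If it
// matches, the shared-memory load itself is fine and the problem is elsewhere.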

Since your kernel doesn’t do anything else, I can’t say anything further.

Thank you!
I will check my program again.