Illegal memory access was encountered when launch some kernels in a loop

So i have a illegal memory access was encountered error when i try to launch some kernels in a loop like this:

        for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
        {
            // Extract bits
            extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);

            // Compute nOnesBefore       
            scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);

            int zero = 0;
            CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
            CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

            // Compute rank
            computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);

            // Swap d_in and d_out
            uint32_t* temp = d_in;
            d_in = d_out;
            d_out = temp;
        }

And i debug and found out this ‘extractBits’ function was calculate wrong. When I try to print to see problem, the error rarely happen. So i realize the more delay i add, the less chance error happen. I add something like this and it run fine without error:

        for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
        {
            // Extract bits
            extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);

            // Dummy copy to delay, do nothing.
            CHECK(cudaMemcpy(src, d_in, sizeof(uint32_t) * n, cudaMemcpyDeviceToHost));

            // Compute nOnesBefore       
            scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);

            int zero = 0;
            CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
            CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

            // Compute rank
            computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);

            // Swap d_in and d_out
            uint32_t* temp = d_in;
            d_in = d_out;
            d_out = temp;
        }

What am i missing here? Also my laptop with gt740m can only run this with block size 256 or lower, my friend’s laptop with gtx 1050 can’t run and google colab with T4 run fine with any block size.

I don’t think there is enough information here to say anything definitive.

For the laptop issues, you may want to make sure you are not hitting a WDDM TDR timeout. There is also a similar kernel duration limit for display GPUs on linux.

For the block size 256 issue, you may want to see if you are running into a registers per thread issue.

There are many questions about both of these topics on the web already. Also, I would generally recommend any time you are having trouble with a CUDA code, to use:

  1. proper CUDA error checking (google that, take the first hit)
  2. the CUDA compute sanitizer tool (or cuda-memcheck if you wish)

Memory error happen because ‘extractBits’ function’s result was wrong.
After some test, i found out if i added something to delay under that function, result will be right and no more error.
So i think maybe because other function was run before ‘extractBits’ function done.
But as i remember correctly, kernels are executed sequentially by default(not using stream). So i don’t know what is problem.
Here is source code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

#define CHECK(call)\
{\
    const cudaError_t error = call;\
    if (error != cudaSuccess)\
    {\
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);\
        fprintf(stderr, "code: %d, reason: %s\n", error,\
                cudaGetErrorString(error));\
        exit(1);\
    }\
}

struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    void Start()
    {
        cudaEventRecord(start, 0);
        cudaEventSynchronize(start);
    }

    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    float Elapsed()
    {
        float elapsed;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};

// Sequential Radix Sort
// "const uint32_t * in" means: the memory region pointed by "in" is read-only
void sortByHost(const uint32_t * in, int n,
                uint32_t * out)
{
    int * bits = (int *)malloc(n * sizeof(int));
    int * nOnesBefore = (int *)malloc(n * sizeof(int));

    uint32_t * src = (uint32_t *)malloc(n * sizeof(uint32_t));
    uint32_t * originalSrc = src; // To free memory later
    memcpy(src, in, n * sizeof(uint32_t));
    uint32_t * dst = out;

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
	// In each loop, sort elements according to the current bit from src to dst 
	// (using STABLE counting sort)
    for (int bitIdx = 0; bitIdx < sizeof(uint32_t) * 8; bitIdx++)
    {
        // Extract bits
        for (int i = 0; i < n; i++)
            bits[i] = (src[i] >> bitIdx) & 1;

        // Compute nOnesBefore
        nOnesBefore[0] = 0;
        for (int i = 1; i < n; i++)
            nOnesBefore[i] = nOnesBefore[i-1] + bits[i-1];

        // Compute rank and write to dst
        int nZeros = n - nOnesBefore[n-1] - bits[n-1];
        for (int i = 0; i < n; i++)
        {
            int rank;
            if (bits[i] == 0)
                rank = i - nOnesBefore[i];
            else
                rank = nZeros + nOnesBefore[i];
            dst[rank] = src[i];
        }

        // Swap src and dst
        uint32_t * temp = src;
        src = dst;
        dst = temp;
    }

    // Does out array contain results?
    memcpy(out, src, n * sizeof(uint32_t));

    // Free memory
    free(originalSrc);
    free(bits);
    free(nOnesBefore);
}

__global__ void extractBits(uint32_t* in, int n, int* out, int bitId)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = (in[i] >> bitId) & 1;
}

__device__ int bCount = 0;
volatile __device__ int bDoneCount = 0;

__global__ void scanKernel(int* in, int n, int* out, volatile int* bSums)
{
    __shared__ int blockId;
    if (threadIdx.x == 0)
    {
        blockId = atomicAdd(&bCount, 1);
    }
    __syncthreads();

    // 1. Each block loads data from GMEM to SMEM
    extern __shared__ int s_data[];

    int i = blockId * blockDim.x + threadIdx.x;

    if (i < n)
    {
        if (i == 0)
            s_data[threadIdx.x] = 0;
        else
            s_data[threadIdx.x] = in[i - 1];
        __syncthreads();

        // 2. Each block does scan with data on SMEM
        for (int stride = 1; stride < blockDim.x; stride *= 2)
        {
            int neededVal;
            if (threadIdx.x >= stride)
                neededVal = s_data[threadIdx.x - stride];
            __syncthreads();
            if (threadIdx.x >= stride)
                s_data[threadIdx.x] += neededVal;
            __syncthreads();
        }

        // 3. Each block write results from SMEM to GMEM
        out[i] = s_data[threadIdx.x];

        if (bSums != NULL)
        {
            if (threadIdx.x == 0)
            {
                bSums[blockId] = s_data[blockDim.x - 1];

                if (blockId > 0)
                {
                    while (bDoneCount < blockId) {}
                    bSums[blockId] += bSums[blockId - 1];
                    __threadfence();
                }
                bDoneCount += 1;
            }
            __syncthreads();

            if (i + blockDim.x < n)
                out[i + blockDim.x] += bSums[blockId];
        }
    }
}

__global__ void computeRank(uint32_t* in, int n, uint32_t* out, int* bits, int* nOnesBefore)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int nZeros = n - nOnesBefore[n - 1] - bits[n - 1];

    if (i < n)
    {
        int rank;
        if (bits[i] == 0)
            rank = i - nOnesBefore[i];
        else
            rank = nZeros + nOnesBefore[i];
        out[rank] = in[i];
    }
}

// Parallel Radix Sort
void sortByDevice(const uint32_t * in, int n, uint32_t * out, int blockSize)
{
    uint32_t* src = (uint32_t*)malloc(n * sizeof(uint32_t));
    // TODO
    int *d_bits, *d_nOnesBefore;
    size_t nBytes = n * sizeof(int);
    CHECK(cudaMalloc(&d_bits, nBytes));
    CHECK(cudaMalloc(&d_nOnesBefore, nBytes));

    uint32_t * d_in, * d_out;
    nBytes = n * sizeof(uint32_t);
    CHECK(cudaMalloc(&d_in, nBytes));
    CHECK(cudaMalloc(&d_out, nBytes));

    CHECK(cudaMemcpy(d_in, in, nBytes, cudaMemcpyHostToDevice));

    int gridSize = (n - 1) / blockSize + 1;

    int* d_bSums;
    if (gridSize > 1)
    {
        CHECK(cudaMalloc(&d_bSums, gridSize * sizeof(int)));
    }
    else
    {
        d_bSums = NULL;
    }
    size_t smem = blockSize * sizeof(int);

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
    // In each loop, sort elements according to the current bit from src to dst 
    // (using STABLE counting sort)
    for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
    {
        // Extract bits
        extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        // Dummy copy to delay, do nothing.
        CHECK(cudaMemcpy(src, d_in, sizeof(uint32_t) * n, cudaMemcpyDeviceToHost));

        // Compute nOnesBefore       
        scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        int zero = 0;
        CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
        CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

        // Compute rank and write to d_out
        computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());
        
        // Swap d_in and d_out
        uint32_t* temp = d_in;
        d_in = d_out;
        d_out = temp;
    }

    CHECK(cudaMemcpy(out, d_in, nBytes, cudaMemcpyDeviceToHost));
    

    // Free memory  
    CHECK(cudaFree(d_bits));
    CHECK(cudaFree(d_nOnesBefore));
    CHECK(cudaFree(d_in));
    CHECK(cudaFree(d_out));

    if (gridSize > 1)
        CHECK(cudaFree(d_bSums));

    free(src);
}

// Radix Sort
void sort(const uint32_t * in, int n, 
        uint32_t * out, 
        bool useDevice=false, int blockSize=1)
{
    GpuTimer timer; 
    timer.Start();

    if (useDevice == false)
    {
    	printf("\nRadix Sort by host\n");
        sortByHost(in, n, out);
    }
    else // use device
    {
    	printf("\nRadix Sort by device\n");
        sortByDevice(in, n, out, blockSize);
    }

    timer.Stop();
    printf("Time: %.3f ms\n", timer.Elapsed());
}

void printDeviceInfo()
{
    cudaDeviceProp devProv;
    CHECK(cudaGetDeviceProperties(&devProv, 0));
    printf("**********GPU info**********\n");
    printf("Name: %s\n", devProv.name);
    printf("Compute capability: %d.%d\n", devProv.major, devProv.minor);
    printf("Num SMs: %d\n", devProv.multiProcessorCount);
    printf("Max num threads per SM: %d\n", devProv.maxThreadsPerMultiProcessor); 
    printf("Max num warps per SM: %d\n", devProv.maxThreadsPerMultiProcessor / devProv.warpSize);
    printf("GMEM: %zu byte\n", devProv.totalGlobalMem);
    printf("SMEM per SM: %zu byte\n", devProv.sharedMemPerMultiprocessor);
    printf("SMEM per block: %zu byte\n", devProv.sharedMemPerBlock);
    printf("****************************\n");
}

void checkCorrectness(uint32_t * out, uint32_t * correctOut, int n)
{
    for (int i = 0; i < n; i++)
    {
        if (out[i] != correctOut[i])
        {
            printf("INCORRECT :(\n");
            return;
        }
    }
    printf("CORRECT :)\n");
}

void printArray(uint32_t * a, int n)
{
    for (int i = 0; i < n; i++)
        printf("%i ", a[i]);
    printf("\n");
}

int main(int argc, char ** argv)
{
    // PRINT OUT DEVICE INFO
    printDeviceInfo();

    // SET UP INPUT SIZE
    //int n = 50; // For test by eye
    int n = (1 << 24) + 1;
    printf("\nInput size: %d\n", n);

    // ALLOCATE MEMORIES
    size_t bytes = n * sizeof(uint32_t);
    uint32_t * in = (uint32_t *)malloc(bytes);
    uint32_t * out = (uint32_t *)malloc(bytes); // Device result
    uint32_t * correctOut = (uint32_t *)malloc(bytes); // Host result

    // SET UP INPUT DATA
    for (int i = 0; i < n; i++)
    {
        //in[i] = rand() % 255; // For test by eye
        in[i] = rand();
    }
    //printArray(in, n); // For test by eye

    // DETERMINE BLOCK SIZE
    int blockSize = 256; // Default 
    if (argc == 2)
        blockSize = atoi(argv[1]);

    // SORT BY HOST
    sort(in, n, correctOut);
    //printArray(correctOut, n); // For test by eye
    
    // SORT BY DEVICE
    sort(in, n, out, true, blockSize);
    //printArray(out, n); // For test by eye
    checkCorrectness(out, correctOut, n);

    // FREE MEMORIES
    free(in);
    free(out);
    free(correctOut);
    
    return EXIT_SUCCESS;
}

cross posting here

1 Like