pycuda._driver.LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered

milyklj · November 26, 2019, 1:59pm

Hallo,

I have a piece of very simple code written in Pycuda. Can someone tell me, why shouldn’t I set the index of array CC as “c = wA * %(BLOCK_SIZE)d * by + %(BLOCK_SIZE)d * bx”? For example, if I set the index of CC as 1 or 2 or 3, it can get the right value.

import pycuda.autoinit
from pycuda import driver, compiler, gpuarray, tools   
from pycuda.compiler import SourceModule 
import numpy as np

kernel_code_template = """

#include <cuComplex.h>

__device__ void ExtractVector(cuFloatComplex *mul_a, int &idt, cuFloatComplex *A){  
    for (int i = 0; i < %(MATRIX_SIZE_O)d; i ++)
    {
        A[i] = make_cuFloatComplex(0,0);
        A[i] = make_cuFloatComplex(cuCrealf(mul_a[i*%(tSize)d + idt]),cuCimagf(mul_a[i*%(tSize)d + idt]));
    }
    __syncthreads();

}

__global__ void MatrixMulKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *CC)
{
    const uint wA = %(MATRIX_SIZE_O)d;
    const uint wB = %(MATRIX_SIZE_O)d;

    // Block index
    const uint bx = blockIdx.x;
    const uint by = blockIdx.y;

    // Thread index
    const uint tx = threadIdx.x;
    const uint ty = threadIdx.y;

cuFloatComplex Csub = make_cuFloatComplex(0,0);
    
    __shared__ cuFloatComplex VectorA[%(MATRIX_SIZE_O)d];
    __shared__ cuFloatComplex C1[%(MATRIX_SIZE_O)d];
    
    const uint aBegin = wA * %(BLOCK_SIZE)d * by;
    const uint aEnd   = aBegin + wA - 1;
    const uint aStep = 1;
    const int bBegin = %(BLOCK_SIZE)d * bx;
    const uint bStep = wB;

    const uint c = wA * %(BLOCK_SIZE)d * by + %(BLOCK_SIZE)d * bx ;

    for (int index_r = 0; index_r < %(rSize)d; index_r ++)
    {   
        for (int index_t = 0; index_t < %(tSize)d; index_t ++)
        {
            ExtractVector(A, index_t, VectorA);
            for (int a = aBegin, b = bBegin;a <= aEnd;a += aStep, b += bStep)
            {
                Csub = cuCaddf(Csub,cuCmulf(VectorA[a], B[b]));
                __syncthreads();
            
            }
            CC[bx]  = make_cuFloatComplex(cuCrealf(Csub), cuCimagf(Csub));
            __syncthreads();  
        }   
    }
}
"""

theta = (np.arange(-80, 80, 40))/360*2*np.pi
tsize = theta.size
rmax = 300
r = np.arange(0, rmax, rmax/3)
rsize = r.size
Mt = 1                  
Mr = 6

MATRIX_SIZE_O = Mt*Mr
MATRIX_SIZE_I = 1
BLOCK_SIZE = 1
BLOCK_SIZE_x = 1
BLOCK_SIZE_y = 1
GRID_SIZE = MATRIX_SIZE_O

kernel_code = kernel_code_template % {
        'MATRIX_SIZE_O': MATRIX_SIZE_O,
        'BLOCK_SIZE': BLOCK_SIZE,
        'tSize': tsize,
        'rSize': rsize,
        }

a_cpu = 1 + 1j*1 + np.arange(0, tsize * MATRIX_SIZE_O)
a_cpu = a_cpu.reshape(MATRIX_SIZE_O,tsize).astype(np.complex64)
b_cpu = 1 + 1j*2 + np.arange(0, MATRIX_SIZE_O * MATRIX_SIZE_O)
b_cpu = b_cpu.reshape(MATRIX_SIZE_O,MATRIX_SIZE_O).astype(np.complex64)
bb_cpu = 1 + 1j*3 + np.arange(0, tsize * MATRIX_SIZE_O)
bb_cpu = bb_cpu.reshape(MATRIX_SIZE_O, tsize).astype(np.complex64)
f_cpu = 1 + 1j*4 + np.arange(0, rsize * MATRIX_SIZE_O)
f_cpu = f_cpu.reshape(MATRIX_SIZE_O,rsize).astype(np.complex64) 

Bcon = np.real(a_cpu).astype(np.float32) + 1j*np.imag(a_cpu).astype(np.float32)
a_gpu = gpuarray.to_gpu(Bcon)

Rinv = np.real(b_cpu).astype(np.float32) + 1j*np.imag(b_cpu).astype(np.float32)
b_gpu = gpuarray.to_gpu(Rinv)

FFT_cpu = np.real(f_cpu).astype(np.float32) + 1j*np.imag(f_cpu).astype(np.float32)
f_gpu = gpuarray.to_gpu(FFT_cpu)

B = np.real(bb_cpu).astype(np.float32) + 1j*np.imag(bb_cpu).astype(np.float32)
bb_gpu = gpuarray.to_gpu(B)

beta_gpu = gpuarray.empty((rsize,tsize), np.complex64)
c_gpu = gpuarray.empty((MATRIX_SIZE_O), np.complex64)

beta_cpu = np.zeros(shape=(rsize,tsize)).astype(np.complex64)
for index_r in range (0, rsize):
    for index_theta in range(0 , theta.size):
        c_cpu = np.dot(a_cpu[:, index_theta], b_cpu)
        beta_cpu[index_r, index_theta] = np.dot(c_cpu, f_cpu[:, index_r]) 
        #/ (np.dot(c_cpu, bb_cpu[:, index_theta]))
        #beta_cpu[index_r, index_theta] = np.dot(c_cpu, bb_cpu[:, index_theta])

mod = compiler.SourceModule(kernel_code)
matrixmul = mod.get_function("MatrixMulKernel")

matrixmul(
        # inputs
        a_gpu,
        b_gpu,
        # outputs
        c_gpu,

        # grid of multiple blocks
        grid = (GRID_SIZE, GRID_SIZE),
        # block of multiple threads
        block = (BLOCK_SIZE_x , BLOCK_SIZE_y , 1)
        )

# print the results
print("-" * 80)
print("Matrix A (GPU): ")
print(a_gpu.get())

print("-" * 80)
print("Matrix B (GPU): ")
print(bb_gpu.get())

print("-" * 80)
print("Matrix C (GPU): ")
print(c_gpu.get())

print("-" * 80)
print("Matrix C (CPU): ")
print(c_cpu)

print("-" * 80)
print("Matrix beta (GPU): ")
print(beta_gpu.get())

print("-" * 80)
print("Matrix beta (CPU): ")
print(beta_cpu)

print("-" * 80)
print("CPU-GPU Difference: ")
print(beta_cpu/beta_gpu.get())

Robert_Crovella · November 26, 2019, 4:57pm

You’re launching 6x6 blocks.

   grid = (GRID_SIZE, GRID_SIZE),

That means bx and by take on values from 0…5

wA is 6, BLOCK_SIZE is 1

c = wA * %(BLOCK_SIZE)d * by + %(BLOCK_SIZE)d * bx

c = 6 * 1 * 5 + 1 * 5 = 35

(at its largest)

You c_gpu array is allocated to hold a total size of 6 elements:

c_gpu = gpuarray.empty((MATRIX_SIZE_O), np.complex64)

So I guess you can’t index into an array of size 6, with an index of 35

milyklj · November 26, 2019, 7:19pm

Robert_Crovella:

Can someone tell me, why shouldn’t I set the index of array CC as “c = wA * %(BLOCK_SIZE)d * by + %(BLOCK_SIZE)d * bx”?

You’re launching 6x6 blocks.
   grid = (GRID_SIZE, GRID_SIZE),
That means bx and by take on values from 0…5

wA is 6, BLOCK_SIZE is 1

c = wA * %(BLOCK_SIZE)d * by + %(BLOCK_SIZE)d * bx

c = 6 * 1 * 5 + 1 * 5 = 35

(at its largest)

You c_gpu array is allocated to hold a total size of 6 elements:

c_gpu = gpuarray.empty((MATRIX_SIZE_O), np.complex64)

So I guess you can’t index into an array of size 6, with an index of 35

Thank you very much for the answer! I tried to change c_gpu into size 6*6, but the code still has the same bug report. I don´t understand why.

Robert_Crovella · November 26, 2019, 7:32pm

For me, your posted code doesn’t even compile.

In your kernel, you haven’t defined B anywhere:

Csub = cuCaddf(Csub,cuCmulf(VectorA[a], B[b]));
                                        ^


kernel.cu(51): error: identifier "B" is undefined

So this obviously isn’t the code you are running. I won’t be able to help with that.

milyklj · November 26, 2019, 8:06pm

Robert_Crovella:

For me, your posted code doesn’t even compile.

In your kernel, you haven’t defined B anywhere:
Csub = cuCaddf(Csub,cuCmulf(VectorA[a], B[b]));
                                        ^


kernel.cu(51): error: identifier "B" is undefined
So this obviously isn’t the code you are running. I won’t be able to help with that.

I am very sorry about that mistake! I added the B array to that Code.
I also tested the code with cuda-memcheck. It shows two kinds of bugs:

========= Invalid __shared__ read of size 8
=========     at 0x00000250 in MatrixMulKernel
=========     by thread (0,0,0) in block (0,5,0)
=========     Address 0x000000f0 is out of bounds

cuModuleUnload failed: unspecified launch failure
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuModuleUnload.

There are totally 38 errors. Why? This is a very simple code.

Robert_Crovella · November 26, 2019, 8:34pm

Note that in your now posted code, the size of c_gpu has not been corrected. That coding error is still present.

Invalid __shared__ read of size 8

Address 0x000000f0 is out of bounds

Your kernel code is reading from shared memory. The indexing is out of bounds.

You have an indexing error in your code. You should be able to find it using a method similar to what I pointed out for the global write to C.

Identify the size of the shared memory allocations (number of elements)
Identify all the locations in your kernel code, where you are reading from shared memory
For each location, determine the indexing patterns and the ranges of indices that result
Compare these patterns/ranges to the allocation sizes. You should find at least one example where the range of indexing somehow exceeds the allocation size

If you still need help, go through that 1-2-3-4 sequence, identify the work you have done, and the step where you are having trouble. Show all your work leading up to the step that you are having trouble with.

This code is so simple that it is a good opportunity to increase your debugging skills. This would be a very tedious process for longer codes, so there are tools available in CUDA C++ (cuda-memcheck with codes compiled with -lineinfo) that make this much much easier to find the offending line.

https://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy/27278218#27278218

Unfortunately I don’t know a way to get to similar information quickly with pycuda. But this code is so simple that you should be able to use the above 1-4 method to find the problem quickly, anyway.

classysdb · May 17, 2023, 10:38am

Hello @Robert_Crovella,

I am receiving the same error in a very weird way, whenever i use batch size =10 in below code i recieve this error,

        output = np.zeros((batch_size,512),dtype=np.float32,order="C")
        
        print("number of bytes in output array : ",output.nbytes)
        # print("output type : ",type(output))
        # print(output.shape)
        d_input = cuda.mem_alloc(im_batch.nbytes)
        d_output = cuda.mem_alloc(output.nbytes)
        
        bindings = [int(d_input), int(d_output)]

        # copy input to device, run inference, copy output to host
        cuda.memcpy_htod_async(d_input,im_batch)
        self.context.execute_v2(bindings=bindings)
        # cuda.Context.synchronize()
        cuda.memcpy_dtoh_async(output,d_output)

error message :


cuda.memcpy_dtoh_async(output,d_output)
pycuda._driver.LogicError: cuMemcpyDtoHAsync failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
[05/17/2023-15:58:38] [TRT] [E] 1: [defaultAllocator.cpp::deallocate::42] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [defaultAllocator.cpp::deallocate::42] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [defaultAllocator.cpp::deallocate::42] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaStream::47] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [defaultAllocator.cpp::deallocate::42] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [defaultAllocator.cpp::deallocate::42] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [defaultAllocator.cpp::deallocate::42] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)
[05/17/2023-15:58:38] [TRT] [E] 1: [defaultAllocator.cpp::deallocate::42] Error Code 1: Cuda Runtime (CUDA-capable device(s) is/are busy or unavailable)

This error comes whenever i set batch_size =10. please can you help me identify the cause of this behavior ?

Robert_Crovella · May 17, 2023, 1:50pm

Just like in the previous cases in this thread, the error is coming about due to device code (kernel) activity. In this case, the kernel is being launched by

    self.context.execute_v2(bindings=bindings)

which is from TensorRT. I won’t be able to help further here, I suggest you ask TensorRT questions on the forum for TensorRT (and I’m sure additional information would be needed.)

Topic		Replies	Views
pycuda._driver.LogicError: cuMemcpyDtoHAsync failed: an illegal memory access was encountered TensorRT tensorrt , cuda , kernel	3	835	May 18, 2023
[Pycuda] LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered CUDA Programming and Performance	4	2028	November 11, 2019
Error Code 1: Cuda Runtime (invalid argument) TensorRT tensorrt , cuda , cudnn	2	212	September 30, 2024
PyCUDA WARNING: a clean-up operation failed (dead context maybe?) CUDA Programming and Performance	2	3001	July 25, 2019
Why cannot run this program CUDA Programming and Performance	7	951	May 11, 2018
pycuda._driver.LogicError: explicit_context_dependent failed: invalid device context - no currently active context? TensorRT	7	3933	October 22, 2018
Code hangs... CUDA Programming and Performance	24	19899	August 18, 2010
[Solved] CUDA code works on Kepler but not on Maxwell CUDA Programming and Performance	10	3125	October 5, 2015
Cuda code performance CUDA Programming and Performance	14	3144	December 16, 2014
CUDA error, bandwithTest.exe CUDA Setup and Installation	12	2500	January 21, 2019

pycuda._driver.LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered

Related topics