[Pycuda] LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered

Hello,

I have a problem: in the second kernel, “DotKernel”, I cannot change the values of any shared or global array.

import pycuda.autoinit
from pycuda import driver, compiler, gpuarray, tools
from pycuda.compiler import SourceModule

import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import time
from scipy import signal
from numba import jit
import numba as nb

kernel_code_template = """

#include <cuComplex.h>

__global__ void OuterKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *BETA2, cuFloatComplex *C)
{
    const uint wB = %(MATRIX_SIZE_O)d;
    const uint bx = blockIdx.x;
    const uint by = blockIdx.y;
    const uint tx = threadIdx.x;
    const uint ty = threadIdx.y;

    __shared__ cuFloatComplex A_y ;
    __shared__ cuFloatComplex B_y ;
    __shared__ cuFloatComplex beta2;
    __shared__ cuFloatComplex C_add;

    const uint c = wB  *by +  bx;

    for (int index_r = 0; index_r < %(rSize)d; index_r ++)
    {
        for (int wA = 1; wA < %(tSize)d+1; wA ++)
        {
            const uint aBegin = %(tSize)d * %(BLOCK_SIZE)d * by;
            const uint aEnd   = aBegin + wA -1 ;
            const uint aStep = %(BLOCK_SIZE)d;
            const int bBegin = %(tSize)d * %(BLOCK_SIZE)d * bx;
            const uint bStep = %(BLOCK_SIZE)d ;

            for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
            {
                A_y = make_cuFloatComplex(cuCrealf(A[a + wB*ty + tx]),cuCimagf(A[a + wB*ty + tx]));
                B_y = make_cuFloatComplex(cuCrealf(B[b + wB*ty + tx]),cuCimagf(B[b + wB*ty + tx]));
                C_add= cuCmulf(A_y,B_y);                                                 
            }

            beta2 = make_cuFloatComplex(cuCrealf(BETA2[%(tSize)d * index_r+wA-1]),cuCimagf(BETA2[%(tSize)d * index_r+wA-1]));
            C_add = cuCmulf(C_add, beta2);
            C[c] = cuCaddf(C[c], C_add);
        }
    }
}

__global__ void DotKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *BETA, cuFloatComplex *C, cuFloatComplex *FFT,cuFloatComplex *CC)
{   
    const uint wA = %(MATRIX_SIZE_I)d;
    const uint wB = %(MATRIX_SIZE_O)d;

    const uint bx = blockIdx.x;
    const uint by = blockIdx.y;

    const uint aBegin = wA * %(BLOCK_SIZE)d * by;
    const uint aEnd   = aBegin + wA - 1;
    const uint aStep = 1;
    const int bBegin = %(BLOCK_SIZE)d * bx;
    const uint bStep = wB;

    cuFloatComplex Csub = make_cuFloatComplex(0,0);
    cuFloatComplex Csub_1 = make_cuFloatComplex(0,0);
    cuFloatComplex Csub_2 = make_cuFloatComplex(0,0);

    __shared__ cuFloatComplex A_global[%(MATRIX_SIZE_I)d];
    __shared__ cuFloatComplex C1[%(MATRIX_SIZE_I)d];

    for (int index_r = 0; index_r < %(rSize)d; index_r ++)
    {   
        for (int index_t = 0; index_t < %(tSize)d; index_t ++)
        {
            for (int i = 0; i < %(MATRIX_SIZE_I)d; i ++)
            {
                A_global[i] = make_cuFloatComplex(cuCrealf(B[i*%(tSize)d + index_t]),cuCimagf(B[i*%(tSize)d + index_t]));
            }
            __syncthreads();

            Csub = make_cuFloatComplex(0,0);

            for (int a = aBegin, b = bBegin;a <= aEnd;a += aStep, b += bStep)
            {
                Csub = cuCaddf(Csub,cuCmulf(A_global[a], C[b]));
                __syncthreads();
            }

            const uint c = wB * %(BLOCK_SIZE)d * by + %(BLOCK_SIZE)d * bx;
            C1[c]  = make_cuFloatComplex(cuCrealf(Csub), cuCimagf(Csub));
            __syncthreads();   

            for (int i = 0; i < %(MATRIX_SIZE_I)d; i ++)
            {
                A_global[i] = make_cuFloatComplex(0,0);
                A_global[i] = make_cuFloatComplex(cuCrealf(FFT[i*%(rSize)d + index_r]),cuCimagf(FFT[i*%(rSize)d + index_r]));
            }

            CC[c] = make_cuFloatComplex(0,0);
            __syncthreads();

            CC[c] = cuCmulf(C1[c],A_global[c]);
            Csub_1 = cuCmulf(Csub,A_global[c]);
            __syncthreads();

            BETA[index_r * %(tSize)d + index_t] = make_cuFloatComplex(0,0);

            Csub_1 = make_cuFloatComplex(0,0);

            for (int i = 0; i<%(MATRIX_SIZE_I)d; i+=1)
            {
                Csub_1 = cuCaddf(Csub_1, CC[i]);
                //BETA[index_r* %(tSize)d + index_t] = cuCaddf(BETA[index_r * %(tSize)d + index_t],CC[i]);
            }

            for (int i = 0; i < %(MATRIX_SIZE_I)d; i ++)
            {
                A_global[i] = make_cuFloatComplex(0,0);
                A_global[i] = make_cuFloatComplex(cuCrealf(A[i*%(tSize)d + index_t]),cuCimagf(A[i*%(tSize)d + index_t]));
            }

            CC[c] = make_cuFloatComplex(0,0);
            __syncthreads();
            //CC[c] = make_cuFloatComplex(cuCrealf(A_global[c]),cuCimagf(A_global[c]));
            CC[c] = cuCmulf(C1[c],A_global[c]);
            __syncthreads();

            BETA[index_r * %(tSize)d + index_t] = make_cuFloatComplex(0,0);

            Csub_2 = make_cuFloatComplex(0,0);

            for (int i = 0; i<%(MATRIX_SIZE_I)d; i+=1)
            {
                Csub_2 = cuCaddf(Csub_2, CC[i]);
                //BETA[index_r* %(tSize)d + index_t] = cuCaddf(BETA[index_r * %(tSize)d + index_t],CC[i]);
            }           

            //BETA[index_r* %(tSize)d + index_t] =  make_cuFloatComplex(cuCrealf(Csub_2), cuCimagf(Csub_2));
            BETA[index_r* %(tSize)d + index_t] =  cuCdivf(Csub_1,Csub_2);

        }   
    }

}
"""

tic = time.time()

iaa_iter = 4       
c = 3e8             
fc = 3.15e9             
lam = c/fc           

Mt = 7                 
Mr = 5               

rmax = 300             

theta = (np.arange(-80, 80, 0.5))/360*2*np.pi
r = np.arange(0, rmax, rmax/100)

tsize = theta.size
rsize = r.size

fmcw_fft  = np.arange(0,Mt*Mr*theta.size) + 1j*1
fmcw_fft = fmcw_fft.reshape(Mt*Mr, theta.size).astype(np.complex64)

VX = np.zeros(Mt*Mr)
for i in range(1, Mt*Mr + 1):
    VX[i-1] = i*lam/2

B = np.zeros((Mt*Mr, theta.size), dtype=complex)

for index_theta in range(0, theta.size, 1):
    alpha = np.exp(1j*2*np.pi*fc*np.sin(theta[index_theta])/c*VX)

    if iaa_iter == 0:
        B[:, index_theta] = np.multiply(alpha, signal.nuttall(Mr*Mt))
    else:
        B[:, index_theta] = alpha

beta = np.zeros((r.size, theta.size), dtype=complex)

for index_r in range(0, r.size):
    for index_theta in range(0, theta.size):
        beta[index_r, index_theta] = np.dot(np.conj(B[:, index_theta]), fmcw_fft[:, index_r]) / np.dot(np.conj(B[:, index_theta]), B[:, index_theta]) 

RZero = np.zeros((Mr*Mt, Mr*Mt), dtype=complex)

beta = beta.astype(np.complex64)
BETA2 = (np.square(np.abs(beta)) + 1j*0).astype(np.complex64)
Bcon = np.conj(B)

MATRIX_SIZE_O  = 35
MATRIX_SIZE_I  = 35
BLOCK_SIZE = 1
GRID_X =  MATRIX_SIZE_O
GRID_Y = MATRIX_SIZE_O

B = np.real(B).astype(np.float16) + 1j*np.imag(B).astype(np.float16)
a_cpu = B
Bcon = np.real(Bcon).astype(np.float16) + 1j*np.imag(Bcon).astype(np.float16)
b_cpu = Bcon
FFT_cpu = np.real(fmcw_fft).astype(np.float16) + 1j*np.imag(fmcw_fft).astype(np.float16)

a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
BETA2_gpu = gpuarray.to_gpu(BETA2)
c_gpu = gpuarray.empty((MATRIX_SIZE_O, MATRIX_SIZE_O), np.complex64)
FFT_gpu = gpuarray.to_gpu(FFT_cpu)

kernel_code = kernel_code_template % {
        'MATRIX_SIZE_I': MATRIX_SIZE_I,
        'MATRIX_SIZE_O': MATRIX_SIZE_O,
        'BLOCK_SIZE': BLOCK_SIZE,
        'tSize': tsize,
        'rSize': rsize,
        }

mod = compiler.SourceModule(kernel_code)
outer = mod.get_function("OuterKernel")

t1 = time.time()
outer(
        a_gpu, b_gpu, BETA2_gpu,
        c_gpu,
        grid = (GRID_X,  GRID_Y),
        block = (BLOCK_SIZE, BLOCK_SIZE, 1),
        )

t2 = time.time()
t_gpu = t2-t1

print('INFORMATION:',pycuda.driver.mem_get_info())
print('t_gpu1 = ', t_gpu)

aa_gpu = gpuarray.to_gpu(a_cpu)
bb_gpu = gpuarray.to_gpu(b_cpu)
BETA2_gpu = gpuarray.empty((rsize, tsize), np.complex64) 

rinv_gpu = gpuarray.empty((MATRIX_SIZE_O, MATRIX_SIZE_O), np.complex64)
rinv_gpu = gpuarray.to_gpu(c_gpu.get())

cc_gpu = gpuarray.empty((1,MATRIX_SIZE_O), np.complex64)

mod = compiler.SourceModule(kernel_code)
dot = mod.get_function("DotKernel")

t1 = time.time()       
dot(
    aa_gpu,
    bb_gpu,
    BETA2_gpu,
    rinv_gpu,
    FFT_gpu,
    cc_gpu,
    grid = (GRID_X,  GRID_Y),
    block = (BLOCK_SIZE, BLOCK_SIZE, 1),)
t2 = time.time()
t_gpu = t2-t1

print('t_gpu2 = ', t_gpu)

beta = BETA2_gpu.get()
toc = time.time()
print(toc-tic)

beta[1:12, :] = 0
plt.contourf(np.outer(r, np.sin(theta)), np.outer(r, np.cos(theta)), 20*np.log10(np.abs(beta)))
plt.show()

I guess this isn’t the code you are actually running.

Could you please run it again? I just corrected the variable.

Still doesn’t work. PyCUDA doesn’t like trying to copy count to count_gpu.

In the future, I suggest you actually provide runnable code. Do what I would do: after posting your code, copy what you have posted into a new file and try to run it there. If you can’t run it, I won’t be able to run it either.

After removing the offending code around count_gpu and running your code under cuda-memcheck, it appears your DotKernel is making illegal (out-of-bounds) accesses to shared memory.

========= CUDA-MEMCHECK
========= Invalid __shared__ read of size 8
=========     at 0x00000608 in DotKernel
=========     by thread (0,0,0) in block (13,10,0)
=========     Address 0x00000af0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x24d9dd]
=========     Host Frame:/usr/lib64/python2.7/site-packages/pycuda-2017.1.1-py2.7-linux-x86_64.egg/pycuda/_driver.so (_ZN6pycuda8function13launch_kernelEN11pycudaboost6python5tupleES3_NS2_3api6objectEjS5_ + 0x4ec) [0x115c1c]
=========     Host Frame:/usr/lib64/python2.7/site-packages/pycuda-2017.1.1-py2.7-linux-x86_64.egg/pycuda/_driver.so (_ZN11pycudaboost6python7objects23caller_py_function_implINS0_6detail6callerIMN6pycuda8functionEFvNS0_5tupleES7_NS0_3api6objectEjS9_ENS0_21default_call_policiesENS_3mpl7vector7IvRS6_S7_S7_S9_jS9_EEEEEclEP7_objectSK_ + 0x156) [0x111a86]
=========     Host Frame:/usr/lib64/python2.7/site-packages/pycuda-2017.1.1-py2.7-linux-x86_64.egg/pycuda/_driver.so (_ZNK11pycudaboost6python7objects8function4callEP7_objectS4_ + 0xca) [0x13af1a]
=========     Host Frame:/usr/lib64/python2.7/site-packages/pycuda-2017.1.1-py2.7-linux-x86_64.egg/pycuda/_driver.so [0x13b288]
=========     Host Frame:/usr/lib64/python2.7/site-packages/pycuda-2017.1.1-py2.7-linux-x86_64.egg/pycuda/_driver.so (_ZNK11pycudaboost6python6detail17exception_handlerclERKNS_9function0IvEE + 0x63) [0x1629e3]
=========     Host Frame:/usr/lib64/python2.7/site-packages/pycuda-2017.1.1-py2.7-linux-x86_64.egg/pycuda/_driver.so (_ZN11pycudaboost6detail8function21function_obj_invoker2INS_3_bi6bind_tIbNS_6python6detail19translate_exceptionIN6pycuda5errorEPFvRKS9_EEENS3_5list3INS_3argILi1EEENSG_ILi2EEENS3_5valueISD_EEEEEEbRKNS6_17exception_handlerERKNS_9function0IvEEE6invokeERNS1_15function_bufferESP_ST_ + 0x13) [0x104d23]
=========     Host Frame:/usr/lib64/python2.7/site-packages/pycuda-2017.1.1-py2.7-linux-x86_64.egg/pycuda/_driver.so (_ZN11pycudaboost6python21handle_exception_implENS_9function0IvEE + 0x3d) [0x16277d]
=========     Host Frame:/usr/lib64/python2.7/site-packages/pycuda-2017.1.1-py2.7-linux-x86_64.egg/pycuda/_driver.so [0x139963]
=========     Host Frame:/lib64/libpython2.7.so.1.0 (PyObject_Call + 0x43) [0x4a0f3]
=========     Host Frame:/lib64/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x1d4c) [0xde43c]
=========     Host Frame:/lib64/libpython2.7.so.1.0 (PyEval_EvalCodeEx + 0x7ed) [0xe229d]
=========     Host Frame:/lib64/libpython2.7.so.1.0 [0x6f26d]
=========     Host Frame:/lib64/libpython2.7.so.1.0 (PyObject_Call + 0x43) [0x4a0f3]
=========     Host Frame:/lib64/libpython2.7.so.1.0 [0x59105]
=========     Host Frame:/lib64/libpython2.7.so.1.0 (PyObject_Call + 0x43) [0x4a0f3]
=========     Host Frame:/lib64/libpython2.7.so.1.0 [0xa15c7]
=========     Host Frame:/lib64/libpython2.7.so.1.0 (PyObject_Call + 0x43) [0x4a0f3]
=========     Host Frame:/lib64/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x1d4c) [0xde43c]
=========     Host Frame:/lib64/libpython2.7.so.1.0 (PyEval_EvalCodeEx + 0x7ed) [0xe229d]
=========     Host Frame:/lib64/libpython2.7.so.1.0 (PyEval_EvalCode + 0x32) [0xe23a2]
=========     Host Frame:/lib64/libpython2.7.so.1.0 [0xfb7ef]
=========     Host Frame:/lib64/libpython2.7.so.1.0 (PyRun_FileExFlags + 0x7e)
This is a coding problem. You should inspect all read operations from shared memory in that kernel, to see which are out-of-bounds.
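
For illustration only (this sketch is mine, not from the thread), here is one way to check those indices on the host, using the sizes from the posted script (MATRIX_SIZE_O = MATRIX_SIZE_I = 35, BLOCK_SIZE = 1, grid = (35, 35)). DotKernel reads the 35-element shared arrays A_global and C1 at the index c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx, so every block with by > 0 indexes past the end of those arrays:

# Host-side sketch (hypothetical helper, not part of the original post):
# enumerate the blocks whose index c would read past the 35-element
# __shared__ arrays A_global and C1 in DotKernel.
MATRIX_SIZE_O = 35   # wB in the kernel
MATRIX_SIZE_I = 35   # length of the __shared__ arrays
BLOCK_SIZE = 1

offenders = [(bx, by, MATRIX_SIZE_O * BLOCK_SIZE * by + BLOCK_SIZE * bx)
             for by in range(MATRIX_SIZE_O)          # grid = (35, 35)
             for bx in range(MATRIX_SIZE_O)
             if MATRIX_SIZE_O * BLOCK_SIZE * by + BLOCK_SIZE * bx >= MATRIX_SIZE_I]

print("first offending block (bx, by, c):", offenders[0])   # (0, 1, 35)
print("number of offending blocks:", len(offenders))        # 1190 of the 1225 blocks

That is consistent with the memcheck report: block (13,10,0) gives c = 35*10 + 13 = 363, far past the end of a 35-element shared array.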

You’re launching 1 thread per block? You won’t get any interesting performance out of the GPU that way.
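
Purely as an illustration (not a drop-in fix, since both kernels would first have to be rewritten to index with threadIdx), a launch that covers the 35x35 output with multi-thread blocks would look something like this:

# Hypothetical launch shape only; the kernels above do not support it as written.
TILE = 16
grid_x = (MATRIX_SIZE_O + TILE - 1) // TILE   # ceil(35 / 16) = 3
grid_y = (MATRIX_SIZE_O + TILE - 1) // TILE   # ceil(35 / 16) = 3
# outer(a_gpu, b_gpu, BETA2_gpu, c_gpu,
#       grid=(grid_x, grid_y), block=(TILE, TILE, 1))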

Yes, I launch 1 thread per block because the algorithm needs some shared arrays and the matrix keeps changing its shape, so I can't split the matrix appropriately. I ran both kernels separately and each works well on its own, but when I put them together, there is the memory problem.