# UNKNOWN_CUDA_ERROR from cuMemcpyDtoH and cuMemFree

Dear All,

Hi, I recently started using CUDA with Python to accelerate my MCMC application. I’m developing the code on Ubuntu 14.04 with an NVIDIA GeForce 950M. My CUDA Toolkit version is 8.0.61. I wrote the sample MCMC program below and ran it (Xorg was turned off using the ‘service lightdm stop’ command).

Code:

```python
import numpy as np
import numba as nb
from timeit import default_timer as timer
from matplotlib import pyplot as pt
import math
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_normal_float32

# Factorials 0! .. 20! as int64 (20! is the largest factorial that fits in a
# signed 64-bit integer), copied to the GPU once at import time.
# sirModule_weight indexes this table with the observation value y.
_FACTORIALS = [math.factorial(k) for k in range(21)]
LOOKUP_TABLE = cuda.to_device(np.array(_FACTORIALS, dtype='int64'))

@cuda.jit(device=True)
def arr_sum(arr):
    """Return the sum of all elements of the 1-D array ``arr``.

    Device function: the calling thread walks the whole array sequentially,
    so every thread that calls it performs the full reduction on its own.
    """
    result = 0
    for i in range(arr.size):
        result = result + arr[i]

    return result

@cuda.jit(device=True)
def dot(arr1, arr2):
    """Return the dot product of the 1-D arrays ``arr1`` and ``arr2``.

    Iterates over ``arr1.size`` elements, so ``arr2`` must be at least as
    long as ``arr1``. Device function: the calling thread walks both arrays
    sequentially.
    """
    result = 0
    for i in range(arr1.size):
        result = arr1[i] * arr2[i] + result

    return result

# Device helper. SIR calls it as arr_div(weight[ty][i], weight_sum) right after
# summing the weights, under the comment "normalize weight".
# NOTE(review): the function body is missing from this listing, so what it does
# with `arr` and `div` cannot be confirmed here -- presumably an in-place
# element-wise division; verify against the full script.
@cuda.jit(device=True)
def arr_div(arr, div):

# Device helper called from SIR once per timestep, with the previous particle
# row as `inp` and the current particle row as `out`.
# NOTE(review): this listing is incomplete -- the statements that assign
# `candidate` (and presumably index `rng_states`, read `inp` and write `out`)
# are not shown, so how the per-thread index is derived cannot be checked here.
@cuda.jit(device=True)
def sirModule_sample_draw(rng_states, inp, beta, omega, out):
"""Find a value less than 1 from a normal distribution."""

# draw candidate sample from normal distribution and store
# when less than 1
# rejection loop: only exits once a candidate below 1 has been produced
while True:

if candidate < 1:
break

# Device helper that computes two factors, `p1_div_p3` and `p2`, from the
# calling thread's current and previous particle values.
# NOTE(review): this listing is incomplete -- `thread_id` is never assigned, and
# `weight`, `out` and `PI` are not used in the visible lines, so the statement
# that combines the factors and stores the new weight is presumably missing.
@cuda.jit(device=True)
def sirModule_weight(current, previous, weight, out, y, beta, omega, gamma):
PI = 3.14159265359

# calculate the pdf/pmf of given state
# Z: distance of the current value from (previous + beta), in units of omega
Z = ( current[thread_id] - ( previous[ thread_id ] + beta ) ) / omega
# NOTE(review): the standard normal CDF is 0.5 * (1 + erf(z / sqrt(2))); Z is
# not divided by sqrt(2) here -- confirm whether that is intended.
p1_div_p3 = 1.0 / 2.0 * ( 1.0 + math.erf( Z ) )

# mu = log(1 + exp(gamma * x)), i.e. the softplus of gamma * current value
mu = math.log( 1 + math.exp( gamma * current[ thread_id ] ) )
# NOTE(review): LOOKUP_TABLE holds factorials, so this resembles a Poisson pmf,
# which is exp(-mu) * mu**y / y!; the exponent here is +mu -- confirm the sign.
# LOOKUP_TABLE has 21 entries, so y must stay within 0..20.
p2 = math.exp( mu ) * mu**y / LOOKUP_TABLE[ y ]

# Device helper that stores 0.5 * (1 + erf(Z)) into phi[thread_id], where Z is
# the distance of 1 from (current value + beta) in units of omega.
# NOTE(review): this listing is incomplete -- `thread_id` is never assigned, and
# `phi_sub` is not written in the visible lines although the comment below
# mentions subtracting from 1; a trailing statement is presumably missing.
@cuda.jit(device=True)
def sirModule_tau(current, beta, omega, phi, phi_sub):

# calculate phi distribution and subtract from 1
Z = ( 1 - ( current[ thread_id ] + beta ) ) / omega
# NOTE(review): the standard normal CDF is 0.5 * (1 + erf(z / sqrt(2))); Z is
# not divided by sqrt(2) here -- confirm whether that is intended.
phi[ thread_id ] = 1.0 / 2.0 * ( 1.0 + math.erf( Z ) )

# Kernel entry point. main() launches it as SIR[event_number, pn], i.e. one
# block per event and one thread per particle. For each timestep it draws new
# particles, recomputes and normalizes their weights, evaluates phi and then
# updates the per-event `greater` and `equal` series.
# NOTE(review): in the visible code there is no cuda.syncthreads() between the
# per-timestep helper calls and the block-wide reads in arr_sum()/dot(), and
# every thread of a block executes the whole loop and writes the same
# greater/equal elements -- this looks like a data race; confirm against the
# full script.
@cuda.jit
def SIR(rng_states, y, particles, weight, beta, omega, gamma,
greater, equal, phi, phi_sub):
# thread/block index for accessing data
tx = cuda.threadIdx.x # Thread id in a 1D block = particle index
ty = cuda.blockIdx.x # Block id in a 1D grid = event index
bw = cuda.blockDim.x # Block width, i.e. number of threads per block = particle number
pos = tx + ty * bw # computed flattened index inside the array
# NOTE(review): tx, bw and pos are not used in the visible body, and none of
# them is passed to the device functions, which reference an undefined
# `thread_id`. If the helpers derive a grid-wide index themselves, indexing the
# pn-long rows with it would run out of bounds -- verify.

# observation row for this block's event
y_current = y[ ty ]
tn = y_current.size

# iterator over timestep
# starts at 1 because every step reads the values stored at step i-1
for i in range(1, tn):
# draw samples
sirModule_sample_draw(rng_states, particles[ty][i-1], beta,
omega, particles[ty][i])

# get weight
sirModule_weight(particles[ty][i], particles[ty][i-1], weight[ty][i-1],
weight[ty][i], y_current[i], beta, omega, gamma)

# normalize weight
weight_sum = arr_sum(weight[ty][i])
arr_div(weight[ty][i], weight_sum)

# calculate tau
# NOTE(review): main() allocates a single phi/phi_sub pair that is passed to
# every block, so concurrently running blocks would overwrite each other's
# values -- confirm whether per-event buffers are needed.
sirModule_tau(particles[ty][i], beta, omega, phi, phi_sub)

# update greater and equal
# both updates scale greater[ty][i-1] by a dot product with the previous
# step's weights (weight[ty][i-1])
greater[ty][i] = greater[ty][i-1]*dot(weight[ty][i-1], phi)
equal[ty][i] = greater[ty][i-1]*dot(weight[ty][i-1], phi_sub)

def main():
    """Allocate the model arrays on the GPU, launch the SIR kernel and time it.

    The kernel is launched with one block per event and one thread per
    particle. The reported time covers the first call, so it also includes
    Numba's JIT compilation of the kernel.
    """
    # model parameters handed straight to the kernel
    beta = 1
    omega = 1
    gamma = 2

    pn = 100            # particles per event = threads per block
    event_number = 50   # events = blocks in the grid
    timestep = 100

    # y stays a host array; Numba transfers it around the kernel call
    y = np.ones((event_number, timestep), dtype=np.int8)
    particles = cuda.to_device(
        np.zeros((event_number, timestep, pn), dtype=np.float32))
    weight = cuda.to_device(
        np.ones((event_number, timestep, pn), dtype=np.float32))
    greater = cuda.to_device(
        np.ones((event_number, timestep), dtype=np.float32))
    equal = cuda.to_device(
        np.ones((event_number, timestep), dtype=np.float32))

    # One entry per particle (same length as particles[0][0]).
    phi = cuda.to_device(np.zeros(pn, dtype=np.float32))
    phi_sub = cuda.to_device(np.zeros(pn, dtype=np.float32))

    # NOTE(review): only pn RNG states are created while the launch below runs
    # event_number * pn threads. If the device code indexes rng_states with a
    # grid-wide thread index this reads out of bounds -- confirm and, if so,
    # allocate event_number * pn states.
    rng_states = create_xoroshiro128p_states(pn, seed=1)

    start = timer()
    SIR[event_number, pn](rng_states, y, particles, weight, beta,
                          omega, gamma, greater, equal, phi, phi_sub)
    # Kernel launches are asynchronous: wait for the GPU to finish before
    # reading the clock, otherwise only the launch overhead is measured.
    cuda.synchronize()
    elapsed = timer() - start

    print("sirModule1 took %f seconds" % elapsed)

# Run the benchmark only when this file is executed as a script.
if __name__ == '__main__':
    main()
``````

When I run the code, I get the following error at the call SIR[event_number, pn](rng_states, y, particles, weight, beta, omega, gamma, greater, equal, phi, phi_sub). Both UNKNOWN_CUDA_ERROR reports, from cuMemcpyDtoH and cuMemFree, seem to be associated with memory, and someone suggested that I disable X so that the GPU does not have to render the GUI. I did so, but it did not resolve the issue. Has anyone faced the same problem? Thanks in advance for any response.

```
Traceback (most recent call last):
File "CUDA_MonteCarlo_Testesr.py", line 214, in <module>
main()
File "CUDA_MonteCarlo_Testesr.py", line 207, in main
omega, gamma, greater, equal, phi, phi_sub)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 703, in __call__
cfg(*args)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 483, in __call__
sharedmem=self.sharedmem)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 585, in _kernel_call
wb()
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 600, in <lambda>
retr.append(lambda: devary.copy_to_host(val, stream=stream))
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/devicearray.py", line 198, in copy_to_host
_driver.device_to_host(hostary, self, self.alloc_size, stream=stream)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 1597, in device_to_host
fn(host_pointer(dst), device_pointer(src), size, *varargs)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 288, in safe_cuda_api_call
self._check_error(fname, retcode)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 323, in _check_error
raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [715] Call to cuMemcpyDtoH results in UNKNOWN_CUDA_ERROR
Traceback (most recent call last):
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/utils.py", line 647, in _exitfunc
f()
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/utils.py", line 571, in __call__
return info.func(*info.args, **(info.kwargs or {}))
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 1099, in deref
mem.free()
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 1013, in free
self._finalizer()
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/utils.py", line 571, in __call__
return info.func(*info.args, **(info.kwargs or {}))
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 863, in core