I have the following code:
#include <cuda.h>
#include <cutil_inline.h>

#include “MersenneTwister.h”
#include “MersenneTwister_kernel.cu”

//ceil(a /b)
extern “C” int iDivUp(int a, int b){
return ((a % b) != 0) ? (a / b + 1) : (a / b);

//floor(a / b)
extern “C” int iDivDown(int a, int b){
return a / b;

//Align a to nearest higher multiple of b
extern “C” int iAlignUp(int a, int b){
return ((a % b) != 0) ? (a - a % b + b) : a;

//Align a to nearest lower multiple of b
extern “C” int iAlignDown(int a, int b){
return a - a % b;

const int PATH_N = 24000000;
const int N_PER_RNG = iAlignUp(iDivUp(PATH_N, MT_RNG_COUNT), 2);

int N = 1000;

float *Rand_h;
Rand_h = (float )malloc(Nsizeof(float));

float Rand_d;
error = cudaMalloc((void **) &Rand_d, N
if (error != cudaSuccess){
std::cout << “Failed at malloc:Rand_d.\n”;
return 0;
error = cudaMemcpy(Rand_d, Rand_h, sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess){
std::cout << “Failed at memcpy:Rand_d.\n”;
return 0;

RandomGPU<<<32, 128>>>(Rand_d, N_PER_RNG);

error = cudaMemcpy(Rand_h, Rand_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
if (error != cudaSuccess){
std::cout << “Failed at memcpy:Rand_h.\n”;
return 0;

The code compiles, but when I run it i get the following error:
Failed at memcpy:Rand_h.

Any suggestions or help would be greatly appreciated. Thanks.

What’s the error string from cudaGetErrorString?

Try something like:

void checkCUDAError(const char *msg)


	cudaError_t err = cudaGetLastError();

	if( cudaSuccess != err) 


		fprintf(stderr, "Cuda error: %s: %s.\n", msg, 

								  cudaGetErrorString( err) );




I tried it. The error message is “unspecified launch failure”.

I’m guessing that the ULF is coming from the kernel launch.

Try adding a cudaThreadSynchronize() after your kernel launch and checking for errors there…

Something like:

// launch kernel 

RandomGPU<<<32, 128>>>(Rand_d, N_PER_RNG); 

// block until the device has completed


// check if kernel execution generated an error

 // Check for any CUDA errors 

checkCUDAError("kernel invocation");

If your kernel is causing the ULF, it is probably segfaulting. A common cause of this is out of bounds memory access so check there first…You could try running in emulation mode to see if that catches the problem. If possible, run the emulation executable through Valgrind to check for memory errors.

I think the error is coming from

RandomGPU<<<32, 128>>>(Rand_d, N_PER_RNG);

Since when I disabled it, the error disappeared. I tried with


but it does not fix the error. I think every kernel should finish before the next one starts, without having to write explicit synchronization code for that.

The cudaThreadSynchronize is there to ensure that the kernel is finished before you check for errors, without it, cudaGetErrorString may not return the correct value.

The cudaThreadSynchronize will not fix the problem, as I suggested earlier…you most likely have something crashing your kernel and causing the ULF. You will have to find out what is causing the crash and fix it.