cudaMemcpy not working without device emulation mode

Hi Folks,

I am a newbie to CUDA and am stuck while playing around with arrays.

All I am trying to do is copy a one-dimensional array from host to device, multiply each element of the array by a scalar value, and return the results.


Src : 5, 10, 20, 30, 40

Scalar Val : 10

Exp Result: 50, 100, 200, 300, 400

It works fine when I compile in device emulation mode, but when that flag is turned off it just doesn’t: it always returns zeros.

Before I go ahead I would like to post the code below

Kernel Code

/*
 * Element-wise scale: resElems[i] = srcElems[i] * multiplier.
 *
 * Launch layout: one thread per element, 1D grid of 1D blocks. The kernel
 * receives no element count and therefore performs NO bounds check; the
 * total number of launched threads (gridDim.x * blockDim.x) must equal the
 * number of elements in both arrays.
 *
 * srcElems   - device pointer to the input values (read only here).
 * resElems   - device pointer that receives the products.
 * multiplier - integer scalar applied to every element.
 *
 * NOTE(review): the arithmetic is done in double precision. The author of
 * this post reports all-zero results on a device without double support;
 * confirm the target GPU / -arch setting supports double before relying
 * on this kernel.
 */
__global__ void multiply(double* srcElems, double* resElems, int multiplier){

	// Flat global index. For a single-block launch blockIdx.x is 0, so this
	// reduces to threadIdx.x; with several blocks each thread still gets a
	// distinct element instead of every block rewriting the same ones.
	int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;

	resElems[threadIndex] = srcElems[threadIndex] * multiplier;
}



C host

/*
 * Prints a diagnostic if the CUDA runtime has a pending error.
 *
 * msg - caller-supplied description of the step that was just attempted;
 *       it is echoed in front of the runtime's own error string.
 *
 * Nothing is printed when the last recorded status is cudaSuccess. The
 * function only reports; it neither aborts nor returns the status.
 */
void checkCudaError(const char *msg)
{
	// cudaGetLastError() returns the last recorded error and resets it,
	// so each call reports only errors raised since the previous check.
	cudaError_t err = cudaGetLastError();

	if( cudaSuccess != err)
	{
		printf( "\nCUDA error> %s %s.\n", msg, cudaGetErrorString( err) );
	}
}




/*
 * Multiplies each of the `size` values in srcArr by `multiplier` on the GPU.
 *
 * srcArr     - host array holding `size` doubles (not modified).
 * multiplier - integer scalar applied to every element.
 * size       - number of elements; also used as the thread count of the
 *              single block that is launched.
 *
 * Returns a newly calloc'ed host array of `size` doubles that the caller
 * must release with free(), or NULL when srcArr is NULL, size is not
 * positive, or the host allocation fails.
 *
 * CUDA failures are reported through checkCudaError() after each step but
 * do not stop the sequence; in that case the returned buffer may still
 * hold the zeros written by calloc.
 */
double* multiplyArray(const double* srcArr, int multiplier, int size){

	double* hostResElements = NULL;
	double* deviceSrcElements = NULL;
	double* deviceResElements = NULL;
	size_t memSize;
	int index;

	if (srcArr == NULL || size <= 0) {
		return NULL;
	}

	// Computed in size_t so a large element count cannot overflow an int.
	memSize = (size_t)size * sizeof(double);

	hostResElements = (double*)calloc((size_t)size, sizeof(double));
	if (hostResElements == NULL) {
		return NULL;
	}

	for(index=0; index<size; index++){
		printf("\nSrc [%d] : %f", index, srcArr[index]);
	}

	cudaMalloc((void**) &deviceSrcElements, memSize);
	checkCudaError("Memory Allocated to device src elements");

	cudaMalloc((void**) &deviceResElements, memSize);
	checkCudaError("Memory Allocated to device target elements");

	cudaMemcpy(deviceSrcElements, srcArr,  memSize, cudaMemcpyHostToDevice);
	checkCudaError("Memory copied from host src to device src");

	// One block with one thread per element: `size` must not exceed the
	// device's per-block thread limit, otherwise the launch is rejected
	// and the check below reports it.
	multiply<<<1, size>>>(deviceSrcElements, deviceResElements, multiplier);
	checkCudaError("Kernel code invoked");

	// Blocking copy: it waits for the kernel to finish before reading.
	cudaMemcpy(hostResElements, deviceResElements, memSize, cudaMemcpyDeviceToHost);
	checkCudaError("Memory copied from device result to host result");

	for(index=0; index<size; index++){
		printf("\nRes [%d] : %f", index, hostResElements[index]);
	}

	// Release the device buffers; previously the checks ran but nothing
	// was freed, leaking device memory on every call.
	cudaFree(deviceSrcElements);
	checkCudaError("Cleaning up device src elements");

	cudaFree(deviceResElements);
	checkCudaError("Cleaning up device res elements");

	return hostResElements;
}


/*
 * Demo driver: fills a 5-element host array with size * index
 * (0, 5, 10, 15, 20), scales it by 5 through multiplyArray(), and prints
 * the returned values.
 *
 * Returns 0 on success, 1 if a host buffer could not be obtained.
 */
int main(){

	const int size = 5;
	int index;
	double* result;

	// Sized from `size` so the element count is defined in one place.
	double* srcArr = (double*) calloc ( size, sizeof(double));

	if (srcArr == NULL) {
		return 1;
	}

	for(index=0; index<size; index++){
		srcArr[index] =  size * index;
	}

	result = multiplyArray(srcArr, 5, size);

	if (result == NULL) {
		free(srcArr);
		return 1;
	}

	for(index=0; index<size; index++){
		// Leading newline keeps each value on its own line, matching the
		// other progress messages in this program.
		printf ("\nResult in C [%d] : %f", index, result[index]);
	}
	printf("\n");

	// Both buffers are heap allocations owned by this function.
	free(result);
	free(srcArr);

	return 0;
}


Please let me know if I am doing anything wrong.



Oops!! I learned from one of the earlier posts that device code doesn’t support double.

So I changed all the types in my previous code to float, and it started working fine.