Global arrays?

MrNightLifeLover · August 18, 2010, 1:10pm

My CUDA program has large arrays containing static data. This data should be available to all functions in the kernel. I tried to use global memory for it, but so far I have failed to move it onto the GPU. I always get the CUDA error “invalid argument” after calling cudaMemcpyToSymbol(…). Is it correct to use cudaMemcpyToSymbol here? What am I doing wrong?

#include <stdio.h>

#include <cuda.h>

/*
 * Report the most recent CUDA runtime error, if there is one, and terminate.
 *
 * msg: caller-supplied label printed in front of the CUDA error description.
 *
 * The error is fetched with cudaGetLastError(). When it is cudaSuccess the
 * function simply returns; otherwise a message goes to stderr and the process
 * exits with EXIT_FAILURE.
 */
void checkCUDAError(const char *msg)
{
	const cudaError_t status = cudaGetLastError();

	// Nothing pending: hand control straight back to the caller.
	if (status == cudaSuccess)
		return;

	const char *description = cudaGetErrorString(status);
	fprintf(stderr, "Cuda error: %s: %s.\n", msg, description);
	exit(EXIT_FAILURE);
}

__device__ int *a_d;

// Placeholder kernel: it takes no arguments and its body contains no statements.
__global__ void myKernel()

{

}

// Entry point of the failing example from the question; the two NOTE(review)
// comments mark the calls that the replies in this thread identify as wrong.
int main()

{   

	int a[] = {1, 2, 3};

	int n = 3;

	// NOTE(review): a_d is declared __device__, but its address is taken here in
	// host code, so cudaMalloc is not writing the allocation into the device-side
	// pointer the kernel would read.
	cudaMalloc((void**)&a_d, sizeof(int)*n);

	// NOTE(review): the symbol a_d is a single int*, yet sizeof(int)*n bytes of
	// array data are copied onto the symbol itself rather than into memory it
	// points to.
	cudaMemcpyToSymbol(a_d, a, sizeof(int)*n, 0, cudaMemcpyHostToDevice);

	// Prints the pending CUDA error (if any) and exits via checkCUDAError.
	checkCUDAError("Blah");

	// One block of one thread; myKernel has an empty body.
	myKernel<<<1,1>>>();

	return 0;

}

avidday · August 18, 2010, 1:12pm

try

cudaMemcpyToSymbol("a_d", a, sizeof(int)*n, 0, cudaMemcpyHostToDevice);

avidday · August 18, 2010, 1:12pm

try

cudaMemcpyToSymbol("a_d", a, sizeof(int)*n, 0, cudaMemcpyHostToDevice);

MrNightLifeLover · August 18, 2010, 1:17pm

Same error. According to the documentation, you are allowed to pass either the variable or its name.

MrNightLifeLover · August 18, 2010, 1:17pm

Same error. According to the documentation, you are allowed to pass either the variable or its name.

avidday · August 18, 2010, 1:29pm

OK, I didn’t bother to read the rest of your code. The cudaMalloc call you are doing is wrong, and that is the source of the problems. Call cudaMalloc() on an ordinary host pointer variable, copy your data to the device address it holds, then copy the address it holds onto your device symbol.

avidday · August 18, 2010, 1:29pm

OK, I didn’t bother to read the rest of your code. The cudaMalloc call you are doing is wrong, and that is the source of the problems. Call cudaMalloc() on an ordinary host pointer variable, copy your data to the device address it holds, then copy the address it holds onto your device symbol.

MrNightLifeLover · August 18, 2010, 1:52pm

then copy the address it holds onto your device symbol.

The device symbol must point to the address where the memory is allocated by cudaMalloc(…), however in c it’s not possible to modify the address of a pointer once declared… confused

MrNightLifeLover · August 18, 2010, 1:52pm

then copy the address it holds onto your device symbol.

The device symbol must point to the address where the memory is allocated by cudaMalloc(…), however in c it’s not possible to modify the address of a pointer once declared… confused

avidday · August 18, 2010, 2:21pm

You don’t modify the address of a pointer, you modify its value (which is an address). Something like this:

__device__ int *a_d;

/*
 * Sketch: publish a cudaMalloc'ed array through the __device__ pointer a_d.
 *
 * 1. Allocate device memory into an ordinary host pointer variable (scratch).
 * 2. Copy the array contents into that allocation.
 * 3. Copy the pointer VALUE held in scratch (a device address) into a_d.
 *
 * Returns 0 when every CUDA call succeeds, 1 otherwise.
 */
int main()
{
	int a[] = {1, 2, 3};
	const size_t bytes = sizeof(a);

	int *scratch = 0;

	if (cudaMalloc((void**)&scratch, bytes) != cudaSuccess)
		return 1;

	cudaError_t err = cudaMemcpy(scratch, a, bytes, cudaMemcpyHostToDevice);

	// a_d is one int*, so exactly sizeof(int *) bytes are written to it: the
	// address stored in scratch, not the array data. The symbol is passed
	// directly; character-string symbol names are not accepted by CUDA 5.0+.
	if (err == cudaSuccess)
		err = cudaMemcpyToSymbol(a_d, &scratch, sizeof(scratch), 0, cudaMemcpyHostToDevice);

	// ... kernels that read a_d[i] would be launched here ...

	// Release the allocation; a_d must not be dereferenced after this point.
	cudaFree(scratch);

	return err == cudaSuccess ? 0 : 1;
}

avidday · August 18, 2010, 2:21pm

You don’t modify the address of a pointer, you modify its value (which is an address). Something like this:

__device__ int *a_d;

/*
 * Sketch: publish a cudaMalloc'ed array through the __device__ pointer a_d.
 *
 * 1. Allocate device memory into an ordinary host pointer variable (scratch).
 * 2. Copy the array contents into that allocation.
 * 3. Copy the pointer VALUE held in scratch (a device address) into a_d.
 *
 * Returns 0 when every CUDA call succeeds, 1 otherwise.
 */
int main()
{
	int a[] = {1, 2, 3};
	const size_t bytes = sizeof(a);

	int *scratch = 0;

	if (cudaMalloc((void**)&scratch, bytes) != cudaSuccess)
		return 1;

	cudaError_t err = cudaMemcpy(scratch, a, bytes, cudaMemcpyHostToDevice);

	// a_d is one int*, so exactly sizeof(int *) bytes are written to it: the
	// address stored in scratch, not the array data. The symbol is passed
	// directly; character-string symbol names are not accepted by CUDA 5.0+.
	if (err == cudaSuccess)
		err = cudaMemcpyToSymbol(a_d, &scratch, sizeof(scratch), 0, cudaMemcpyHostToDevice);

	// ... kernels that read a_d[i] would be launched here ...

	// Release the allocation; a_d must not be dereferenced after this point.
	cudaFree(scratch);

	return err == cudaSuccess ? 0 : 1;
}

MrNightLifeLover · August 18, 2010, 2:54pm

Uhh, tried this with --device-emulation and got a “Bus error”.

I also don’t understand why I should invoke memcpy twice, there must be an easier solution for such a common problem.

MrNightLifeLover · August 18, 2010, 2:54pm

Uhh, tried this with --device-emulation and got a “Bus error”.

I also don’t understand why I should invoke memcpy twice, there must be an easier solution for such a common problem.

MrNightLifeLover · August 18, 2010, 2:55pm

Uhh, tried this with --device-emulation and got a “Bus error”.

I also don’t understand why I should invoke memcpy twice, there must be an easier solution for such a common problem.

MrNightLifeLover · August 18, 2010, 2:55pm

Uhh, tried this with --device-emulation and got a “Bus error”.

I also don’t understand why I should invoke memcpy twice, there must be an easier solution for such a common problem.

avidday · August 18, 2010, 3:20pm

Here is a complete working example.

#include <stdio.h>

__device__ int *a_d;

/*
 * Copy the first n ints reachable through the global pointer a_d into b.
 *
 * b: destination array, written at indices 0 .. n-1.
 * n: number of elements to copy.
 *
 * The loop is not divided by thread index, so every thread that executes this
 * kernel performs the whole copy.
 */
__global__ void testkernel(int *b, int n)
{
	int idx = 0;
	while (idx < n) {
		b[idx] = a_d[idx];
		++idx;
	}
}

/* Print a diagnostic when err is not cudaSuccess; returns 1 on error, 0 otherwise. */
static int cudaFailed(cudaError_t err, const char *what)
{
	if (err == cudaSuccess)
		return 0;
	fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
	return 1;
}

/*
 * Round-trip test for a __device__ pointer that refers to cudaMalloc'ed memory.
 *
 * a is copied into the device buffer scratch0, the address held in scratch0 is
 * stored in the device symbol a_d, testkernel copies a_d[0..n) into scratch1,
 * and scratch1 is copied back into b. On success each line printed is
 * "index a[index] b[index]".
 *
 * Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
 */
int main()
{
	int a[] = {1, 2, 3};
	int b[] = {4, 5, 6};
	const int n = sizeof(a) / sizeof(a[0]);
	const size_t bytes = sizeof(a);

	int *scratch0 = 0, *scratch1 = 0;

	// Short-circuit evaluation stops the chain at the first failing call.
	// a_d receives sizeof(int *) bytes: the device address held in scratch0.
	// The symbol is passed directly; string symbol names are rejected by CUDA 5.0+.
	int failed =
		cudaFailed(cudaMalloc((void**)&scratch0, bytes), "cudaMalloc(scratch0)") ||
		cudaFailed(cudaMalloc((void**)&scratch1, bytes), "cudaMalloc(scratch1)") ||
		cudaFailed(cudaMemcpy(scratch0, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> scratch0)") ||
		cudaFailed(cudaMemcpyToSymbol(a_d, &scratch0, sizeof(scratch0), 0, cudaMemcpyHostToDevice), "cudaMemcpyToSymbol(a_d)");

	if (!failed) {
		testkernel <<<1,1>>> (scratch1, n);

		// A launch reports configuration errors only through cudaGetLastError();
		// the blocking cudaMemcpy that follows surfaces execution errors.
		failed =
			cudaFailed(cudaGetLastError(), "testkernel launch") ||
			cudaFailed(cudaMemcpy(b, scratch1, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(scratch1 -> b)");
	}

	if (!failed)
		for (int i = 0; i < n; i++) fprintf(stdout, "%d %d %d\n", i, a[i], b[i]);

	// cudaFree accepts a null pointer, so this is safe after an early failure.
	cudaFree(scratch0);
	cudaFree(scratch1);

	return failed;
}

avidday · August 18, 2010, 3:20pm

Here is a complete working example.

#include <stdio.h>

__device__ int *a_d;

/*
 * Copy the first n ints reachable through the global pointer a_d into b.
 *
 * b: destination array, written at indices 0 .. n-1.
 * n: number of elements to copy.
 *
 * The loop is not divided by thread index, so every thread that executes this
 * kernel performs the whole copy.
 */
__global__ void testkernel(int *b, int n)
{
	int idx = 0;
	while (idx < n) {
		b[idx] = a_d[idx];
		++idx;
	}
}

/* Print a diagnostic when err is not cudaSuccess; returns 1 on error, 0 otherwise. */
static int cudaFailed(cudaError_t err, const char *what)
{
	if (err == cudaSuccess)
		return 0;
	fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
	return 1;
}

/*
 * Round-trip test for a __device__ pointer that refers to cudaMalloc'ed memory.
 *
 * a is copied into the device buffer scratch0, the address held in scratch0 is
 * stored in the device symbol a_d, testkernel copies a_d[0..n) into scratch1,
 * and scratch1 is copied back into b. On success each line printed is
 * "index a[index] b[index]".
 *
 * Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
 */
int main()
{
	int a[] = {1, 2, 3};
	int b[] = {4, 5, 6};
	const int n = sizeof(a) / sizeof(a[0]);
	const size_t bytes = sizeof(a);

	int *scratch0 = 0, *scratch1 = 0;

	// Short-circuit evaluation stops the chain at the first failing call.
	// a_d receives sizeof(int *) bytes: the device address held in scratch0.
	// The symbol is passed directly; string symbol names are rejected by CUDA 5.0+.
	int failed =
		cudaFailed(cudaMalloc((void**)&scratch0, bytes), "cudaMalloc(scratch0)") ||
		cudaFailed(cudaMalloc((void**)&scratch1, bytes), "cudaMalloc(scratch1)") ||
		cudaFailed(cudaMemcpy(scratch0, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> scratch0)") ||
		cudaFailed(cudaMemcpyToSymbol(a_d, &scratch0, sizeof(scratch0), 0, cudaMemcpyHostToDevice), "cudaMemcpyToSymbol(a_d)");

	if (!failed) {
		testkernel <<<1,1>>> (scratch1, n);

		// A launch reports configuration errors only through cudaGetLastError();
		// the blocking cudaMemcpy that follows surfaces execution errors.
		failed =
			cudaFailed(cudaGetLastError(), "testkernel launch") ||
			cudaFailed(cudaMemcpy(b, scratch1, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(scratch1 -> b)");
	}

	if (!failed)
		for (int i = 0; i < n; i++) fprintf(stdout, "%d %d %d\n", i, a[i], b[i]);

	// cudaFree accepts a null pointer, so this is safe after an early failure.
	cudaFree(scratch0);
	cudaFree(scratch1);

	return failed;
}

MrNightLifeLover · August 18, 2010, 3:31pm

thx a lot for your answers :)

I would still like a more elegant solution, but at least this one works correctly. One last question: why are you using &a[0] instead of just &a?

MrNightLifeLover · August 18, 2010, 3:31pm

thx a lot for your answers :)

I would still like a more elegant solution, but at least this one works correctly. One last question: why are you using &a[0] instead of just &a?

jan.heckman · August 18, 2010, 6:18pm

I would do it like this:

[font=“Courier New”][codebox]#include <stdio.h>

#include "cutil_inline.h"

// Element count of a true array (gives a wrong answer if handed a pointer).
#define ELEMS(a) (sizeof((a))/sizeof((a)[0]))

// Host-side source data.
int a[] = {1, 2, 3};

// Constant-memory array with the same element count as a; main() fills it from
// the host with cudaMemcpyToSymbol and testkernel reads it.
__constant__ int a_c[ELEMS(a)];

// Hand-written message table, looked up by main() with a cudaError_t value as
// the index. It has 22 entries (indices 0..21); any larger index is out of
// bounds.
// NOTE(review): the ordering presumably mirrors the cudaError_t enum of the
// CUDA release the author used -- verify before relying on it.
// cudaGetErrorString() is the supported way to obtain these messages.
const char *cudaErrorString[] =
{
	"No errors",
	"Missing configuration error",
	"Memory allocation error",
	"Initialization error",
	"Launch failure",
	"Prior launch failure",
	"Launch timeout error",
	"Launch out of resources error",
	"Invalid device function",
	"Invalid configuration",
	"Invalid device",
	"Invalid value",
	"Invalid pitch value",
	"Invalid symbol",
	"Map buffer object failed",
	"Unmap buffer object failed",
	"Invalid host pointer",
	"Invalid device pointer",
	"Invalid texture",
	"Invalid texture binding",
	"Invalid channel descriptor",
	"Invalid memcpy direction"
};

/*
 * Copy the first n elements of the constant-memory array a_c into b.
 *
 * b: destination array, written at indices 0 .. n-1.
 * n: number of elements to copy.
 *
 * The loop does not use the thread index, so every thread that executes this
 * kernel performs the whole copy.
 */
__global__ void testkernel( int *b, int n )
{
	for( int i = 0; i < n; i++ ) b[i] = a_c[i];
}

/*
 * Copy the host array a into the constant-memory array a_c, have testkernel
 * copy a_c into the device buffer b_d, read b_d back into b and compare it
 * with a element by element.
 *
 * Prints one "index a[index] b[index]" line per element, then "Passed" or
 * "Failed". Returns 0 when all elements match, 1 on a mismatch or CUDA error.
 */
int main()
{
	int b[ ELEMS( a )];
	const int n = (int)ELEMS( a );
	int *b_d = 0;
	cudaError_t cerr;

	cerr = cudaMalloc((void**)&b_d, sizeof(b));
	if( cerr != cudaSuccess ) { puts( cudaGetErrorString( cerr )); return 1; }

	// The symbol is passed directly; string symbol names are rejected by CUDA 5.0+.
	cerr = cudaMemcpyToSymbol(a_c, a, sizeof(a_c), 0, cudaMemcpyHostToDevice);
	// cudaGetErrorString() covers every error code, unlike a fixed-size lookup table.
	if( cerr != cudaSuccess ) { puts( cudaGetErrorString( cerr )); cudaFree( b_d ); return 1; }

	testkernel <<< 1, 1 >>> ( b_d, n );
	cutilCheckMsg("Kernel execution failed");

	cerr = cudaMemcpy( b, b_d, sizeof(b), cudaMemcpyDeviceToHost );
	cudaFree( b_d );
	if( cerr != cudaSuccess ) { puts( cudaGetErrorString( cerr )); return 1; }

	int succes = 1;
	for( int i = 0; i < n; i++ )
	{
		printf( "%d %d %d\n", i, a[i], b[i] );
		if( a[i] != b[i] ) succes = 0;
	}

	puts( succes ? "Passed" : "Failed" );

	return succes ? 0 : 1;
}[/codebox][/font]

I’ve used constant memory here. If your arrays do not fit in constant memory, you can put them in global memory, using the normal cudaMalloc() and cudaMemcpy() as used in (all) the SDK examples. I think constant memory is faster.

The array of error strings is a bit unnecessary; I should have used cudaGetErrorString() as you did.

Topic		Replies	Views
__const__ writting with cudaMemcpyToSymbol CUDA Programming and Performance	15	10997	August 30, 2007
Constant memory when having more than one file external does not work CUDA Programming and Performance	24	3368	August 27, 2010
Constant Array Usage Problem CUDA Programming and Performance	3	1146	July 26, 2009
Can we do malloc inside a __global__ function CUDA Programming and Performance	26	10057	February 21, 2010
How do you copy an array into constant memory? CUDA Programming and Performance	11	15965	December 17, 2010
__constant__ Strangeness CUDA Programming and Performance	5	2556	July 21, 2008
I had a problem on using constant memory CUDA Programming and Performance	4	934	October 13, 2013
How to Copy an Array to the GPU memory CUDA Programming and Performance	4	3981	June 20, 2008
pointer in global device memory CUDA Programming and Performance	9	11744	November 23, 2011
__device__ variables and arrays CUDA Programming and Performance	8	15530	August 16, 2014

Global arrays?

Related topics