Global arrays?

My CUDA program has large arrays containing static data. This data should be available to all functions in the kernel. I tried to use global memory for it, but so far I have failed to move it onto the GPU. I always get the CUDA error “invalid argument” after calling cudaMemcpyToSymbol(…). Is it correct to use cudaMemcpyToSymbol here? What am I doing wrong?

#include <stdio.h>

#include <cuda.h>

/*
 * Report the most recent CUDA runtime error, if there is one, and terminate.
 *
 * msg: caller-supplied label printed in front of the runtime's error text.
 *
 * Queries cudaGetLastError(). When the result is cudaSuccess the function
 * just returns; otherwise it writes "Cuda error: <msg>: <description>." to
 * stderr and exits the process with EXIT_FAILURE.
 */
void checkCUDAError(const char *msg)
{
	const cudaError_t status = cudaGetLastError();
	if (status == cudaSuccess)
		return;

	const char *description = cudaGetErrorString(status);
	fprintf(stderr, "Cuda error: %s: %s.\n", msg, description);
	exit(EXIT_FAILURE);
}

// File-scope int pointer declared with the __device__ qualifier; main() below
// refers to it by symbol in its cudaMalloc / cudaMemcpyToSymbol calls.
__device__ int *a_d;

// Kernel with an empty body; main() launches it with a <<<1,1>>> configuration.
__global__ void myKernel()

{

}

// Reproduction case from the question: tries to place a 3-element host array
// behind the __device__ pointer a_d, checks for a CUDA error, then launches
// myKernel.
int main()

{   

	int a[] = {1, 2, 3};

	int n = 3;

	// NOTE(review): &a_d is taken in host code although a_d is a __device__
	// symbol; the replies in this thread identify this call as the source of
	// the reported error.
	cudaMalloc((void**)&a_d, sizeof(int)*n);

	// NOTE(review): this copies sizeof(int)*n bytes of array data onto the
	// symbol a_d itself (a single int* variable) rather than into memory that
	// a_d points to.
	cudaMemcpyToSymbol(a_d, a, sizeof(int)*n, 0, cudaMemcpyHostToDevice);

	// Prints the pending CUDA runtime error, if any, and exits the process.
	checkCUDAError("Blah");

	myKernel<<<1,1>>>();

	return 0;

}

try

cudaMemcpyToSymbol("a_d", a, sizeof(int)*n, 0, cudaMemcpyHostToDevice);

try

cudaMemcpyToSymbol("a_d", a, sizeof(int)*n, 0, cudaMemcpyHostToDevice);

Same error. According to the documentation, you are allowed to pass either the variable or its name.

Same error. According to the documentation, you are allowed to pass either the variable or its name.

OK, I didn’t bother to read the rest of your code. The cudaMalloc call you are doing is wrong, and that is the source of the problems. Call cudaMalloc() on a host pointer variable, copy your data to the device address it holds, then copy the address it holds onto your device symbol.

OK, I didn’t bother to read the rest of your code. The cudaMalloc call you are doing is wrong, and that is the source of the problems. Call cudaMalloc() on a host pointer variable, copy your data to the device address it holds, then copy the address it holds onto your device symbol.

then copy the address it holds onto your device symbol.

The device symbol must point to the address where the memory is allocated by cudaMalloc(…); however, in C it’s not possible to modify the address of a pointer once it is declared… I’m confused.

then copy the address it holds onto your device symbol.

The device symbol must point to the address where the memory is allocated by cudaMalloc(…); however, in C it’s not possible to modify the address of a pointer once it is declared… I’m confused.

You don’t modify the address of a pointer, you modify its value (which is an address). Something like this:

// Device-side pointer symbol. The host stores a device address in it with
// cudaMemcpyToSymbol() so that device code can dereference it.
__device__ int *a_d;

/*
 * Publish a host array to device code through the __device__ pointer a_d.
 *
 * Steps: allocate a device buffer through an ordinary host pointer, copy the
 * array contents into that buffer, then copy the pointer VALUE (one int*)
 * onto the symbol a_d. The symbol is passed as an identifier, not as the
 * string "a_d": lookup by name string was removed from the runtime in
 * CUDA 5.0.
 *
 * Returns 0 on success, 1 if any CUDA runtime call failed (the failure is
 * printed to stderr).
 */
int main()
{
	int a[] = {1, 2, 3};
	const int n = sizeof(a) / sizeof(a[0]);
	const size_t bytes = sizeof(int) * n;
	int *scratch = 0;

	// Each step runs only if every earlier one succeeded.
	cudaError_t err = cudaMalloc((void**)&scratch, bytes);
	if (err == cudaSuccess)
		err = cudaMemcpy(scratch, a, bytes, cudaMemcpyHostToDevice);
	if (err == cudaSuccess)
		err = cudaMemcpyToSymbol(a_d, &scratch, sizeof(scratch), 0, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));

	// Kernels that read a_d would be launched here, before the buffer is released.

	cudaFree(scratch);	// a null pointer is accepted, so this is safe on every path
	return err == cudaSuccess ? 0 : 1;
}

You don’t modify the address of a pointer, you modify its value (which is an address). Something like this:

// Device-side pointer symbol. The host stores a device address in it with
// cudaMemcpyToSymbol() so that device code can dereference it.
__device__ int *a_d;

/*
 * Publish a host array to device code through the __device__ pointer a_d.
 *
 * Steps: allocate a device buffer through an ordinary host pointer, copy the
 * array contents into that buffer, then copy the pointer VALUE (one int*)
 * onto the symbol a_d. The symbol is passed as an identifier, not as the
 * string "a_d": lookup by name string was removed from the runtime in
 * CUDA 5.0.
 *
 * Returns 0 on success, 1 if any CUDA runtime call failed (the failure is
 * printed to stderr).
 */
int main()
{
	int a[] = {1, 2, 3};
	const int n = sizeof(a) / sizeof(a[0]);
	const size_t bytes = sizeof(int) * n;
	int *scratch = 0;

	// Each step runs only if every earlier one succeeded.
	cudaError_t err = cudaMalloc((void**)&scratch, bytes);
	if (err == cudaSuccess)
		err = cudaMemcpy(scratch, a, bytes, cudaMemcpyHostToDevice);
	if (err == cudaSuccess)
		err = cudaMemcpyToSymbol(a_d, &scratch, sizeof(scratch), 0, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));

	// Kernels that read a_d would be launched here, before the buffer is released.

	cudaFree(scratch);	// a null pointer is accepted, so this is safe on every path
	return err == cudaSuccess ? 0 : 1;
}

Uhh, I tried this with --device-emulation and got a “Bus error”.

I also don’t understand why I should invoke memcpy twice; there must be an easier solution for such a common problem.

Uhh, I tried this with --device-emulation and got a “Bus error”.

I also don’t understand why I should invoke memcpy twice; there must be an easier solution for such a common problem.

Uhh, I tried this with --device-emulation and got a “Bus error”.

I also don’t understand why I should invoke memcpy twice; there must be an easier solution for such a common problem.

Uhh, I tried this with --device-emulation and got a “Bus error”.

I also don’t understand why I should invoke memcpy twice; there must be an easier solution for such a common problem.

Here is a complete working example.

#include <stdio.h>

// Device-side pointer symbol; the host is expected to store the address of a
// device buffer in it (via cudaMemcpyToSymbol) before testkernel is launched.
__device__ int *a_d;

/*
 * Copy the first n ints reachable through a_d into b.
 *
 * b: device buffer with room for at least n ints (output).
 * n: number of elements to copy; nothing is written when n <= 0.
 *
 * Launch layout: any 1-D grid/block shape. The loop advances by the total
 * number of launched threads, so each element is written by exactly one
 * thread; with a <<<1,1>>> launch the single thread copies everything.
 * No shared memory is used.
 */
__global__ void testkernel(int *b, int n)
{
	// 64-bit index arithmetic so blockIdx.x * blockDim.x cannot overflow.
	const long long stride = (long long)gridDim.x * blockDim.x;
	long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;

	for (; i < n; i += stride)
		b[i] = a_d[i];
}

/*
 * Round-trip demo: upload a[] to the device, publish its device address
 * through the __device__ pointer a_d, let testkernel copy it into a second
 * device buffer, download that buffer into b[], and print "index a b" rows.
 *
 * The symbol is passed to cudaMemcpyToSymbol() as an identifier, not as the
 * string "a_d": lookup by name string was removed from the runtime in
 * CUDA 5.0.
 *
 * Returns 0 on success, 1 if any CUDA call or the kernel launch failed (the
 * failure is printed to stderr and no rows are printed).
 */
int main()
{
	int a[] = {1, 2, 3};
	int b[] = {4, 5, 6};
	const int n = sizeof(a) / sizeof(a[0]);
	const size_t bytes = sizeof(int) * n;
	int *scratch0 = 0, *scratch1 = 0;

	// Each step runs only if every earlier one succeeded.
	cudaError_t err = cudaMalloc((void**)&scratch0, bytes);
	if (err == cudaSuccess)
		err = cudaMalloc((void**)&scratch1, bytes);
	if (err == cudaSuccess)
		err = cudaMemcpy(scratch0, a, bytes, cudaMemcpyHostToDevice);
	if (err == cudaSuccess)
		// Copies the pointer value (one int*), not the array contents.
		err = cudaMemcpyToSymbol(a_d, &scratch0, sizeof(scratch0), 0, cudaMemcpyHostToDevice);
	if (err == cudaSuccess) {
		testkernel <<<1,1>>> (scratch1, n);
		err = cudaGetLastError();	// launch errors are not returned by the launch itself
	}
	if (err == cudaSuccess)
		// Blocking copy; also surfaces errors raised while the kernel ran.
		err = cudaMemcpy(b, scratch1, bytes, cudaMemcpyDeviceToHost);

	if (err == cudaSuccess) {
		for (int i = 0; i < n; i++) fprintf(stdout, "%d %d %d\n", i, a[i], b[i]);
	} else {
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
	}

	// A null pointer is accepted by cudaFree, so this is safe on every path.
	cudaFree(scratch0);
	cudaFree(scratch1);
	return err == cudaSuccess ? 0 : 1;
}

Here is a complete working example.

#include <stdio.h>

// Device-side pointer symbol; the host is expected to store the address of a
// device buffer in it (via cudaMemcpyToSymbol) before testkernel is launched.
__device__ int *a_d;

/*
 * Copy the first n ints reachable through a_d into b.
 *
 * b: device buffer with room for at least n ints (output).
 * n: number of elements to copy; nothing is written when n <= 0.
 *
 * Launch layout: any 1-D grid/block shape. The loop advances by the total
 * number of launched threads, so each element is written by exactly one
 * thread; with a <<<1,1>>> launch the single thread copies everything.
 * No shared memory is used.
 */
__global__ void testkernel(int *b, int n)
{
	// 64-bit index arithmetic so blockIdx.x * blockDim.x cannot overflow.
	const long long stride = (long long)gridDim.x * blockDim.x;
	long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;

	for (; i < n; i += stride)
		b[i] = a_d[i];
}

/*
 * Round-trip demo: upload a[] to the device, publish its device address
 * through the __device__ pointer a_d, let testkernel copy it into a second
 * device buffer, download that buffer into b[], and print "index a b" rows.
 *
 * The symbol is passed to cudaMemcpyToSymbol() as an identifier, not as the
 * string "a_d": lookup by name string was removed from the runtime in
 * CUDA 5.0.
 *
 * Returns 0 on success, 1 if any CUDA call or the kernel launch failed (the
 * failure is printed to stderr and no rows are printed).
 */
int main()
{
	int a[] = {1, 2, 3};
	int b[] = {4, 5, 6};
	const int n = sizeof(a) / sizeof(a[0]);
	const size_t bytes = sizeof(int) * n;
	int *scratch0 = 0, *scratch1 = 0;

	// Each step runs only if every earlier one succeeded.
	cudaError_t err = cudaMalloc((void**)&scratch0, bytes);
	if (err == cudaSuccess)
		err = cudaMalloc((void**)&scratch1, bytes);
	if (err == cudaSuccess)
		err = cudaMemcpy(scratch0, a, bytes, cudaMemcpyHostToDevice);
	if (err == cudaSuccess)
		// Copies the pointer value (one int*), not the array contents.
		err = cudaMemcpyToSymbol(a_d, &scratch0, sizeof(scratch0), 0, cudaMemcpyHostToDevice);
	if (err == cudaSuccess) {
		testkernel <<<1,1>>> (scratch1, n);
		err = cudaGetLastError();	// launch errors are not returned by the launch itself
	}
	if (err == cudaSuccess)
		// Blocking copy; also surfaces errors raised while the kernel ran.
		err = cudaMemcpy(b, scratch1, bytes, cudaMemcpyDeviceToHost);

	if (err == cudaSuccess) {
		for (int i = 0; i < n; i++) fprintf(stdout, "%d %d %d\n", i, a[i], b[i]);
	} else {
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
	}

	// A null pointer is accepted by cudaFree, so this is safe on every path.
	cudaFree(scratch0);
	cudaFree(scratch1);
	return err == cudaSuccess ? 0 : 1;
}

Thanks a lot for your answers :)

I would still like a more elegant solution, but at least this one works correctly. One last question: why are you using &a[0] instead of just &a?

Thanks a lot for your answers :)

I would still like a more elegant solution, but at least this one works correctly. One last question: why are you using &a[0] instead of just &a?

I would do it like this:

[font="Courier New"][codebox]#include <stdio.h>

#include "cutil_inline.h"

#define ELEMS(a) (sizeof((a))/sizeof((a)[0]))

// Host-side source data; its element count also sizes the constant-memory copy.
int a[] = {1, 2, 3};

// Constant-memory array read by testkernel; main() fills it from a with
// cudaMemcpyToSymbol().
__constant__ int a_c[ELEMS(a)];

// Human-readable messages that main() looks up using a cudaError_t value as
// the index (22 entries, indices 0..21).
// NOTE(review): this assumes the entry order matches the numeric cudaError_t
// values of the toolkit in use — verify, or prefer cudaGetErrorString().
const char *cudaErrorString[] =
{
	"No errors",
	"Missing configuration error",
	"Memory allocation error",
	"Initialization error",
	"Launch failure",
	"Prior launch failure",
	"Launch timeout error",
	"Launch out of resources error",
	"Invalid device function",
	"Invalid configuration",
	"Invalid device",
	"Invalid value",
	"Invalid pitch value",
	"Invalid symbol",
	"Map buffer object failed",
	"Unmap buffer object failed",
	"Invalid host pointer",
	"Invalid device pointer",
	"Invalid texture",
	"Invalid texture binding",
	"Invalid channel descriptor",
	"Invalid memcpy direction"
};

/*
 * Copy the first n ints of the constant-memory array a_c into b.
 *
 * b: device buffer with room for at least n ints (output).
 * n: number of elements to copy; nothing is written when n <= 0.
 *
 * Launch layout: any 1-D grid/block shape. The loop advances by the total
 * number of launched threads, so each element is written by exactly one
 * thread; with a <<< 1, 1 >>> launch the single thread copies everything.
 * No shared memory is used.
 */
__global__ void testkernel( int *b, int n )
{
	// 64-bit index arithmetic so blockIdx.x * blockDim.x cannot overflow.
	const long long stride = (long long)gridDim.x * blockDim.x;
	long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;

	for( ; i < n; i += stride ) b[i] = a_c[i];
}

// Print a message for a failed CUDA call: the local table entry when the code
// is within the table, otherwise the runtime's own description.
static void printCudaError( cudaError_t cerr )
{
	if( (size_t)cerr < ELEMS( cudaErrorString ) ) puts( cudaErrorString[ cerr ]);
	else puts( cudaGetErrorString( cerr ));
}

/*
 * Constant-memory round trip: copy a[] into the __constant__ array a_c, have
 * testkernel copy a_c into a device buffer, download that buffer into b[],
 * print "index a b" rows and finally "Passed" or "Failed".
 *
 * The symbol is passed to cudaMemcpyToSymbol() as an identifier, not as the
 * string "a_c": lookup by name string was removed from the runtime in
 * CUDA 5.0.
 *
 * Returns 0 when the program ran to completion, 1 if a CUDA runtime call
 * failed (its message is printed first).
 */
int main()
{
	int b[ ELEMS( a )];
	int *b_d = 0;
	cudaError_t cerr;

	cerr = cudaMalloc((void**)&b_d, sizeof(b));
	if( cerr == cudaSuccess )
		cerr = cudaMemcpyToSymbol(a_c, a, sizeof(a_c), 0, cudaMemcpyHostToDevice);
	if( cerr != cudaSuccess )
	{
		printCudaError( cerr );
		cudaFree( b_d );	// a null pointer is accepted
		return 1;
	}

	testkernel <<< 1, 1 >>> ( b_d, (int)ELEMS(a) );
	cutilCheckMsg("Kernel execution failed");

	// Blocking copy, so b[] is complete once it returns successfully.
	cerr = cudaMemcpy( b, b_d, sizeof(b), cudaMemcpyDeviceToHost );
	cudaFree( b_d );
	if( cerr != cudaSuccess )
	{
		printCudaError( cerr );
		return 1;
	}

	int success = 1;
	for( int i = 0; i < (int)ELEMS(a); i++)
	{
		printf( "%d %d %d\n", i, a[i], b[i] );
		if( a[i] != b[i] ) success = 0;
	}
	puts( success ? "Passed" : "Failed" );

	return 0;
}[/codebox][/font]

I’ve used constant memory here. If your arrays do not fit in constant memory, you can put them in global memory, using the normal cudaMalloc() and cudaMemcpy() as used in (all) the SDK examples. I think constant memory is faster.

The array of error strings is a bit unnecessary; I should have used cudaGetErrorString() as you did.