Another Device Memory Question

device SynapseType* SynapseArray;

global void kernel()
{
SynapseType* SynapsePtr=&(SynapseArray[1]);
SynapsePtr->element1=5;
Put SynapsePtr in a linked list in a device structure
}

main()
{
cudaMalloc((void**)&SynapseArray, NumberOfSynapses*sizeof(SynapseType));
kernel<<<1,1>();
}

Given that I have some code similar to the above, if I have a device array that is allocated in host code via cudaMalloc(), can a kernel access the array element by address, assign values to that address pointer element, and then use that pointer in a device structure that contains a linked list of pointers, that is only accessed by other kernels or device functions?

I am thinking I can, but I’m having strange behavior, including kernel sometimes failing to launch. I also get different results if the host passes the device array by address to the kernel such as:

global void kernel(SynapseType* synapsearray)
{
SynapseType * SynapsePtr=&(synapsearray[1]);
SynapsePtr->element1=5;
Put SynapsePtr in a linked list in a device structure
}

main()
{
cudaMalloc((void**)&SynapseArray, NumberOfSynapses*sizeof(SynapseType));

kernel<<<1,1>>>(SynapseArray);
}

This second method is better behaved, but still not working correctly in my program, which is much more complex. Would you expect these two uses of SynapseArray to behave differently? Are both uses valid? Is one preferable over the other?

I’m hoping I’ve done something dumb that someone will see.

Thanks,
Ken Chaffin

I don’t think the cudaMalloc in the first code snippet is valid. Allocate the memory using a host variable and then copy the address it holds onto a device symbol (I have use constant memory pointers for this successfully). Then it should work as advertised.

That didn’t seem to work.

What I did was:

device SynapseType* d_SynapseArray;

main()

{

SynapseType* synapsearray; // also tried const SynapseType* synapsearray;

cudaMalloc((void**)&synapsearray, NumberOfSynapses*sizeof(SynapseType));

cudaMemcpyToSymbol(d_SynapsesArray, synapsearray, sizeof(SynapseType*));

}

Of course I checked error returns and this all succeeded, but when I call my kernel I get an error 30, “unknown error”. I misspoke when I said that I sometimes get a failure to launch, but rather I just have some unknown error in the kernel. This has the symptoms of an out of bound array access, but I can’t determine how that is the case.

Does the above look correct for the copy to symbol?

It doesn’t seem to matter whether I pass this d_SynapseArray pointer as an argument to the kernel, or if I allow the kernel to access d_SynapseArray directly, the kernel still fails. This result is actually worse that what I was getting before.

Ken Chaffin

Okay, got it working. What I had to do was change the array declaration to:

SynapseType* SynapseArray;

Then I was able to do the cudaMalloc((void**)&SynapseArray, …);

and then the host was able to call the kernel with this pointer as an argument kernel<<<1,1>>>(SynapseArray);

and the kernel could make use of the pointer at will.

I guess that pointers to contain cudaMalloc() device memory should never be qualified as device. I don’t understand it, but I can make it work!

Thanks for your suggestion, even though that didn’t fix the problem.

Ken Chaffin

Not even close and not really what I was suggesting. This will work:

__device__ float * ad;

int main(void)

{

	float *_a;

	cudaMalloc( (void**)&_a, DSIZE ));

	cudaMemcpyToSymbol( ad, &_a, sizeof(float *), (size_t)0, cudaMemcpyHostToDevice);

}

You have to pass a host pointer to cudaMalloc. That is non-negotiable. The device side pointer you write the address to can be either global or constant memory, both will work. I generally use constant memory pointers if I don’t need to modify the actual pointer value for performance reasons.

This did not work for me. After making the exact changes you suggest, I’m back to the kernel setting an error 30, unknown error. Same thing happens if the host passes the device array pointer as an argument to the kernel or if the kernel directly accesses the device array pointer.

Ken

I don’t understand why, I cut the code from part of a larger test which works for me:

#include <assert.h>

#include <stdio.h>

#include <cuda_runtime.h>

#ifndef gpuAssert

#include <stdio.h>

#define gpuAssert( condition ) {if( (condition) != 0 ) { fprintf( stderr, "\n FAILURE %d in %s, line %d\n", condition, __FILE__, __LINE__ );exit( 1 );}}

#endif

#define _DSIZE (32)

__device__ float * ad;

__global__ void testkernel2(float *b, float *c, float *d)

{

	unsigned int idx = threadIdx.x + blockDim.x*blockIdx.x;

	d[idx] += ad[idx] + b[idx] + c[idx];

}

int main()

{

	float *a, *b, *c, *d;

	float *_a;

	float *_b, *_c, *_d;

	assert( !(( a = (float *)malloc(_DSIZE * sizeof(float)) ) == NULL) );

	assert( !(( b = (float *)malloc(_DSIZE * sizeof(float)) ) == NULL) );

	assert( !(( c = (float *)malloc(_DSIZE * sizeof(float)) ) == NULL) );

	assert( !(( d = (float *)malloc(_DSIZE * sizeof(float)) ) == NULL) );

	gpuAssert( cudaMalloc( (void**)&_a, _DSIZE * sizeof(float) ) );

	gpuAssert( cudaMalloc( (void**)&_b, _DSIZE * sizeof(float) ) );

	gpuAssert( cudaMalloc( (void**)&_c, _DSIZE * sizeof(float) ) );

	gpuAssert( cudaMalloc( (void**)&_d, _DSIZE * sizeof(float) ) );

	for(int i = 0; i < _DSIZE; i++) {

		a[i] = 3.f;

		b[i] = 5.f;

		c[i] = 7.f;

		d[i] = (float)i;

	}

	gpuAssert( cudaMemcpy(_a, a, _DSIZE * sizeof(float), cudaMemcpyHostToDevice) );

	gpuAssert( cudaMemcpy(_b, b, _DSIZE * sizeof(float), cudaMemcpyHostToDevice) );

	gpuAssert( cudaMemcpy(_c, c, _DSIZE * sizeof(float), cudaMemcpyHostToDevice) );

	gpuAssert( cudaMemcpy(_d, d, _DSIZE * sizeof(float), cudaMemcpyHostToDevice) );

	gpuAssert( cudaMemcpyToSymbol( ad, &_a, sizeof(float *), (size_t)0, cudaMemcpyHostToDevice) ); 

	testkernel2 <<< 1, _DSIZE >>> (_b, _c, _d);

	gpuAssert( cudaThreadSynchronize() );

	gpuAssert( cudaMemcpy(d, _d, _DSIZE * sizeof(float), cudaMemcpyDeviceToHost) );

	for(int i = 0; i < _DSIZE; i++) {

		fprintf(stdout, "%2d %6.1f\n", i, d[i]);

	}

	cudaFree(_a);

	cudaFree(_b);

	cudaFree(_c);

	cudaFree(_d);

	free(a);

	free(b);

	free(c);

	free(d);

	return cudaThreadExit();

}

My guess is that whatever your kernel is doing with the addresses/memory you are allocating is illegal.

I will have to play with this more. I am allocating an array of about 1 million structure elements. I’m not doing anything fancy other than accessing the array elements and assigning values to the structure members. I agree that my problems act like I’m getting an out of bounds index problem, but that shouldn’t be. But, it may be that I’m doing something wrong related to this being an array device variable rather than a simple type. It would be pretty easy that I have a pointer where I need a pointer to the pointer etc.

Thanks for your suggestions.

Ken