2D array indexing with double pointers

I know that allocating a 2D array on the GPU that can be addressed in the form array[i][j] isn't fast because of the double-pointer indirection, but how do you actually allocate an array in that manner? Searches on the forums have given the answer "flatten it to a 1D array", but my supervisor wants to see the difference in access time between a double-pointer array and the flattened version. Does anyone have a simple example of how to allocate and copy to/from a 2D array in this way? Thanks in advance.

This is a pretty trivial example, and I wouldn’t recommend it for benchmarking, but it at least shows one way to do it:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#ifndef gpuAssert
/* Cache the return code so the CUDA call is evaluated only once,
   and print a readable error string on failure. */
#define gpuAssert( call ) do { \
	cudaError_t _e = (call); \
	if( _e != cudaSuccess ) { \
		fprintf( stderr, "\n FAILURE %s in %s, line %d\n", \
		         cudaGetErrorString( _e ), __FILE__, __LINE__ ); \
		exit( 1 ); \
	} \
} while( 0 )
#endif

#define DSIZE (32)

// Table of device pointers, filled in from the host with cudaMemcpyToSymbol.
__device__ float *ad[3];

// Each thread adds one element from each of the three arrays into d.
__global__ void testkernel2(float *d)
{
	unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x;
	d[idx] += ad[0][idx] + ad[1][idx] + ad[2][idx];
}

int main()
{
	float *a, *b, *c, *d;          /* host buffers   */
	float *d_a, *d_b, *d_c, *d_d;  /* device buffers */

	a = (float *)malloc(DSIZE * sizeof(float)); assert(a != NULL);
	b = (float *)malloc(DSIZE * sizeof(float)); assert(b != NULL);
	c = (float *)malloc(DSIZE * sizeof(float)); assert(c != NULL);
	d = (float *)malloc(DSIZE * sizeof(float)); assert(d != NULL);

	gpuAssert( cudaMalloc( (void**)&d_a, DSIZE * sizeof(float) ) );
	gpuAssert( cudaMalloc( (void**)&d_b, DSIZE * sizeof(float) ) );
	gpuAssert( cudaMalloc( (void**)&d_c, DSIZE * sizeof(float) ) );
	gpuAssert( cudaMalloc( (void**)&d_d, DSIZE * sizeof(float) ) );

	for(int i = 0; i < DSIZE; i++) {
		a[i] = 3.f;
		b[i] = 5.f;
		c[i] = 7.f;
		d[i] = (float)i;
	}

	gpuAssert( cudaMemcpy(d_a, a, DSIZE * sizeof(float), cudaMemcpyHostToDevice) );
	gpuAssert( cudaMemcpy(d_b, b, DSIZE * sizeof(float), cudaMemcpyHostToDevice) );
	gpuAssert( cudaMemcpy(d_c, c, DSIZE * sizeof(float), cudaMemcpyHostToDevice) );
	gpuAssert( cudaMemcpy(d_d, d, DSIZE * sizeof(float), cudaMemcpyHostToDevice) );

	/* Copy each device pointer into its slot of ad: the count is one
	   pointer, and the offset selects the slot within the array. */
	gpuAssert( cudaMemcpyToSymbol( ad, &d_a, sizeof(float *), 0 * sizeof(float *), cudaMemcpyHostToDevice) );
	gpuAssert( cudaMemcpyToSymbol( ad, &d_b, sizeof(float *), 1 * sizeof(float *), cudaMemcpyHostToDevice) );
	gpuAssert( cudaMemcpyToSymbol( ad, &d_c, sizeof(float *), 2 * sizeof(float *), cudaMemcpyHostToDevice) );

	testkernel2 <<< 1, DSIZE >>> (d_d);
	gpuAssert( cudaDeviceSynchronize() );

	gpuAssert( cudaMemcpy(d, d_d, DSIZE * sizeof(float), cudaMemcpyDeviceToHost) );

	for(int i = 0; i < DSIZE; i++) {
		fprintf(stdout, "%2d %6.1f\n", i, d[i]);
	}

	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);
	cudaFree(d_d);
	free(a);
	free(b);
	free(c);
	free(d);

	return cudaDeviceReset();
}
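
Note that the example above hard-codes a fixed __device__ array of pointers rather than allocating a true float ** at run time. If you want something literally addressable as array[i][j] through a double pointer, the usual pattern is: allocate each row on the device, stage the device row pointers in a host array, then copy that pointer table into a device allocation. A minimal sketch, assuming illustrative sizes numRows/numCols and with error checking omitted for brevity:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Writes each element's flat index into array[i][j] via two dereferences.
__global__ void fill2d(float **array)
{
	array[blockIdx.x][threadIdx.x] = (float)(blockIdx.x * blockDim.x + threadIdx.x);
}

int main()
{
	const int numRows = 4, numCols = 8;   /* illustrative sizes */

	/* Allocate each row on the device, staging the row pointers on the host. */
	float **h_rows = (float **)malloc(numRows * sizeof(float *));
	for (int i = 0; i < numRows; i++)
		cudaMalloc((void **)&h_rows[i], numCols * sizeof(float));

	/* Allocate the device-side row-pointer table and copy the pointers up. */
	float **d_array;
	cudaMalloc((void **)&d_array, numRows * sizeof(float *));
	cudaMemcpy(d_array, h_rows, numRows * sizeof(float *), cudaMemcpyHostToDevice);

	fill2d<<<numRows, numCols>>>(d_array);
	cudaDeviceSynchronize();

	/* Copying back goes row by row through the staged pointers. */
	float row[numCols];
	for (int i = 0; i < numRows; i++) {
		cudaMemcpy(row, h_rows[i], numCols * sizeof(float), cudaMemcpyDeviceToHost);
		for (int j = 0; j < numCols; j++) printf("%5.1f ", row[j]);
		printf("\n");
	}

	for (int i = 0; i < numRows; i++) cudaFree(h_rows[i]);
	cudaFree(d_array);
	free(h_rows);
	return 0;
}

Each copy to or from a row has to go through the staged host copy of the row pointers, since the device-side table can't be dereferenced on the host.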

You might be able to turn these into something that suits your needs.
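
For the actual timing comparison, the flattened version replaces the second pointer load with index arithmetic on a single contiguous allocation. A sketch of the equivalent access pattern, using the same illustrative sizes:

#include <stdio.h>
#include <cuda_runtime.h>

// Same logical 2D access, but through one allocation and
// row-major index arithmetic instead of a second pointer load.
__global__ void fill2dFlat(float *array, int numCols)
{
	int i = blockIdx.x, j = threadIdx.x;
	array[i * numCols + j] = (float)(i * numCols + j);
}

int main()
{
	const int numRows = 4, numCols = 8;   /* illustrative sizes */

	float *d_array;
	cudaMalloc((void **)&d_array, numRows * numCols * sizeof(float));

	fill2dFlat<<<numRows, numCols>>>(d_array, numCols);
	cudaDeviceSynchronize();

	/* One contiguous copy brings the whole matrix back. */
	float h_array[numRows * numCols];
	cudaMemcpy(h_array, d_array, sizeof(h_array), cudaMemcpyDeviceToHost);

	for (int i = 0; i < numRows; i++) {
		for (int j = 0; j < numCols; j++) printf("%5.1f ", h_array[i * numCols + j]);
		printf("\n");
	}

	cudaFree(d_array);
	return 0;
}

Timing both kernels over many iterations (for example with cudaEvent_t timers) should isolate the cost of the extra dependent global-memory load in the double-pointer version.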