I know that allocating a 2D array on the GPU so that it can be addressed in the form array[i][j] isn’t fast due to the double pointers, but how do you actually allocate an array in such a manner? Searches on the forums have given the answer “flatten it to a 1D array”, but my supervisor wants to see the difference in access time between a double-pointered array and the flattened version. Does anyone have a simple example of how to allocate and copy to/from a 2D array in this manner? Thanks in advance.
This is a pretty trivial example, and I wouldn’t recommend it for benchmarking, but it at least shows one way to do it:
#include <assert.h>
#include <stdio.h>
#include <cuda_runtime.h>
#ifndef gpuAssert
#include <stdio.h>
#include <stdlib.h>
/* Checks the integer status returned by a CUDA runtime call and aborts on
   failure.  The argument is captured in a local so it is evaluated exactly
   once — the original form re-evaluated `condition` inside fprintf(), which
   would re-run the CUDA call on the failure path.  Wrapped in do/while(0)
   so the macro behaves as a single statement after `if`/`else`. */
#define gpuAssert( condition ) do { \
    int gpuAssert_err_ = (condition); \
    if( gpuAssert_err_ != 0 ) { \
        fprintf( stderr, "\n FAILURE %d in %s, line %d\n", gpuAssert_err_, __FILE__, __LINE__ ); \
        exit( 1 ); \
    } \
} while(0)
#endif
/* Number of float elements per array; also the thread count of the
   single-block kernel launch in main(). */
#define _DSIZE (32)
/* Device-resident table of three device pointers (the "double pointer"
   access pattern the question asks about).  main() fills the slots via
   cudaMemcpyToSymbol before launching testkernel2. */
__device__ float * ad[3];
/* Element-wise accumulate through the device-global pointer table:
   d[i] += ad[0][i] + ad[1][i] + ad[2][i].
   No bounds guard — the host must launch exactly one thread per element
   of d (main() uses <<<1, _DSIZE>>> with _DSIZE-element buffers). */
__global__ void testkernel2(float *d)
{
    const unsigned int gid = threadIdx.x + blockDim.x * blockIdx.x;
    const float sum = ad[0][gid] + ad[1][gid] + ad[2][gid];
    d[gid] += sum;
}
/* Demo driver: allocates three input arrays plus one in/out array, wires
   the device-side pointer table `ad` to the device buffers, launches one
   block of _DSIZE threads, and prints the result (expected: i + 15.0). */
int main()
{
    float *a, *b, *c, *d;     /* host buffers */
    float *_a, *_b, *_c, *_d; /* device buffers */

    /* Allocate host buffers first, THEN assert.  Putting the malloc inside
       assert() — as the original did — silently removes the allocation when
       the program is compiled with -DNDEBUG. */
    a = (float *)malloc(_DSIZE * sizeof(float));
    b = (float *)malloc(_DSIZE * sizeof(float));
    c = (float *)malloc(_DSIZE * sizeof(float));
    d = (float *)malloc(_DSIZE * sizeof(float));
    assert(a != NULL && b != NULL && c != NULL && d != NULL);

    gpuAssert( cudaMalloc( (void**)&_a, _DSIZE * sizeof(float) ) );
    gpuAssert( cudaMalloc( (void**)&_b, _DSIZE * sizeof(float) ) );
    gpuAssert( cudaMalloc( (void**)&_c, _DSIZE * sizeof(float) ) );
    gpuAssert( cudaMalloc( (void**)&_d, _DSIZE * sizeof(float) ) );

    /* d holds the running result; a/b/c hold the addends. */
    for(int i = 0; i < _DSIZE; i++) {
        a[i] = 3.f;
        b[i] = 5.f;
        c[i] = 7.f;
        d[i] = (float)i;
    }

    gpuAssert( cudaMemcpy(_a, a, _DSIZE * sizeof(float), cudaMemcpyHostToDevice) );
    gpuAssert( cudaMemcpy(_b, b, _DSIZE * sizeof(float), cudaMemcpyHostToDevice) );
    gpuAssert( cudaMemcpy(_c, c, _DSIZE * sizeof(float), cudaMemcpyHostToDevice) );
    gpuAssert( cudaMemcpy(_d, d, _DSIZE * sizeof(float), cudaMemcpyHostToDevice) );

    /* Point the device-side table ad[0..2] at the three device buffers.
       The 4th argument of cudaMemcpyToSymbol is the byte offset into the
       symbol, so each pointer lands in its own slot. */
    gpuAssert( cudaMemcpyToSymbol( ad, &_a, sizeof(float *), sizeof(float *) * (size_t)0, cudaMemcpyHostToDevice) );
    gpuAssert( cudaMemcpyToSymbol( ad, &_b, sizeof(float *), sizeof(float *) * (size_t)1, cudaMemcpyHostToDevice) );
    gpuAssert( cudaMemcpyToSymbol( ad, &_c, sizeof(float *), sizeof(float *) * (size_t)2, cudaMemcpyHostToDevice) );

    /* One block of _DSIZE threads: one thread per element. */
    testkernel2 <<< 1, _DSIZE >>> (_d);
    gpuAssert( cudaGetLastError() );      /* catches launch-configuration errors */
    gpuAssert( cudaDeviceSynchronize() ); /* cudaThreadSynchronize() is deprecated */

    gpuAssert( cudaMemcpy(d, _d, _DSIZE * sizeof(float), cudaMemcpyDeviceToHost) );
    for(int i = 0; i < _DSIZE; i++) {
        fprintf(stdout, "%2d %6.1f\n", i, d[i]); /* expected: i + 15.0 */
    }

    cudaFree(_a);
    cudaFree(_b);
    cudaFree(_c);
    cudaFree(_d);
    free(a);
    free(b);
    free(c);
    free(d);

    return cudaDeviceReset(); /* cudaThreadExit() is deprecated */
}
You might be able to turn it into something that suits your needs.