Hello,
The nested loop problem I’m facing is still not solved (please refer to one of my previous posts for more information).
To check if this problem is related to my GFX board, I wrote a self-contained sample code which is shown below. It runs my test kernel and checks if the kernel modifies the variable ‘r’. If ‘r’ is modified “OK” is printed on the terminal (as this is what I intend to do), if ‘r’ is left unmodified the code prints “ERROR”.
So I would be very glad if someone would compile and run this code on his machine to check the result. In my case (ubuntu 10.04 64bit, CUDA v3.2, NVIDIA Quadro FX 570) the result is “ERROR”. If I outcomment the inner if-block in the kernel, I get “OK”. I do not understand this behaviour.
So please help me External Image
Many thanks in advance
enuhtac
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include <float.h>
#include <stdio.h>
#include <cuda.h>
/*
* CUDA error handling macro
*/
#define CUDA_CALL( call ) \
{ \
cudaError_t err; \
err = (call); \
if( err != cudaSuccess ) \
{ \
fprintf( stderr, "error in CUDA call in file '%s', line: %d\n" \
"%s\nerror %d: %s\nterminating!\n", \
__FILE__, __LINE__, #call, \
err, cudaGetErrorString( err ) ); \
exit( ~0 ); \
} \
}
/*
* macro to convert a coordinate triple into a 1d-index for memory acces
*/
#define IDX(i,j,k,n) (((k) * (n).y + (j)) * (n).x + (i))
/*
* my own kind of pitched pointer struct to ease access on the device side.
* The following functions are "member function" of Buffer to deal with it.
*/
struct Buffer
{
float *ptr;
int2 pitchedSize;
int origXSize;
};
static struct Buffer bufferFromPitchedPtr(const struct cudaPitchedPtr *cudaPtr)
{
Buffer result;
result.ptr = (float *) cudaPtr->ptr;
result.pitchedSize.x = cudaPtr->pitch / sizeof( float );
result.pitchedSize.y = cudaPtr->ysize;
result.origXSize = cudaPtr->xsize / sizeof( float );
return result;
}
static struct cudaPitchedPtr bufferToPitchedPtr( const struct Buffer *b )
{
return make_cudaPitchedPtr( b->ptr,
b->pitchedSize.x * sizeof( float ),
b->origXSize * sizeof( float ), b->pitchedSize.y );
}
static void copyToBuffer( struct Buffer *dst, const float *src, int3 size )
{
cudaMemcpy3DParms parms = { 0 };
parms.srcPtr = make_cudaPitchedPtr( (void *) src,
size.x * sizeof( float ), size.x * sizeof( float ), size.y );
parms.dstPtr = bufferToPitchedPtr( dst );
parms.extent = make_cudaExtent( size.x * sizeof( float ), size.y, size.z );
parms.kind = cudaMemcpyHostToDevice;
CUDA_CALL( cudaMemcpy3D( &parms ) );
}
static void copyFromBuffer( float *dst, const struct Buffer *src, int3 size )
{
cudaMemcpy3DParms parms = { 0 };
parms.srcPtr = bufferToPitchedPtr( src );
parms.dstPtr = make_cudaPitchedPtr( dst,
size.x * sizeof( float ), size.x * sizeof( float ), size.y );
parms.extent = make_cudaExtent(
size.x * sizeof( float ), size.y, size.z );
parms.kind = cudaMemcpyDeviceToHost;
CUDA_CALL( cudaMemcpy3D( &parms ) );
}
static struct Buffer allocBuffer( int3 size )
{
cudaPitchedPtr tmp;
CUDA_CALL( cudaMalloc3D( &tmp, make_cudaExtent(
size.x * sizeof( float ), size.y, size.z ) ) )
return bufferFromPitchedPtr( &tmp );
}
static void freeBuffer( struct Buffer *buf )
{
CUDA_CALL( cudaFree( buf->ptr ) )
}
__global__ void loopTest( Buffer r, Buffer x, int dim,
int3 nc, int3 ng, dim3 realGridDim )
{
int i, j, k, bz;
bz = blockIdx.y / realGridDim.y;
for( k = blockDim.z * bz + threadIdx.z;
k < nc.z - ng.z; k += blockDim.z * realGridDim.z )
for( j = blockDim.y * (blockIdx.y - realGridDim.y*bz) + threadIdx.y;
j < nc.y - ng.y; j += blockDim.y * realGridDim.y )
for( i = blockDim.x * blockIdx.x + threadIdx.x;
i < nc.x - ng.x; i += blockDim.x * gridDim.x )
if( (i >= ng.x) && (j >= ng.y) && (k >= ng.z) )
{
/*
* do some silly stuff inside kernel. This kernel is
* never executed on CUDA v3.2 and NVIDIA Quadro FX 570
* GFX board.
* Outcomment the if-block to have this kernel executed.
*/
int nijk = IDX( i, j, k, r.pitchedSize ),
nijk_n = 0;
float tmp = 0.0f;
if( dim >= 2 )
{
float tmp2 = 0.0f;
nijk_n = nijk;
tmp2 += x.ptr[nijk_n] - x.ptr[nijk];
tmp += tmp2;
}
r.ptr[nijk] = nijk_n;
}
}
int main()
{
/*
* define size of real 3D grid and CUDA thread layout
*/
int3 nCells = { 42, 42, 42 },
nGhosts = { 1, 1, 1 },
blockDim = { 32, 4, 4 },
gridDim = { 2, 11, 11 };
dim3 threads( blockDim.x, blockDim.y, blockDim.z ),
realBlocks( gridDim.x, gridDim.y, gridDim.z ),
cudaBlocks( gridDim.x, gridDim.y * gridDim.z, 1 );
float *r, *a;
struct Buffer rD, aD;
size_t i, size;
int changed;
/*
* allocate host memory
*/
size = nCells.x * nCells.y * nCells.z;
r = (float *) malloc( size * sizeof( float ) );
a = (float *) malloc( size * sizeof( float ) );
assert( r );
assert( a );
/*
* allocate device memory
*/
rD = allocBuffer( nCells );
aD = allocBuffer( nCells );
/*
* fill r and a for testing purpose
*/
for( i = 0; i < size; ++i )
{
r[i] = 0.42;
a[i] = 2.0;
}
/*
* copy r and a from host to device
*/
copyToBuffer( &rD, r, nCells );
copyToBuffer( &aD, a, nCells );
/*
* call kernel
*/
loopTest<<< cudaBlocks, threads >>>(rD, aD, 3, nCells, nGhosts, realBlocks);
CUDA_CALL( cudaThreadSynchronize() )
/*
* copy resulting r from device to host
*/
copyFromBuffer( r, &rD, nCells );
/*
* check if r was changed
*/
changed = 0;
for( i = 0; i < size; ++i )
if( fabs( r[i]-0.42 ) > FLT_EPSILON )
changed = 1;
if( changed )
puts( "OK - r was changed!\n" );
else
puts( "ERROR -r was not changed!\n" );
/*
* cleanup
*/
free( a );
free( r );
freeBuffer( &aD );
freeBuffer( &rD );
return 0;
}