Please help me and run this code the question is if it just fails on my GFX board

enuhtac · March 2, 2011, 11:10pm

Hello,

The nested loop problem I’m facing is still not solved (please refer to one of my previous posts for more information).

To check if this problem is related to my GFX board, I wrote a self-contained sample code which is shown below. It runs my test kernel and checks if the kernel modifies the variable ‘r’. If ‘r’ is modified “OK” is printed on the terminal (as this is what I intend to do), if ‘r’ is left unmodified the code prints “ERROR”.

So I would be very glad if someone would compile and run this code on his machine to check the result. In my case (ubuntu 10.04 64bit, CUDA v3.2, NVIDIA Quadro FX 570) the result is “ERROR”. If I outcomment the inner if-block in the kernel, I get “OK”. I do not understand this behaviour.

So please help me External Image

Many thanks in advance

enuhtac

#include <stdlib.h>

#include <assert.h>

#include <math.h>

#include <float.h>

#include <stdio.h>

#include <cuda.h>

/*

 * CUDA error handling macro

 */

#define CUDA_CALL( call ) \

        { \

            cudaError_t err; \

            err = (call); \

            if( err != cudaSuccess ) \

            { \

                fprintf( stderr, "error in CUDA call in file '%s', line: %d\n" \

                        "%s\nerror %d: %s\nterminating!\n", \

                        __FILE__, __LINE__, #call, \

                        err, cudaGetErrorString( err ) ); \

                exit( ~0 ); \

            } \

        }

/*

 * macro to convert a coordinate triple into a 1d-index for memory acces

 */

#define IDX(i,j,k,n) (((k) * (n).y + (j)) * (n).x + (i))

/*

 * my own kind of pitched pointer struct to ease access on the device side.

 * The following functions are "member function" of Buffer to deal with it.

 */

struct Buffer

{

    float   *ptr;

    int2    pitchedSize;

    int     origXSize;

};

static struct Buffer bufferFromPitchedPtr(const struct cudaPitchedPtr *cudaPtr)

{

    Buffer result;

result.ptr = (float *) cudaPtr->ptr;

    result.pitchedSize.x = cudaPtr->pitch / sizeof( float );

    result.pitchedSize.y = cudaPtr->ysize;

    result.origXSize = cudaPtr->xsize / sizeof( float );

return result;

}

static struct cudaPitchedPtr bufferToPitchedPtr( const struct Buffer *b )

{

    return make_cudaPitchedPtr( b->ptr,

            b->pitchedSize.x * sizeof( float ),

            b->origXSize * sizeof( float ), b->pitchedSize.y );

}

static void copyToBuffer( struct Buffer *dst, const float *src, int3 size )

{

    cudaMemcpy3DParms parms = { 0 };

parms.srcPtr = make_cudaPitchedPtr( (void *) src,

            size.x * sizeof( float ), size.x * sizeof( float ), size.y );

    parms.dstPtr = bufferToPitchedPtr( dst );

    parms.extent = make_cudaExtent( size.x * sizeof( float ), size.y, size.z );

    parms.kind = cudaMemcpyHostToDevice;

CUDA_CALL( cudaMemcpy3D( &parms ) );

}

static void copyFromBuffer( float *dst, const struct Buffer *src, int3 size )

{

    cudaMemcpy3DParms parms = { 0 };

parms.srcPtr = bufferToPitchedPtr( src );

    parms.dstPtr = make_cudaPitchedPtr( dst,

            size.x * sizeof( float ), size.x * sizeof( float ), size.y );

    parms.extent = make_cudaExtent(

            size.x * sizeof( float ), size.y, size.z );

    parms.kind = cudaMemcpyDeviceToHost;

CUDA_CALL( cudaMemcpy3D( &parms ) );

}

static struct Buffer allocBuffer( int3 size )

{

    cudaPitchedPtr tmp;

CUDA_CALL( cudaMalloc3D( &tmp, make_cudaExtent(

            size.x * sizeof( float ), size.y, size.z ) ) )

return bufferFromPitchedPtr( &tmp );

}

static void freeBuffer( struct Buffer *buf )

{

    CUDA_CALL( cudaFree( buf->ptr ) )

}

__global__ void loopTest(   Buffer r, Buffer x, int dim,

                            int3 nc, int3 ng, dim3 realGridDim )

{

    int i, j, k, bz;

bz = blockIdx.y / realGridDim.y;

for( k = blockDim.z * bz + threadIdx.z;

            k < nc.z - ng.z; k += blockDim.z * realGridDim.z )

        for( j = blockDim.y * (blockIdx.y - realGridDim.y*bz) + threadIdx.y;

                j < nc.y - ng.y; j += blockDim.y * realGridDim.y )

            for( i = blockDim.x * blockIdx.x + threadIdx.x;

                    i < nc.x - ng.x; i += blockDim.x * gridDim.x )

                if( (i >= ng.x) && (j >= ng.y) && (k >= ng.z) )

                {

                    /*

                     * do some silly stuff inside kernel. This kernel is

                     * never executed on CUDA v3.2 and NVIDIA Quadro FX 570

                     * GFX board.

                     * Outcomment the if-block to have this kernel executed.

                     */

                    int     nijk = IDX( i, j, k, r.pitchedSize ),

                            nijk_n = 0;

                    float   tmp = 0.0f;

if( dim >= 2 )

                    {

                        float tmp2 = 0.0f;

nijk_n = nijk;

                        tmp2 += x.ptr[nijk_n] - x.ptr[nijk];

tmp += tmp2;

}

r.ptr[nijk] = nijk_n;

                }

}

int main()

{

    /*

     * define size of real 3D grid and CUDA thread layout

     */

    int3    nCells = { 42, 42, 42 },

            nGhosts = { 1, 1, 1 },

            blockDim = { 32, 4, 4 },

            gridDim = { 2, 11, 11 };

    dim3    threads( blockDim.x, blockDim.y, blockDim.z ),

            realBlocks( gridDim.x, gridDim.y, gridDim.z ),

            cudaBlocks( gridDim.x, gridDim.y * gridDim.z, 1 );

float           *r, *a;

    struct Buffer   rD, aD;

    size_t          i, size;

    int             changed;

/*

     * allocate host memory

     */

    size = nCells.x * nCells.y * nCells.z;

    r = (float *) malloc( size * sizeof( float ) );

    a = (float *) malloc( size * sizeof( float ) );

    assert( r );

    assert( a );

/*

     * allocate device memory

     */

    rD = allocBuffer( nCells );

    aD = allocBuffer( nCells );

/*

     * fill r and a for testing purpose

     */

    for( i = 0; i < size; ++i )

    {

        r[i] = 0.42;

        a[i] = 2.0;

    }

/*

     * copy r and a from host to device

     */

    copyToBuffer( &rD, r, nCells );

    copyToBuffer( &aD, a, nCells );

/*

     * call kernel

     */

    loopTest<<< cudaBlocks, threads >>>(rD, aD, 3, nCells, nGhosts, realBlocks);

    CUDA_CALL( cudaThreadSynchronize() )

/*

     * copy resulting r from device to host

     */

    copyFromBuffer( r, &rD, nCells );

/*

     * check if r was changed

     */

    changed = 0;

    for( i = 0; i < size; ++i )

        if( fabs( r[i]-0.42 ) > FLT_EPSILON )

            changed = 1;

    if( changed )

        puts( "OK - r was changed!\n" );

    else

        puts( "ERROR -r was not changed!\n" );

/*

     * cleanup

     */

    free( a );

    free( r );

    freeBuffer( &aD );

    freeBuffer( &rD );

return 0;

}

sanstorm · March 3, 2011, 10:56am

Hi! I copied your code into my CUDA template, compiled fine and got the OK message.

I run Windows 7, GeForce GT 415M. Let me know if you want any more details!’

EDIT:

I guess I could include this:

C:\CUDASDK\C\bin\win64\Release\deviceQuery.exe Starting…

CUDA Device Query (Runtime API) version (CUDART static linking)

There is 1 device supporting CUDA

Device 0: “GeForce GT 415M”
CUDA Driver Version: 3.10
CUDA Runtime Version: 3.10
CUDA Capability Major revision number: 2
CUDA Capability Minor revision number: 1
Total amount of global memory: 1041825792 bytes
Number of multiprocessors: 1
Number of cores: 32
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per block: 1024
Maximum sizes of each dimension of a block: 1024 x 1024 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 1
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Clock rate: 1.00 GHz
Concurrent copy and execution: Yes
Run time limit on kernels: Yes
Integrated: No
Support host page-locked memory mapping: Yes
Compute mode: Default (multiple host threads can use this device simultaneously)
Concurrent kernel execution: Yes
Device has ECC support enabled: No

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 3.10, CUDA Runtime Version = 3.10, NumDevs = 1, Device = GeForce GT 415M

PASSED

Press to Quit…

enuhtac · March 5, 2011, 12:59am

Hi sanstorm,

thank you very much for your help External Image

I suppose you got the OK message without changing the code (i.e. without outcommenting the inner if in the kernel).

But it’s still not clear what the reason is that this code works for you but not for me. It could be my graphics board (as I suspected) but it could also be the CUDA version: I’m using Linux version 3.2, you’re using Windows version 3.1. I defintely should try version 4.0 and see what’s happens. Maybe it’s time for a bug report…

enuhtac

sanstorm · March 6, 2011, 7:45pm

Hi! I cannot help you as I am a total newbie, but I did not change the code at all, just copied and pasted!

ceearem · March 7, 2011, 5:05pm

Hi

compiling your code with “nvcc test.cu” and running it on various machines (all with Cuda 3.2) worked and gave always OK.

Systems:
GTX280 CentOS 5.5 (RedHat)
GTX470 CentOS 5.5 (RedHat)
GTX470 Ubuntu 10.04
GT220 Ubuntu 10.04

cheers
Ceearem

Topic		Replies	Views
problems with nested loop, again CUDA Programming and Performance	4	6521	March 15, 2011
Using unified memory causes system crash CUDA Programming and Performance	28	5855	February 4, 2019
strange behaviour CUDA Programming and Performance	6	2544	February 1, 2010
dealing with 3d grids for cfd simulations strange behaviour of nested loops in CUDA CUDA Programming and Performance	1	1770	February 15, 2011
Cuda code performance CUDA Programming and Performance	14	3137	December 16, 2014
Annoying problems with memory and/or syntax CUDA Programming and Performance	19	4769	April 8, 2008
Emulator works but G80 doesn't CUDA Programming and Performance	11	5432	July 3, 2007
code that crashes unpredictably CUDA Programming and Performance	15	12635	April 28, 2010
Urgent help with threads please! CUDA Programming and Performance	21	10786	March 6, 2008
Error 719 (failure to launch) for JCUDA and PyCUDA; How to run GPU consecutive times for 'large' data blocks CUDA Programming and Performance	0	2329	December 13, 2016

Please help me and run this code the question is if it just fails on my GFX board

Press to Quit…

Related topics