Help with driver API What am I doing wrong?

I have a generic ptx file and I am trying to get it to run using the driver API. Though, whenever I get to cuMemcpyDtoH I get error 700. I assume that it is from my kernel invocation before it, but I just can figure out why…

Any help would be greatly appreciated

[codebox]

#define PTX

int main()

{

//===================================

// CUDA Driver API vars

//===================================

CUcontext   hContext   = 0;

CUdevice    hDevice    = 0;

CUmodule    hModuleAdd = 0;

CUstream    hStream    = 0;

CUfunction  hKernelAdd = 0;

//===================================

// Function input data

//===================================

CUdeviceptr   dA      = 0;// Buffer A - GPU

float        *hA      = 0;// Buffer A - Host

CUdeviceptr   dB      = 0;// Buffer B - GPU

float        *hB      = 0;// Buffer B - Host

CUdeviceptr   dC      = 0;// Buffer C - GPU

float        *hC      = 0;// Buffer C - Host

int           N       = NUM_ELEMS;

string        s       = "";

//===================================

// Misc. vars

//===================================

int  iDevice           = 0;

int  paramOfs          = 0;

bool pass              = true;

// Initialize the CUDA Driver API and get the number of available devices

cutilDrvSafeCall( cuInit( 0 ) );

cutilDrvSafeCall( cuDeviceGetCount( &iDevice ) );

assert( iDevice > 0 );

// Load the first device we find and create a context for it

cutilDrvSafeCall( cuDeviceGet( &hDevice, 0 ) );

cutilDrvSafeCall( cuCtxCreate( &hContext, CU_CTX_BLOCKING_SYNC, hDevice ) );

// Create a stream

cutilDrvSafeCall( cuStreamCreate( &hStream, 0 ) );

//===================================

// CUDA Set up Complete

//===================================

// Load the modules from disk

if( FileExists( VADDX_PATH ) )

#ifdef PTX

if( !ReadFile( s, VADDX_PATH ) ){

        cout << "Error loading file!" << endl;

        return EXIT_FAILURE;}

    else{

		// in this branch we use compilation with parameters

	    const unsigned int jitNumOptions = 3;

	    CUjit_option *jitOptions         = new CUjit_option[jitNumOptions];

	    void **jitOptVals                = new void*[jitNumOptions];

	    // set up size of compilation log buffer

	    jitOptions[0]                    = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;

	    int jitLogBufferSize             = 1024;

	    jitOptVals[0]                    = (void *)jitLogBufferSize;

	    // set up pointer to the compilation log buffer

	    jitOptions[1]                    = CU_JIT_INFO_LOG_BUFFER;

	    char *jitLogBuffer               = new char[jitLogBufferSize];

	    jitOptVals[1]                    = jitLogBuffer;

	    // set up pointer to set the Maximum # of registers for a particular kernel

	    jitOptions[2]                    = CU_JIT_MAX_REGISTERS;

	    int jitRegCount                  = 32;

	    jitOptVals[2]                    = (void *)jitRegCount;

cout << s.c_str() << endl;

	    cutilDrvSafeCall( cuModuleLoadDataEx( &hModuleAdd, s.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals ) );}

#else

    cutilDrvSafeCall( cuModuleLoad( &hModuleAdd, VADDX_PATH ) );

#endif

else

    cout << "Could not load vaddx module located at " << VADDX_PATH << "!" << endl;

cout << "Modules have been loaded from disk!" << endl << endl;

// Load the kernel entry point

cutilDrvSafeCall( cuModuleGetFunction( &hKernelAdd, hModuleAdd, VADDX_NAME ) );

cout << "Kernel entry points have been loaded!" << endl << endl;

//===================================

// Allocate and Initialize input data

//===================================

hA = ( float* )malloc( NUM_ELEMS*sizeof( float ) );

hB = ( float* )malloc( NUM_ELEMS*sizeof( float ) );

hC = ( float* )malloc( NUM_ELEMS*sizeof( float ) );

assert( hA != NULL );

assert( hB != NULL );

assert( hC != NULL );

int result_A = Ramp( hA, NUM_ELEMS, RAMP_INC );

int result_B = Ramp( hB, NUM_ELEMS, RAMP_DEC );

if( result_A == EXIT_FAILURE ){

    cout << "Error initializing A!" << endl;

    return EXIT_FAILURE;}

if( result_B == EXIT_FAILURE ){

    cout << "Error initializing B!" << endl;

    return EXIT_FAILURE;}

cout << "Input data has been initialized!" << endl << endl;

//===================================

// Allocate device memory and copy

// from host

//===================================

cutilDrvSafeCall( cuMemAlloc( &dA, NUM_ELEMS*sizeof(float) ) );

cutilDrvSafeCall( cuMemAlloc( &dB, NUM_ELEMS*sizeof(float) ) );

cutilDrvSafeCall( cuMemAlloc( &dC, NUM_ELEMS*sizeof(float) ) );

cout << "GPU memory has been allocated!" << endl << endl;

cutilDrvSafeCall( cuMemcpyHtoD( dA, hA, NUM_ELEMS*sizeof(float) ) );

cutilDrvSafeCall( cuMemcpyHtoD( dB, hB, NUM_ELEMS*sizeof(float) ) );

cutilDrvSafeCall( cuMemcpyHtoD( dC, hC, NUM_ELEMS*sizeof(float) ) );

cout << "GPU memory has been copied to device!" << endl << endl;

cutilDrvSafeCall( cuParamSetv( hKernelAdd, paramOfs, &dA, sizeof( void* ) ) ); paramOfs += sizeof( void* );

cutilDrvSafeCall( cuParamSetv( hKernelAdd, paramOfs, &dB, sizeof( void* ) ) ); paramOfs += sizeof( void* );// paramOfs = align( paramOfs + sizeof( void* ), __alignof( void* ) );

cutilDrvSafeCall( cuParamSetv( hKernelAdd, paramOfs, &dC, sizeof( void* ) ) ); paramOfs += sizeof( void* );// paramOfs = align( paramOfs + sizeof( void* ), __alignof( void* ) );

cutilDrvSafeCall( cuParamSeti( hKernelAdd, paramOfs,  N ) );                   paramOfs += sizeof( N     );// paramOfs = align( paramOfs + sizeof( void* ), __alignof( void* ) );

cutilDrvSafeCall( cuParamSetSize( hKernelAdd, paramOfs ) );

cutilDrvSafeCall( cuFuncSetBlockShape( hKernelAdd, NUM_THREADS, 1, 1 ) );

cout << "Parameters have been set!" << endl << endl;

//===================================

// Run the function

//===================================

cutilDrvSafeCall( cuLaunchGrid( hKernelAdd, NUM_BLOCKS, 1 ) );

cout << "Function has completed!" << endl << endl;

//===================================

// Retrieve the results

//===================================

cutilDrvSafeCall( cuMemcpyDtoH( hC, dC, NUM_ELEMS*sizeof(hC) ) );

cout << "Data has been retrieved from device!" << endl << endl;

//===================================

// Verify the results

//===================================

cout << "Verifying results ..." << endl << endl;

for (int i = 0; i < NUM_ELEMS; i += 1)

{

    if( abs( hC[i] - (hA[i] + hB[i] ) ) > DELTA ){

        cout << "Error -- [" << i << "] : hA[i] + hB[i] (" << hA[i] + hB[i] << ") != hC[i] (" << hC[i] << ")" << endl;

        pass = false;}

}

if( !pass )

    return EXIT_FAILURE;

return EXIT_SUCCESS;

}[/codebox]