I have a generic PTX file and I am trying to get it to run using the driver API. However, whenever I get to cuMemcpyDtoH I get error 700. I assume that it comes from the kernel invocation before it, but I just can't figure out why…
Any help would be greatly appreciated.
[codebox]
#define PTX
int main()
{
//===================================
// CUDA Driver API vars
//===================================
CUcontext hContext = 0;
CUdevice hDevice = 0;
CUmodule hModuleAdd = 0;
CUstream hStream = 0;
CUfunction hKernelAdd = 0;
//===================================
// Function input data
//===================================
CUdeviceptr dA = 0;// Buffer A - GPU
float *hA = 0;// Buffer A - Host
CUdeviceptr dB = 0;// Buffer B - GPU
float *hB = 0;// Buffer B - Host
CUdeviceptr dC = 0;// Buffer C - GPU
float *hC = 0;// Buffer C - Host
int N = NUM_ELEMS;
string s = "";
//===================================
// Misc. vars
//===================================
int iDevice = 0;
int paramOfs = 0;
bool pass = true;
// Initialize the CUDA Driver API and get the number of available devices
cutilDrvSafeCall( cuInit( 0 ) );
cutilDrvSafeCall( cuDeviceGetCount( &iDevice ) );
assert( iDevice > 0 );
// Load the first device we find and create a context for it
cutilDrvSafeCall( cuDeviceGet( &hDevice, 0 ) );
cutilDrvSafeCall( cuCtxCreate( &hContext, CU_CTX_BLOCKING_SYNC, hDevice ) );
// Create a stream
cutilDrvSafeCall( cuStreamCreate( &hStream, 0 ) );
//===================================
// CUDA Set up Complete
//===================================
// Load the modules from disk
if( FileExists( VADDX_PATH ) )
#ifdef PTX
if( !ReadFile( s, VADDX_PATH ) ){
cout << "Error loading file!" << endl;
return EXIT_FAILURE;}
else{
// in this branch we use compilation with parameters
const unsigned int jitNumOptions = 3;
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void*[jitNumOptions];
// set up size of compilation log buffer
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
int jitLogBufferSize = 1024;
jitOptVals[0] = (void *)jitLogBufferSize;
// set up pointer to the compilation log buffer
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
char *jitLogBuffer = new char[jitLogBufferSize];
jitOptVals[1] = jitLogBuffer;
// set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[2] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 32;
jitOptVals[2] = (void *)jitRegCount;
cout << s.c_str() << endl;
cutilDrvSafeCall( cuModuleLoadDataEx( &hModuleAdd, s.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals ) );}
cutilDrvSafeCall( cuModuleLoad( &hModuleAdd, VADDX_PATH ) );
else
cout << "Could not load vaddx module located at " << VADDX_PATH << "!" << endl;
cout << "Modules have been loaded from disk!" << endl << endl;
// Load the kernel entry point
cutilDrvSafeCall( cuModuleGetFunction( &hKernelAdd, hModuleAdd, VADDX_NAME ) );
cout << "Kernel entry points have been loaded!" << endl << endl;
//===================================
// Allocate and Initialize input data
//===================================
hA = ( float* )malloc( NUM_ELEMS*sizeof( float ) );
hB = ( float* )malloc( NUM_ELEMS*sizeof( float ) );
hC = ( float* )malloc( NUM_ELEMS*sizeof( float ) );
assert( hA != NULL );
assert( hB != NULL );
assert( hC != NULL );
int result_A = Ramp( hA, NUM_ELEMS, RAMP_INC );
int result_B = Ramp( hB, NUM_ELEMS, RAMP_DEC );
if( result_A == EXIT_FAILURE ){
cout << "Error initializing A!" << endl;
return EXIT_FAILURE;}
if( result_B == EXIT_FAILURE ){
cout << "Error initializing B!" << endl;
return EXIT_FAILURE;}
cout << "Input data has been initialized!" << endl << endl;
//===================================
// Allocate device memory and copy
// from host
//===================================
cutilDrvSafeCall( cuMemAlloc( &dA, NUM_ELEMS*sizeof(float) ) );
cutilDrvSafeCall( cuMemAlloc( &dB, NUM_ELEMS*sizeof(float) ) );
cutilDrvSafeCall( cuMemAlloc( &dC, NUM_ELEMS*sizeof(float) ) );
cout << "GPU memory has been allocated!" << endl << endl;
cutilDrvSafeCall( cuMemcpyHtoD( dA, hA, NUM_ELEMS*sizeof(float) ) );
cutilDrvSafeCall( cuMemcpyHtoD( dB, hB, NUM_ELEMS*sizeof(float) ) );
cutilDrvSafeCall( cuMemcpyHtoD( dC, hC, NUM_ELEMS*sizeof(float) ) );
cout << "GPU memory has been copied to device!" << endl << endl;
cutilDrvSafeCall( cuParamSetv( hKernelAdd, paramOfs, &dA, sizeof( void* ) ) ); paramOfs += sizeof( void* );
cutilDrvSafeCall( cuParamSetv( hKernelAdd, paramOfs, &dB, sizeof( void* ) ) ); paramOfs += sizeof( void* );// paramOfs = align( paramOfs + sizeof( void* ), __alignof( void* ) );
cutilDrvSafeCall( cuParamSetv( hKernelAdd, paramOfs, &dC, sizeof( void* ) ) ); paramOfs += sizeof( void* );// paramOfs = align( paramOfs + sizeof( void* ), __alignof( void* ) );
cutilDrvSafeCall( cuParamSeti( hKernelAdd, paramOfs, N ) ); paramOfs += sizeof( N );// paramOfs = align( paramOfs + sizeof( void* ), __alignof( void* ) );
cutilDrvSafeCall( cuParamSetSize( hKernelAdd, paramOfs ) );
cutilDrvSafeCall( cuFuncSetBlockShape( hKernelAdd, NUM_THREADS, 1, 1 ) );
cout << "Parameters have been set!" << endl << endl;
//===================================
// Run the function
//===================================
cutilDrvSafeCall( cuLaunchGrid( hKernelAdd, NUM_BLOCKS, 1 ) );
cout << "Function has completed!" << endl << endl;
//===================================
// Retrieve the results
//===================================
cutilDrvSafeCall( cuMemcpyDtoH( hC, dC, NUM_ELEMS*sizeof(hC) ) );
cout << "Data has been retrieved from device!" << endl << endl;
//===================================
// Verify the results
//===================================
cout << "Verifying results ..." << endl << endl;
for (int i = 0; i < NUM_ELEMS; i += 1)
{
if( abs( hC[i] - (hA[i] + hB[i] ) ) > DELTA ){
cout << "Error -- [" << i << "] : hA[i] + hB[i] (" << hA[i] + hB[i] << ") != hC[i] (" << hC[i] << ")" << endl;
pass = false;}
}
if( !pass )
return EXIT_FAILURE;
return EXIT_SUCCESS;
}[/codebox]