Many thanks to all three of you for your replies.
Mark: Well, I knew that if anyone replied (and that was far from certain), at least one of them would ask to see the source code (it’s what I would do). Since the full source includes a whole mess of stuff irrelevant to the problem, I created the following, which is essentially what the program does, stripped of all the superfluous code:
#include "stdafx.h"
#include "CUDA.h" // <== includes: #include "cuda_drvapi_dynlink_cuda.h"
extern "C" {
#include "cuda_drvapi_dynlink.c" // <== ..to dynamically link in the 4.0 CUDA API..
}
/***************
* CONFIGURATION *
***************/
#define PTXINFLOGSZE 1024 // <== Size, in bytes, of the buffer to hold informational log messages from the PTX assembly..
#define PTXERRLOGSZE 1024 // <== Size, in bytes, of the buffer to hold any error log messages from the PTX assembly....
/**************
* ENUMERATIONS *
**************/
enum // These are the named indices of the LoadOpts[] array, and as such, MUST exactly match the number and order of the elements of that array..
{
OptNdx_MAX_REGS,
// OptNdx_BLK_THREADS,
OptNdx_INFO_BUFF,
OptNdx_INFO_BUFF_SZE,
OptNdx_ERR_BUFF,
OptNdx_ERR_BUFF_SZE,
OptNdx_TARGET,
OptNdx_FALLBACK
};
/**************
* LOCAL MACROS *
**************/
#define Elems( array ) (sizeof(array)/sizeof(*(array)))
#define CALL_DRVR_API( func, call ) \
if ( (Rslt = (call)) != CUDA_SUCCESS ) GenError( Caller, ERR_CUDACALL, #func, Rslt )
#define FREE_ARRAY_PTR( A ) \
if ( A ) { delete [] A; A = NULL; }
/*****************
* LOCAL CONSTANTS *
*****************/
static CUjit_option LoadOpts[] =
{
CU_JIT_MAX_REGISTERS, // <== (unsigned int) input specifies the maximum number of registers per thread..
// CU_JIT_THREADS_PER_BLOCK, // <== (unsigned int) input specifies number of threads per block to target compilation for..
CU_JIT_INFO_LOG_BUFFER, // <== (char *) input is a pointer to a buffer in which to print any informational log messages from PTX assembly..
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, // <== (unsigned int) input is the size in bytes of the buffer; output is the number of bytes filled with messages..
CU_JIT_ERROR_LOG_BUFFER, // <== (char *) input is a pointer to a buffer in which to print any error log messages from PTX assembly..
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, // <== (unsigned int) input is the size in bytes of the buffer; output is the number of bytes filled with messages..
CU_JIT_TARGET, // <== (unsigned int for enumerated type CUjit_target_enum) input is the compilation target based on supplied CUjit_target_enum..
CU_JIT_FALLBACK_STRATEGY // <== (unsigned int for enumerated type CUjit_fallback_enum) chooses fallback strategy if matching cubin is not found..
};
static void *LoadOptVals[ Elems( LoadOpts ) ] =
{
(void *)32, // <== (unsigned int) The maximum number of registers per thread to use for this kernel..
// (void *)1024, // <== (unsigned int) The number of threads per block; output returns the number of threads the compiler actually targeted..
(void *)NULL, // <== (char *) Pointer to a dynamically allocated buffer which holds any informational log messages from the PTX assembly..
(void *)PTXINFLOGSZE, // <== (unsigned int) Size, in bytes, of the aforementioned informational log message buffer; output is the number of bytes filled with messages..
(void *)NULL, // <== (char *) Pointer to a dynamically allocated buffer which holds any error log messages from the PTX assembly..
(void *)PTXERRLOGSZE, // <== (unsigned int) Size, in bytes, of the aforementioned error log message buffer; output is the number of bytes filled with messages..
(void *)CU_TARGET_COMPUTE_21, // <== (unsigned int) Designates the intended 'compilation target'..
(void *)CU_PREFER_PTX // <== (unsigned int) Designates that the 'fallback strategy', if no matching cubin is found, is to assume PTX..
};
// This is the actual kernel I'm trying to load:
static char MiniKern[] =
".version 2.2\n"
".target sm_20\n"
".address_size 32\n"
".global .u32 Frst[512];\n"
".global .u32 Scnd[512];\n"
".global .u32 USum[512];\n"
".entry AddVec()\n"
"{\n"
" .reg .u32 ndx;\n"
" .reg .u32 Fptr;\n"
" .reg .u32 Sptr;\n"
" .reg .u32 Uptr;\n"
" .reg .u32 Fval;\n"
" .reg .u32 Sval;\n"
" .reg .u32 Uval;\n"
" cvta.global.u32 Fptr, Frst;\n"
" cvta.global.u32 Sptr, Scnd;\n"
" cvta.global.u32 Uptr, USum;\n"
" mad.lo.u32 ndx, %ctaid.x, %ntid.x, %tid.x;\n"
" shl.b32 ndx, ndx, 2;\n"
" add.u32 Fptr, Fptr, ndx;\n"
" add.u32 Sptr, Sptr, ndx;\n"
" add.u32 Uptr, Uptr, ndx;\n"
" ld.global.u32 Fval, [Fptr];\n"
" ld.global.u32 Sval, [Sptr];\n"
" add.u32 Uval, Fval, Sval;\n"
" st.global.u32 [Uptr], Uval;\n"
" ret.uni;\n"
"}\n";
/*****************
* LOCAL VARIABLES *
*****************/
static unsigned NumDevs; // <== total number of NVidia GPUs in the system, usable or not..
/********************
* EXTERNAL VARIABLES *
********************/
extern char cuErrMsg[]; // <== cuInit() is called elsewhere, and on error, fills this with an ASCII error message..
/*****************************
* GLOBAL FUNCTION DEFINITIONS *
*****************************/
void InitCUDA( CDialog *Caller )
{
CUresult Rslt; // <== ..error return from CUDA Driver API function calls..
CUcontext Ctx; // <== ..the CUDA Context..
CUmodule KMod; // <== ..the kernel Module handle (which I never get)..
CUdevice DevHndle; // <== ..the CUDA device handle..
unsigned Var1, Var2; // <== parameters for a call to cuDeviceComputeCapability()..
#ifdef DBG_PTX
CString DBG_MsgBxFmt;
#endif
// FYI: GenError() is a function (defined elsewhere) that issues a longjmp(), so does NOT return..
if ( *cuErrMsg ) GenError( Caller, ERR_NO_CUDA, cuErrMsg ); // <== cuInit() was called elsewhere, and if error, filled cuErrMsg[]..
// CALL_DRVR_API() is a macro defined above - it's just an easy way to call into the Driver API..
CALL_DRVR_API( cuDeviceGetCount, (*cuDeviceGetCount)( (int *)&NumDevs ) ); // <== ..works: no error..
if ( !NumDevs ) GenError( Caller, ERR_NO_DEVS ); // <== ..works: no error..
if ( !( LoadOptVals[ OptNdx_INFO_BUFF ] = new char [ PTXINFLOGSZE ] ) ||
!( LoadOptVals[ OptNdx_ERR_BUFF ] = new char [ PTXERRLOGSZE ] ) ) GenError( Caller, ERR_NO_MEM ); // <== ..works: no error..
// for simplicity, this code only attempts access to the 'first' GPU that it finds (I only have one in my system anyway)..
CALL_DRVR_API( cuDeviceGet, (*cuDeviceGet)( &DevHndle, 0 ) ); // <== ..works: no error..
CALL_DRVR_API( cuDeviceComputeCapability, (*cuDeviceComputeCapability)( (int *)&Var1, (int *)&Var2, DevHndle ) ); // <== ..works: no error..
if ( Var1 < 2 ) GenError( Caller, ERR_OLD_DEV ); // <== CUDA Compute Capability must be at least 2.0 ==> works: no error..
CALL_DRVR_API( cuCtxCreate, (*cuCtxCreate)( &Ctx, CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST, DevHndle ) ); // <== ..works: no error..
#ifdef DBG_PTX
DBG_MsgBxFmt.Format( "About to attempt compilation of the following PTX Source Code:
%s", MiniKern );
Caller->MessageBox( DBG_MsgBxFmt, "FYI");
#endif
CALL_DRVR_API( cuModuleLoadDataEx, (*cuModuleLoadDataEx)( &KMod, (LPCTSTR)MiniKern, Elems( LoadOpts ), LoadOpts, LoadOptVals ) );
// BOOM !! - the above call ALWAYS returns an error code of CUDA_ERROR_NO_BINARY_FOR_GPU !!
// ...[ more code ]..
// Cleanup:
CALL_DRVR_API( cuModuleUnload, (*cuModuleUnload)( KMod ) );
CALL_DRVR_API( cuCtxPopCurrent, (*cuCtxPopCurrent)( &Ctx ) );
FREE_ARRAY_PTR( LoadOptVals[ OptNdx_INFO_BUFF ] )
FREE_ARRAY_PTR( LoadOptVals[ OptNdx_ERR_BUFF ] )
}
This forum’s editor knocked out most of my first attempt at this reply, so I’m having to type it in again (from memory, excuse the pun)… What a pain…
Worth mentioning in the above: the ‘CU_JIT_THREADS_PER_BLOCK’ option for the cuModuleLoadDataEx() call is commented out. That’s because when it’s commented back in, the call returns CUDA_ERROR_INVALID_VALUE instead of the usual CUDA_ERROR_NO_BINARY_FOR_GPU. Since a value of 1024 threads per block is fairly standard in CUDA, this might be a clue as to what’s going on, but I can’t make heads or tails of it, other than to suggest the possibility that cuModuleLoadDataEx() is expecting an array of 64-bit pointers - is that even possible?
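To make that concrete, this is the sort of thing I mean, boiled down to just the option packing (a sketch typed from memory, reusing the Elems() macro and the dynlink setup from above; the array names are made up, and the (size_t) widening is only my guess at what a 64-bit expectation would look like, not something I’ve verified):
// Sketch only - NOT a paste from my program. Scalar JIT option values are smuggled
// through the void* slot itself, so if the API really wants pointer-sized values on
// this platform, they'd presumably have to be widened first:
static CUjit_option SketchOpts[] =
{
CU_JIT_MAX_REGISTERS,
CU_JIT_THREADS_PER_BLOCK,
CU_JIT_TARGET
};
static void *SketchVals[ Elems( SketchOpts ) ] =
{
(void *)(size_t)32, // <== max registers per thread, cast into the pointer slot..
(void *)(size_t)1024, // <== threads per block to target..
(void *)(size_t)CU_TARGET_COMPUTE_21 // <== compilation target..
};
// ..and then the same call as before, just with these arrays:
// CALL_DRVR_API( cuModuleLoadDataEx, (*cuModuleLoadDataEx)( &KMod, MiniKern, Elems( SketchOpts ), SketchOpts, SketchVals ) );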
The driver I’m linking to is:
Filename: nvcuda32.dll
File description: NVIDIA CUDA Driver, Version 307.21
Product Name: NVIDIA CUDA 5.0.1 driver
File version: 8.17.13.721
Product version: 8.17.13.0721
File size: 7,697,768 bytes (7.34 MB)
which would seem to strongly imply that it’s using 32-bit pointers…
Another fairly remote possibility is that cuModuleLoadDataEx() expects ‘pinned’ memory for all of its parameters. But I’ve already tried passing the kernel image in from ‘pinned’ memory, and it didn’t change a thing, so…
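For the record, that test amounted to something like the following (a sketch from memory, not a paste from my code; cuMemAllocHost()/cuMemFreeHost() being the obvious way to get a page-locked buffer, and this assumes the dynlink function pointers and CALL_DRVR_API() macro shown above):
// Sketch only - copy the PTX into a page-locked buffer before the load call:
void *Pinned = NULL;
size_t Bytes = strlen( MiniKern ) + 1; // <== include the terminating NUL..
CALL_DRVR_API( cuMemAllocHost, (*cuMemAllocHost)( &Pinned, Bytes ) );
memcpy( Pinned, MiniKern, Bytes );
CALL_DRVR_API( cuModuleLoadDataEx, (*cuModuleLoadDataEx)( &KMod, Pinned, Elems( LoadOpts ), LoadOpts, LoadOptVals ) );
CALL_DRVR_API( cuMemFreeHost, (*cuMemFreeHost)( Pinned ) );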
Carl: I’m going to take a good, long, hard look at the source code you provided a link to, but I thought I’d get the ‘let’s see the source code’ part out of the way first…
allanmac: Excellent suggestion, so I tried it. Since the docs say that CU_JIT_TARGET_FROM_CUCONTEXT is the default, I simply commented out the CU_JIT_TARGET parameter (in all three places, rest assured), but I still got the dreaded CUDA_ERROR_NO_BINARY_FOR_GPU…
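Specifically, the variant I tried looked like this (again a sketch typed from memory rather than a paste, so the array names here aren’t what’s in my code):
// Sketch - same load options as above, but with CU_JIT_TARGET (and its value) removed,
// so the JIT target should default to the current context (CU_JIT_TARGET_FROM_CUCONTEXT):
static CUjit_option LoadOptsNoTgt[] =
{
CU_JIT_MAX_REGISTERS,
CU_JIT_INFO_LOG_BUFFER,
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
CU_JIT_ERROR_LOG_BUFFER,
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
CU_JIT_FALLBACK_STRATEGY
};
static void *LoadOptValsNoTgt[ Elems( LoadOptsNoTgt ) ] =
{
(void *)32,
(void *)NULL, // <== filled in with 'new char [ PTXINFLOGSZE ]' at run time..
(void *)PTXINFLOGSZE,
(void *)NULL, // <== filled in with 'new char [ PTXERRLOGSZE ]' at run time..
(void *)PTXERRLOGSZE,
(void *)CU_PREFER_PTX
};
// ..and the call itself is unchanged apart from the arrays:
// CALL_DRVR_API( cuModuleLoadDataEx, (*cuModuleLoadDataEx)( &KMod, MiniKern, Elems( LoadOptsNoTgt ), LoadOptsNoTgt, LoadOptValsNoTgt ) );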
Thanks again to all who’ve responded…