On Windows 7 64-bit, I was trying to use the 4.0 RC driver API with CUBLAS in the same program, and running into problems where:
-
creating a context and then calling cublasInit() returns the “not initialized” error
-
calling cublasInit() first, and then using that context to cuMemAlloc returned “invalid context” error
Following the guidance in the CUDA Reference Manual, I used cuCtxGetApiVersion and found that cublas64_40_10.dll appears to bind a version 3020 (3.2) context to the host thread, while the version returned for a context generated by cuCtxCreate is 3010. I believe this is causes the errors I encountered.
I wrote a small C# program to demonstrate this. The output is below, and source code follows that:
C# source code to generate this output:
using System;
using System.Runtime.InteropServices;
static class Program
{
static void Main()
{
cuInit(0).AssertOk();
int cudaVer;
cuDriverGetVersion(out cudaVer).AssertOk();
Console.WriteLine("Loaded CUDA driver API v" + cudaVer);
CUdevice dev;
cuDeviceGet(out dev, 0).AssertOk();
CUcontext myContext;
cuCtxCreate(out myContext, Scheduling.ScheduleAuto, dev).AssertOk();
ReportCurrentContext();
Console.Write("\tTrying to load CUBLAS after creating driver API context...\n\t");
CublasHandle cublasHandle;
CublasStatus cbStatus = cublasCreate_v2(out cublasHandle);
Console.WriteLine(cbStatus.ToString());
if (cbStatus == CublasStatus.Success)
cublasDestroy_v2(cublasHandle).AssertOk();
Console.Write("Destroying driver API context...");
cuCtxDestroy(myContext).AssertOk();
Console.WriteLine("done.\n\n");
cublasCreate_v2(out cublasHandle).AssertOk();
int cublasVer;
cublasGetVersion_v2(cublasHandle, out cublasVer).AssertOk();
Console.WriteLine("Loaded CUBLAS v" + cublasVer);
ReportCurrentContext();
Console.Write("\tTrying to allocate memory using driver API, after loading CUBLAS...\n\t");
DevicePtr devPtr;
CudaResult status = cuMemAlloc(out devPtr, 1024);
Console.WriteLine(status.ToString());
if (status == CudaResult.SUCCESS)
cuMemFree(devPtr).AssertOk();
Console.Write("Destroying CUBLAS...");
cublasDestroy_v2(cublasHandle).AssertOk();
Console.WriteLine("done.\n\n");
Console.ReadLine();
}
static void ReportCurrentContext()
{
CUcontext currentContext;
cuCtxGetCurrent(out currentContext).AssertOk();
uint contextVer;
cuCtxGetApiVersion(currentContext, out contextVer).AssertOk();
Console.WriteLine("Host thread bound to context {0}, which has API version {1}",
currentContext, contextVer);
}
static void AssertOk(this CudaResult r) { if (r != CudaResult.SUCCESS) throw new Exception("CUDA ERROR: " + r); }
static void AssertOk(this CublasStatus r) { if (r != CublasStatus.Success) throw new Exception("CUBLAS ERROR: " + r); }
const string cudaDLL = "nvcuda.dll";
[DllImport(cudaDLL)] static extern CudaResult cuInit(uint Flags);
[DllImport(cudaDLL)] static extern CudaResult cuDriverGetVersion(out int driverVersion);
[DllImport(cudaDLL)] static extern CudaResult cuDeviceGet(out CUdevice device, int ordinal);
[DllImport(cudaDLL)] static extern CudaResult cuCtxCreate(out CUcontext outCtx, Scheduling flags, CUdevice dev);
[DllImport(cudaDLL)] static extern CudaResult cuCtxDestroy(CUcontext ctx);
[DllImport(cudaDLL)] static extern CudaResult cuCtxGetApiVersion(CUcontext ctx, out uint version);
[DllImport(cudaDLL)] static extern CudaResult cuCtxGetCurrent(out CUcontext outCtx);
[DllImport(cudaDLL)] static extern CudaResult cuMemAlloc(out DevicePtr devptr, SizeT bytesize);
[DllImport(cudaDLL)] static extern CudaResult cuMemFree(DevicePtr dptr);
const string cublasDLL = "cublas64_40_10.dll";
[DllImport(cublasDLL)] static extern CublasStatus cublasCreate_v2(out CublasHandle handle);
[DllImport(cublasDLL)] static extern CublasStatus cublasDestroy_v2(CublasHandle handle);
[DllImport(cublasDLL)] static extern CublasStatus cublasGetVersion_v2(CublasHandle handle, out int version);
}
struct CUdevice { readonly IntPtr pointer; }
struct CublasHandle { readonly IntPtr pointer; }
struct DevicePtr { readonly IntPtr pointer; }
struct CUcontext
{
readonly IntPtr pointer;
public override string ToString() { return string.Format("0x{0:x16}", pointer.ToInt64()); }
}
struct SizeT
{
static readonly bool is64Bit = Environment.Is64BitProcess;
IntPtr size; // Not a pointer; size determined at JIT-compile-time
public static implicit operator SizeT(int value) { return new SizeT(value); }
public SizeT(long val)
{
if (val < 0)
throw new ArgumentOutOfRangeException();
if (!is64Bit && val > int.MaxValue)
throw new OverflowException();
size = new IntPtr(val);
}
}
enum Scheduling
{
ScheduleAuto = 0,
ScheduleSpin = 1,
ScheduleYield = 2,
ScheduleBlockingSync = 4,
}
enum CublasStatus
{
Success = 0x00000000,
ErrorNotInitialized = 0x00000001,
ErrorAllocFailed = 0x00000003,
ErrorInvalidValue = 0x00000007,
ErrorArchMismatch = 0x00000008,
ErrorMappingError = 0x0000000B,
ErrorExecutionFailed = 0x0000000D,
ErrorInternalError = 0x0000000E
}
enum CudaResult
{
SUCCESS = 0,
ERROR_INVALID_VALUE = 1,
ERROR_OUT_OF_MEMORY = 2,
ERROR_NOT_INITIALIZED = 3,
ERROR_DEINITIALIZED = 4,
ERROR_NO_DEVICE = 100,
ERROR_INVALID_DEVICE = 101,
ERROR_INVALID_IMAGE = 200,
ERROR_INVALID_CONTEXT = 201,
ERROR_CONTEXT_ALREADY_CURRENT = 202,
ERROR_MAP_FAILED = 205,
ERROR_UNMAP_FAILED = 206,
ERROR_ARRAY_IS_MAPPED = 207,
ERROR_ALREADY_MAPPED = 208,
ERROR_NO_BINARY_FOR_GPU = 209,
ERROR_ALREADY_ACQUIRED = 210,
ERROR_NOT_MAPPED = 211,
ERROR_NOT_MAPPED_AS_ARRAY = 212,
ERROR_NOT_MAPPED_AS_POINTER = 213,
ERROR_ECC_UNCORRECTABLE = 214,
ERROR_UNSUPPORTED_LIMIT = 215,
ERROR_INVALID_SOURCE = 300,
ERROR_FILE_NOT_FOUND = 301,
ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
ERROR_SHARED_OBJECT_INIT_FAILED = 303,
ERROR_OPERATING_SYSTEM = 304,
ERROR_INVALID_HANDLE = 400,
ERROR_NOT_FOUND = 500,
ERROR_NOT_READY = 600,
ERROR_LAUNCH_FAILED = 700,
ERROR_LAUNCH_OUT_OF_RESOURCES = 701,
ERROR_LAUNCH_TIMEOUT = 702,
ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703,
ERROR_UNKNOWN = 999
}
Is this expected behavior? Is there some workaround to allow using the same context (and memory) in both the driver API and CUBLAS?