CUBLAS 4 RC uses a v3.2 context Prevents data sharing between CUBLAS & CUDA API

On Windows 7 64-bit, I was trying to use the 4.0 RC driver API with CUBLAS in the same program, and running into problems where:

  • creating a context and then calling cublasInit() returns the “not initialized” error

  • calling cublasInit() first, and then using that context to cuMemAlloc returned “invalid context” error

Following the guidance in the CUDA Reference Manual, I used cuCtxGetApiVersion and found that cublas64_40_10.dll appears to bind a version 3020 (3.2) context to the host thread, while the version returned for a context generated by cuCtxCreate is 3010. I believe this is causes the errors I encountered.

I wrote a small C# program to demonstrate this. The output is below, and source code follows that:

C# source code to generate this output:

using System;

using System.Runtime.InteropServices;

static class Program

{

    static void Main()

    {

        cuInit(0).AssertOk();

int cudaVer;

        cuDriverGetVersion(out cudaVer).AssertOk();

        Console.WriteLine("Loaded CUDA driver API v" + cudaVer);

        CUdevice dev;

        cuDeviceGet(out dev, 0).AssertOk();

        CUcontext myContext;

        cuCtxCreate(out myContext, Scheduling.ScheduleAuto, dev).AssertOk();

        ReportCurrentContext();

        Console.Write("\tTrying to load CUBLAS after creating driver API context...\n\t");

        CublasHandle cublasHandle;

        CublasStatus cbStatus = cublasCreate_v2(out cublasHandle);

        Console.WriteLine(cbStatus.ToString());

        if (cbStatus == CublasStatus.Success)

            cublasDestroy_v2(cublasHandle).AssertOk();

        Console.Write("Destroying driver API context...");

        cuCtxDestroy(myContext).AssertOk();

        Console.WriteLine("done.\n\n");

cublasCreate_v2(out cublasHandle).AssertOk();

        int cublasVer;

        cublasGetVersion_v2(cublasHandle, out cublasVer).AssertOk();

        Console.WriteLine("Loaded CUBLAS v" + cublasVer);

        ReportCurrentContext();

        Console.Write("\tTrying to allocate memory using driver API, after loading CUBLAS...\n\t");

        DevicePtr devPtr;

        CudaResult status = cuMemAlloc(out devPtr, 1024);

        Console.WriteLine(status.ToString());

        if (status == CudaResult.SUCCESS)

            cuMemFree(devPtr).AssertOk();

        Console.Write("Destroying CUBLAS...");

        cublasDestroy_v2(cublasHandle).AssertOk();

        Console.WriteLine("done.\n\n");

Console.ReadLine();

    }

static void ReportCurrentContext()

    {

        CUcontext currentContext;

        cuCtxGetCurrent(out currentContext).AssertOk();

        uint contextVer;

        cuCtxGetApiVersion(currentContext, out contextVer).AssertOk();

Console.WriteLine("Host thread bound to context {0}, which has API version {1}",

            currentContext, contextVer);

    }

static void AssertOk(this CudaResult r) { if (r != CudaResult.SUCCESS) throw new Exception("CUDA ERROR: " + r); }

    static void AssertOk(this CublasStatus r) { if (r != CublasStatus.Success) throw new Exception("CUBLAS ERROR: " + r); }

const string cudaDLL = "nvcuda.dll";

    [DllImport(cudaDLL)] static extern CudaResult cuInit(uint Flags);

    [DllImport(cudaDLL)] static extern CudaResult cuDriverGetVersion(out int driverVersion);

    [DllImport(cudaDLL)] static extern CudaResult cuDeviceGet(out CUdevice device, int ordinal);

    [DllImport(cudaDLL)] static extern CudaResult cuCtxCreate(out CUcontext outCtx, Scheduling flags, CUdevice dev);

    [DllImport(cudaDLL)] static extern CudaResult cuCtxDestroy(CUcontext ctx);

    [DllImport(cudaDLL)] static extern CudaResult cuCtxGetApiVersion(CUcontext ctx, out uint version);

    [DllImport(cudaDLL)] static extern CudaResult cuCtxGetCurrent(out CUcontext outCtx);

    [DllImport(cudaDLL)] static extern CudaResult cuMemAlloc(out DevicePtr devptr, SizeT bytesize);

    [DllImport(cudaDLL)] static extern CudaResult cuMemFree(DevicePtr dptr);

const string cublasDLL = "cublas64_40_10.dll";

    [DllImport(cublasDLL)] static extern CublasStatus cublasCreate_v2(out CublasHandle handle);

    [DllImport(cublasDLL)] static extern CublasStatus cublasDestroy_v2(CublasHandle handle);

    [DllImport(cublasDLL)] static extern CublasStatus cublasGetVersion_v2(CublasHandle handle, out int version);

}

struct CUdevice { readonly IntPtr pointer; }

struct CublasHandle { readonly IntPtr pointer;  }

struct DevicePtr { readonly IntPtr pointer; }

struct CUcontext 

{ 

    readonly IntPtr pointer;

    public override string ToString() { return string.Format("0x{0:x16}", pointer.ToInt64()); }

}

struct SizeT

{

    static readonly bool is64Bit = Environment.Is64BitProcess;

IntPtr size; // Not a pointer; size determined at JIT-compile-time

    public static implicit operator SizeT(int value) { return new SizeT(value); }

public SizeT(long val)

    {

        if (val < 0)

            throw new ArgumentOutOfRangeException();

        if (!is64Bit && val > int.MaxValue)

            throw new OverflowException();

        size = new IntPtr(val);

    }

}

enum Scheduling

{

    ScheduleAuto = 0,

    ScheduleSpin = 1,

    ScheduleYield = 2,

    ScheduleBlockingSync = 4,

}

enum CublasStatus

{

    Success = 0x00000000,

    ErrorNotInitialized = 0x00000001,

    ErrorAllocFailed = 0x00000003,

    ErrorInvalidValue = 0x00000007,

    ErrorArchMismatch = 0x00000008,

    ErrorMappingError = 0x0000000B,

    ErrorExecutionFailed = 0x0000000D,

    ErrorInternalError = 0x0000000E

}

enum CudaResult

{

    SUCCESS = 0,

    ERROR_INVALID_VALUE = 1,

    ERROR_OUT_OF_MEMORY = 2,

    ERROR_NOT_INITIALIZED = 3,

    ERROR_DEINITIALIZED = 4,

    ERROR_NO_DEVICE = 100,

    ERROR_INVALID_DEVICE = 101,

    ERROR_INVALID_IMAGE = 200,

    ERROR_INVALID_CONTEXT = 201,

    ERROR_CONTEXT_ALREADY_CURRENT = 202,

    ERROR_MAP_FAILED = 205,

    ERROR_UNMAP_FAILED = 206,

    ERROR_ARRAY_IS_MAPPED = 207,

    ERROR_ALREADY_MAPPED = 208,

    ERROR_NO_BINARY_FOR_GPU = 209,

    ERROR_ALREADY_ACQUIRED = 210,

    ERROR_NOT_MAPPED = 211,

    ERROR_NOT_MAPPED_AS_ARRAY = 212,

    ERROR_NOT_MAPPED_AS_POINTER = 213,

    ERROR_ECC_UNCORRECTABLE = 214,

    ERROR_UNSUPPORTED_LIMIT = 215,

    ERROR_INVALID_SOURCE = 300,

    ERROR_FILE_NOT_FOUND = 301,

    ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,

    ERROR_SHARED_OBJECT_INIT_FAILED = 303,

    ERROR_OPERATING_SYSTEM = 304,

    ERROR_INVALID_HANDLE = 400,

    ERROR_NOT_FOUND = 500,

    ERROR_NOT_READY = 600,

    ERROR_LAUNCH_FAILED = 700,

    ERROR_LAUNCH_OUT_OF_RESOURCES = 701,

    ERROR_LAUNCH_TIMEOUT = 702,

    ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703,

    ERROR_UNKNOWN = 999

}

Is this expected behavior? Is there some workaround to allow using the same context (and memory) in both the driver API and CUBLAS?

are you just loading the DLL and calling cuCtxCreate directly? because that creates a 3010 context, not a 3020 context. you have to call a different function–cuda.h is heavily versioned and redirects calls like cuCtxCreate to cuCtxCreate_v2 depending on your CUDA version.

Ahhhh, yes, I believe that is the issue. I was careful to add the _v2 to the CUBLAS calls but completely spaced for the CUDA calls. Thank you!

there may be _v3 calls in 4.0, I don’t remember…