cublas functions not working when called from .cubin files compiled with "nvcc -G..."

I have a kernel:

#include <cublas_v2.h>
#include <math_constants.h>
#include <stdio.h>

extern "C" {

// Scalar coefficients stored in device memory so that their addresses can be
// handed to cuBLAS routines that are called from device code.
__device__ float ONE = 1.0f;
__device__ float M_ONE = -1.0f;
__device__ float ZERO = 0.0f;

/**
 * Computes output = W * input with one device-side cublasSgemv call.
 *
 * Only the thread with global index 0 does any work; every other thread
 * returns immediately, so the launch configuration does not affect the result.
 *
 * W      - matrix passed to cuBLAS as o rows by i columns with leading
 *          dimension o (cuBLAS uses column-major storage)
 * input  - vector of i elements, stride 1
 * i      - number of columns of W / length of input
 * output - vector of o elements, stride 1; overwritten (beta is ZERO)
 * o      - number of rows of W / length of output
 *
 * Failures reported by cuBLAS are printed with device-side printf because
 * the kernel has no other channel to report them.
 */
__global__ void kernel(float *W, float *input, int i, float *output, int o) {
	int idx = blockIdx.x*blockDim.x+threadIdx.x;

	// A single thread issues the cuBLAS call.
	if(idx != 0) {
		return;
	}

	// The handle must be created before it is passed to any cuBLAS routine.
	cublasHandle_t cnpHandle;
	cublasStatus_t status = cublasCreate(&cnpHandle);
	if(status != CUBLAS_STATUS_SUCCESS) {
		printf("cublasCreate failed with status %d\n", (int)status);
		return;
	}

	status = cublasSgemv(cnpHandle, CUBLAS_OP_N, o, i, &ONE, W, o, input, 1, &ZERO, output, 1);
	if(status != CUBLAS_STATUS_SUCCESS) {
		printf("cublasSgemv failed with status %d\n", (int)status);
	}

	cublasDestroy(cnpHandle);
}

}



And I am compiling it to a .cubin using the following command:

nvcc -arch=sm_52 -dc -o minimalKernel.cubin -cubin -dlink -lcublas_device -lcudadevrt

When I call the cubin from my host code, the cuBLAS routine only runs if I do NOT pass the -G flag when compiling. I have now seen this behaviour with two different .cubin files: whenever “-G” is specified, cuBLAS simply does not run. Is this documented somewhere? Or is it perhaps a bug?

Any help will be greatly appreciated.

Setup: GTX 980, compiling with CUDA 7.0.

here is my host code:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>

extern "C" {
	// Declaration only: the kernel itself is looked up by name in the loaded
	// cubin module (see cuModuleGetFunction in main).
	__global__ void kernel(float *W, float *input, int i, float *output, int o);
}

// Aborts with a diagnostic when a CUDA driver API call fails.
static void checkDriver(CUresult result, const char *what)
{
	if (result != CUDA_SUCCESS) {
		const char *name = NULL;
		cuGetErrorName(result, &name);
		fprintf(stderr, "%s failed: %s (%d)\n", what, name ? name : "unknown error", (int)result);
		exit(EXIT_FAILURE);
	}
}

// Aborts with a diagnostic when a CUDA runtime API call fails.
static void checkRuntime(cudaError_t result, const char *what)
{
	if (result != cudaSuccess) {
		fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(result));
		exit(EXIT_FAILURE);
	}
}

/**
 * Loads the cubin, launches "kernel" on a 1x2 matrix and a 2-element input
 * vector, copies the single result value back and prints it as "out: <value>".
 *
 * Returns 0 on success; any failing CUDA call terminates the process with
 * EXIT_FAILURE after printing which call failed.
 */
int main(int argc, char* argv[])
{
	(void)argc;
	(void)argv;

	// The driver API must be initialised before any other cu* call.
	checkDriver(cuInit(0), "cuInit");

	CUdevice dev;
	CUcontext pctx;
	checkDriver(cuDeviceGet(&dev, 0), "cuDeviceGet");
	checkDriver(cuCtxCreate(&pctx, 0, dev), "cuCtxCreate");

	CUmodule module;
	checkDriver(cuModuleLoad(&module, "path to/minimalKernel.cubin"), "cuModuleLoad");

	CUfunction function;
	checkDriver(cuModuleGetFunction(&function, module, "kernel"), "cuModuleGetFunction");

	// Host-side data: W is passed as o x i = 1 x 2, input has i = 2 elements.
	float W[2] = { 0.1f, 0.1f };
	float input[2] = { 0.1f, 0.1f };
	float out[1] = { 0.0f };

	int i = 2;
	int o = 1;

	float *d_W = NULL;
	float *d_input = NULL;
	float *d_out = NULL;
	checkRuntime(cudaMalloc((void**)&d_W, 2*sizeof(float)), "cudaMalloc(d_W)");
	checkRuntime(cudaMalloc((void**)&d_input, 2*sizeof(float)), "cudaMalloc(d_input)");
	checkRuntime(cudaMalloc((void**)&d_out, sizeof(float)), "cudaMalloc(d_out)");
	checkRuntime(cudaMemcpy(d_W, W, 2*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(d_W)");
	checkRuntime(cudaMemcpy(d_input, input, 2*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(d_input)");
	checkRuntime(cudaMemcpy(d_out, out, sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(d_out)");

	// The kernel comes from a module loaded at run time, so it is launched
	// through cuLaunchKernel: one block of two threads, no dynamic shared
	// memory, default stream.
	void *kernelParams[] = { &d_W, &d_input, &i, &d_out, &o };
	checkDriver(cuLaunchKernel(function, 1, 1, 1, 2, 1, 1, 0, NULL, kernelParams, NULL), "cuLaunchKernel");

	// The launch is asynchronous; synchronising here reports errors raised
	// while the kernel was executing instead of silently losing them.
	checkDriver(cuCtxSynchronize(), "cuCtxSynchronize");

	checkRuntime(cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(out)");
	printf("out: %g\n", out[0]);

	// Release everything that was acquired above.
	checkRuntime(cudaFree(d_W), "cudaFree(d_W)");
	checkRuntime(cudaFree(d_input), "cudaFree(d_input)");
	checkRuntime(cudaFree(d_out), "cudaFree(d_out)");
	checkDriver(cuModuleUnload(module), "cuModuleUnload");
	checkDriver(cuCtxDestroy(pctx), "cuCtxDestroy");

	return 0;
}



I should add: the printed output should be “out: 0.02”, but when the cubin is compiled with “nvcc -G…” it is “out: 0” (because cuBLAS does not run).