cublasSgemm fails on first run, works on subsequent runs

*** Edit: Nevermind, found the problem. I accidentally tried to call cudaFree on a host pointer (indexes) without checking for errors and the error propagated ***

I’m having a strange problem calling cudaSgemm inside a mex (Matlab) failing, although it doesn’t seem like Matlab is the culprit here.

On a the first call to cudaSgemm after memory allocation the function fails with error CUBLAS_STATUS_EXECUTION_FAILED, all subsequent calls work fine. Following is the current code for the function (the logic is due to the allocation being stored in global pointers so I need to reallocate only on buffer size change).

The last line in the code is the problematic one. If I duplicate it or call the function again (with no allocation happening) everything runs fine. It’s just the first call to cudaSgemm that fails.

It doesn’t seem to be cudaSgemm by itself as I have other code that works, so it seems that I’m missing something here.

Will be grateful for any ideas

// C is stuck on the end of F

float * gFgC = NULL;

// Weights

float * gFW = NULL;

// Intermediate matrix for F*C'

float * gFC = NULL;

float * gSqDistances = NULL;

int * gIndexes = NULL;

int gn = 0;

int gd = 0;

int gbeta = 0;

cublasHandle_t handle = NULL;

int ComputeMtimes(const float * F, const float * FW, const float * C,

        int * indexes, float * sqDistances,

        int n, int d, int beta)


	if (n != gn || d != gd || beta != gbeta)


		if (gFgC) cudaFree(gFgC);

		if (gFC) cudaFree(gFC);

		CHKERR(cudaMalloc((void **)&gFgC, (n + beta)*d*sizeof(float)));

   		CHKERR(cudaMalloc((void **)&gFC, n*beta*sizeof(float)));


	if (n != gn)


		if (gFW) cudaFree(gFW);

		if (gSqDistances) cudaFree(gSqDistances);

		if (indexes) cudaFree(indexes);

		CHKERR(cudaMalloc((void **)&gFW, n*sizeof(float)));

		CHKERR(cudaMalloc((void **)&gSqDistances, n*sizeof(float)));

		CHKERR(cudaMalloc((void **)&gIndexes, n*sizeof(float)));


	gn = n;

	gd = d;

	gbeta = beta;

	if (!handle)


	CHKERR(cudaMemcpy(gFgC, F, n*d*sizeof(float), cudaMemcpyHostToDevice));

	CHKERR(cudaMemcpy(gFW, FW, n*sizeof(float), cudaMemcpyHostToDevice));

	CHKERR(cudaMemcpy(gFgC + n*d, C, beta*d*sizeof(float), cudaMemcpyHostToDevice));

	float a = 2.f;

	float b = 0.f

	CHKERR(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, n, beta, d, &a, gFgC, d, gFgC + n*d, d, &b, gFC, n));

	return 0;