using MATLAB mex function, CudaDevicereset() gives C++ runtime debug library error

I have a mex function to solve a linear systems of equations. I use the google cusp library to do this.
I compile the cu file in Visual studio 2012.
And then I link the files in matlab using mex
Everything works well, until I add at the end of my file CudaDevicereset(), it results in C++ runtime library error. Anyone any idea why this happens and how to solve this?

Does have anything to do in the way I compile and link the files? not using nvmex?

and here is the code

#include "stdlib.h"
#include <fstream>
#include "mex.h"
#include "matrix.h"
#include <thrust/device_vector.h>
#include <cusp/csr_matrix.h>
#include <cusp/hyb_matrix.h>
#include <cusp/monitor.h>
#include <cusp/krylov/bicgstab.h>
#include <cusp/krylov/cg.h>
#include <cusp/krylov/gmres.h>
#include <cusp/krylov/bicg.h>
#include <cusp/precond/smoothed_aggregation.h>
#include <cusp/array1d.h>
#include <cusp/precond/ainv.h>
#include <cusp/precond/diagonal.h>
#include <cusp/print.h>
#include <cusp/array2d.h>
#include <cusp/multiply.h> 
#include <Windows.h>
#include <cusp/precond/ainv.h>
#include "sabrealgorithm.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include "stdafx.h"

//stage 2
//input function A,B,b,G
//solves the systems Ax=Bb CAUTION the A transpose has taken be cause of C being row major and MATLAB column major
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
	//make it possible to use mexprintf
	std::ostringstream sout ;
    std::cout.rdbuf(sout.rdbuf()); //redirect std::cout to out.tx
    std::string word;
    std::cout << word << "  ";  //output to the file out.txt

   	//check incoming data
  if(nrhs < 2 || nrhs > 2) 		
    {mexErrMsgTxt("Wrong number of input arguments.");}
  if(nlhs > 1)
   {mexErrMsgTxt("Too many output arguments.");}
 //retrieve input 
   #define Yin prhs[1]			//measured data
   #define Dpin prhs[0]		//pseudo inverse of D

 cusp::csr_matrix<int,float,cusp::host_memory> Dp=initializematrix(Dpin);
  if( !(mxIsDouble(Yin)) )
    {mexErrMsgTxt("Input is not double");}

  //initialize data and store in the right format  
  	//initialize row and column entries
  double *Ypr ;				//initialize values
  float *Ba;
  //Dpir = mxGetIr(Dpin);
  //Dpjc = mxGetJc(Dpin);
  //Dppr = (double *)mxGetData(Dpin);
  Ypr= (double *)mxGetData(Yin);
  //int i;

  cusp::hyb_matrix<int,float,cusp::device_memory> Dphyb = Dp;
  int i=0;
  cusp::array1d<float, cusp::host_memory> bh(Dp.num_cols,0.0);
  for (i=0;i<(Dp.num_rows);i++)
	 bh[i] = (float)(Ypr[i]);
  cusp::array1d<float, cusp::device_memory> b=bh;
  cusp::array1d<float, cusp::device_memory> alphad(Dp.num_cols,0.0);

       // set stopping criteria:
      //  iteration_limit    = 100
      //  relative_tolerance = 1e-6
      cusp::verbose_monitor<float> monitor(b, 1000, 1e-6);
	  cusp::precond::smoothed_aggregation<int, float, cusp::device_memory> M(Dphyb);
//	  cusp::precond::scaled_bridson_ainv<float, cusp::device_memory> M(Dphyb, 3);
FILETIME filetime,filetime2;

	cusp::krylov::cg(Dphyb, alphad,b , monitor,M);
ULONGLONG time1,time2;
time1 = (((ULONGLONG) filetime.dwHighDateTime) << 32) + filetime.dwLowDateTime;
time2 = (((ULONGLONG) filetime2.dwHighDateTime) << 32) + filetime2.dwLowDateTime;
mexPrintf("ELAPSED TIME IN MS:%d",(int)((time2-time1)/10000));
//  mexPrintf("stage 1 tooks %f seconds or %d clicks",((float)t)/CLOCKS_PER_SEC,t); 
	  //initialize output
	  plhs[0] = mxCreateNumericMatrix(Dp.num_rows, 1, mxSINGLE_CLASS, mxREAL); /* Create the output matrix */
       /* Get the pointer to the data of B */
	  cusp::array1d<float, cusp::host_memory> xans =alphad;
	  thrust::copy(xans.begin(), xans.end(), Ba);
	  mexPrintf("%s", sout.str().c_str()) ;

It is possible that there are some implicit CUDA API calls after the cudaDeviceReset() due to scoping. (I am not sure that would result result in your observed error though.) I didn’t look at the code closely, but it appears that you are depending on the destructors to free CUDA memory (e.g., for Dphyb). You can still accomplish that by scoping all of your code inside an additional set of curly braces ({}) with the cudaDeviceReset() at the very end of the function after the closing brace. That extra scope should force the destructors to run before you reset the device.

Have a look at this post

Problem solved

I used the mexatexit function to call cudadevicereset(). I also set in visual studio cuda Runtime to shared/dynamic library instead of static library based on the following thread