using the visual profiler for matlab mex files

I have a MEX function that solves a linear system of equations. I use the Google CUSP library to do this.
I compile the .cu file in Visual Studio 2012.
Then I link the resulting files in MATLAB using mex.

As suggested by others, I tried to profile the application in the following way:
Write your Matlab .m file and add exit at its end.
Launch the NVIDIA Visual Profiler. File -> New Session.
File: add the full path of the Matlab executable file, for example C:\Program Files\MATLAB\R2012b \bin\win64\MATLAB.exe .
Working directory: add the full path of the Matlab .m file.
Arguments: -nojvm -nosplash -r file_name_without_m_extension.

But the timeline stays empty. I wonder whether it is possible to profile a MEX file that is compiled and linked the way I do it (not using nvmex)?

I tested the CUDA samples with the profiler and they worked fine.

#include "stdlib.h"
#include <exception>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include "mex.h"
#include "matrix.h"
#include <thrust/device_vector.h>
#include <cusp/csr_matrix.h>
#include <cusp/hyb_matrix.h>
#include <cusp/monitor.h>
#include <cusp/krylov/bicgstab.h>
#include <cusp/krylov/cg.h>
#include <cusp/krylov/gmres.h>
#include <cusp/krylov/bicg.h>
#include <cusp/precond/smoothed_aggregation.h>
#include <cusp/array1d.h>
#include <cusp/precond/ainv.h>
#include <cusp/precond/diagonal.h>
#include <cusp/print.h>
#include <cusp/array2d.h>
#include <cusp/multiply.h> 
#include <Windows.h>
#include <cusp/precond/ainv.h>
#include "sabrealgorithm.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include "stdafx.h"

//stage 2
//input function A,B,b,G
//solves the systems Ax=Bb CAUTION the A transpose has taken be cause of C being row major and MATLAB column major
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
	//make it possible to use mexprintf
	std::ostringstream sout ;
    std::cout.rdbuf(sout.rdbuf()); //redirect std::cout to out.tx
    std::string word;
    std::cout << word << "  ";  //output to the file out.txt

   	//check incoming data
  if(nrhs < 2 || nrhs > 2) 		
    {mexErrMsgTxt("Wrong number of input arguments.");}
  if(nlhs > 1)
   {mexErrMsgTxt("Too many output arguments.");}
 //retrieve input 
   #define Yin prhs[1]			//measured data
   #define Dpin prhs[0]		//pseudo inverse of D

 cusp::csr_matrix<int,float,cusp::host_memory> Dp=initializematrix(Dpin);
  if( !(mxIsDouble(Yin)) )
    {mexErrMsgTxt("Input is not double");}

  //initialize data and store in the right format  
  	//initialize row and column entries
  double *Ypr ;				//initialize values
  float *Ba;
  //Dpir = mxGetIr(Dpin);
  //Dpjc = mxGetJc(Dpin);
  //Dppr = (double *)mxGetData(Dpin);
  Ypr= (double *)mxGetData(Yin);
  //int i;

  //cusp::csr_matrix<int,float,cusp::host_memory> Dp(mxGetN(Dpin),mxGetM(Dpin),mxGetNzmax(Dpin));
  /*for (i=0;i<(mxGetNzmax(Dpin));i++)
	  Dp.column_indices[i] = Dpir[i];
	  Dp.values[i] = (float)Dppr[i];
  for (i=0;i<(mxGetN(Dpin));i++)
	  Dp.row_offsets[i] = Dpjc[i];
  cusp::hyb_matrix<int,float,cusp::device_memory> Dphyb = Dp;
  int i=0;
  cusp::array1d<float, cusp::host_memory> bh(Dp.num_cols,0.0);
  for (i=0;i<(Dp.num_rows);i++)
	 bh[i] = (float)(Ypr[i]);
  cusp::array1d<float, cusp::device_memory> b=bh;
  cusp::array1d<float, cusp::device_memory> alphad(Dp.num_cols,0.0);

       // set stopping criteria:
      //  iteration_limit    = 100
      //  relative_tolerance = 1e-6
      cusp::verbose_monitor<float> monitor(b, 1000, 1e-6);
	  cusp::precond::smoothed_aggregation<int, float, cusp::device_memory> M(Dphyb);
//	  cusp::precond::scaled_bridson_ainv<float, cusp::device_memory> M(Dphyb, 3);
FILETIME filetime,filetime2;

	cusp::krylov::cg(Dphyb, alphad,b , monitor,M);
ULONGLONG time1,time2;
time1 = (((ULONGLONG) filetime.dwHighDateTime) << 32) + filetime.dwLowDateTime;
time2 = (((ULONGLONG) filetime2.dwHighDateTime) << 32) + filetime2.dwLowDateTime;
mexPrintf("ELAPSED TIME IN MS:%d",(int)((time2-time1)/10000));
//  mexPrintf("stage 1 tooks %f seconds or %d clicks",((float)t)/CLOCKS_PER_SEC,t); 
	  //initialize output
	  plhs[0] = mxCreateNumericMatrix(Dp.num_rows, 1, mxSINGLE_CLASS, mxREAL); /* Create the output matrix */
       /* Get the pointer to the data of B */
	  cusp::array1d<float, cusp::host_memory> xans =alphad;
	  thrust::copy(xans.begin(), xans.end(), Ba);
	  mexPrintf("%s", sout.str().c_str()) ;

Which version of Matlab do you have?

Matlab has some startup bugs (2007b, 2009b, 2010a), see

I was able to successfully use the above procedure you quoted with Matlab 2008b, but not with Matlab 2010a. I have done my tests using a code of my own and not yours.

I use MATLAB 2013b. But I see MATLAB running when I start the profiler, so that shouldn’t be the problem, right?

The problem is solved. I think I had the profiler assigned to matlab/bin/x64\matlab.exe instead of just matlab/bin/matlab.exe. Also, I noticed that cudaDeviceReset() is required for the profiler to work.

Happy that you solved the problem.

Let me point out that in your original post you were already referring to C:\Program Files\MATLAB\R2012b \bin\win64\MATLAB.exe, and in the code you already had cudaDeviceReset() :-)