Problem with a sample Matlab and CUDA

Hi :

I have installed matlab and also cuda Matlab plugin, when I run the example of the manual I get this result

#include "mex.h"

#include "cublas.h"

/* sgemm_cu.cu - Gateway function for subroutine sgemm

  C = sgemm_cu(transa,transb,single(alpha),single(beta),single(A),single(B),single(C))

  transa,transb = 0/1 for no transpose/transpose of A,B

  Input arrays must be single precision.

*/

void mexFunction( int nlhs, mxArray *plhs[],

	int nrhs, const mxArray *prhs[])

{

	 cublasStatus status;

	 int M,K,L,N,MM,NN,KK;

	 int Mc,Kc,Lc,Nc,MMc,NNc,KKc;

	 int dims0[2];

	 int ta,tb;

	 float alpha,beta;

	 float *a,*b,*c,*cc;

	 float *ga,*gb,*gc;

	 char transa,transb;

	 cublasStatus retStatus;

	 if (nrhs != 7) {

		 mexErrMsgTxt("sgemm requires 7 input arguments");

	 } else if (nlhs != 1) {

		 mexErrMsgTxt("sgemm requires 1 output argument");

	 }

	 if ( !mxIsSingle(prhs[4]) ||

		  !mxIsSingle(prhs[5]) ||

		  !mxIsSingle(prhs[6]))	{

		  mexErrMsgTxt("Input arrays must be single precision.");

	 }

	 ta = (int) mxGetScalar(prhs[0]);

	 tb = (int) mxGetScalar(prhs[1]);

	 alpha = (float) mxGetScalar(prhs[2]);

	 beta = (float) mxGetScalar(prhs[3]);

	 M = mxGetM(prhs[4]);	/* gets number of rows of A */

	 K = mxGetN(prhs[4]);	/* gets number of columns of A */

	 L = mxGetM(prhs[5]);	/* gets number of rows of B */

	 N = mxGetN(prhs[5]);	/* gets number of columns of B */

	 if (ta == 0) {

		 transa='n';

		 MM=M;

		 KK=K;

	 } else {

		 transa='t';

		 MM=K;

		 KK=M;

	 }

	 if (tb == 0) {

		 transb='n';

		 NN=N;

	 } else {

		 transb='t';

		 NN=L;

	 }

   /*	 printf("transa=%c\n",transa);

	   printf("transb=%c\n",transb);

	   printf("alpha=%f\n",alpha);

	   printf("beta=%f\n",beta);	  */

 /* Left hand side matrix set up */

	   dims0[0]=MM;

	   dims0[1]=NN;

	   plhs[0] = mxCreateNumericArray(2,dims0,mxSINGLE_CLASS,mxREAL);

	   cc = (float*) mxGetData(plhs[0]);

 /* Three single-precision arrays */

	   a = (float*) mxGetData(prhs[4]);

	   b = (float*) mxGetData(prhs[5]);

	   c = (float*) mxGetData(prhs[6]);

 /* STARTUP	 CUBLAS */

	   retStatus = cublasInit();

	  // test for error

	  retStatus = cublasGetError ();

	  if (retStatus != CUBLAS_STATUS_SUCCESS) {

		 printf("CUBLAS: an error occurred in cublasInit\n");

	   }

	  Mc=M+32-M%32;

	  Kc=K+32-K%32;

 /* ALLOCATE SPACE ON THE GPU AND COPY a INTO IT */

	   cublasAlloc (Mc*Kc, sizeof(float), (void**)&ga);

	   // test for error

	   retStatus = cublasGetError ();

	   if (retStatus != CUBLAS_STATUS_SUCCESS) {

		   printf("CUBLAS: an error occurred in cublasAlloc\n");

	   }

	   cudaMemset(ga,0,Mc*Kc*4);				  /* zeros the array ga byte-by-byte */

	   retStatus = cublasSetMatrix (M, K, sizeof(float),

				a, M, (void*)ga, Mc);

	   Lc=L+32-L%32;

	   Nc=N+32-N%32;

 /* SAME FOR B, C */

	   cublasAlloc (Lc*Nc, sizeof(float), (void**)&gb);

	   cudaMemset(gb,0,Lc*Nc*4);

	   retStatus = cublasSetMatrix (L, N, sizeof(float),

				b, L, (void*)gb, Lc);

	   MMc=MM+32-MM%32;

	   NNc=NN+32-NN%32;

	   KKc=KK+32-KK%32;

	   cublasAlloc (MMc*NNc, sizeof(float), (void**)&gc);

	   if (beta != 0.0 ) {

		  cudaMemset(gc,0,MMc*NNc*4);

		  retStatus = cublasSetMatrix (MM, NN, sizeof(float),

					c, MM, (void*)gc, MMc);

	   }

/*	PADDED ARRAYS */

/*	   printf("Op(A) has No. rows = %i\n",MMc);

		 printf("Op(B) has No. cols = %i\n",NNc);

		 printf("Op(A) has No. cols = %i\n",KKc);

		 printf("A has leading dimension = %i\n",Mc);

		 printf("B has leading dimension = %i\n",Lc);

		 printf("C has leading dimension = %i\n",MMc); */

/* READY TO CALL SGEMM */

  (void) cublasSgemm (transa, transb, MMc, NNc, KKc, alpha,

								ga, Mc, gb, Lc, beta, gc, MMc);

  status = cublasGetError();

  if (status != CUBLAS_STATUS_SUCCESS) {

		fprintf (stderr, "!!!! kernel execution error.\n");

  }

/* NOW COPY THE RESULTING gc ON THE GPU TO THE LOCAL c */

	retStatus = cublasGetMatrix (MM, NN, sizeof(float), gc, MMc, cc, MM);

	if (retStatus != CUBLAS_STATUS_SUCCESS) {

		  printf("CUBLAS: an error occurred in cublasGetMatrix\n");

	}

/* FREE UP GPU MEMORY AND SHUTDOWN (OPTIONAL?) */

	  cublasFree (ga);

	  cublasFree (gb);

	  cublasFree (gc);

	  cublasShutdown();

}

and

A=[1.5 2 3;4 4 4];

B=[44 55 66;11 11 11]';

% Let's use some large arrays...

A=randn(4000,2000);

B=randn(2000,4000);

[m n]=size(A);

[mm nn]=size(B);

C=randn(m,nn);

alpha=-1;

beta=0;

disp('Matlab:')

tic

C1d=alpha*A*B + beta*C;

toc

% In single precision, Matlab is twice as fast! (go figure...)

tic

A1=single(A);

B1=single(B);

C1=single(C);

C1s=alpha*A1*B1 + beta*C1;

toc

% The call here is testing out the transposes of the code.

disp('CUDA:')

tic

C2=sgemm_cu(0,1,single(alpha),single(beta),single(A),single(B'),single(C));

toc

% Compare the CUDA results with the Matlab results

min(min(C2-C1s))/min(min(C1s))

max(max(C2-C1s))/max(max(C1s))

and this is a error

??? Undefined function or method 'sgemm_cu' for input arguments of type

'single'.

Error in ==> sgemm_ at 25

C2=sgemm_cu(0,1,single(alpha),single(beta),single(A),single(B'),single(C));