Help me Cuda on Matlab

i want to add 2 matrix by cuda programming in matlab mex file, and my program is:


#include “mex.h”

void convert_double2float( double *input_double, float *output_float,int Ntot)
{
int i;
for (i = 0; i < Ntot; i++)
{
output_float[i] = (float) input_double[i];
}
}

void convert_float2double( float *input_float, double *output_double,int Ntot)
{
int i;
for (i = 0; i < Ntot; i++)
{
output_double[i] = (double) input_float[i];
}
}

global void VecAdd(float* A, float* B, float* C)
{
int i = threadIdx.x;
C[i] = A[i] + B[i];
}

void mexFunction( int nlhs, mxArray *plhs, int nrhs, const mxArray *prhs)
{
float *A,*B,*C;
double *input_A,*input_B, *output_C;
int M,N;
M = mxGetM(prhs[0]);
N = mxGetN(prhs[0]);
mexPrintf("\nm=%d , n=%d\n",M,N);

/* Retrieve the input data */
input_A = (double *) mxGetData(prhs[0]);
input_B = (double *) mxGetData(prhs[1]);

A  = (float*) mxMalloc(sizeof(float)*N*M);
B  = (float*) mxMalloc(sizeof(float)*N*M);
C  = (float*) mxMalloc(sizeof(float)*N*M);

convert_double2float(input_A, A,  N*M);
convert_double2float(input_B, B,  N*M);

int block_size=16;
dim3 dimBlock(block_size,block_size);
VecAdd<<<1, dimBlock>>>(A, B, C);

/* Create an mxArray for the output data */
plhs[0] = mxCreateDoubleMatrix(M, N, mxREAL);
/* Create a pointer to the output data */
output_C = mxGetPr(plhs[0]);
convert_float2double(C,output_C, N*M);

mxFree(A);
mxFree(B);
mxFree©;
return;

}


in execute:

nvmex -f nvmexopts.bat add.cu -IC:\cuda\include -LC:\cuda\lib -lcufft -lcudart
add.cu
tmpxft_00000a24_00000000-3_add.cudafe1.gpu
tmpxft_00000a24_00000000-8_add.cudafe2.gpu
tmpxft_00000a24_00000000-3_add.cudafe1.cpp

c=add(a,B);


:verymad: but it not correct answer, please help me!!!

:wacko:

You are not uploading your data to a GPU at all, how do you expect it will do anything with your data stored ON A HOST?

See the example vectorAdd from the SDK::

// Copy vectors from host memory to device memory

	cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );

	cutilSafeCall( cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice) );

	// Invoke kernel

	int threadsPerBlock = 256;

	int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

	VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

	cutilCheckMsg("kernel launch failure");

#ifdef _DEBUG

	cutilSafeCall( cudaThreadSynchronize() );

#endif

	// Copy result from device memory to host memory

	// h_C contains the result in host memory

	cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );