cublasSgemv() returning unexpected values

Hey all,

So I’m trying to do a simple matrix-vector multiplication using the CUBLAS function cublasSgemv, but I am not getting the values back that I am expecting. I am using a 5x5 matrix with all values initialized to 1 (for testing purposes), and I am multiplying this matrix by a 5-element vector initialized to {1,2,3,4,5}. My results vector comes back as 55, 60, 65, 70, 75. Those numbers don’t seem right to me, but maybe I am just misunderstanding the algorithm the function is carrying out. Here is my code:

   cudaSetDevice( cutGetMaxGflopsDeviceId() );

cublasInit();	

float *h_Matrix, *d_Matrix, *h_Multiplier, *d_Multiplier, *h_Results, *d_Results;	
float elapsedTime, selapsedTime = 0;
size_t matrix_Size, multiplier_Size;

matrix_Size = ROWS * COLS * sizeof(float);
multiplier_Size = ROWS * sizeof(float);

//allocate memory for the matrices
h_Matrix =(float*) malloc(matrix_Size);
cublasAlloc(ROWS * COLS, sizeof(float), (void**)&d_Matrix);

//allocate memory for the multipliers
h_Multiplier =(float*) malloc(multiplier_Size);
cublasAlloc(ROWS, sizeof(float), (void**)&d_Multiplier);

h_Results =(float*) malloc(multiplier_Size);
cublasAlloc(ROWS , sizeof(float), (void**)&d_Results);

//set values in Matrix to 1	
for(int i = 0; i < TOTAL; i++)
	 h_Matrix[i]= (float)1;	

for(int f = 0; f < ROWS; f++)
{  
	h_Multiplier[f] = (float)f + 1;
	h_Results[f] = (float)1;
}

cublasSetMatrix(ROWS,COLS,sizeof(float),h_Matrix,1,d_Matrix,1);

cublasSetVector(ROWS,sizeof(float),h_Multiplier,1,d_Multiplier,1);

cublasSetVector(ROWS,sizeof(float),h_Results,1,d_Results,1);

cublasSgemv('n', ROWS, COLS, 1, d_Matrix, ROWS, d_Multiplier,1,0,d_Results , 1);
cudaThreadSynchronize();


h_Results =(float*) malloc(multiplier_Size);
cublasGetVector(ROWS, sizeof(float), d_Results, 1, h_Results, 1);

cublasFree(d_Matrix);
cublasFree(d_Multiplier);
cublasFree(d_Results);
cublasShutdown();
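
For comparison, here is a quick CPU-only check of what I think the result should be (just my own reference sketch, not using CUBLAS; it assumes column-major storage like CUBLAS uses). For an all-ones 5x5 matrix and x = {1,2,3,4,5} it prints 15 for every element, which is what I expected instead of 55..75:

// Plain CPU reference for y = A*x with a column-major 5x5 matrix of ones
// and x = {1,2,3,4,5}; every y[i] should come out to 1+2+3+4+5 = 15.
#include <stdio.h>

#define N 5

int main(void)
{
	float A[N * N], x[N], y[N];

	for (int i = 0; i < N * N; i++)
		A[i] = 1.0f;
	for (int i = 0; i < N; i++)
		x[i] = (float)(i + 1);

	// y = A*x, column-major: element (row, col) lives at A[row + col*N]
	for (int row = 0; row < N; row++)
	{
		y[row] = 0.0f;
		for (int col = 0; col < N; col++)
			y[row] += A[row + col * N] * x[col];
		printf("y[%d] = %f\n", row, y[row]);   // prints 15.000000 each time
	}
	return 0;
}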

Any information would be greatly appreciated. Thanks in advance!!

This one seems to work right. Please take into account that even memory allocated with cudaMalloc may be used in CUBLAS calculations:

// qreal is assumed to be a typedef for float here.
void CUDA_ALLOC_DEVICE(qreal *&dev_out, int float_count)
{
	CUDA_SAFE_CALL(cudaMalloc((void**)&dev_out, float_count*sizeof(qreal)));
	CUT_CHECK_ERROR("CUDA_ALLOC_DEVICE");
}

void COPY_FROM_DEVICE(qreal *host_out, qreal *&dev_in, int float_count)
{
	CUDA_SAFE_CALL(cudaMemcpy(host_out, dev_in, float_count*sizeof(qreal), cudaMemcpyDeviceToHost));
	CUT_CHECK_ERROR("COPY_FROM_DEVICE");
}

void COPY_TO_DEVICE(qreal *&dev_out, qreal *host_in, int float_count)
{
	CUDA_SAFE_CALL(cudaMemcpy(dev_out, host_in, float_count*sizeof(qreal), cudaMemcpyHostToDevice));
	CUT_CHECK_ERROR("COPY_TO_DEVICE");
}

void testCublas()
{
	const int ROWS = 5;
	const int COLS = 4;
	const int TOTAL = ROWS * COLS;

	cublasStatus status;
	int iSuc = CUBLAS_STATUS_SUCCESS;             //operation completed successfully
	int iNotInit = CUBLAS_STATUS_NOT_INITIALIZED; //CUBLAS library not initialized
	int iFailed = CUBLAS_STATUS_ALLOC_FAILED;     //resource allocation failed
	int iInvalid = CUBLAS_STATUS_INVALID_VALUE;   //unsupported numerical value was passed to function
	int iMap = CUBLAS_STATUS_MAPPING_ERROR;       //access to GPU memory space failed
	int iExec = CUBLAS_STATUS_EXECUTION_FAILED;   //GPU program failed to execute
	int iInternal = CUBLAS_STATUS_INTERNAL_ERROR; //an internal CUBLAS operation failed

//	cudaSetDevice( cutGetMaxGflopsDeviceId() );
	//status = cublasInit(); // assumed to have been called earlier in the application

	float *d_Matrix, *d_Multiplier, *d_Results;
	// The matrix is ROWS x COLS (column-major), the multiplier x has COLS
	// elements, and the result y = A*x has ROWS elements.
	float h_Matrix[ROWS*COLS], h_Multiplier[COLS], h_Results[ROWS];
	float elapsedTime, selapsedTime = 0;
	size_t matrix_Size, multiplier_Size;

	matrix_Size = TOTAL * sizeof(float);
	multiplier_Size = COLS * sizeof(float);

	//allocate memory for the matrix
	//h_Matrix =(float*) malloc(matrix_Size);
	CUDA_ALLOC_DEVICE(d_Matrix, TOTAL);
	//status = cublasAlloc(TOTAL, sizeof(float), (void**)&d_Matrix);

	//allocate memory for the multiplier
	//h_Multiplier =(float*) malloc(multiplier_Size);
	CUDA_ALLOC_DEVICE(d_Multiplier, COLS);
	//status = cublasAlloc(COLS, sizeof(float), (void**)&d_Multiplier);

	//allocate memory for the result (ROWS elements, since y = A*x)
	//h_Results =(float*) malloc(multiplier_Size);
	CUDA_ALLOC_DEVICE(d_Results, ROWS);
	//status = cublasAlloc(ROWS, sizeof(float), (void**)&d_Results);

	//each column of the matrix holds the values 1..ROWS
	for (int i = 0; i < COLS; ++i)
		for (int j = 0; j < ROWS; j++)
			h_Matrix[i*ROWS + j] = j + 1;

	for (int f = 0; f < COLS; f++)
		h_Multiplier[f] = (float)f + 1;

	for (int f = 0; f < ROWS; f++)
		h_Results[f] = (float)1;

	COPY_TO_DEVICE(d_Matrix, (float*)h_Matrix, TOTAL);
	//status = cublasSetMatrix(ROWS, COLS, sizeof(float), h_Matrix, ROWS, d_Matrix, ROWS);

	COPY_TO_DEVICE(d_Multiplier, (float*)h_Multiplier, COLS);
	//status = cublasSetVector(COLS, sizeof(float), h_Multiplier, 1, d_Multiplier, 1);

	//h_Results =(float*) malloc(multiplier_Size);
	COPY_TO_DEVICE(d_Results, h_Results, ROWS);
	//status = cublasSetVector(ROWS, sizeof(float), h_Results, 1, d_Results, 1);

	cublasSgemv('n', ROWS, COLS, 1, d_Matrix, ROWS, d_Multiplier, 1, 0, d_Results, 1);
	cudaThreadSynchronize();
	status = cublasGetError();

	COPY_FROM_DEVICE(h_Results, d_Results, ROWS);
	//status = cublasGetVector(ROWS, sizeof(float), d_Results, 1, h_Results, 1);

	//status = cublasFree(d_Matrix);
	//status = cublasFree(d_Multiplier);
	//status = cublasFree(d_Results);
	//status = cublasShutdown();
}
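
With this initialization (each column of the matrix is 1..5 down the rows and the multiplier is {1,2,3,4}), the result should come back as 10, 20, 30, 40, 50, since element j of the result is (j+1)*(1+2+3+4). If you want to verify it, something like this at the end of testCublas() will print those values (just a sketch, assuming stdio.h is included):

	// print the result vector copied back from the device; expect 10, 20, 30, 40, 50
	for (int j = 0; j < ROWS; j++)
		printf("h_Results[%d] = %f\n", j, h_Results[j]);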