cublasSgemv() returning unexpected values

Hey all,

So I’m trying to do a simple matrix-vector multiplication using the CUBLAS function cublasSgemv, but I am not getting the values back that I am expecting. I am using a 5x5 matrix with all values initialized to 1 (for testing purposes), and I am multiplying this matrix by a 5-element vector initialized to {1,2,3,4,5}. My results vector comes back as 55, 60, 65, 70, 75. Those numbers don’t seem right to me, but maybe I am just misunderstanding the algorithm the function is carrying out. Here is my code:

   cudaSetDevice( cutGetMaxGflopsDeviceId() );

cublasInit();	

float *h_Matrix, *d_Matrix, *h_Multiplier, *d_Multiplier, *h_Results, *d_Results;	
float elapsedTime, selapsedTime = 0;
size_t matrix_Size, multiplier_Size;

matrix_Size = ROWS * COLS * sizeof(float);
multiplier_Size = ROWS * sizeof(float);

//allocate memory for the matrices
h_Matrix =(float*) malloc(matrix_Size);
cublasAlloc(ROWS * COLS, sizeof(float), (void**)&d_Matrix);

//allocate memory for the multipliers
h_Multiplier =(float*) malloc(multiplier_Size);
cublasAlloc(ROWS, sizeof(float), (void**)&d_Multiplier);

h_Results =(float*) malloc(multiplier_Size);
cublasAlloc(ROWS , sizeof(float), (void**)&d_Results);

//set values in Matrix to 1	
for(int i = 0; i < TOTAL; i++)
	 h_Matrix[i]= (float)1;	

for(int f = 0; f < ROWS; f++)
{  
	h_Multiplier[f] = (float)f + 1;
	h_Results[f] = (float)1;
}

cublasSetMatrix(ROWS,COLS,sizeof(float),h_Matrix,1,d_Matrix,1);

cublasSetVector(ROWS,sizeof(float),h_Multiplier,1,d_Multiplier,1);

cublasSetVector(ROWS,sizeof(float),h_Results,1,d_Results,1);

cublasSgemv('n', ROWS, COLS, 1, d_Matrix, ROWS, d_Multiplier,1,0,d_Results , 1);
cudaThreadSynchronize();


h_Results =(float*) malloc(multiplier_Size);
cublasGetVector(ROWS, sizeof(float), d_Results, 1, h_Results, 1);

cublasFree(d_Matrix);
cublasFree(d_Multiplier);
cublasFree(d_Results);
cublasShutdown();
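
For comparison, here is a quick CPU-only check of what I think the result should be (just my own reference sketch, not using CUBLAS; it assumes column-major storage like CUBLAS uses). For an all-ones 5x5 matrix and x = {1,2,3,4,5} it prints 15 for every element, which is what I expected instead of 55..75:

// Plain CPU reference for y = A*x with a column-major 5x5 matrix of ones
// and x = {1,2,3,4,5}; every y[i] should come out to 1+2+3+4+5 = 15.
#include <stdio.h>

#define N 5

int main(void)
{
	float A[N * N], x[N], y[N];

	for (int i = 0; i < N * N; i++)
		A[i] = 1.0f;
	for (int i = 0; i < N; i++)
		x[i] = (float)(i + 1);

	// y = A*x, column-major: element (row, col) lives at A[row + col*N]
	for (int row = 0; row < N; row++)
	{
		y[row] = 0.0f;
		for (int col = 0; col < N; col++)
			y[row] += A[row + col * N] * x[col];
		printf("y[%d] = %f\n", row, y[row]);   // prints 15.000000 each time
	}
	return 0;
}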

Any information would be greatly appreciated. Thanks in advance!!

This one seems to work right. Please take into account that even memory allocated with cudaMalloc may be used in CUBLAS calculations:

// qreal is assumed to be a typedef for float here.
void CUDA_ALLOC_DEVICE(qreal *&dev_out, int float_count)
{
	CUDA_SAFE_CALL(cudaMalloc((void**)&dev_out, float_count*sizeof(qreal)));
	CUT_CHECK_ERROR("CUDA_ALLOC_DEVICE");
}

void COPY_FROM_DEVICE(qreal *host_out, qreal *&dev_in, int float_count)
{
	CUDA_SAFE_CALL(cudaMemcpy(host_out, dev_in, float_count*sizeof(qreal), cudaMemcpyDeviceToHost));
	CUT_CHECK_ERROR("COPY_FROM_DEVICE");
}

void COPY_TO_DEVICE(qreal *&dev_out, qreal *host_in, int float_count)
{
	CUDA_SAFE_CALL(cudaMemcpy(dev_out, host_in, float_count*sizeof(qreal), cudaMemcpyHostToDevice));
	CUT_CHECK_ERROR("COPY_TO_DEVICE");
}

void testCublas()
{
	const int ROWS = 5;
	const int COLS = 4;
	const int TOTAL = ROWS * COLS;

	cublasStatus status;
	int iSuc = CUBLAS_STATUS_SUCCESS;             //operation completed successfully
	int iNotInit = CUBLAS_STATUS_NOT_INITIALIZED; //CUBLAS library not initialized
	int iFailed = CUBLAS_STATUS_ALLOC_FAILED;     //resource allocation failed
	int iInvalid = CUBLAS_STATUS_INVALID_VALUE;   //unsupported numerical value was passed to function
	int iMap = CUBLAS_STATUS_MAPPING_ERROR;       //access to GPU memory space failed
	int iExec = CUBLAS_STATUS_EXECUTION_FAILED;   //GPU program failed to execute
	int iInternal = CUBLAS_STATUS_INTERNAL_ERROR; //an internal CUBLAS operation failed

//	cudaSetDevice( cutGetMaxGflopsDeviceId() );
	//status = cublasInit(); // assumed to have been called earlier in the application

	float *d_Matrix, *d_Multiplier, *d_Results;
	// The matrix is ROWS x COLS (column-major), the multiplier x has COLS
	// elements, and the result y = A*x has ROWS elements.
	float h_Matrix[ROWS*COLS], h_Multiplier[COLS], h_Results[ROWS];
	float elapsedTime, selapsedTime = 0;
	size_t matrix_Size, multiplier_Size;

	matrix_Size = TOTAL * sizeof(float);
	multiplier_Size = COLS * sizeof(float);

	//allocate memory for the matrix
	//h_Matrix =(float*) malloc(matrix_Size);
	CUDA_ALLOC_DEVICE(d_Matrix, TOTAL);
	//status = cublasAlloc(TOTAL, sizeof(float), (void**)&d_Matrix);

	//allocate memory for the multiplier
	//h_Multiplier =(float*) malloc(multiplier_Size);
	CUDA_ALLOC_DEVICE(d_Multiplier, COLS);
	//status = cublasAlloc(COLS, sizeof(float), (void**)&d_Multiplier);

	//allocate memory for the result (ROWS elements, since y = A*x)
	//h_Results =(float*) malloc(multiplier_Size);
	CUDA_ALLOC_DEVICE(d_Results, ROWS);
	//status = cublasAlloc(ROWS, sizeof(float), (void**)&d_Results);

	//each column of the matrix holds the values 1..ROWS
	for (int i = 0; i < COLS; ++i)
		for (int j = 0; j < ROWS; j++)
			h_Matrix[i*ROWS + j] = j + 1;

	for (int f = 0; f < COLS; f++)
		h_Multiplier[f] = (float)f + 1;

	for (int f = 0; f < ROWS; f++)
		h_Results[f] = (float)1;

	COPY_TO_DEVICE(d_Matrix, (float*)h_Matrix, TOTAL);
	//status = cublasSetMatrix(ROWS, COLS, sizeof(float), h_Matrix, ROWS, d_Matrix, ROWS);

	COPY_TO_DEVICE(d_Multiplier, (float*)h_Multiplier, COLS);
	//status = cublasSetVector(COLS, sizeof(float), h_Multiplier, 1, d_Multiplier, 1);

	//h_Results =(float*) malloc(multiplier_Size);
	COPY_TO_DEVICE(d_Results, h_Results, ROWS);
	//status = cublasSetVector(ROWS, sizeof(float), h_Results, 1, d_Results, 1);

	cublasSgemv('n', ROWS, COLS, 1, d_Matrix, ROWS, d_Multiplier, 1, 0, d_Results, 1);
	cudaThreadSynchronize();
	status = cublasGetError();

	COPY_FROM_DEVICE(h_Results, d_Results, ROWS);
	//status = cublasGetVector(ROWS, sizeof(float), d_Results, 1, h_Results, 1);

	//status = cublasFree(d_Matrix);
	//status = cublasFree(d_Multiplier);
	//status = cublasFree(d_Results);
	//status = cublasShutdown();
}
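
With this initialization (each column of the matrix is 1..5 down the rows and the multiplier is {1,2,3,4}), the result should come back as 10, 20, 30, 40, 50, since element j of the result is (j+1)*(1+2+3+4). If you want to verify it, something like this at the end of testCublas() will print those values (just a sketch, assuming stdio.h is included):

	// print the result vector copied back from the device; expect 10, 20, 30, 40, 50
	for (int j = 0; j < ROWS; j++)
		printf("h_Results[%d] = %f\n", j, h_Results[j]);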