cublasSgemv() returning unexpected values

Hey all,

So I’m trying to do a simple matrix-vector multiplication using the cuBLAS function cublasSgemv, but I am not getting back the values I expect. I am using a 5×5 matrix with all values initialized to 1 (for testing purposes) and multiplying it by a 5-element vector initialized to {1, 2, 3, 4, 5}. My result vector contains 55, 60, 65, 70, 75. Those numbers don’t seem right to me, but maybe I am just misunderstanding the algorithm the function carries out. Here is my code:

``````   cudaSetDevice( cutGetMaxGflopsDeviceId() );

// Host-side setup for a cublasSgemv test using the legacy cuBLAS API:
// initialise the library, allocate host and device buffers, and fill the host data.
// ROWS, COLS and TOTAL are defined outside this snippet.
// NOTE(review): none of the cuBLAS return codes in this section are checked.
cublasInit();

float *h_Matrix, *d_Matrix, *h_Multiplier, *d_Multiplier, *h_Results, *d_Results;
float elapsedTime, selapsedTime = 0;
size_t matrix_Size, multiplier_Size;

// Byte sizes of the host buffers. Both vectors are sized by ROWS.
// NOTE(review): for a non-transposed gemv the input vector has COLS entries and
// the output has ROWS entries; sizing both by ROWS only works when ROWS == COLS.
matrix_Size = ROWS * COLS * sizeof(float);
multiplier_Size = ROWS * sizeof(float);

//allocate memory for the matrices
h_Matrix =(float*) malloc(matrix_Size);
cublasAlloc(ROWS * COLS, sizeof(float), (void**)&d_Matrix);

//allocate memory for the multipliers
h_Multiplier =(float*) malloc(multiplier_Size);
cublasAlloc(ROWS, sizeof(float), (void**)&d_Multiplier);

// allocate memory for the result vector
h_Results =(float*) malloc(multiplier_Size);
cublasAlloc(ROWS , sizeof(float), (void**)&d_Results);

//set values in Matrix to 1
for(int i = 0; i < TOTAL; i++)
h_Matrix[i]= (float)1;

// multiplier becomes {1, 2, ..., ROWS}; the result vector is pre-filled with 1
for(int f = 0; f < ROWS; f++)
{
h_Multiplier[f] = (float)f + 1;
h_Results[f] = (float)1;
}
cublasSetMatrix(ROWS,COLS,sizeof(float),h_Matrix,1,d_Matrix,1);

cublasSetVector(ROWS,sizeof(float),h_Multiplier,1,d_Multiplier,1);

cublasSetVector(ROWS,sizeof(float),h_Results,1,d_Results,1);

// y = alpha * A * x + beta * y with 'n' (no transpose), alpha = 1, beta = 0.
// A is read from d_Matrix as a ROWS x COLS column-major matrix with leading
// dimension ROWS.
// NOTE(review): the cublasSetMatrix upload earlier in this snippet passes 1 for
// both leading dimensions, which does not match the lda = ROWS assumed here, so
// the device matrix is likely not the all-ones matrix this call expects. The
// upload should use ROWS for lda/ldb.
cublasSgemv('n', ROWS, COLS, 1, d_Matrix, ROWS, d_Multiplier,1,0,d_Results , 1);

// NOTE(review): h_Results was already malloc'ed above; this second malloc
// overwrites that pointer without freeing the first buffer.
h_Results =(float*) malloc(multiplier_Size);
// Copy the ROWS-element result vector back to the host.
cublasGetVector(ROWS, sizeof(float), d_Results, 1, h_Results, 1);

// Release the device buffers and shut cuBLAS down. The host buffers are not
// freed anywhere in this snippet.
cublasFree(d_Matrix);
cublasFree(d_Multiplier);
cublasFree(d_Results);
cublasShutdown();
``````

Any information would be greatly appreciated. Thanks in advance!!

This one seems to work right. Please take into account that memory allocated with plain cudaMalloc can also be used in cuBLAS calculations.

``````
// Allocates device memory for `float_count` elements of type qreal with
// cudaMalloc and stores the resulting pointer in `dev_out`.
// The call is wrapped in CUDA_SAFE_CALL and followed by CUT_CHECK_ERROR
// (both macros are defined outside this snippet).
void CUDA_ALLOC_DEVICE(qreal *&dev_out, int float_count)
{
    const size_t byte_count = float_count * sizeof(qreal);

    CUDA_SAFE_CALL(cudaMalloc((void**)&dev_out, byte_count));
    CUT_CHECK_ERROR("CUDA_ALLOC_DEVICE");
}

// Copies `float_count` elements of type qreal from the device buffer `dev_in`
// into the host buffer `host_out` with a device-to-host cudaMemcpy.
// The call is wrapped in CUDA_SAFE_CALL and followed by CUT_CHECK_ERROR
// (both macros are defined outside this snippet).
void COPY_FROM_DEVICE(qreal *host_out, qreal *&dev_in, int float_count)
{
    const size_t byte_count = float_count * sizeof(qreal);

    CUDA_SAFE_CALL(cudaMemcpy(host_out, dev_in, byte_count, cudaMemcpyDeviceToHost));
    CUT_CHECK_ERROR("COPY_FROM_DEVICE");
}

// Copies `float_count` elements of type qreal from the host buffer `host_in`
// into the device buffer `dev_out` with a host-to-device cudaMemcpy.
// The call is wrapped in CUDA_SAFE_CALL and followed by CUT_CHECK_ERROR
// (both macros are defined outside this snippet).
void COPY_TO_DEVICE(qreal *&dev_out, qreal *host_in, int float_count)
{
    const size_t byte_count = float_count * sizeof(qreal);

    CUDA_SAFE_CALL(cudaMemcpy(dev_out, host_in, byte_count, cudaMemcpyHostToDevice));
    CUT_CHECK_ERROR("COPY_TO_DEVICE");
}

void testCublas()

{

const int ROWS = 5;

const int COLS = 4;

const int TOTAL = ROWS * COLS;

cublasStatus status;

int iSuc = CUBLAS_STATUS_SUCCESS; //operation completed successfully

int iNotInit = CUBLAS_STATUS_NOT_INITIALIZED; //CUBLAS library not initialized

int iFailed = CUBLAS_STATUS_ALLOC_FAILED; //resource allocation failed

int iInvalid = CUBLAS_STATUS_INVALID_VALUE; //unsupported numerical value was passed to function

int iExec = CUBLAS_STATUS_EXECUTION_FAILED; //GPU program failed to execute

int iInternal = CUBLAS_STATUS_INTERNAL_ERROR; //an internal CUBLAS operation failed

//	cudaSetDevice( cutGetMaxGflopsDeviceId() );

//status = cublasInit();

float *d_Matrix, *d_Multiplier, *d_Results;

float h_Matrix[ROWS*COLS], h_Multiplier[COLS], h_Results[COLS];

float elapsedTime, selapsedTime = 0;

size_t matrix_Size, multiplier_Size;

matrix_Size = TOTAL * sizeof(float);

multiplier_Size = ROWS * sizeof(float);

//allocate memory for the matrices

//h_Matrix =(float*) malloc(matrix_Size);

CUDA_ALLOC_DEVICE(d_Matrix, TOTAL);

//status = cublasAlloc(TOTAL, sizeof(float), (void**)&d_Matrix);

//allocate memory for the multipliers

//h_Multiplier =(float*) malloc(multiplier_Size);

CUDA_ALLOC_DEVICE(d_Multiplier, COLS);

//status = cublasAlloc(COLS, sizeof(float), (void**)&d_Multiplier);

//h_Results =(float*) malloc(multiplier_Size);

CUDA_ALLOC_DEVICE(d_Results, COLS);

status = cublasAlloc(COLS , sizeof(float), (void**)&d_Results);

//set values in Matrix to 1

for (int i = 0; i < COLS; ++i)

for(int j = 0; j < ROWS; j++)

h_Matrix[i*ROWS + j]= j+1;

for(int f = 0; f < COLS; f++)

{

h_Multiplier[f] = (float)f + 1;

h_Results[f] = (float)1;

}

COPY_TO_DEVICE(d_Matrix, (float*)h_Matrix, TOTAL);

//status = cublasSetMatrix(ROWS,COLS,sizeof(float),h_Matrix,ROWS,d_Matrix, ROWS);

COPY_TO_DEVICE(d_Multiplier, (float*)h_Multiplier, COLS);

//status = cublasSetVector(ROWS,sizeof(float),h_Multiplier,1,d_Multiplier,1);

//h_Results =(float*) malloc(multiplier_Size);

COPY_TO_DEVICE(d_Results, h_Results, COLS);

//status = cublasSetVector(ROWS,sizeof(float),h_Results,1,d_Results,1);

cublasSgemv('n', ROWS, COLS, 1, d_Matrix, ROWS, d_Multiplier,1,0,d_Results, 1);

status = cublasGetError();

COPY_FROM_DEVICE(h_Results, d_Results, COLS);

//status = cublasGetVector(ROWS, sizeof(float), d_Results, 1, h_Results, 1);

//status = cublasFree(d_Matrix);

//status = cublasFree(d_Multiplier);

//status = cublasFree(d_Results);

//status = cublasShutdown();

}
``````