small problem with cublas sgemv

I am trying to do a small sgemv using cublas.I have taken a look at the blas c-interface from netlib.org .

The only thing I could be missing is the size of lda which because the matrix is transposed i have set it to be the number of columns.

The problem is that my program works only is numRows>numCols if I set lda to be nRows it doesn’t work.

Here is the code:

#include <thrust/host_vector.h>

#include <thrust/device_vector.h>

#include <thrust/generate.h>

#include <thrust/reduce.h>

#include <thrust/functional.h>

#include <thrust/random.h>

#include <thrust/device_ptr.h>

#include <thrust/fill.h>

#include <iostream>

#include <cutil_inline.h>  

#include <cublas_v2.h>

#define DEBUG 1

#if DEBUG ==1

#define MY_SAFE_CALL(x) CUDA_SAFE_CALL(x)

#else

#define MY_SAFE_CALL(x) x

#endif

void row_sum_kernelCPU2(float * outputMatrix,float * inputMatrix,int numRows,int numColumns){

for (int i=0;i<numRows;i++){

  outputMatrix[i]=0;

for (int j=0;j<numColumns;j++){

//   std::cout<<"adding "<<inputMatrix[i*numColumns+j]<<" to the current sum "<<std::endl;

  outputMatrix[i]+=inputMatrix[i*numColumns+j];

}

// std::cout<<"finally storing "<<curr_sum<<" to position "<<i<<std::endl;

// std::cout<<outputMatrix[i]<<std::endl;

}

}

int main(void)

{

  int R = 10;     // number of rows

  int C = 5;     // number of columns

  thrust::default_random_engine rng;

  thrust::uniform_real_distribution<float> dist(10, 99);

// initialize data

  thrust::host_vector<float> array(R * C);

  for (size_t i = 0; i < array.size(); i++)

    array[i] = dist(rng);

// thrust::device_vector<T> row_sums(R);

float alpha=1.0;

float beta=0.0;

int m=R;

int n=C;

int lda=n;

float* x_vector;

MY_SAFE_CALL(cudaMalloc(&x_vector,R*sizeof(float)));

float* array_gpu;

MY_SAFE_CALL(cudaMalloc(&array_gpu,R*C*sizeof(float)));

MY_SAFE_CALL(cudaMemcpy(array_gpu,&array[0],R*C*sizeof(float),cudaMemcpyHostToDevice));

thrust::device_ptr<float> x_vector_ptr(x_vector);

thrust::device_ptr<float> array_ptr(array_gpu);

thrust::fill(x_vector_ptr,x_vector_ptr+R,1.0);

float* out_vector;

MY_SAFE_CALL(cudaMalloc(&out_vector,R*sizeof(float)));

cublasStatus_t cublasStat;

cublasHandle_t cublasHandle;

cublasStat = cublasCreate(&cublasHandle);

cublasStat= cublasSgemv(cublasHandle,CUBLAS_OP_T ,n, m,&alpha,array_gpu,lda,x_vector, 1,&beta,out_vector,1);

if (cublasStat!=CUBLAS_STATUS_SUCCESS)

{printf("execution failed \n");}

else{printf("execution successfull \n");}

float * cuda_out=(float*)malloc(R*sizeof(float));

float * vector_cpu_out=(float*)malloc(R*sizeof(float));

MY_SAFE_CALL(cudaMemcpy(cuda_out,out_vector,R*sizeof(float),cudaMemcpyDeviceToHost));

row_sum_kernelCPU2(vector_cpu_out,&array[0],R,C);

for (int i=0;i<R;i++)

{

if ((vector_cpu_out[i]-cuda_out[i])>0.1){printf("error in position %d \n",i);}

}

return 0;

}

Thank you in advance.

Apostolis

The lda is the leading dimension of the matrix before the operation (the transpose) has been performed. In other words, the lda describes the way the data is given, not how it is to be used.

by transposed I mean fortran-major instead of C-major.
CUBLAS uses fortran major so lda=nColumns(in the C program)sounds correct to me.I could be wrong of course.
Can you give me the CUBLAS call as you would write it?

I understand your point, now.

Since the number of rows is m and the number of columns is n, I believe you have your dimensions swapped. I think it should be:

cublasSgemv(cublasHandle,CUBLAS_OP_T ,m, n,&alpha,array_gpu,lda,x_vector, 1,&beta,out_vector,1);

This works ok,I just figured it out,before I saw your post.I think you were correct

#include <thrust/host_vector.h>

#include <thrust/device_vector.h>

#include <thrust/generate.h>

#include <thrust/reduce.h>

#include <thrust/functional.h>

#include <thrust/random.h>

#include <thrust/device_ptr.h>

#include <thrust/fill.h>

#include <iostream>

#include <cutil_inline.h>  

#include <cublas_v2.h>

#include <math.h>

#define DEBUG 1

#if DEBUG ==1

#define MY_SAFE_CALL(x) CUDA_SAFE_CALL(x)

#else

#define MY_SAFE_CALL(x) x

#endif

void row_sum_kernelCPU2(float * outputMatrix,float * inputMatrix,int numRows,int numColumns){

for (int i=0;i<numRows;i++){

  outputMatrix[i]=0;

for (int j=0;j<numColumns;j++){

//   std::cout<<"adding "<<inputMatrix[i*numColumns+j]<<" to the current sum "<<std::endl;

  outputMatrix[i]+=inputMatrix[i*numColumns+j];

}

// std::cout<<"finally storing "<<curr_sum<<" to position "<<i<<std::endl;

// std::cout<<outputMatrix[i]<<std::endl;

}

}

int main(void)

{

  int R = 100;     // number of rows

  int C = 1000;     // number of columns

  thrust::default_random_engine rng;

  thrust::uniform_real_distribution<float> dist(10, 99);

// initialize data

  thrust::host_vector<float> array(R * C);

  for (size_t i = 0; i < array.size(); i++)

    array[i] = dist(rng);

// thrust::device_vector<T> row_sums(R);

float alpha=1.0;

float beta=0.0;

int lda=C;

int m=C;

int n=R;

float* x_vector;

MY_SAFE_CALL(cudaMalloc(&x_vector,R*sizeof(float)));

float* array_gpu;

MY_SAFE_CALL(cudaMalloc(&array_gpu,R*C*sizeof(float)));

MY_SAFE_CALL(cudaMemcpy(array_gpu,&array[0],R*C*sizeof(float),cudaMemcpyHostToDevice));

thrust::device_ptr<float> x_vector_ptr(x_vector);

thrust::device_ptr<float> array_ptr(array_gpu);

thrust::fill(x_vector_ptr,x_vector_ptr+R,1.0);

float* out_vector;

MY_SAFE_CALL(cudaMalloc(&out_vector,R*sizeof(float)));

cublasStatus_t cublasStat;

cublasHandle_t cublasHandle;

cublasStat = cublasCreate(&cublasHandle);

cublasStat= cublasSgemv(cublasHandle,CUBLAS_OP_T ,m, n,&alpha,array_gpu,lda,x_vector, 1,&beta,out_vector,1);

if (cublasStat!=CUBLAS_STATUS_SUCCESS)

{printf("execution failed \n");}

else{printf("execution successfull \n");}

float * cuda_out=(float*)malloc(R*sizeof(float));

float * vector_cpu_out=(float*)malloc(R*sizeof(float));

MY_SAFE_CALL(cudaMemcpy(cuda_out,out_vector,R*sizeof(float),cudaMemcpyDeviceToHost));

row_sum_kernelCPU2(vector_cpu_out,&array[0],R,C);

for (int i=0;i<R;i++)

{

if (abs(vector_cpu_out[i]-cuda_out[i])>0.1){printf("error in position %d \n",i);}

}

return 0;

}