cublas matrix-vector problem


I’m trying to do a simple non-square matrix-vector multiplication using CUBLAS. The problem should be trivial, but I can’t make it work correctly.

Matrix w has size U×Z (U is the number of rows, Z the number of columns).

Vector z has size Z.

Output vector y has size U.



All values are set randomly between 0.0 and 1.0.

The difference between the CPU and GPU results can exceed 2.0.

I’m using the following code:

/* Copy the U x Z host matrix w to dev_W; both leading dimensions are U. */
/* NOTE(review): CUBLAS assumes column-major storage, so with lda = U the
   element (r, c) is read from w[c*U + r]. If w is filled row-major on the
   host (w[r*Z + c]) -- the fill code is not shown, TODO confirm -- the GPU
   multiplies by a different matrix than the CPU reference does. */
cublasSetMatrix (U, Z, sizeof(float), w, U, dev_W, U);

	/* Copy the Z-element host vector z to dev_Z (stride 1 on both sides). */
	cublasSetVector (Z, sizeof(float), z,1, dev_Z, 1);

	/* dev_Y = 1.0 * dev_W * dev_Z + 0.0 * dev_Y; 'n' = no transpose, U rows, Z columns, lda = U. */
	cublasSgemv('n', U, Z, 1.0,dev_W, U, dev_Z,1, 0.0, dev_Y, 1);

	/* Copy the U-element result from dev_Y back to the host vector y. */
	cublasGetVector (U, sizeof(float), dev_Y,1, y, 1);

What have I done wrong?

I can’t see anything wrong. Here is the code I used, which seems to work:

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <math.h>

#include <cutil.h>

#include "cublas.h"

#include <cutil_inline.h>

/* Maps a 1-based (row i, column j) pair to a 0-based offset into a
   column-major array whose leading dimension (stride between columns) is ld. */
#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))

/* Number of rows of the test matrix A; also the length of the output vector y. */
#define M 6

/* Number of columns of the test matrix A; also the length of the input vector x. */
#define N 5

int main (int argc, char** argv)


   int i, j;

   cublasStatus stat;

   float* devPtrA;

   float* devPtrx;

   float* devPtry;

   float* a = 0;

   float* x;

   float* y;

   a = (float *)malloc (M * N * sizeof (float));

   x = (float *)malloc (N * sizeof (float));

   y = (float *)malloc (M * sizeof (float));

   for (j = 1; j <= N; j++) {

	  for (i = 1; i <= M; i++) {

		 a[IDX2F(i,j,M)] = (i-1) * M + j;



   for (i = 1; i <= N; i++) {

		 x[i-1] = i;


   for (i = 1; i <= M; i++) {

		 y[i-1] = 0.0f;


   for (i = 1; i <= M; i++) {

	  for (j = 1; j <= N; j++) {

		 printf ("%7.0f", a[IDX2F(i,j,M)]);


	  printf ("\n");


   printf ("\n");

   if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

		cutilDeviceInit(argc, argv);


		cudaSetDevice( cutGetMaxGflopsDeviceId() );

cutilSafeCall(cudaMalloc((void**) &devPtrA, M * N * sizeof(float)));

   cutilSafeCall(cudaMemcpy(devPtrA, a, M * N * sizeof(float), cudaMemcpyHostToDevice));


   for (i = 1; i <= M; i++) {

	  for (j = 1; j <= N; j++) {

		 printf ("%7.0f", a[IDX2F(i,j,M)]);


	  printf ("\n");


   cutilSafeCall(cudaMalloc((void**) &devPtrx, N*sizeof(float)));

   printf ("maloc done\n");

   cutilSafeCall(cudaMalloc((void**) &devPtry, M*sizeof(float)));

   printf ("maloc done\n");

   cutilSafeCall(cudaMemcpy(devPtrx, x, N * sizeof(float), cudaMemcpyHostToDevice));

   cutilSafeCall(cudaMemcpy(devPtry, y, M * sizeof(float), cudaMemcpyHostToDevice));

   cublasSgemv ('n', M, N, 1.0f, devPtrA, M, devPtrx, 1, 1.0f, devPtry, 1);

   stat = cublasGetVector (M, sizeof(float), devPtry, 1, y, 1);

   for (i=0;i<M;i++) {printf("%7.0f\n",y[i]);}

cublasFree (devPtrA);

   cublasFree (devPtrx);

   cublasFree (devPtry);