I can’t see anything wrong. Here is the code I used, which seems to work:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
#include "cublas.h"
#include <cutil_inline.h>
/* Map 1-based (row i, column j) indices to a 0-based linear offset in a
   column-major array whose leading dimension is ld. */
#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))
/* Number of rows of the matrix a; also the length of the result vector y. */
#define M 6
/* Number of columns of the matrix a; also the length of the input vector x. */
#define N 5
int main (int argc, char** argv)
{
int i, j;
cublasStatus stat;
float* devPtrA;
float* devPtrx;
float* devPtry;
float* a = 0;
float* x;
float* y;
a = (float *)malloc (M * N * sizeof (float));
x = (float *)malloc (N * sizeof (float));
y = (float *)malloc (M * sizeof (float));
for (j = 1; j <= N; j++) {
for (i = 1; i <= M; i++) {
a[IDX2F(i,j,M)] = (i-1) * M + j;
}
}
for (i = 1; i <= N; i++) {
x[i-1] = i;
}
for (i = 1; i <= M; i++) {
y[i-1] = 0.0f;
}
for (i = 1; i <= M; i++) {
for (j = 1; j <= N; j++) {
printf ("%7.0f", a[IDX2F(i,j,M)]);
}
printf ("\n");
}
printf ("\n");
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId() );
cutilSafeCall(cudaMalloc((void**) &devPtrA, M * N * sizeof(float)));
cutilSafeCall(cudaMemcpy(devPtrA, a, M * N * sizeof(float), cudaMemcpyHostToDevice));
cublasInit();
for (i = 1; i <= M; i++) {
for (j = 1; j <= N; j++) {
printf ("%7.0f", a[IDX2F(i,j,M)]);
}
printf ("\n");
}
cutilSafeCall(cudaMalloc((void**) &devPtrx, N*sizeof(float)));
printf ("maloc done\n");
cutilSafeCall(cudaMalloc((void**) &devPtry, M*sizeof(float)));
printf ("maloc done\n");
cutilSafeCall(cudaMemcpy(devPtrx, x, N * sizeof(float), cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(devPtry, y, M * sizeof(float), cudaMemcpyHostToDevice));
cublasSgemv ('n', M, N, 1.0f, devPtrA, M, devPtrx, 1, 1.0f, devPtry, 1);
stat = cublasGetVector (M, sizeof(float), devPtry, 1, y, 1);
for (i=0;i<M;i++) {printf("%7.0f\n",y[i]);}
cublasFree (devPtrA);
cublasFree (devPtrx);
cublasFree (devPtry);
free(a);
free(x);
free(y);
cublasShutdown();
return EXIT_SUCCESS;
}