Hi there,
I just started CUDA programming, so please be forgiving. :)
My question concerns using CUBLAS. I tried to write a simple program that multiplies two matrices and then prints them to the console. The code runs without reporting any errors, but the result is wrong. What am I doing wrong here?
Thanks in advance, snippet follows:
cublasHandle_t handle;
int L,M,N; //Matrix dimensions A [L;M], B[M;N], C [L;N]
L=M=2;N=1;
float alpha = 4.0f;
float beta = 0.0f;
float* devA;float* devB; float* devC;
float* a = NULL;
float* b = NULL;
float* c = NULL;
a=new float[L*M];//simpler this way than malloc
b=new float[M*N];
c=new float[L*N];
float tempc=0;
//Fill with content
for ( j = 1 ; j <= L ; j++) { for ( i = 1 ; i <= M ; i++) { a[IDX2C(i,j,M)] = tempc++; }}
for ( j = 1 ; j <= M ; j++) { for ( i = 1 ; i <= N ; i++) { b[IDX2C(i,j,N)] = tempc++; }}
cout << "Matrices initialized!\n";
cudaStat=cudaMalloc((void**)&devA,L*M*sizeof(*a));
cudaStat=cudaMalloc((void**)&devB,M*N*sizeof(*b));
cudaStat=cudaMalloc((void**)&devC,L*N*sizeof(*c));
cublasInit();
stat=cublasCreate(&handle);
stat=cublasSetMatrix(L,M,sizeof(*a),a,L,devA,L);
stat=cublasSetMatrix(M,N,sizeof(*b),b,M,devB,M);
stat=cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N, L,M,N,&alpha, devA,L , devB, M, &beta, devC, L);
stat=cublasGetMatrix(L,N,sizeof(*c),devC,N,c,N);
RunCUDACode();
cout << "Results A:";
for ( j = 1 ; j <= L ; j++) {
for ( i = 1 ; i <= M ; i++) {
printf("%7.0f", a[IDX2C(i,j,M)]);
}
cout <<"\n";
}
cout << "Results B:";
for ( j = 1 ; j <=M ; j++) {
for ( i = 1 ; i <= N ; i++) {
printf("%7.0f", b[IDX2C(i,j,N)]);
}
cout <<"\n";
}
cout << "Results C:";
for ( j = 1 ; j <= L ; j++) {
for ( i = 1 ; i <= N ; i++) {
printf("%7.0f", c[IDX2C(i,j,N)]);
}
cout <<"\n";
}
... (freeing stuff)
return 0;