I think I have a problem as well. I multiply two matrices, V (10×6) and H (10×5).
The multiplication performed is w += V.transpose * H.
The result should have shape 6×5.
The results for the first 5 rows are correct, but the last row is off. If I change the dimensions, I get even bigger problems. Am I doing something wrong, or is this a bug?
I use CUDA 2.1 and stored the matrices in column-major order.
-Peter
[codebox]
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cublas.h>
#include <cuda_runtime.h>

#include "cutil.h"
#include "cutil_inline.h"
int
main(int argc, char** argv){
int device;
struct cudaDeviceProp properties;
if( cutCheckCmdLineFlag(argc, (const char**)argv, “device”) )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId() );
cutilSafeCall(cudaGetDevice(&device));
cutilSafeCall(cudaGetDeviceProperties(&properties, device));
cublasStatus stat;
cublasInit();
int s1=6;int s2=5;int m1=0;int T=10;
unsigned int mem_size_w = sizeof(float)s1s2*(m1+1);
float* d_H;
float* d_V;
float* d_w;
size_t d_Hp;
size_t d_Vp;
size_t d_wp;
float* h_w = (float*)malloc(mem_size_w);
float* h_H = (float*)malloc(T*sizeof(float)*s2);
float* h_V = (float*)malloc(T*sizeof(float)*s1);
size_t h_wp=s2*sizeof(float);
size_t h_Hp=T*sizeof(float);
size_t h_Vp=T*sizeof(float);
cutilSafeCall(cudaMallocPitch((void**) &d_H, &d_Hp, T*sizeof(float), s2));
cutilSafeCall(cudaMallocPitch((void**) &d_V, &d_Vp, T*sizeof(float), s1));
cutilSafeCall(cudaMallocPitch((void**) &d_w, &d_wp, s2sizeof(float), s1(m1+1)));
for (int i=0;i<T;i++) {
for (int k=0;k<s1;k++) {
h_V[k*h_Vp/sizeof(float)+i]=k*h_Vp/sizeof(float)+i;
}
for (int k=0;k<s2;k++) {
h_H[k*h_Hp/sizeof(float)+i]=-float((k*h_Hp/sizeof(float)+i));
}
}
cutilSafeCall(cudaMemcpy2D(d_H, d_Hp, h_H, h_Hp, T*sizeof(float), s2, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy2D(d_V, d_Vp, h_V, h_Vp, T*sizeof(float), s1, cudaMemcpyHostToDevice));
cudaMemset(d_w, 0, s1d_wp(m1+1));
cublasSgemm(‘t’, ‘n’, s1, s2, T, 1.0f, d_V, d_Vp/sizeof(float), d_H, d_Hp/sizeof(float), 1.0f, d_w, d_wp/sizeof(float));
cutilSafeCall(cudaMemcpy2D(h_w, h_wp, d_w, d_wp, s2*sizeof(float), (m1+1)*s1, cudaMemcpyDeviceToHost));
for (int m=0;m<m1+1;m++){
for (int i=0;i<s1;i++){
for (int k=0;k<s2;k++) {
if (k<s2-1) {printf("%3.8f,", h_w[m*s1*s2+i*s2+k]);} else {printf("%3.8f],\n[", h_w[m*s1*s2+i*s2+k]);}}
}
}
free(h_w);free(h_H);free(h_V);
cublasFree (d_w);cublasFree (d_H);cublasFree (d_V);
}
[/codebox]