I think I have a problem as well. I multiply two matrices V=10,6 H=10,5

The multiplication done is w+=V.transpose * H

The result should be shaped: 6,5

The results for the first 5 rows is correct, but the last row is off. If I change the dimensions, I get even bigger problems. Am I doing something wrong, or is this a bug?

I use cuda 2.1 and did store matrices in column-major order

-Peter

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include “cutil.h”

#include <cublas.h>

#include “cutil_inline.h”

int

main(int argc, char** argv){

int device;

struct cudaDeviceProp properties;

if( cutCheckCmdLineFlag(argc, (const char**)argv, “device”) )

cutilDeviceInit(argc, argv);

else

cudaSetDevice( cutGetMaxGflopsDeviceId() );

cutilSafeCall(cudaGetDevice(&device));

cutilSafeCall(cudaGetDeviceProperties(&properties, device));

cublasStatus stat;

cublasInit();

int s1=6;int s2=5;int m1=0;int T=10;

unsigned int mem_size_w = sizeof(float)*s1*s2*(m1+1);

float* d_H;

float* d_V;

float* d_w;

size_t d_Hp;

size_t d_Vp;

size_t d_wp;

float* h_w = (float*)malloc(mem_size_w);

float* h_H = (float*)malloc(T*sizeof(float)*s2);

float* h_V = (float*)malloc(T*sizeof(float)*s1);

size_t h_wp=s2*sizeof(float);

size_t h_Hp=T*sizeof(float);

size_t h_Vp=T*sizeof(float);

cutilSafeCall(cudaMallocPitch((void**) &d_H, &d_Hp, T*sizeof(float), s2));

cutilSafeCall(cudaMallocPitch((void**) &d_V, &d_Vp, T*sizeof(float), s1));

cutilSafeCall(cudaMallocPitch((void**) &d_w, &d_wp, s2*sizeof(float), s1*(m1+1)));

for (int i=0;i<T;i++) {

```
for (int k=0;k<s1;k++) {
h_V[k*h_Vp/sizeof(float)+i]=k*h_Vp/sizeof(float)+i;
}
for (int k=0;k<s2;k++) {
h_H[k*h_Hp/sizeof(float)+i]=-float((k*h_Hp/sizeof(float)+i));
}
```

}

cutilSafeCall(cudaMemcpy2D(d_H, d_Hp, h_H, h_Hp, T*sizeof(float), s2, cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy2D(d_V, d_Vp, h_V, h_Vp, T*sizeof(float), s1, cudaMemcpyHostToDevice));

cudaMemset(d_w, 0, s1*d_wp*(m1+1));

cublasSgemm(‘t’, ‘n’, s1, s2, T, 1.0f, d_V, d_Vp/sizeof(float), d_H, d_Hp/sizeof(float), 1.0f, d_w, d_wp/sizeof(float));

cutilSafeCall(cudaMemcpy2D(h_w, h_wp, d_w, d_wp, s2*sizeof(float), (m1+1)*s1, cudaMemcpyDeviceToHost));

for (int m=0;m<m1+1;m++){

```
for (int i=0;i<s1;i++){
for (int k=0;k<s2;k++) {
if (k<s2-1) {printf("%3.8f,", h_w[m*s1*s2+i*s2+k]);} else {printf("%3.8f],\n[", h_w[m*s1*s2+i*s2+k]);}}
}
}
```

free(h_w);free(h_H);free(h_V);

cublasFree (d_w);cublasFree (d_H);cublasFree (d_V);

}

