beginner CUBLAS Sgemm question

I’m having trouble getting the correct result using cublasSgemm. Basically, I just copied the simpleCUBLAS SDK example with some minor changes, and as far as I can tell from the documentation, I’m using the right input arguments. Any help with where I’m going wrong is appreciated. I’m guessing it’s probably something stupid I’m overlooking.

The C initialization:

[codebox]// Host-side test data, stored row-major:
//   in1 is 5x3 (15 floats), in2 is 3x5 (15 floats), out1 holds the 5x5 product.
float* in1 = new float[5*3];
float* in2 = new float[3*5];
float* out1 = new float[5*5];

// Both inputs have 15 elements, so one loop fills them:
// in1 = 0, 1, ..., 14 and in2 = 25, 24, ..., 11.
for (int i = 0; i < 3*5; i++) {
	in1[i] = i;
	in2[i] = 25-i;
}

// Pass the arrays declared above (the original call named undeclared
// h_in1/h_in2). Arguments: rows of in1, cols of in1 (= rows of in2), cols of in2.
call_MultiplyMatrices_GPU(in1, in2, 5, 3, 5);[/codebox]

The cublas calls:

[codebox]void call_MultiplyMatrices_GPU(float* h_in1, float* h_in2, uint32 in1_rows, uint32 in1_cols, uint32 in2_cols)

{

cublasStatus status;

    status = cublasInit();

uint32 m = in1_rows; // = out1_rows

uint32 k = in1_cols; // = in2_rows

uint32 n = in2_cols; // = out1_cols

float *d_in1, *d_in2, *d_out1;

status = cublasAlloc(m*k, sizeof(float), (void**)&d_in1);

    status = cublasAlloc(k*n, sizeof(float), (void**)&d_in2);

    status = cublasAlloc(m*n, sizeof(float), (void**)&d_out1);

float* h_out1 = (float*)malloc(m*n * sizeof(float));

for (uint32 i = 0; i < m*n; i++)     

	h_out1[i] = 0;

status = cublasSetVector(m*k, sizeof(float), h_in1, 1, d_in1, 1);

    status = cublasSetVector(k*n, sizeof(float), h_in2, 1, d_in2, 1);

    status = cublasSetVector(m*n, sizeof(float), h_out1, 1, d_out1, 1);

cublasSgemm(‘n’, ‘n’, m, n, k, 1.0f, d_in1, m, d_in2, k, 0.0f, d_out1, m);

status = cublasGetVector(m*n, sizeof(float), d_out1, 1, h_out1, 1);

for (uint32 i = 0; i < 5; i++)

{

	for (uint32 j = 0; j < 5; j++)

		printf("%4.2f\t", h_out1[i*5 + j]);

	printf("\n");

}

status = cublasFree(d_in1);

status = cublasFree(d_in2);

    cublasFree(d_out1);

cublasShutdown();

}[/codebox]

Output:

[codebox]Cublas:

350.00 422.00 494.00 566.00 638.00

305.00 368.00 431.00 494.00 557.00

260.00 314.00 368.00 422.00 476.00

215.00 260.00 305.00 350.00 395.00

170.00 206.00 242.00 278.00 314.00

C:

50.00 47.00 44.00 41.00 38.00

230.00 218.00 206.00 194.00 182.00

410.00 389.00 368.00 347.00 326.00

590.00 560.00 530.00 500.00 470.00

770.00 731.00 692.00 653.00 614.00[/codebox]

Your CPU code treats the matrices as row-major:

in1 =

	 0	 1	 2

	 3	 4	 5

	 6	 7	 8

	 9	10	11

	12	13	14

in2 =

	25	24	23	22	21

	20	19	18	17	16

	15	14	13	12	11

out1 =

	50	47	44	41	38

   230   218   206   194   182

   410   389   368   347   326

   590   560   530   500   470

   770   731   692   653   614

But CUBLAS uses column-major storage.

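This looks like a row-major versus column-major ordering problem: CUBLAS follows the FORTRAN BLAS conventions and expects matrices to be in column-major order. Because a row-major matrix is the column-major layout of its transpose, you can get the row-major product without transposing anything by swapping the operands and dimensions: cublasSgemm('n', 'n', n, m, k, 1.0f, d_in2, n, d_in1, k, 0.0f, d_out1, n).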