multiplication of no square matrix general code of multiplication of two matrix

hello,

i would wirte my own code of multiplication of two matrix R[1][3]= A[1][2] xB[2][3]

A=[0 1]

B= 0 1 2
3 4 5

we should have a result R= [3 4 5 ]

but the result is 2 3 4

this my code

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cutil_inline.h>

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void MatrixMulOnDevice(float * M, float * N, float * P,int Hm,int Wm ,int Hn,int Wn);
void MatrixMulKernel(float * Md, float * Nd, float * Pd, int Hm,int Wm ,int Hn,int Wn);

float* Md;
float* Nd;
float* Pd;

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char** argv)
{

MatrixMulOnDevice(Md, Nd, Pd,1,2,2,3);

}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////

global void MatrixMulKernel(float * Md, float * Nd, float * Pd,int Hm,int Wm,int Hn,int Wn)
{

int tx = threadIdx.x;
int ty = threadIdx.y;

float Pvaleur = 0;
for (int i = 0; i < Wm; ++i)
{
float MdElement = Md[ty * Wm + i];
float NdElement = Nd[i * Wm + tx];
Pvaleur += MdElement * NdElement;
}

Pd[ty * Wn + tx] = Pvaleur;
}

void MatrixMulOnDevice(float * M, float * N, float * P , int Hm,int Wm,int Hn ,int Wn)
{
int sizem = HmWm * sizeof(float);
int sizen = Hn
Wn * sizeof(float);
int sizep = Hm*Wn * sizeof(float);

// allocate arrays on host
float* Ma = (float*) malloc(sizem);
float* Na = (float*) malloc(sizen);
float* Pa = (float*) malloc(sizep);

// initialize host memory
for (int i=0; i< HmWm; i++) Ma[i] = (float)i;
for (int j=0; j<Hn
Wn; j++) Na[j] = (float)j;

for(int i=0; i<HmWm;i++)
printf(“b[%d]=%f\n”,i,Ma[i]);
printf("\n");
for(int i=0; i<Hn
Wn;i++)
printf(“a[%d]=%f\n”,i,Na[i]);
printf("\n");

// allocate array on device
cudaMalloc((void **) &Md, sizem);
cudaMemcpy(Md, Ma, sizem, cudaMemcpyHostToDevice) ;//matrix M

cudaMalloc((void **) &Nd, sizen);
cudaMemcpy(Nd, Na, sizen, cudaMemcpyHostToDevice); //matrix N

cudaMalloc((void **) &Pd, sizep); //matrix P

dim3 dimGrid(1, 1);

dim3 dimBlock(4,4);

//Call of MatrixMulKernel
MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd,Hm,Wm,Hn,Wn);

// copy data from device to host
cudaMemcpy(Pa, Pd, sizep, cudaMemcpyDeviceToHost);
for(int i=0; i<Hm*Wn;i++)
printf(“s[%d]=%f\n”,i,Pa[i]);

free(Ma);
free(Na);
free(Pa);
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
}