problem of compile matrixMul error of compilation

sorry this is my code

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cutil_inline.h>

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void MatrixMulOnDevice(float * M, float * N, float * P, int Width);
void MatrixMulKernel(float * Md, float * Nd, float * Pd, int Width);

float* Md;
float* Nd;
float* Pd;

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char** argv)
{

MatrixMulOnDevice(Md, Nd, Pd, 10);

}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////

void MatrixMulKernel(float * Md, float * Nd, float * Pd, int Width)
{

int tx = threadIdx.x;
int ty = threadIdx.y;

float Pvaleur = 0;
for (int i = 0; i < Width; ++i)
{
    float MdElement = Md[ty * Width + i];
    float NdElement = Nd[i  * Width + tx];
    Pvaleur        += MdElement * NdElement;
}


Pd[ty * Width + tx] = Pvaleur;

}

void MatrixMulOnDevice(float * M, float * N, float * P, int Width)
{
int size = Width*Width * sizeof(float);

// allocate arrays on host
float* Ma = (float*) malloc(size);
float* Na = (float*) malloc(size);
float* Pa = (float*) malloc(size);

// initialize host memory
for (int i=0; i<size; i++) Ma[i] = (float)i;
for (int j=0; j<size; j++) Na[j] = (float)j;

// allocate array on device
cudaMalloc((void **) &Md, size);
cudaMemcpy(Md, Ma, size, cudaMemcpyHostToDevice) ;
cudaMalloc((void **) &Nd, size);
cudaMemcpy(Nd, Na, size, cudaMemcpyHostToDevice);
cudaMalloc((void **) &Pd, size);

dim3 dimGrid(1, 1);

dim3 dimBlock(Width, Width);

//Call of MatrixMulKernel
MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, Width);

// copy data from host to device
cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost);

//destruction des matrices, désormais inutilisées
free(Ma);
free(Na);
free(Pa);
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);

}