# problem in Matrix multiplication code

Hi, I try to check calculate time of my square matrix multiplication code.

I can check calculate time on 2048x2048 matrix. And result of CPU calculate is same of GPU result.

But, i can’t check calculate time on 4096x4096 matrix. And result of CPU calculate is not same of GPU result.

In my code, M and N is square matrix. element of M is all 1, N is unit matrix.

so M x N = M, but GPU return all 0.

sombody help me plzzzzzzzzzzzzzzzz :(;

p.s i use GTX680 and program environment Window7 64bit-Visualstudio2010

here is my code:

#include <omp.h>
#include
#include <stdio.h>
#include <Windows.h>
#include “…/…/common/book.h”
#include “cuda.h”
#include “cuda_runtime.h”
#include “device_launch_parameters.h”

void MatrixMulC(intM, intN, intP, int Width)
{
int col=0;
int raw=0;
int index=0;
int Destindex=0;
#pragma omp parallel
for(col=0; col<Width; col++)
{
for(raw=0; raw< Width; raw++)
{
Destindex= col
Width+raw;
for(index =0; index< Width; index++)
P[Destindex]+=M[col * Width + index]N[index * Width + raw];
}
}
}
global void MatrixMul(int
M, intN, intP, int Width)
{
int tid, tx, ty;
tx= blockDim.x * blockIdx.x + threadIdx.x;
ty= blockDim.y * blockIdx.y + threadIdx.y;
tid=Width * ty + tx;

``````int Value=0;
int MVal=0;
int NVal=0;

for(int i=0; i< Width; i++)
{
MVal=M[ty * Width +i];
NVal=N[i * Width + tx];
Value +=MVal * NVal;
}
P[tid]=Value;
``````

}

int main()
{
const int MatrixWidth =4096;
const int MatrixHeight=4096;
const int MatrixSize = MatrixWidthMatrixHeight;
const int BufferSize = MatrixSize
sizeof(int);
LARGE_INTEGER liCounter1, liCounter2, liFrequency;
int* M;
int* N;
int* P_cuda;
int* P_C;

``````FILE *fp, *fp1, *fp2, *fp3;
M=(int*)malloc(BufferSize);
N=(int*)malloc(BufferSize);
P_cuda = (int*)malloc(BufferSize);
P_C = (int *)malloc(BufferSize);

int i=0;
for(i=0; i<MatrixSize; i++)
{
M[i] = 1;
N[i] = 0;
P_cuda[i] =0;
P_C[i]=0;

}
for(i =0; i<MatrixWidth; i++)
{

N[i*MatrixWidth+i]=1;
}

fp2=fopen("M.txt","w");
fp3=fopen("N.txt","w");
for(i=0; i<MatrixSize; i++)
{
if(i%MatrixWidth == 0)
{
fprintf(fp2,"\n");
fprintf(fp3,"\n");

}
fprintf(fp2,"[%d], ",M[i]);
fprintf(fp3,"[%d], ",N[i]);
}
fclose(fp2);
fclose(fp3);

int* dev_M;
int* dev_N;
int* dev_P;

//
(cudaMalloc((void**)&dev_M, BufferSize));
(cudaMalloc((void**)&dev_N, BufferSize));
(cudaMalloc((void**)&dev_P, BufferSize));

//
(cudaMemcpy(dev_M, M,BufferSize, cudaMemcpyHostToDevice));
(cudaMemcpy(dev_N, N,BufferSize, cudaMemcpyHostToDevice));

dim3 Dg(128,128,1);
dim3 Db(32,32,1);

QueryPerformanceFrequency(&liFrequency);  // retrieves the frequency of the high-resolution performance counter
QueryPerformanceCounter(&liCounter1);         // Start
//
MatrixMul<<<Dg, Db>>>(dev_M, dev_N, dev_P, MatrixWidth);
//
cudaMemcpy(P_cuda, dev_P, BufferSize, cudaMemcpyDeviceToHost);
QueryPerformanceCounter(&liCounter2);         // End
fp=fopen("P_cuda.txt","w");
for(i=0; i<MatrixSize; i++)
{
if(i%MatrixWidth == 0)
{
fprintf(fp,"\n");

}
fprintf(fp,"[%d], ",P_cuda[i]);
}
fclose(fp);

QueryPerformanceFrequency(&liFrequency);  // retrieves the frequency of the high-resolution performance counter
QueryPerformanceCounter(&liCounter1);         // Start
//
MatrixMulC(M, N, P_C, MatrixWidth);
QueryPerformanceCounter(&liCounter2);         // End
fp1=fopen("P_C.txt","w");
for(i=0; i<MatrixSize; i++)
{
if(i%MatrixWidth == 0)
{
fprintf(fp1,"\n",i);
}
fprintf(fp1,"[%d]",P_C[i]);
}
fclose(fp1);

bool ResultFlag = true;

for(i = 0; i< MatrixSize; i++)
{
if( P_cuda[i]!=P_C[i])
{
ResultFlag= false;

}

}
if(ResultFlag == true) printf("result ok\n");
else printf("resutl false");

cudaFree(dev_M);
cudaFree(dev_N);
cudaFree(dev_P);

free(M);
free(N);
free(P_cuda);
free(P_C);

return 0;
``````

}