Problem in matrix multiplication code

Hi, I am trying to measure the computation time of my square matrix multiplication code.

With a 2048x2048 matrix I can measure the computation time, and the CPU result matches the GPU result.

But with a 4096x4096 matrix I cannot measure the computation time, and the CPU result does not match the GPU result.

In my code, M and N are square matrices. Every element of M is 1, and N is the identity matrix.

So M x N = M, but the GPU returns all 0.

Somebody help me, please :(

P.S. I use a GTX 680, and my development environment is Windows 7 64-bit with Visual Studio 2010.

here is my code:

#include <stdio.h>
#include <stdlib.h>

#include <omp.h>
#include <Windows.h>

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "../../common/book.h"

/*
 * CPU reference multiply for square, row-major Width x Width int matrices.
 *
 * Accumulates the product into P:  P[r][c] += sum over k of M[r][k] * N[k][c].
 * Because the result is added to P (not assigned), the caller must zero P
 * beforehand to obtain P = M x N.
 *
 * M, N  : input matrices, Width * Width elements each.
 * P     : output matrix, Width * Width elements, accumulated into.
 * Width : number of rows/columns; a value <= 0 leaves P untouched.
 */
void MatrixMulC(int* M, int* N, int* P, int Width)
{
    // "parallel for" distributes the rows over the OpenMP threads. A bare
    // "parallel" would make every thread execute the complete loop nest on
    // shared counters and race on P. All loop state is declared inside the
    // loops so that each thread gets private copies.
    #pragma omp parallel for
    for (int row = 0; row < Width; row++)
    {
        // Widen before multiplying so the flat offset cannot overflow int.
        const size_t rowBase = (size_t)row * (size_t)Width;

        for (int col = 0; col < Width; col++)
        {
            int sum = 0;
            for (int k = 0; k < Width; k++)
                sum += M[rowBase + k] * N[(size_t)k * (size_t)Width + col];

            // Each (row, col) element is owned by exactly one iteration of the
            // parallel loop, so this update needs no synchronization.
            P[rowBase + col] += sum;
        }
    }
}
/*
 * Naive GPU multiply for square, row-major Width x Width int matrices:
 * each thread computes one element of P = M x N.
 *
 * Launch layout: 2D grid of 2D blocks. The x dimension selects the column and
 * the y dimension selects the row of the output element. The grid must provide
 * at least Width threads in both x and y; surplus threads return immediately.
 * No shared memory is used.
 *
 * M, N  : device pointers to the input matrices (Width * Width elements).
 * P     : device pointer to the output matrix (Width * Width elements).
 * Width : number of rows/columns.
 */
__global__ void MatrixMul(int* M, int* N, int* P, int Width)
{
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;

    // Guard the grid tail: when the grid covers more than Width threads in a
    // dimension, the extra threads must not read or write out of bounds.
    if (col >= Width || row >= Width)
        return;

    // Widen before multiplying so the flat offset cannot overflow int.
    const size_t rowBase = (size_t)row * (size_t)Width;

    int acc = 0;
    for (int k = 0; k < Width; k++)
        acc += M[rowBase + k] * N[(size_t)k * (size_t)Width + col];

    P[rowBase + col] = acc;
}

/* Aborts the program with a diagnostic when a CUDA runtime call has failed. */
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Opens `path` for writing, aborting the program if the file cannot be created. */
static FILE* openForWrite(const char* path)
{
    FILE* fp = fopen(path, "w");
    if (fp == NULL)
    {
        fprintf(stderr, "cannot open %s for writing\n", path);
        exit(EXIT_FAILURE);
    }
    return fp;
}

/*
 * Writes `count` ints to `path` as text: a newline is emitted before every
 * `width`-th element, and each element is printed as "[value]" followed by
 * `separator`.
 */
static void dumpMatrix(const char* path, const int* data, int width, int count,
                       const char* separator)
{
    FILE* fp = openForWrite(path);
    for (int i = 0; i < count; i++)
    {
        if (i % width == 0)
            fprintf(fp, "\n");
        fprintf(fp, "[%d]%s", data[i], separator);
    }
    fclose(fp);
}

/*
 * Multiplies an all-ones matrix M by the identity matrix N on the GPU and on
 * the CPU, times both, dumps the inputs and both results to text files, and
 * reports whether the two results agree.
 */
int main()
{
    const int MatrixWidth  = 4096;
    const int MatrixHeight = 4096;
    const int MatrixSize   = MatrixWidth * MatrixHeight;
    const size_t BufferSize = (size_t)MatrixSize * sizeof(int);
    LARGE_INTEGER liCounter1, liCounter2, liFrequency;

    int* M      = (int*)malloc(BufferSize);
    int* N      = (int*)malloc(BufferSize);
    int* P_cuda = (int*)malloc(BufferSize);
    int* P_C    = (int*)malloc(BufferSize);
    if (M == NULL || N == NULL || P_cuda == NULL || P_C == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)BufferSize);
        free(M);
        free(N);
        free(P_cuda);
        free(P_C);
        return EXIT_FAILURE;
    }

    // M is all ones, N starts as all zeros; both result buffers are cleared
    // (MatrixMulC accumulates into its output, so P_C must start at zero).
    int i = 0;
    for (i = 0; i < MatrixSize; i++)
    {
        M[i] = 1;
        N[i] = 0;
        P_cuda[i] = 0;
        P_C[i] = 0;
    }
    // Set the diagonal so that N is the identity matrix, hence M x N == M.
    for (i = 0; i < MatrixWidth; i++)
    {
        N[i * MatrixWidth + i] = 1;
    }

    dumpMatrix("M.txt", M, MatrixWidth, MatrixSize, ", ");
    dumpMatrix("N.txt", N, MatrixWidth, MatrixSize, ", ");

    int* dev_M = NULL;
    int* dev_N = NULL;
    int* dev_P = NULL;

    checkCuda(cudaMalloc((void**)&dev_M, BufferSize), "cudaMalloc dev_M");
    checkCuda(cudaMalloc((void**)&dev_N, BufferSize), "cudaMalloc dev_N");
    checkCuda(cudaMalloc((void**)&dev_P, BufferSize), "cudaMalloc dev_P");

    checkCuda(cudaMemcpy(dev_M, M, BufferSize, cudaMemcpyHostToDevice), "copy M to device");
    checkCuda(cudaMemcpy(dev_N, N, BufferSize, cudaMemcpyHostToDevice), "copy N to device");

    // Derive the grid from the matrix size (ceil-div) instead of hard-coding
    // it, so the launch keeps covering the matrix if the size is changed.
    const dim3 Db(32, 32, 1);
    const dim3 Dg((MatrixWidth  + Db.x - 1) / Db.x,
                  (MatrixHeight + Db.y - 1) / Db.y,
                  1);

    QueryPerformanceFrequency(&liFrequency);  // retrieves the frequency of the high-resolution performance counter
    QueryPerformanceCounter(&liCounter1);     // Start

    MatrixMul<<<Dg, Db>>>(dev_M, dev_N, dev_P, MatrixWidth);
    // A launch returns no status itself: a bad configuration is reported by
    // cudaGetLastError(), a failure during execution by the next synchronizing
    // call. Without these checks a failed kernel goes unnoticed and P_cuda just
    // keeps its initial zeros.
    // NOTE(review): on a Windows GPU that also drives the display, a kernel
    // running longer than the driver watchdog (TDR) limit is terminated; that
    // is a plausible cause of the all-zero 4096x4096 result — confirm with the
    // error reported here.
    checkCuda(cudaGetLastError(), "MatrixMul launch");
    checkCuda(cudaDeviceSynchronize(), "MatrixMul execution");

    checkCuda(cudaMemcpy(P_cuda, dev_P, BufferSize, cudaMemcpyDeviceToHost), "copy P to host");
    QueryPerformanceCounter(&liCounter2);     // End
    printf("cuda Kernel 2D Time : %f \n", (double)(liCounter2.QuadPart - liCounter1.QuadPart) / (double)liFrequency.QuadPart);

    dumpMatrix("P_cuda.txt", P_cuda, MatrixWidth, MatrixSize, ", ");

    QueryPerformanceFrequency(&liFrequency);  // retrieves the frequency of the high-resolution performance counter
    QueryPerformanceCounter(&liCounter1);     // Start
    MatrixMulC(M, N, P_C, MatrixWidth);
    QueryPerformanceCounter(&liCounter2);     // End
    printf(" CPU Time : %f \n", (double)(liCounter2.QuadPart - liCounter1.QuadPart) / (double)liFrequency.QuadPart);

    // The CPU dump uses no separator between elements, as in the original output.
    dumpMatrix("P_C.txt", P_C, MatrixWidth, MatrixSize, "");

    // Element-wise comparison of the GPU result against the CPU reference.
    bool ResultFlag = true;
    for (i = 0; i < MatrixSize; i++)
    {
        if (P_cuda[i] != P_C[i])
        {
            ResultFlag = false;
            break;  // one mismatch already decides the outcome
        }
    }
    if (ResultFlag == true) printf("result ok\n");
    else printf("resutl false");

    checkCuda(cudaFree(dev_M), "cudaFree dev_M");
    checkCuda(cudaFree(dev_N), "cudaFree dev_N");
    checkCuda(cudaFree(dev_P), "cudaFree dev_P");

    free(M);
    free(N);
    free(P_cuda);
    free(P_C);

    return 0;
}