Application crashes during 2D matrix multiplication

I want to multiply two matrices. When the matrix size is 2048x2048 (or less), it works fine. But when the matrix size is larger (for example 4096x4096), the application crashes.
The block size is (32, 16).
The grid size is (matrix size / 32, matrix size / 16) = (4096/32, 4096/16) = (128, 256).
128 * 256 = 32768 blocks.
But I have a GTX 260. I queried my device with cudaGetDeviceProperties,
and it reported "Max grid size: x=65535, y=65535".
So why do I have a problem?
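
Here is a minimal sketch of the kind of cudaGetDeviceProperties query I mean (device 0 is assumed; the fields come from the cudaDeviceProp struct):

[codebox]#include <cstdio>
#include <cuda_runtime.h>

// Sketch: print the launch limits the runtime reports for device 0.
// API error checking is omitted for brevity.
void PrintLaunchLimits()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("Max grid size:  x=%d y=%d z=%d\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("Max block size: x=%d y=%d z=%d\n",
           prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
}[/codebox]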

P.S. Sorry for my English.

Could you post your code?

A GTX 260 can handle a matrix multiplication of size 4096 x 4096.

[codebox]__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Each thread computes one element of C
    // by accumulating results into Cvalue
    float Cvalue = 0;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row * A.width + e]
                * B.elements[e * B.width + col];
    C.elements[row * C.width + col] = Cvalue;
}[/codebox]
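
(Side note: the kernel assumes the matrix dimensions are exact multiples of the block dimensions, which holds here since 4096 is divisible by both 32 and 16. For other sizes a bounds check would be needed; a sketch, reusing the same Matrix struct:)

[codebox]// Sketch: same per-element kernel with a bounds check so that partial
// blocks at the matrix edge do not read or write out of range.
__global__ void MatMulKernelGuarded(Matrix A, Matrix B, Matrix C)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= C.height || col >= C.width)
        return;
    float Cvalue = 0;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row * A.width + e] * B.elements[e * B.width + col];
    C.elements[row * C.width + col] = Cvalue;
}[/codebox]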

[codebox]void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc((void**)&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);

    Matrix d_B;
    d_B.width = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc((void**)&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

    // Allocate C in device memory
    Matrix d_C;
    d_C.width = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc((void**)&d_C.elements, size);

    // Invoke kernel
    dim3 dimBlock(BLOCK_SIZE*2, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // Read C from device memory
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}[/codebox]
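
One thing that would help diagnose the crash: the launch above is never checked for errors. A minimal sketch of a check that could go right after the MatMulKernel call (runtime API only; cudaThreadSynchronize is used because the rest of the code already targets the CUDA 2.x API):

[codebox]// Sketch only: the same launch as in MatMul above, but with the errors
// reported by the runtime printed out instead of being ignored.
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

cudaError_t err = cudaGetLastError();   // launch-configuration errors
if (err != cudaSuccess)
    printf("Kernel launch failed: %s\n", cudaGetErrorString(err));

err = cudaThreadSynchronize();          // errors during kernel execution
if (err != cudaSuccess)
    printf("Kernel execution failed: %s\n", cudaGetErrorString(err));[/codebox]

Whatever is terminating the kernel should then show up as a readable error string instead of a silent crash.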

[codebox]void Start()
{
    int size;
    size = N * N * sizeof(float);

    Matrix A, B, C;
    A.width = N;
    A.height = N;
    A.elements = (float*)malloc(size);
    B.width = N;
    B.height = N;
    B.elements = (float*)malloc(size);
    C.width = N;
    C.height = N;
    C.elements = (float*)malloc(size);

    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));

    MatMul(A, B, C);
    //MatMulCPU(A, B, C);

    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutDeleteTimer( timer));
}[/codebox]

And:

BLOCK_SIZE=16

N=4096

I tested your code on my machine:

GTX 295 (it has the same amount of memory as a GTX 260), driver 190.38, CUDA 2.3.

It works fine. Processing time: 5565.205078 (ms)

And therein lies the answer: 5.565 seconds is longer than the display driver watchdog timer limit. My guess is that the GTX 260 has a display attached, and the application is crashing because the kernel takes longer than 5 seconds to finish.
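
A quick way to confirm that from code is to check whether the runtime reports the watchdog as active on that device; a minimal sketch, assuming device 0:

[codebox]#include <cstdio>
#include <cuda_runtime.h>

// Sketch: kernelExecTimeoutEnabled is non-zero when the display driver
// watchdog applies to kernels on this device (device 0 assumed).
void CheckWatchdog()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    if (prop.kernelExecTimeoutEnabled)
        printf("Watchdog is active: kernels must finish within a few seconds.\n");
    else
        printf("No run-time limit on kernels for this device.\n");
}[/codebox]

The usual workarounds are running on a GPU with no display attached, or splitting the work into several shorter kernel launches (for example, one launch per band of rows of C) so that each launch stays under the limit.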