Matrix Multiplication

I need help with my matrix multiplication code. I am using MS Visual Studio 2008. I made a DLL from NVIDIA's matrixMul example and I am calling it from a C# application. There is no problem with the DLL itself, because the CPU matrix multiplication runs fine. But when I call the GPU matrix multiplication, the application stops. I think the problem is the block and thread size. My matrix size is 400x10204. What should the block and thread sizes be?
Sorry for my bad English, and thanks in advance for any explanation. The code is below:
// Multiply h_A (uiHA x uiWA) by h_B (uiHB x uiWB) on the GPU, storing the
// result in h_C (uiHC x uiWC, row-major float). Exported for use from C#
// via P/Invoke (__stdcall, C linkage).
//
// Returns 0 on success, or the nonzero code produced by checkCUDAError().
// Side effect: records the kernel execution time in sExecutionTime (ms).
//
// NOTE(review): the SDK matrixMul kernel this calls assumes every matrix
// dimension is a multiple of BLOCK_SIZE. With a rounded-up grid (below),
// the kernel must bounds-check its tile loads/stores, or the host must pad
// the matrices up to a BLOCK_SIZE multiple -- otherwise the edge blocks
// read/write out of bounds. Confirm which variant of the kernel is used.
extern "C" int __declspec(dllexport) __stdcall MatrixMulCU
(
float* h_C, float* h_A, float* h_B,
unsigned int uiHA,unsigned int uiWA,
unsigned int uiHB,unsigned int uiWB,
unsigned int uiHC,unsigned int uiWC
)
{
// byte sizes of the input matrices
unsigned int size_A = uiWA * uiHA;
unsigned int mem_size_A = sizeof(float) * size_A;
unsigned int size_B = uiWB * uiHB;
unsigned int mem_size_B = sizeof(float) * size_B;

// allocate device memory for the inputs
float* d_A;
cutilSafeCall(cudaMalloc((void**) &d_A, mem_size_A));
float* d_B;
cutilSafeCall(cudaMalloc((void**) &d_B, mem_size_B));

// copy host memory to device
cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A,
                          cudaMemcpyHostToDevice) );
cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B,
                          cudaMemcpyHostToDevice) );

// allocate device memory for the result
unsigned int size_C = uiWC * uiHC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* d_C;
cutilSafeCall(cudaMalloc((void**) &d_C, mem_size_C));

// setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
// Ceil-divide so the grid covers the whole output matrix. The previous
// truncating division (uiWC / threads.x) left the right/bottom edges of C
// uncomputed whenever a dimension was not a multiple of BLOCK_SIZE
// (e.g. 10204 with BLOCK_SIZE 16), and produced a zero-sized grid -- and
// a failed launch -- for dimensions smaller than BLOCK_SIZE.
dim3 grid((uiWC + threads.x - 1) / threads.x,
          (uiHC + threads.y - 1) / threads.y);

int cuerr = 0; // no errors
unsigned int timer = 0;

cutCreateTimer(&timer);			    // from cutil.h
cutStartTimer(timer);

matrixMul<<< grid, threads >>>(d_C, d_A, d_B, uiWA, uiWB);
// kernel launches are asynchronous; synchronize so the timer measures the
// kernel and so execution errors surface before we check them
cudaThreadSynchronize();
cutStopTimer(timer);

// picks up both launch-configuration and in-kernel execution errors
cuerr = checkCUDAError("cuda kernel");

// copy result from device to host
cutilSafeCall(cudaMemcpy(h_C, d_C, mem_size_C,
                          cudaMemcpyDeviceToHost) );

if(!cuerr) cuerr = checkCUDAError("cuda memcpy");

sExecutionTime = cutGetTimerValue(timer);

if(!cuerr) cuerr = checkCUDAError("cuda free");

// clean up memory
cutilSafeCall(cudaFree(d_A));
cutilSafeCall(cudaFree(d_B));
cutilSafeCall(cudaFree(d_C));

cutilCheckError(cutDeleteTimer(timer));
// NOTE: the original called cudaThreadExit() here. That is deprecated and
// destroys the CUDA context, forcing every subsequent call into this DLL
// to pay full context re-initialization cost. All device allocations are
// already freed above, so it is removed.
return cuerr;
}