I need help with a matrix multiplication code. I am using MS Visual Studio 2008. I made a DLL from Nvidia's matrixMul example and I call it from a C# application. There is no problem with the DLL itself, because the CPU matrix multiplication runs fine. But when I call the GPU matrix multiplication, the application stops. I think the problem is the block and thread size. My matrix size is 400x10204. What should the block and thread sizes be?
Sorry for my bad English, and thanks in advance for any explanation. The code is below.
// Exported DLL entry point: multiplies h_A (uiHA x uiWA, row-major) by
// h_B (uiHB x uiWB) on the GPU and writes the product into h_C (uiHC x uiWC).
//
// Parameters:
//   h_C        - host buffer receiving the result (uiHC * uiWC floats)
//   h_A, h_B   - host input matrices
//   uiH*/uiW*  - height/width of each matrix in elements
// Returns 0 on success, non-zero when checkCUDAError reported a failure.
// Side effect: stores the kernel time (ms) in the module-level sExecutionTime.
//
// NOTE(review): the caller is responsible for uiWA == uiHB and
// uiHC == uiHA, uiWC == uiWB — this function does not validate them.
extern "C" int __declspec(dllexport) __stdcall MatrixMulCU
(
    float* h_C, float* h_A, float* h_B,
    unsigned int uiHA, unsigned int uiWA,
    unsigned int uiHB, unsigned int uiWB,
    unsigned int uiHC, unsigned int uiWC
)
{
    // Sizes of the input matrices in elements and bytes.
    unsigned int size_A = uiWA * uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    unsigned int size_B = uiWB * uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;

    // Allocate device memory for the inputs.
    float* d_A;
    cutilSafeCall(cudaMalloc((void**) &d_A, mem_size_A));
    float* d_B;
    cutilSafeCall(cudaMalloc((void**) &d_B, mem_size_B));

    // Copy host memory to device.
    cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A,
                             cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B,
                             cudaMemcpyHostToDevice));

    // Allocate device memory for the result.
    unsigned int size_C = uiWC * uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* d_C;
    cutilSafeCall(cudaMalloc((void**) &d_C, mem_size_C));

    // Setup execution parameters.
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    // BUG FIX: the original used floor division (uiWC / threads.x), so any
    // dimension not a multiple of BLOCK_SIZE left the tail of C uncomputed,
    // and a dimension smaller than BLOCK_SIZE produced a zero grid dim and
    // an invalid launch. Round up so the grid covers the whole output.
    dim3 grid((uiWC + threads.x - 1) / threads.x,
              (uiHC + threads.y - 1) / threads.y);
    // NOTE(review): the SDK matrixMul kernel assumes all dimensions are
    // multiples of BLOCK_SIZE; with e.g. 400x10204 the extra blocks will
    // index past the ends of A/B/C unless the kernel bounds-checks or the
    // matrices are padded up to a multiple of BLOCK_SIZE — verify the kernel.

    int cuerr = 0; // 0 == no errors so far
    unsigned int timer = 0;
    cutCreateTimer(&timer); // from cutil.h
    cutStartTimer(timer);

    matrixMul<<< grid, threads >>>(d_C, d_A, d_B, uiWA, uiWB);
    // Kernel launches are asynchronous; block here so the timer and the
    // error check below actually cover the kernel's execution.
    cudaThreadSynchronize();
    cutStopTimer(timer);
    cuerr = checkCUDAError("cuda kernel");

    // Copy result from device to host.
    cutilSafeCall(cudaMemcpy(h_C, d_C, mem_size_C,
                             cudaMemcpyDeviceToHost));
    if (!cuerr) cuerr = checkCUDAError("cuda memcpy");

    sExecutionTime = cutGetTimerValue(timer);

    // Clean up device memory.
    cutilSafeCall(cudaFree(d_A));
    cutilSafeCall(cudaFree(d_B));
    cutilSafeCall(cudaFree(d_C));
    // BUG FIX: the original checked "cuda free" BEFORE the cudaFree calls,
    // so free errors were never observed. Check after freeing instead.
    if (!cuerr) cuerr = checkCUDAError("cuda free");
    cutilCheckError(cutDeleteTimer(timer));

    // NOTE(review): cudaThreadExit() tears down the CUDA context on every
    // call — correct but wasteful for a DLL invoked repeatedly; consider
    // removing it and letting the context persist across calls.
    cudaThreadExit();
    return cuerr;
}