/*
 * NOTE(review): this file is damaged and cannot be compiled as it stands.
 *
 * 1. All of the source sits on one physical line. A preprocessor directive
 *    runs to the end of its line, so everything after the first #include is
 *    treated as part of that directive instead of as code. Every #include
 *    and the #define need a line of their own.
 *
 * 2. Text is missing between `for(int e=0; e` and `>>(dA,dB,dC);`.
 *    The for-statement has no complete condition, increment or body, the
 *    kernel is never closed, and dA, dB, dC, C and size are used without any
 *    visible declaration. Presumably everything from a '<' up to the next
 *    '>' was stripped as if it were markup -- TODO confirm against the
 *    original file and restore it from there rather than by guesswork.
 *
 * What is still visible:
 *   - Matrix: a struct holding int width, int height and a float* elements.
 *   - BLOCK_SIZE: defined as 16; its use is not visible on this line.
 *   - MatMulKernel(Matrix A, Matrix B, Matrix C): a __global__ kernel. Each
 *     thread starts a float accumulator Cvalue at 0 and derives
 *         row = blockIdx.y * blockDim.y + threadIdx.y
 *         col = blockIdx.x * blockDim.x + threadIdx.x
 *     before entering a loop over e starting at 0. What the loop computes
 *     and where the result is stored is lost.
 *   - Tail of a host routine (name and signature lost): ends a statement
 *     with the argument list (dA, dB, dC), copies `size` bytes from
 *     dC.elements to C.elements with cudaMemcpyDeviceToHost, then calls
 *     cudaFree on dA.elements, dB.elements and dC.elements.
 *
 * NOTE(review): in the visible tail the return values of cudaMemcpy and
 * cudaFree are discarded, so a failure there would go unnoticed. Once the
 * file is restored, also check whether the kernel guards row/col against
 * the matrix bounds -- that part is not visible here.
 */
#include "mex.h" #include "cuda.h" typedef struct{ int width; int height; float* elements; }Matrix; #define BLOCK_SIZE 16 __global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) { float Cvalue=0; int row=blockIdx.y*blockDim.y+threadIdx.y; int col=blockIdx.x*blockDim.x+threadIdx.x; for(int e=0; e>>(dA,dB,dC); cudaMemcpy(C.elements,dC.elements,size,cudaMemcpyDeviceToHost); cudaFree(dA.elements); cudaFree(dB.elements); cudaFree(dC.elements); }