First of all, please note: I am not asking for bug fixes.

I am new to CUDA programming (not a very good coder as it is), and I only wrote this code because I’m in desperate need of a fast code to convolve many small matrices with a few convolution masks.

All I ask for is suggestions on what changes I can make to my code to make it faster — it’s a matter of approach. I assume my code is full of problems, which I can solve on my own. I am here to ask for help from those who have experience and know how to solve a problem in a manner suitable for the GPU.

If you have a different, faster approach to suggest, please do.

```
//matrix to process
// Flattened id of this thread within its y-slice of the grid, used as the
// index of the input matrix this thread is responsible for convolving.
// Fixes: must be __device__ (it reads blockIdx/threadIdx, which exist only
// in device code), and the per-block dimensions live in blockDim --
// "threadDim" is not a CUDA built-in and does not compile.
__device__ __forceinline__ int mat2proc()
{
    return blockIdx.y * blockDim.x * blockDim.y
         + threadIdx.y * blockDim.x + threadIdx.x;
}
// Index in the full-convolution result that receives the product of
// line/row k of the big matrix and line/row i of the small matrix (size s).
// For a full convolution, out[n] accumulates every product whose indices
// satisfy k + i == n, so the target index is simply k + i
// (range 0 .. s1+s2-2, exactly ResSize positions).
// Fixes: the original untyped parameter list (k,s,i) is not valid C/C++;
// the original formula i - s + k + 1 was shifted by -(s-1) and produced
// negative (out-of-bounds) indices whenever k < s-1.
// __host__ __device__ so it can also be exercised from host-side tests.
__host__ __device__ inline int place(int k, int s, int i)
{
    (void)s; // unused now; parameter retained so existing call sites compile
    return k + i;
}
__global__ void batchConv (float *res, const float *mats, const float *masks, int s1,
int s2, int matCount, int maskCount)
// Batched full 2-D convolution: every input matrix is convolved with every mask.
// res   = output, maskCount*matCount matrices of ResSize x ResSize each, where
//         ResSize = s1+s2-1; slab layout is
//         res[(maskId*matCount + matId)*ResSize*ResSize + row*ResSize + col].
//         (The original indexed res by matrix only, so every mask-block
//         overwrote the same slab.)
// mats  = matCount input matrices of s1 x s1, contiguous
// masks = maskCount convolution masks of s2 x s2, contiguous
// Launch preconditions: gridDim.x == maskCount (one mask per x-block);
// blockDim.x >= s2 and blockDim.y >= s2 so the mask load below covers the
// whole mask; dynamic shared memory = s2*s2*sizeof(float).
// Fixes vs. original: cudaMalloc/cudaMemset are host-only runtime calls and
// cannot be used inside a kernel (and the per-thread buffer leaked) -- we
// accumulate straight into the output slab instead; loop variables are now
// declared; three loops incremented the wrong variable (i++ instead of
// k++/l++/j++, i.e. infinite loops); the copy-out ran unguarded for threads
// with id >= matCount (out-of-bounds writes).
{
    extern __shared__ float mask [];                // the one mask this block applies
    const int resSize = s1 + s2 - 1;                // full-convolution output size
    const int maskId  = blockIdx.x;                 // one x-block per mask

    // Stage this block's mask in shared memory: all threads of the block use
    // the same mask, so one cooperative load amortizes the global reads.
    if (threadIdx.x < s2 && threadIdx.y < s2)
        mask[threadIdx.y * s2 + threadIdx.x] =
            masks[maskId * s2 * s2 + threadIdx.y * s2 + threadIdx.x];
    __syncthreads();                                // barrier before any thread reads mask[]

    // One thread per input matrix: flatten (blockIdx.y, threadIdx.y, threadIdx.x).
    const int matId = blockIdx.y * blockDim.x * blockDim.y
                    + threadIdx.y * blockDim.x + threadIdx.x;
    if (matId >= matCount)                          // safe: every thread passed the barrier
        return;

    const float *in  = mats + (size_t)matId * s1 * s1;
    float       *out = res  + ((size_t)maskId * matCount + matId)
                            * resSize * resSize;

    // Zero this thread's private output slab, then accumulate into it directly.
    for (int i = 0; i < resSize * resSize; ++i)
        out[i] = 0.0f;

    // Full 2-D convolution: out[k+i][l+j] += in[k][l] * mask[i][j].
    for (int k = 0; k < s1; ++k)                    // big-matrix rows
        for (int l = 0; l < s1; ++l)                // big-matrix columns
        {
            const float v = in[k * s1 + l];
            for (int i = 0; i < s2; ++i)            // mask rows
                for (int j = 0; j < s2; ++j)        // mask columns
                    out[(k + i) * resSize + (l + j)] += v * mask[i * s2 + j];
        }
}
```