I am attempting to do FFT convolution using cuFFT and cuBLAS.
I have everything working up to the element-wise multiply-and-sum step. Rather than implementing that step as a custom kernel, I believe it would be faster to use cublasCgemmStridedBatched.
I wish to multiply matrices A·B = C. I am aware that cublasCgemmStridedBatched works in column-major order, so my row-major matrices are read by cuBLAS as their transposes A^T and B^T. To get around this without transposing the result, I anticipate the transpose and pass the operands to cublasCgemmStridedBatched in the order B, A. cuBLAS then computes
B^T A^T = (A B)^T = C^T
and C^T in column-major memory has exactly the same layout as C in row-major memory, so no final transpose is needed.
This changes the values that must be given to cublasCgemmStridedBatched.
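For context, here is how I understand that swap for a single (non-batched) product; a minimal sketch only, where handle, d_A, d_B, d_C and the sizes M, N, K are placeholder names for this illustration, not variables from my repro further down:
// Row-major C (M x N) = A (M x K) * B (K x N) with the column-major cublasCgemm:
// pass B first and A second, so cuBLAS computes B^T * A^T = (A*B)^T = C^T,
// which is exactly C when read back with row-major indexing.
cuComplex alpha = make_cuComplex(1.0f, 0.0f);
cuComplex beta  = make_cuComplex(0.0f, 0.0f);
cublasCgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
            N, M, K,        // dimensions as cuBLAS sees them (column-major)
            &alpha,
            d_B, N,         // leading dimension of B = its row length N
            d_A, K,         // leading dimension of A = its row length K
            &beta,
            d_C, N);        // leading dimension of C = its row length N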
My A matrix is the batched FFT of some images, and the B matrix is a batch of FFT'ed kernels. I have successfully computed this in Python using
A (batched FFT'ed images) of shape [imgH x imgW, batchSize, inC]
B (batched FFT'ed kernels) of shape [imgH x imgW, inC, outC]
and NumPy's np.matmul(A, B).
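Per frequency bin this is just a small [batchSize x inC] by [inC x outC] product. A plain CPU sketch of what that np.matmul computes is below; matmulPerBinRef and its arguments are placeholder names for this illustration only, and cuCaddf/cuCmulf/make_cuComplex come from cuComplex.h:
// Reference: C[p] = A[p] * B[p] for every frequency bin p, all arrays row-major.
// A: [bins][batchSize][inC], B: [bins][inC][outC], C: [bins][batchSize][outC]
void matmulPerBinRef(const cuComplex *A, const cuComplex *B, cuComplex *C,
                     int bins, int batchSize, int inC, int outC)
{
    for (int p = 0; p < bins; ++p)
        for (int b = 0; b < batchSize; ++b)
            for (int o = 0; o < outC; ++o)
            {
                cuComplex acc = make_cuComplex(0.0f, 0.0f);
                for (int k = 0; k < inC; ++k)
                    acc = cuCaddf(acc, cuCmulf(A[p*batchSize*inC + b*inC + k],
                                               B[p*inC*outC + k*outC + o]));
                C[p*batchSize*outC + b*outC + o] = acc;
            }
}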
When I do this using cublasCgemmStridedBatched I get incorrect results. My settings can be seen below.
My thoughts on what the inputs need to be:
Since I am passing in B, A, these get read by cuBLAS as B^T, A^T. This leaves me with an m x k matrix times a k x n matrix. My batch dimension, I believe, is going to be imgH x imgW. m is then outC, k is inC, and n is batchSize. The leading dimensions are always the columns, so ldA will be outC, ldB will be batchSize, and ldC will be outC.
This leaves the strides. I believe these follow 3D indexing, where the stride amount should bring you to the next channel / dimension. Thus I set the strides to the matrix sizes: outC*inC for A, inC*batchSize for B, and batchSize*outC for C.
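For reference, the indexing model I am assuming for cublasCgemmStridedBatched with CUBLAS_OP_N on both operands is sketched below as a plain CPU loop; this is only my reading of the documented column-major behaviour (with alpha = 1 and beta = 0), and gemmStridedBatchedRef is a placeholder name, not library code:
// For each batch p: C_p (m x n) = A_p (m x k) * B_p (k x n), column-major,
// where element (i,j) of batch p of A is A[i + j*lda + p*strideA], etc.
void gemmStridedBatchedRef(const cuComplex *A, int lda, long long strideA,
                           const cuComplex *B, int ldb, long long strideB,
                           cuComplex *C, int ldc, long long strideC,
                           int m, int n, int k, int batchCount)
{
    for (int p = 0; p < batchCount; ++p)
        for (int j = 0; j < n; ++j)
            for (int i = 0; i < m; ++i)
            {
                cuComplex acc = make_cuComplex(0.0f, 0.0f);
                for (int kk = 0; kk < k; ++kk)
                    acc = cuCaddf(acc, cuCmulf(A[i + kk*lda + p*strideA],
                                               B[kk + j*ldb + p*strideB]));
                C[i + j*ldc + p*strideC] = acc;
            }
}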
// Runs, but gives incorrect results
cublasStat = cublasCgemmStridedBatched(handle,CUBLAS_OP_N,CUBLAS_OP_N,
outC, batchSize, inC,
&alpha,
kerPtr, outC, inC*outC,
imgsPtr, inC, inC*batchSize,
&beta,
d_C, outC, batchSize*outC,
padH*padW);
What are the proper arguments to feed into cublasCgemmStridedBatched to get the correct results?
Here's a reproducible example:
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Include CUDA runtime and CUFFT
#include <cuda_runtime.h>
#include <cufft.h>
#include <cuComplex.h>
#include <math.h>
#include "cufft.h"
#include "cublas_v2.h"
__global__
void Pad2DArrayNoLoop(cuComplex* __restrict__ img, cuComplex* __restrict__ padded, int imgH, int imgW, int padH, int padW)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
// A thread must exit if EITHER index is out of range
if( i >= imgH || j >= imgW)
{
return;
}
padded[i*padW + j].x = img[i*imgW + j].x;
padded[i*padW + j].y = img[i*imgW + j].y;
}
__global__
void Pad3DArrayNoLoop(cuComplex* __restrict__ img, cuComplex* __restrict__ padded, int batchSize, int imgH, int imgW, int padH, int padW)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k = blockIdx.z * blockDim.z + threadIdx.z;
// A thread must exit if ANY index is out of range
if( i >= batchSize || j >= imgH || k >= imgW)
{
return;
}
padded[i*padH*padW + j*padW + k].x = img[i*imgW*imgH + j*imgW + k].x;
padded[i*padH*padW + j*padW + k].y = img[i*imgW*imgH + j*imgW + k].y;
}
void initArrZero(cuComplex *arr, int arrH, int arrW, int batchSize=1)
{
for(int i = 0; i < arrH * arrW * batchSize; i++)
{
arr[i].x = 0;
arr[i].y = 0;
}
}
void initArr_I(cuComplex *arr, int arrH, int arrW)
{
for(int i = 0; i < arrH * arrW; i++)
{
arr[i].x = i;
arr[i].y = 0;
}
}
void initArr_Ibatch(cuComplex *arr, int arrH, int arrW, int batchSize, int addVal = 0)
{
for(int i = 0; i < batchSize * arrH * arrW; i++)
{
arr[i].x = i + addVal;
arr[i].y = 0;
}
}
void printArr(cuComplex * arr, int arrH, int arrW)
{
for(int i = 0; i < arrH; i++)
{
for(int j = 0; j < arrW; j++)
{
printf("(%f, %f) ", arr[i*arrW + j].x, arr[i*arrW + j].y);
}
printf("\n");
}
}
void printArr3D(cuComplex * arr, int arrH, int arrW, int batchSize)
{
for(int i = 0; i < batchSize; i++)
{
for(int j = 0; j < arrH; j++)
{
for(int k = 0; k < arrW; k++)
{
printf("(%f,%f) ", arr[i*arrH*arrW + j*arrW + k].x, arr[i*arrH*arrW + j*arrW + k].y);
}
printf("\n");
}
printf("\n");
}
}
void FFT_Conv_Batch(int batchSize, int imgH, int imgW, int kerH, int kerW, int inC, int outC)
{
cuComplex *h_imgs;
cuComplex *d_imgs;
cuComplex *h_ker;
cuComplex *d_ker;
cuComplex *h_padded;
cuComplex *d_padded;
cuComplex *h_kerPadded;
cuComplex *d_kerPadded;
cuComplex *h_C;
cuComplex *d_C;
cufftHandle fftPlanFwdImg, fftPlanFwdKernel, fftPlanInv;
cufftComplex *d_ImgsSpectral, *d_KernelSpectral;
int padH = imgH + kerH - 1;
int padW = imgW + kerW - 1;
static const int nRank = 2;
// For no Padding
int n[nRank] = {padH,padW};
cufftResult_t stat;
printf("Creating cuFFT plans\n");
stat = cufftPlanMany(&fftPlanFwdImg,nRank,n,NULL,1,0,NULL,1,0,CUFFT_C2C,batchSize*inC);
if( stat != CUFFT_SUCCESS )
{
printf("Cufft plan didn't init\n");
}
stat = cufftPlanMany(&fftPlanFwdKernel,nRank,n,NULL,1,0,NULL,1,0,CUFFT_C2C,inC*outC);
if( stat != CUFFT_SUCCESS )
{
printf("Cufft plan didn't init\n");
}
stat = cufftPlanMany(&fftPlanInv, nRank, n, NULL,1,0,NULL,1,0,CUFFT_C2C,batchSize*outC);
if( stat != CUFFT_SUCCESS )
{
printf("Cufft plan didn't init\n");
}
printf("Allocating Memory on CPU\n");
h_imgs = (cuComplex *) malloc(batchSize * inC * imgH * imgW * sizeof(cuComplex));
h_padded = (cuComplex *) malloc(batchSize * inC * padH * padW * sizeof(cuComplex));
h_ker = (cuComplex *) malloc(inC * outC * kerH * kerW * sizeof(cuComplex));
h_kerPadded = (cuComplex *) malloc(inC * outC * padH * padW * sizeof(cuComplex));
h_C = (cuComplex *) malloc(batchSize * outC * padH * padW * sizeof(cuComplex));
printf("Allocating Memory on GPU\n");
cudaMalloc((void **) &d_imgs, batchSize * inC * imgH * imgW * sizeof(cuComplex));
cudaMalloc((void **) &d_padded, batchSize * inC * padH * padW * sizeof(cuComplex));
cudaMalloc((void **) &d_ker, inC * outC * kerH * kerW * sizeof(cuComplex));
cudaMalloc((void **) &d_kerPadded, inC * outC * padH * padW * sizeof(cuComplex));
cudaMalloc((void **) &d_ImgsSpectral, batchSize * inC * padH * padW * sizeof(cuComplex));
cudaMalloc((void **) &d_KernelSpectral, inC * outC * padH * padW * sizeof(cuComplex));
cudaMalloc((void **) &d_C, batchSize * outC * padH * padW * sizeof(cuComplex));
// Offset so the kernel values differ from the image values
int kerAddVal = 2;
initArr_Ibatch(h_imgs,imgH,imgW,batchSize*inC);
initArr_Ibatch(h_ker,kerH,kerW,inC*outC,kerAddVal);
initArrZero(h_padded,padH,padW,batchSize*inC);
initArrZero(h_kerPadded,padH,padW,inC*outC);
initArrZero(h_C,padH,padW,batchSize*outC);
// To test with complex input
// h_imgs[3].y=2.0f;
cudaMemcpyAsync(d_imgs,h_imgs, batchSize * inC * imgH * imgW * sizeof(cuComplex),cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_ker,h_ker,inC * outC * kerH * kerW * sizeof(cuComplex), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_padded,h_padded,batchSize * inC * padH * padW * sizeof(cuComplex),cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_kerPadded, h_kerPadded, inC * outC * padH * padW * sizeof(cuComplex), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_C,h_C,batchSize * outC * padH * padW * sizeof(cuComplex), cudaMemcpyHostToDevice);
// printf("Img\n");
// printArr3D(h_imgs,imgH,imgW,batchSize*inC);
// printf("Kers\n");
// printArr3D(h_ker,kerH,kerW,inC*outC);
// printf("Padded Empty Arr\n");
// printArr3D(h_padded,padH,padW,batchSize*inC);
dim3 blockSizes(batchSize*inC,4,4);
dim3 numBlocks( (batchSize*inC + blockSizes.x - 1)/ blockSizes.x, (imgH + blockSizes.y - 1)/ blockSizes.y, (imgW + blockSizes.z - 1)/ blockSizes.z);
// The third argument is the number of 2D slices to pad: batchSize*inC image slices
Pad3DArrayNoLoop<<<numBlocks,blockSizes>>>(d_imgs, d_padded, batchSize*inC, imgH, imgW, padH, padW);
blockSizes.x = inC*outC, blockSizes.y = 3; blockSizes.z = 3;
numBlocks.x = (inC*outC + blockSizes.x - 1) / blockSizes.x;
numBlocks.y = (kerH + blockSizes.y - 1) / blockSizes.y;
numBlocks.z = (kerW + blockSizes.z - 1) / blockSizes.z;
// inC*outC kernel slices to pad
Pad3DArrayNoLoop<<<numBlocks,blockSizes>>>(d_ker, d_kerPadded, inC*outC, kerH, kerW, padH, padW);
cudaMemcpyAsync(h_padded,d_padded,batchSize*inC*padH*padW*sizeof(cuComplex),cudaMemcpyDeviceToHost);
cudaMemcpyAsync(h_kerPadded,d_kerPadded,inC*outC*padH*padW*sizeof(cuComplex),cudaMemcpyDeviceToHost);
// printf("Copied to padding\n");
// printf("Images\n");
// printArr3D(h_padded,padH,padW,batchSize*inC);
// printf("Kernels\n");
// printArr3D(h_kerPadded,padH,padW,inC*outC);
printf("Executing FFT\n");
stat = cufftExecC2C(fftPlanFwdImg, (cufftComplex *) d_padded, (cufftComplex *) d_ImgsSpectral,CUFFT_FORWARD);
if(stat != CUFFT_SUCCESS)
{
printf("FFT execution error.\n");
}
stat = cufftExecC2C(fftPlanFwdKernel, (cufftComplex *) d_kerPadded, (cufftComplex *) d_KernelSpectral,CUFFT_FORWARD);
if(stat != CUFFT_SUCCESS)
{
printf("Kernel FFT Execution error.\n");
}
cudaMemcpyAsync(h_padded,d_ImgsSpectral,batchSize*inC*padH*padW*sizeof(cuComplex),cudaMemcpyDeviceToHost);
cudaMemcpyAsync(h_kerPadded,d_KernelSpectral,inC*outC*padH*padW*sizeof(cuComplex),cudaMemcpyDeviceToHost);
printf("FFT Image Results\n");
printArr3D(h_padded,padH,padW,batchSize*inC);
//
printf("FFT Kernel Results\n");
printArr3D(h_kerPadded,padH,padW,inC*outC);
printf("Element wise Multiplication\n");
cublasStatus_t cublasStat;
cublasHandle_t handle;
cuComplex alpha = {1.0f, 0.0f};
cuComplex beta = {0.0f, 0.0f};
// Doesn't matter, the matrices are square
// int dim = padH;
// /*
// * Reference to what's going on in the cublasCgemmStridedBatched
// int M = colB, N = rowA, K = colA, ldA = colB, ldB = colA, ldC = colC;
int h_w = padH * padW;
// int strideA = outC*inC;
// int strideB = batchSize*inC;
// int strideC = outC*batchSize;
// int ldA = outC;
// int ldB = inC;
// int ldC = batchSize;
//
// cuComplex c_mnp;
// for (int p = 0; p < h_w; ++p)
// {
// for (int m = 0; m < outC; ++m)
// {
// for (int n = 0; n < batchSize; ++n)
// {
// c_mnp.x = 0;
// c_mnp.y = 0;
// for (int k = 0; k < inC; ++k)
// {
// c_mnp.x += h_kerPadded[m + k*ldA + p*strideA].x * h_padded[k + n*ldB + p*strideB].x; //- h_imgs[m + k*ldA + p*strideA].y * h_dftMat[k + n*ldB + p*strideB].y;
// c_mnp.y += h_kerPadded[m + k*ldA + p*strideA].y * h_padded[k + n*ldB + p*strideB].x; //+ h_imgs[m + k*ldA + p*strideA].x * h_dftMat[k + n*ldB + p*strideB].y;
// h_C[m + n*ldC + p*strideC].x = c_mnp.x;
// h_C[m + n*ldC + p*strideC].y = c_mnp.y;
// printf("Kernel and Img Vals (%f, %f) \n", h_kerPadded[m + k*ldA + p*strideA].x, h_padded[k + n*ldB + p*strideB].x);
//// printf("(%f,%f) \n", c_mnp.x,c_mnp.y);
// }
// }
// }
// }
// Stride has to be the size of the 2D matrix, same stuff as 3D indexing
// int strideA = batchSize*inC, strideB = inC*outC, strideC = padH*padW;
cublasStat = cublasCreate(&handle);
if(cublasStat != CUBLAS_STATUS_SUCCESS)
{
printf("Cublas Error\n");
}
// Do here to get updated memory locations
cuComplex const *imgsPtr = d_ImgsSpectral;
cuComplex const *kerPtr = d_KernelSpectral;
//
// cublasStat = cublasCgemmStridedBatched(handle,CUBLAS_OP_N,CUBLAS_OP_N,
// padH, padW, padH,
// & alpha,
// kerPtr, padW, padH*padW,
// imgsPtr, padW, padH*padW,
// & beta,
// d_C, padW, padH*padW,
// h_w*outC);
// Runs, but gives incorrect results
cublasStat = cublasCgemmStridedBatched(handle,CUBLAS_OP_N,CUBLAS_OP_N,
outC, batchSize, inC,
&alpha,
kerPtr, outC, inC*outC,
imgsPtr, inC, inC*batchSize,
&beta,
d_C, outC, batchSize*outC,
padH*padW);
if(cublasStat != CUBLAS_STATUS_SUCCESS)
{
printf("Cublas Error\n");
}
cudaMemcpy(h_C,d_C,batchSize * outC * padH * padW * sizeof(cuComplex),cudaMemcpyDeviceToHost);
printf("Cublas Batch Res \n");
printArr3D(h_C, padH, padW, batchSize*outC);
printf("Freeing Memory\n");
cudaFree(d_imgs);
cudaFree(d_ker);
cudaFree(d_padded);
cudaFree(d_kerPadded);
free(h_imgs);
free(h_ker);
free(h_padded);
free(h_kerPadded);
cudaDeviceSynchronize();
}
int main()
{
int batchSize = 1;
int inC = 3;
int outC = 1;
int imgH = 4;
int imgW = 4;
int kerH = 3;
int kerW = 3;
FFT_Conv_Batch(batchSize, imgH, imgW, kerH, kerW, inC, outC);
}