Hi guys,
I’m having a bit of trouble with cufft batched transformations. The problem is
that I get slightly different results when the size of the batch changes.
The code below shows my problem. It transforms the same 4x4 array using:
a) A plan generated by cufftPlan2d for transforming once the 4x4 array.
b) A plan generated by cufftPlanMany for doing a batch transformation of
n copies of the 4x4 array.
I expect that b) generates the same result for each of the n copies of the
4x4 array; and that each of them is the same as the result from a).
But, this is not the case. If n<64, the results are as expected, but if n>=64
I got slightly different results. In the code, I set up a comparison of the results
for n=63 and n=64. I tested this code using a Quadro K620M with CUDA 9.0 and using a
Tesla K80 also with CUDA 9.0 and the results are the same.
So, can anyone help me to find my error?
The configurations of the plans and the calls to cufftExecR2C are done in the functions
doSingleFT and in doBatchFT
I have detected the same issue for arrays of random dimensions, and with random data
in them. I’m using a 4x4 array just to facilitate the debugging.
#include <iostream>
#include <iomanip>
#include <memory>
#include <algorithm>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cufft.h>
static const float DATA[ 4 * 4 ] = { 138.f, 101.f, 101.f, 138.f,
199.f, 58.f, 58.f, 199.f,
199.f, 58.f, 58.f, 199.f,
138.f, 101.f, 101.f, 138.f };
using std::cout;
using std::unique_ptr;
bool singleTransform( const float* const f_dataIn, cufftComplex* const f_dataOut,
const unsigned f_width, const unsigned f_height,
const unsigned f_complexWidth, const unsigned f_complexHeight );
bool doSingleFT( cufftReal* const f_in, cufftComplex* const f_dataOut,
const unsigned f_width, const unsigned f_height );
bool batchTransform( const float* const f_dataIn, cufftComplex* const f_dataOut,
const unsigned f_width, const unsigned f_height,
const unsigned f_complexWidth, const unsigned f_complexHeight,
const size_t f_batch );
bool doBatchFT( const cufftReal* const f_in, cufftComplex* const f_out,
const unsigned f_width, const unsigned f_height,
const unsigned f_complexWidth, const unsigned f_complexHeight,
const size_t f_batch );
void printComplexArray( const cufftComplex* const f_data, const unsigned f_width,
const unsigned f_height );
size_t compare( const cufftComplex* const f_singleFT,
const cufftComplex* const f_multiFT,
const unsigned f_complexWidth, const unsigned f_complexHeight,
const size_t f_batch );
bool allEqual( const cufftComplex* const f_multiFT,
const unsigned f_complexWidth, const unsigned f_complexHeight,
const size_t f_batch );
int main( void )
{
cudaDeviceReset();
const unsigned width{ 4 };
const unsigned height{ 4 };
// const size_t singleImageElements{ static_cast<size_t>( width * height ) };
const unsigned complexWidth{ ( width / 2 ) + 1 };
const unsigned complexHeight{ height };
const size_t complexImgElements{ static_cast<size_t>( complexWidth * complexHeight ) };
// Transform using a)
unique_ptr<cufftComplex> singleFT{ new cufftComplex[ complexImgElements ] };
if( !singleTransform( DATA, singleFT.get(), width, height,
complexWidth, complexHeight ) )
{
cout << "Single transform failed\n";
return false;
}
cout << "Reference data:\n";
printComplexArray( singleFT.get(), complexWidth, complexHeight );
// Transform using b) with n=63
size_t batch{ 63 };
unique_ptr<cufftComplex> batchTransform63{ new cufftComplex[
batch * complexImgElements ] };
if( !batchTransform( DATA, batchTransform63.get(), width, height,
complexWidth, complexHeight, batch ) )
{
cout << "Batch = " << batch << " transform failed\n";
return false;
}
// Check 1. All the transformed images should be equal.
if( !allEqual( batchTransform63.get(), complexWidth, complexHeight, batch ) )
{
cout << "Not all the transformed images are the same for batch = " << batch;
return false;
}
// Check 2. All the transformed images must be equal to the results
// of the a)
size_t wrongImgs = compare( singleFT.get(), batchTransform63.get(),
complexWidth, complexHeight, batch );
if( wrongImgs > 0 )
{
printComplexArray( batchTransform63.get(), complexWidth, complexHeight );
cout << wrongImgs << " imgs with wrong results for batch = " << batch << '\n';
return false;
}
// Transform using b) with n=64
batch = 64;
unique_ptr<cufftComplex> batchTransform64{ new cufftComplex[
batch * complexImgElements ] };
if( !batchTransform( DATA, batchTransform64.get(), width, height,
complexWidth, complexHeight, batch ) )
{
cout << "Batch = " << batch << " transform failed\n";
return false;
}
// Check 1. All the transformed images should be equal.
if( !allEqual( batchTransform64.get(), complexWidth, complexHeight, batch ) )
{
cout << "Not all the transformed images are the same for batch = "
<< batch << '\n';
//return false;
}
// Check 2. All the transformed images must be equal to the results
// of a)
wrongImgs = 0;
wrongImgs = compare( singleFT.get(), batchTransform64.get(),
complexWidth, complexHeight, batch );
if( wrongImgs > 0 )
{
cout << '\n' << wrongImgs << " imgs with wrong results for batch = " << batch << '\n';
cout << "Sample output:\n";
printComplexArray( batchTransform64.get(), complexWidth, complexHeight );
return false;
}
return 0;
}
bool doSingleFT( cufftReal* const f_in, cufftComplex* const f_out,
const unsigned f_width, const unsigned f_height )
{
cufftHandle plan;
if( cufftPlan2d( &plan, f_height, f_width, CUFFT_R2C ) != CUFFT_SUCCESS )
{
cout << "Plan creation failed\n";
return false;
}
if( cufftExecR2C( plan, f_in, f_out ) != CUFFT_SUCCESS )
{
cout << "FT failed\n";
return false;
}
if( cufftDestroy( plan ) != CUFFT_SUCCESS )
{
cout << "Plan destruction failed\n";
return false;
}
if( cudaDeviceSynchronize() != cudaSuccess )
{
cout << "Single transform sync error\n";
return false;
}
return true;
}
bool doBatchFT( cufftReal* const f_in, cufftComplex* const f_out,
const unsigned f_width, const unsigned f_height,
const unsigned f_complexWidth, const unsigned f_complexHeight,
const size_t f_batch )
{
cufftHandle plan;
const int rank{ 2 };
int n[ rank ] = { static_cast<int>( f_height ),
static_cast<int>( f_width ) };
int idist = static_cast<int>( f_width * f_height );
int odist = static_cast<int>( f_complexHeight * f_complexWidth );
cufftResult_t err = cufftPlanMany( &plan, rank, n,
NULL, 1, idist,
NULL, 1, odist,
CUFFT_R2C, f_batch );
if( err != CUFFT_SUCCESS )
{
cout << "cufftPlanMany failed\n";
return false;
}
if( cufftExecR2C( plan, f_in, f_out ) != CUFFT_SUCCESS )
{
cout << "cufftExecR2C failed\n";
return false;
}
if( cufftDestroy( plan ) != CUFFT_SUCCESS )
{
cout << "Destroy of the plan failed\n";
return false;
}
if( cudaDeviceSynchronize() != cudaSuccess )
{
cout << "Batch transform sync error\n";
return false;
}
return true;
}
bool singleTransform( const float* const f_dataIn, cufftComplex* const f_dataOut,
const unsigned f_width, const unsigned f_height,
const unsigned f_complexWidth, const unsigned f_complexHeight )
{
// Real image size.
const size_t realImgElements{ static_cast<size_t>( f_width * f_height ) };
// Number of elements in the input image.
const size_t realImgSize{ realImgElements * sizeof( cufftReal ) };
// Complex image size.
const size_t complexImgElements{ static_cast<size_t>( f_complexWidth *
f_complexHeight ) };
// Number of complex elements in the frequency domain.
const size_t complexImgSize{ complexImgElements * sizeof( cufftComplex ) };
cufftReal* gpu_in{ nullptr };
cudaError_t err = cudaMalloc( reinterpret_cast<void**>( &gpu_in ), realImgSize );
if( err != cudaSuccess )
{
cout << "Alloc real img failed " << err << std::endl;
return false;
}
err = cudaMemcpy( gpu_in, f_dataIn, realImgSize, cudaMemcpyHostToDevice );
if( err != cudaSuccess )
{
cout << "Host to device copy failed\n";
return false;
}
//output buffer.
cufftComplex* gpu_out{ nullptr };
err = cudaMalloc( reinterpret_cast<void**>( &gpu_out ), complexImgSize );
if( err != cudaSuccess )
{
cout << "Alloc complex img failed\n";
return false;
}
if( !doSingleFT( gpu_in, gpu_out, f_width, f_height ) )
{
cout << "Single FT failed\n";
return false;
}
// Copy the data back from the device
err = cudaMemcpy( f_dataOut, gpu_out, complexImgSize, cudaMemcpyDeviceToHost );
if( err != cudaSuccess )
{
cout << "Device to host copy failed\n";
return false;
}
if( cudaFree( gpu_in ) != cudaSuccess ||
cudaFree( gpu_out ) != cudaSuccess )
{
cout << "Deallocation failed\n";
return false;
}
return true;
}
bool batchTransform( const float* const f_dataIn, cufftComplex* const f_dataOut,
const unsigned f_width, const unsigned f_height,
const unsigned f_complexWidth, const unsigned f_complexHeight,
const size_t f_batch )
{
// Number of elements in each input image.
const size_t realImgElements{ static_cast<size_t>( f_width ) *
static_cast<size_t>( f_height ) };
// Real single image size.
//const size_t realImgSize{ realImgElements * sizeof( float ) };
// Input batch-array elements.
const size_t dataInElements{ realImgElements * f_batch };
// Input batch-array data size
const size_t dataInSize{ dataInElements * sizeof( float ) };
// Complex single image size.
const size_t complexImgElements{ f_complexWidth * f_complexHeight };
// Number of complex elements in the frequency domain.
//const size_t complexImgSize{ complexImgElements * sizeof( cufftComplex ) };
// Output batch-array elements.
const size_t dataOutElements{ complexImgElements * f_batch };
// Output batch-array size.
const size_t dataOutSize{ dataOutElements * sizeof( cufftComplex ) };
// Create in the host the batch-array.
unique_ptr<float> batchArray{ new float[ dataInElements ] };
for( size_t i = 0; i < f_batch; ++i )
{
const size_t index{ i * realImgElements };
std::copy_n( f_dataIn, realImgElements, batchArray.get() + index );
}
// Input device buffer.
cufftReal* gpu_in{ nullptr };
cudaError_t err = cudaMalloc( reinterpret_cast<void**>( &gpu_in ), dataInSize );
if( err != cudaSuccess )
{
cout << "Alloc real img failed\n";
return false;
}
err = cudaMemcpy( gpu_in, batchArray.get(), dataInSize, cudaMemcpyHostToDevice );
if( err != cudaSuccess )
{
cout << "Host to device copy failed\n";
return false;
}
// Output device buffer.
cufftComplex* gpu_out{ nullptr };
if( cudaMalloc( reinterpret_cast<void**>( &gpu_out ), dataOutSize ) != cudaSuccess )
{
cout << "Alloc complex img failed\n";
return false;
}
if( !doBatchFT( gpu_in, gpu_out, f_width, f_height,
f_complexWidth, f_complexHeight, f_batch ) )
{
cout << "Batch transform failed\n";
}
err = cudaMemcpy( f_dataOut, gpu_out, dataOutSize, cudaMemcpyDeviceToHost );
if( err != cudaSuccess )
{
cout << "Device to host copy failed\n";
return false;
}
if( cudaFree( gpu_in ) != cudaSuccess ||
cudaFree( gpu_out ) != cudaSuccess )
{
cout << "Deallocation failed\n";
return false;
}
return true;
}
void printComplexArray( const cufftComplex* const f_data, const unsigned f_width,
const unsigned f_height )
{
for( unsigned y = 0; y < f_height; ++y )
{
for( unsigned x = 0; x < f_width; ++x )
{
const unsigned index{ y * f_width + x };
cout << std::setprecision( 10 ) << "( "
<< std::setw( 10 + 6 ) << f_data[ index ].x
<< ", "
<< std::setw( 10 + 6 ) <<f_data[ index ].y
<< " ) ";
}
cout << '\n';
}
}
size_t compare( const cufftComplex* const f_singleFT,
const cufftComplex* const f_multiFT,
const unsigned f_complexWidth, const unsigned f_complexHeight,
const size_t f_batch )
{
size_t wrongImages{ 0 };
const size_t imgElements{ static_cast<size_t>( f_complexWidth * f_complexHeight ) };
for( size_t i = 0; i < f_batch; ++i )
{
const cufftComplex* const currentMulti = f_multiFT + ( i * imgElements );
for( size_t j = 0; j < imgElements; ++j )
{
if( currentMulti[ j ].x != f_singleFT[ j ].x ||
currentMulti[ j ].y != f_singleFT[ j ].y )
{
++wrongImages;
break;
}
}
}
return wrongImages;
}
bool allEqual( const cufftComplex* const f_multiFT,
const unsigned f_complexWidth, const unsigned f_complexHeight,
const size_t f_batch )
{
const size_t imgElements{ static_cast<size_t>( f_complexWidth * f_complexHeight ) };
// Compare the first image, with all the others. All must be equal.
for( size_t i = 1; i < f_batch; ++i )
{
const cufftComplex* const current = f_multiFT + ( i * imgElements );
for( size_t j = 0; j < imgElements; ++j )
{
if( current[ j ].x != f_multiFT[ j ].x ||
current[ j ].y != f_multiFT[ j ].y )
{
return false;
}
}
}
return true;
}
Cheers,
Sandino