Hi I am currently trying to create a multi-precision multiplication for two array of integers using CUFFT, however I am getting an error when trying to convert into a real type value. Also I am not sure whether that will provide the right output, please let me know if I am missing anything
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cufft.h>
// Complex data type
typedef float2 Complex;
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMul(Complex*, const Complex*, int);
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char** argv);
// The filter size is assumed to be a number smaller than the signal size
#define SIGNAL_SIZE 10
#define FILTER_KERNEL_SIZE 11
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv) {
runTest(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char** argv) {
printf("[simpleCUFFT] is starting...\n");
// Allocate initial memory for the array
Complex* h_signal = (Complex*) malloc(sizeof(Complex) * SIGNAL_SIZE);
// Initalize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
h_signal[i].x = 1; // An array of 50 1,0s
h_signal[i].y = 0;
}
// Pad signal and filter kernel
int mem_size = sizeof(cufftComplex) * SIGNAL_SIZE;
// Allocate device memory for signal
cufftComplex* d_signal;
cudaMalloc((void**) &d_signal, mem_size);
// Copy host memory to device
cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice);
// CUFFT plan
cufftHandle plan;
if ( cufftPlan1d(&plan, mem_size, CUFFT_R2C, 1) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: Plan creation failed");
return;
}
// Transform signal and kernel
printf("Transforming signal cufftExecR2C\n");
if (cufftExecR2C(plan, (cufftReal*) d_signal, d_signal) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
return;
}
// Multiply the coefficients together and normalize the result
printf("Launching ComplexPointwiseMulAndScale<<< >>>\n");
ComplexPointwiseMul<<<32, 256>>>(d_signal, d_signal, mem_size);
// Transform signal back
printf("Transforming signal back cufftExecC2C\n");
if ( cufftExecC2R(plan, d_signal, (cufftReal*) d_signal) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Inverse failed");
return;
}
printf("Transferring host memory to device\n");
// Copy device memory to host
float* f_signal = (float*) malloc(sizeof(float) * SIGNAL_SIZE);
cudaMemcpy(f_signal, d_signal, mem_size, cudaMemcpyDeviceToHost);
for (int j = 0; j < SIGNAL_SIZE; j++) {
printf("h_s:\t");
printf("%i\t", j);
printf("%f\n", f_signal[j]);
}
//Destroy CUFFT context
cufftDestroy(plan);
// cleanup memory
free(f_signal);
free(h_signal);
cudaFree(d_signal);
}
////////////////////////////////////////////////////////////////////////////////
// Complex operations
////////////////////////////////////////////////////////////////////////////////
// Complex multiplication
static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b)
{
Complex c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
// Complex pointwise multiplication
static __global__ void ComplexPointwiseMul(Complex* a, const Complex* b, int size)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
a[i] = ComplexMul(a[i], b[i]);
}
-The plan is expecting the size of the transform in elements, not in bytes.
-You need to decide if you want to do a real to complex or a complex to complex transform.
If you decide to do a real to complex in place, you need to add padding to the real array
-You need to normalize the result, IFFT(FFT(A))=len(A)*A
-Your last memcpy is wrong, there is not enough space on the destination array to hold all the data you are transferring
This is simplecufft example adapted for R2C is this what you meant? Also some of the output is invalid and is missing data. The result should be 1234554321 not 345543.
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Example showing the use of CUFFT for fast 1D-convolution using FFT. */
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cufft.h>
// Complex data type
typedef float2 Complex;
static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);
static __device__ __host__ inline Complex ComplexScale(Complex, float);
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale(Complex*, const Complex*, int, float);
// Filtering functions
void Convolve(const Complex*, int, const Complex*, int, Complex*);
// Padding functions
int PadData(const Complex*, Complex**, int, const Complex*, Complex**, int);
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char** argv);
// The filter size is assumed to be a number smaller than the signal size
#define SIGNAL_SIZE 6
#define FILTER_KERNEL_SIZE 5
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv) {
runTest(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char** argv) {
printf("[simpleCUFFT] is starting...\n");
// Allocate host memory for the signal
Complex* h_signal = (Complex*) malloc(sizeof(Complex) * SIGNAL_SIZE);
// Initalize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; i++) {
h_signal[i].x = 1;
h_signal[i].y = 0;
}
// Allocate host memory for the filter
Complex* h_signal_2 = (Complex*) malloc(sizeof(Complex)
* FILTER_KERNEL_SIZE);
// Initalize the memory for the filter
for (unsigned int i = 0; i < FILTER_KERNEL_SIZE; i++) {
h_signal_2[i].x = 1;
h_signal_2[i].y = 0;
}
printf("\n");
for (int i = 0; i < SIGNAL_SIZE; i++) {
printf("%f\t", h_signal[i].x);
printf("%f\n", h_signal[i].y);
}
printf("\n");
for (int i = 0; i < FILTER_KERNEL_SIZE; i++) {
printf("%f\t", h_signal_2[i].x);
printf("%f\n", h_signal_2[i].y);
}
// Pad signal and filter kernel
Complex* h_padded_signal;
Complex* h_padded_signal_2;
int new_size = PadData(h_signal, &h_padded_signal, SIGNAL_SIZE, h_signal_2,
&h_padded_signal_2, FILTER_KERNEL_SIZE);
int mem_size = sizeof(Complex) * new_size;
printf("\n");
for (int i = 0; i < new_size; i++) {
printf("%f\t", h_padded_signal[i].x);
printf("%f\n", h_padded_signal[i].y);
}
printf("\n");
for (int i = 0; i < new_size; i++) {
printf("%f\t", h_padded_signal_2[i].x);
printf("%f\n", h_padded_signal_2[i].y);
}
// Allocate device memory for signal
Complex* d_signal;
cudaMalloc((void**) &d_signal, mem_size);
// Copy host memory to device
cudaMemcpy(d_signal, h_padded_signal, mem_size, cudaMemcpyHostToDevice);
// Allocate device memory for filter kernel
Complex* d_signal_2;
cudaMalloc((void**) &d_signal_2, mem_size);
// Copy host memory to device
cudaMemcpy(d_signal_2, h_padded_signal_2, mem_size, cudaMemcpyHostToDevice);
// CUFFT plan
cufftHandle plan;
cufftPlan1d(&plan, new_size, CUFFT_R2C, 1);
// Transform signal and kernel
printf("Transforming signal cufftExecC2C\n");
cufftExecR2C(plan, (cufftReal*) d_signal, d_signal);
cufftExecR2C(plan, (cufftReal*) d_signal_2, d_signal_2);
// Multiply the coefficients together and normalize the result
printf("Launching ComplexPointwiseMulAndScale<<< >>>\n");
ComplexPointwiseMulAndScale<<<32, 256>>>(d_signal, d_signal_2, new_size, 1.0f / new_size);
// Transform signal back
//printf("Transforming signal back cufftExecC2C\n");
//cufftExecC2C(plan, (cufftComplex *) d_signal, (cufftComplex *) d_signal,
// CUFFT_INVERSE);
// Copy device memory to host
Complex* h_convolved_signal = h_padded_signal;
cudaMemcpy(h_convolved_signal, d_signal, mem_size, cudaMemcpyDeviceToHost);
printf("\n");
for (int i = 0; i < SIGNAL_SIZE; i++) {
printf("%f\t", h_convolved_signal[i].x);
printf("%f\n", h_convolved_signal[i].y);
}
// Allocate host memory for the convolution result
Complex* h_convolved_signal_ref = (Complex*) malloc(sizeof(Complex)
* SIGNAL_SIZE);
// Convolve on the host
Convolve(h_signal, SIGNAL_SIZE, h_signal_2, FILTER_KERNEL_SIZE,
h_convolved_signal_ref);
printf("\n");
for (int i = 0; i < SIGNAL_SIZE; i++) {
printf("%f\t", h_convolved_signal_ref[i].x);
printf("%f\n", h_convolved_signal_ref[i].y);
}
//Destroy CUFFT context
cufftDestroy(plan);
// cleanup memory
free(h_signal);
free(h_signal_2);
free(h_padded_signal);
free(h_padded_signal_2);
free(h_convolved_signal_ref);
cudaFree(d_signal);
cudaFree(d_signal_2);
}
// Pad data
int PadData(const Complex* signal, Complex** padded_signal, int signal_size,
const Complex* signal_2, Complex** padded_signal_2,
int signal_2_size) {
int minRadius = signal_2_size / 2;
int maxRadius = signal_2_size - minRadius;
int new_size = signal_size + maxRadius * 2;
// Pad signal
Complex* new_data = (Complex*) malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, signal, signal_size * sizeof(Complex));
memset(new_data + signal_size, 0, (new_size - signal_size)
* sizeof(Complex));
*padded_signal = new_data;
// Pad signal
new_data = (Complex*) malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, signal_2, signal_2_size * sizeof(Complex));
memset(new_data + signal_2_size, 0, (new_size - signal_2_size)
* sizeof(Complex));
*padded_signal_2 = new_data;
/* new_data = (Complex*) malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, signal_2 + minRadius, maxRadius * sizeof(Complex));
memset(new_data + maxRadius, 0, (new_size - signal_2_size)
* sizeof(Complex));
memcpy(new_data + new_size - minRadius, signal_2, minRadius
* sizeof(Complex));
*padded_signal_2 = new_data;*/
return new_size;
}
////////////////////////////////////////////////////////////////////////////////
// Filtering operations
////////////////////////////////////////////////////////////////////////////////
// Computes convolution on the host
void Convolve(const Complex* signal, int signal_size,
const Complex* filter_kernel, int filter_kernel_size,
Complex* filtered_signal) {
int minRadius = filter_kernel_size / 2;
int maxRadius = filter_kernel_size - minRadius;
// Loop over output element indices
for (int i = 0; i < signal_size; ++i) {
filtered_signal[i].x = filtered_signal[i].y = 0;
// Loop over convolution indices
for (int j = -maxRadius + 1; j <= minRadius; ++j) {
int k = i + j;
if (k >= 0 && k < signal_size)
filtered_signal[i] = ComplexAdd(filtered_signal[i], ComplexMul(
signal[k], filter_kernel[minRadius - j]));
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Complex operations
////////////////////////////////////////////////////////////////////////////////
// Complex addition
static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b)
{
Complex c;
c.x = a.x + b.x;
c.y = a.y + b.y;
return c;
}
// Complex scale
static __device__ __host__ inline Complex ComplexScale(Complex a, float s)
{
Complex c;
c.x = s * a.x;
c.y = s * a.y;
return c;
}
// Complex multiplication
static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b)
{
Complex c;
c.x = a.x * b.x - a.y * b.y;
c.y = a.x * b.y + a.y * b.x;
return c;
}
// Complex pointwise multiplication
static __global__ void ComplexPointwiseMulAndScale(Complex* a, const Complex* b, int size, float scale)
{
const int numThreads = blockDim.x * gridDim.x;
const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = threadID; i < size; i += numThreads)
a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale);
}
You are still using the wrong kind of transform (R2C) when your data is complex.
Unless you are consistent between data type and parameters passed to CUFFT, you will not get the right results.