Mulitprecision Multiplication CUFFT

Hi I am currently trying to create a multi-precision multiplication for two array of integers using CUFFT, however I am getting an error when trying to convert into a real type value. Also I am not sure whether that will provide the right output, please let me know if I am missing anything

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cufft.h>

// Complex data type

typedef float2 Complex;

static __device__ __host__ inline Complex ComplexMul(Complex, Complex);

static __global__ void ComplexPointwiseMul(Complex*, const Complex*, int);

////////////////////////////////////////////////////////////////////////////////

// declaration, forward

void runTest(int argc, char** argv);

// The filter size is assumed to be a number smaller than the signal size

#define SIGNAL_SIZE        10

#define FILTER_KERNEL_SIZE 11

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int main(int argc, char** argv) {

	runTest(argc, argv);

}

////////////////////////////////////////////////////////////////////////////////

//! Run a simple test for CUDA

////////////////////////////////////////////////////////////////////////////////

void runTest(int argc, char** argv) {

	printf("[simpleCUFFT] is starting...\n");

	// Allocate initial memory for the array

	Complex* h_signal = (Complex*) malloc(sizeof(Complex) * SIGNAL_SIZE);

	// Initalize the memory for the signal

	for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {

		h_signal[i].x = 1; // An array of 50 1,0s

		h_signal[i].y = 0;

	}

	

	// Pad signal and filter kernel

	int mem_size = sizeof(cufftComplex) * SIGNAL_SIZE;

	// Allocate device memory for signal

	cufftComplex* d_signal;

	cudaMalloc((void**) &d_signal, mem_size);

	// Copy host memory to device

	cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice);

	// CUFFT plan

	cufftHandle plan;

	

	if ( cufftPlan1d(&plan, mem_size, CUFFT_R2C, 1)  != CUFFT_SUCCESS){

		fprintf(stderr, "CUFFT error: Plan creation failed");

		return;

	}

	

	// Transform signal and kernel

	printf("Transforming signal cufftExecR2C\n");

	if (cufftExecR2C(plan, (cufftReal*) d_signal, d_signal) != CUFFT_SUCCESS){

		fprintf(stderr, "CUFFT error: ExecC2C Forward failed");

		return;

	}

	

	// Multiply the coefficients together and normalize the result

	printf("Launching ComplexPointwiseMulAndScale<<< >>>\n");

	ComplexPointwiseMul<<<32, 256>>>(d_signal, d_signal, mem_size);

	// Transform signal back

	printf("Transforming signal back cufftExecC2C\n");

	if ( cufftExecC2R(plan, d_signal, (cufftReal*) d_signal) != CUFFT_SUCCESS){

		fprintf(stderr, "CUFFT error: ExecC2C Inverse failed");

		return;

	}

	printf("Transferring host memory to device\n");

	

	// Copy device memory to host

	float* f_signal = (float*) malloc(sizeof(float) * SIGNAL_SIZE);

	cudaMemcpy(f_signal, d_signal, mem_size, cudaMemcpyDeviceToHost);

	for (int j = 0; j < SIGNAL_SIZE; j++) {

		printf("h_s:\t");

		printf("%i\t", j);

		printf("%f\n", f_signal[j]);

	}

	//Destroy CUFFT context

	cufftDestroy(plan);

	// cleanup memory

	free(f_signal);

	free(h_signal);

	cudaFree(d_signal);

}

////////////////////////////////////////////////////////////////////////////////

// Complex operations

////////////////////////////////////////////////////////////////////////////////

// Complex multiplication

static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b)

{

	Complex c;

	c.x = a.x * b.x - a.y * b.y;

	c.y = a.x * b.y + a.y * b.x;

	return c;

}

// Complex pointwise multiplication

static __global__ void ComplexPointwiseMul(Complex* a, const Complex* b, int size)

{

	const int numThreads = blockDim.x * gridDim.x;

	const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

	for (int i = threadID; i < size; i += numThreads)

	a[i] = ComplexMul(a[i], b[i]);

}

There are several problems in your code:

-The plan is expecting the size of the transform in elements, not in bytes.

-You need to decide if you want to do a real to complex or a complex to complex transform.
If you decide to do a real to complex in place, you need to add padding to the real array

-You need to normalize the result, IFFT(FFT(A))=len(A)*A

-Your last memcpy is wrong, there is not enough space on the destination array to hold all the data you are transferring

Deleted
simpleCUFFT.cu (8.65 KB)

This is simplecufft example adapted for R2C is this what you meant? Also some of the output is invalid and is missing data. The result should be 1234554321 not 345543.

/*

 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.

 *

 * Please refer to the NVIDIA end user license agreement (EULA) associated

 * with this source code for terms and conditions that govern your use of

 * this software. Any use, reproduction, disclosure, or distribution of

 * this software and related documentation outside the terms of the EULA

 * is strictly prohibited.

 *

 */

/* Example showing the use of CUFFT for fast 1D-convolution using FFT. */

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cufft.h>

// Complex data type

typedef float2 Complex;

static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);

static __device__ __host__ inline Complex ComplexScale(Complex, float);

static __device__ __host__ inline Complex ComplexMul(Complex, Complex);

static __global__ void ComplexPointwiseMulAndScale(Complex*, const Complex*, int, float);

// Filtering functions

void Convolve(const Complex*, int, const Complex*, int, Complex*);

// Padding functions

int PadData(const Complex*, Complex**, int, const Complex*, Complex**, int);

////////////////////////////////////////////////////////////////////////////////

// declaration, forward

void runTest(int argc, char** argv);

// The filter size is assumed to be a number smaller than the signal size

#define SIGNAL_SIZE        6

#define FILTER_KERNEL_SIZE 5

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int main(int argc, char** argv) {

	runTest(argc, argv);

}

////////////////////////////////////////////////////////////////////////////////

//! Run a simple test for CUDA

////////////////////////////////////////////////////////////////////////////////

void runTest(int argc, char** argv) {

	printf("[simpleCUFFT] is starting...\n");

	// Allocate host memory for the signal

	Complex* h_signal = (Complex*) malloc(sizeof(Complex) * SIGNAL_SIZE);

	// Initalize the memory for the signal

	for (unsigned int i = 0; i < SIGNAL_SIZE; i++) {

		h_signal[i].x = 1;

		h_signal[i].y = 0;

	}

	// Allocate host memory for the filter

	Complex* h_signal_2 = (Complex*) malloc(sizeof(Complex)

			* FILTER_KERNEL_SIZE);

	// Initalize the memory for the filter

	for (unsigned int i = 0; i < FILTER_KERNEL_SIZE; i++) {

		h_signal_2[i].x = 1;

		h_signal_2[i].y = 0;

	}

	printf("\n");

	for (int i = 0; i < SIGNAL_SIZE; i++) {

		printf("%f\t", h_signal[i].x);

		printf("%f\n", h_signal[i].y);

	}

	printf("\n");

	for (int i = 0; i < FILTER_KERNEL_SIZE; i++) {

		printf("%f\t", h_signal_2[i].x);

		printf("%f\n", h_signal_2[i].y);

	}

	// Pad signal and filter kernel

	Complex* h_padded_signal;

	Complex* h_padded_signal_2;

	int new_size = PadData(h_signal, &h_padded_signal, SIGNAL_SIZE, h_signal_2,

			&h_padded_signal_2, FILTER_KERNEL_SIZE);

	int mem_size = sizeof(Complex) * new_size;

	printf("\n");

	for (int i = 0; i < new_size; i++) {

		printf("%f\t", h_padded_signal[i].x);

		printf("%f\n", h_padded_signal[i].y);

	}

	printf("\n");

	for (int i = 0; i < new_size; i++) {

		printf("%f\t", h_padded_signal_2[i].x);

		printf("%f\n", h_padded_signal_2[i].y);

	}

	// Allocate device memory for signal

	Complex* d_signal;

	cudaMalloc((void**) &d_signal, mem_size);

	// Copy host memory to device

	cudaMemcpy(d_signal, h_padded_signal, mem_size, cudaMemcpyHostToDevice);

	// Allocate device memory for filter kernel

	Complex* d_signal_2;

	cudaMalloc((void**) &d_signal_2, mem_size);

	// Copy host memory to device

	cudaMemcpy(d_signal_2, h_padded_signal_2, mem_size, cudaMemcpyHostToDevice);

	// CUFFT plan

	cufftHandle plan;

	cufftPlan1d(&plan, new_size, CUFFT_R2C, 1);

	// Transform signal and kernel

	printf("Transforming signal cufftExecC2C\n");

	cufftExecR2C(plan, (cufftReal*) d_signal, d_signal);

	cufftExecR2C(plan, (cufftReal*) d_signal_2, d_signal_2);

	// Multiply the coefficients together and normalize the result

	printf("Launching ComplexPointwiseMulAndScale<<< >>>\n");

	ComplexPointwiseMulAndScale<<<32, 256>>>(d_signal, d_signal_2, new_size, 1.0f / new_size);

	// Transform signal back

	//printf("Transforming signal back cufftExecC2C\n");

	//cufftExecC2C(plan, (cufftComplex *) d_signal, (cufftComplex *) d_signal,

	//		CUFFT_INVERSE);

	// Copy device memory to host

	Complex* h_convolved_signal = h_padded_signal;

	cudaMemcpy(h_convolved_signal, d_signal, mem_size, cudaMemcpyDeviceToHost);

	printf("\n");

	for (int i = 0; i < SIGNAL_SIZE; i++) {

		printf("%f\t", h_convolved_signal[i].x);

		printf("%f\n", h_convolved_signal[i].y);

	}

	

	// Allocate host memory for the convolution result

	Complex* h_convolved_signal_ref = (Complex*) malloc(sizeof(Complex)

			* SIGNAL_SIZE);

	// Convolve on the host

	Convolve(h_signal, SIGNAL_SIZE, h_signal_2, FILTER_KERNEL_SIZE,

			h_convolved_signal_ref);

	printf("\n");

	for (int i = 0; i < SIGNAL_SIZE; i++) {

		printf("%f\t", h_convolved_signal_ref[i].x);

		printf("%f\n", h_convolved_signal_ref[i].y);

	}

	//Destroy CUFFT context

	cufftDestroy(plan);

	// cleanup memory

	free(h_signal);

	free(h_signal_2);

	free(h_padded_signal);

	free(h_padded_signal_2);

	free(h_convolved_signal_ref);

	cudaFree(d_signal);

	cudaFree(d_signal_2);

}

// Pad data

int PadData(const Complex* signal, Complex** padded_signal, int signal_size,

		const Complex* signal_2, Complex** padded_signal_2,

		int signal_2_size) {

	int minRadius = signal_2_size / 2;

	int maxRadius = signal_2_size - minRadius;

	int new_size = signal_size + maxRadius * 2;

	// Pad signal

	Complex* new_data = (Complex*) malloc(sizeof(Complex) * new_size);

	memcpy(new_data + 0, signal, signal_size * sizeof(Complex));

	memset(new_data + signal_size, 0, (new_size - signal_size)

			* sizeof(Complex));

	*padded_signal = new_data;

	// Pad signal

	new_data = (Complex*) malloc(sizeof(Complex) * new_size);

	memcpy(new_data + 0, signal_2, signal_2_size * sizeof(Complex));

	memset(new_data + signal_2_size, 0, (new_size - signal_2_size)

			* sizeof(Complex));

	*padded_signal_2 = new_data;

/*	new_data = (Complex*) malloc(sizeof(Complex) * new_size);

	memcpy(new_data + 0, signal_2 + minRadius, maxRadius * sizeof(Complex));

	memset(new_data + maxRadius, 0, (new_size - signal_2_size)

			* sizeof(Complex));

	memcpy(new_data + new_size - minRadius, signal_2, minRadius

			* sizeof(Complex));

	*padded_signal_2 = new_data;*/

	return new_size;

}

////////////////////////////////////////////////////////////////////////////////

// Filtering operations

////////////////////////////////////////////////////////////////////////////////

// Computes convolution on the host

void Convolve(const Complex* signal, int signal_size,

		const Complex* filter_kernel, int filter_kernel_size,

		Complex* filtered_signal) {

	int minRadius = filter_kernel_size / 2;

	int maxRadius = filter_kernel_size - minRadius;

	// Loop over output element indices

	for (int i = 0; i < signal_size; ++i) {

		filtered_signal[i].x = filtered_signal[i].y = 0;

		// Loop over convolution indices

		for (int j = -maxRadius + 1; j <= minRadius; ++j) {

			int k = i + j;

			if (k >= 0 && k < signal_size)

				filtered_signal[i] = ComplexAdd(filtered_signal[i], ComplexMul(

						signal[k], filter_kernel[minRadius - j]));

		}

	}

}

////////////////////////////////////////////////////////////////////////////////

// Complex operations

////////////////////////////////////////////////////////////////////////////////

// Complex addition

static __device__ __host__ inline Complex ComplexAdd(Complex a, Complex b)

{

	Complex c;

	c.x = a.x + b.x;

	c.y = a.y + b.y;

	return c;

}

// Complex scale

static __device__ __host__ inline Complex ComplexScale(Complex a, float s)

{

	Complex c;

	c.x = s * a.x;

	c.y = s * a.y;

	return c;

}

// Complex multiplication

static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b)

{

	Complex c;

	c.x = a.x * b.x - a.y * b.y;

	c.y = a.x * b.y + a.y * b.x;

	return c;

}

// Complex pointwise multiplication

static __global__ void ComplexPointwiseMulAndScale(Complex* a, const Complex* b, int size, float scale)

{

	const int numThreads = blockDim.x * gridDim.x;

	const int threadID = blockIdx.x * blockDim.x + threadIdx.x;

	for (int i = threadID; i < size; i += numThreads)

	a[i] = ComplexScale(ComplexMul(a[i], b[i]), scale);

}

You are still using the wrong kind of transform (R2C) when your data is complex.
Unless you are consistent between data type and parameters passed to CUFFT, you will not get the right results.

You should get a basic FFT/IFFT example to work.

I have fix my solution, many thanks