hello,
I have a problem with low-precision of calculations - I try to do a one-dimensional FFT transform on matrix(using DFT matrix). Unfortunately, the FFT and IFFT results compared with the input data vary considerably (above 0.1), and the strange is that the first line has the correct results. it does not matter whether the input data(matrix) has a size of 8x8 or more (256x256).
in each case, the results differ significantly, much more than could be due to the float precision.
I tried to build a kernel with option ‘-cl-opt-disable’, unfortunately it does not improve accuracy. i tried also separating two ‘for’ loops in one kernel into separate two kernels to avoid synchronization problems, but without result (only result from first kernel is valid). changing local size is also ineffective.
any ideas what I am doing wrong?
below the sample input and output data (the first and third row of the output is correct, but differs in the first column) and the host and kernel code.
please notice that row 1 and 3 in output are invalid and the same, while row 0 and 3 are correct.
input:
0.840188 0.394383 0.783099 0.798440
0.911647 0.197551 0.335223 0.768230
0.277775 0.553970 0.477397 0.628871
0.364784 0.513401 0.952230 0.916195
output:
0.840188 0.394383 0.783099 0.798440
0.638216 0.355476 0.643726 0.842212
0.277775 0.553970 0.477397 0.628871
0.638216 0.355476 0.643726 0.842212
fft1d.cl
[codebox]__kernel void mul(
__global float* InA,
__global float* OutRe,
__global float* OutIm,
int N,
__global float* Wre,
__global float* Wim,
__global float* Mre,
__global float* Mim,
__global float* tempre,
__global float* tempim
)
{
int gid0 = get_global_id(0);
int gid1 = get_global_id(1);
int i,j,k;
int size = N;
float wynikre = 0;
float wynikim = 0;
if(gid0<N && gid1<N) {
// fft1d
for(j=0;j<N;j++) {
wynikre += Wre[gid1*N+j]InA[jN+gid0]; //RE
wynikim += Wim[gid1*N+j]InA[jN+gid0]; //IM
}
tempre[gid1*N+gid0] = wynikre;
tempim[gid1*N+gid0] = wynikim;
barrier(CLK_GLOBAL_MEM_FENCE);
wynikre = 0;
wynikim = 0;
// ifft1d
for(j=0;j<N;j++) {
wynikre += Mre[gid1N+j]tempre[jN+gid0]-Mim[gid1N+j]tempim[jN+gid0] ;
wynikim += Mre[gid1N+j]tempim[jN+gid0]+Mim[gid1N+j]tempre[jN+gid0
] ;
}
OutRe[gid1*N+gid0] = wynikre;
OutIm[gid1*N+gid0] = wynikim;
barrier(CLK_GLOBAL_MEM_FENCE);
}
fft.cpp
}[/codebox]
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <unistd.h>
#include <time.h>
#include <math.h>
#include <complex.h>
#include “CL/cl.h”
using namespace std;
const char* cSourceFile = “fft1d.cl”; // dct kernel file
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQue; // OpenCL command que
cl_device_id* cdDevices; // OpenCL device list
cl_program cpProgram; // OpenCL program
cl_kernel ckKernel; // OpenCL kernel
size_t GlobalWorkSize[2];
size_t LocalWorkSize[2];
size_t ParmDataBytes; // Byte size of context information
cl_int ciErr1, ciErr2; // Error code var
char* cPathAndName = NULL; // Var for paths
char* cSourceCL = NULL; // Buffer to hold source for compilation
char* load_source(const char* filename)
{
FILE* stream = NULL;
size_t source_lenght;
stream = fopen(filename, “rb”);
if(stream == 0)
{
return NULL;
}
fseek(stream, 0, SEEK_END);
source_lenght = ftell(stream);
fseek(stream, 0, SEEK_SET);
size_t result =0;
char* source_string = (char *)malloc(source_lenght + 1);
result = fread(source_string, source_lenght, 1, stream);
fclose(stream);
source_string[source_lenght] = ‘\0’;
return source_string;
}
void showM(float* daneOut, int N) {
int i=0;
for(int i =0;i<N*N;i++) {
if (i%N == 0) { printf(“\n”); }
printf("%f ", *(daneOut+i));
}
printf(“\n”);
}
int main(int argc, char **argv)
{
const char* optio = “-cl-opt-disable”;
int N = 4;
int siz = sizeof(float)NN;
GlobalWorkSize[0] = N;
GlobalWorkSize[1] = N;
LocalWorkSize[0] = 2;
LocalWorkSize[1] = 2;
float PI = 3.14159265;
complex float a = cos(2PI/N)-sin(2PI/N)*I;
complex float b;
int k, w;
////////////////////////////////////////////////////////////////////////
cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clCreateContextFromType !!!\n\n”);}
ciErr1 = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
cdDevices = (cl_device_id*)malloc(ParmDataBytes);
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, cdDevices, NULL);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clGetContextInfo !!!\n\n”);}
cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clCreateCommandQueue !!!\n\n”);}
////////////////////////////////////////////////////////////////////////
float* daneInA;
daneInA = (float*)malloc(siz);
void * pointer_daneInA= (float *)daneInA;
for(int j=0;j<N*N;j++) {
*(daneInA+j)= (float) rand()/RAND_MAX;
}
float* daneOutRe;
daneOutRe = (float*)malloc(siz);
void * pointer_daneOutRe= (float *)daneOutRe;
for(int j=0;j<N*N;j++) {
*(daneOutRe+j)= 0;
}
float* daneOutIm;
daneOutIm = (float*)malloc(siz);
void * pointer_daneOutIm= (float *)daneOutIm;
for(int j=0;j<N*N;j++) {
*(daneOutIm+j)=0;
}
// Wre
float* data_fft_Wre;
data_fft_Wre = (float*)malloc(siz);
void * pointer_fft_Wre= (float *)data_fft_Wre;
for(k=0;k<N;k++) {
for(w=0;w<N;w++) {
b = k*w;
(data_fft_Wre+(k+wN))=crealf( cpow( a,b ) );
}
}
// Wim
float* data_fft_Wim;
data_fft_Wim = (float*)malloc(siz);
void * pointer_fft_Wim= (float *)data_fft_Wim;
for(k=0;k<N;k++) {
for(w=0;w<N;w++) {
b = k*w;
(data_fft_Wim+(k+wN))=cimagf( cpow( a,b ) );
}
}
// Mre
float* data_fft_Mre;
data_fft_Mre = (float*)malloc(siz);
void * pointer_fft_Mre= (float *)data_fft_Mre;
for(k=0;k<N;k++) {
for(w=0;w<N;w++) {
b = -k*w;
(data_fft_Mre+(k+wN))=crealf( cpow( a,b ) )/N;
}
}
// Mim
float* data_fft_Mim;
data_fft_Mim = (float*)malloc(siz);
void * pointer_fft_Mim= (float *)data_fft_Mim;
for(k=0;k<N;k++) {
for(w=0;w<N;w++) {
b = -k*w;
(data_fft_Mim+(k+wN))=cimagf( cpow( a,b ) )/N;
}
}
float* temp1;
temp1 = (float*)malloc(siz);
void * pointer_temp1= (float *)temp1;
for(int j=0;j<N*N;j++) {
*(temp1+j)=0;
}
float* temp2;
temp2 = (float*)malloc(siz);
void * pointer_temp2= (float *)temp2;
for(int j=0;j<N*N;j++) {
*(temp2+j)=0;
}
cl_mem Buffer_daneInA = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneIn !!!\n\n”);}
cl_mem Buffer_daneOutRe = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut!!!\n\n”);}
cl_mem Buffer_daneOutIm = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}
cl_mem Buffer_fft_Wre = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}
cl_mem Buffer_fft_Wim = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}
cl_mem Buffer_fft_Mre = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}
cl_mem Buffer_fft_Mim = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}
cl_mem Buffer_temp1 = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}
cl_mem Buffer_temp2 = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}
////////////////////////////////////////////////////////////////////////
cSourceCL = load_source(cSourceFile);
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clCreateProgramWithSource !!!\n\n”);}
ciErr1 = clBuildProgram(cpProgram, 0, NULL, optio, NULL, NULL);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clBuildProgram !!!\n\n”);}
ckKernel = clCreateKernel(cpProgram, “mul”, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clCreateKernel !!!\n\n”);}
////////////////////////////////////////////////////////////////////////
ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&Buffer_daneInA);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}
ciErr1 = clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&Buffer_daneOutRe);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}
ciErr1 = clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&Buffer_daneOutIm);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}
ciErr1 = clSetKernelArg(ckKernel, 3, sizeof(int), (void*)&N);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}
ciErr1 = clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&Buffer_fft_Wre);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}
ciErr1 = clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&Buffer_fft_Wre);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}
ciErr1 = clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&Buffer_fft_Mre);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}
ciErr1 = clSetKernelArg(ckKernel, 7, sizeof(cl_mem), (void*)&Buffer_fft_Mim);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}
ciErr1 = clSetKernelArg(ckKernel, 8, sizeof(cl_mem), (void*)&Buffer_temp1);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}
ciErr1 = clSetKernelArg(ckKernel, 9, sizeof(cl_mem), (void*)&Buffer_temp2);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}
////////////////////////////////////////////////////////////////////////
ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_daneInA, CL_TRUE, 0, siz, pointer_daneInA, 0, NULL, NULL);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueWriteBuffer !!!\n\n”);}
ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Wre, CL_TRUE, 0, siz, pointer_fft_Wre, 0, NULL, NULL);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueWriteBuffer !!!\n\n”);}
ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Wim, CL_TRUE, 0, siz, pointer_fft_Wim, 0, NULL, NULL);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueWriteBuffer !!!\n\n”);}
ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Mre, CL_TRUE, 0, siz, pointer_fft_Mre, 0, NULL, NULL);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueWriteBuffer !!!\n\n”);}
ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Mim, CL_TRUE, 0, siz, pointer_fft_Mim, 0, NULL, NULL);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueWriteBuffer !!!\n\n”);}
ciErr1 = clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
clFinish(cqCommandQue);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueNDRangeKernel !!!\n\n”);}
ciErr1 = clEnqueueReadBuffer(cqCommandQue, Buffer_daneOutRe, CL_TRUE, 0, siz, pointer_daneOutRe, 0, NULL, NULL);
if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueReadBuffer !!!\n\n”);}
showM(daneInA, N);
showM(daneOutRe, N);
}[/codebox]