problem with low float precision only on part of output buffer

hello,

I have a problem with low-precision of calculations - I try to do a one-dimensional FFT transform on matrix(using DFT matrix). Unfortunately, the FFT and IFFT results compared with the input data vary considerably (above 0.1), and the strange is that the first line has the correct results. it does not matter whether the input data(matrix) has a size of 8x8 or more (256x256).

in each case, the results differ significantly, much more than could be due to the float precision.

I tried to build a kernel with option ‘-cl-opt-disable’, unfortunately it does not improve accuracy. i tried also separating two ‘for’ loops in one kernel into separate two kernels to avoid synchronization problems, but without result (only result from first kernel is valid). changing local size is also ineffective.

any ideas what I am doing wrong?

below the sample input and output data (the first and third row of the output is correct, but differs in the first column) and the host and kernel code.

please notice that row 1 and 3 in output are invalid and the same, while row 0 and 3 are correct.

input:

0.840188 0.394383 0.783099 0.798440

0.911647 0.197551 0.335223 0.768230

0.277775 0.553970 0.477397 0.628871

0.364784 0.513401 0.952230 0.916195

output:

0.840188 0.394383 0.783099 0.798440

0.638216 0.355476 0.643726 0.842212

0.277775 0.553970 0.477397 0.628871

0.638216 0.355476 0.643726 0.842212

fft1d.cl

[codebox]__kernel void mul(

__global float* InA,

__global float* OutRe,

__global float* OutIm,

int N,

__global float* Wre,

__global float* Wim,

__global float* Mre,

__global float* Mim,

__global float* tempre,

__global float* tempim

)

{

int gid0 = get_global_id(0);

int gid1 = get_global_id(1);

int i,j,k;

int size = N;

float wynikre = 0;

float wynikim = 0;

if(gid0<N && gid1<N) {

// fft1d

for(j=0;j<N;j++) {

wynikre += Wre[gid1*N+j]InA[jN+gid0]; //RE

wynikim += Wim[gid1*N+j]InA[jN+gid0]; //IM

}

tempre[gid1*N+gid0] = wynikre;

tempim[gid1*N+gid0] = wynikim;

barrier(CLK_GLOBAL_MEM_FENCE);

wynikre = 0;

wynikim = 0;

// ifft1d

for(j=0;j<N;j++) {

wynikre += Mre[gid1N+j]tempre[jN+gid0]-Mim[gid1N+j]tempim[jN+gid0] ;

wynikim += Mre[gid1N+j]tempim[jN+gid0]+Mim[gid1N+j]tempre[jN+gid0

] ;

}

OutRe[gid1*N+gid0] = wynikre;

OutIm[gid1*N+gid0] = wynikim;

barrier(CLK_GLOBAL_MEM_FENCE);

}

fft.cpp

}[/codebox]

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <malloc.h>

#include <unistd.h>

#include <time.h>

#include

#include

#include <math.h>

#include <complex.h>

#include “CL/cl.h”

using namespace std;

const char* cSourceFile = “fft1d.cl”; // dct kernel file

cl_context cxGPUContext; // OpenCL context

cl_command_queue cqCommandQue; // OpenCL command que

cl_device_id* cdDevices; // OpenCL device list

cl_program cpProgram; // OpenCL program

cl_kernel ckKernel; // OpenCL kernel

size_t GlobalWorkSize[2];

size_t LocalWorkSize[2];

size_t ParmDataBytes; // Byte size of context information

cl_int ciErr1, ciErr2; // Error code var

char* cPathAndName = NULL; // Var for paths

char* cSourceCL = NULL; // Buffer to hold source for compilation

char* load_source(const char* filename)

{

FILE* stream = NULL;

size_t source_lenght;

stream = fopen(filename, “rb”);

if(stream == 0)

{

return NULL;

}

fseek(stream, 0, SEEK_END);

source_lenght = ftell(stream);

fseek(stream, 0, SEEK_SET);

size_t result =0;

char* source_string = (char *)malloc(source_lenght + 1);

result = fread(source_string, source_lenght, 1, stream);

fclose(stream);

source_string[source_lenght] = ‘\0’;

return source_string;

}

void showM(float* daneOut, int N) {

int i=0;

for(int i =0;i<N*N;i++) {

if (i%N == 0) { printf(“\n”); }

printf("%f ", *(daneOut+i));

}

printf(“\n”);

}

int main(int argc, char **argv)

{

const char* optio = “-cl-opt-disable”;

int N = 4;

int siz = sizeof(float)NN;

GlobalWorkSize[0] = N;

GlobalWorkSize[1] = N;

LocalWorkSize[0] = 2;

LocalWorkSize[1] = 2;

float PI = 3.14159265;

complex float a = cos(2PI/N)-sin(2PI/N)*I;

complex float b;

int k, w;

////////////////////////////////////////////////////////////////////////

cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ciErr1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clCreateContextFromType !!!\n\n”);}

ciErr1 = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);

cdDevices = (cl_device_id*)malloc(ParmDataBytes);

ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, cdDevices, NULL);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clGetContextInfo !!!\n\n”);}

cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clCreateCommandQueue !!!\n\n”);}

////////////////////////////////////////////////////////////////////////

float* daneInA;

daneInA = (float*)malloc(siz);

void * pointer_daneInA= (float *)daneInA;

for(int j=0;j<N*N;j++) {

*(daneInA+j)= (float) rand()/RAND_MAX;

}

float* daneOutRe;

daneOutRe = (float*)malloc(siz);

void * pointer_daneOutRe= (float *)daneOutRe;

for(int j=0;j<N*N;j++) {

*(daneOutRe+j)= 0;

}

float* daneOutIm;

daneOutIm = (float*)malloc(siz);

void * pointer_daneOutIm= (float *)daneOutIm;

for(int j=0;j<N*N;j++) {

*(daneOutIm+j)=0;

}

// Wre

float* data_fft_Wre;

data_fft_Wre = (float*)malloc(siz);

void * pointer_fft_Wre= (float *)data_fft_Wre;

for(k=0;k<N;k++) {

for(w=0;w<N;w++) {

b = k*w;

(data_fft_Wre+(k+wN))=crealf( cpow( a,b ) );

}

}

// Wim

float* data_fft_Wim;

data_fft_Wim = (float*)malloc(siz);

void * pointer_fft_Wim= (float *)data_fft_Wim;

for(k=0;k<N;k++) {

for(w=0;w<N;w++) {

b = k*w;

(data_fft_Wim+(k+wN))=cimagf( cpow( a,b ) );

}

}

// Mre

float* data_fft_Mre;

data_fft_Mre = (float*)malloc(siz);

void * pointer_fft_Mre= (float *)data_fft_Mre;

for(k=0;k<N;k++) {

for(w=0;w<N;w++) {

b = -k*w;

(data_fft_Mre+(k+wN))=crealf( cpow( a,b ) )/N;

}

}

// Mim

float* data_fft_Mim;

data_fft_Mim = (float*)malloc(siz);

void * pointer_fft_Mim= (float *)data_fft_Mim;

for(k=0;k<N;k++) {

for(w=0;w<N;w++) {

b = -k*w;

(data_fft_Mim+(k+wN))=cimagf( cpow( a,b ) )/N;

}

}

float* temp1;

temp1 = (float*)malloc(siz);

void * pointer_temp1= (float *)temp1;

for(int j=0;j<N*N;j++) {

*(temp1+j)=0;

}

float* temp2;

temp2 = (float*)malloc(siz);

void * pointer_temp2= (float *)temp2;

for(int j=0;j<N*N;j++) {

*(temp2+j)=0;

}

cl_mem Buffer_daneInA = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneIn !!!\n\n”);}

cl_mem Buffer_daneOutRe = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut!!!\n\n”);}

cl_mem Buffer_daneOutIm = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}

cl_mem Buffer_fft_Wre = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}

cl_mem Buffer_fft_Wim = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}

cl_mem Buffer_fft_Mre = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}

cl_mem Buffer_fft_Mim = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}

cl_mem Buffer_temp1 = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}

cl_mem Buffer_temp2 = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clcreatebuffer Buffer_daneOut2!!!\n\n”);}

////////////////////////////////////////////////////////////////////////

cSourceCL = load_source(cSourceFile);

cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, NULL, &ciErr1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clCreateProgramWithSource !!!\n\n”);}

ciErr1 = clBuildProgram(cpProgram, 0, NULL, optio, NULL, NULL);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clBuildProgram !!!\n\n”);}

ckKernel = clCreateKernel(cpProgram, “mul”, &ciErr1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clCreateKernel !!!\n\n”);}

////////////////////////////////////////////////////////////////////////

ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&Buffer_daneInA);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}

ciErr1 = clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&Buffer_daneOutRe);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}

ciErr1 = clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&Buffer_daneOutIm);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}

ciErr1 = clSetKernelArg(ckKernel, 3, sizeof(int), (void*)&N);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}

ciErr1 = clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&Buffer_fft_Wre);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}

ciErr1 = clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&Buffer_fft_Wre);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}

ciErr1 = clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&Buffer_fft_Mre);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}

ciErr1 = clSetKernelArg(ckKernel, 7, sizeof(cl_mem), (void*)&Buffer_fft_Mim);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}

ciErr1 = clSetKernelArg(ckKernel, 8, sizeof(cl_mem), (void*)&Buffer_temp1);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}

ciErr1 = clSetKernelArg(ckKernel, 9, sizeof(cl_mem), (void*)&Buffer_temp2);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clSetKernelArg !!!\n\n”);}

////////////////////////////////////////////////////////////////////////

ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_daneInA, CL_TRUE, 0, siz, pointer_daneInA, 0, NULL, NULL);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueWriteBuffer !!!\n\n”);}

ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Wre, CL_TRUE, 0, siz, pointer_fft_Wre, 0, NULL, NULL);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueWriteBuffer !!!\n\n”);}

ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Wim, CL_TRUE, 0, siz, pointer_fft_Wim, 0, NULL, NULL);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueWriteBuffer !!!\n\n”);}

ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Mre, CL_TRUE, 0, siz, pointer_fft_Mre, 0, NULL, NULL);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueWriteBuffer !!!\n\n”);}

ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Mim, CL_TRUE, 0, siz, pointer_fft_Mim, 0, NULL, NULL);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueWriteBuffer !!!\n\n”);}

ciErr1 = clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);

clFinish(cqCommandQue);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueNDRangeKernel !!!\n\n”);}

ciErr1 = clEnqueueReadBuffer(cqCommandQue, Buffer_daneOutRe, CL_TRUE, 0, siz, pointer_daneOutRe, 0, NULL, NULL);

if (ciErr1 != CL_SUCCESS) {printf(“Error in clEnqueueReadBuffer !!!\n\n”);}

showM(daneInA, N);

showM(daneOutRe, N);

}[/codebox]

This is just an idea, but try casting the actual values as doubles and computing them as such. I had issues like this with CUDA and by doing the actual computation as double, it solved the accuracy problem. You can store as float, that’s fine. Though I’m not sure why rows 1 and 3 would be accurate and not the rest. Again, just an idea to try out.