Hello guys.

I’m studying image processing with CUDA. This time, a depared with a strange issue. The code bellow is the kernel that my friend and me developed together to calculate the 2D Discrete Fourier Transform on a given image.

```
__global__ void fftLines_kernel(float *BRd,float *BId, int m)
{
int Ai = blockIdx.x * m;
__shared__ float2 smA[MAX_WIDTH],smB[MAX_WIDTH];
for (int u = threadIdx.x; u<m;u+= blockDim.x)
{
smA[u].x = tex1Dfetch(t_fr, Ai+u);
smA[u].y = tex1Dfetch(t_fi, Ai+u);
}
__syncthreads();
for (int u = threadIdx.x; u<m;u+= blockDim.x)
{
float2 B = {0.0f,0.0f};
float v = (2 * PI * u)/m;
for (int i=0; i<m;i++)
{
//smB[u].x += smA[i].x * cos(v*i) - smA[i].y * -sin(v*i);
//smB[u].y += smA[i].x *-sin(v*i) + smA[i].y * cos(v*i);
B.x += smA[i].x * cos(v*i) - smA[i].y * -sin(v*i);
B.y += smA[i].x *-sin(v*i) + smA[i].y * cos(v*i);
}
BRd[Ai+u] = B.x;
BId[Ai+u] = B.y;
//BRd[Ai+u] = smA[u].x;
//BId[Ai+u] = smA[u].y;
}
}
```

The image to be processed goes in 2 textures, one for the real part and the other to the imaginary part, as the DFT result is complex. To execute the DFT, we need to pass this kernel on the image lines, rotate the image 90Â°, pass the kernel again to process image colunms and rotate again to right position.

The strange issue that we depared was the execution time. On a 256x256 pixels grayscale image, the whole process take about 15 ms to complete. Our goal is to match the execution time of a reference function fft2d implemented in the numpy library. The reference function runs in about 4 ms in the same image.

But curiosily, the execution of our CUDA program runs in about 2.5 ms, if we disconsider the copy of the result to the global memory. In the end of the code, there are two lines comented, that copy the data in shared memory to the global memory. Uncommneting this line and commenting the two lines above, give this result, even with the computing running aimlessly. The process is accumulated in a register variable, named B in the kernel.

We don’t know what is causin this. We would like to know if someone can help us to resolve this issue. How can a copy of a register variable make the performance drop so much? Note that if we only write in it, the code runs in 2 ms, but copying the register to the global memory drops the performance down.

This link shows a web page where the complete program can be viewed: http://parati.dca.fee.unicamp.br/adesso/wiki/courseIA366F2S2010/wd101245_11/view/

Here goes the complete CUDA code for the 2D DFT:

```
#include <string.h>
#include <math.h>
#include <cuda.h>
#undef _POSIX_C_SOURCE
#undef _XOPEN_SOURCE
#include "simple_arrays.h"
////////////////////////////////////////////////////////////////////////////////
// Kernel configuration
////////////////////////////////////////////////////////////////////////////////
#define BLOCK_SIZE_2D 16
#define BLOCK_SIZE_1D 256
#define PI 3.1415926535897931
#define MAX_WIDTH 512
//------------------------------------------------------------------------------
// DeclaraÃ§Ã£o de textura referencia
//------------------------------------------------------------------------------
texture<float, 1, cudaReadModeElementType> t_fr;
texture<float, 1, cudaReadModeElementType> t_fi;
//Arredonda a / b para o valor do vizinho superior
__host__ int iDivUp(int a, int b){
return (int)(a % b != 0) ? (a / b + 1) : (a / b);
}
//------------------------------------------------------------------------------
// CUDA Kernel
//------------------------------------------------------------------------------
__global__ void fftLines_kernel(float *BRd,float *BId, int m)
{
int Ai = blockIdx.x * m; // + u
__shared__ float2 smA[MAX_WIDTH],smB[MAX_WIDTH];
for (int u = threadIdx.x; u<m;u+= blockDim.x)
{
smA[u].x = tex1Dfetch(t_fr, Ai+u);
smA[u].y = tex1Dfetch(t_fi, Ai+u);
}
__syncthreads();
for (int u = threadIdx.x; u<m;u+= blockDim.x)
{
//float BRi = 0;
//float BIi = 0;
float2 B = {0.0f,0.0f};
float v = (2 * PI * u)/m;
for (int i=0; i<m;i++)
{
//smB[u].x += smA[i].x * cos(v*i) - smA[i].y * -sin(v*i);
//smB[u].y += smA[i].x *-sin(v*i) + smA[i].y * cos(v*i);
B.x += smA[i].x * cos(v*i) - smA[i].y * -sin(v*i);
B.y += smA[i].x *-sin(v*i) + smA[i].y * cos(v*i);
//BRi += smA[i].x * cos(v*i) - smA[i].y * -sin(v*i);
//BIi += smA[i].x *-sin(v*i) + smA[i].y * cos(v*i);
//BRi += tex1Dfetch(t_fr, Ai+i) * cos(v*i) - tex1Dfetch(t_fi, Ai+i) * -sin(v*i);
//BIi += tex1Dfetch(t_fr, Ai+i) *-sin(v*i) + tex1Dfetch(t_fi, Ai+i) * cos(v*i);
}
BRd[Ai+u] = B.x;
BId[Ai+u] = B.y;
//BRd[Ai+u] = BRi;
//BId[Ai+u] = BIi;
//smB[u] = B;
//BRd[Ai+u] = smA[u].x;
//BId[Ai+u] = smA[u].y;
}
/*
for (int u = threadIdx.x; u<m;u+= blockDim.x)
{
BRd[Ai+u] = 0.1f;//smB[u].x;
BId[Ai+u] = 0.1f;//smB[u].y;
}
*/
}
__global__ void fftTraspose_kernel(float *BRd,float *BId,int m,int n)
{
int Bx = blockDim.x*blockIdx.x + threadIdx.x;
int By = blockDim.y*blockIdx.y + threadIdx.y;
if (By >= m || Bx >= n)return;
BRd[By*n+Bx] = tex1Dfetch(t_fr, Bx*m+By);
BId[By*n+Bx] = tex1Dfetch(t_fi, Bx*m+By);
}
//------------------------------------------------------------------------------
// Kernel Stub
//------------------------------------------------------------------------------
int fft2d_stub(float *AR,float *AI, float *BR,float *BI,int height, int width)
{
float *ARd,*AId;
float *BRd,*BId;
int sizef = width * height * sizeof(float);
cudaMalloc((void **)&ARd, sizef);
cudaMalloc((void **)&AId, sizef);
cudaMalloc((void **)&BRd, sizef);
cudaMalloc((void **)&BId, sizef);
cudaMemcpy(ARd, AR, sizef, cudaMemcpyHostToDevice);
cudaMemcpy(AId, AI, sizef, cudaMemcpyHostToDevice);
// Texture
cudaBindTexture(0,t_fr,ARd, width * height * sizeof(float));
cudaBindTexture(0,t_fi,AId, width * height * sizeof(float));
dim3 BlockDim2D( BLOCK_SIZE_2D, BLOCK_SIZE_2D);
dim3 gridDim1 ( iDivUp(width,BLOCK_SIZE_2D) , iDivUp(height,BLOCK_SIZE_2D));
dim3 gridDim2 ( iDivUp(height,BLOCK_SIZE_2D), iDivUp(width,BLOCK_SIZE_2D));
fftLines_kernel<<<height,BLOCK_SIZE_1D>>>(BRd,BId,width); // A -> B
cudaThreadSynchronize();
cudaUnbindTexture(t_fr);
cudaUnbindTexture(t_fi);
cudaBindTexture(0,t_fr,BRd, width * height * sizeof(float));
cudaBindTexture(0,t_fi,BId, width * height * sizeof(float));
fftTraspose_kernel<<<gridDim1 , BlockDim2D>>>(ARd,AId,width,height); // B -> A
cudaThreadSynchronize();
cudaUnbindTexture(t_fr);
cudaUnbindTexture(t_fi);
cudaBindTexture(0,t_fr,ARd, width * height * sizeof(float));
cudaBindTexture(0,t_fi,AId, width * height * sizeof(float));
fftLines_kernel<<<width,BLOCK_SIZE_1D>>>(BRd,BId,height); // A -> B
cudaThreadSynchronize();
cudaUnbindTexture(t_fr);
cudaUnbindTexture(t_fi);
cudaBindTexture(0,t_fr,BRd, width * height * sizeof(float));
cudaBindTexture(0,t_fi,BId, width * height * sizeof(float));
fftTraspose_kernel<<<gridDim2 , BlockDim2D>>>(ARd,AId,height,width); // B -> A
cudaThreadSynchronize();
cudaUnbindTexture(t_fr);
cudaUnbindTexture(t_fi);
// Copy data from device
cudaMemcpy(BR, ARd, sizef, cudaMemcpyDeviceToHost);
cudaMemcpy(BI, AId, sizef, cudaMemcpyDeviceToHost);
// Release allocated device memory
cudaFree(ARd);
cudaFree(AId);
cudaFree(BRd);
cudaFree(BId);
return 1;
}
```