Hi,
I know that FFT implementations exist for computing the Fourier transform, but this is part of a class exercise. Below is my CUDA kernel that computes the DFT (not the FFT). When I try to execute the program, it fails with the error "the launch timed out and was terminated". I read many pages on the web and found out that this is due to too many accesses to the GPU's global memory, but I don't know how to solve it. I'm trying to compute the DFT of images of 2 megapixels and more.
Any ideas or recommendations (besides using cuFFT)?
__global__ void kernelDFT(u_char *matrix, float *real, float *imag, int row, int col)
{
int idxI = blockIdx.y * blockDim.y + threadIdx.y;
int idxJ = blockIdx.x * blockDim.x + threadIdx.x;
int idxM, idxN;
float angle, kreal, kimag;
while(idxI < row)
{
while(idxJ < col)
{
kreal = 0;
kimag = 0;
for (idxM = 0; idxM < row; idxM++) {
for (idxN = 0; idxN < col; idxN++) {
angle = (idxI*idxM/(float)row + idxJ*idxN/(float)col)*2.0f*M_PI;
kreal += matrix[idxM*col+idxN]*cosf( angle );
kimag += matrix[idxM*col+idxN]*sinf( angle );
}
}
real[idxI*col+idxJ] = kreal;
imag[idxI*col+idxJ] = kimag;
idxJ += blockDim.x * gridDim.x;
}
idxI += blockDim.y * gridDim.y;
idxJ = blockIdx.x * blockDim.x + threadIdx.x;
}
}
void pprDFT(pprMatrix *matrix, pprComplexMatrix *cmatrix)
{
u_char *d_matrix;
float *d_real, *d_imag;
cudaEvent_t start, stop;
float elapsedTime;
dim3 blocks(2,2);
dim3 threads(2,2);
cmatrix->row = matrix->row;
cmatrix->col = matrix->col;
pprComplexMatrixMem(cmatrix);
//Create start, stop events.
CHECK_ERROR( cudaEventCreate( &start ) );
CHECK_ERROR( cudaEventCreate( &stop ) );
//Launch the start event.
CHECK_ERROR( cudaEventRecord(start, 0) );
//Allocate memory on the GPU
CHECK_ERROR( cudaMalloc((void**)&d_matrix, sizeof(u_char)*matrix->row*matrix->col) );
CHECK_ERROR( cudaMalloc((void**)&d_real, sizeof(float)*matrix->row*matrix->col) );
CHECK_ERROR( cudaMalloc((void**)&d_imag, sizeof(float)*matrix->row*matrix->col) );
//Copy information from CPU(host) to GPU(device).
CHECK_ERROR( cudaMemcpy(d_matrix, matrix->data, sizeof(u_char)*matrix->row*matrix->col, cudaMemcpyHostToDevice ) );
//Launch the kernel computation to create the DFT.
kernelDFT<<<blocks, threads>>>(d_matrix, d_real, d_imag, matrix->row, matrix->col);
//Copy information from GPU(device) to CPU(host)
CHECK_ERROR( cudaMemcpy(cmatrix->data.real, d_real, sizeof(float)*matrix->row*matrix->col, cudaMemcpyDeviceToHost) );
CHECK_ERROR( cudaMemcpy(cmatrix->data.imag, d_imag, sizeof(float)*matrix->row*matrix->col, cudaMemcpyDeviceToHost) );
//Launch the stop event
CHECK_ERROR( cudaEventRecord(stop, 0) );
CHECK_ERROR( cudaEventSynchronize(stop));
//Print the elapsed time.
CHECK_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) );
printf( "%3.1f ms\n", elapsedTime );
//Destroy the events.
CHECK_ERROR( cudaEventDestroy( start ) );
CHECK_ERROR( cudaEventDestroy( stop ) );
//Free the allocated memory on GPU.
cudaFree(d_matrix);
cudaFree(d_real);
cudaFree(d_imag);
}