DFT algorithm, CUDA issue

Hi,

I know that there’s an implementation of FFT to compute the Fourier transform, but this is part of a class exercise. So here is my cuda kernel to compute the DFT (not FFT) but when I try to execute the program there’s an error the launch timed out and was terminated I read over many pages on the web and I found out that this is due to many accesses to the global memory of the GPU but I don’t know how can I solve it. I’m trying to compute the DFT over images of 2 megapixels and more.

Any ideas? any recommendations (besides using cuFFT)?

__global__ void kernelDFT(u_char *matrix, float *real, float *imag, int row, int col)

{

	int idxI = blockIdx.y * blockDim.y + threadIdx.y;

	int idxJ = blockIdx.x * blockDim.x + threadIdx.x;

	int idxM, idxN;

	float angle, kreal, kimag;

	

	while(idxI < row)

	{

		while(idxJ < col)

		{

			kreal = 0;

			kimag = 0;

			for (idxM = 0; idxM < row; idxM++) {

				for (idxN = 0; idxN < col; idxN++) {

					angle = (idxI*idxM/(float)row + idxJ*idxN/(float)col)*2.0f*M_PI;

					kreal += matrix[idxM*col+idxN]*cosf( angle );

					kimag += matrix[idxM*col+idxN]*sinf( angle );

				}

			}

			real[idxI*col+idxJ] = kreal;

			imag[idxI*col+idxJ] = kimag;

			

			idxJ += blockDim.x * gridDim.x;

		}

		idxI += blockDim.y * gridDim.y;

		idxJ = blockIdx.x * blockDim.x + threadIdx.x;

	}

}

void pprDFT(pprMatrix *matrix, pprComplexMatrix *cmatrix)

{

	u_char *d_matrix;

	float *d_real, *d_imag;

	cudaEvent_t start, stop;

	float elapsedTime;

	dim3 blocks(2,2);

	dim3 threads(2,2);

	cmatrix->row = matrix->row;

	cmatrix->col = matrix->col;

	

	pprComplexMatrixMem(cmatrix);

	

	//Create start, stop events.

	CHECK_ERROR( cudaEventCreate( &start ) );

	CHECK_ERROR( cudaEventCreate( &stop ) );

	

	//Launch the start event.

	CHECK_ERROR( cudaEventRecord(start, 0) );

	

	//Allocate memory on the GPU

	CHECK_ERROR( cudaMalloc((void**)&d_matrix, sizeof(u_char)*matrix->row*matrix->col) );

	CHECK_ERROR( cudaMalloc((void**)&d_real, sizeof(float)*matrix->row*matrix->col) );

	CHECK_ERROR( cudaMalloc((void**)&d_imag, sizeof(float)*matrix->row*matrix->col) );

	

	//Copy information from CPU(host) to GPU(device).

	CHECK_ERROR( cudaMemcpy(d_matrix, matrix->data, sizeof(u_char)*matrix->row*matrix->col, cudaMemcpyHostToDevice ) );

	

	//Launch the kernel computation to create the DFT.

	kernelDFT<<<blocks, threads>>>(d_matrix, d_real, d_imag, matrix->row, matrix->col);

	

	//Copy information from GPU(device) to CPU(host)

	CHECK_ERROR( cudaMemcpy(cmatrix->data.real, d_real, sizeof(float)*matrix->row*matrix->col, cudaMemcpyDeviceToHost) );

	CHECK_ERROR( cudaMemcpy(cmatrix->data.imag, d_imag, sizeof(float)*matrix->row*matrix->col, cudaMemcpyDeviceToHost) );

	

	//Launch the stop event

	CHECK_ERROR( cudaEventRecord(stop, 0) );

	CHECK_ERROR( cudaEventSynchronize(stop));

	

	//Print the elapsed time.

	CHECK_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) );

	printf( "%3.1f ms\n", elapsedTime );

	

	//Destroy the events.

	CHECK_ERROR( cudaEventDestroy( start ) );

	CHECK_ERROR( cudaEventDestroy( stop ) );

	

	//Free the allocated memory on GPU.

	cudaFree(d_matrix);

	cudaFree(d_real);

	cudaFree(d_imag);

}

“the launch timed out and was terminated” means exactly that - if a display is connected to the GPU, there is a limit of somewhere between 2 to 5 seconds for the execution time of a kernel. This prevents long-running (or infinite…) kernels from blocking display updates for excessive timespans, so you don’t get locked out of your computer.

Without having looked at your code, it is quite likely that a Fourier transform over several megapixels will exceed this execution time limit if FFT is not used.

To work around this problem, you can either use a dedicated GPU for CUDA (or under Linux turn off X and work from the console). Or divide the work between multiple shorter kernel launches to give the GPU a chance to update the screen in between.

Or rewrite your code to use the FFT algorithm.

Ok, thanks, I’m under linux so if I disable de X server it would or it will work, even if the kernel takes much time?

I’ll try it :)

Hi I disable the X server but still the same error message, so I decided to make a new code where the kernel performs a little part of the whole work and then executing that kernel several times should do the work :), here’s my new code but my DFT isn’t giving me the correct calculations…

I think my threads aren’t well syncronized but dunno where should I put a barrier…Any idea would be much appreciated !

__global__ void kernelDFT(u_char *matrix, float *real, float *imag, int idxI, int idxJ)

{

	int idxM = blockIdx.y * blockDim.y + threadIdx.y;

	int idxN = blockIdx.x * blockDim.x + threadIdx.x;

	float angle;

	

	while(idxM < d_row)

	{

		while(idxN < d_col)

		{

			angle = (idxI*idxM/(float)d_row + idxJ*idxN/(float)d_col)*2.0f*M_PI;

			real[idxI*d_col+idxJ] += matrix[idxM*d_col+idxN]*cosf( angle );

			imag[idxI*d_col+idxJ] += matrix[idxM*d_col+idxN]*sinf( angle );

			idxN += blockDim.x * gridDim.x;

		}

		idxM += blockDim.y * gridDim.y;

		idxN = blockIdx.x * blockDim.x + threadIdx.x;

	}

}

void pprDFT(pprMatrix *matrix, pprComplexMatrix *cmatrix)

{

	u_char *d_matrix;

	int idxI, idxJ;

	float *d_real, *d_imag, elapsedTime;

	cudaEvent_t start, stop;

	dim3 blocks(3,3);

	dim3 threads(16,16);

	cmatrix->row = matrix->row;

	cmatrix->col = matrix->col;

	

	pprComplexMatrixMem(cmatrix);

	

	//Create start, stop events.

	CHECK_ERROR( cudaEventCreate( &start ) );

	CHECK_ERROR( cudaEventCreate( &stop ) );

	

	//Launch the start event.

	CHECK_ERROR( cudaEventRecord(start, 0) );

	

	//Allocate memory on the GPU

	CHECK_ERROR( cudaMalloc((void**)&d_matrix, sizeof(u_char)*matrix->row*matrix->col) );

	CHECK_ERROR( cudaMalloc((void**)&d_real, sizeof(float)*matrix->row*matrix->col) );

	CHECK_ERROR( cudaMalloc((void**)&d_imag, sizeof(float)*matrix->row*matrix->col) );

	

	//Copy information from CPU(host) to GPU(device).

	CHECK_ERROR( cudaMemcpy(d_matrix, matrix->data, sizeof(u_char)*matrix->row*matrix->col, cudaMemcpyHostToDevice ) );

	cudaMemcpyToSymbol("d_row", &matrix->row, sizeof(int), 0, cudaMemcpyHostToDevice);

	cudaMemcpyToSymbol("d_col", &matrix->col, sizeof(int), 0, cudaMemcpyHostToDevice);

	//Set values of d_real and d_imag.

	CHECK_ERROR( cudaMemset( (void*) d_real, 0, sizeof(float)*matrix->row*matrix->col) );

	CHECK_ERROR( cudaMemset( (void*) d_imag, 0, sizeof(float)*matrix->row*matrix->col) );

	//Launch the kernel computation to create the DFT.

	for (idxI = 0; idxI < matrix->row; idxI++)

		for (idxJ = 0; idxJ < matrix->col; idxJ++){

			kernelDFT<<<blocks, threads>>>(d_matrix, d_real, d_imag, idxI, idxJ);

			CHECK_ERROR( cudaThreadSynchronize() );

		}

	//Copy information from GPU(device) to CPU(host)

	CHECK_ERROR( cudaMemcpy(cmatrix->data.real, d_real, sizeof(float)*matrix->row*matrix->col, cudaMemcpyDeviceToHost) );

	CHECK_ERROR( cudaMemcpy(cmatrix->data.imag, d_imag, sizeof(float)*matrix->row*matrix->col, cudaMemcpyDeviceToHost) );

	

	//Launch the stop event

	CHECK_ERROR( cudaEventRecord(stop, 0) );

	CHECK_ERROR( cudaEventSynchronize(stop));

	

	//Print the elapsed time.

	CHECK_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) );

	printf( "%3.1f ms\n", elapsedTime );

	

	//Destroy the events.

	CHECK_ERROR( cudaEventDestroy( start ) );

	CHECK_ERROR( cudaEventDestroy( stop ) );

	

	//Free the allocated memory on GPU.

	cudaFree(d_matrix);

	cudaFree(d_real);

	cudaFree(d_imag);

}

http://forums.nvidia.com/index.php?showtopic=199600&view=findpost&p=1233516

You would need to use atomic operations to ensure correctness.