hi everyone
yes it is me again and this time i have a problem with my cufft exec functions!? or so i though… until i did an error check of all my cuda calls and belive it or not i has been traced down to cudaMemcpy!! perhaps someone here can prove that i a noob after all copying data structure from cpu to gpu that are not suported… or maybe someone can point out a more efficient way of acomplishing what i’ve set out to do.
right, so here is the code:
[codebox]///////////////////////////////////////////////////////////////////////////////
// Name: fft_qpskDemod.cu
// Copyright: NUI Maynooth
// Author: Isa Iminov
// Date: 27/07/09
// Description:
///////////////////////////////////////////////////////////////////////////////
include <stdlib.h>
include <stdio.h>
include <string.h>
include <math.h>
include <cuda.h>
include <cuda_runtime.h>
include <cutil.h>
include <cutil_inline.h>
include <cufft.h>
//include “kernel_fft_qpskDemod.cu”
//include “structures.h”
///////////////////////////////////////////////////////////////////////////////
// Name: structures.h
// Copyright: NUI Maynooth
// Author: Isa Iminov
// Date: 17/07/09
// Description: Declaration of global types
///////////////////////////////////////////////////////////////////////////////
#ifndef STRUCTURES_H
define STRUCTURES_H
///////////////////////////////////////////////////////////////////////////////
// Global types
///////////////////////////////////////////////////////////////////////////////
typedef struct {
float real;
float imag;
} Complex;
///////////////////////////////////////////////////////////////////////////////
// Name: kernel_fft_qpskDemod.cu
// Copyright: NUI Maynooth
// Author: Isa Iminov
// Date: 27/07/09
// Description:
///////////////////////////////////////////////////////////////////////////////
#ifndef KERNEL_QPSK2D_CU
define KERNEL_QPSK2D_CU
////////////////////////////////////////////////////////////////////////////////
// Global types
////////////////////////////////////////////////////////////////////////////////
//include “structures.h”
////////////////////////////////////////////////////////////////////////////////
// Kernel that copies real data to complex data
////////////////////////////////////////////////////////////////////////////////
global void real2complex( Complex *d_b, cufftComplex *c, int R, int C ) {
/////////////////////////////////////////////////////////////////////////////
// Compute index for RxC array/data structure
/////////////////////////////////////////////////////////////////////////////
int ix = blockIdx.x * blockDim.x + threadIdx.x; // row of threads (single channel)
//int iy = blockIdx.y * blockDim.y + threadIdx.y;
//int idx = ix + iy * N;
/////////////////////////////////////////////////////////////////////////////
// Perform operation
/////////////////////////////////////////////////////////////////////////////
/*if ( ix < R && iy < C ) {
c[idx].x = d_b[idx].real;
c[idx].y = d_b[idx].imag;
} // end if
*/
for ( int iy = 0; iy < R; iy++ ) {
c[ iy*C + ix ].x = d_b[ iy*C + ix ].real;
c[ iy*C + ix ].y = d_b[ iy*C + ix ].imag;
} // end for
} // end real2complex
////////////////////////////////////////////////////////////////////////////////
// Kernel that copies complex data to real data
////////////////////////////////////////////////////////////////////////////////
global void complex2real( cufftComplex *c, Complex *d_b, int R, int C ) {
/////////////////////////////////////////////////////////////////////////////
// Compute index for RxC array/data structure
/////////////////////////////////////////////////////////////////////////////
int ix = blockIdx.x * blockDim.x + threadIdx.x; // row of threads (single channel)
//int iy = blockIdx.y * blockDim.y + threadIdx.y;
//int idx = ix + iy * N;
/////////////////////////////////////////////////////////////////////////////
// Perform operation
/////////////////////////////////////////////////////////////////////////////
/*if ( idx < N && idy < N ) {
d_b[idx].real = c[idx].x;
d_b[idx].imag = c[idx].y;
} // end if
*/
for ( int iy = 0; iy < R; iy++ ) {
d_b[ iy*C + ix ].real = c[ iy*C + ix ].x;
d_b[ iy*C + ix ].imag = c[ iy*C + ix ].y;
} // end for
} // end complex2real
////////////////////////////////////////////////////////////////////////////////
// Kernel that performs QPSK 2D mapping in global memory
////////////////////////////////////////////////////////////////////////////////
global void qpskModul( float *d_a, Complex *Lookup_d, Complex *d_b, int R, int C ) {
/////////////////////////////////////////////////////////////////////////////
// Compute index for RxC array/data structure
/////////////////////////////////////////////////////////////////////////////
int ix = blockIdx.x * blockDim.x + threadIdx.x; // row of threads (single channel)
//int iy = blockIdx.y * blockDim.y + threadIdx.y;
//int idx = ix + iy * N;
/////////////////////////////////////////////////////////////////////////////
// Perform mapping
/////////////////////////////////////////////////////////////////////////////
/*if ( ix < N && iy < N ) {
d_b[ idx ].real = Lookup_d[ ( int )d_a[ idx ] ].real;
d_b[ idx ].imag = Lookup_d[ ( int )d_a[ idx ] ].imag;
} // end if
*/
for ( int iy = 0; iy < R; iy++ ) {
d_b[ iy*C + ix ].real = Lookup_d[ ( int )d_a[ iy*C + ix ] ].real;
d_b[ iy*C + ix ].imag = Lookup_d[ ( int )d_a[ iy*C + ix ] ].imag;
} // end for
} // end qpskModul
////////////////////////////////////////////////////////////////////////////////
// Kernel that performs QPSK 2D mapping in global memory
////////////////////////////////////////////////////////////////////////////////
global void qpskDemod( float *d_a, Complex *Lookup_d, Complex *d_b, int R, int C ) {
/////////////////////////////////////////////////////////////////////////////
// Compute index for RxC array/data structure
/////////////////////////////////////////////////////////////////////////////
int ix = blockIdx.x * blockDim.x + threadIdx.x; // row of threads (single channel)
//int iy = blockIdx.y * blockDim.y + threadIdx.y;
//int idx = ix + iy * N;
/////////////////////////////////////////////////////////////////////////////
// Perform demapping
/////////////////////////////////////////////////////////////////////////////
/*if ( ix < N && iy < N ) {
if ( d_b[ idx ].real > 0 && d_b[ idx ].real < -3 ) {
if ( d_b[ idx ].imag > 0 && d_b[ idx ].imag < -3 ) d_a[ idx ] = 0;
else d_a[ idx ] = 2;
} // end if
else {
if ( d_b[ idx ].imag > 0 && d_b[ idx ].imag < -3 ) d_a[ idx ] = 3;
else d_a[ idx ] = 1;
} // end else
} // end if
*/
for ( int iy = 0; iy < R; iy++ ) {
if ( d_b[ iy*C + ix ].real > 0 && d_b[ iy*C + ix ].real < -3 ) {
if ( d_b[ iy*C + ix ].imag > 0 && d_b[ iy*C + ix ].imag < -3 ) d_a[ iy*C + ix ] = 0;
else d_a[ iy*C + ix ] = 2;
} // end if
else {
if ( d_b[ iy*C + ix ].imag > 0 && d_b[ iy*C + ix ].imag < -3 ) d_a[ iy*C + ix ] = 3;
else d_a[ iy*C + ix ] = 1;
} // end else
} // end for
} // end qpskDemod
void checkError() {
cudaError_t cudaError;
cudaError = cudaGetLastError();
if( cudaError != cudaSuccess ) {
fprintf(stderr, “CUDA Runtime API Error reported : %s\n”, cudaGetErrorString(cudaError));
exit(EXIT_FAILURE);
} // end if
} // end checkError
///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
define DO_CPU
#undef DO_CPU
define DO_GPU
//#undef DO_GPU
int main( int argc, char **argv ) {
printf( “fft and QPSK demod program for a 2D matrix…\n” );
printf( “=============================================\n” );
////////////////////////////////////////////////////////////////////////////
// Create timers
////////////////////////////////////////////////////////////////////////////
printf( “Initialising timers…\n” );
unsigned int timer1, timer2, timer3, timer4 = 0;
cutilCheckError( cutCreateTimer( &timer1 ) ); // CPU timer
cutilCheckError( cutCreateTimer( &timer2 ) ); // GPU timer
cutilCheckError( cutCreateTimer( &timer3 ) ); // cudaMemcpy timer 1
cutilCheckError( cutCreateTimer( &timer4 ) ); // cudaMemcpy timer 2
checkError();
////////////////////////////////////////////////////////////////////////////
// Initialise memory
////////////////////////////////////////////////////////////////////////////
printf( “Initialising memory…\n” );
const int R = 128; // make a big array with RxC elements
const int C = 128; // where R = row (channel), C = col (channel elements)
float *a_h; // host data
Complex *b_h, *Lookup; // host data structures
a_h = ( float * )malloc( sizeof( float )RC ); // allocate array on host
b_h = ( Complex * )malloc( sizeof( Complex )RC ); // allocate structure on host
Lookup = ( Complex * )malloc( sizeof( Complex )*4 ); // allocate structure on host
float *a_d; // device data
Complex *b_d, *Lookup_d; // device data structures
cutilSafeCall( cudaMallocHost( ( void ** ) &a_d, sizeof( float )RC ) ); // allocate array on device
cutilSafeCall( cudaMallocHost( ( void ** ) &b_d, sizeof( Complex )RC ) ); // allocate structure on device
cutilSafeCall( cudaMallocHost( ( void ** ) &Lookup_d, sizeof( Complex )*4 ) ); // allocate structure on device
cufftComplex *r_complex_d;
cutilSafeCall( cudaMallocHost( (void **) &r_complex_d, sizeof(cufftComplex)RC) );
checkError();
////////////////////////////////////////////////////////////////////////////
// Initialize host array
////////////////////////////////////////////////////////////////////////////
printf( “Initialising data matrix…\n” );
unsigned int data = 0xf01919c6; // randome 4byte worth of data
unsigned int mask = 0x00000003; // bit mask
unsigned int temp = 0x00000000; // temporary storage
for ( int i = 0; i < R; i++ ) {
for ( int j = 0; j < C; j++ ) {
temp = data & mask;
a_h[ i*C + j ] = ( int )temp; // rand()/(float)RAND_MAX;
data = data >> 2;
} // end for
} // end for
checkError();
////////////////////////////////////////////////////////////////////////////
// Initialise lookup table
////////////////////////////////////////////////////////////////////////////
printf( “Initialising lookup table…\n” );
Lookup[ 0 ].real = -sqrt( 2.0 ); Lookup[ 0 ].imag = -sqrt( 2.0 );
Lookup[ 1 ].real = sqrt( 2.0 ); Lookup[ 1 ].imag = -sqrt( 2.0 );
Lookup[ 3 ].real = sqrt( 2.0 ); Lookup[ 3 ].imag = sqrt( 2.0 );
Lookup[ 2 ].real = -sqrt( 2.0 ); Lookup[ 2 ].imag = sqrt( 2.0 );
printf( “\nLookup Table:\n” );
for ( int i = 0; i < 4; i++ ) printf( “%2d real:%10f imag:%10f\n”, i, Lookup[ i ].real, Lookup[ i ].imag );
printf( "\nPerforming cuda memory copy operation… " );
cutilCheckError( cutStartTimer( timer3 ) ); // start cudaMemcpy timer 3
cutilSafeCall( cudaMemcpy( Lookup_d, Lookup, sizeof( Complex )*4, cudaMemcpyHostToDevice ) );
cutilCheckError( cutStopTimer( timer3 ) ); // stop cudaMemcpy timer 3
printf( " %f ms\n", cutGetTimerValue( timer3 ) );
cutilCheckError( cutResetTimer( timer3 ) );
checkError();
printf( “Initialising lookup table… passed\n” );
ifdef DO_CPU
////////////////////////////////////////////////////////////////////////////
// Do calculation on host
////////////////////////////////////////////////////////////////////////////
printf( “Performing QPSK 2D operation on CPU…\n” );
cutilCheckError( cutStartTimer( timer1 ) ); // start CPU timer
for ( int i = 0; i < R; i++ ) {
for ( int j = 0; j < C; j++ ) {
b_h[ i*C + j ] = Lookup[ ( int )a_h[ i*C + j ] ];
} // end for
} // end for
cutilCheckError( cutStopTimer( timer1 ) ); // stop CPU timer
ifdef DO_GPU
////////////////////////////////////////////////////////////////////////////
// Do calculation on device
////////////////////////////////////////////////////////////////////////////
cutilCheckError( cutStartTimer( timer2 ) ); // start GPU timer
printf( "Performing cuda memory copy operation… " );
cutilCheckError( cutStartTimer( timer3 ) ); // start cudaMemcpy timer 1
cutilSafeCall( cudaMemcpy( a_d, a_h, sizeof( float )RC, cudaMemcpyHostToDevice ) );
cutilCheckError( cutStopTimer( timer3 ) ); // stop cudaMemcpy timer 1
printf( " %f ms\n", cutGetTimerValue( timer3 ) );
checkError();
printf( “Initialising plan for CUDA fft…\n” );
cufftHandle plan;
cufftPlan2d( &plan, R, C, CUFFT_C2C );
//cufftPlan1d( &plan, R*C, CUFFT_C2C, 10 );
checkError();
printf( “Computing execution configuration…\n” );
int block_size_x = 16; // # of threads per block = x*y
int block_size_y = 32; // upto max of 512
dim3 blockDim( block_size_x, block_size_y ); // block size in threads
dim3 gridDim ( R/blockDim.x, C/blockDim.y ); // grid size in bloack
checkError();
printf( “Performing configuration check…\n” );
if ( R % block_size_x != 0 ) gridDim.x += 1;
if ( C % block_size_y != 0 ) gridDim.y += 1;
checkError();
checkError();
qpskModul <<< gridDim, blockDim >>> ( a_d, Lookup_d, b_d, R, C );
real2complex <<< gridDim, blockDim >>> ( b_d, r_complex_d, R, C );
cufftExecC2C( plan, r_complex_d, r_complex_d, CUFFT_INVERSE );
checkError();
cufftExecC2C( plan, r_complex_d, r_complex_d, CUFFT_FORWARD );
complex2real <<< gridDim, blockDim >>> ( r_complex_d, b_d, R, C );
qpskDemod <<< gridDim, blockDim >>> ( a_d, Lookup_d, b_d, R, C );
checkError();
////////////////////////////////////////////////////////////////////////////
// Retrieve result from device and store it in host array
////////////////////////////////////////////////////////////////////////////
printf( "Performing cuda memory copy operation… " );
cutilCheckError( cutStartTimer( timer4 ) ); // start cudaMemcpy timer 2
cutilSafeCall( cudaMemcpy( a_h, a_d, sizeof( float )RC, cudaMemcpyDeviceToHost ) );
cutilCheckError( cutStopTimer( timer4 ) ); // stop cudaMemcpy timer 2
printf( " %f ms\n", cutGetTimerValue( timer4 ) );
cutilCheckError( cutStopTimer( timer2 ) ); // stop GPU timer
checkError();
////////////////////////////////////////////////////////////////////////////
// Print results
////////////////////////////////////////////////////////////////////////////
// printf( “\n x. y data value: real part: imag part:\n” );
// for ( int i = 0; i < R; i++ ) {
// for ( int j = 0; j < C; j++ ) {
// printf( “%3d.%3d %11f %12f %12f\n”, i, j, a_h[ iC + j ], b_h[ iC + j ].real, b_h[ i*C + j ].imag );
// } //end for
// } // end for
printf( “\nCPU calculation time: %f ms\n”, cutGetTimerValue( timer1 ) );
printf( “GPU operation time: %f ms\n”, cutGetTimerValue( timer2 ) );
printf( “GPU calculation time: %f ms\n”, cutGetTimerValue( timer2 )-cutGetTimerValue( timer3 )-cutGetTimerValue( timer4 ) );
printf( “\nLookup Table:\n” );
for ( int i = 0; i < 4; i++ ) printf( “%2d real:%10f imag:%10f\n”, i, Lookup[ i ].real, Lookup[ i ].imag );
//printf( “\nGrid size: %2dx%2dx%2dx blocks (2D max)\nBlock size: %2dx%2dx%2dx threads (3D max)\n”, gridDim, blockDim );
////////////////////////////////////////////////////////////////////////////
// Cleanup
////////////////////////////////////////////////////////////////////////////
printf( “\nCleanup memory…\n” );
free( a_h ); cutilSafeCall( cudaFreeHost( a_d ) ); free( Lookup );
free( b_h ); cutilSafeCall( cudaFreeHost( b_d ) ); cutilSafeCall( cudaFreeHost( Lookup_d ) );
cutilSafeCall( cudaFreeHost( r_complex_d ) );
cufftDestroy( plan );
cutilCheckError( cutDeleteTimer( timer1 ) );
cutilCheckError( cutDeleteTimer( timer2 ) );
cutilCheckError( cutDeleteTimer( timer3 ) );
cutilCheckError( cutDeleteTimer( timer4 ) );
////////////////////////////////////////////////////////////////////////////
// Exit
////////////////////////////////////////////////////////////////////////////
printf( “Shutting down…” );
cutilExit( argc, argv );
} // end main
[/codebox]
so apparantly this is what causing the problem!? during execution now…
cutilSafeCall( cudaMemcpy( Lookup_d, Lookup, sizeof( Complex )*4, cudaMemcpyHostToDevice ) );
to be honest i dont see anything wrong with this statement but thats my point of view and i have a sneacky suspicion that this is not the real cause of the error…
any idea what might be causing this failure? i think i got all of my argumets and declarition and calls correct… as lighter version of this worked before, or so it seem like it worked.
help is much appreciated
thanks
ps: damn am i posting error topics like mad or what ?_?