cutil error problem with cudaMemcoy ?_?

hi everyone

yes it is me again and this time i have a problem with my cufft exec functions!? or so i though… until i did an error check of all my cuda calls and belive it or not i has been traced down to cudaMemcpy!! perhaps someone here can prove that i a noob after all copying data structure from cpu to gpu that are not suported… or maybe someone can point out a more efficient way of acomplishing what i’ve set out to do.

right, so here is the code:

[codebox]///////////////////////////////////////////////////////////////////////////////

// Name: fft_qpskDemod.cu

// Copyright: NUI Maynooth

// Author: Isa Iminov

// Date: 27/07/09

// Description:

///////////////////////////////////////////////////////////////////////////////

include <stdlib.h>

include <stdio.h>

include <string.h>

include <math.h>

include <cuda.h>

include <cuda_runtime.h>

include <cutil.h>

include <cutil_inline.h>

include <cufft.h>

//include “kernel_fft_qpskDemod.cu”

//include “structures.h”

///////////////////////////////////////////////////////////////////////////////

// Name: structures.h

// Copyright: NUI Maynooth

// Author: Isa Iminov

// Date: 17/07/09

// Description: Declaration of global types

///////////////////////////////////////////////////////////////////////////////

#ifndef STRUCTURES_H

define STRUCTURES_H

///////////////////////////////////////////////////////////////////////////////

// Global types

///////////////////////////////////////////////////////////////////////////////

typedef struct {

float real;

float imag;

} Complex;

endif

///////////////////////////////////////////////////////////////////////////////

// Name: kernel_fft_qpskDemod.cu

// Copyright: NUI Maynooth

// Author: Isa Iminov

// Date: 27/07/09

// Description:

///////////////////////////////////////////////////////////////////////////////

#ifndef KERNEL_QPSK2D_CU

define KERNEL_QPSK2D_CU

////////////////////////////////////////////////////////////////////////////////

// Global types

////////////////////////////////////////////////////////////////////////////////

//include “structures.h”

////////////////////////////////////////////////////////////////////////////////

// Kernel that copies real data to complex data

////////////////////////////////////////////////////////////////////////////////

global void real2complex( Complex *d_b, cufftComplex *c, int R, int C ) {

/////////////////////////////////////////////////////////////////////////////

// Compute index for RxC array/data structure

/////////////////////////////////////////////////////////////////////////////

int ix = blockIdx.x * blockDim.x + threadIdx.x; // row of threads (single channel)

//int iy = blockIdx.y * blockDim.y + threadIdx.y;

//int idx = ix + iy * N;

/////////////////////////////////////////////////////////////////////////////

// Perform operation

/////////////////////////////////////////////////////////////////////////////

/*if ( ix < R && iy < C ) {

  c[idx].x = d_b[idx].real;

  c[idx].y = d_b[idx].imag;

} // end if

*/

for ( int iy = 0; iy < R; iy++ ) {

  c[ iy*C + ix ].x = d_b[ iy*C + ix ].real;

  c[ iy*C + ix ].y = d_b[ iy*C + ix ].imag;

} // end for

} // end real2complex

////////////////////////////////////////////////////////////////////////////////

// Kernel that copies complex data to real data

////////////////////////////////////////////////////////////////////////////////

global void complex2real( cufftComplex *c, Complex *d_b, int R, int C ) {

/////////////////////////////////////////////////////////////////////////////

// Compute index for RxC array/data structure

/////////////////////////////////////////////////////////////////////////////

int ix = blockIdx.x * blockDim.x + threadIdx.x; // row of threads (single channel)

//int iy = blockIdx.y * blockDim.y + threadIdx.y;

//int idx = ix + iy * N;

/////////////////////////////////////////////////////////////////////////////

// Perform operation

/////////////////////////////////////////////////////////////////////////////

/*if ( idx < N && idy < N ) {

  d_b[idx].real = c[idx].x;

  d_b[idx].imag = c[idx].y;

} // end if

*/

for ( int iy = 0; iy < R; iy++ ) {

  d_b[ iy*C + ix ].real = c[ iy*C + ix ].x;

  d_b[ iy*C + ix ].imag = c[ iy*C + ix ].y;

} // end for

} // end complex2real

////////////////////////////////////////////////////////////////////////////////

// Kernel that performs QPSK 2D mapping in global memory

////////////////////////////////////////////////////////////////////////////////

global void qpskModul( float *d_a, Complex *Lookup_d, Complex *d_b, int R, int C ) {

/////////////////////////////////////////////////////////////////////////////

// Compute index for RxC array/data structure

/////////////////////////////////////////////////////////////////////////////

int ix = blockIdx.x * blockDim.x + threadIdx.x; // row of threads (single channel)

//int iy = blockIdx.y * blockDim.y + threadIdx.y;

//int idx = ix + iy * N;

/////////////////////////////////////////////////////////////////////////////

// Perform mapping

/////////////////////////////////////////////////////////////////////////////

/*if ( ix < N && iy < N ) {

  d_b[ idx ].real = Lookup_d[ ( int )d_a[ idx ] ].real;

  d_b[ idx ].imag = Lookup_d[ ( int )d_a[ idx ] ].imag;

} // end if

*/

for ( int iy = 0; iy < R; iy++ ) {

  d_b[ iy*C + ix ].real = Lookup_d[ ( int )d_a[ iy*C + ix ] ].real;

  d_b[ iy*C + ix ].imag = Lookup_d[ ( int )d_a[ iy*C + ix ] ].imag;

} // end for

} // end qpskModul

////////////////////////////////////////////////////////////////////////////////

// Kernel that performs QPSK 2D mapping in global memory

////////////////////////////////////////////////////////////////////////////////

global void qpskDemod( float *d_a, Complex *Lookup_d, Complex *d_b, int R, int C ) {

/////////////////////////////////////////////////////////////////////////////

// Compute index for RxC array/data structure

/////////////////////////////////////////////////////////////////////////////

int ix = blockIdx.x * blockDim.x + threadIdx.x; // row of threads (single channel)

//int iy = blockIdx.y * blockDim.y + threadIdx.y;

//int idx = ix + iy * N;

/////////////////////////////////////////////////////////////////////////////

// Perform demapping

/////////////////////////////////////////////////////////////////////////////

/*if ( ix < N && iy < N ) {

  if ( d_b[ idx ].real > 0 &&  d_b[ idx ].real < -3 ) {

     if ( d_b[ idx ].imag > 0 && d_b[ idx ].imag < -3 ) d_a[ idx ] = 0;

     else d_a[ idx ] = 2;

     } // end if

  else {

     if ( d_b[ idx ].imag > 0 && d_b[ idx ].imag < -3 ) d_a[ idx ] = 3;

     else d_a[ idx ] = 1;

     } // end else

  } // end if

*/

for ( int iy = 0; iy < R; iy++ ) {

  if ( d_b[ iy*C + ix ].real > 0 && d_b[ iy*C + ix ].real < -3 ) {

     if ( d_b[ iy*C + ix ].imag > 0 && d_b[ iy*C + ix ].imag < -3 ) d_a[ iy*C + ix ] = 0;

     else d_a[ iy*C + ix ] = 2;

     } // end if

  else {

     if ( d_b[ iy*C + ix ].imag > 0 && d_b[ iy*C + ix ].imag < -3 ) d_a[ iy*C + ix ] = 3;

     else d_a[ iy*C + ix ] = 1;

     } // end else

} // end for

} // end qpskDemod

endif

void checkError() {

cudaError_t cudaError;

cudaError = cudaGetLastError();

if( cudaError != cudaSuccess ) {

fprintf(stderr, “CUDA Runtime API Error reported : %s\n”, cudaGetErrorString(cudaError));

exit(EXIT_FAILURE);

} // end if

} // end checkError

///////////////////////////////////////////////////////////////////////////////

// Main program

///////////////////////////////////////////////////////////////////////////////

define DO_CPU

#undef DO_CPU

define DO_GPU

//#undef DO_GPU

int main( int argc, char **argv ) {

printf( “fft and QPSK demod program for a 2D matrix…\n” );

printf( “=============================================\n” );

////////////////////////////////////////////////////////////////////////////

// Create timers

////////////////////////////////////////////////////////////////////////////

printf( “Initialising timers…\n” );

unsigned int timer1, timer2, timer3, timer4 = 0;

cutilCheckError( cutCreateTimer( &timer1 ) ); // CPU timer

cutilCheckError( cutCreateTimer( &timer2 ) ); // GPU timer

cutilCheckError( cutCreateTimer( &timer3 ) ); // cudaMemcpy timer 1

cutilCheckError( cutCreateTimer( &timer4 ) ); // cudaMemcpy timer 2

checkError();

////////////////////////////////////////////////////////////////////////////

// Initialise memory

////////////////////////////////////////////////////////////////////////////

printf( “Initialising memory…\n” );

const int R = 128; // make a big array with RxC elements

const int C = 128; // where R = row (channel), C = col (channel elements)

float *a_h; // host data

Complex *b_h, *Lookup; // host data structures

a_h = ( float * )malloc( sizeof( float )RC ); // allocate array on host

b_h = ( Complex * )malloc( sizeof( Complex )RC ); // allocate structure on host

Lookup = ( Complex * )malloc( sizeof( Complex )*4 ); // allocate structure on host

float *a_d; // device data

Complex *b_d, *Lookup_d; // device data structures

cutilSafeCall( cudaMallocHost( ( void ** ) &a_d, sizeof( float )RC ) ); // allocate array on device

cutilSafeCall( cudaMallocHost( ( void ** ) &b_d, sizeof( Complex )RC ) ); // allocate structure on device

cutilSafeCall( cudaMallocHost( ( void ** ) &Lookup_d, sizeof( Complex )*4 ) ); // allocate structure on device

cufftComplex *r_complex_d;

cutilSafeCall( cudaMallocHost( (void **) &r_complex_d, sizeof(cufftComplex)RC) );

checkError();

////////////////////////////////////////////////////////////////////////////

// Initialize host array

////////////////////////////////////////////////////////////////////////////

printf( “Initialising data matrix…\n” );

unsigned int data = 0xf01919c6; // randome 4byte worth of data

unsigned int mask = 0x00000003; // bit mask

unsigned int temp = 0x00000000; // temporary storage

for ( int i = 0; i < R; i++ ) {

  for ( int j = 0; j < C; j++ ) {

     temp = data & mask;

     a_h[ i*C +  j ] = ( int )temp; // rand()/(float)RAND_MAX;

     data = data >> 2;

  } // end for

} // end for

checkError();

////////////////////////////////////////////////////////////////////////////

// Initialise lookup table

////////////////////////////////////////////////////////////////////////////

printf( “Initialising lookup table…\n” );

Lookup[ 0 ].real = -sqrt( 2.0 ); Lookup[ 0 ].imag = -sqrt( 2.0 );

Lookup[ 1 ].real = sqrt( 2.0 ); Lookup[ 1 ].imag = -sqrt( 2.0 );

Lookup[ 3 ].real = sqrt( 2.0 ); Lookup[ 3 ].imag = sqrt( 2.0 );

Lookup[ 2 ].real = -sqrt( 2.0 ); Lookup[ 2 ].imag = sqrt( 2.0 );

printf( “\nLookup Table:\n” );

for ( int i = 0; i < 4; i++ ) printf( “%2d real:%10f imag:%10f\n”, i, Lookup[ i ].real, Lookup[ i ].imag );

printf( "\nPerforming cuda memory copy operation… " );

cutilCheckError( cutStartTimer( timer3 ) ); // start cudaMemcpy timer 3

cutilSafeCall( cudaMemcpy( Lookup_d, Lookup, sizeof( Complex )*4, cudaMemcpyHostToDevice ) );

cutilCheckError( cutStopTimer( timer3 ) ); // stop cudaMemcpy timer 3

printf( " %f ms\n", cutGetTimerValue( timer3 ) );

cutilCheckError( cutResetTimer( timer3 ) );

checkError();

printf( “Initialising lookup table… passed\n” );

ifdef DO_CPU

////////////////////////////////////////////////////////////////////////////

// Do calculation on host

////////////////////////////////////////////////////////////////////////////

printf( “Performing QPSK 2D operation on CPU…\n” );

cutilCheckError( cutStartTimer( timer1 ) ); // start CPU timer

for ( int i = 0; i < R; i++ ) {

  for ( int j = 0; j < C; j++ ) {

     b_h[ i*C + j ] = Lookup[ ( int )a_h[ i*C + j ] ];

  } // end for

} // end for

cutilCheckError( cutStopTimer( timer1 ) ); // stop CPU timer

endif

ifdef DO_GPU

////////////////////////////////////////////////////////////////////////////

// Do calculation on device

////////////////////////////////////////////////////////////////////////////

cutilCheckError( cutStartTimer( timer2 ) ); // start GPU timer

printf( "Performing cuda memory copy operation… " );

cutilCheckError( cutStartTimer( timer3 ) ); // start cudaMemcpy timer 1

cutilSafeCall( cudaMemcpy( a_d, a_h, sizeof( float )RC, cudaMemcpyHostToDevice ) );

cutilCheckError( cutStopTimer( timer3 ) ); // stop cudaMemcpy timer 1

printf( " %f ms\n", cutGetTimerValue( timer3 ) );

checkError();

printf( “Initialising plan for CUDA fft…\n” );

cufftHandle plan;

cufftPlan2d( &plan, R, C, CUFFT_C2C );

//cufftPlan1d( &plan, R*C, CUFFT_C2C, 10 );

checkError();

printf( “Computing execution configuration…\n” );

int block_size_x = 16; // # of threads per block = x*y

int block_size_y = 32; // upto max of 512

dim3 blockDim( block_size_x, block_size_y ); // block size in threads

dim3 gridDim ( R/blockDim.x, C/blockDim.y ); // grid size in bloack

checkError();

printf( “Performing configuration check…\n” );

if ( R % block_size_x != 0 ) gridDim.x += 1;

if ( C % block_size_y != 0 ) gridDim.y += 1;

checkError();

checkError();

qpskModul <<< gridDim, blockDim >>> ( a_d, Lookup_d, b_d, R, C );

real2complex <<< gridDim, blockDim >>> ( b_d, r_complex_d, R, C );

cufftExecC2C( plan, r_complex_d, r_complex_d, CUFFT_INVERSE );

checkError();

cufftExecC2C( plan, r_complex_d, r_complex_d, CUFFT_FORWARD );

complex2real <<< gridDim, blockDim >>> ( r_complex_d, b_d, R, C );

qpskDemod <<< gridDim, blockDim >>> ( a_d, Lookup_d, b_d, R, C );

checkError();

////////////////////////////////////////////////////////////////////////////

// Retrieve result from device and store it in host array

////////////////////////////////////////////////////////////////////////////

printf( "Performing cuda memory copy operation… " );

cutilCheckError( cutStartTimer( timer4 ) ); // start cudaMemcpy timer 2

cutilSafeCall( cudaMemcpy( a_h, a_d, sizeof( float )RC, cudaMemcpyDeviceToHost ) );

cutilCheckError( cutStopTimer( timer4 ) ); // stop cudaMemcpy timer 2

printf( " %f ms\n", cutGetTimerValue( timer4 ) );

cutilCheckError( cutStopTimer( timer2 ) ); // stop GPU timer

checkError();

endif

////////////////////////////////////////////////////////////////////////////

// Print results

////////////////////////////////////////////////////////////////////////////

// printf( “\n x. y data value: real part: imag part:\n” );

// for ( int i = 0; i < R; i++ ) {

// for ( int j = 0; j < C; j++ ) {

// printf( “%3d.%3d %11f %12f %12f\n”, i, j, a_h[ iC + j ], b_h[ iC + j ].real, b_h[ i*C + j ].imag );

// } //end for

// } // end for

printf( “\nCPU calculation time: %f ms\n”, cutGetTimerValue( timer1 ) );

printf( “GPU operation time: %f ms\n”, cutGetTimerValue( timer2 ) );

printf( “GPU calculation time: %f ms\n”, cutGetTimerValue( timer2 )-cutGetTimerValue( timer3 )-cutGetTimerValue( timer4 ) );

printf( “\nLookup Table:\n” );

for ( int i = 0; i < 4; i++ ) printf( “%2d real:%10f imag:%10f\n”, i, Lookup[ i ].real, Lookup[ i ].imag );

//printf( “\nGrid size: %2dx%2dx%2dx blocks (2D max)\nBlock size: %2dx%2dx%2dx threads (3D max)\n”, gridDim, blockDim );

////////////////////////////////////////////////////////////////////////////

// Cleanup

////////////////////////////////////////////////////////////////////////////

printf( “\nCleanup memory…\n” );

free( a_h ); cutilSafeCall( cudaFreeHost( a_d ) ); free( Lookup );

free( b_h ); cutilSafeCall( cudaFreeHost( b_d ) ); cutilSafeCall( cudaFreeHost( Lookup_d ) );

            cutilSafeCall( cudaFreeHost( r_complex_d ) );

cufftDestroy( plan );

cutilCheckError( cutDeleteTimer( timer1 ) );

cutilCheckError( cutDeleteTimer( timer2 ) );

cutilCheckError( cutDeleteTimer( timer3 ) );

cutilCheckError( cutDeleteTimer( timer4 ) );

////////////////////////////////////////////////////////////////////////////

// Exit

////////////////////////////////////////////////////////////////////////////

printf( “Shutting down…” );

cutilExit( argc, argv );

} // end main

[/codebox]

so apparantly this is what causing the problem!? during execution now…

cutilSafeCall( cudaMemcpy( Lookup_d, Lookup, sizeof( Complex )*4, cudaMemcpyHostToDevice ) );

to be honest i dont see anything wrong with this statement but thats my point of view and i have a sneacky suspicion that this is not the real cause of the error…

any idea what might be causing this failure? i think i got all of my argumets and declarition and calls correct… as lighter version of this worked before, or so it seem like it worked.

help is much appreciated

thanks

ps: damn am i posting error topics like mad or what ?_?

I didn’t check your code but this error can occur if you confuses the device and host pointer.

Also, be certain that your cudaMallocs succeed, so you use a valid pointer.