hi every one
i’ve been messing with cuda for about 7 weeks now and i have this sneaky suspition that i still not know much about it. so perhaps you guys can help me out a bit or at least point me in the right direction.
///////////////////////////////////////////////////////////////////////////////
// Name: qpsk1D.cu
// Copyright: NUI Maynooth
// Author: xxx
// Date: 03/07/09
// Description: This progam translates a bit stream into symbols via lookup
// table. Assummin grey coded symbols where:
// 00 is -sqrt(2), -j sqrt(2)
///////////////////////////////////////////////////////////////////////////////
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <cutil_inline.h>
//#include "kernel_qpsk2D.cu"
//#include "structures.h"
///////////////////////////////////////////////////////////////////////////////
// Name: structures.h
// Copyright: NUI Maynooth
// Author: xxx
// Date: 17/07/09
// Description: Declaration of global types
///////////////////////////////////////////////////////////////////////////////
#ifndef STRUCTURES_H
#define STRUCTURES_H
///////////////////////////////////////////////////////////////////////////////
// Global types
///////////////////////////////////////////////////////////////////////////////
typedef struct {
float real;
float imag;
} Complex;
#endif
///////////////////////////////////////////////////////////////////////////////
// Name: kernel_qpsk1D.cu
// Copyright: NUI Maynooth
// Author: xxx
// Date: 03/07/09
// Description: Definition of QPSK 1D kernels
///////////////////////////////////////////////////////////////////////////////
#ifndef KERNEL_QPSK1D_CU
#define KERNEL_QPSK1D_CU
////////////////////////////////////////////////////////////////////////////////
// Global types
////////////////////////////////////////////////////////////////////////////////
//#include "structures.h"
////////////////////////////////////////////////////////////////////////////////
// Kernel that performs QPSK 1D mapping in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void qpskGlobal( float *d_a, Complex *Lookup, Complex *d_b, int N ) {
/////////////////////////////////////////////////////////////////////////////
// Compute index for N array/data structure
/////////////////////////////////////////////////////////////////////////////
int idx = blockIdx.x * blockDim.x + threadIdx.x;
/////////////////////////////////////////////////////////////////////////////
// Perform transformation
/////////////////////////////////////////////////////////////////////////////
if ( idx < N ) {
d_b[ idx ].real = Lookup[ ( int )d_a[ idx ] ].real;
d_b[ idx ].imag = Lookup[ ( int )d_a[ idx ] ].imag;
} //end if
} // end qpskGlobal
////////////////////////////////////////////////////////////////////////////////
// Kernel that performs QPSK 1D mapping in shared memory
////////////////////////////////////////////////////////////////////////////////
__global__ void qpskShared( float *d_a, Complex *Lookup, Complex *d_b, int N ) {
/////////////////////////////////////////////////////////////////////////////
// Compute index for N array/data structure
/////////////////////////////////////////////////////////////////////////////
int idx = blockIdx.x * blockDim.x + threadIdx.x;
/////////////////////////////////////////////////////////////////////////////
// Initialise lookup table in shared memory
/////////////////////////////////////////////////////////////////////////////
extern __shared__ Complex Table[];
for ( int i = 0; i < 4; i++ ) Table[ i ] = Lookup[ i ];
__syncthreads();
/////////////////////////////////////////////////////////////////////////////
// Perform transformation
/////////////////////////////////////////////////////////////////////////////
if ( idx < N ) {
d_b[ idx ].real = Table[ ( int )d_a[ idx ] ].real;
d_b[ idx ].imag = Table[ ( int )d_a[ idx ] ].imag;
} // end if
} // end qpskShared
#endif
///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
#define DO_CPU
//#undef DO_CPU
#define DO_GPU
//#undef DO_GPU
int main( int argc, char **argv ) {
printf( "QPSK program for a 1D matrix...\n" );
printf( "===============================\n" );
////////////////////////////////////////////////////////////////////////////
// Create timers
////////////////////////////////////////////////////////////////////////////
printf( "Initialising timers...\n" );
unsigned int timer1, timer2, timer3, timer4 = 0;
cutCreateTimer( &timer1 ); // CPU timer
cutCreateTimer( &timer2 ); // GPU timer
cutCreateTimer( &timer3 ); // cudaMemcpy timer 1
cutCreateTimer( &timer4 ); // cudaMemcpy timer 2
////////////////////////////////////////////////////////////////////////////
// Initialise memory
////////////////////////////////////////////////////////////////////////////
printf( "Initialising memory...\n" );
const int N = 20000; // make a big array with NxN elements
float *a_h; // host data
Complex *b_h, *Lookup; // host data structures
a_h = ( float * )malloc( sizeof( float )*N ); // allocate array on host
b_h = ( Complex * )malloc( sizeof( Complex )*N ); // allocate structure on host
Lookup = ( Complex * )malloc( sizeof( Complex )*4 ); // allocate structure on host
float *a_d; // device data
Complex *b_d, *Lookup_d; // device data structures
cudaMallocHost( ( void ** ) &a_d, sizeof( float )*N ); // allocate array on device
cudaMallocHost( ( void ** ) &b_d, sizeof( Complex )*N ); // allocate structure on device
cudaMallocHost( ( void ** ) &Lookup_d, sizeof( Complex )*4 ); // allocate structure on device
////////////////////////////////////////////////////////////////////////////
// Initialize host array
////////////////////////////////////////////////////////////////////////////
printf( "Initiaising data matrix...\n" );
unsigned int data = 0xf01919c6; // randome 4byte worth of data
unsigned int mask = 0x00000003; // bit mask
unsigned int temp = 0x00000000; // temporary storage
for ( int i = 0; i < N; i++ ) {
temp = data & mask;
a_h[ i ] = ( int )temp;
data = data >> 2;
} // end for
////////////////////////////////////////////////////////////////////////////
// Initialise lookup table
////////////////////////////////////////////////////////////////////////////
printf( "Initialising lookup table..\n" );
Lookup[ 0 ].real = -sqrt( 2.0 ); Lookup[ 0 ].imag = -sqrt( 2.0 );
Lookup[ 1 ].real = sqrt( 2.0 ); Lookup[ 1 ].imag = -sqrt( 2.0 );
Lookup[ 3 ].real = sqrt( 2.0 ); Lookup[ 3 ].imag = sqrt( 2.0 );
Lookup[ 2 ].real = -sqrt( 2.0 ); Lookup[ 2 ].imag = sqrt( 2.0 );
printf( "\nPerforming cudaMemcpyHostToDevice operation... " );
cutStartTimer( timer3 ); // start cudaMemcpy timer 3
cudaMemcpy( Lookup_d, Lookup, sizeof( Complex )*4, cudaMemcpyHostToDevice );
cutStopTimer( timer3 ); // stop cudaMemcpy timer 3
printf( " %f ms\n", cutGetTimerValue( timer3 ) );
cutResetTimer( timer3 );
#ifdef DO_CPU
////////////////////////////////////////////////////////////////////////////
// Do calculation on host
////////////////////////////////////////////////////////////////////////////
printf( "Performing QPSK 1D operation on CPU...\n" );
cutStartTimer( timer1 ); // start CPU timer
for ( int i = 0; i < N; i++ ) b_h[ i ] = Lookup[ ( int )a_h[ i ] ];
cutStopTimer( timer1 ); // stop CPU timer
#endif
#ifdef DO_GPU
////////////////////////////////////////////////////////////////////////////
// Do calculation on device
////////////////////////////////////////////////////////////////////////////
printf( "Performing QPSK 1D operation on GPU...\n" );
int blockDim = 256; // block size in threads
int gridDim = N/blockDim + ( N%blockDim == 0 ? 0:1 ); // grid size in bloack
cutStartTimer( timer2 ); // start GPU timer
printf( "Performing cudaMemcpyHostToDevice operation... " );
cutStartTimer( timer3 ); // start cudaMemcpy timer 1
cudaMemcpy( a_d, a_h, sizeof( float )*N, cudaMemcpyHostToDevice );
cutStopTimer( timer3 ); // stop cudaMemcpy timer 1
printf( " %f ms\n", cutGetTimerValue( timer3 ) );
qpskGlobal <<< gridDim, blockDim >>> ( a_d, Lookup_d, b_d, N );
//qpskShared <<< gridDim, blockDim, sizeof( float )*4 >>> ( a_d, Lookup_d, b_d, N );
////////////////////////////////////////////////////////////////////////////
// Retrieve result from device and store it in host array
////////////////////////////////////////////////////////////////////////////
printf( "Performing cudaMemcpyDeviceToHost operation... " );
cutStartTimer( timer4 ); // start cudaMemcpy timer 2
cudaMemcpy( b_h, a_d, sizeof( Complex )*N, cudaMemcpyDeviceToHost );
cutStopTimer( timer4 ); // stop cudaMemcpy timer 2
printf( " %f ms\n", cutGetTimerValue( timer4 ) );
cutStopTimer( timer2 ); // stop GPU timer
#endif
////////////////////////////////////////////////////////////////////////////
// Print results
////////////////////////////////////////////////////////////////////////////
// printf( "\n x. data value: real part: imag part:\n" );
// for ( int i = 0; i< N; i++ ) printf( "%3d %11f %12f %12f\n", i, a_h[ i ], b_h[ i ].real, b_h[ i ].imag );
printf( "\nCPU calculation time: %f ms\n", cutGetTimerValue( timer1 ) );
printf( "GPU operation time: %f ms\n", cutGetTimerValue( timer2 ) );
printf( "GPU calculation time: %f ms\n", cutGetTimerValue( timer2 )-cutGetTimerValue( timer3 )-cutGetTimerValue( timer4 ) );
printf( "\nLookup Table:\n" );
for ( int i = 0; i < 4; i++ ) printf( "%2d real:%10f imag:%10f\n", i, Lookup[ i ].real, Lookup[ i ].imag );
printf( "\nGrid size: %3d blocks (2D max)\nBlock size: %3d threads (3D max)\n", gridDim, blockDim );
////////////////////////////////////////////////////////////////////////////
// Cleanup
////////////////////////////////////////////////////////////////////////////
printf( "\nCleanup memory...\n" );
free( a_h ); cudaFreeHost( a_d ); free( Lookup );
free( b_h ); cudaFreeHost( b_d ); cudaFreeHost( Lookup_d );
////////////////////////////////////////////////////////////////////////////
// Exit
////////////////////////////////////////////////////////////////////////////
printf( "Shutting down..." );
cutilExit( argc, argv );
} // end main
righ this code performs a qpsk lookup (for no particular reason yet).
and i have a few question about it:
1: why is global memory version runs faster than shared memory?
2: is it a good idea to put lookup table into shared memory? or maybe i should use constatn memory instead?
3: is there any way to get this code to run faster?
4: in case any one wonders why cudaMallocHost instead of cudaMalloc, the reason is simple code runs faster.
- “N” was chosen to approximately match the time of execution between CPU and GPU global memory.
just to give you a flavour of what the execution looks like.