hello ladies and gents
i have this code (program if you like):
[codebox]///////////////////////////////////////////////////////////////////////////////
// Name: qpsk2D.cu
// Copyright: xxx
// Author: xxx
// Date: xxx
// Description:
///////////////////////////////////////////////////////////////////////////////
include <stdlib.h>
include <stdio.h>
include <string.h>
include <math.h>
include <cuda.h>
include <cuda_runtime.h>
include <cutil.h>
include <cutil_inline.h>
//include “kernel_qpsk2D.cu”
//include “structures.h”
///////////////////////////////////////////////////////////////////////////////
// Name: structures.h
// Copyright: xxx
// Author: xxx
// Date: xxx
// Description: Declaration of global types
///////////////////////////////////////////////////////////////////////////////
#ifndef STRUCTURES_H
define STRUCTURES_H
///////////////////////////////////////////////////////////////////////////////
// Global types
///////////////////////////////////////////////////////////////////////////////
typedef struct {
float real;
float imag;
} Complex;
///////////////////////////////////////////////////////////////////////////////
// Name: kernel_qpsk2D.cu
// Copyright: xxx
// Author: xxx
// Date: xxx
// Description:
///////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Kernel that performs QPSK 2D mapping in global memory
////////////////////////////////////////////////////////////////////////////////
global void qpsk2DGlobal( float *d_a, Complex *Lookup_d, Complex *d_b, int N ) {
// compute index in NxN array
int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iy = blockIdx.y * blockDim.y + threadIdx.y;
// // create Lookup in shared memory
// float Lookup[ 4 ] = { -sqrt( 2.0 ), sqrt( 2.0 ), -sqrt( 2.0 ), sqrt( 2.0 ) };
// perform transformation
if ( ix < N && iy < N) {
int idx = ix + iy * N;
d_b[ idx ].real = Lookup_d[ d_a[ idx ] ].real;
d_b[ idx ].real = Lookup_d[ d_a[ idx ] ].imag;
} // end if
} // end qpsk2DGlobal
///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
define DO_CPU
//#undef DO_CPU
define DO_GPU
//#undef DO_GPU
int main ( int argc, char **argv ) {
printf( “QPSK program for a 2D matrix…\n” );
printf( “===============================\n” );
////////////////////////////////////////////////////////////////////////////
// Create timers
////////////////////////////////////////////////////////////////////////////
printf( “Initialising timers…\n” );
unsigned int timer1, timer2, timer3, timer4 = 0;
cutCreateTimer( &timer1 ); // CPU timer
cutCreateTimer( &timer2 ); // GPU timer
cutCreateTimer( &timer3 ); // cudaMemcpy timer 1
cutCreateTimer( &timer4 ); // cudaMemcpy timer 2
////////////////////////////////////////////////////////////////////////////
// Initialise memory
////////////////////////////////////////////////////////////////////////////
printf( “Initialising memory…\n” );
const int N = 8; // make a big array with N elements
// float a_h[ N ][ N ]; // host data
// Complex b_h[ N ][ N ];
float *a_h[ N ]; // host data
Complex *b_h[ N ], *Lookup; // host data structure
a_h = ( float * )malloc( sizeof( float )NN ); // allocate array on host
b_h = ( Complex * )malloc( sizeof( Complex )NN ); // allocate array on host
Lookup = ( Complex * )malloc( sizeof( Complex )*4 );
// float *a_d, *b_d; // device data
float *a_d; // device data
Complex *b_d, *Lookup_d; // device data structure
cudaMallocHost( ( void ** ) &a_d, sizeof( float )NN ); // allocate array on device
cudaMallocHost( ( void ** ) &b_d, sizeof( Complex )NN ); // allocate array on device
cudaMallocHost( ( void ** ) &Lookup_d, sizeof( Complex )*4 ); // allocate array on device
////////////////////////////////////////////////////////////////////////////
// Initialize host array
////////////////////////////////////////////////////////////////////////////
printf( “Initiaising data matrix…\n” );
unsigned int data = 0xf01919c6; // randome 4byte worth of data
unsigned int mask = 0x00000003; // bit mask
unsigned int temp = 0x00000000; // temporary storage
for ( int i = 0; i < N; i++ ) {
for ( int j = 0; j < N; j++ ) {
temp = data & mask;
a_h[ i ][ j ] = ( int )temp;
data = data >> 2;
} // end for
} // end for
////////////////////////////////////////////////////////////////////////////
// Initialise lookup table
////////////////////////////////////////////////////////////////////////////
Lookup[ 0 ].real = -sqrt( 2.0 ); Lookup[ 0 ].imag = -sqrt( 2.0 );
Lookup[ 1 ].real = sqrt( 2.0 ); Lookup[ 1 ].imag = -sqrt( 2.0 );
Lookup[ 3 ].real = sqrt( 2.0 ); Lookup[ 3 ].imag = sqrt( 2.0 );
Lookup[ 2 ].real = -sqrt( 2.0 ); Lookup[ 2 ].imag = sqrt( 2.0 );
cudaMemcpy( Lookup_d, Lookup, sizeof( Complex )*4, cudaMemcpyHostToDevice );
// float Lookup[ 4 ] = { -sqrt( 2.0 ), sqrt( 2.0 ), -sqrt( 2.0 ), sqrt( 2.0 ) };
ifdef DO_CPU
////////////////////////////////////////////////////////////////////////////
// Do calculation on host
////////////////////////////////////////////////////////////////////////////
printf( “Performing QPSK 2D operation on CPU…\n” );
cutStartTimer( timer1 ); // start CPU timer
for ( int i = 0; i < N; i++ ) {
for ( int j = 0; j < N; j++ ) {
b_h[ i ][ j ] = Lookup[ ( int )a_h[ i ][ j ] ];
} // end for
} // end for
cutStopTimer( timer1 ); // stop CPU timer
ifdef DO_GPU
////////////////////////////////////////////////////////////////////////////
// Do calculation on device
////////////////////////////////////////////////////////////////////////////
printf( “Performing QPSK 2D operation on GPU…\n” );
dim3 dimBlock( 2, 2 ); // # of threads per 2D block
//int n_blocks = N / block_size + ( N%block_size == 0 ? 0:1 ); // # of blocks per 1D grid
dim3 dimGrid( 4, 4 ); // # of blocks per 2D grid
cutStartTimer( timer2 ); // start GPU timer
printf( "Performing cuda memory copy operation… " );
cutStartTimer( timer3 ); // start cudaMemcpy timer 1
cudaMemcpy( a_d, a_h, sizeof( float )NN, cudaMemcpyHostToDevice );
cutStopTimer( timer3 ); // stop cudaMemcpy timer 1
printf( " %f ms\n", cutGetTimerValue( timer3 ) );
//qpsk <<< n_blocks, block_size >>> ( a_d, N );
qpsk2DGlobal <<< dimGrid, dimBlock >>> ( a_d, Lookup_d, b_d, N );
//qpsk2DShared <<< dim3( 16, 16 ), block_size, 4 * sizeof( float ) >>> ( a_d, N );
////////////////////////////////////////////////////////////////////////////
// Retrieve result from device and store it in host array
////////////////////////////////////////////////////////////////////////////
printf( "Performing memory copy operation… " );
cutStartTimer( timer4 ); // start cudaMemcpy timer 2
cudaMemcpy( b_h, a_d, sizeof( Complex )NN, cudaMemcpyDeviceToHost );
cutStopTimer( timer4 ); // stop cudaMemcpy timer 2
printf( " %f ms\n", cutGetTimerValue( timer4 ) );
cutStopTimer( timer2 ); // stop GPU timer
////////////////////////////////////////////////////////////////////////////
// Print results
////////////////////////////////////////////////////////////////////////////
for ( int i = 0; i < N; i++ ) {
for ( int j = 0; j < N; j++ ) {
printf( "%2d.%2d %10f real:%10f imag:%10f\n", i, j, a_h[ i ][ j ], b_h[ i ][ j ].real, b_h[ i ][ j ].imag );
} //end for
} // end for
printf( “CPU calculation time: %f ms\n”, cutGetTimerValue( timer1 ) );
printf( “GPU operation time: %f ms\n”, cutGetTimerValue( timer2 ) );
printf( “GPU calculation time: %f ms\n”, cutGetTimerValue( timer2 )-cutGetTimerValue( timer3 )-cutGetTimerValue( timer4 ) );
printf( “Lookup Table:\n” );
// for ( int i = 0; i < 4; i++ ) printf( “%d %10f\n”, i, Lookup[ i ] );
for ( int i = 0; i < N; i++ ) printf( “%2d real:%10f imag:%10f\n”, i, Lookup[ i ].real, Lookup[ i ].imag );
//printf( “# blocks: %d\n# threads per block: %d\n”, n_blocks, block_size );
////////////////////////////////////////////////////////////////////////////
// Cleanup
////////////////////////////////////////////////////////////////////////////
printf( “Cleanup memory…\n” );
free( a_h ); cudaFreeHost( a_d ); free( Lookup );
free( b_h ); cudaFreeHost( b_d ); cudaFreeHost( Lookup_d );
////////////////////////////////////////////////////////////////////////////
// Exit
////////////////////////////////////////////////////////////////////////////
printf( “Shutting down…\n” );
cutilExit( argc, argv );
} // end main
[/codebox]
which is giving me a headech:
i know you cant be bother counting lines and to be honest with you i dont know how to insert line numbers so here goes:
068: d_b[ idx ].real = Lookup_d[ d_a[ idx ] ].real;
069: d_b[ idx ].real = Lookup_d[ d_a[ idx ] ].imag;
106: float *a_h[ N ]; // host data
107: Complex *b_h[ N ], *Lookup; // host data structure
[list=1]
[*]i am pretty sure that thouse of you who are a bit better at programming are going to say to me that 106 & 107 is the result of my stupidity (inexprerience). so perhaps someone can show me how it is supposed to be done.
[*]i dont understand why lines 68 & 69 are giving me the trouble? so perhaps someone with a bit more knowledge can help me get this thing running, as i have been strugling with this code for about a week now, i managed to reduce errors to lines 68 & 69 but thats about how far my programming skills got me for now…
ps: dont mind my bad grammar and spelling
pss: all comments are WELCOME
psss: help is especially WELCOME
regards
:wacko: