Hi pasoleatis.
The files are the same as the ones in the CUDA Toolkit. I tried the MC_SingleAsianOptionP example. Here is the code (note: the forum paste has stripped the template parameter lists, the double underscores from __global__/__device__/__shared__, and some #include names):
%%%%%%%%%%%%% asianoption.h
#ifndef ASIANOPTION_H
#define ASIANOPTION_H
// Parameters and result slot for a single average-price Asian option.
// Real is the floating-point precision (float or double).
template <typename Real>
struct AsianOption
{
    enum CallPut {Call, Put};

    // Parameters
    Real spot;      // spot price of the underlying
    Real strike;    // strike price
    Real r;         // risk-free rate
    Real sigma;     // volatility
    Real tenor;     // time to expiry (years)
    Real dt;        // averaging/timestep interval (years)

    // Value
    Real golden;    // expected (reference) value for validation
    Real value;     // computed value, filled in by the pricing engine

    // Option type
    CallPut type;
};
#endif
%%%%%%%%%%%%% cudasharedmem.h
#ifndef CUDASHAREDMEM_H
#define CUDASHAREDMEM_H
//****************************************************************************
// Because dynamically sized shared memory arrays are declared "extern",
// we can't templatize them directly. To get around this, we declare a
// simple wrapper struct that will declare the extern array with a different
// name depending on the type. This avoids compiler errors about duplicate
// definitions.
//
// To use dynamically allocated shared memory in a templatized __global__ or
// __device__ function, just replace code like this:
//      template<class T>
//      __global__ void
//      foo( T* g_idata, T* g_odata)
//      {
//          // Shared mem size is determined by the host app at run time
//          extern __shared__ T sdata[];
//          ...
//          x = sdata[i];
//          sdata[i] = x;
//          ...
//      }
//
// With this:
//      template<class T>
//      __global__ void
//      foo( T* g_idata, T* g_odata)
//      {
//          // Shared mem size is determined by the host app at run time
//          SharedMemory<T> sdata;
//          ...
//          x = sdata[i];
//          sdata[i] = x;
//          ...
//      }
//****************************************************************************
// This is the un-specialized struct. Note that we prevent instantiation of this
// struct by making it abstract (i.e. with pure virtual methods).
// Only the explicit specializations generated by BUILD_SHAREDMEMORY_TYPE
// below can be used.
template <typename T>
struct SharedMemory
{
    // Ensure that we won't compile any un-specialized types
    __device__ virtual T &operator*() = 0;
    __device__ virtual T &operator[](int i) = 0;
};
// Generate a SharedMemory specialization for type t whose extern shared
// array is named n. Each specialization needs a distinct array name, which
// is why the name is a macro parameter. The backslash line continuations
// are required for the multi-line macro definition.
#define BUILD_SHAREDMEMORY_TYPE(t, n) \
    template <> \
    struct SharedMemory<t> \
    { \
        __device__ t &operator*() { extern __shared__ t n[]; return *n; } \
        __device__ t &operator[](int i) { extern __shared__ t n[]; return n[i]; } \
    }

BUILD_SHAREDMEMORY_TYPE(int, s_int);
BUILD_SHAREDMEMORY_TYPE(unsigned int, s_uint);
BUILD_SHAREDMEMORY_TYPE(char, s_char);
BUILD_SHAREDMEMORY_TYPE(unsigned char, s_uchar);
BUILD_SHAREDMEMORY_TYPE(short, s_short);
BUILD_SHAREDMEMORY_TYPE(unsigned short, s_ushort);
BUILD_SHAREDMEMORY_TYPE(long, s_long);
BUILD_SHAREDMEMORY_TYPE(unsigned long, s_ulong);
BUILD_SHAREDMEMORY_TYPE(bool, s_bool);
BUILD_SHAREDMEMORY_TYPE(float, s_float);
BUILD_SHAREDMEMORY_TYPE(double, s_double);
#endif
%%%%%%%%%%%%% pricingengine.h
#ifndef PRICINGENGINE_H
#define PRICINGENGINE_H
#include "asianoption.h"
template
class PricingEngine
{
public:
PricingEngine(unsigned int numSims, unsigned int device, unsigned int threadBlockSize, unsigned int seed);
void operator()(AsianOption &option);
private:
unsigned int m_seed;
unsigned int m_numSims;
unsigned int m_device;
unsigned int m_threadBlockSize;
};
#endif
%%%%%%%%%%%%% test.h
#ifndef TEST_H
#define TEST_H
// Holds the configuration and results of one test run at precision Real.
// operator() (defined in test.cpp) executes the test.
template <typename Real>
struct Test
{
    Test() : pass(false) {};

    int device;                    // CUDA device ordinal to use
    unsigned int numSims;          // number of Monte Carlo simulations
    unsigned int threadBlockSize;  // threads per block
    unsigned int seed;             // PRNG seed
    bool pass;                     // set by operator(): result within tolerance
    double elapsedTime;            // set by operator(): wall time in seconds

    bool operator()();
};
// Defaults are arbitrary to give sensible runtime
#define k_sims_min 100000
#define k_sims_max 1000000
#define k_sims_def 100000
#define k_sims_qa 100000
#define k_bsize_min 32
#define k_bsize_def 128
#define k_bsize_qa 128
#define k_seed_def 1234
#define k_seed_qa 1234
#endif
%%%%%%%%%%%%% main.cpp
///////////////////////////////////////////////////////////////////////////////
// Monte Carlo: Single Asian Option
// ================================
//
// This sample demonstrates pricing a single European-exercise average-price
// Asian option (arithmetic discrete average).
//
// This file, main.cpp, contains the setup information to run the test, for
// example parsing the command line and integrating this sample with the
// samples framework. As such it is perhaps less interesting than the guts of
// the sample. Readers wishing to skip the clutter are advised to skip straight
// to Test.operator() in test.cpp.
///////////////////////////////////////////////////////////////////////////////
#include <QtCore/QCoreApplication>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <stdexcept>
#include <iostream>
#include <iomanip>
#include <cuda_runtime.h>
//#include <helper_timer.h>
//#include <helper_cuda.h>   // NOTE: declares checkCmdLineFlag/getCmdLineArgumentString/gpuGetMaxGflopsDeviceId used below
#include "test.h"
// Forward declarations
void showHelp(const int argc, const char **argv);
template <typename Real>
void runTest(int argc, const char **argv);
// Program entry point: parses the requested precision from the command line
// and dispatches to runTest<float> or runTest<double>.
// NOTE(review): checkCmdLineFlag/getCmdLineArgumentString come from the CUDA
// samples' helper_cuda.h, whose #include is commented out above — confirm
// they are declared elsewhere in the build.
int main(int argc, char **argv)
{
    using std::invalid_argument;
    using std::string;

    QCoreApplication a(argc, argv);

    // Open the log file
    printf("Monte Carlo Single Asian Option (with PRNG)\n");
    printf("===========================================\n\n");

    // If help flag is set, display help and exit immediately
    if (checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        printf("Displaying help on console\n");
        showHelp(argc, (const char **)argv);
        exit(EXIT_SUCCESS);
    }

    // Check the precision (checked against the device capability later)
    try
    {
        char *value;

        if (getCmdLineArgumentString(argc, (const char **)argv, "precision", &value))
        {
            // Check requested precision is valid; accept the value with or
            // without surrounding literal quote characters
            string prec(value);

            if (prec.compare("single") == 0 || prec.compare("\"single\"") == 0)
            {
                runTest<float>(argc, (const char **)argv);
            }
            else if (prec.compare("double") == 0 || prec.compare("\"double\"") == 0)
            {
                runTest<double>(argc, (const char **)argv);
            }
            else
            {
                printf("specified precision (%s) is invalid, must be \"single\" or \"double\".\n", value);
                throw invalid_argument("precision");
            }
        }
        else
        {
            // Default to single precision when no --precision flag is given
            runTest<float>(argc, (const char **)argv);
        }
    }
    catch (invalid_argument &e)
    {
        printf("invalid command line argument (%s)\n", e.what());
    }

    // NOTE(review): a.exec() enters the Qt event loop; with no Qt event
    // sources the process may not exit on its own — the original sample
    // ended with exit(EXIT_SUCCESS). Confirm the event loop is intended.
    return a.exec();
    // Finish
    //exit(EXIT_SUCCESS);
}
template
void runTest(int argc, const char **argv)
{
using std::invalid_argument;
using std::runtime_error;
try
{
Test<Real> test;
int deviceCount = 0;
cudaError_t cudaResult = cudaSuccess;
// by default specify GPU Device == 0
test.device = 0;
// Get number of available devices
cudaResult = cudaGetDeviceCount(&deviceCount);
if (cudaResult != cudaSuccess)
{
printf("could not get device count.\n");
throw runtime_error("cudaGetDeviceCount");
}
// (default parameters)
test.numSims = k_sims_qa;
test.threadBlockSize = k_bsize_qa;
test.seed = k_seed_qa;
{
char *value = 0;
if (getCmdLineArgumentString(argc, argv, "device", &value))
{
test.device = (int)atoi(value);
if (test.device >= deviceCount)
{
printf("invalid target device specified on command line (device %d does not exist).\n", test.device);
throw invalid_argument("device");
}
}
else
{
test.device = gpuGetMaxGflopsDeviceId();
}
if (getCmdLineArgumentString(argc, argv, "sims", &value))
{
test.numSims = (unsigned int)atoi(value);
if (test.numSims < k_sims_min || test.numSims > k_sims_max)
{
printf("specified number of simulations (%d) is invalid, must be between %d and %d.\n", test.numSims, k_sims_min, k_sims_max);
throw invalid_argument("sims");
}
}
else
{
test.numSims = k_sims_def;
}
if (getCmdLineArgumentString(argc, argv, "block-size", &value))
{
// Determine max threads per block
cudaDeviceProp deviceProperties;
cudaResult = cudaGetDeviceProperties(&deviceProperties, test.device);
if (cudaResult != cudaSuccess)
{
printf("cound not get device properties for device %d.\n", test.device);
throw runtime_error("cudaGetDeviceProperties");
}
// Check requested size is valid
test.threadBlockSize = (unsigned int)atoi(value);
if (test.threadBlockSize < k_bsize_min || test.threadBlockSize > static_cast<unsigned int>(deviceProperties.maxThreadsPerBlock))
{
printf("specified block size (%d) is invalid, must be between %d and %d for device %d.\n", test.threadBlockSize, k_bsize_min, deviceProperties.maxThreadsPerBlock, test.device);
throw invalid_argument("block-size");
}
if (test.threadBlockSize & test.threadBlockSize-1)
{
printf("specified block size (%d) is invalid, must be a power of two (see reduction function).\n", test.threadBlockSize);
throw invalid_argument("block-size");
}
}
else
{
test.threadBlockSize = k_bsize_def;
}
if (getCmdLineArgumentString(argc, argv, "seed", &value))
{
// Check requested seed is valid
test.seed = (unsigned int)atoi(value);
if (test.seed == 0)
{
printf("specified seed (%d) is invalid, must be non-zero.\n", test.seed);
throw invalid_argument("seed");
}
}
else
{
test.seed = k_seed_def;
}
}
// Execute
test();
}
catch (invalid_argument &e)
{
printf("invalid command line argument (%s)\n", e.what());
}
catch (runtime_error &e)
{
printf("runtime error (%s)\n", e.what());
}
}
// Print command-line usage for this sample to stdout.
void showHelp(int argc, const char **argv)
{
    using std::cout;
    using std::endl;
    using std::left;
    using std::setw;

    // Show the executable name when it is available
    if (argc > 0)
    {
        cout << endl << argv[0] << endl;
    }

    cout << endl << "Syntax:" << endl;
    cout << left;

    // One aligned "--flag  description" row; relies on the sticky `left`
    // flag set on cout above
    auto row = [](const char *flag, const char *description)
    {
        std::cout << " " << std::setw(20) << flag << description << std::endl;
    };

    row("--device=<device>", "Specify device to use for execution");
    row("--sims=<N>", "Specify number of Monte Carlo simulations");
    row("--block-size=<N>", "Specify number of threads per block");
    row("--seed=<N>", "Specify the seed to use for the random number generator");
    row("--precision=<P>", "Specify the precision (\"single\" or \"double\")");
    cout << endl;
    row("--noprompt", "Skip prompt before exit");
    cout << endl;
}
%%%%%%%%%%%%% test.cpp
#include "test.h"
#include <string>
#include <sstream>
#include <iomanip>
#include <iostream>
#include <stdexcept>
#include <typeinfo>
#include <cassert>
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>
//#include <helper_timer.h>   // NOTE: declares StopWatchInterface/sdk*Timer used below
#include "asianoption.h"
#include "pricingengine.h"
template
bool Test::operator()()
{
using std::stringstream;
using std::endl;
using std::setw;
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
// Get device properties
struct cudaDeviceProp deviceProperties;
cudaError_t cudaResult = cudaGetDeviceProperties(&deviceProperties, device);
if (cudaResult != cudaSuccess)
{
std::string msg("Could not get device properties: ");
msg += cudaGetErrorString(cudaResult);
throw std::runtime_error(msg);
}
// This test prices a single Asian call option with European
// exercise, with the priced averaged arithmeticly on discrete
// trading days (weekdays).
AsianOption<Real> option;
option.spot = static_cast<Real>(40);
option.strike = static_cast<Real>(35);
option.r = static_cast<Real>(0.03);
option.sigma = static_cast<Real>(0.20);
option.tenor = static_cast<Real>(1.0/3.0);
option.dt = static_cast<Real>(1.0/261);
option.type = AsianOption<Real>::Call;
option.value = static_cast<Real>(0.0);
option.golden = static_cast<Real>(5.162534);
// Evaluate on GPU
printf("Pricing option on GPU (%s)\n\n", deviceProperties.name);
PricingEngine<Real> pricer(numSims, device, threadBlockSize, seed);
sdkStartTimer(&timer);
pricer(option);
sdkStopTimer(&timer);
elapsedTime = sdkGetAverageTimerValue(&timer)/1000.0f;
// Tolerance to compare result with expected
// This is just to check that nothing has gone very wrong with the
// test, the actual accuracy of the result depends on the number of
// Monte Carlo trials
const Real tolerance = static_cast<Real>(0.1);
// Display results
stringstream output;
output << "Precision: " << ((typeid(Real) == typeid(double)) ? "double" : "single") << endl;
output << "Number of sims: " << numSims << endl;
output << endl;
output << " Spot | Strike | r | sigma | tenor | Call/Put | Value | Expected |" << endl;
output << "-----------|------------|------------|------------|------------|------------|------------|------------|" << endl;
output << setw(10) << option.spot << " | ";
output << setw(10) << option.strike << " | ";
output << setw(10) << option.r << " | ";
output << setw(10) << option.sigma << " | ";
output << setw(10) << option.tenor << " | ";
output << setw(10) << (option.type == AsianOption<Real>::Call ? "Call" : "Put") << " | ";
output << setw(10) << option.value << " | ";
output << setw(10) << option.golden << " |";
printf("%s\n\n", output.str().c_str());
// Check result
if (fabs(option.value - option.golden) > tolerance)
{
printf("computed result (%e) does not match expected result (%e).\n", option.value, option.golden);
pass = false;
}
else
{
pass = true;
}
// Print results
printf("MonteCarloSingleAsianOptionP, Performance = %.2f sims/s, Time = %.2f(ms), NumDevsUsed = %u, Blocksize = %u\n",
numSims / elapsedTime, elapsedTime*1000.0f, 1, threadBlockSize);
sdkDeleteTimer(&timer);
return pass;
}
// Explicit template instantiation
template struct Test;
template struct Test;
%%%%%%%%%%%%% pricingengine.cu
#include "pricingengine.h"
#include <string>
#include <vector>
#include <numeric>
#include <stdexcept>
#include <typeinfo>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include "asianoption.h"
#include "cudasharedmem.h"
using std::string;
using std::vector;
// RNG init kernel: initialise one curandState per thread from a common seed
// with a per-thread subsequence. The host allocates gridDim.x * blockDim.x
// states, so every launched thread has a slot and no bounds check is needed.
__global__ void initRNG(curandState *const rngStates,
                        const unsigned int seed)
{
    // Determine thread ID
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Initialise the RNG
    curand_init(seed, tid, 0, &rngStates[tid]);
}
// One multiplicative step of a geometric Brownian motion path
// (single precision): exp(drift + diffusion * N(0,1)).
__device__ inline float getPathStep(float &drift, float &diffusion, curandState &state)
{
    return expf(drift + diffusion * curand_normal(&state));
}
// One multiplicative step of a geometric Brownian motion path
// (double precision): exp(drift + diffusion * N(0,1)).
__device__ inline double getPathStep(double &drift, double &diffusion, curandState &state)
{
    return exp(drift + diffusion * curand_normal_double(&state));
}
// Path generation kernel.
// Each thread simulates whole paths with a grid-stride loop over path
// indices, so any grid size covers all numSims paths. Paths are stored
// timestep-major (paths[t * numSims + i]) so that consecutive threads write
// consecutive addresses at each timestep (coalesced access).
template <typename Real>
__global__ void generatePaths(Real *const paths,
                              curandState *const rngStates,
                              const AsianOption<Real> *const option,
                              const unsigned int numSims,
                              const unsigned int numTimesteps)
{
    // Determine thread ID and grid stride
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int step = gridDim.x * blockDim.x;

    // Compute parameters of the log-normal step
    Real drift = (option->r - static_cast<Real>(0.5) * option->sigma * option->sigma) * option->dt;
    Real diffusion = option->sigma * sqrt(option->dt);

    // Copy this thread's RNG state to a local (register) copy
    curandState localState = rngStates[tid];

    for (unsigned int i = tid ; i < numSims ; i += step)
    {
        // Shift the output pointer to the start of path i
        Real *output = paths + i;

        // Simulate the path, normalized so the path starts at 1;
        // the spot price is applied later in the valuation kernel
        Real s = static_cast<Real>(1);

        for (unsigned int t = 0 ; t < numTimesteps ; t++, output += numSims)
        {
            s *= getPathStep(drift, diffusion, localState);
            *output = s;
        }
    }
}
// Sum-reduce one value per thread across the block using dynamic shared
// memory. Requires blockDim.x to be a power of two (enforced on the host)
// and blockDim.x * sizeof(Real) bytes of dynamic shared memory at launch.
// The return value is only meaningful in thread 0 of each block.
template <typename Real>
__device__ Real reduce_sum(Real in)
{
    SharedMemory<Real> sdata;

    // Perform first level of reduction:
    // - Write to shared memory
    unsigned int ltid = threadIdx.x;

    sdata[ltid] = in;
    __syncthreads();

    // Do reduction in shared mem: halve the active range each pass
    for (unsigned int s = blockDim.x / 2 ; s > 0 ; s >>= 1)
    {
        if (ltid < s)
        {
            sdata[ltid] += sdata[ltid + s];
        }

        __syncthreads();
    }

    return sdata[0];
}
// Valuation kernel.
// Each thread accumulates the payoffs of its share of paths (grid-stride
// loop), then the block reduces its partial sums into values[blockIdx.x].
// Requires blockDim.x * sizeof(Real) bytes of dynamic shared memory for the
// reduction; the host completes the final sum over blocks.
template <typename Real>
__global__ void computeValue(Real *const values,
                             const Real *const paths,
                             const AsianOption<Real> *const option,
                             const unsigned int numSims,
                             const unsigned int numTimesteps)
{
    // Determine thread ID
    unsigned int bid = blockIdx.x;
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int step = gridDim.x * blockDim.x;

    Real sumPayoffs = static_cast<Real>(0);

    for (unsigned int i = tid ; i < numSims ; i += step)
    {
        // Shift the input pointer to the start of path i
        const Real *path = paths + i;

        // Compute the arithmetic average over the path's timesteps;
        // paths are normalized, so scale by the spot price here
        Real avg = static_cast<Real>(0);

        for (unsigned int t = 0 ; t < numTimesteps ; t++, path += numSims)
        {
            avg += *path;
        }

        avg = avg * option->spot / numTimesteps;

        // Compute the payoff (negated for a put)
        Real payoff = avg - option->strike;

        if (option->type == AsianOption<Real>::Put)
        {
            payoff = - payoff;
        }

        payoff = max(static_cast<Real>(0), payoff);

        // Accumulate payoff locally
        sumPayoffs += payoff;
    }

    // Reduce within the block
    sumPayoffs = reduce_sum<Real>(sumPayoffs);

    // Store the block's partial result
    if (threadIdx.x == 0)
    {
        values[bid] = sumPayoffs;
    }
}
// Store the simulation configuration; no device work happens here.
// Initializers are listed in member declaration order (m_seed first) to
// match the actual initialization order and avoid -Wreorder warnings.
template <typename Real>
PricingEngine<Real>::PricingEngine(unsigned int numSims, unsigned int device, unsigned int threadBlockSize, unsigned int seed)
    : m_seed(seed),
      m_numSims(numSims),
      m_device(device),
      m_threadBlockSize(threadBlockSize)
{
}
// Price the given option on the GPU: initialise per-thread RNG states,
// generate random paths, compute the discounted expected payoff, and write
// the result into option.value.
// Throws std::runtime_error on any CUDA failure.
// NOTE(review): device allocations are not freed if an exception is thrown
// part-way through — consider RAII wrappers; left as-is to preserve the
// original control flow.
template <typename Real>
void PricingEngine<Real>::operator()(AsianOption<Real> &option)
{
    cudaError_t cudaResult = cudaSuccess;
    struct cudaDeviceProp deviceProperties;
    struct cudaFuncAttributes funcAttributes;

    // Get device properties
    cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device);

    if (cudaResult != cudaSuccess)
    {
        string msg("Could not get device properties: ");
        msg += cudaGetErrorString(cudaResult);
        throw std::runtime_error(msg);
    }

    // Check precision is valid (double requires compute capability >= 1.3)
    unsigned int deviceVersion = deviceProperties.major * 10 + deviceProperties.minor;

    if (typeid(Real) == typeid(double) && deviceVersion < 13)
    {
        throw std::runtime_error("Device does not have double precision support");
    }

    // Attach to GPU
    cudaResult = cudaSetDevice(m_device);

    if (cudaResult != cudaSuccess)
    {
        string msg("Could not set CUDA device: ");
        msg += cudaGetErrorString(cudaResult);
        throw std::runtime_error(msg);
    }

    // Determine how to divide the work between cores
    dim3 block;
    dim3 grid;
    block.x = m_threadBlockSize;
    grid.x = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize;

    // Aim to launch around ten or more times as many blocks as there
    // are multiprocessors on the target device.
    unsigned int blocksPerSM = 10;
    unsigned int numSMs = deviceProperties.multiProcessorCount;

    while (grid.x > 2 * blocksPerSM * numSMs)
    {
        grid.x >>= 1;
    }

    // Get initRNG function properties and check the maximum block size
    cudaResult = cudaFuncGetAttributes(&funcAttributes, initRNG);

    if (cudaResult != cudaSuccess)
    {
        string msg("Could not get function attributes: ");
        msg += cudaGetErrorString(cudaResult);
        throw std::runtime_error(msg);
    }

    if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
    {
        throw std::runtime_error("Block X dimension is too large for initRNG kernel");
    }

    // Get generatePaths function properties and check the maximum block size
    cudaResult = cudaFuncGetAttributes(&funcAttributes, generatePaths<Real>);

    if (cudaResult != cudaSuccess)
    {
        string msg("Could not get function attributes: ");
        msg += cudaGetErrorString(cudaResult);
        throw std::runtime_error(msg);
    }

    if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
    {
        throw std::runtime_error("Block X dimension is too large for generatePaths kernel");
    }

    // Get computeValue function properties and check the maximum block size
    cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue<Real>);

    if (cudaResult != cudaSuccess)
    {
        string msg("Could not get function attributes: ");
        msg += cudaGetErrorString(cudaResult);
        throw std::runtime_error(msg);
    }

    if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
    {
        throw std::runtime_error("Block X dimension is too large for computeValue kernel");
    }

    // Setup problem on GPU: copy the option parameters to device memory
    AsianOption<Real> *d_option = 0;
    cudaResult = cudaMalloc((void **)&d_option, sizeof(AsianOption<Real>));

    if (cudaResult != cudaSuccess)
    {
        string msg("Could not allocate memory on device for option data: ");
        msg += cudaGetErrorString(cudaResult);
        throw std::runtime_error(msg);
    }

    cudaResult = cudaMemcpy(d_option, &option, sizeof(AsianOption<Real>), cudaMemcpyHostToDevice);

    if (cudaResult != cudaSuccess)
    {
        string msg("Could not copy data to device: ");
        msg += cudaGetErrorString(cudaResult);
        throw std::runtime_error(msg);
    }

    // Allocate memory for paths (numTimesteps values per simulated path)
    Real *d_paths = 0;
    int numTimesteps = static_cast<int>(option.tenor / option.dt);
    cudaResult = cudaMalloc((void **)&d_paths, m_numSims * numTimesteps * sizeof(Real));

    if (cudaResult != cudaSuccess)
    {
        string msg("Could not allocate memory on device for paths: ");
        msg += cudaGetErrorString(cudaResult);
        throw std::runtime_error(msg);
    }

    // Allocate memory for RNG states (one per launched thread)
    curandState *d_rngStates = 0;
    cudaResult = cudaMalloc((void **)&d_rngStates, grid.x * block.x * sizeof(curandState));

    if (cudaResult != cudaSuccess)
    {
        string msg("Could not allocate memory on device for RNG state: ");
        msg += cudaGetErrorString(cudaResult);
        throw std::runtime_error(msg);
    }

    // Allocate memory for partial results (one per block)
    Real *d_values = 0;
    cudaResult = cudaMalloc((void **)&d_values, grid.x * sizeof(Real));

    if (cudaResult != cudaSuccess)
    {
        string msg("Could not allocate memory on device for partial results: ");
        msg += cudaGetErrorString(cudaResult);
        throw std::runtime_error(msg);
    }

    // Initialise RNG
    initRNG<<<grid, block>>>(d_rngStates, m_seed);

    // Generate paths
    generatePaths<Real><<<grid, block>>>(d_paths, d_rngStates, d_option, m_numSims, numTimesteps);

    // Compute value; dynamic shared memory holds one Real per thread for
    // the block-level reduction
    computeValue<Real><<<grid, block, block.x * sizeof(Real)>>>(d_values, d_paths, d_option, m_numSims, numTimesteps);

    // Kernel launches return no status directly; surface launch errors here
    cudaResult = cudaGetLastError();

    if (cudaResult != cudaSuccess)
    {
        string msg("Kernel launch failed: ");
        msg += cudaGetErrorString(cudaResult);
        throw std::runtime_error(msg);
    }

    // Copy partial results back (this blocking copy also synchronizes with
    // the kernels above)
    vector<Real> values(grid.x);
    cudaResult = cudaMemcpy(&values[0], d_values, grid.x * sizeof(Real), cudaMemcpyDeviceToHost);

    if (cudaResult != cudaSuccess)
    {
        string msg("Could not copy partial results to host: ");
        msg += cudaGetErrorString(cudaResult);
        throw std::runtime_error(msg);
    }

    // Complete sum-reduction on host
    option.value = std::accumulate(values.begin(), values.end(), static_cast<Real>(0));

    // Compute the mean
    option.value /= m_numSims;

    // Discount to present value
    option.value *= exp(- option.r * option.tenor);

    // Cleanup
    if (d_option)
    {
        cudaFree(d_option);
        d_option = 0;
    }

    if (d_paths)
    {
        cudaFree(d_paths);
        d_paths = 0;
    }

    if (d_rngStates)
    {
        cudaFree(d_rngStates);
        d_rngStates = 0;
    }

    if (d_values)
    {
        cudaFree(d_values);
        d_values = 0;
    }
}
// Explicit template instantiations for the two supported precisions
// (the member definitions live in this .cu file).
template class PricingEngine<float>;
template class PricingEngine<double>;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
and every time there is a kernel launch (foo<<<x,y>>>(...)) that line is underlined in red. :(