My kernel functions aren't recognized

Jingo54 · April 14, 2013, 11:22am

Hi everything,

I’m a newbie with CUDA and I have one simple question for you. I’m trying to compile an example of the CUDA Toolkit (Asian Option by MC). I’m using Qt. I copied a pro file adapted to CUDA that I found on the Internet, but every time I have a kernel function (foo<<<x,y>>>(…)) in my code, this last one is underlined (red colour). And unsurprisingly, the code crash when I try to compile it.
Do you have any idea where it could come from?

Thanks.

pasoleatis · April 14, 2013, 8:57pm

Do you have a cuda device on your pc? If yes did you install the cuda toolkit and the necessary dependencies?

Jingo54 · April 15, 2013, 7:26pm

Hi pasoleatis and thank you for your answer.

Yes I have installed CUDA Toolkit on my pc, but what do you mean by necessary dependencies?

Below is the pro file that I use (I don’t know if something is missing?):

#################################################

QT += core

QT -= gui

TARGET = TestCUDA
CONFIG += console
CONFIG -= app_bundle

TEMPLATE = app

Define output directories

DESTDIR = release
OBJECTS_DIR = release/obj
CUDA_OBJECTS_DIR = release/cuda

Source files

SOURCES += main.cpp

This makes the .cu files appear in your project

OTHER_FILES += vectorAddition.cu

CUDA settings ← may change depending on your system

CUDA_SOURCES += vectorAddition.cu
#CUDA_SDK = “C:/ProgramData/NVIDIA Corporation/NVIDIA GPU Computing SDK
4.2/C” # Path to cuda SDK install
CUDA_DIR = “C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v5.0”
# Path to cuda toolkit install
SYSTEM_NAME = win32 # Depending on your system either ‘Win32’,
‘x64’, or ‘Win64’
SYSTEM_TYPE = 32 # ‘32’ or ‘64’, depending on your system
CUDA_ARCH = sm_11 # Type of CUDA architecture, for example
‘compute_10’, ‘compute_11’, ‘sm_10’
NVCC_OPTIONS = --use_fast_math

include paths

INCLUDEPATH += $$CUDA_DIR/include
#$$CUDA_SDK/common/inc/ \

Add the necessary libraries

CUDA_LIBS = cuda cudart

The following makes sure all path names (which often include spaces)

are put between quotation marks
CUDA_INC = $$join(INCLUDEPATH,‘" -I"’,‘-I"’,‘"’)

LIBRARIES IN FORMAT NEEDED BY NVCC

NVCC_LIBS = $$join(CUDA_LIBS,’ -l’,‘-l’, ‘’)

LIBRARIES IN FORMAT NEEDED BY VISUAL C++ LINKER

LIBS += $$join(CUDA_LIBS,'.lib ', ‘’, ‘.lib’)

pasoleatis · April 16, 2013, 10:22am

Please post here the code and the error you get when you tried to compile. Did you tried to compile the examples? are those working?

Jingo54 · April 16, 2013, 12:21pm

Hi pasoleatis.

The file are the same as the ones in the CUDA Toolkit. I tried the MC_SingleAsianOptionP example. Here are the codes:

%%%%%%%%%%%%% asianoption.h

#ifndef ASIANOPTION_H
#define ASIANOPTION_H

template
struct AsianOption
{
enum CallPut {Call, Put};

// Parameters
Real spot;
Real strike;
Real r;
Real sigma;
Real tenor;
Real dt;

// Value
Real golden;
Real value;

// Option type
CallPut type;

};

#endif

%%%%%%%%%%%%% cudasharedmem.h

#ifndef CUDASHAREDMEM_H
#define CUDASHAREDMEM_H

//****************************************************************************
// Because dynamically sized shared memory arrays are declared “extern”,
// we can’t templatize them directly. To get around this, we declare a
// simple wrapper struct that will declare the extern array with a different
// name depending on the type. This avoids compiler errors about duplicate
// definitions.
//
// To use dynamically allocated shared memory in a templatized global or
// device function, just replace code like this:
// template
// global void
// foo( T* g_idata, T* g_odata)
// {
// // Shared mem size is determined by the host app at run time
// extern shared T sdata;
// …
// x = sdata[i];
// sdata[i] = x;
// …
// }
//
// With this:
// template
// global void
// foo( T* g_idata, T* g_odata)
// {
// // Shared mem size is determined by the host app at run time
// SharedMemory sdata;
// …
// x = sdata[i];
// sdata[i] = x;
// …
// }
//****************************************************************************

// This is the un-specialized struct. Note that we prevent instantiation of this
// struct by making it abstract (i.e. with pure virtual methods).
template
struct SharedMemory
{
// Ensure that we won’t compile any un-specialized types
virtual device T &operator*() = 0;
virtual device T &operator(int i) = 0;
};

#define BUILD_SHAREDMEMORY_TYPE(t, n)
template <>
struct SharedMemory
{
device t &operator*() { extern shared t n; return *n; }
device t &operator(int i) { extern shared t n; return n[i]; }
}

BUILD_SHAREDMEMORY_TYPE(int, s_int);
BUILD_SHAREDMEMORY_TYPE(unsigned int, s_uint);
BUILD_SHAREDMEMORY_TYPE(char, s_char);
BUILD_SHAREDMEMORY_TYPE(unsigned char, s_uchar);
BUILD_SHAREDMEMORY_TYPE(short, s_short);
BUILD_SHAREDMEMORY_TYPE(unsigned short, s_ushort);
BUILD_SHAREDMEMORY_TYPE(long, s_long);
BUILD_SHAREDMEMORY_TYPE(unsigned long, s_ulong);
BUILD_SHAREDMEMORY_TYPE(bool, s_bool);
BUILD_SHAREDMEMORY_TYPE(float, s_float);
BUILD_SHAREDMEMORY_TYPE(double, s_double);

#endif

%%%%%%%%%%%%% pricingengine.h

#ifndef PRICINGENGINE_H
#define PRICINGENGINE_H

#include “asianoption.h”

template
class PricingEngine
{
public:
PricingEngine(unsigned int numSims, unsigned int device, unsigned int threadBlockSize, unsigned int seed);
void operator()(AsianOption &option);

private:
    unsigned int m_seed;
    unsigned int m_numSims;
    unsigned int m_device;
    unsigned int m_threadBlockSize;

};

#endif

%%%%%%%%%%%%% test.h

#ifndef TEST_H
#define TEST_H

template
struct Test
{
Test() : pass(false) {};

int          device;
unsigned int numSims;
unsigned int threadBlockSize;
unsigned int seed;

bool   pass;
double elapsedTime;

bool operator()();

};

// Defaults are arbitrary to give sensible runtime
#define k_sims_min 100000
#define k_sims_max 1000000
#define k_sims_def 100000
#define k_sims_qa 100000
#define k_bsize_min 32
#define k_bsize_def 128
#define k_bsize_qa 128
#define k_seed_def 1234
#define k_seed_qa 1234

#endif

%%%%%%%%%%%%% main.cpp

///////////////////////////////////////////////////////////////////////////////
// Monte Carlo: Single Asian Option
// ================================
//
// This sample demonstrates pricing a single European-exercise average-price
// Asian option (arithmetic discrete average).
//
// This file, main.cpp, contains the setup information to run the test, for
// example parsing the command line and integrating this sample with the
// samples framework. As such it is perhaps less interesting than the guts of
// the sample. Readers wishing to skip the clutter are advised to skip straight
// to Test.operator() in test.cpp.
///////////////////////////////////////////////////////////////////////////////

#include <QtCore/QCoreApplication>

#include
#include
#include
#include <cuda_runtime.h>
//#include <helper_timer.h>
//#include <helper_cuda.h>

#include “test.h”

// Forward declarations
void showHelp(const int argc, const char **argv);
template
void runTest(int argc, const char **argv);

int main(int argc, char **argv)
{
using std::invalid_argument;
using std::string;

QCoreApplication a(argc, argv);

// Open the log file
printf("Monte Carlo Single Asian Option (with PRNG)\n");
printf("===========================================\n\n");

// If help flag is set, display help and exit immediately
if (checkCmdLineFlag(argc, (const char **)argv, "help"))
{
    printf("Displaying help on console\n");
    showHelp(argc, (const char **)argv);
    exit(EXIT_SUCCESS);
}

// Check the precision (checked against the device capability later)
try
{
    char *value;

    if (getCmdLineArgumentString(argc, (const char **)argv, "precision", &value))
    {
        // Check requested precision is valid
        string prec(value);

        if (prec.compare("single") == 0 || prec.compare("\"single\"") == 0)
        {
            runTest<float>(argc, (const char **)argv);
        }
        else if (prec.compare("double") == 0 || prec.compare("\"double\"") == 0)
        {
            runTest<double>(argc, (const char **)argv);
        }
        else
        {
            printf("specified precision (%s) is invalid, must be \"single\" or \"double\".\n", value);
            throw invalid_argument("precision");
        }
    }
    else
    {
        runTest<float>(argc, (const char **)argv);
    }
}
catch (invalid_argument &e)
{
    printf("invalid command line argument (%s)\n", e.what());
}

return a.exec();

// Finish
//exit(EXIT_SUCCESS);

}

template
void runTest(int argc, const char **argv)
{
using std::invalid_argument;
using std::runtime_error;

try
{
    Test<Real> test;
    int deviceCount = 0;
    cudaError_t cudaResult  = cudaSuccess;

    // by default specify GPU Device == 0
    test.device             = 0;

    // Get number of available devices
    cudaResult = cudaGetDeviceCount(&deviceCount);

    if (cudaResult != cudaSuccess)
    {
        printf("could not get device count.\n");
        throw runtime_error("cudaGetDeviceCount");
    }

    // (default parameters)
    test.numSims         = k_sims_qa;
    test.threadBlockSize = k_bsize_qa;
    test.seed            = k_seed_qa;

    {
        char *value = 0;

        if (getCmdLineArgumentString(argc, argv, "device", &value))
        {
            test.device = (int)atoi(value);

            if (test.device >= deviceCount)
            {
                printf("invalid target device specified on command line (device %d does not exist).\n", test.device);
                throw invalid_argument("device");
            }
        }
        else
        {
            test.device = gpuGetMaxGflopsDeviceId();
        }

        if (getCmdLineArgumentString(argc, argv, "sims", &value))
        {
            test.numSims = (unsigned int)atoi(value);

            if (test.numSims < k_sims_min || test.numSims > k_sims_max)
            {
                printf("specified number of simulations (%d) is invalid, must be between %d and %d.\n", test.numSims, k_sims_min, k_sims_max);
                throw invalid_argument("sims");
            }
        }
        else
        {
            test.numSims = k_sims_def;
        }

        if (getCmdLineArgumentString(argc, argv, "block-size", &value))
        {
            // Determine max threads per block
            cudaDeviceProp deviceProperties;
            cudaResult = cudaGetDeviceProperties(&deviceProperties, test.device);

            if (cudaResult != cudaSuccess)
            {
                printf("cound not get device properties for device %d.\n", test.device);
                throw runtime_error("cudaGetDeviceProperties");
            }

            // Check requested size is valid
            test.threadBlockSize = (unsigned int)atoi(value);

            if (test.threadBlockSize < k_bsize_min || test.threadBlockSize > static_cast<unsigned int>(deviceProperties.maxThreadsPerBlock))
            {
                printf("specified block size (%d) is invalid, must be between %d and %d for device %d.\n", test.threadBlockSize, k_bsize_min, deviceProperties.maxThreadsPerBlock, test.device);
                throw invalid_argument("block-size");
            }

            if (test.threadBlockSize & test.threadBlockSize-1)
            {
                printf("specified block size (%d) is invalid, must be a power of two (see reduction function).\n", test.threadBlockSize);
                throw invalid_argument("block-size");
            }
        }
        else
        {
            test.threadBlockSize = k_bsize_def;
        }

        if (getCmdLineArgumentString(argc, argv, "seed", &value))
        {
            // Check requested seed is valid
            test.seed = (unsigned int)atoi(value);

            if (test.seed == 0)
            {
                printf("specified seed (%d) is invalid, must be non-zero.\n", test.seed);
                throw invalid_argument("seed");
            }
        }
        else
        {
            test.seed = k_seed_def;
        }
    }
    // Execute
    test();
}
catch (invalid_argument &e)
{
    printf("invalid command line argument (%s)\n", e.what());
}
catch (runtime_error &e)
{
    printf("runtime error (%s)\n", e.what());
}

}

void showHelp(int argc, const char **argv)
{
using std::cout;
using std::endl;
using std::left;
using std::setw;

if (argc > 0)
{
    cout << endl << argv[0] << endl;
}

cout << endl << "Syntax:" << endl;
cout << left;
cout << "    " << setw(20) << "--device=<device>" << "Specify device to use for execution" << endl;
cout << "    " << setw(20) << "--sims=<N>" << "Specify number of Monte Carlo simulations" << endl;
cout << "    " << setw(20) << "--block-size=<N>" << "Specify number of threads per block" << endl;
cout << "    " << setw(20) << "--seed=<N>" << "Specify the seed to use for the random number generator" << endl;
cout << "    " << setw(20) << "--precision=<P>" << "Specify the precision (\"single\" or \"double\")" << endl;
cout << endl;
cout << "    " << setw(20) << "--noprompt" << "Skip prompt before exit" << endl;
cout << endl;

}

%%%%%%%%%%%%% test.cpp

#include “test.h”

#include
#include
#include
#include
#include
#include
#include
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>

//#include <helper_timer.h>

#include “asianoption.h”
#include “pricingengine.h”

template
bool Test::operator()()
{
using std::stringstream;
using std::endl;
using std::setw;

StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);

// Get device properties
struct cudaDeviceProp deviceProperties;
cudaError_t cudaResult = cudaGetDeviceProperties(&deviceProperties, device);

if (cudaResult != cudaSuccess)
{
    std::string msg("Could not get device properties: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// This test prices a single Asian call option with European
// exercise, with the priced averaged arithmeticly on discrete
// trading days (weekdays).
AsianOption<Real> option;
option.spot   = static_cast<Real>(40);
option.strike = static_cast<Real>(35);
option.r      = static_cast<Real>(0.03);
option.sigma  = static_cast<Real>(0.20);
option.tenor  = static_cast<Real>(1.0/3.0);
option.dt     = static_cast<Real>(1.0/261);
option.type   = AsianOption<Real>::Call;
option.value  = static_cast<Real>(0.0);
option.golden = static_cast<Real>(5.162534);

// Evaluate on GPU
printf("Pricing option on GPU (%s)\n\n", deviceProperties.name);
PricingEngine<Real> pricer(numSims, device, threadBlockSize, seed);
sdkStartTimer(&timer);
pricer(option);
sdkStopTimer(&timer);
elapsedTime = sdkGetAverageTimerValue(&timer)/1000.0f;

// Tolerance to compare result with expected
// This is just to check that nothing has gone very wrong with the
// test, the actual accuracy of the result depends on the number of
// Monte Carlo trials
const Real tolerance = static_cast<Real>(0.1);

// Display results
stringstream output;
output << "Precision:      " << ((typeid(Real) == typeid(double)) ? "double" : "single") << endl;
output << "Number of sims: " << numSims << endl;
output << endl;
output << "   Spot    |   Strike   |     r      |   sigma    |   tenor    |  Call/Put  |   Value    |  Expected  |" << endl;
output << "-----------|------------|------------|------------|------------|------------|------------|------------|" << endl;
output << setw(10) << option.spot << " | ";
output << setw(10) << option.strike << " | ";
output << setw(10) << option.r << " | ";
output << setw(10) << option.sigma << " | ";
output << setw(10) << option.tenor << " | ";
output << setw(10) << (option.type == AsianOption<Real>::Call ? "Call" : "Put") << " | ";
output << setw(10) << option.value << " | ";
output << setw(10) << option.golden << " |";

printf("%s\n\n", output.str().c_str());

// Check result
if (fabs(option.value - option.golden) > tolerance)
{
    printf("computed result (%e) does not match expected result (%e).\n", option.value, option.golden);
    pass = false;
}
else
{
    pass = true;
}

// Print results
printf("MonteCarloSingleAsianOptionP, Performance = %.2f sims/s, Time = %.2f(ms), NumDevsUsed = %u, Blocksize = %u\n",
       numSims / elapsedTime, elapsedTime*1000.0f, 1, threadBlockSize);

sdkDeleteTimer(&timer);

return pass;

}

// Explicit template instantiation
template struct Test;
template struct Test;

%%%%%%%%%%%%% pricingengine.cu

#include “pricingengine.h”

#include
#include
#include
#include
#include
#include <cuda_runtime.h>
#include <curand_kernel.h>

#include “asianoption.h”
#include “cudasharedmem.h”

using std::string;
using std::vector;

// RNG init kernel
global void initRNG(curandState *const rngStates,
const unsigned int seed)
{
// Determine thread ID
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

// Initialise the RNG
curand_init(seed, tid, 0, &rngStates[tid]);

}

device inline float getPathStep(float &drift, float &diffusion, curandState &state)
{
return expf(drift + diffusion * curand_normal(&state));
}
device inline double getPathStep(double &drift, double &diffusion, curandState &state)
{
return exp(drift + diffusion * curand_normal_double(&state));
}

// Path generation kernel
template
global void generatePaths(Real *const paths,
curandState *const rngStates,
const AsianOption *const option,
const unsigned int numSims,
const unsigned int numTimesteps)
{
// Determine thread ID
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int step = gridDim.x * blockDim.x;

// Compute parameters
Real drift     = (option->r - static_cast<Real>(0.5) * option->sigma * option->sigma) * option->dt;
Real diffusion = option->sigma * sqrt(option->dt);

// Initialise the RNG
curandState localState = rngStates[tid];

for (unsigned int i = tid ; i < numSims ; i += step)
{
    // Shift the output pointer
    Real *output = paths + i;

    // Simulate the path
    Real s = static_cast<Real>(1);

    for (unsigned int t = 0 ; t < numTimesteps ; t++, output += numSims)
    {
        s *= getPathStep(drift, diffusion, localState);
        *output = s;
    }
}

}

template
device Real reduce_sum(Real in)
{
SharedMemory sdata;

// Perform first level of reduction:
// - Write to shared memory
unsigned int ltid = threadIdx.x;

sdata[ltid] = in;
__syncthreads();

// Do reduction in shared mem
for (unsigned int s = blockDim.x / 2 ; s > 0 ; s >>= 1)
{
    if (ltid < s)
    {
        sdata[ltid] += sdata[ltid + s];
    }

    __syncthreads();
}

return sdata[0];

}

// Valuation kernel
template
global void computeValue(Real *const values,
const Real *const paths,
const AsianOption *const option,
const unsigned int numSims,
const unsigned int numTimesteps)
{
// Determine thread ID
unsigned int bid = blockIdx.x;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int step = gridDim.x * blockDim.x;

Real sumPayoffs = static_cast<Real>(0);

for (unsigned int i = tid ; i < numSims ; i += step)
{
    // Shift the input pointer
    const Real *path = paths + i;
    // Compute the arithmetic average
    Real avg = static_cast<Real>(0);

    for (unsigned int t = 0 ; t < numTimesteps ; t++, path += numSims)
    {
        avg += *path;
    }

    avg = avg * option->spot / numTimesteps;
    // Compute the payoff
    Real payoff = avg - option->strike;

    if (option->type == AsianOption<Real>::Put)
    {
        payoff = - payoff;
    }

    payoff = max(static_cast<Real>(0), payoff);
    // Accumulate payoff locally
    sumPayoffs += payoff;
}

// Reduce within the block
sumPayoffs = reduce_sum<Real>(sumPayoffs);

// Store the result
if (threadIdx.x == 0)
{
    values[bid] = sumPayoffs;
}

}

template
PricingEngine::PricingEngine(unsigned int numSims, unsigned int device, unsigned int threadBlockSize, unsigned int seed)
: m_numSims(numSims),
m_device(device),
m_threadBlockSize(threadBlockSize),
m_seed(seed)
{
}

template
void PricingEngine::operator()(AsianOption &option)
{
cudaError_t cudaResult = cudaSuccess;
struct cudaDeviceProp deviceProperties;
struct cudaFuncAttributes funcAttributes;

// Get device properties
cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device);

if (cudaResult != cudaSuccess)
{
    string msg("Could not get device properties: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Check precision is valid
unsigned int deviceVersion = deviceProperties.major * 10 + deviceProperties.minor;

if (typeid(Real) == typeid(double) && deviceVersion < 13)
{
    throw std::runtime_error("Device does not have double precision support");
}

// Attach to GPU
cudaResult = cudaSetDevice(m_device);

if (cudaResult != cudaSuccess)
{
    string msg("Could not set CUDA device: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Determine how to divide the work between cores
dim3 block;
dim3 grid;
block.x = m_threadBlockSize;
grid.x  = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize;

// Aim to launch around ten or more times as many blocks as there
// are multiprocessors on the target device.
unsigned int blocksPerSM = 10;
unsigned int numSMs      = deviceProperties.multiProcessorCount;

while (grid.x > 2 * blocksPerSM * numSMs)
{
    grid.x >>= 1;
}

// Get initRNG function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, initRNG);

if (cudaResult != cudaSuccess)
{
    string msg("Could not get function attributes: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
    throw std::runtime_error("Block X dimension is too large for initRNG kernel");
}

// Get generatePaths function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, generatePaths<Real>);

if (cudaResult != cudaSuccess)
{
    string msg("Could not get function attributes: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
    throw std::runtime_error("Block X dimension is too large for generatePaths kernel");
}

// Get computeValue function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue<Real>);

if (cudaResult != cudaSuccess)
{
    string msg("Could not get function attributes: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
    throw std::runtime_error("Block X dimension is too large for computeValue kernel");
}

// Setup problem on GPU
AsianOption<Real> *d_option = 0;
cudaResult = cudaMalloc((void **)&d_option, sizeof(AsianOption<Real>));

if (cudaResult != cudaSuccess)
{
    string msg("Could not allocate memory on device for option data: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

cudaResult = cudaMemcpy(d_option, &option, sizeof(AsianOption<Real>), cudaMemcpyHostToDevice);

if (cudaResult != cudaSuccess)
{
    string msg("Could not copy data to device: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Allocate memory for paths
Real *d_paths  = 0;
int numTimesteps = static_cast<int>(option.tenor / option.dt);
cudaResult = cudaMalloc((void **)&d_paths, m_numSims * numTimesteps * sizeof(Real));

if (cudaResult != cudaSuccess)
{
    string msg("Could not allocate memory on device for paths: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Allocate memory for RNG states
curandState *d_rngStates = 0;
cudaResult = cudaMalloc((void **)&d_rngStates, grid.x * block.x * sizeof(curandState));

if (cudaResult != cudaSuccess)
{
    string msg("Could not allocate memory on device for RNG state: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Allocate memory for result
Real *d_values = 0;
cudaResult = cudaMalloc((void **)&d_values, grid.x * sizeof(Real));

if (cudaResult != cudaSuccess)
{
    string msg("Could not allocate memory on device for partial results: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Initialise RNG
initRNG<<<grid, block>>>(d_rngStates, m_seed);

// Generate paths
generatePaths<Real><<<grid, block>>>(d_paths, d_rngStates, d_option, m_numSims, numTimesteps);

// Compute value
computeValue<<<grid, block, block.x *sizeof(Real)>>>(d_values, d_paths, d_option, m_numSims, numTimesteps);

// Copy partial results back
vector<Real> values(grid.x);
cudaResult = cudaMemcpy(&values[0], d_values, grid.x * sizeof(Real), cudaMemcpyDeviceToHost);

if (cudaResult != cudaSuccess)
{
    string msg("Could not copy partial results to host: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Complete sum-reduction on host
option.value = std::accumulate(values.begin(), values.end(), static_cast<Real>(0));

// Compute the mean
option.value /= m_numSims;

// Discount to present value
option.value *= exp(- option.r * option.tenor);

// Cleanup
if (d_option)
{
    cudaFree(d_option);
    d_option = 0;
}
if (d_paths)
{
    cudaFree(d_paths);
    d_paths = 0;
}
if (d_rngStates)
{
    cudaFree(d_rngStates);
    d_rngStates = 0;
}
if (d_values)
{
    cudaFree(d_values);
    d_values = 0;
}

}

// Explicit template instantiation
template class PricingEngine;
template class PricingEngine;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

and every time there is a kernel function (foo<<<x,y>>>(…)) this one is underlined in red. :(

pasoleatis · April 16, 2013, 5:28pm

What is the error you get when compiling? Can you compile and run the device query example?