My kernel functions aren't recognized

Hi everything,

I’m a newbie with CUDA and I have one simple question for you. I’m trying to compile an example of the CUDA Toolkit (Asian Option by MC). I’m using Qt. I copied a pro file adapted to CUDA that I found on the Internet, but every time I have a kernel function (foo<<<x,y>>>(…)) in my code, this last one is underlined (red colour). And unsurprisingly, the code crash when I try to compile it.
Do you have any idea where it could come from?

Thanks.

Do you have a cuda device on your pc? If yes did you install the cuda toolkit and the necessary dependencies?

Hi pasoleatis and thank you for your answer.

Yes I have installed CUDA Toolkit on my pc, but what do you mean by necessary dependencies?

Below is the pro file that I use (I don’t know if something is missing?):

#################################################

QT += core

QT -= gui

TARGET = TestCUDA
CONFIG += console
CONFIG -= app_bundle

TEMPLATE = app

Define output directories

DESTDIR = release
OBJECTS_DIR = release/obj
CUDA_OBJECTS_DIR = release/cuda

Source files

SOURCES += main.cpp

This makes the .cu files appear in your project

OTHER_FILES += vectorAddition.cu

CUDA settings ← may change depending on your system

CUDA_SOURCES += vectorAddition.cu
#CUDA_SDK = “C:/ProgramData/NVIDIA Corporation/NVIDIA GPU Computing SDK
4.2/C” # Path to cuda SDK install
CUDA_DIR = “C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v5.0”
# Path to cuda toolkit install
SYSTEM_NAME = win32 # Depending on your system either ‘Win32’,
‘x64’, or ‘Win64’
SYSTEM_TYPE = 32 # ‘32’ or ‘64’, depending on your system
CUDA_ARCH = sm_11 # Type of CUDA architecture, for example
‘compute_10’, ‘compute_11’, ‘sm_10’
NVCC_OPTIONS = --use_fast_math

include paths

INCLUDEPATH += $$CUDA_DIR/include
#$$CUDA_SDK/common/inc/ \

Add the necessary libraries

CUDA_LIBS = cuda cudart

The following makes sure all path names (which often include spaces)

are put between quotation marks
CUDA_INC = $$join(INCLUDEPATH,‘" -I"’,‘-I"’,‘"’)

LIBRARIES IN FORMAT NEEDED BY NVCC

NVCC_LIBS = $$join(CUDA_LIBS,’ -l’,‘-l’, ‘’)

LIBRARIES IN FORMAT NEEDED BY VISUAL C++ LINKER

LIBS += $$join(CUDA_LIBS,'.lib ', ‘’, ‘.lib’)

Please post here the code and the error you get when you tried to compile. Did you tried to compile the examples? are those working?

Hi pasoleatis.

The file are the same as the ones in the CUDA Toolkit. I tried the MC_SingleAsianOptionP example. Here are the codes:

%%%%%%%%%%%%% asianoption.h

#ifndef ASIANOPTION_H
#define ASIANOPTION_H

template
struct AsianOption
{
enum CallPut {Call, Put};

// Parameters
Real spot;
Real strike;
Real r;
Real sigma;
Real tenor;
Real dt;

// Value
Real golden;
Real value;

// Option type
CallPut type;

};

#endif

%%%%%%%%%%%%% cudasharedmem.h

#ifndef CUDASHAREDMEM_H
#define CUDASHAREDMEM_H

//****************************************************************************
// Because dynamically sized shared memory arrays are declared “extern”,
// we can’t templatize them directly. To get around this, we declare a
// simple wrapper struct that will declare the extern array with a different
// name depending on the type. This avoids compiler errors about duplicate
// definitions.
//
// To use dynamically allocated shared memory in a templatized global or
// device function, just replace code like this:
// template
// global void
// foo( T* g_idata, T* g_odata)
// {
// // Shared mem size is determined by the host app at run time
// extern shared T sdata;
// …
// x = sdata[i];
// sdata[i] = x;
// …
// }
//
// With this:
// template
// global void
// foo( T* g_idata, T* g_odata)
// {
// // Shared mem size is determined by the host app at run time
// SharedMemory sdata;
// …
// x = sdata[i];
// sdata[i] = x;
// …
// }
//****************************************************************************

// This is the un-specialized struct. Note that we prevent instantiation of this
// struct by making it abstract (i.e. with pure virtual methods).
template
struct SharedMemory
{
// Ensure that we won’t compile any un-specialized types
virtual device T &operator*() = 0;
virtual device T &operator(int i) = 0;
};

#define BUILD_SHAREDMEMORY_TYPE(t, n)
template <>
struct SharedMemory
{
device t &operator*() { extern shared t n; return *n; }
device t &operator(int i) { extern shared t n; return n[i]; }
}

BUILD_SHAREDMEMORY_TYPE(int, s_int);
BUILD_SHAREDMEMORY_TYPE(unsigned int, s_uint);
BUILD_SHAREDMEMORY_TYPE(char, s_char);
BUILD_SHAREDMEMORY_TYPE(unsigned char, s_uchar);
BUILD_SHAREDMEMORY_TYPE(short, s_short);
BUILD_SHAREDMEMORY_TYPE(unsigned short, s_ushort);
BUILD_SHAREDMEMORY_TYPE(long, s_long);
BUILD_SHAREDMEMORY_TYPE(unsigned long, s_ulong);
BUILD_SHAREDMEMORY_TYPE(bool, s_bool);
BUILD_SHAREDMEMORY_TYPE(float, s_float);
BUILD_SHAREDMEMORY_TYPE(double, s_double);

#endif

%%%%%%%%%%%%% pricingengine.h

#ifndef PRICINGENGINE_H
#define PRICINGENGINE_H

#include “asianoption.h”

template
class PricingEngine
{
public:
PricingEngine(unsigned int numSims, unsigned int device, unsigned int threadBlockSize, unsigned int seed);
void operator()(AsianOption &option);

private:
    unsigned int m_seed;
    unsigned int m_numSims;
    unsigned int m_device;
    unsigned int m_threadBlockSize;

};

#endif

%%%%%%%%%%%%% test.h

#ifndef TEST_H
#define TEST_H

template
struct Test
{
Test() : pass(false) {};

int          device;
unsigned int numSims;
unsigned int threadBlockSize;
unsigned int seed;

bool   pass;
double elapsedTime;

bool operator()();

};

// Defaults are arbitrary to give sensible runtime
#define k_sims_min 100000
#define k_sims_max 1000000
#define k_sims_def 100000
#define k_sims_qa 100000
#define k_bsize_min 32
#define k_bsize_def 128
#define k_bsize_qa 128
#define k_seed_def 1234
#define k_seed_qa 1234

#endif

%%%%%%%%%%%%% main.cpp

///////////////////////////////////////////////////////////////////////////////
// Monte Carlo: Single Asian Option
// ================================
//
// This sample demonstrates pricing a single European-exercise average-price
// Asian option (arithmetic discrete average).
//
// This file, main.cpp, contains the setup information to run the test, for
// example parsing the command line and integrating this sample with the
// samples framework. As such it is perhaps less interesting than the guts of
// the sample. Readers wishing to skip the clutter are advised to skip straight
// to Test.operator() in test.cpp.
///////////////////////////////////////////////////////////////////////////////

#include <QtCore/QCoreApplication>

#include
#include
#include
#include <cuda_runtime.h>
//#include <helper_timer.h>
//#include <helper_cuda.h>

#include “test.h”

// Forward declarations
void showHelp(const int argc, const char **argv);
template
void runTest(int argc, const char **argv);

int main(int argc, char **argv)
{
using std::invalid_argument;
using std::string;

QCoreApplication a(argc, argv);

// Open the log file
printf("Monte Carlo Single Asian Option (with PRNG)\n");
printf("===========================================\n\n");

// If help flag is set, display help and exit immediately
if (checkCmdLineFlag(argc, (const char **)argv, "help"))
{
    printf("Displaying help on console\n");
    showHelp(argc, (const char **)argv);
    exit(EXIT_SUCCESS);
}

// Check the precision (checked against the device capability later)
try
{
    char *value;

    if (getCmdLineArgumentString(argc, (const char **)argv, "precision", &value))
    {
        // Check requested precision is valid
        string prec(value);

        if (prec.compare("single") == 0 || prec.compare("\"single\"") == 0)
        {
            runTest<float>(argc, (const char **)argv);
        }
        else if (prec.compare("double") == 0 || prec.compare("\"double\"") == 0)
        {
            runTest<double>(argc, (const char **)argv);
        }
        else
        {
            printf("specified precision (%s) is invalid, must be \"single\" or \"double\".\n", value);
            throw invalid_argument("precision");
        }
    }
    else
    {
        runTest<float>(argc, (const char **)argv);
    }
}
catch (invalid_argument &e)
{
    printf("invalid command line argument (%s)\n", e.what());
}

return a.exec();

// Finish
//exit(EXIT_SUCCESS);

}

template
void runTest(int argc, const char **argv)
{
using std::invalid_argument;
using std::runtime_error;

try
{
    Test<Real> test;
    int deviceCount = 0;
    cudaError_t cudaResult  = cudaSuccess;

    // by default specify GPU Device == 0
    test.device             = 0;

    // Get number of available devices
    cudaResult = cudaGetDeviceCount(&deviceCount);

    if (cudaResult != cudaSuccess)
    {
        printf("could not get device count.\n");
        throw runtime_error("cudaGetDeviceCount");
    }

    // (default parameters)
    test.numSims         = k_sims_qa;
    test.threadBlockSize = k_bsize_qa;
    test.seed            = k_seed_qa;

    {
        char *value = 0;

        if (getCmdLineArgumentString(argc, argv, "device", &value))
        {
            test.device = (int)atoi(value);

            if (test.device >= deviceCount)
            {
                printf("invalid target device specified on command line (device %d does not exist).\n", test.device);
                throw invalid_argument("device");
            }
        }
        else
        {
            test.device = gpuGetMaxGflopsDeviceId();
        }

        if (getCmdLineArgumentString(argc, argv, "sims", &value))
        {
            test.numSims = (unsigned int)atoi(value);

            if (test.numSims < k_sims_min || test.numSims > k_sims_max)
            {
                printf("specified number of simulations (%d) is invalid, must be between %d and %d.\n", test.numSims, k_sims_min, k_sims_max);
                throw invalid_argument("sims");
            }
        }
        else
        {
            test.numSims = k_sims_def;
        }

        if (getCmdLineArgumentString(argc, argv, "block-size", &value))
        {
            // Determine max threads per block
            cudaDeviceProp deviceProperties;
            cudaResult = cudaGetDeviceProperties(&deviceProperties, test.device);

            if (cudaResult != cudaSuccess)
            {
                printf("cound not get device properties for device %d.\n", test.device);
                throw runtime_error("cudaGetDeviceProperties");
            }

            // Check requested size is valid
            test.threadBlockSize = (unsigned int)atoi(value);

            if (test.threadBlockSize < k_bsize_min || test.threadBlockSize > static_cast<unsigned int>(deviceProperties.maxThreadsPerBlock))
            {
                printf("specified block size (%d) is invalid, must be between %d and %d for device %d.\n", test.threadBlockSize, k_bsize_min, deviceProperties.maxThreadsPerBlock, test.device);
                throw invalid_argument("block-size");
            }

            if (test.threadBlockSize & test.threadBlockSize-1)
            {
                printf("specified block size (%d) is invalid, must be a power of two (see reduction function).\n", test.threadBlockSize);
                throw invalid_argument("block-size");
            }
        }
        else
        {
            test.threadBlockSize = k_bsize_def;
        }

        if (getCmdLineArgumentString(argc, argv, "seed", &value))
        {
            // Check requested seed is valid
            test.seed = (unsigned int)atoi(value);

            if (test.seed == 0)
            {
                printf("specified seed (%d) is invalid, must be non-zero.\n", test.seed);
                throw invalid_argument("seed");
            }
        }
        else
        {
            test.seed = k_seed_def;
        }
    }
    // Execute
    test();
}
catch (invalid_argument &e)
{
    printf("invalid command line argument (%s)\n", e.what());
}
catch (runtime_error &e)
{
    printf("runtime error (%s)\n", e.what());
}

}

void showHelp(int argc, const char **argv)
{
using std::cout;
using std::endl;
using std::left;
using std::setw;

if (argc > 0)
{
    cout << endl << argv[0] << endl;
}

cout << endl << "Syntax:" << endl;
cout << left;
cout << "    " << setw(20) << "--device=<device>" << "Specify device to use for execution" << endl;
cout << "    " << setw(20) << "--sims=<N>" << "Specify number of Monte Carlo simulations" << endl;
cout << "    " << setw(20) << "--block-size=<N>" << "Specify number of threads per block" << endl;
cout << "    " << setw(20) << "--seed=<N>" << "Specify the seed to use for the random number generator" << endl;
cout << "    " << setw(20) << "--precision=<P>" << "Specify the precision (\"single\" or \"double\")" << endl;
cout << endl;
cout << "    " << setw(20) << "--noprompt" << "Skip prompt before exit" << endl;
cout << endl;

}

%%%%%%%%%%%%% test.cpp

#include “test.h”

#include
#include
#include
#include
#include
#include
#include
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>

//#include <helper_timer.h>

#include “asianoption.h”
#include “pricingengine.h”

template
bool Test::operator()()
{
using std::stringstream;
using std::endl;
using std::setw;

StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);

// Get device properties
struct cudaDeviceProp deviceProperties;
cudaError_t cudaResult = cudaGetDeviceProperties(&deviceProperties, device);

if (cudaResult != cudaSuccess)
{
    std::string msg("Could not get device properties: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// This test prices a single Asian call option with European
// exercise, with the priced averaged arithmeticly on discrete
// trading days (weekdays).
AsianOption<Real> option;
option.spot   = static_cast<Real>(40);
option.strike = static_cast<Real>(35);
option.r      = static_cast<Real>(0.03);
option.sigma  = static_cast<Real>(0.20);
option.tenor  = static_cast<Real>(1.0/3.0);
option.dt     = static_cast<Real>(1.0/261);
option.type   = AsianOption<Real>::Call;
option.value  = static_cast<Real>(0.0);
option.golden = static_cast<Real>(5.162534);

// Evaluate on GPU
printf("Pricing option on GPU (%s)\n\n", deviceProperties.name);
PricingEngine<Real> pricer(numSims, device, threadBlockSize, seed);
sdkStartTimer(&timer);
pricer(option);
sdkStopTimer(&timer);
elapsedTime = sdkGetAverageTimerValue(&timer)/1000.0f;

// Tolerance to compare result with expected
// This is just to check that nothing has gone very wrong with the
// test, the actual accuracy of the result depends on the number of
// Monte Carlo trials
const Real tolerance = static_cast<Real>(0.1);

// Display results
stringstream output;
output << "Precision:      " << ((typeid(Real) == typeid(double)) ? "double" : "single") << endl;
output << "Number of sims: " << numSims << endl;
output << endl;
output << "   Spot    |   Strike   |     r      |   sigma    |   tenor    |  Call/Put  |   Value    |  Expected  |" << endl;
output << "-----------|------------|------------|------------|------------|------------|------------|------------|" << endl;
output << setw(10) << option.spot << " | ";
output << setw(10) << option.strike << " | ";
output << setw(10) << option.r << " | ";
output << setw(10) << option.sigma << " | ";
output << setw(10) << option.tenor << " | ";
output << setw(10) << (option.type == AsianOption<Real>::Call ? "Call" : "Put") << " | ";
output << setw(10) << option.value << " | ";
output << setw(10) << option.golden << " |";

printf("%s\n\n", output.str().c_str());

// Check result
if (fabs(option.value - option.golden) > tolerance)
{
    printf("computed result (%e) does not match expected result (%e).\n", option.value, option.golden);
    pass = false;
}
else
{
    pass = true;
}

// Print results
printf("MonteCarloSingleAsianOptionP, Performance = %.2f sims/s, Time = %.2f(ms), NumDevsUsed = %u, Blocksize = %u\n",
       numSims / elapsedTime, elapsedTime*1000.0f, 1, threadBlockSize);

sdkDeleteTimer(&timer);

return pass;

}

// Explicit template instantiation
template struct Test;
template struct Test;

%%%%%%%%%%%%% pricingengine.cu

#include “pricingengine.h”

#include
#include
#include
#include
#include
#include <cuda_runtime.h>
#include <curand_kernel.h>

#include “asianoption.h”
#include “cudasharedmem.h”

using std::string;
using std::vector;

// RNG init kernel
global void initRNG(curandState *const rngStates,
const unsigned int seed)
{
// Determine thread ID
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

// Initialise the RNG
curand_init(seed, tid, 0, &rngStates[tid]);

}

device inline float getPathStep(float &drift, float &diffusion, curandState &state)
{
return expf(drift + diffusion * curand_normal(&state));
}
device inline double getPathStep(double &drift, double &diffusion, curandState &state)
{
return exp(drift + diffusion * curand_normal_double(&state));
}

// Path generation kernel
template
global void generatePaths(Real *const paths,
curandState *const rngStates,
const AsianOption *const option,
const unsigned int numSims,
const unsigned int numTimesteps)
{
// Determine thread ID
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int step = gridDim.x * blockDim.x;

// Compute parameters
Real drift     = (option->r - static_cast<Real>(0.5) * option->sigma * option->sigma) * option->dt;
Real diffusion = option->sigma * sqrt(option->dt);

// Initialise the RNG
curandState localState = rngStates[tid];

for (unsigned int i = tid ; i < numSims ; i += step)
{
    // Shift the output pointer
    Real *output = paths + i;

    // Simulate the path
    Real s = static_cast<Real>(1);

    for (unsigned int t = 0 ; t < numTimesteps ; t++, output += numSims)
    {
        s *= getPathStep(drift, diffusion, localState);
        *output = s;
    }
}

}

template
device Real reduce_sum(Real in)
{
SharedMemory sdata;

// Perform first level of reduction:
// - Write to shared memory
unsigned int ltid = threadIdx.x;

sdata[ltid] = in;
__syncthreads();

// Do reduction in shared mem
for (unsigned int s = blockDim.x / 2 ; s > 0 ; s >>= 1)
{
    if (ltid < s)
    {
        sdata[ltid] += sdata[ltid + s];
    }

    __syncthreads();
}

return sdata[0];

}

// Valuation kernel
template
global void computeValue(Real *const values,
const Real *const paths,
const AsianOption *const option,
const unsigned int numSims,
const unsigned int numTimesteps)
{
// Determine thread ID
unsigned int bid = blockIdx.x;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int step = gridDim.x * blockDim.x;

Real sumPayoffs = static_cast<Real>(0);

for (unsigned int i = tid ; i < numSims ; i += step)
{
    // Shift the input pointer
    const Real *path = paths + i;
    // Compute the arithmetic average
    Real avg = static_cast<Real>(0);

    for (unsigned int t = 0 ; t < numTimesteps ; t++, path += numSims)
    {
        avg += *path;
    }

    avg = avg * option->spot / numTimesteps;
    // Compute the payoff
    Real payoff = avg - option->strike;

    if (option->type == AsianOption<Real>::Put)
    {
        payoff = - payoff;
    }

    payoff = max(static_cast<Real>(0), payoff);
    // Accumulate payoff locally
    sumPayoffs += payoff;
}

// Reduce within the block
sumPayoffs = reduce_sum<Real>(sumPayoffs);

// Store the result
if (threadIdx.x == 0)
{
    values[bid] = sumPayoffs;
}

}

template
PricingEngine::PricingEngine(unsigned int numSims, unsigned int device, unsigned int threadBlockSize, unsigned int seed)
: m_numSims(numSims),
m_device(device),
m_threadBlockSize(threadBlockSize),
m_seed(seed)
{
}

template
void PricingEngine::operator()(AsianOption &option)
{
cudaError_t cudaResult = cudaSuccess;
struct cudaDeviceProp deviceProperties;
struct cudaFuncAttributes funcAttributes;

// Get device properties
cudaResult = cudaGetDeviceProperties(&deviceProperties, m_device);

if (cudaResult != cudaSuccess)
{
    string msg("Could not get device properties: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Check precision is valid
unsigned int deviceVersion = deviceProperties.major * 10 + deviceProperties.minor;

if (typeid(Real) == typeid(double) && deviceVersion < 13)
{
    throw std::runtime_error("Device does not have double precision support");
}

// Attach to GPU
cudaResult = cudaSetDevice(m_device);

if (cudaResult != cudaSuccess)
{
    string msg("Could not set CUDA device: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Determine how to divide the work between cores
dim3 block;
dim3 grid;
block.x = m_threadBlockSize;
grid.x  = (m_numSims + m_threadBlockSize - 1) / m_threadBlockSize;

// Aim to launch around ten or more times as many blocks as there
// are multiprocessors on the target device.
unsigned int blocksPerSM = 10;
unsigned int numSMs      = deviceProperties.multiProcessorCount;

while (grid.x > 2 * blocksPerSM * numSMs)
{
    grid.x >>= 1;
}

// Get initRNG function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, initRNG);

if (cudaResult != cudaSuccess)
{
    string msg("Could not get function attributes: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
    throw std::runtime_error("Block X dimension is too large for initRNG kernel");
}

// Get generatePaths function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, generatePaths<Real>);

if (cudaResult != cudaSuccess)
{
    string msg("Could not get function attributes: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
    throw std::runtime_error("Block X dimension is too large for generatePaths kernel");
}

// Get computeValue function properties and check the maximum block size
cudaResult = cudaFuncGetAttributes(&funcAttributes, computeValue<Real>);

if (cudaResult != cudaSuccess)
{
    string msg("Could not get function attributes: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

if (block.x > (unsigned int)funcAttributes.maxThreadsPerBlock)
{
    throw std::runtime_error("Block X dimension is too large for computeValue kernel");
}

// Setup problem on GPU
AsianOption<Real> *d_option = 0;
cudaResult = cudaMalloc((void **)&d_option, sizeof(AsianOption<Real>));

if (cudaResult != cudaSuccess)
{
    string msg("Could not allocate memory on device for option data: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

cudaResult = cudaMemcpy(d_option, &option, sizeof(AsianOption<Real>), cudaMemcpyHostToDevice);

if (cudaResult != cudaSuccess)
{
    string msg("Could not copy data to device: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Allocate memory for paths
Real *d_paths  = 0;
int numTimesteps = static_cast<int>(option.tenor / option.dt);
cudaResult = cudaMalloc((void **)&d_paths, m_numSims * numTimesteps * sizeof(Real));

if (cudaResult != cudaSuccess)
{
    string msg("Could not allocate memory on device for paths: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Allocate memory for RNG states
curandState *d_rngStates = 0;
cudaResult = cudaMalloc((void **)&d_rngStates, grid.x * block.x * sizeof(curandState));

if (cudaResult != cudaSuccess)
{
    string msg("Could not allocate memory on device for RNG state: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Allocate memory for result
Real *d_values = 0;
cudaResult = cudaMalloc((void **)&d_values, grid.x * sizeof(Real));

if (cudaResult != cudaSuccess)
{
    string msg("Could not allocate memory on device for partial results: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Initialise RNG
initRNG<<<grid, block>>>(d_rngStates, m_seed);

// Generate paths
generatePaths<Real><<<grid, block>>>(d_paths, d_rngStates, d_option, m_numSims, numTimesteps);

// Compute value
computeValue<<<grid, block, block.x *sizeof(Real)>>>(d_values, d_paths, d_option, m_numSims, numTimesteps);

// Copy partial results back
vector<Real> values(grid.x);
cudaResult = cudaMemcpy(&values[0], d_values, grid.x * sizeof(Real), cudaMemcpyDeviceToHost);

if (cudaResult != cudaSuccess)
{
    string msg("Could not copy partial results to host: ");
    msg += cudaGetErrorString(cudaResult);
    throw std::runtime_error(msg);
}

// Complete sum-reduction on host
option.value = std::accumulate(values.begin(), values.end(), static_cast<Real>(0));

// Compute the mean
option.value /= m_numSims;

// Discount to present value
option.value *= exp(- option.r * option.tenor);

// Cleanup
if (d_option)
{
    cudaFree(d_option);
    d_option = 0;
}
if (d_paths)
{
    cudaFree(d_paths);
    d_paths = 0;
}
if (d_rngStates)
{
    cudaFree(d_rngStates);
    d_rngStates = 0;
}
if (d_values)
{
    cudaFree(d_values);
    d_values = 0;
}

}

// Explicit template instantiation
template class PricingEngine;
template class PricingEngine;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

and every time there is a kernel function (foo<<<x,y>>>(…)) this one is underlined in red. :(

What is the error you get when compiling? Can you compile and run the device query example?