Program freezes at `cudaDeviceSynchronize()`

I am running the cuda program below to perform a monte carlo simulation but once my kernel finishes, it gets stuck and doesn’t move on from cudaDeviceSynchronize(). Please help.

Program:

/**
 * NEED to WRITE
 */
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include <thrust/reduce.h>

#include <algorithm>  // Needed for the "max" function
#include <chrono>
#include <cmath>
#include <iostream>
using namespace std::chrono;

__global__ void gaussian_box_muller(int n, float *gaussBMArray, float *S_cur,
                                    double S_adjust, const double v,
                                    const double T, const double K,
                                    float *payoff_sum, float *rand1,
                                    float *rand2) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    double x = 0.0;
    double y = 0.0;
    double euclid_sq = 0.0;

    thrust::maximum<float> mx;

    for (int i = index; i < n; i += stride) {
        do {

            x = 2.0 * rand1[i] / static_cast<double>(RAND_MAX) - 1;

            y = 2.0 * rand2[i] / static_cast<double>(RAND_MAX) - 1;

            euclid_sq = x * x + y * y;

        } while (euclid_sq >= 1.0);

        printf("  euclid_sq: %f \n", euclid_sq);

        gaussBMArray[i] = (x * sqrt(-2 * log(euclid_sq) / euclid_sq));
        printf("  gaussBMArray: %f \n", gaussBMArray[i]);
        S_cur[i] = S_adjust * exp(sqrt(v * v * T) * gaussBMArray[i]);
        printf("  S_cur[i]: %f \n", S_cur[i]);
        payoff_sum[i] = mx(K - S_cur[i], 0.0);
        printf("  payoff_sum[i]: %f \n", payoff_sum[i]);
    }
}

__global__ void plus(float sum, float*arr) {
    sum+=arr[blockIdx.x];
}

// Pricing a European vanilla call option with a Monte Carlo method
double monte_carlo_call_price(const int num_sims, const double S,
                              const double K, const double r, const double v,
                              const double T) {  

    printf("\nENTERING monte_carlo_call_price\n");                          

    double S_adjust = S * exp(T * (r - 0.5 * v * v));

    float *gaussBMArray;
    float *S_cur;
    float *payoff_sum;
    float *rand1;
    float *rand2;

    printf("---allocating shared memory---\n");

    cudaMallocManaged(&gaussBMArray, num_sims * sizeof(float));
    cudaMallocManaged(&S_cur, num_sims * sizeof(float));
    cudaMallocManaged(&payoff_sum, num_sims * sizeof(float));
    cudaMallocManaged(&rand1, num_sims * sizeof(float));
    cudaMallocManaged(&rand2, num_sims * sizeof(float));

    printf("---filling random values in arrays---\n");

    for (int i = 0; i < num_sims; i++) {
        rand1[i] = rand();
        rand2[i] = rand();
    }

    printf("\nENTERING gausian_box_muller (device func)\n");
    // int blockSize = 256;
    // int numBlocks = (num_sims + blockSize - 1) / blockSize;
    gaussian_box_muller<<<num_sims, 1>>>(num_sims, gaussBMArray, S_cur, S_adjust, v, T, K, payoff_sum, rand1, rand2);

    cudaDeviceSynchronize();

    printf("\nENTERING monte_carlo_call_price\n");                          

    // for (int i = 0; i < num_sims; i++) {
    //     std::cout << "  final payoff: " << payoff_sum[i] << std::endl;
    // }    // for (int i = 0; i < num_sims; i++) {
    //     std::cout << "  final payoff: " << payoff_sum[i] << std::endl;
    // }
    
    float price = 0;
    plus<<<num_sims,1>>>(price, payoff_sum);
    std::cout << "  price is " << price << std::endl;

    printf("---freeing memory---\n");
    cudaFree(gaussBMArray);
    cudaFree(S_cur);
    cudaFree(payoff_sum);
    cudaFree(rand1);
    cudaFree(rand2);

    return 0;
    // return (payoff_sum / static_cast<double>(num_sims)) * exp(-r * T);
}

int main(int argc, char **argv) {
    printf("ENTERING: main\n");

    int num_sims = 1 << 2;  // Number of simulated asset paths
    double S = 100.0;  // Option price
    double K = 100.0;  // Strike price
    double r = 0.05;   // Risk-free rate (5%)
    double v = 0.2;    // Volatility of the underlying (20%)
    double T = 1.0;    // One year until expiry

    printf("================================\n");
    printf("PARAMS:\n");
    std::cout << "  Number of Paths: " << num_sims << std::endl;
    std::cout << "  Underlying:      " << S << std::endl;
    std::cout << "  Strike:          " << K << std::endl;
    std::cout << "  Risk-Free Rate:  " << r << std::endl;
    std::cout << "  Volatility:      " << v << std::endl;
    std::cout << "  Maturity:        " << T << std::endl;

    printf("================================\n");
    printf("ACTION: monte carlo call pricing\n");
    printf("================================\n");
    double call = monte_carlo_call_price(num_sims, S, K, r, v, T);

    printf("\nENTERING: main\n");

    printf("================================\n");
    printf("FINAL OUTPUT:\n");
    std::cout << "  Call Price: " << call << std::endl;
    printf("================================\n");

    return 0;
}

Output:

ENTERING: main
================================
PARAMS:
  Number of Paths: 4
  Underlying:      100
  Strike:          100
  Risk-Free Rate:  0.05
  Volatility:      0.2
  Maturity:        1
================================
ACTION: monte carlo call pricing
================================

ENTERING monte_carlo_call_price
---allocating shared memory---
---filling random values in arrays---

ENTERING gausian_box_muller (device func)
  euclid_sq: 0.396395 
  euclid_sq: 0.676847 
  euclid_sq: 0.507531 
  gaussBMArray: -0.712082 
  gaussBMArray: 0.608056 
  gaussBMArray: 1.112272 
  S_cur[i]: 89.367203 
  S_cur[i]: 116.370781 
  S_cur[i]: 128.718063 
  payoff_sum[i]: 10.632797 
  payoff_sum[i]: 0.000000 
  payoff_sum[i]: 0.000000 

It gets stuck over here and doesn’t move on. It should print “ENTERING monte_carlo_call_price”

It printed that. It’s already depicted in your printout.

That usually indicates a hung kernel (when the host code “freezes” at cudaDeviceSynchronize()). The first thing to look for is an infinite loop in the kernel. Your do…while loop is certainly suspicious in that regard.

My guess would be that the exit criteria is not being met. You should investigate why.

Your do…while-loop doesn’t make sense. I think you were trying to implement a grid-stride loop but did it incorrectly.

For a given i value, you have this code:

    do {

        x = 2.0 * rand1[i] / static_cast<double>(RAND_MAX) - 1;

        y = 2.0 * rand2[i] / static_cast<double>(RAND_MAX) - 1;

        euclid_sq = x * x + y * y;

    } while (euclid_sq >= 1.0);

Nothing in that do…while loop changes the i value. So suppose that for the given i value, the rand1[i] and rand2[i] values result in a calculation that does not fall below 1.0. What will your loop do? It will spin there, forever. (Note that depicted this way, you don’t have to know anything about CUDA to answer this question. It’s based on C or C++ knowledge.)

Thanks for the answer. I will try using curand to create a new random number at every iteration of the do-while loop.

If you want to just cover the arrays you have already created, you could try a kernel like this:

__global__ void gaussian_box_muller(int n, float *gaussBMArray, float *S_cur,
                                    double S_adjust, const double v,
                                    const double T, const double K,
                                    float *payoff_sum, float *rand1,
                                    float *rand2) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    double x = 0.0;
    double y = 0.0;

    thrust::maximum<float> mx;

    for (int i = index; i < n; i += stride) {
        double euclid_sq = 0.0;
        x = 2.0 * rand1[i] / static_cast<double>(RAND_MAX) - 1;
        y = 2.0 * rand2[i] / static_cast<double>(RAND_MAX) - 1;
        euclid_sq = x * x + y * y;

        if(euclid_sq < 1.0){

            printf("  euclid_sq: %f \n", euclid_sq);

            gaussBMArray[i] = (x * sqrt(-2 * log(euclid_sq) / euclid_sq));
            printf("  gaussBMArray: %f \n", gaussBMArray[i]);
            S_cur[i] = S_adjust * exp(sqrt(v * v * T) * gaussBMArray[i]);
            printf("  S_cur[i]: %f \n", S_cur[i]);
            payoff_sum[i] = mx(K - S_cur[i], 0.0);
            printf("  payoff_sum[i]: %f \n", payoff_sum[i]);
        }
    }
}

FWIW your sum kernel is also broken. You have a couple issues:

  1. Just like in C or C++, values passed by-value to the kernel can be modified in the kernel, but the modifications won’t show up in the calling environment.

  2. You’re trying to perform a reduction in a multithreaded environment. CUDA doesn’t automatically sort that out for you. Its a commonly discussed topic, and a presentation like this one may help to organize your thinking.

Just for my understanding, your kernel above worked, but if the euclid_sq>=1, how does the program loop back (as in the do while loop)?

You already have a grid stride loop that causes a “loop back” for the next set of random values:

for (int i = index; i < n; i += stride) {

That is looping over each set of points in your input (random) data set. As previously stated, the “loop back” when nothing has changed is problematic.

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.