Hi all,
I’m coding my first CUDA application. The idea is that I have 5 slot reel strips, and I am brute-forcing every possible combination of stops and grabbing the pay to determine the overall expected return. In the example code my reel strip lengths are 100, 100, 10, 100, and 100, which multiply out to 1 billion combinations to evaluate. The main work happens in the evaluateCombo function in kernel.cu. Everything works as expected when I evaluate each of the 1 billion combinations.
I then introduced an extra layer of complexity: for each of the 1 billion combinations I now evaluate 30 lines around that set of stops*, similar to a real slot machine. This is where my code starts to break down and return garbage values. This part is confusing to me because the extra logic is not very complicated. The only difference is doing the same computations 30 times across the different lines instead of just once.
*This added complexity is the loop: for (int iLine = 0; iLine < NUM_LINES; iLine++). All else the same, my code works fine when I remove the loop and query into lineset.
I noticed that the lower I set the number of lines to loop through, the higher the chance of the run completing; however, I don’t always get the same result when running the same configuration multiple times. I also noticed that if I hard-code the payout, the overall payout reflects that hard-coded payout accurately, which tells me that my computation of temp_idx is working and that the breakage may occur when fetching the payout from the payout_table. I’m not sure how that could be happening, as we are simply indexing into a 2D array to grab a value. My best guess is that I’m encountering some sort of race condition or memory issue that becomes more apparent as I add complexity. Based on my code below, is there any glaring issue that stands out?
My project isn’t too big, there’s a main.cpp and kernel.cu. Please let me know if there’s any additional clarification I can provide. Thank you for any help.
main.cpp:
#include <climits>
#include <iostream>
#include <vector>
// Function declaration for the CUDA slot simulation (defined in kernel.cu).
//   num_batches            number of batches; one result slot per batch
//   batch_size             number of consecutive combinations in each batch
//   h_payouts_accumulated  host array of num_batches entries, overwritten with per-batch payout sums
//   h_hits_accumulated     host array of num_batches entries, overwritten with per-batch hit counts
//   h_reelSize             host array holding the length of each reel strip
//   offset                 round index; shifts the first combination by offset * batch_size * num_batches
extern "C" void runDoCycle(long long num_batches, long long batch_size, long long* h_payouts_accumulated, long long* h_hits_accumulated, int* h_reelSize, int offset);
// Enumerates every stop combination of the five reels on the GPU and prints
// the total payout, total cost, expected value and hit count.
//
// The cycle is split into `num_rounds` rounds of `num_batches` batches, each
// batch covering `batch_size` consecutive combinations. Returns 0 on success
// and 1 when there is nothing to evaluate.
int main() {
    const long long bet_amount = 50;
    long long total_payout = 0;
    long long total_hits = 0;

    int reelSize[5] = { 100, 100, 10, 100, 100 };
    const long long total_combinations =
        static_cast<long long>(reelSize[0]) * reelSize[1] * reelSize[2] * reelSize[3] * reelSize[4];

    // A zero-length reel would otherwise lead to divisions by zero below.
    if (total_combinations <= 0) {
        std::cerr << "Nothing to evaluate: every reel needs at least one stop." << std::endl;
        return 1;
    }

    const int num_rounds = static_cast<int>(total_combinations / INT_MAX) + 1;
    long long num_batches = 20000;
    if (total_combinations < num_batches)
        num_batches = total_combinations;
    const long long batch_size = (total_combinations / num_batches / num_rounds);

    // The divisions above round down, so make any shortfall visible instead
    // of silently reporting an expected value that is too low.
    const long long covered = batch_size * num_batches * num_rounds;
    if (covered != total_combinations) {
        std::cerr << "Warning: only " << covered << " of " << total_combinations
                  << " combinations are evaluated (cycle size is not divisible by "
                  << "num_batches * num_rounds)." << std::endl;
    }

    // Allocate the host result buffers once and reuse them for every round.
    // They are zero-filled before each call so that a failed GPU run can
    // never contribute uninitialised memory to the totals.
    typedef std::vector<long long>::size_type buffer_size;
    const buffer_size buffer_len = static_cast<buffer_size>(num_batches);
    std::vector<long long> h_payouts_accumulated(buffer_len);
    std::vector<long long> h_hits_accumulated(buffer_len);

    for (int iRound = 0; iRound < num_rounds; iRound++) {
        h_payouts_accumulated.assign(buffer_len, 0LL);
        h_hits_accumulated.assign(buffer_len, 0LL);

        // Run the CUDA simulation for this round.
        runDoCycle(num_batches, batch_size, h_payouts_accumulated.data(), h_hits_accumulated.data(), reelSize, iRound);

        // Fold the per-batch results into the running totals.
        for (buffer_size i = 0; i < buffer_len; i++) {
            total_payout += h_payouts_accumulated[i];
            total_hits += h_hits_accumulated[i];
        }
    }

    // Calculate total cost and Expected Value (EV). Double precision is used
    // because the totals exceed what a float can represent exactly.
    const long long total_cost = total_combinations * bet_amount;
    const double expected_value = static_cast<double>(total_payout) / static_cast<double>(total_cost);

    // Display the overall results
    std::cout << "Cycle Size: " << total_combinations << std::endl;
    std::cout << "Total Payout: " << total_payout << std::endl;
    std::cout << "Total Cost: " << total_cost << std::endl;
    std::cout << "Expected Value (EV): " << expected_value << std::endl;
    std::cout << "Total Hits: " << total_hits << std::endl;
    return 0;
}
kernel.cu:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <curand_kernel.h>
#include <stdio.h>
// Block size used by runDoCycle when launching evaluateCombo.
#define THREADS_PER_BLOCK 256
// Number of reels: row count of `reels`, column count of `lineset` and `payout_table`.
#define NUM_REELS 5
// Number of distinct symbols: row count of `payout_table`.
#define NUM_SYMBOLS 3
// Number of lines evaluated per stop combination: row count of `lineset`.
#define NUM_LINES 30
// Line definitions: one row per line, one column per reel.
// Each entry is the offset (0, 1 or 2) that evaluateCombo adds to a reel's
// stop index, modulo the reel length, to pick the symbol that line reads on
// that reel. The trailing //N comments mark the row index every five rows.
__device__ int lineset[NUM_LINES][NUM_REELS] = {
{1,1,1,1,1}, //0
{0,0,0,0,0},
{2,2,2,2,2},
{0,1,2,1,0},
{2,1,0,1,2},
{1,0,1,2,1}, //5
{1,2,1,0,1},
{0,0,1,2,2},
{2,2,1,0,0},
{0,1,0,1,0},
{2,1,2,1,2}, //10
{1,0,0,0,1},
{1,2,2,2,1},
{0,1,1,1,0},
{2,1,1,1,2},
{1,1,0,1,1}, //15
{1,1,2,1,1},
{1,0,1,0,1},
{1,2,1,2,1},
{0,0,0,1,2},
{2,2,2,1,0}, //20
{0,1,2,2,2},
{2,1,0,0,0},
{1,1,1,0,1},
{1,1,1,2,1},
{0,1,1,1,2}, //25
{2,1,1,1,0},
{2,2,1,0,1},
{0,0,1,2,1},
{2,1,2,2,2}
};
// Symbol payouts: each row is for a specific symbol (0, 1, 2), and column c
// holds the payout for c + 1 matching symbols in a row counted from the first
// reel (evaluateCombo indexes it with consecutive_count - 1).
// Only the last column (all NUM_REELS reels matching) pays anything here.
__device__ int payout_table[NUM_SYMBOLS][NUM_REELS] = {
{0, 0, 0, 0, 1000}, // Symbol 0: 5 in a row pays 1000
{0, 0, 0, 0, 750}, // Symbol 1: 5 in a row pays 750
{0, 0, 0, 0, 500} // Symbol 2: 5 in a row pays 500
};
// Reel strips (2D array): reels[r][s] is the symbol at stop s of reel r.
// Every row has room for 100 stops. Reel 3 lists only 10 of them, so its
// remaining entries are zero-initialised; main.cpp passes a length of 10 for
// that reel, and evaluateCombo takes every index modulo the passed length,
// so those unlisted entries are not read with the lengths used here.
__device__ int reels[NUM_REELS][100] = {
{0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0}, // Reel 1
{1, 2, 0, 1, 2, 0, 1, 2, 0, 1,1, 2, 0, 1, 2, 0, 1, 2, 0, 1,1, 2, 0, 1, 2, 0, 1, 2, 0, 1,1, 2, 0, 1, 2, 0, 1, 2, 0, 1,1, 2, 0, 1, 2, 0, 1, 2, 0, 1,1, 2, 0, 1, 2, 0, 1, 2, 0, 1,1, 2, 0, 1, 2, 0, 1, 2, 0, 1,1, 2, 0, 1, 2, 0, 1, 2, 0, 1,1, 2, 0, 1, 2, 0, 1, 2, 0, 1,1, 2, 0, 1, 2, 0, 1, 2, 0, 1}, // Reel 2
{2, 0, 1, 2, 0, 1, 2, 0, 1, 2}, // Reel 3
{0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0}, // Reel 4
{0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0,0, 1, 2, 0, 1, 2, 0, 1, 2, 0} // Reel 5
};
// evaluateCombo: each thread evaluates one batch of consecutive stop
// combinations and stores the batch's summed payout and hit count.
//
// Launch layout: 1D grid of 1D blocks; thread `idx` handles batch `idx`.
// Threads with idx >= num_batches do nothing. No shared memory is used.
//
// Parameters:
//   num_batches          number of batches (and of output slots)
//   batch_size           consecutive combinations evaluated per batch
//   payouts_accumulated  device array [num_batches]; slot idx receives the batch payout
//   hits_accumulated     device array [num_batches]; slot idx receives the batch hit count
//   reelSize             device array [NUM_REELS] with the length of each reel
//   offset               round index; the first combination of batch idx is
//                        batch_size * idx + offset * batch_size * num_batches
//
// Preconditions: each reelSize[r] must be at least 1 and no larger than the
// second dimension of `reels`, and every symbol stored in `reels` must be a
// valid row of `payout_table`.
__global__ void evaluateCombo(long long num_batches, long long batch_size, long long* payouts_accumulated, long long* hits_accumulated, int* reelSize, int offset) {
    // Widen before multiplying so a very large grid cannot wrap in 32 bits.
    const long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= num_batches) {
        return;
    }

    // Read the reel lengths once and derive the cycle size from them rather
    // than relying on a hard-coded combination count.
    int sizes[NUM_REELS];
    long long total_combinations = 1;
    for (int reel = 0; reel < NUM_REELS; ++reel) {
        sizes[reel] = reelSize[reel];
        total_combinations *= sizes[reel];
    }

    const long long first_combo = (batch_size * idx) + (static_cast<long long>(offset) * batch_size * num_batches);

    long long batch_payout = 0;
    long long batch_hits = 0;

    for (long long i = 0; i < batch_size; i++) {
        const long long combo = first_combo + i;
        if (combo >= total_combinations)
            break;

        // Decode the combination into one stop per reel (mixed-radix digits).
        // This does not depend on the line, so the 64-bit divisions are done
        // once per combination instead of once per line.
        int stops[NUM_REELS];
        long long remaining = combo;
        for (int reel = 0; reel < NUM_REELS; ++reel) {
            stops[reel] = static_cast<int>(remaining % sizes[reel]);
            remaining /= sizes[reel];
        }

        for (int iLine = 0; iLine < NUM_LINES; iLine++) {
            // The symbol on the first reel decides what the line must match.
            const int symbol = reels[0][(stops[0] + lineset[iLine][0]) % sizes[0]];

            // Count consecutive reels, starting from the first, showing that symbol.
            int consecutive_count = 1;
            for (int reel = 1; reel < NUM_REELS; ++reel) {
                const int shown = reels[reel][(stops[reel] + lineset[iLine][reel]) % sizes[reel]];
                if (shown != symbol) {
                    break; // Stop counting at the first mismatch
                }
                consecutive_count++;
            }

            // consecutive_count is in [1, NUM_REELS], so the column index is in range.
            const int payout = payout_table[symbol][consecutive_count - 1];
            if (payout > 0) {
                batch_payout += payout;
                batch_hits++;
            }
        }
    }

    payouts_accumulated[idx] = batch_payout;
    hits_accumulated[idx] = batch_hits;
}
// Reports a failed CUDA runtime call on stderr; returns true when `result` is cudaSuccess.
static bool cudaCallSucceeded(cudaError_t result, const char* what) {
    if (result == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(result));
    return false;
}

// Runs one round of the slot evaluation on the GPU.
//
// Allocates device buffers, uploads the reel lengths, launches evaluateCombo
// with one thread per batch, and copies the per-batch results back.
//
// Parameters:
//   num_batches            number of batches; both host arrays hold this many entries
//   batch_size             consecutive combinations evaluated per batch
//   h_payouts_accumulated  host array, overwritten with the per-batch payout sums
//   h_hits_accumulated     host array, overwritten with the per-batch hit counts
//   h_reelSize             host array [NUM_REELS] with the length of each reel
//   offset                 round index forwarded to the kernel
//
// Every CUDA call is checked. If any step fails, the error is printed to
// stderr and both host arrays are filled with zeros, so the caller never sums
// uninitialised memory. A failure at the synchronise step means the kernel
// itself faulted or was aborted. NOTE(review): on systems where the GPU also
// drives a display, a long-running kernel may be killed by the driver's
// watchdog; confirm against the error text printed here.
extern "C" void runDoCycle(long long num_batches, long long batch_size, long long* h_payouts_accumulated, long long* h_hits_accumulated, int* h_reelSize, int offset) {
    if (num_batches <= 0) {
        return; // nothing to compute and no output slots to fill
    }

    const size_t result_bytes = static_cast<size_t>(num_batches) * sizeof(long long);
    const long long blocks_needed = (num_batches + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    long long* d_payouts_accumulated = NULL;
    long long* d_hits_accumulated = NULL;
    int* d_reelSize = NULL;

    bool ok = true;

    // 2^31 - 1 is the largest x-dimension a grid may have.
    if (blocks_needed > 0x7fffffffLL) {
        fprintf(stderr, "runDoCycle: %lld batches need more blocks than a 1D grid allows\n", num_batches);
        ok = false;
    }

    // Each step runs only while all earlier steps have succeeded.
    ok = ok && cudaCallSucceeded(cudaMalloc(&d_payouts_accumulated, result_bytes), "cudaMalloc(payouts)");
    ok = ok && cudaCallSucceeded(cudaMalloc(&d_hits_accumulated, result_bytes), "cudaMalloc(hits)");
    ok = ok && cudaCallSucceeded(cudaMalloc(&d_reelSize, NUM_REELS * sizeof(int)), "cudaMalloc(reelSize)");
    ok = ok && cudaCallSucceeded(cudaMemcpy(d_reelSize, h_reelSize, NUM_REELS * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(reelSize)");

    // The result buffers are not uploaded: the kernel writes every slot
    // below num_batches without reading it first.
    if (ok) {
        const unsigned int blocks = static_cast<unsigned int>(blocks_needed);
        evaluateCombo << <blocks, THREADS_PER_BLOCK >> > (num_batches, batch_size, d_payouts_accumulated, d_hits_accumulated, d_reelSize, offset);
        // Launch-configuration errors show up here ...
        ok = cudaCallSucceeded(cudaGetLastError(), "evaluateCombo launch");
        // ... while faults during execution show up at the next synchronising call.
        ok = ok && cudaCallSucceeded(cudaDeviceSynchronize(), "evaluateCombo execution");
    }

    ok = ok && cudaCallSucceeded(cudaMemcpy(h_payouts_accumulated, d_payouts_accumulated, result_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(payouts)");
    ok = ok && cudaCallSucceeded(cudaMemcpy(h_hits_accumulated, d_hits_accumulated, result_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hits)");

    if (!ok) {
        // Leave well-defined values behind instead of whatever the host
        // buffers happened to contain.
        for (long long i = 0; i < num_batches; ++i) {
            h_payouts_accumulated[i] = 0;
            h_hits_accumulated[i] = 0;
        }
    }

    // cudaFree accepts null pointers, so this is safe after a partial setup.
    cudaFree(d_payouts_accumulated);
    cudaFree(d_hits_accumulated);
    cudaFree(d_reelSize);
}