I’ve been working extensively on optimizing and debugging the MCX kernel for multi-GPU execution using OpenACC, but I’m still encountering incorrect results. The issue seems specific to the multi-GPU setup, as the kernel runs correctly on a single GPU.
Observations:
- When running with multiple GPUs, the detected photon data (
detdata.detphotondata
) is consistently zero, even though energy escape and absorption appear normal. - The same kernel, when run on a single GPU, provides expected results (non-zero detected photons).
- Even enforcing explicit atomic updates in OpenACC for
detphotonbuffer
didn’t help.
Possible Issue:
One potential culprit might be detphotonbuffer, which may be getting overwritten across GPUs or not being correctly assigned per photon/thread. However, I haven’t been able to pinpoint the exact reason why this only happens with multiple GPUs.
If anyone has insights into potential causes or best practices for handling device-local buffers in OpenACC multi-GPU environments, I would really appreciate any guidance.
template<const bool isreflect, const bool issavedet>
double MCX_kernel(json& cfg, const MCX_param& gcfg, MCX_volume<int>& inputvol, MCX_volume<float>& outputvol, float4* detpos, MCX_medium* prop, MCX_detect& detdata) {
double total_energyescape = 0.0;
std::srand(!cfg["Session"].contains("RNGSeed") ? 1648335518 : (cfg["Session"]["RNGSeed"].get<int>() > 0 ? cfg["Session"]["RNGSeed"].get<int>() : std::time(0)));
const uint64_t nphoton = cfg["Session"].value("Photons", 1000000);
const dim4 seeds = {(uint32_t)std::rand(), (uint32_t)std::rand(), (uint32_t)std::rand(), (uint32_t)std::rand()};
const float4 pos = {cfg["Optode"]["Source"]["Pos"][0].get<float>(), cfg["Optode"]["Source"]["Pos"][1].get<float>(), cfg["Optode"]["Source"]["Pos"][2].get<float>(), 1.f};
const float4 dir = {cfg["Optode"]["Source"]["Dir"][0].get<float>(), cfg["Optode"]["Source"]["Dir"][1].get<float>(), cfg["Optode"]["Source"]["Dir"][2].get<float>(), 0.f};
// Get number of available GPUs
int num_devices = 1;
#ifdef _OPENACC
num_devices = acc_get_num_devices(acc_get_device_type());
#endif
// Split photons across GPUs
const uint64_t photons_per_gpu = nphoton / num_devices;
// Create GPU buffers once (persistent memory)
std::vector<MCX_volume<float>> gpu_outputs(num_devices, outputvol);
std::vector<MCX_detect> gpu_detdata(num_devices, detdata);
#ifdef _OPENACC
#pragma acc enter data copyin(pos, dir, seeds, gcfg, inputvol, prop[0:gcfg.mediumnum], detpos[0:gcfg.detnum], inputvol.vol[0:inputvol.dimxyzt])
#pragma acc enter data create(outputvol.vol[0:outputvol.dimxyzt], detdata.detphotondata[0:detdata.maxdetphotons*detdata.detphotondatalen])
#endif
#pragma omp parallel for reduction(+:total_energyescape) num_threads(num_devices)
for(int dev = 0; dev < num_devices; ++dev) {
double device_energy = 0.0;
MCX_rand ran(seeds.x, seeds.y, seeds.z, seeds.w);
MCX_photon p(pos, dir, ran, gcfg);
#ifdef GPU_OFFLOAD
#pragma acc set device_num(dev)
#endif
const uint64_t start = dev * photons_per_gpu;
const uint64_t end = (dev == num_devices-1) ? nphoton : start + photons_per_gpu;
#ifdef _OPENACC
float* detphotonbuffer = nullptr; // Declare before OpenACC region
#pragma acc enter data create(detphotonbuffer[0:detdata.ppathlen * issavedet])
#endif
#ifdef GPU_OFFLOAD
#pragma acc parallel loop gang vector collapse(1) reduction(+:device_energy) firstprivate(ran, p)
#endif
for(uint64_t i = start; i < end; ++i) {
#ifdef _OPENACC
#pragma acc data create(detphotonbuffer[0:detdata.ppathlen * issavedet])
{
#pragma acc parallel loop
for (int j = 0; j < detdata.ppathlen * issavedet; j++) {
detphotonbuffer[j] = 0.0f;
}
ran.reseed(seeds.x ^ i, seeds.y | i, seeds.z ^ i, seeds.w | i);
p.launch(pos, dir, ran, gcfg);
p.run<isreflect, issavedet>(inputvol, gpu_outputs[dev], prop, detpos,
gpu_detdata[dev], detphotonbuffer, ran, gcfg);
device_energy += p.pos.w;
if(issavedet){
for(int j = 0; j < detdata.ppathlen * issavedet; j++){
#pragma acc atomic update
gpu_detdata[dev].detphotondata[j + i * detdata.ppathlen * issavedet] += detphotonbuffer[j];
}
}
} // End of OpenACC data region
#endif
}
#ifdef _OPENACC
#pragma acc exit data delete(detphotonbuffer[0:detdata.ppathlen * issavedet])
#endif
total_energyescape += device_energy;
}
#ifdef _OPENACC
// Copy back device results only once
#pragma acc update self(outputvol.vol[0:outputvol.dimxyzt], detdata.detphotondata[0:detdata.maxdetphotons*detdata.detphotondatalen])
// Cleanup GPU memory
#pragma acc exit data delete(pos, dir, seeds, gcfg, inputvol, prop[0:gcfg.mediumnum], detpos[0:gcfg.detnum], inputvol.vol[0:inputvol.dimxyzt])
#pragma acc exit data delete(outputvol.vol[0:outputvol.dimxyzt], detdata.detphotondata[0:detdata.maxdetphotons*detdata.detphotondatalen])
#endif
// Merge results from all GPUs (if necessary)
for(int dev = 0; dev < num_devices; ++dev) {
for(size_t i = 0; i < outputvol.dimxyzt; ++i) {
outputvol.vol[i] += gpu_outputs[dev].vol[i];
}
if(issavedet) {
for(size_t i = 0; i < detdata.maxdetphotons*detdata.detphotondatalen; ++i) {
detdata.detphotondata[i] += gpu_detdata[dev].detphotondata[i];
}
}
}
return total_energyescape;
}