Resolving CUDA struct alignment mismatch for Atom in atoms.h

I’m facing an issue with a CUDA program where the Atom struct in atoms.h is misaligned between the host (MSVC) and device (NVCC), causing incorrect data access in the gb_energy_kernel. The struct is defined as:

#pragma pack(push, 1)
struct Atom {
    float x, y, z;  // Coordinates (Å)
    float q;        // Partial charge (e)
    float r;        // Radius (Å)
    Element elem;   // Enum (int32_t)
};
#pragma pack(pop)
static_assert(sizeof(Atom) == 24, "Atom must be 24 bytes");

On the host, GB input prep logs show correct values (e.g., x=33.630, y=-36.625, z=73.092, q=-0.050, r=1.55), but in the kernel, atoms[i].x and atoms[i].y are read as 0, q is read as y (e.g., -37.130), and r as q (e.g., -0.050). The index j is also invalid (e.g., 1482630944912 for 59694 atoms). Using #pragma pack(push, 1) didn’t resolve the issue, suggesting NVCC interprets the struct differently.

How can I ensure consistent Atom struct alignment between host and device? Should I use __attribute__((packed, aligned(1))), manual padding to 32 bytes, or another approach? Any advice on verifying data after cudaMemcpy or debugging the kernel’s struct access?

Cannot reproduce (see the simple test case below). Please post a complete, minimal, self-contained reproducer that others can compile and run.

On the GPU, all data must be naturally aligned. This means the alignment for this struct must be at least 4.
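
For instance, compile-time checks along these lines could be added to atoms.h right after the struct definition (a sketch, assuming the 24-byte packed definition from the question). Because the header is compiled by both MSVC (host) and NVCC (device pass), any disagreement about the layout would then show up as a build error rather than as shifted kernel data:

#include <cstddef>   // offsetof

static_assert(offsetof(Atom, x)    ==  0, "unexpected offset of x");
static_assert(offsetof(Atom, y)    ==  4, "unexpected offset of y");
static_assert(offsetof(Atom, z)    ==  8, "unexpected offset of z");
static_assert(offsetof(Atom, q)    == 12, "unexpected offset of q");
static_assert(offsetof(Atom, r)    == 16, "unexpected offset of r");
static_assert(offsetof(Atom, elem) == 20, "unexpected offset of elem");
static_assert(sizeof(Atom)         == 24, "unexpected size of Atom");

The runtime test below prints the same layout information from both the host and the device: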

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stddef.h>   // offsetof

enum Element { FOO, BAR, BAZ};

struct Atom {
    float x, y, z;  // Coordinates (Å)
    float q;        // Partial charge (e)
    float r;        // Radius (Å)
    Element elem;   // Enum (int32_t)
};

__global__ void kernel (struct Atom x)
{
    printf ("GPU: sizeof Atom = %llu\n", sizeof x);
    printf ("GPU: offsetof x = %d\n", offsetof (struct Atom, x));
    printf ("GPU: offsetof y = %d\n", offsetof (struct Atom, y));
    printf ("GPU: offsetof z = %d\n", offsetof (struct Atom, z));
    printf ("GPU: offsetof q = %d\n", offsetof (struct Atom, q));
    printf ("GPU: offsetof r = %d\n", offsetof (struct Atom, r));
    printf ("GPU: offsetof elem = %d\n", offsetof (struct Atom, elem));
    printf ("GPU: x=%15.8e y=%15.8e z=%15.8e q=%15.8e r=%15.8e elem=%x\n",
            x.x, x.y, x.z, x.q, x.r, x.elem);
}

int main (void)
{
    struct Atom test = {33.360f, -36.625f, 73.092f, -0.050f, 1.55f};
    test.elem = BAZ;
    printf ("CPU: sizeof Atom = %llu\n", sizeof test);
    printf ("CPU: offsetof x = %d\n", offsetof (struct Atom, x));
    printf ("CPU: offsetof y = %d\n", offsetof (struct Atom, y));
    printf ("CPU: offsetof z = %d\n", offsetof (struct Atom, z));
    printf ("CPU: offsetof q = %d\n", offsetof (struct Atom, q));
    printf ("CPU: offsetof r = %d\n", offsetof (struct Atom, r));
    printf ("CPU: offsetof elem = %d\n", offsetof (struct Atom, elem));
    printf ("CPU: x=%15.8e y=%15.8e z=%15.8e q=%15.8e r=%15.8e elem=%x\n",
            test.x, test.y, test.z, test.q, test.r, test.elem);

    kernel<<<1,1>>>(test);
    cudaDeviceSynchronize();
    return EXIT_SUCCESS;
}

This prints:

CPU: sizeof Atom = 24
CPU: offsetof x = 0
CPU: offsetof y = 4
CPU: offsetof z = 8
CPU: offsetof q = 12
CPU: offsetof r = 16
CPU: offsetof elem = 20
CPU: x=3.33600006e+001 y=-3.66250000e+001 z=7.30920029e+001 q=-5.00000007e-002 r=1.54999995e+000 elem=2
GPU: sizeof Atom = 24
GPU: offsetof x = 0
GPU: offsetof y = 4
GPU: offsetof z = 8
GPU: offsetof q = 12
GPU: offsetof r = 16
GPU: offsetof elem = 20
GPU: x= 3.33600006e+01 y=-3.66250000e+01 z= 7.30920029e+01 q=-5.00000007e-02 r= 1.54999995e+00 elem=2

I have attached my test code (kernel.cu and atoms.h) below, along with the console output it produces.

kernel.cu

#include "atoms.h"
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Macro for checking CUDA errors
#define CUDA_CHECK(err) do { \
    if ((err) != cudaSuccess) { \
        fprintf(stderr, "CUDA Error: %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

// Kernel to print Atom field values on GPU
__global__ void test_atom_kernel(const Atom* atoms, size_t n) {
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    if (idx < 5) {
        printf("GPU Atom %zu: x=%.6f, y=%.6f, z=%.6f, q=%.6f, r=%.6f, elem=%d\n",
            idx, atoms[idx].x, atoms[idx].y, atoms[idx].z, atoms[idx].q, atoms[idx].r, atoms[idx].elem);
    }
}

int main() {
    // Print size of Atom struct on CPU
    printf("Sizeof(Atom) on CPU: %zu bytes\n", sizeof(Atom));

    // Initialize test data (inspired by 5LUQ.pdb output)
    const size_t n = 5;
    Atom* h_atoms = (Atom*)malloc(n * sizeof(Atom));
    if (!h_atoms) {
        fprintf(stderr, "Failed to allocate host memory\n");
        return 1;
    }

    // Sample data from 5LUQ.pdb
    h_atoms[0] = { 33.630f, -36.625f, 73.092f, -0.050f, 1.55f, N };
    h_atoms[1] = { 32.299f, -37.130f, 72.782f, 0.000f, 1.70f, C };
    h_atoms[2] = { 31.805f, -37.978f, 73.943f, 0.000f, 1.70f, C };
    h_atoms[3] = { 30.866f, -37.614f, 74.649f, -0.050f, 1.52f, O };
    h_atoms[4] = { 32.326f, -37.952f, 71.495f, 0.000f, 1.70f, C };

    // Print data on CPU
    for (size_t i = 0; i < n; ++i) {
        printf("CPU Atom %zu: x=%.6f, y=%.6f, z=%.6f, q=%.6f, r=%.6f, elem=%d\n",
            i, h_atoms[i].x, h_atoms[i].y, h_atoms[i].z, h_atoms[i].q, h_atoms[i].r, h_atoms[i].elem);
    }

    // Allocate memory on GPU
    Atom* d_atoms = nullptr;
    CUDA_CHECK(cudaMalloc(&d_atoms, n * sizeof(Atom)));

    // Copy data to GPU
    CUDA_CHECK(cudaMemcpy(d_atoms, h_atoms, n * sizeof(Atom), cudaMemcpyHostToDevice));

    // Verify data after transfer
    Atom* h_atoms_check = (Atom*)malloc(n * sizeof(Atom));
    if (!h_atoms_check) {
        fprintf(stderr, "Failed to allocate host check memory\n");
        CUDA_CHECK(cudaFree(d_atoms));
        free(h_atoms);
        return 1;
    }
    CUDA_CHECK(cudaMemcpy(h_atoms_check, d_atoms, n * sizeof(Atom), cudaMemcpyDeviceToHost));
    for (size_t i = 0; i < n; ++i) {
        printf("Post-cudaMemcpy check: atom %zu, x=%.6f, y=%.6f, z=%.6f, q=%.6f, r=%.6f, elem=%d\n",
            i, h_atoms_check[i].x, h_atoms_check[i].y, h_atoms_check[i].z, h_atoms_check[i].q, h_atoms_check[i].r, h_atoms_check[i].elem);
    }

    // Launch test kernel
    size_t threads = 256;
    size_t blocks = (n + threads - 1) / threads;
    test_atom_kernel << <blocks, threads >> > (d_atoms, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Free memory
    CUDA_CHECK(cudaFree(d_atoms));
    free(h_atoms);
    free(h_atoms_check);

    return 0;
}

atoms.h

#ifndef ATOMS_H
#define ATOMS_H

#include <cstdint>
#include <cstddef>

// Enforce 1-byte alignment for CPU
#pragma pack(push, 1)

// Enum without explicit alignment (int32_t ensures 4 bytes)
enum Element : int32_t {
    UNKNOWN = 0,
    C,
    N,
    O,
    S,
    F,
    H
};

// Atom structure with explicit alignment
#ifdef __CUDACC__
#define CUDA_ALIGN(n) __align__(n)
#else
#define CUDA_ALIGN(n)
#endif

struct CUDA_ALIGN(16) Atom {
    float x;        // X-coordinate (Å)
    float y;        // Y-coordinate (Å)
    float z;        // Z-coordinate (Å)
    float q;        // Partial charge (e)
    float r;        // Radius (Å)
    Element elem;   // Chemical element (int32_t)
    int32_t pad[2]; // Padding to align to 32 bytes
};

// Restore default alignment
#pragma pack(pop)

// Verify structure size
static_assert(sizeof(Atom) == 32, "Atom must be exactly 32 bytes");

// Ligand representation for CPU code
struct Ligand {
    Atom* atoms;          // Pointer to Atom array
    size_t atom_count;    // Number of atoms
    float delta_G;        // Binding energy (kJ/mol)
    float mw;             // Molecular weight
    char name[32];        // Ligand identifier (e.g., CID)
};

#endif // ATOMS_H

Console output:

Sizeof(Atom) on CPU: 32 bytes
CPU Atom 0: x=33.630001, y=-36.625000, z=73.092003, q=-0.050000, r=1.550000, elem=2
CPU Atom 1: x=32.299000, y=-37.130001, z=72.781998, q=0.000000, r=1.700000, elem=1
CPU Atom 2: x=31.805000, y=-37.978001, z=73.943001, q=0.000000, r=1.700000, elem=1
CPU Atom 3: x=30.865999, y=-37.613998, z=74.649002, q=-0.050000, r=1.520000, elem=3
CPU Atom 4: x=32.326000, y=-37.952000, z=71.495003, q=0.000000, r=1.700000, elem=1
Post-cudaMemcpy check: atom 0, x=33.630001, y=-36.625000, z=73.092003, q=-0.050000, r=1.550000, elem=2
Post-cudaMemcpy check: atom 1, x=32.299000, y=-37.130001, z=72.781998, q=0.000000, r=1.700000, elem=1
Post-cudaMemcpy check: atom 2, x=31.805000, y=-37.978001, z=73.943001, q=0.000000, r=1.700000, elem=1
Post-cudaMemcpy check: atom 3, x=30.865999, y=-37.613998, z=74.649002, q=-0.050000, r=1.520000, elem=3
Post-cudaMemcpy check: atom 4, x=32.326000, y=-37.952000, z=71.495003, q=0.000000, r=1.700000, elem=1
GPU Atom 0: x=0.000000, y=33.630001, z=-36.625000, q=73.092003, r=-0.050000, elem=-1073741824
GPU Atom 1: x=0.000000, y=32.299000, z=-37.130001, q=72.781998, r=0.000000, elem=1073741824
GPU Atom 2: x=0.000000, y=31.805000, z=-37.978001, q=73.943001, r=0.000000, elem=1073741824
GPU Atom 3: x=0.000000, y=30.865999, z=-37.613998, q=74.649002, r=-0.050000, elem=-2147483648
GPU Atom 4: x=0.000000, y=32.326000, z=-37.952000, q=71.495003, r=0.000000, elem=1073741824

Problem: The GPU kernel (test_atom_kernel) reads incorrect values:

x is always 0.0 (likely reading padding or invalid memory).
y reads CPU x (e.g., 33.630001 for atom 0).
z reads CPU y (e.g., -36.625000).
q reads CPU z (e.g., 73.092003).
r reads CPU q (e.g., -0.050000).
elem reads garbage values (e.g., -1073741824, 1073741824, -2147483648), indicating access to uninitialized or invalid memory.

This matches the alignment-mismatch symptoms seen in my original gb_energy_kernel output, where fields are shifted (e.g., y reads x, z reads y), suggesting that the align(16) and the extra padding did not resolve the issue.

While I am setting up your code to try it: what happens if you avoid manually interfering with alignment and packing and just let the compiler do its thing? Something like the sketch below.
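
(A sketch of atoms.h with the #pragma pack and CUDA_ALIGN directives removed; field order unchanged, <cstdint> still included.)

// Plain definitions: let MSVC and NVCC use their natural layout.
enum Element : int32_t { UNKNOWN = 0, C, N, O, S, F, H };

struct Atom {
    float x;        // X-coordinate (Å)
    float y;        // Y-coordinate (Å)
    float z;        // Z-coordinate (Å)
    float q;        // Partial charge (e)
    float r;        // Radius (Å)
    Element elem;   // Chemical element (int32_t)
};

// Five floats plus an int32_t enum give 24 bytes and 4-byte alignment
// naturally, on both compilers, as the test program above demonstrates.
static_assert(sizeof(Atom) == 24, "Atom layout differs between host and device compilers");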

The problem seems to be here:

    printf("GPU Atom %zu: x=%.6f, y=%.6f, z=%.6f, q=%.6f, r=%.6f, elem=%d\n",
        idx, atoms[idx].x, atoms[idx].y, atoms[idx].z, atoms[idx].q, atoms[idx].r, atoms[idx].elem);

If I change the size_t variables to int and change this format specifier to %d, the output from the device-side printf() is as expected. So the problem seems to be that device-side printf() assumes the wrong size of the data associated with %zu, and thus grabs the data for the following format specifiers from the wrong offset in its buffer.

Specifically, the output suggests that %zu is assumed to refer to 4 bytes of data rather than 8. My working hypotheses are therefore that (1) either %zu is not a valid format specifier for device-side printf(), or (2) that there is a bug in handling this format specifier. You could check the documentation to decide which hypothesis applies.
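
A minimal, self-contained sketch to test hypothesis (2) in isolation (the zu_test kernel and its arguments are purely illustrative):

#include <cstdio>

__global__ void zu_test(size_t idx, float val)
{
    // If %zu is consumed as 4 bytes instead of 8, 'val' is fetched from the
    // wrong offset in the printf buffer and the second field prints garbage.
    printf("with %%zu : idx=%zu val=%f\n", idx, val);
    // Casting to unsigned long long and using the supported 'll' size
    // specifier reads the arguments from the expected offsets.
    printf("with %%llu: idx=%llu val=%f\n", (unsigned long long)idx, val);
}

int main()
{
    zu_test<<<1,1>>>(42, 1.55f);
    cudaDeviceSynchronize();
    return 0;
}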

[Later:]

Changing size_t to unsigned long long int to retain the desired value range and printing with the %llu format specifier fixes the device-side output as well.

Note: GPUs are 32-bit processors with 64-bit addressing capability; unnecessary use of 64-bit integer types can have a negative impact on performance.
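
For illustration, a sketch of the kernel with the index kept as size_t and only the printf argument cast; the fixed code below instead switches the index to int, which also avoids the 64-bit arithmetic mentioned above:

#include "atoms.h"
#include <cstdio>

// Sketch: keep the size_t index, but cast the printf argument so the
// supported 'll' size specifier is used instead of the unsupported 'z'.
__global__ void test_atom_kernel(const Atom* atoms, size_t n) {
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    if (idx < 5) {
        printf("GPU Atom %llu: x=%.6f, y=%.6f, z=%.6f, q=%.6f, r=%.6f, elem=%d\n",
            (unsigned long long)idx,
            atoms[idx].x, atoms[idx].y, atoms[idx].z, atoms[idx].q, atoms[idx].r, atoms[idx].elem);
    }
}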

[Even later:]

Per the CUDA Programming Guide, “%zu” is not a supported format specifier for device-side printf. You may wish to file a feature request with NVIDIA to add this support.

10.35.1. Format Specifiers
As for standard printf(), format specifiers take the form: %[flags][width][.precision][size]type
The following fields are supported (see widely-available documentation for a complete description of all behaviors):
▶ Flags: ‘#’ ‘ ’ ‘0’ ‘+’ ‘-’
▶ Width: ‘*’ ‘0-9’
▶ Precision: ‘0-9’
▶ Size: ‘h’ ‘l’ ‘ll’
▶ Type: “%cdiouxXpeEfgGaAs”


Thank you for your help and advice, which fixed my problem.

The test program now works as it should:

Sizeof(Atom) on CPU: 32 bytes
CPU Atom 0: x=33.630001, y=-36.625000, z=73.092003, q=-0.050000, r=1.550000, elem=2
CPU Atom 1: x=32.299000, y=-37.130001, z=72.781998, q=0.000000, r=1.700000, elem=1
CPU Atom 2: x=31.805000, y=-37.978001, z=73.943001, q=0.000000, r=1.700000, elem=1
CPU Atom 3: x=30.865999, y=-37.613998, z=74.649002, q=-0.050000, r=1.520000, elem=3
CPU Atom 4: x=32.326000, y=-37.952000, z=71.495003, q=0.000000, r=1.700000, elem=1
Post-cudaMemcpy check: atom 0, x=33.630001, y=-36.625000, z=73.092003, q=-0.050000, r=1.550000, elem=2
Post-cudaMemcpy check: atom 1, x=32.299000, y=-37.130001, z=72.781998, q=0.000000, r=1.700000, elem=1
Post-cudaMemcpy check: atom 2, x=31.805000, y=-37.978001, z=73.943001, q=0.000000, r=1.700000, elem=1
Post-cudaMemcpy check: atom 3, x=30.865999, y=-37.613998, z=74.649002, q=-0.050000, r=1.520000, elem=3
Post-cudaMemcpy check: atom 4, x=32.326000, y=-37.952000, z=71.495003, q=0.000000, r=1.700000, elem=1
GPU Atom 0: x=33.630001, y=-36.625000, z=73.092003, q=-0.050000, r=1.550000, elem=2
GPU Atom 1: x=32.299000, y=-37.130001, z=72.781998, q=0.000000, r=1.700000, elem=1
GPU Atom 2: x=31.805000, y=-37.978001, z=73.943001, q=0.000000, r=1.700000, elem=1
GPU Atom 3: x=30.865999, y=-37.613998, z=74.649002, q=-0.050000, r=1.520000, elem=3
GPU Atom 4: x=32.326000, y=-37.952000, z=71.495003, q=0.000000, r=1.700000, elem=1

kernel.cu

#include "atoms.h"
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Macro for checking CUDA errors
#define CUDA_CHECK(err) do { \
    if ((err) != cudaSuccess) { \
        fprintf(stderr, "CUDA Error: %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

// Kernel to print Atom field values on GPU
__global__ void test_atom_kernel(const Atom* atoms, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    if (idx < 5) {
        printf("GPU Atom %d: x=%.6f, y=%.6f, z=%.6f, q=%.6f, r=%.6f, elem=%d\n",
            idx, atoms[idx].x, atoms[idx].y, atoms[idx].z, atoms[idx].q, atoms[idx].r, atoms[idx].elem);
    }
}

int main() {
    // Print size of Atom struct on CPU
    printf("Sizeof(Atom) on CPU: %zu bytes\n", sizeof(Atom));

    // Initialize test data (inspired by 5LUQ.pdb output)
    const int n = 5;
    Atom* h_atoms = (Atom*)malloc(n * sizeof(Atom));
    if (!h_atoms) {
        fprintf(stderr, "Failed to allocate host memory\n");
        return 1;
    }

    // Sample data from 5LUQ.pdb
    h_atoms[0] = { 33.630f, -36.625f, 73.092f, -0.050f, 1.55f, N };
    h_atoms[1] = { 32.299f, -37.130f, 72.782f, 0.000f, 1.70f, C };
    h_atoms[2] = { 31.805f, -37.978f, 73.943f, 0.000f, 1.70f, C };
    h_atoms[3] = { 30.866f, -37.614f, 74.649f, -0.050f, 1.52f, O };
    h_atoms[4] = { 32.326f, -37.952f, 71.495f, 0.000f, 1.70f, C };

    // Print data on CPU
    for (int i = 0; i < n; ++i) {
        printf("CPU Atom %d: x=%.6f, y=%.6f, z=%.6f, q=%.6f, r=%.6f, elem=%d\n",
            i, h_atoms[i].x, h_atoms[i].y, h_atoms[i].z, h_atoms[i].q, h_atoms[i].r, h_atoms[i].elem);
    }

    // Allocate memory on GPU
    Atom* d_atoms = nullptr;
    CUDA_CHECK(cudaMalloc(&d_atoms, n * sizeof(Atom)));

    // Copy data to GPU
    CUDA_CHECK(cudaMemcpy(d_atoms, h_atoms, n * sizeof(Atom), cudaMemcpyHostToDevice));

    // Verify data after transfer
    Atom* h_atoms_check = (Atom*)malloc(n * sizeof(Atom));
    if (!h_atoms_check) {
        fprintf(stderr, "Failed to allocate host check memory\n");
        CUDA_CHECK(cudaFree(d_atoms));
        free(h_atoms);
        return 1;
    }
    CUDA_CHECK(cudaMemcpy(h_atoms_check, d_atoms, n * sizeof(Atom), cudaMemcpyDeviceToHost));
    for (int i = 0; i < n; ++i) {
        printf("Post-cudaMemcpy check: atom %d, x=%.6f, y=%.6f, z=%.6f, q=%.6f, r=%.6f, elem=%d\n",
            i, h_atoms_check[i].x, h_atoms_check[i].y, h_atoms_check[i].z, h_atoms_check[i].q, h_atoms_check[i].r, h_atoms_check[i].elem);
    }

    // Launch test kernel
    int threads = 256;
    int blocks = (n + threads - 1) / threads;
    test_atom_kernel << <blocks, threads >> > (d_atoms, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Free memory
    CUDA_CHECK(cudaFree(d_atoms));
    free(h_atoms);
    free(h_atoms_check);

    return 0;
}

atoms.h is unchanged from the version posted above.