I have attached the test code.
kernel.cu
#include "atoms.h"
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
// Macro for checking CUDA errors
#define CUDA_CHECK(err) do { \
if ((err) != cudaSuccess) { \
fprintf(stderr, "CUDA Error: %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while(0)
// Kernel to print Atom field values on GPU
__global__ void test_atom_kernel(const Atom* atoms, size_t n) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) return;
if (idx < 5) {
printf("GPU Atom %zu: x=%.6f, y=%.6f, z=%.6f, q=%.6f, r=%.6f, elem=%d\n",
idx, atoms[idx].x, atoms[idx].y, atoms[idx].z, atoms[idx].q, atoms[idx].r, atoms[idx].elem);
}
}
int main() {
// Print size of Atom struct on CPU
printf("Sizeof(Atom) on CPU: %zu bytes\n", sizeof(Atom));
// Initialize test data (inspired by 5LUQ.pdb output)
const size_t n = 5;
Atom* h_atoms = (Atom*)malloc(n * sizeof(Atom));
if (!h_atoms) {
fprintf(stderr, "Failed to allocate host memory\n");
return 1;
}
// Sample data from 5LUQ.pdb
h_atoms[0] = { 33.630f, -36.625f, 73.092f, -0.050f, 1.55f, N };
h_atoms[1] = { 32.299f, -37.130f, 72.782f, 0.000f, 1.70f, C };
h_atoms[2] = { 31.805f, -37.978f, 73.943f, 0.000f, 1.70f, C };
h_atoms[3] = { 30.866f, -37.614f, 74.649f, -0.050f, 1.52f, O };
h_atoms[4] = { 32.326f, -37.952f, 71.495f, 0.000f, 1.70f, C };
// Print data on CPU
for (size_t i = 0; i < n; ++i) {
printf("CPU Atom %zu: x=%.6f, y=%.6f, z=%.6f, q=%.6f, r=%.6f, elem=%d\n",
i, h_atoms[i].x, h_atoms[i].y, h_atoms[i].z, h_atoms[i].q, h_atoms[i].r, h_atoms[i].elem);
}
// Allocate memory on GPU
Atom* d_atoms = nullptr;
CUDA_CHECK(cudaMalloc(&d_atoms, n * sizeof(Atom)));
// Copy data to GPU
CUDA_CHECK(cudaMemcpy(d_atoms, h_atoms, n * sizeof(Atom), cudaMemcpyHostToDevice));
// Verify data after transfer
Atom* h_atoms_check = (Atom*)malloc(n * sizeof(Atom));
if (!h_atoms_check) {
fprintf(stderr, "Failed to allocate host check memory\n");
CUDA_CHECK(cudaFree(d_atoms));
free(h_atoms);
return 1;
}
CUDA_CHECK(cudaMemcpy(h_atoms_check, d_atoms, n * sizeof(Atom), cudaMemcpyDeviceToHost));
for (size_t i = 0; i < n; ++i) {
printf("Post-cudaMemcpy check: atom %zu, x=%.6f, y=%.6f, z=%.6f, q=%.6f, r=%.6f, elem=%d\n",
i, h_atoms_check[i].x, h_atoms_check[i].y, h_atoms_check[i].z, h_atoms_check[i].q, h_atoms_check[i].r, h_atoms_check[i].elem);
}
// Launch test kernel
size_t threads = 256;
size_t blocks = (n + threads - 1) / threads;
test_atom_kernel << <blocks, threads >> > (d_atoms, n);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
// Free memory
CUDA_CHECK(cudaFree(d_atoms));
free(h_atoms);
free(h_atoms_check);
return 0;
}
atoms.h
#ifndef ATOMS_H
#define ATOMS_H
#include <cstdint>
#include <cstddef>
// Enforce 1-byte alignment for CPU
#pragma pack(push, 1)
// Enum without explicit alignment (int32_t ensures 4 bytes)
enum Element : int32_t {
UNKNOWN = 0,
C,
N,
O,
S,
F,
H
};
// Atom structure with explicit alignment
#ifdef __CUDACC__
#define CUDA_ALIGN(n) __align__(n)
#else
#define CUDA_ALIGN(n)
#endif
struct CUDA_ALIGN(16) Atom {
float x; // X-coordinate (Å)
float y; // Y-coordinate (Å)
float z; // Z-coordinate (Å)
float q; // Partial charge (e)
float r; // Radius (Å)
Element elem; // Chemical element (int32_t)
int32_t pad[2]; // Padding to align to 32 bytes
};
// Restore default alignment
#pragma pack(pop)
// Verify structure size
static_assert(sizeof(Atom) == 32, "Atom must be exactly 32 bytes");
// Ligand representation for CPU code
struct Ligand {
Atom* atoms; // Pointer to Atom array
size_t atom_count; // Number of atoms
float delta_G; // Binding energy (kJ/mol)
float mw; // Molecular weight
char name[32]; // Ligand identifier (e.g., CID)
};
#endif // ATOMS_H
Console output:
Sizeof(Atom) on CPU: 32 bytes
CPU Atom 0: x=33.630001, y=-36.625000, z=73.092003, q=-0.050000, r=1.550000, elem=2
CPU Atom 1: x=32.299000, y=-37.130001, z=72.781998, q=0.000000, r=1.700000, elem=1
CPU Atom 2: x=31.805000, y=-37.978001, z=73.943001, q=0.000000, r=1.700000, elem=1
CPU Atom 3: x=30.865999, y=-37.613998, z=74.649002, q=-0.050000, r=1.520000, elem=3
CPU Atom 4: x=32.326000, y=-37.952000, z=71.495003, q=0.000000, r=1.700000, elem=1
Post-cudaMemcpy check: atom 0, x=33.630001, y=-36.625000, z=73.092003, q=-0.050000, r=1.550000, elem=2
Post-cudaMemcpy check: atom 1, x=32.299000, y=-37.130001, z=72.781998, q=0.000000, r=1.700000, elem=1
Post-cudaMemcpy check: atom 2, x=31.805000, y=-37.978001, z=73.943001, q=0.000000, r=1.700000, elem=1
Post-cudaMemcpy check: atom 3, x=30.865999, y=-37.613998, z=74.649002, q=-0.050000, r=1.520000, elem=3
Post-cudaMemcpy check: atom 4, x=32.326000, y=-37.952000, z=71.495003, q=0.000000, r=1.700000, elem=1
GPU Atom 0: x=0.000000, y=33.630001, z=-36.625000, q=73.092003, r=-0.050000, elem=-1073741824
GPU Atom 1: x=0.000000, y=32.299000, z=-37.130001, q=72.781998, r=0.000000, elem=1073741824
GPU Atom 2: x=0.000000, y=31.805000, z=-37.978001, q=73.943001, r=0.000000, elem=1073741824
GPU Atom 3: x=0.000000, y=30.865999, z=-37.613998, q=74.649002, r=-0.050000, elem=-2147483648
GPU Atom 4: x=0.000000, y=32.326000, z=-37.952000, q=71.495003, r=0.000000, elem=1073741824
Problem: The GPU kernel (test_atom_kernel) reads incorrect values:
x is always 0.0 (likely reading padding or invalid memory).
y reads CPU x (e.g., 33.630001 for atom 0).
z reads CPU y (e.g., -36.625000).
q reads CPU z (e.g., 73.092003).
r reads CPU q (e.g., -0.050000).
elem reads garbage values (e.g., -1073741824, 1073741824, -2147483648), indicating access to uninitialized or invalid memory.
This matches the alignment mismatch issue seen in your previous outputs, where fields are shifted (e.g., y reads x, z reads y), suggesting that the align(16) and padding did not resolve the issue.