Symptom
I am observing a CUDA Memory Checker failure that complains about a misaligned 8-byte load, even though I believe the load is properly aligned. In fact, the address shown in the error message itself looks aligned to me.
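To be concrete about why I think it is aligned: reading the Address column in the log below as hexadecimal (which is my assumption about how the memory checker prints it), the reported address is a multiple of 8, which should be fine for an 8-byte load. A trivial host-side check of that arithmetic (not part of the project, just for illustration):

#include <cstdint>
#include <iostream>

int main() {
    // Address copied from the memory checker output, read as hexadecimal.
    const uint64_t addr = 0x209820590ull;
    // An 8-byte load is aligned when the address is a multiple of 8.
    std::cout << (addr % 8) << std::endl; // prints 0, i.e. 8-byte aligned
    return 0;
}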
Log
GPU State:
Address Size Type Mem Block Thread blockIdx threadIdx PC Source
-----------------------------------------------------------------------------------------------------------------------------------
209820590 8 mis ld g 36 224 {36,0,0} {224,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 225 {36,0,0} {225,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 226 {36,0,0} {226,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 227 {36,0,0} {227,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 228 {36,0,0} {228,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 229 {36,0,0} {229,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 230 {36,0,0} {230,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 231 {36,0,0} {231,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 232 {36,0,0} {232,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 233 {36,0,0} {233,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 234 {36,0,0} {234,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 235 {36,0,0} {235,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 236 {36,0,0} {236,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 237 {36,0,0} {237,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 238 {36,0,0} {238,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 239 {36,0,0} {239,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 240 {36,0,0} {240,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 241 {36,0,0} {241,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 242 {36,0,0} {242,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 243 {36,0,0} {243,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 244 {36,0,0} {244,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 245 {36,0,0} {245,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 246 {36,0,0} {246,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 247 {36,0,0} {247,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 248 {36,0,0} {248,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 249 {36,0,0} {249,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 250 {36,0,0} {250,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 251 {36,0,0} {251,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 252 {36,0,0} {252,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 253 {36,0,0} {253,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 254 {36,0,0} {254,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
209820590 8 mis ld g 36 255 {36,0,0} {255,0,0} _Z12sieve_kernelIyEvT_PhS0_PKS0_S0_+000630 c:\users\spyffe\documents\visual studio 2015\projects\projecteuler\euler10\euler10.cu:30
Configuration
I am using the following software:
- Microsoft Visual Studio Community 2015 (version 14.0.25431.01 Update 3)
- NVIDIA Nsight Visual Studio Edition 5.2.0.16223
- NVIDIA CUDA compiler v8.0
- NVIDIA driver 378.92 (WHQL)
- Windows 10
The hardware is a single-GPU machine with a Titan X (Pascal).
Source code
This program is my attempt at a solution to Project Euler problem 10, implementing a prime sieve for the numbers up to 2,000,000. It’s a work in progress.
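For cross-checking the final answer, a plain single-threaded CPU sieve is enough; this is a quick sketch I use only as a reference, not part of the project files:

#include <cstdint>
#include <iostream>
#include <vector>

// Straightforward sieve of Eratosthenes on the host, used only to
// cross-check the sum produced by the GPU version.
int main() {
    const uint64_t limit = 2000000;
    std::vector<uint8_t> is_prime(limit + 1, 1);
    is_prime[0] = is_prime[1] = 0;
    for (uint64_t p = 2; p * p <= limit; ++p) {
        if (is_prime[p]) {
            for (uint64_t m = p * p; m <= limit; m += p) {
                is_prime[m] = 0;
            }
        }
    }
    uint64_t sum = 0;
    for (uint64_t n = 2; n <= limit; ++n) {
        if (is_prime[n]) {
            sum += n;
        }
    }
    std::cout << sum << std::endl;
    return 0;
}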
euler10.cu
#include "cuda_stdlib.h"
#include "cuda_sum.h"
#include <assert.h>
#include <algorithm>
#include <tuple>
template <typename PrimeType>
__global__ void sieve_kernel(
    PrimeType sieve_base,
    uint8_t *sieve,
    PrimeType sieve_size,
    const PrimeType *primes,
    PrimeType num_primes) {
    const PrimeType sieve_offset = cuda::get_thread_id();
    if (sieve_offset >= sieve_size) {
        // Deal with sieve sizes that aren't an exact multiple of the block size
        // This can cause thread divergence in warps in the last block
        return;
    }
    const PrimeType candidate = sieve_base + sieve_offset;
    bool is_prime = true;
    for (PrimeType prime_idx = 0;
         prime_idx < num_primes;
         ++prime_idx) {
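        // I believe this is the load at euler10.cu:30 that the memory checker flags as misaligned.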
        const PrimeType candidate_divisor = primes[prime_idx];
        const PrimeType remainder = candidate % candidate_divisor;
        is_prime &= (remainder != 0);
    }
    sieve[sieve_offset] = (is_prime ? 1 : 0);
}
template <typename PrimeType>
__global__ void sieve_to_counts_kernel(
    PrimeType *counts,
    PrimeType num_counts,
    const uint8_t *sieve,
    PrimeType sieve_size,
    PrimeType sieve_entries_per_count
) {
    const PrimeType count_index = cuda::get_thread_id();
    if (count_index >= num_counts) {
        // Deal with num_counts not an exact multiple of threads_per_block
        return;
    }
    PrimeType count = 0;
    const uint8_t *start_ptr = &sieve[count_index * sieve_entries_per_count];
    const uint8_t *end_ptr = &sieve[(count_index + 1) * sieve_entries_per_count];
    if (end_ptr >= &sieve[sieve_size]) {
        // Deal with sieve sizes that aren't an exact multiple of the block size
        // This can cause thread divergence in warps in the last block
        end_ptr = &sieve[sieve_size];
    }
    for (const uint8_t *sieve_ptr = start_ptr; sieve_ptr < end_ptr; ++sieve_ptr) {
        count += ((PrimeType)*sieve_ptr ? 1 : 0);
    }
    counts[count_index] = count;
}
template <typename PrimeType>
__global__ void counts_to_starts_kernel(
    PrimeType *counts, // modified in place
    PrimeType num_counts
) {
    PrimeType start = 0;
    for (PrimeType *count_ptr = &counts[0]; count_ptr < &counts[num_counts + 1]; ++count_ptr) {
        PrimeType start_for_this_count = start;
        start += (PrimeType)*count_ptr;
        *count_ptr = start_for_this_count;
    }
}
template <typename PrimeType>
__global__ void sieve_to_primes_kernel(
    PrimeType *primes,
    const PrimeType *starts,
    PrimeType num_starts,
    PrimeType sieve_entries_per_count,
    const uint8_t *sieve,
    PrimeType sieve_size,
    PrimeType sieve_base
) {
    const PrimeType start_index = cuda::get_thread_id();
    if (start_index >= num_starts) {
        return; // when num_starts isn't a multiple of threads_per_block
    }
    PrimeType *prime_ptr = &primes[starts[start_index]];
    PrimeType prime_value = sieve_base + (start_index * sieve_entries_per_count);
    for (const uint8_t *sieve_ptr = &sieve[start_index * sieve_entries_per_count];
         sieve_ptr < &sieve[(start_index + 1) * sieve_entries_per_count] &&
         sieve_ptr < &sieve[sieve_size];
         ++sieve_ptr, ++prime_value) {
        if (*sieve_ptr == 1) {
            *prime_ptr = prime_value;
            ++prime_ptr;
        }
    }
}
#define DEBUGGING
template <typename PrimeType>
std::pair<cuda::ptr<PrimeType>, PrimeType> sieve_primes(PrimeType max) {
    struct Mark {
    private:
        PrimeType m;
    public:
        Mark(PrimeType m) { this->m = m; }
        PrimeType max() const { return m; }
        PrimeType size() const { return m + 1; }
    } mark(2);
    cuda::ptr<PrimeType> primes(std::vector<PrimeType>{ 2 });
    PrimeType num_primes = 1;
    // Sieve in segments: each pass covers (mark, min(mark^2, max)] using the
    // primes found so far, then appends the newly found primes to the device array.
    while (mark.max() < max) {
        const Mark new_mark((max / mark.max() >= mark.max()) ? (mark.max() * mark.max()) : max);
        const PrimeType sieve_base = mark.max() + 1;
        const PrimeType sieve_size = (new_mark.size() - mark.size());
        cuda::ptr<uint8_t> sieve(cuda::array_sizeof<uint8_t>(sieve_size));
        {
            const std::pair<dim3, dim3> dims = cuda::get_dimensions(sieve_size);
            sieve_kernel<PrimeType> <<<dims.first, dims.second>>>(
                sieve_base,
                sieve.get(),
                sieve_size,
                primes.get(),
                num_primes);
            if ((cuda::maybe_report_error("cudaDeviceSynchronize failed: ", cudaDeviceSynchronize()) != cudaSuccess) ||
                (cuda::maybe_report_error("sieve_kernel launch failed: ", cudaGetLastError()) != cudaSuccess)) {
                exit(-1);
            }
        }
        mark = new_mark;
        const PrimeType sieve_entries_per_count = 16;
        const PrimeType num_counts = (sieve_size + sieve_entries_per_count - 1) / sieve_entries_per_count;
        // We allocate an extra count so that, when we convert to starts,
        // the first element is 0 and last element contains the # of primes.
        cuda::ptr<PrimeType> counts(cuda::array_sizeof<PrimeType>(num_counts + 1));
        {
            const std::pair<dim3, dim3> dims = cuda::get_dimensions(num_counts);
            sieve_to_counts_kernel<PrimeType> <<<dims.first, dims.second>>> (
                counts.get(),
                num_counts,
                sieve.get(),
                sieve_size,
                sieve_entries_per_count
            );
        }
#ifdef DEBUGGING
        std::vector<PrimeType> primes_local(num_primes);
        cuda::memcpy(primes_local.data(), primes, cuda::array_sizeof<PrimeType>(num_primes));
        std::vector<uint8_t> sieve_local(sieve_size);
        cuda::memcpy(sieve_local.data(), sieve, cuda::array_sizeof<uint8_t>(sieve_size));
        std::vector<PrimeType> counts_local(num_counts);
        cuda::memcpy(counts_local.data(), counts, cuda::array_sizeof<PrimeType>(num_counts));
#endif
        {
            // Serial exclusive prefix sum over the counts; a single thread for now.
            counts_to_starts_kernel<PrimeType> <<<1, 1>>> (
                counts.get(),
                num_counts
            );
        }
#ifdef DEBUGGING
        std::vector<PrimeType> starts_local(num_counts);
        cuda::memcpy(starts_local.data(), counts, cuda::array_sizeof<PrimeType>(num_counts));
#endif
        PrimeType num_additional_primes = 0;
        cuda::maybe_report_error("Couldn't memcpy count from device: ",
            cudaMemcpy(&num_additional_primes, &counts.get()[num_counts], sizeof(PrimeType), cudaMemcpyDeviceToHost));
        const PrimeType new_num_primes = num_primes + num_additional_primes;
        cuda::ptr<PrimeType> new_primes(cuda::array_sizeof<PrimeType>(new_num_primes));
        cuda::memcpy(new_primes, primes, cuda::array_sizeof<PrimeType>(num_primes));
        {
            const std::pair<dim3, dim3> dims = cuda::get_dimensions(num_counts);
            sieve_to_primes_kernel<PrimeType> <<<dims.first, dims.second>>> (
                &new_primes.get()[num_primes],
                counts.get(),
                num_counts,
                sieve_entries_per_count,
                sieve.get(),
                sieve_size,
                sieve_base
            );
        }
#ifdef DEBUGGING
        std::vector<PrimeType> new_primes_local(new_num_primes);
        cuda::memcpy(new_primes_local.data(), new_primes, cuda::array_sizeof<PrimeType>(new_num_primes));
#endif
        primes = std::move(new_primes);
        num_primes = new_num_primes;
    }
    return std::make_pair(std::move(primes), num_primes);
}
int main()
{
    cuda::session session;
    typedef uint64_t PrimeType;
    PrimeType num_primes;
    cuda::ptr<PrimeType> primes;
    std::tie(primes, num_primes) = sieve_primes<PrimeType>(2000000);
    PrimeType sum = cuda::sum<PrimeType>(primes, (size_t)num_primes);
    return 0;
}
cuda_stdlib.h
A small header of helpers so I don't have to write as much boilerplate for device memory allocation, copies, and error checking.
#ifndef CUDA_STDLIB_H
#define CUDA_STDLIB_H
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <vector>
namespace cuda {
    void report_error(const char *str, const cudaError_t error) {
        std::cerr << str << cudaGetErrorString(error);
    }
    cudaError_t maybe_report_error(const char *str, const cudaError_t error) {
        if (error != cudaSuccess) {
            report_error(str, error);
        }
        return error;
    }
    struct session {
        bool v = false;
    public:
        session(int device = 0) {
            v = (maybe_report_error("Couldn't select CUDA device (doesn't exist?): ",
                cudaSetDevice(device)) == cudaSuccess);
        }
        bool valid() {
            return v;
        }
        ~session() {
            // cudaDeviceReset must be called before exiting in order for profiling and
            // tracing tools such as Nsight and Visual Profiler to show complete traces.
            maybe_report_error("Couldn't reset CUDA: ", cudaDeviceReset());
        }
    };
    template <typename T> struct ptr {
    private:
        T *p = nullptr;
    public:
        ptr(size_t size = sizeof(T)) {
            if (maybe_report_error("Couldn't malloc on device: ",
                cudaMalloc(&p, size)) != cudaSuccess) {
                p = nullptr;
            }
        }
        ptr(ptr<T> &&rhs) {
            p = rhs.p;
            rhs.p = nullptr;
        }
        ptr(const std::vector<T> &init) {
            if (maybe_report_error("Couldn't malloc on device: ",
                cudaMalloc(&p, init.size() * sizeof(T))) != cudaSuccess) {
                p = nullptr;
                return;
            }
            if (maybe_report_error("Couldn't memcpy to device: ",
                cudaMemcpy(p, init.data(), init.size() * sizeof(T), cudaMemcpyHostToDevice)) != cudaSuccess) {
                cudaFree(p);
                p = nullptr;
            }
        }
        ptr<T> &operator=(ptr<T> &&rhs) {
            if (p) {
                cudaFree(p);
                p = nullptr;
            }
            p = rhs.p;
            rhs.p = nullptr;
            return *this;
        }
        ~ptr() {
            if (p) {
                cudaFree(p);
                p = nullptr;
            }
        }
        T *get() {
            return p;
        }
        const T *get() const {
            return p;
        }
    };
    template <typename T> cudaError_t memcpy(T *dst, const ptr<T> &src, size_t size) {
        return maybe_report_error("Couldn't memcpy from device: ",
            cudaMemcpy(dst, src.get(), size, cudaMemcpyDeviceToHost));
    }
    template <typename T> cudaError_t memcpy(ptr<T> &dst, const T *src, size_t size) {
        return maybe_report_error("Couldn't memcpy to device: ",
            cudaMemcpy(dst.get(), src, size, cudaMemcpyHostToDevice));
    }
    template <typename T> cudaError_t memcpy(ptr<T> &dst, const ptr<T> &src, size_t size) {
        return maybe_report_error("Couldn't memcpy on device: ",
            cudaMemcpy(dst.get(), src.get(), size, cudaMemcpyDeviceToDevice));
    }
    template <typename T>
    size_t array_sizeof(size_t num_items) {
        return sizeof(T) * num_items;
    }
    const size_t threads_per_block = 512;
    std::pair<dim3, dim3> get_dimensions(size_t num_threads) {
        const dim3 blockDim = num_threads > threads_per_block ? (unsigned int)((num_threads + threads_per_block - 1) / threads_per_block) : 1;
        const dim3 threadDim = num_threads > threads_per_block ? threads_per_block : (unsigned int)num_threads;
        return std::make_pair(blockDim, threadDim);
    }
    __device__ size_t get_thread_id() {
        return ((size_t)blockIdx.x * (size_t)blockDim.x) + threadIdx.x;
    }
}
#endif //CUDA_STDLIB_H
cuda_sum.h
This one in particular is a work in progress: I intend to make it a parallel sum, but for now it is a single-threaded placeholder for debugging purposes (a rough sketch of the parallel version I have in mind is at the end of this post).
#ifndef CUDA_SUM_H
#define CUDA_SUM_H
#include "cuda_stdlib.h"
template <typename T> __global__ void sum_kernel(T *work, uint8_t *overflow, const T *inputs, size_t input_size) {
    size_t tid = cuda::get_thread_id();
    if (tid == 0) {
        // Placeholder: only thread 0 does any work; it sums the inputs serially and flags wraparound.
        T sum = 0;
        *overflow = 0;
        for (size_t i = 0; i < input_size; ++i) {
            T new_sum = sum + inputs[i];
            if (new_sum < sum) {
                *overflow = 1;
            }
            sum = new_sum;
        }
        work[0] = sum;
    }
}
namespace cuda {
    template <typename T> T sum(const ptr<T> &inputs, const size_t num_inputs) {
        ptr<T> work(array_sizeof<T>(num_inputs));
        ptr<uint8_t> overflow(sizeof(uint8_t));
        std::pair<dim3, dim3> dimensions = get_dimensions(num_inputs);
        sum_kernel <<<dimensions.first, dimensions.second>>> (work.get(), overflow.get(), inputs.get(), num_inputs);
        T ret;
        cuda::memcpy(&ret, work, sizeof(T));
        uint8_t overflow_local = 0;
        cuda::memcpy(&overflow_local, overflow, sizeof(uint8_t));
        return ret;
    }
}
#endif
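Not part of the files above, but for context, this is roughly the parallel version I intend to replace the placeholder with: a shared-memory tree reduction per block, with each block atomically adding its partial sum into a single device result cell. It is an untested sketch (partial_sum_kernel is a placeholder name); it assumes 512-thread blocks and a uint64_t sum so that the unsigned long long atomicAdd overload applies, and the overflow flag would still need separate handling.

#include "cuda_stdlib.h"
#include <cstdint>

// Untested sketch of the intended parallel sum. The result cell must be
// zeroed (e.g. with cudaMemset) before launch; launch with 512-thread blocks:
//   partial_sum_kernel<<<(input_size + 511) / 512, 512>>>(result, inputs, input_size);
__global__ void partial_sum_kernel(uint64_t *result, const uint64_t *inputs, size_t input_size) {
    __shared__ uint64_t partial[512]; // one slot per thread; assumes blockDim.x == 512
    const size_t tid = cuda::get_thread_id();
    // Threads past the end contribute 0 so every thread can take part in the reduction.
    partial[threadIdx.x] = (tid < input_size) ? inputs[tid] : 0;
    __syncthreads();
    // Tree reduction in shared memory: halve the number of active threads each step.
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (threadIdx.x < stride) {
            partial[threadIdx.x] += partial[threadIdx.x + stride];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        // Accumulate this block's partial sum into the single result cell.
        atomicAdd((unsigned long long *)result, (unsigned long long)partial[0]);
    }
}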