Hi,
I have a problem with misaligned access violation errors. My setup is:
Windows 10 Pro, v1703
NVIDIA GeForce GTX 1080 Ti, driver version 22.21.13.8253 (2017/06/07)
Visual Studio Professional 2015, Version 14.0.25425.01 Update 3
cuda_8.0.61
NVIDIA Nsight Visual Studio Edition 5.3.0.17162
Originally I wanted to compare two image regions. I have reduced the code to a minimal example that still produces the behaviour I don’t understand:
#include "cuda_runtime.h"
#include <stdio.h>
#include <iostream>
// Forward declaration of the host-side helper that uploads `data0` to the
// device and launches testKernel; the definition follows main().
cudaError_t doSomethingInCuda(float *data0, unsigned int width, unsigned int height);
// Minimal reproduction kernel.
//
// For each of the width * height outer iterations, the element data0[50000]
// is read 141 times (inner index -70..70) and accumulated into a local
// double. The accumulated value is never stored, so the kernel has no
// observable output; it exists only to exercise the global-memory load.
//
// Preconditions: `data0` must point to at least 50001 readable floats.
// The kernel does not use threadIdx/blockIdx, so every launched thread
// performs the complete loop nest on its own.
__global__ void testKernel(float *data0, unsigned int width, unsigned int height)
{
    const int kHalfWindow = 70;

    size_t pixel = 0;
    while (pixel < width * height)
    {
        double accumulated = 0.0;
        for (int offset = -kHalfWindow; offset <= kHalfWindow; ++offset)
        {
            // Same fixed element on every iteration; widened to double on read.
            const double sample = data0[50000];
            accumulated += sample;
        }
        ++pixel;
    }
}
// Program entry point.
//
// Allocates a 512 x 1024 float image on the host, fills it with the values
// 0, 1, 2, ..., passes it to doSomethingInCuda(), and then waits for input
// on stdin so the console window stays open.
//
// Returns 0 on success and 1 if the host allocation or any CUDA call
// performed by doSomethingInCuda() failed.
int main()
{
    const unsigned int width = 512;
    const unsigned int height = 1024;
    const size_t dataSize = (size_t)width * height;
    // Byte count is computed in size_t so it cannot wrap in 32-bit arithmetic.
    const size_t memSize = dataSize * sizeof(float);

    float* data0 = (float*)malloc(memSize);
    if (data0 == NULL) {
        fprintf(stderr, "Host allocation of %llu bytes failed!\n", (unsigned long long)memSize);
        return 1;
    }

    // Fill data array...
    for (size_t i = 0; i < dataSize; i++)
    {
        data0[i] = (float)i;
    }

    // Keep the status instead of discarding it so failures reach the caller.
    const cudaError_t status = doSomethingInCuda(data0, width, height);
    free(data0);

    if (status != cudaSuccess) {
        fprintf(stderr, "doSomethingInCuda failed: %s\n", cudaGetErrorString(status));
    }

    std::cout << "Computation finished. Press any key...";
    std::cin.ignore();
    return (status == cudaSuccess) ? 0 : 1;
}
// Uploads a width x height float image to the GPU and runs testKernel on it.
//
// Parameters:
//   data0  - host pointer to width * height floats (read only by this function).
//   width  - image width in elements.
//   height - image height in elements.
//
// Returns cudaSuccess if every CUDA call succeeded; otherwise the first
// error encountered. A diagnostic is written to stderr for each failure.
// The device buffer is released on every path before returning.
cudaError_t doSomethingInCuda(float *data0, unsigned int width, unsigned int height)
{
    float *d_data0 = NULL;
    cudaError_t cudaStatus;
    cudaError_t freeStatus;
    // Compute the byte count in size_t: width * height * sizeof(float) wraps
    // in 32-bit unsigned arithmetic for large images.
    const size_t elementCount = (size_t)width * (size_t)height;
    const size_t memSize = elementCount * sizeof(float);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }

    // Allocate the device buffer for the input image.
    cudaStatus = cudaMalloc((void**)&d_data0, memSize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy the input image from host memory to the device buffer.
    cudaStatus = cudaMemcpy(d_data0, data0, memSize, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy (host to device) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Launch the kernel with a single block of a single thread.
    testKernel<<<1, 1>>>(d_data0, width, height);

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "testKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during its execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d (%s) after launching testKernel!\n",
                (int)cudaStatus, cudaGetErrorString(cudaStatus));
        goto Error;
    }

Error:
    // cudaFree(NULL) is a no-op, so this is safe on every path. Only report
    // a cudaFree failure if nothing went wrong earlier, so the first error wins.
    freeStatus = cudaFree(d_data0);
    if (cudaStatus == cudaSuccess && freeStatus != cudaSuccess) {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(freeStatus));
        cudaStatus = freeStatus;
    }
    return cudaStatus;
}
This code leads to misaligned access violations, which seem to be random. Here are three examples:
i = 391465, j = -14
CUDA context created : 279dfef6140
CUDA module loaded: 279e228af00 kernel.cu
================================================================================
CUDA Memory Checker detected 1 threads caused an access violation:
Launch Parameters
CUcontext = 279dfef6140
CUstream = 279e228c900
CUmodule = 279e228af00
CUfunction = 279ef77c680
FunctionName = _Z10testKernelPfjj
GridId = 1
gridDim = {1,1,1}
blockDim = {1,1,1}
sharedSize = 256
Parameters:
data0 = 0x0000000b09600000 0
width = 512
height = 1024
Parameters (raw):
0x09600000 0x0000000b 0x00000200 0x00000400
GPU State:
Address Size Type Mem Block Thread blockIdx threadIdx PC Source
------------------------------------------------------------------------------------------------------------------
b09630d40 4 mis ld g 0 0 {0,0,0} {0,0,0} _Z10testKernelPfjj+0003b0 ...\kernel.cu:16
Summary of access violations:
...\kernel.cu(16): error MemoryChecker: #misaligned=1 #invalidAddress=0
================================================================================
Memory Checker detected 1 access violations.
error = misaligned load (global memory)
gridid = 1
blockIdx = {0,0,0}
threadIdx = {0,0,0}
address = 0xb09630d40
accessSize = 4
i = 26934, j = 19
CUDA context created : 1f1618a5d60
CUDA module loaded: 1f163c849b0 kernel.cu
================================================================================
CUDA Memory Checker detected 1 threads caused an access violation:
Launch Parameters
CUcontext = 1f1618a5d60
CUstream = 1f163c84fb0
CUmodule = 1f163c849b0
CUfunction = 1f1711b9ce0
FunctionName = _Z10testKernelPfjj
GridId = 1
gridDim = {1,1,1}
blockDim = {1,1,1}
sharedSize = 256
Parameters:
data0 = 0x0000000d09600000 0
width = 512
height = 1024
Parameters (raw):
0x09600000 0x0000000d 0x00000200 0x00000400
GPU State:
Address Size Type Mem Block Thread blockIdx threadIdx PC Source
------------------------------------------------------------------------------------------------------------------
d09630d40 4 mis ld g 0 0 {0,0,0} {0,0,0} _Z10testKernelPfjj+0003b0 ...\kernel.cu:16
Summary of access violations:
...\kernel.cu(16): error MemoryChecker: #misaligned=1 #invalidAddress=0
================================================================================
Memory Checker detected 1 access violations.
error = misaligned load (global memory)
gridid = 1
blockIdx = {0,0,0}
threadIdx = {0,0,0}
address = 0xd09630d40
accessSize = 4
i = 372617, j = -3
CUDA context created : 23a134f5d00
CUDA module loaded: 23a158ae630 kernel.cu
================================================================================
CUDA Memory Checker detected 1 threads caused an access violation:
Launch Parameters
CUcontext = 23a134f5d00
CUstream = 23a158add30
CUmodule = 23a158ae630
CUfunction = 23a22da7430
FunctionName = _Z10testKernelPfjj
GridId = 1
gridDim = {1,1,1}
blockDim = {1,1,1}
sharedSize = 256
Parameters:
data0 = 0x0000000b09600000 0
width = 512
height = 1024
Parameters (raw):
0x09600000 0x0000000b 0x00000200 0x00000400
GPU State:
Address Size Type Mem Block Thread blockIdx threadIdx PC Source
------------------------------------------------------------------------------------------------------------------
b09630d40 4 mis ld g 0 0 {0,0,0} {0,0,0} _Z10testKernelPfjj+0003b0 ...\kernel.cu:16
Summary of access violations:
...\kernel.cu(16): error MemoryChecker: #misaligned=1 #invalidAddress=0
================================================================================
Memory Checker detected 1 access violations.
error = misaligned load (global memory)
gridid = 1
blockIdx = {0,0,0}
threadIdx = {0,0,0}
address = 0xb09630d40
accessSize = 4
If I remove the inner loop (for j = …), I don’t get an error. If I reduce the range of j, the probability of getting the misaligned access error decreases.
So far I have only found a report of a similar problem, but no solution:
https://devtalk.nvidia.com/default/topic/1005607/shared-memory-debug-errors-in-nsight/
Thanks for your help
Marcus