Device emulator alignment bug

Hello,

I found a bug in the 1.0 version of the emulator. If you typecast a char* to a uint4* when the char is not aligned correctly (i.e. &char %16 !=0) then dereference the uint4*, the data is accessed as if it were correctly aligned on the emulator.

This code obviously does not work on the GPU.

I might be able to post source that triggers the bug, if it is needed, but I would have to write something up that is not related to the project I’m working on.

This is running on OpenSuse 10.2 x86_64, stock install. Pentium D and 8800GTX(as second display card with no monitor attached to it). Latest CUDA, sdk and drivers.

Please attach a test case which reproduces the problem, along with:

  • Build instructions
  • Expected behavior
  • Actual behavior
  • Reproduction instructions

thanks,
Lonni

This is a throwaway program that should output a number that is not 42. If it outputs 42 then it somehow successfully read an unaligned uint4.

I posted the contents to 2 files and some output.

See the comment above the makefile for directions on how to make it.

First the output.

workbird:~/Documents/gpgpu_base/bug_test> make

nvcc -o bugcheck bugcheck.cu  -I. -I/usr/local/cuda/include -I/home/jeff.hagen/NVIDIA_CUDA_SDK/common/inc -L/usr/local/cuda/lib -L../../lib -L../../common/lib -L/home/jeff.hagen/NVIDIA_CUDA_SDK/lib/   -lcuda -lcudart -lGL -lGLU  -lcutil  -DUNIX

workbird:~/Documents/gpgpu_base/bug_test> ./bugcheck 

10752

workbird:~/Documents/gpgpu_base/bug_test> make emulate

nvcc -o bugcheck bugcheck.cu  -I. -I/usr/local/cuda/include -I/home/jeff.hagen/NVIDIA_CUDA_SDK/common/inc -L/usr/local/cuda/lib -L../../lib -L../../common/lib -L/home/jeff.hagen/NVIDIA_CUDA_SDK/lib/   -lcuda -lcudart -lGL -lGLU  -lcutil -deviceemu -DUNIX

workbird:~/Documents/gpgpu_base/bug_test> ./bugcheck 

42

workbird:~/Documents/gpgpu_base/bug_test>

name this file ‘bugcheck.cu’

#include <stdlib.h>

#include <stdio.h>

#include <assert.h>

#include <cutil.h>

__global__ void device_code(char* foo,uint4* output) {

    /* make a uint4 that is not aligned correctly, but does have data in it

       see below where we put 42 into an unaligned uint4 */

    uint4* x = (uint4*)&foo[1];

    /* try to copy unaligned value into output.

       As the card won't give us an error, we should get random output */

    output->x = x->x;

}

int main (int argc, char** argv) {

    /* sizeof(uint4 == 16), so I'm allocating 3 uint4s */

    char * foo = (char*)calloc(sizeof(char), 16 * 3);

   /* make sure it is aligned correctly */

    assert((((long)foo) %16) == 0);

   /* make a uint4 pointer that is not aligned correctly*/

    uint4* x = (uint4*)&foo[1];

   /* make sure it is not aligned correctly */

    assert(((long)x %16) != 0);

   /* set an unaligned value of 42 

        the CPU can deal with unaligned ints, so this works. 

   */

    x->x=42;

   /* make variables */

    char* fooOnDevice;

    uint4* outputOnDevice;

    uint4 output;

   /* copy as aligned types */

    cudaMalloc((void**)&fooOnDevice,sizeof(uint4));

    cudaMalloc((void**)&outputOnDevice,sizeof(uint4));

    cudaMemcpy(fooOnDevice,foo,sizeof(uint4),cudaMemcpyHostToDevice);

   /* run on one single PE */

    dim3 block =(1);

    dim3 grid = (1);

    device_code<<<block,grid>>>(fooOnDevice,outputOnDevice);

   /* print output... this SHOULD print garbage, as the device code tried to 

        do an unaligned copy 

       If it prints 42 then the device somehow read an unaligned uint4, which it should

        not be able to do

    */

    cudaMemcpy(&output,outputOnDevice,sizeof(uint4),cudaMemcpyDeviceToHost);

    printf("%d\n",output.x);

}

name this file ‘Makefile’.

#

# A simple makefile for CUDA projects.

#

# This is much shorter then the default makefile, and is so by making the following assumptions:

# 1) The CUDA SDK is installed to ~/NVIDIA_CUDA_SDK

# 2) The example programs have been built (so we can -lcutil)

# 3) CUDA is installed to /usr/local/CUDA

# 4) The project does not use openGL/DirectX

# 5) You are on linux.

#

# Add source files here

EXECUTABLE      := bugcheck

# Cuda source files (compiled with cudacc)

CUFILES         := bugcheck.cu

# C/C++ source files (compiled with gcc / c++)

CCFILES         :=

CCOFILES        :=

################################################################################

# Rules and targets

CUDA_INSTALL_PATH := /usr/local/cuda

# Basic directory setup for SDK

# (override directories only if they are not already defined)

SRCDIR     ?=

ROOTDIR    ?= $(HOME)/NVIDIA_CUDA_SDK

ROOTBINDIR ?= $(ROOTDIR)/bin

BINDIR     ?= $(ROOTBINDIR)/linux

ROOTOBJDIR ?= obj

LIBDIR     := $(ROOTDIR)/lib

COMMONDIR  := $(ROOTDIR)/common

# Compilers

NVCC       := nvcc

CXX        := g++

CC         := gcc

LINK       := g++ -fPIC

# Includes

INCLUDES  += -I. -I$(CUDA_INSTALL_PATH)/include -I$(COMMONDIR)/inc

# Libs

LIB       := -L/usr/local/cuda/lib -L../../lib -L../../common/lib -L$(ROOTDIR)/lib/

DOLINK    := -lcuda -lcudart -lGL -lGLU  -lcutil

default: $(CCOFILES)

        nvcc -o $(EXECUTABLE) $(CUFILES) $(CCOFILES) $(INCLUDES) $(LIB) $(DOLINK)  -DUNIX

emulate:  $(CCOFILES)

        nvcc -o $(EXECUTABLE) $(CUFILES) $(CCOFILES) $(INCLUDES) $(LIB) $(DOLINK) -deviceemu -DUNIX

cubin:  $(CCOFILES)

        nvcc -cubin $(CUFILES) $(CCOFILES) $(INCLUDES) $(LIB) $(DOLINK) -deviceemu -DUNIX

%.o: %.c

        g++ -fPIC -c $? $(INCLUDES) $(LIB) -DUNIX

clean:

        rm -f $(CCOFILES) $(EXECUTABLE)

moreclean: clean

        rm -f *.cu.c