Bug in pointer alignment

The code listed below gives incorrect results (cpu version /= gpu version) when not compiled in emulation mode, but gives correct results with emulation mode enabled.

The problem is with this function:

__device__ __host__ void function(unsigned char* outbuf) {

	*(unsigned*)&(outbuf[1]) = 0x62;


When not in emulation mode, the value 0x62 is always written in outbuf[0] (should be in outbuf[1]).


os = Gentoo Linux x86_64 running kernel 2.6.25-gentoo-r6

nvidia kernel module 180.22

cuda toolkit and sdk version 2.1

$ gcc -v

Using built-in specs.

Target: x86_64-pc-linux-gnu

Configured with: /var/tmp/portage/sys-devel/gcc-4.1.2/work/gcc-4.1.2/configure --prefix=/usr --bindir=/usr/x86_64-pc-linux-gnu/gcc-bin/4.1.2 --includedir=/usr/lib/gcc/x86_64-pc-linux-gnu/4.1.2/include --datadir=/usr/share/gcc-data/x86_64-pc-linux-gnu/4.1.2 --mandir=/usr/share/gcc-data/x86_64-pc-linux-gnu/4.1.2/man --infodir=/usr/share/gcc-data/x86_64-pc-linux-gnu/4.1.2/info --with-gxx-include-dir=/usr/lib/gcc/x86_64-pc-linux-gnu/4.1.2/include/g++-v4 --host=x86_64-pc-linux-gnu --build=x86_64-pc-linux-gnu --disable-altivec --enable-nls --without-included-gettext --with-system-zlib --disable-checking --disable-werror --enable-secureplt --disable-libunwind-exceptions --enable-multilib --enable-libmudflap --disable-libssp --disable-libgcj --enable-languages=c,c++,fortran --enable-shared --enable-threads=posix --enable-__cxa_atexit --enable-clocale=gnu

Thread model: posix

gcc version 4.1.2 (Gentoo 4.1.2 p1.0.1)
$ nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver

Copyright (c) 2005-2007 NVIDIA Corporation

Built on Wed_Dec__3_16:25:17_PST_2008

Cuda compilation tools, release 2.1, V0.2.1221

Intel® Core™2 Duo CPU E8200 @ 2.66GHz


nvidia MCP55 chipset

GeForce 8800 GTX

The full code listing:

#include <cuda_runtime.h>

#include <cutil.h>

#include <stdio.h>

__device__ __host__ void function(unsigned char* outbuf) {

	*(unsigned*)&(outbuf[1]) = 0x62;


__global__ void testkernel(unsigned char* outbuf) {



int main( int argc, char *argv[] ) {

	dim3 threads(1);

	dim3 grid(1);

	const unsigned BUFSIZE = 10;

	unsigned char buf[BUFSIZE];

	memset(buf, 0, BUFSIZE);


	unsigned char* deviceBuf;

	cudaError err = cudaMalloc((void**)&deviceBuf, BUFSIZE);

	if (err != cudaSuccess)

		return cudaGetLastError();

	err = cudaMemcpy(deviceBuf, buf, BUFSIZE, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)

		return cudaGetLastError();

	testkernel<<< grid, threads >>>(deviceBuf);

	err = cudaGetLastError();

	if (err != cudaSuccess)

		return err;

	err = cudaMemcpy(buf, deviceBuf, BUFSIZE, cudaMemcpyDeviceToHost);

	if (err != cudaSuccess)

		return cudaGetLastError();


	printf("gpu output: %X %X %X %X %X\n", buf[0], buf[1], buf[2], buf[3], buf[4]);

	memset(buf, 0, 4);


	printf("cpu output: %X %X %X %X %X\n", buf[0], buf[1], buf[2], buf[3], buf[4]);

	return 0;