The code listed below gives incorrect results (cpu version /= gpu version) when not compiled in emulation mode, but gives correct results with emulation mode enabled.
The problem is with this function:
__device__ __host__ void function(unsigned char* outbuf) {
*(unsigned*)&(outbuf[1]) = 0x62;
}
When not in emulation mode, the value 0x62 is always written in outbuf[0] (should be in outbuf[1]).
Info:
os = Gentoo Linux x86_64 running kernel 2.6.25-gentoo-r6
nvidia kernel module 180.22
cuda toolkit and sdk version 2.1
$ gcc -v
Using built-in specs.
Target: x86_64-pc-linux-gnu
Configured with: /var/tmp/portage/sys-devel/gcc-4.1.2/work/gcc-4.1.2/configure --prefix=/usr --bindir=/usr/x86_64-pc-linux-gnu/gcc-bin/4.1.2 --includedir=/usr/lib/gcc/x86_64-pc-linux-gnu/4.1.2/include --datadir=/usr/share/gcc-data/x86_64-pc-linux-gnu/4.1.2 --mandir=/usr/share/gcc-data/x86_64-pc-linux-gnu/4.1.2/man --infodir=/usr/share/gcc-data/x86_64-pc-linux-gnu/4.1.2/info --with-gxx-include-dir=/usr/lib/gcc/x86_64-pc-linux-gnu/4.1.2/include/g++-v4 --host=x86_64-pc-linux-gnu --build=x86_64-pc-linux-gnu --disable-altivec --enable-nls --without-included-gettext --with-system-zlib --disable-checking --disable-werror --enable-secureplt --disable-libunwind-exceptions --enable-multilib --enable-libmudflap --disable-libssp --disable-libgcj --enable-languages=c,c++,fortran --enable-shared --enable-threads=posix --enable-__cxa_atexit --enable-clocale=gnu
Thread model: posix
gcc version 4.1.2 (Gentoo 4.1.2 p1.0.1)
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2007 NVIDIA Corporation
Built on Wed_Dec__3_16:25:17_PST_2008
Cuda compilation tools, release 2.1, V0.2.1221
Intel® Core™2 Duo CPU E8200 @ 2.66GHz
1GB DDR2
nvidia MCP55 chipset
GeForce 8800 GTX
The full code listing:
#include <cuda_runtime.h>
#include <cutil.h>
#include <stdio.h>
__device__ __host__ void function(unsigned char* outbuf) {
*(unsigned*)&(outbuf[1]) = 0x62;
}
__global__ void testkernel(unsigned char* outbuf) {
function(outbuf);
}
int main( int argc, char *argv[] ) {
dim3 threads(1);
dim3 grid(1);
const unsigned BUFSIZE = 10;
unsigned char buf[BUFSIZE];
memset(buf, 0, BUFSIZE);
unsigned char* deviceBuf;
cudaError err = cudaMalloc((void**)&deviceBuf, BUFSIZE);
if (err != cudaSuccess)
return cudaGetLastError();
err = cudaMemcpy(deviceBuf, buf, BUFSIZE, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
return cudaGetLastError();
testkernel<<< grid, threads >>>(deviceBuf);
err = cudaGetLastError();
if (err != cudaSuccess)
return err;
err = cudaMemcpy(buf, deviceBuf, BUFSIZE, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
return cudaGetLastError();
cudaFree(deviceBuf);
printf("gpu output: %X %X %X %X %X\n", buf[0], buf[1], buf[2], buf[3], buf[4]);
memset(buf, 0, 4);
function(buf);
printf("cpu output: %X %X %X %X %X\n", buf[0], buf[1], buf[2], buf[3], buf[4]);
return 0;
}