#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#define SAMP_SIZE (512 * 512)
typedef u_int32_t uint32;
typedef u_int64_t uint64;
typedef struct __align__(16) {
uint64 x, y;
} uint128;
static __inline__ __host__ __device__ uint32 cump128_mod_ui(uint128 a, uint32 p)
{
uint64 ret;
ret = a.y % p;
ret = ((ret << 32) | (a.x >> 32)) % p;
ret = ((ret << 32) | ((uint32) a.x)) % p;
return ret;
}
__global__ void repro_kernel(uint128 *src1, uint32 *src2, uint32 *dst)
{
uint128 a;
uint32 b, c, idx;
idx = blockIdx.x * blockDim.x + threadIdx.x;
a = src1[idx];
b = src2[idx];
c = cump128_mod_ui(a, b);
dst[idx] = c;
return;
}
int main(void)
{
uint128 *h_src1, *d_src1;
uint32 i, *h_src2, *d_src2, *h_dst, *d_dst, *h_tst;
cudaMallocHost((void **) &h_src1, SAMP_SIZE * sizeof(uint128));
cudaMalloc((void **) &d_src1, SAMP_SIZE * sizeof(uint128));
cudaMallocHost((void **) &h_src2, SAMP_SIZE * sizeof(uint32));
cudaMalloc((void **) &d_src2, SAMP_SIZE * sizeof(uint32));
cudaMallocHost((void **) &h_dst, SAMP_SIZE * sizeof(uint128));
cudaMalloc((void **) &d_dst, SAMP_SIZE * sizeof(uint32));
cudaMallocHost((void **) &h_tst, SAMP_SIZE * sizeof(uint128));
srandom(5);
for(i = 0; i < SAMP_SIZE; i++)
{
h_src1[i].x = ((uint64) random() << 32) | random();
h_src1[i].y = ((uint64) random() << 32) | random();
h_src2[i] = (random() & 0xffff) + 1;
}
cudaMemcpy(d_src1, h_src1, SAMP_SIZE * sizeof(uint128),
cudaMemcpyHostToDevice);
cudaMemcpy(d_src2, h_src2, SAMP_SIZE * sizeof(uint32),
cudaMemcpyHostToDevice);
repro_kernel<<<SAMP_SIZE / 512, 512>>>(d_src1, d_src2, d_dst);
cudaThreadSynchronize();
cudaMemcpy(h_dst, d_dst, SAMP_SIZE * sizeof(uint32),
cudaMemcpyDeviceToHost);
for(i = 0; i < SAMP_SIZE; i++)
h_tst[i] = cump128_mod_ui(h_src1[i], h_src2[i]);
for(i = 0; i < SAMP_SIZE; i++)
{
if(h_tst[i] != h_dst[i])
{
printf("DISCREPANCY AT: %u host: %u gpu: %u\n",
i, h_tst[i], h_dst[i]);
}
}
return 0;
}
[pstach@beast ~]$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2007 NVIDIA Corporation
Built on Thu_Jun_19_03:38:28_PDT_2008
Cuda compilation tools, release 2.0, V0.2.1221
[pstach@beast ~]$ nvcc -deviceemu -O3 -o repro repro.cu
[pstach@beast ~]$ ./repro |head
[pstach@beast ~]$ nvcc -O3 -o repro repro.cu
[pstach@beast ~]$ ./repro |head
DISCREPANCY AT: 0 host: 9414 gpu: 99605327
DISCREPANCY AT: 1 host: 42546 gpu: 602511920
DISCREPANCY AT: 2 host: 20224 gpu: 589806848
DISCREPANCY AT: 3 host: 29924 gpu: 780437020
DISCREPANCY AT: 4 host: 29089 gpu: 990569005
DISCREPANCY AT: 5 host: 1503 gpu: 2072822815
DISCREPANCY AT: 6 host: 5711 gpu: 1392349175
DISCREPANCY AT: 7 host: 15718 gpu: 1180618940
DISCREPANCY AT: 8 host: 38073 gpu: 1347951473
DISCREPANCY AT: 9 host: 271 gpu: 280271323
Extra information that doesn’t really have much of an effect:
GTX 280, driver rev 177.67, CentOS 5.2 x86_64