Hi,
My research group has a GeForce GTX 285 that has recently begun to exhibit incorrect behavior when performing 128-byte stores to device memory. Our accelerator reliably fails the SDK and CUDPP tests that use vec-2 stores when the problem sizes are big enough to use all 30 streaming multiprocessors (SMs). Has anyone else had a similar problem?
Although it “smells” like a hardware problem, it would be nice to rule out software/configuration as the root cause. We are running the 64-bit version of Ubuntu 8.04.3, the CUDA 2.2 toolkit, and driver version 185.18.14.
What happens is that a consecutive pair of threads fails to write back their vec-2 data. Because the GT200 architecture is double-pumped, this would indicate that one of the eight stream processors (SPs) within one of the thirty SMs is faulty. A post-mortem of the incorrect output always indicates that the 8th stream processor within one of the SMs is the culprit: when performing a 128-byte store, a half-warp writes 32 four-byte words (16 threads * two-element vectors), and the incorrect words are always at offsets 28, 29, 30, and 31 within the 32-word memory transaction.
Below is a trivial “memcpy” kernel that reliably triggers the incorrect behavior for problem sizes n > 65536. (And doesn’t fail when specifying 64-byte memory txns.)
Thanks!
Duane Merrill
Dept. of Computer Science
University of Virginia
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
//
// Host-side reference implementation for verification: copies len ints
// from idata into reference, element by element.
//
void computeGold( int* reference, int* idata, const unsigned int len)
{
    unsigned int idx = 0;
    while (idx < len) {
        reference[idx] = idata[idx];
        ++idx;
    }
}
//
// Simple copying kernel.
//
// Uses global memory accesses to copy 2 ints per thread.
//
// Expects a 1D grid of 1D blocks sized so that gridDim.x * blockDim.x
// threads cover exactly half of the input elements (each thread moves
// two 4-byte words).
//
// With use_64_byte_loads == false, each thread loads/stores one int2
// (8 bytes), so a half-warp of 16 threads touches a contiguous 128-byte
// span per transaction. With use_64_byte_loads == true, each thread
// instead performs two separate 4-byte accesses at a stride of
// gridDim.x * blockDim.x words, so a half-warp touches 64 bytes per
// transaction. (Transaction sizes per GT200 coalescing rules — this is
// the behavior the repro is designed to exercise.)
//
template<bool use_64_byte_loads> // whether or not to use 64-byte loads/stores
__global__ void
testKernel(int* g_idata, int* g_odata)
{
// cast device input and output pointers to 2-item vectors
// (int2 accesses require 8-byte alignment, which cudaMalloc guarantees)
int2* in = (int2*) g_idata;
int2* out = (int2*) g_odata;
// my global location (flat 1D thread index)
const unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
// scratch storage in local registers
int2 local;
// read data from global mem
// (use_64_byte_loads is a compile-time template parameter, so this
// branch is uniform across the block and costs no divergence)
if (use_64_byte_loads) {
// two 4-byte loads, grid-width apart
local.x = g_idata[tid];
local.y = g_idata[tid + (gridDim.x * blockDim.x)];
} else {
// single 8-byte vector load
local = in[tid];
}
__syncthreads(); // separate the load phase from the store phase within
                 // each block (note: does not order accesses across blocks)
// write data to global mem, mirroring the load pattern above
if (use_64_byte_loads) {
g_odata[tid] = local.x;
g_odata[tid + (gridDim.x * blockDim.x)] = local.y;
} else {
out[tid] = local;
}
}
//
// Main
//
// Usage: "fail [--n=<num-elements>] [--i=<num-iterations>] [--64]"
//
// Defaults to 256K elements (must be a positive multiple of 256,
// i.e. 2 elements per thread * 128 threads per block), and 1000
// iterations. Specify "--64" to perform two 64-byte memory
// transactions per thread instead of one 128-byte transaction.
//
int main( int argc, char** argv)
{
    unsigned int num_iterations = 1000;
    unsigned int num_threads = 128;          // threads per block
    unsigned int num_elements = 1024 * 256;  // default 256K ints
    unsigned int mem_size;
    unsigned int timer = 0;
    bool use64byteMemOps;

    CUT_DEVICE_INIT(argc, argv);

    // parse command line
    cutGetCmdLineArgumenti( argc, (const char**) argv, "n", (int*)&num_elements);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "i", (int*)&num_iterations);
    use64byteMemOps = cutCheckCmdLineFlag( argc, (const char**) argv, "64");
    mem_size = sizeof(int) * num_elements;

    if (cutCheckCmdLineFlag( argc, (const char**) argv, "help")) {
        printf("fail [--n=<num-elements>] [--i=<num-iterations>] [--64]\n");
        fflush(stdout);
        return 0;
    }

    // each thread copies 2 elements, so the element count must tile
    // evenly across full thread-blocks
    if ((num_elements == 0) || (num_elements % (num_threads * 2) != 0)) {
        printf("Error: num-elements must be a positive multiple of %d\n",
            num_threads * 2);
        fflush(stdout);
        return 1;
    }

    CUT_SAFE_CALL( cutCreateTimer( &timer));

    // allocate host memory
    int* h_idata = (int*) malloc( mem_size);
    // initialize the memory with a recognizable ramp pattern
    for( unsigned int i = 0; i < num_elements; ++i) {
        h_idata[i] = i;
    }

    // allocate device memory
    int* d_idata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, mem_size));
    // copy host memory to device
    CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, mem_size, cudaMemcpyHostToDevice) );
    // allocate device memory for result
    int* d_odata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_odata, mem_size));

    // calculate grid size (each thread handles 2 items)
    unsigned int grid_size = num_elements / (num_threads * 2);

    printf("%d-element memcopy using %d-byte device-memory txns, %d iterations.\nLaunch config: <<< %d, %d >>>\n",
        num_elements,
        use64byteMemOps ? 64 : 128,
        num_iterations,
        grid_size,
        num_threads);
    fflush(stdout);

    // execute the kernel
    CUT_SAFE_CALL( cutStartTimer( timer));
    for (unsigned int i = 0; i < num_iterations; i++) {
        if (use64byteMemOps) {
            testKernel<true><<< grid_size, num_threads >>>( d_idata, d_odata);
        } else {
            testKernel<false><<< grid_size, num_threads >>>( d_idata, d_odata);
        }
    }
    // kernel launches are asynchronous: block until all iterations have
    // completed so the timer measures execution time, not launch overhead
    CUDA_SAFE_CALL( cudaThreadSynchronize());
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf( "Avg processing time: %f (ms)\n", cutGetTimerValue( timer) / (float) num_iterations);
    CUT_SAFE_CALL( cutDeleteTimer( timer));

    // check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    // allocate mem for the result on host side
    int* h_odata = (int*) malloc( mem_size);
    // copy result from device to host
    CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost) );
    // compute reference solution
    int* reference = (int*) malloc( mem_size);
    computeGold( reference, h_idata, num_elements);

    // check for correctness; on the first mismatch, dump a window of
    // +/-4 elements of context and report the offset within the
    // 32-word half-warp store
    bool passed = true;
    for (int i = 0; i < (int) num_elements; i++) {
        if (reference[i] != h_odata[i]) {
            passed = false;
            printf("ERROR: reference[%d] != output[%d], half-warp offset %d\n", i, i, i % (2*16));
            printf("\treference[...");
            for (int j = -4; j <= 4; j++) {
                // >= 0 so that element 0 is included in the dump
                if ((i + j >= 0) && (i + j < (int) num_elements))
                    printf("%d, ", reference[i + j]);
            }
            printf("...]\n");
            printf("\toutput[...");
            for (int j = -4; j <= 4; j++) {
                if ((i + j >= 0) && (i + j < (int) num_elements))
                    printf("%d, ", h_odata[i + j]);
            }
            printf("...]\n");
            break;
        }
    }
    printf("\n");
    printf( "Test %s\n", passed ? "PASSED" : "FAILED");

    // cleanup memory
    free( h_idata);
    free( h_odata);
    free( reference);
    CUDA_SAFE_CALL(cudaFree(d_idata));
    CUDA_SAFE_CALL(cudaFree(d_odata));
    CUT_EXIT(argc, argv);
}