Spurious 128-byte memory store failures


My research group has a GeForce GTX 285 that has recently begun to exhibit incorrect behavior when performing 128-byte stores to device memory. Our accelerator reliably fails the SDK and CUDPP tests that use vec-2 stores when the problem sizes are big enough to use all 30 streaming multiprocessors (SMs). Has anyone else had a similar problem?

Although it “smells” like a hardware problem, it would be nice to rule out software/configuration as the root cause. We are running 64-bit version of Ubuntu 8.04.3, the Cuda 2.2 toolkit, and driver version 185.18.14.

What happens is that a consecutive pair of threads fails to write back their vec-2 data. Because the GT200 architecture is double-pumped, this would indicate that one of the eight stream processors (SPs) within one of the thirty SMs is faulty. A post-mortem of the incorrect output always indicates that 8th stream processor within one of the SMs is the culprit: when performing a 128-byte store, a half-warp writes 32 four-byte words (16 threads * two-element-vectors), and the incorrect words are always at offsets 28, 29, 30, and 31 within the 32-word memory transaction.

Below is a trivial “memcpy” kernel that reliably triggers the incorrect behaviors for problem sizes n > 65536. (And doesn’t fail when specifying 64-byte memory txns.)


Duane Merrill

Dept. of Computer Science

University of Virginia

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <cutil.h>


// Simple array-copy routine to run on host for verification


void computeGold( int* reference, int* idata, const unsigned int len)


		for( unsigned int i = 0; i < len; i++)


				reference[i] = idata[i];




// Simple copying kernel.


// Uses global memory accesses to copy 2 ints per thread.


template<bool use_64_byte_loads>						// whether or not to use 64-byte loads/stores

__global__ void

testKernel(int* g_idata, int* g_odata)


		// cast device input and output pointers to 2-item vectors

		int2* in = (int2*) g_idata;

		int2* out = (int2*) g_odata;

		// my global location

		const unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;

		// scratch storage in local registers

		int2 local;

		// read data from global mem

		if (use_64_byte_loads) {

				local.x = g_idata[tid];

				local.y = g_idata[tid + (gridDim.x * blockDim.x)];

		} else {

				local = in[tid];


		__syncthreads();				// prevent overlapped i/o

		// write data to global mem

		if (use_64_byte_loads) {

				g_odata[tid] = local.x;

				g_odata[tid + (gridDim.x * blockDim.x)] = local.y;

		} else {

				out[tid] = local;




// Main


// Usage: "fail [--n=<num-elements>] [--i=<num-iterations>] [--64]"


// Defaults to 256K elements (must be a positive multiple of 512),

// and 1000 iterations.  Specify "--64" to perform two 64-byte memory

// transactions per thread instead of one 128-byte transaction.


int main( int argc, char** argv)


	unsigned int num_iterations = 1000;

	unsigned int num_threads = 128;

	unsigned int num_elements = 1024 * 256;

	unsigned int mem_size;

	unsigned int timer = 0;

	bool use64byteMemOps;

	CUT_DEVICE_INIT(argc, argv);

	cutGetCmdLineArgumenti( argc, (const char**) argv, "n", (int*)&num_elements);

	cutGetCmdLineArgumenti( argc, (const char**) argv, "i", (int*)&num_iterations);

	use64byteMemOps = cutCheckCmdLineFlag( argc, (const char**) argv, "64");

	mem_size = sizeof(int) * num_elements;

	if (cutCheckCmdLineFlag( argc, (const char**) argv, "help")) {

		printf("fail [--n=<num-elements>] [--i=<num-iterations>] [--64]\n");


		return 0;


	CUT_SAFE_CALL( cutCreateTimer( &timer));

	// allocate host memory

	int* h_idata = (int*) malloc( mem_size);

	// initalize the memory

	for( unsigned int i = 0; i < num_elements; ++i) {

		h_idata[i] = i;


	// allocate device memory

	int* d_idata;

	CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, mem_size));

	// copy host memory to device

	CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, mem_size, cudaMemcpyHostToDevice) );

	// allocate device memory for result

	int* d_odata;

	CUDA_SAFE_CALL( cudaMalloc( (void**) &d_odata, mem_size));

	// calculate grid size (each thread handles 4 items)

	unsigned int grid_size = num_elements / (num_threads * 2);

	printf("%d-element memcopy using %d-byte device-memory txns, %d iterations.\nLaunch config: <<< %d, %d >>>\n",


		use64byteMemOps ? 64 : 128,





	// execute the kernel

	CUT_SAFE_CALL( cutStartTimer( timer));

	for (int i = 0; i < num_iterations; i++) {

		if (use64byteMemOps) {

			testKernel<true><<< grid_size, num_threads >>>( d_idata, d_odata);

		} else {

			testKernel<false><<< grid_size, num_threads >>>( d_idata, d_odata);



	CUT_SAFE_CALL( cutStopTimer( timer));

	printf( "Avg processing time: %f (ms)\n", cutGetTimerValue( timer) / (float) num_iterations);

	CUT_SAFE_CALL( cutDeleteTimer( timer));

	// check if kernel execution generated and error

	CUT_CHECK_ERROR("Kernel execution failed");

	// allocate mem for the result on host side

	int* h_odata = (int*) malloc( mem_size);

	// copy result from device to host

	CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost) );

	// compute reference solution

	int* reference = (int*) malloc( mem_size);

	computeGold( reference, h_idata, num_elements);

	// check for correctness

	bool passed = true;

	for (int i = 0; i < num_elements; i++) {

		if (reference[i] != h_odata[i]) {

			passed = false;

			printf("ERROR: reference[%d] != output[%d], half-warp offset %d\n", i, i, i % (2*16));


			for (int j = -4; j <= 4; j++) {

				if ((i + j > 0) && (i + j < num_elements))

					printf("%d, ", reference[i + j]);




			for (int j = -4; j <= 4; j++) {

				if ((i + j > 0) && (i + j < num_elements))

					printf("%d, ", h_odata[i + j]);







   printf( "Test %s\n", passed ? "PASSED" : "FAILED");

	// cleanup memory

	free( h_idata);

	free( h_odata);

	free( reference);



	CUT_EXIT(argc, argv);


Oh, and we do have a GeForce GTX 280 and Tesla C1060 that can also experience this problem


Was there ever an official statement by nVidia about this supposed hardware bug?

Does it only affect the big (GTX 285) GPUs, or also smaller models with compute capability 1.3?

Was there ever an official statement by nVidia about this supposed hardware bug?

Does it only affect the big (GTX 285) GPUs, or also smaller models with compute capability 1.3?