The following code fails after some iterations on our FX360M and 8600M GT, but works fine on our 8800 Ultra.
#include <assert.h>
#include <cutil_inline.h>
#ifdef __DEVICE_EMULATION__
#define UIMUL(a, b) (((size_t)(a)) * ((size_t)(b)))
#define EMUSYNC __syncthreads()
#define THREADS 128
#else
#define UIMUL(a, b) __umul24(a, b)
#define EMUSYNC
#define THREADS 128
#endif
template<size_t blockSize>
__global__ void
gemv_kernel(float const * src1, size_t const strideSrc1,
size_t const rows, size_t const cols,
float const * src2, float const * src3, float * dst,
float const alpha, float const beta)
{
__shared__ float sdata[blockSize];
float const * const row = src1 + UIMUL(blockIdx.x, strideSrc1);
size_t const tid = threadIdx.x;
// Reduce multiple elements per thread
sdata[tid] = 0;
for (size_t i = tid; i < cols; i += blockSize)
sdata[tid] += row[i] * src2[i];
__syncthreads();
// Do reduction in shared memory
if (blockSize >= 512) {
if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads();
}
if (blockSize >= 256) {
if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads();
}
if (blockSize >= 128) {
if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads();
}
#ifndef __DEVICE_EMULATION__
if (tid < 32)
#endif
{
if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; EMUSYNC; }
if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; EMUSYNC; }
if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; EMUSYNC; }
if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; EMUSYNC; }
if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; EMUSYNC; }
if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; EMUSYNC; }
}
// Write result to global memory
if (tid == 0) {
dst[blockIdx.x] = alpha * sdata[0] + beta * src3[blockIdx.x];
}
}
void gemv(float const * src1, size_t strideSrc1,
size_t rowsSrc1, size_t colsSrc1,
float const * src2, size_t lenSrc2,
float const * src3, size_t lenSrc3,
float * dst)
{
assert(src1 && strideSrc1 >= colsSrc1);
assert(rowsSrc1 > 0 && colsSrc1 > 0);
assert(src2 && lenSrc2 == colsSrc1);
assert(src3 && lenSrc3 == rowsSrc1);
assert(dst);
gemv_kernel<THREADS><<<rowsSrc1, THREADS>>>
(src1, strideSrc1, rowsSrc1, colsSrc1, src2, src3, dst, 1, 1);
cudaError err = cudaGetLastError();
if (err != cudaSuccess) {
printf("Kernel call 'gemv_kernel' failed [%s].\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaThreadSynchronize();
if (err != cudaSuccess) {
printf("Kernel call 'gemv_kernel' failed [%s].\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
void randInit(float * ptr, size_t const len)
{
for (size_t i = 0; i < len; ++i)
ptr[i] = rand() / (float) RAND_MAX;
}
int main(int argc, char** argv)
{
size_t const rows = 300, cols = 100;
float *h_A = (float*) malloc(rows * cols * sizeof(float)),
*h_x = (float*) malloc(cols * sizeof(float)),
*h_y = (float*) malloc(rows * sizeof(float)),
*h_z = (float*) malloc(rows * sizeof(float));
srand((unsigned int)time(NULL));
randInit(h_A, rows * cols);
randInit(h_x, cols);
randInit(h_y, rows);
randInit(h_z, rows);
size_t d_strideA = 0;
float *d_A = NULL, *d_x = NULL, *d_y = NULL, *d_z = NULL;
cutilSafeCall(cudaMallocPitch((void**)&d_A, &d_strideA, cols * sizeof(float), rows));
d_strideA /= sizeof(float);
cutilSafeCall(cudaMalloc((void**)&d_x, cols * sizeof(float)));
cutilSafeCall(cudaMalloc((void**)&d_y, rows * sizeof(float)));
cutilSafeCall(cudaMalloc((void**)&d_z, rows * sizeof(float)));
cutilSafeCall(cudaMemcpy2D(
d_A, d_strideA * sizeof(float), h_A, cols * sizeof(float),
cols * sizeof(float), rows, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(d_x, h_x, cols * sizeof(float), cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(d_y, h_y, rows * sizeof(float), cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(d_z, h_z, rows * sizeof(float), cudaMemcpyHostToDevice));
for (size_t k = 1; k <= 100; ++k)
{
printf("try %03d ... ", k);
for (size_t i = 1; i <= (rows * cols); ++i)
{
gemv(d_A, d_strideA, rows, cols, d_x, cols, d_y, rows, d_z);
gemv(d_A, d_strideA, rows, cols, d_x, cols, d_y, rows, d_z);
}
printf("OK\n");
}
cutilSafeCall(cudaFree(d_A));
cutilSafeCall(cudaFree(d_x));
cutilSafeCall(cudaFree(d_y));
cutilSafeCall(cudaFree(d_z));
free(h_A);
free(h_x);
free(h_y);
free(h_z);
}
When I start this code on our Quadro FX360M I got the following result:
try 001 ... OK
try 002 ... OK
...
try 068 ... OK
try 069 ... Kernel call 'gemv_kernel' failed [unspecified launch failure].
It’s not realy deterministic when the error occurs, but it occurs after some iterations.
Does anyone have an idea what’s going wrong?
Additional info: When the error occurs the notebook display flickers for a short moment!
Thanks and regards,
Daniel.