Okay, this one has officially stumped me and I feel like a chump. We have a little work server with a GTX 295 and a Tesla C1060 we’ve been using for months. One of our guys has been seeing his program produce bogus output periodically, and in the course of commenting out every part of his program that didn’t change the results, found a standardized case that fails predictably (in itself a long haul).
The offending kernel simply sets every value to -1. Nothing cosmic. On the GTX 295, it works; on the C1060, it fails STRANGELY: some values are set to 0.
The weird thing to me is that a line which modifies a DIFFERENT array can cause issues. If you comment it out, the problem goes away.
I are mystified. Here’s the offending kernel:
[codebox]
#include <stdio.h>
#include <stdlib.h>
// CHANGE ME: point this at the cutil.h of your local CUDA SDK install
#include "/home/vputz/NVIDIA_GPU_Computing_SDK/C/common/inc/cutil.h"
//#include "/opt/cuda/sdk/C/common/inc/cutil.h"
// Index of the CUDA device to run on; passed to cudaSetDevice() in main().
#define device 3
// Number of times main() repeats the initialize -> dump -> tick -> dump cycle.
#define grids 2
// Launch configuration used for both kernels: <<<blocksPerGrid, threadsPerBlock>>>.
#define blocksPerGrid 64
#define threadsPerBlock 256
// Total threads per launch; also the element count of the values arrays
// (the states array holds three ints per stream).
#define streamsPerGrid (blocksPerGrid * threadsPerBlock)
// Value passed as the `iterations` argument of tick().
#define iterationsPerOutput 1000
/*
 * initialize: fills both device arrays with known, non-zero marker values.
 *
 * Each thread owns one stream index
 *     i = blockIdx.x * blockDim.x + threadIdx.x
 * and writes
 *     states[3*i]     = 4
 *     states[3*i + 1] = 5
 *     states[3*i + 2] = 6
 *     values[i]       = 5.6
 *
 * Parameters:
 *   values - device array with at least one double per launched thread.
 *   states - device array with at least three ints per launched thread.
 *
 * Precondition: the kernel has no bounds check, so the launch must not
 * supply more threads than the arrays have room for.
 */
__global__ void initialize(double * values, int * states)
{
    const int streamIdx = blockIdx.x * blockDim.x + threadIdx.x;
    const int base = streamIdx * 3;   // first of this thread's three state slots

    states[base]     = 4;
    states[base + 1] = 5;
    states[base + 2] = 6;
    values[streamIdx] = 5.6;
}
/*
 * tick: sets this thread's element of values to -1, `iterations` times over.
 *
 * Each thread owns one stream index
 *     i = blockIdx.x * blockDim.x + threadIdx.x
 * and, on every loop iteration, also copies states[3*i + 1] into
 * states[3*i]. So states IS modified by this kernel (slot 3*i only);
 * slots 3*i + 1 and 3*i + 2 are never written here.
 *
 * Parameters:
 *   values     - device array with at least one double per launched thread.
 *   iterations - loop count; if it is <= 0 the kernel writes nothing.
 *   states     - device array with at least three ints per launched thread.
 *
 * Precondition: the kernel has no bounds check, so the launch must not
 * supply more threads than the arrays have room for.
 */
__global__ void tick(double * values, int iterations, int * states)
{
    const int streamIdx = blockIdx.x * blockDim.x + threadIdx.x;

    while (iterations > 0)
    {
        // IF YOU COMMENT OUT THE NEXT LINE, THE PROGRAM WORKS
        states[3 * streamIdx] = states[3 * streamIdx + 1];
        values[streamIdx] = -1;
        --iterations;
    }
}
/*
 * writeValues: appends the first streamsPerGrid elements of values to a
 * text file, one per line, formatted with "%G".
 *
 * Parameters:
 *   filename - path of the file to append to (opened in "a" mode).
 *   values   - host array holding at least streamsPerGrid doubles.
 *
 * If the file cannot be opened, a diagnostic is printed with perror() and
 * the function returns without writing anything.
 */
__host__ void writeValues(const char * filename, const double * values)
{
    FILE * file = fopen(filename, "a");
    if (file == NULL)
    {
        // Without this check the fprintf below would dereference a NULL FILE*.
        perror(filename);
        return;
    }

    for (int streamIdx = 0; streamIdx < streamsPerGrid; streamIdx++)
    {
        fprintf(file, "%G\n", values[streamIdx]);
    }

    // fclose flushes buffered output; report a failure rather than lose data silently.
    if (fclose(file) != 0)
    {
        perror(filename);
    }
}
/*
 * Main function.
 *
 * Selects the configured device, allocates one host and two device buffers,
 * truncates initial.dat and final.dat, and then, `grids` times:
 *   1. runs initialize and appends the resulting values to initial.dat;
 *   2. runs tick and appends the resulting values to final.dat.
 *
 * Returns 0 on success and EXIT_FAILURE if a host allocation or an output
 * file cannot be created.
 *
 * NOTE(review): CUDA errors are routed through cutil's CUDA_SAFE_CALL. In
 * some cutil.h versions that macro may only check the result in _DEBUG
 * builds -- confirm against the SDK actually in use.
 */
int main(int argc, char ** argv)
{
    static const char * const outputFiles[] = { "initial.dat", "final.dat" };
    FILE * file;
    double * values_h, * values_d;
    int * states_d;
    int gridIdx;
    int fileIdx;

    CUDA_SAFE_CALL(cudaSetDevice(device));

    // Create (truncate) the output files on the host
    for (fileIdx = 0; fileIdx < 2; fileIdx++)
    {
        file = fopen(outputFiles[fileIdx], "w");
        if (file == NULL)
        {
            perror(outputFiles[fileIdx]);
            return EXIT_FAILURE;
        }
        fclose(file);
    }

    // Allocate memory on the host
    values_h = (double *) malloc(streamsPerGrid * sizeof(double));
    if (values_h == NULL)
    {
        fprintf(stderr, "Out of host memory\n");
        return EXIT_FAILURE;
    }

    // Allocate memory on the device
    CUDA_SAFE_CALL(cudaMalloc((void **) &values_d, streamsPerGrid * sizeof(double)));
    CUDA_SAFE_CALL(cudaMalloc((void **) &states_d, streamsPerGrid * 3 * sizeof(int)));

    for (gridIdx = 0; gridIdx < grids; gridIdx++)
    {
        printf("Grid %d\n", gridIdx + 1);

        // Fill values[...] with 5.6 and states[...] with the 4/5/6 marker pattern
        initialize<<<blocksPerGrid, threadsPerBlock>>>(values_d, states_d);
        CUDA_SAFE_CALL(cudaGetLastError());        // launch-configuration errors
        CUDA_SAFE_CALL(cudaThreadSynchronize());   // errors raised while the kernel ran
        CUDA_SAFE_CALL(cudaMemcpy(values_h, values_d, streamsPerGrid * sizeof(double), cudaMemcpyDeviceToHost));
        writeValues("initial.dat", values_h);

        // Set every element of values[...] to -1. Note that tick also copies
        // states[3*i + 1] into states[3*i], so states is NOT left unchanged.
        tick<<<blocksPerGrid, threadsPerBlock>>>(values_d, iterationsPerOutput, states_d);
        CUDA_SAFE_CALL(cudaGetLastError());
        CUDA_SAFE_CALL(cudaThreadSynchronize());
        CUDA_SAFE_CALL(cudaMemcpy(values_h, values_d, streamsPerGrid * sizeof(double), cudaMemcpyDeviceToHost));
        writeValues("final.dat", values_h);
    }

    // Release device and host buffers
    CUDA_SAFE_CALL(cudaFree(states_d));
    CUDA_SAFE_CALL(cudaFree(values_d));
    free(values_h);

    return 0;
}
[/codebox]