I seem to be encountering a CUDA bug:
[codebox]#include <stdio.h>
#include <unistd.h>
/*
 * Abort the program if the CUDA runtime has an error pending.
 *
 * Calls cudaThreadSynchronize() first, then queries cudaGetLastError().
 * On anything other than cudaSuccess the error string is printed to
 * stdout and the process exits with status 1.
 */
void checkCUDAError() {
    cudaThreadSynchronize();

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        return;  /* nothing pending: carry on */
    }

    fprintf(stdout, "CUDA error: %s\n", cudaGetErrorString(status));
    exit(1);
}
/*
 * Device helper at the centre of the reproducer.
 *
 * p: two floats; the function forms r2 = p[0]^2 + p[1]^2.
 *
 * Returns -rv, where rv is -1.0f if (2.0f + r2 < 1.0f) and 0.0f otherwise.
 * Since r2 is a sum of squares, 2.0f + r2 cannot be below 1.0f, so the
 * mathematically expected return value is 0 (printed as 0 or -0).
 *
 * The qualifier must be spelled __device__ (with double underscores); the
 * statements below are deliberately left in their original order and form
 * because this code exists to reproduce a code-generation problem.
 */
__device__ float val(const float p[2]) {
    float dx = p[0];
    float r2 = dx*dx;
    dx = p[1];
    r2 += dx*dx;
    float rv;
    if (2.0f+r2 < 1.0f) {rv = -1.0f;} else {rv = 0.0f;}
    return -rv;  /* negation of the selected constant */
}
/*
 * Kernel: evaluates val() at the point (0, 0) and stores the result.
 *
 * results: pointer the kernel writes through; only results[0] is written,
 *          so it must have room for at least one float.
 *
 * Every thread that runs this kernel performs the same store to results[0];
 * there is no thread indexing. The qualifier must be spelled __global__
 * (with double underscores) for this to be launchable with <<<...>>>.
 */
__global__ void doCalc(float *results) {
    float p[2];
    p[0] = 0;
    p[1] = 0;
    results[0] = val(p);
    return;
}
/*
 * Host driver: allocates one float on the device, launches doCalc with a
 * single block of a single thread, copies the value back and prints it.
 * checkCUDAError() is called after each CUDA step so a runtime failure
 * terminates the program instead of printing a stale value.
 */
int main(int argc, char** argv) {
    float* d_results;
    cudaMalloc((void**)&d_results, sizeof(float));
    checkCUDAError();

    doCalc<<<1, 1>>>(d_results);
    checkCUDAError();

    float *h_results = new float[1];
    cudaMemcpy(h_results, d_results, sizeof(float), cudaMemcpyDeviceToHost);
    checkCUDAError();
    cudaFree(d_results);

    /* Plain ASCII double quotes are required around the format string. */
    fprintf(stdout, "result: %f\n", h_results[0]);

    delete[] h_results;  /* release the host buffer allocated with new[] */
    return 0;
}[/codebox]
The result should be 0 (or -0), but it’s 1. I see this behavior with toolkit versions 2.2 and 2.3 and driver versions 185.18.36 and 190.32 (beta). My system is a quad-core (2.83GHz) running RHEL 5.4. The GPU is a GTX295.