Hello,
here I have a simple program that sometimes works and sometimes fails. More precisely, I have two GTX 580 cards mounted on one board.
When I run the code on device 0, the example fails.
When I run the code on device 1, the example works.
Does anyone have an idea what the issue might be?
The code should write 1.0 into every element of an array of size [750][56][56]; that's it.
I use the official OpenCL 1.1 C++ bindings.
const unsigned int deviceID = 0;
// Platform
VECTOR_CLASS< cl::Platform > platformList;
cl::Platform::get(&platformList);
cl::Platform thePlatform = platformList[0];
// Device
VECTOR_CLASS< cl::Device > gpuList;
thePlatform.getDevices(CL_DEVICE_TYPE_GPU, &gpuList);
cl::Device theDevice = gpuList[deviceID];
// Context & Q
cl::Context theContext( VECTOR_CLASS< cl::Device >(1,theDevice) );
cl::CommandQueue theQueue(theContext,theDevice);
// Kernel
const string kernelString="\n\
__kernel\n\
void ErrorKernel( __global float data[750][56][56] )\n\
{\n\
const size_t threadblocknr = get_group_id(0);\n\
const size_t lid0 = get_local_id(0);\n\
const size_t lid1 = get_local_id(1);\n\
const size_t element_k = threadblocknr*1+lid0;\n\
for (size_t col=0; col < 56; col++)\n\
{\n\
data[element_k][col][lid1] = 1.0f;\n\
}\n\
}";
cout << kernelString << endl;
// todo check +1
cl::Program::Sources theSource (1,make_pair(kernelString.c_str(), kernelString.length()+1));
cl::Program theProgram (theContext, theSource);
theProgram.build( VECTOR_CLASS< cl::Device >(1,theDevice) );
cl::Kernel theKernel(theProgram,"ErrorKernel");
// Kernel Arguments
cl::Buffer data(theContext,CL_MEM_READ_WRITE,sizeof(cl_float)*750*56*56);
theKernel.setArg(0,data);
// Enqueue Kernel
const cl::NDRange offset = cl::NullRange;
const cl::NDRange local ( 1,56);
const cl::NDRange global( 750*1,56);
theQueue.finish(); // safety first
theQueue.enqueueNDRangeKernel(theKernel,offset,global,local);
theQueue.finish();
// read back data
float* ones = new float [750*56*56];
theQueue.enqueueReadBuffer(data,CL_TRUE,0,sizeof(cl_float)*750*56*56,ones);
theQueue.finish(); // safety first
// sum over all entries
float sum = 0.0;
for(unsigned int i=0;i<750*56*56;i++) sum += ones[i];
if (sum == 750*56*56) cout << "SUCCESS: sum = 2352000"<<endl;
else cout << "ERROR: sum = " << sum << endl;
Toolkit: cudatoolkit_3.2.16_linux_64_ubuntu10.04
SDK Version: gpucomputingsdk_3.2.16_linux
Driver: 260.19.36 on a GTX580
Thanks.