I’m trying to use surface objects on my secondary cuda device, without success. It seems to work fine on my primary device though.
I have tried it on both CUDA 6.5 and 7.0EA with the same result. I’m running Windows 7 64-bit and Visual Studio 2012, compiling for x64 target.
Here’s a piece of code that can be used to reproduce the problem.
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
// Simple copy kernel
__global__ void copyKernel(float *source,
cudaSurfaceObject_t targetSurface,
unsigned int size) {
unsigned int tx = threadIdx.x;
if (tx < size) {
surf1Dwrite(source[tx], targetSurface, tx * sizeof(float));
}
}
int main() {
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);
for (int deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
cudaDeviceProp deviceProperties;
cudaGetDeviceProperties(&deviceProperties, deviceIndex);
std::cerr << "Running test on " << deviceProperties.name
<< " with device index " << deviceIndex << std::endl;
cudaSetDevice(deviceIndex);
// Setup target array
const unsigned int size = 10;
const unsigned int sizeBytes = size * sizeof(float);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaArray_t targetArray;
cudaMallocArray(&targetArray,
&channelDesc,
size, 1,
cudaArraySurfaceLoadStore);
std::vector<float> targetData(size, 0.0f); // Make sure it is zero.
cudaMemcpyToArray(targetArray,
0, 0,
&targetData[0],
sizeBytes,
cudaMemcpyHostToDevice);
// Setup surface object associated with the source array
cudaResourceDesc resourceDesc;
memset(&resourceDesc, 0, sizeof(cudaResourceDesc));
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = targetArray;
cudaSurfaceObject_t targetSurfaceObject;
cudaCreateSurfaceObject(&targetSurfaceObject, &resourceDesc);
// Setup source memory
float *sourceMemory;
cudaMalloc(&sourceMemory, sizeBytes);
std::vector<float> sourceData(size, 1.2345f);
cudaMemcpy(sourceMemory,
&sourceData[0],
sizeBytes,
cudaMemcpyHostToDevice);
// Launch copy kernel
copyKernel<<<1, size>>>(sourceMemory, targetSurfaceObject, size);
// Retrieve result
std::vector<float> retrievedData(size, 0.0f);
cudaMemcpyFromArray(&retrievedData[0],
targetArray,
0, 0,
sizeBytes,
cudaMemcpyDeviceToHost);
// Cleanup
cudaDestroySurfaceObject(targetSurfaceObject);
cudaFreeArray(targetArray);
cudaFree(sourceMemory);
// Display result
bool testPassed = true;
for (unsigned int n = 0; n < size; n++) {
if (sourceData[n] != retrievedData[n]) {
std::cerr << "Mismatch at " << n << " : " << sourceData[n]
<< " != " << retrievedData[n] << std::endl;
testPassed = false;
}
}
std::cerr << "Test result: " << (testPassed ? "OK" : "FAILED") << std::endl;
}
return 0;
}
And here is the output:
Running test on GeForce GTX TITAN with device index 0
Test result: OK
Running test on Quadro K2200 with device index 1
Mismatch at 0 : 1.2345 != 0
Mismatch at 1 : 1.2345 != 0
Mismatch at 2 : 1.2345 != 0
Mismatch at 3 : 1.2345 != 0
Mismatch at 4 : 1.2345 != 0
Mismatch at 5 : 1.2345 != 0
Mismatch at 6 : 1.2345 != 0
Mismatch at 7 : 1.2345 != 0
Mismatch at 8 : 1.2345 != 0
Mismatch at 9 : 1.2345 != 0
Test result: FAILED
I’d appreciate if anybody could help me verify this problem.