I have a quadro M770 which I’m trying to run the following convolution kernel on:
global void Convolve3DEdgeExtractionTexture(cudaPitchedPtr d_FilterResponse, DataSize dataSizeVolume, DataSize dataSizeFilter, int blocksInY, float invBlocksInY, int dataWidthTimesDataHeight)
{
unsigned int blockIdxz = __float2uint_rd(blockIdx.y * invBlocksInY);
unsigned int blockIdxy = blockIdx.y - __umul24(blockIdxz,blocksInY);
volatile int x = __umul24(blockIdx.x,blockDim.x) + threadIdx.x;
volatile int y = __umul24(blockIdxy ,blockDim.y) + threadIdx.y;
volatile int z = __umul24(blockIdxz ,blockDim.z) + threadIdx.z;
if (x < dataSizeVolume.dataWidth && y < dataSizeVolume.dataHeight && z < dataSizeVolume.dataDepth) {
int xOffset;
int yOffset;
int zOffset;
float sum = 0.0f;
zOffset = -(dataSizeFilter.dataDepth - 1)/2;
for (int filterZ = dataSizeFilter.dataDepth - 1; filterZ >= 0; filterZ--) {
yOffset = -(dataSizeFilter.dataHeight - 1)/2;
for (int filterY = dataSizeFilter.dataHeight - 1; filterY >= 0; filterY--) {
xOffset = -(dataSizeFilter.dataWidth - 1)/2;
for (int filterX = dataSizeFilter.dataWidth - 1; filterX >= 0; filterX--) {
[b]sum += c_Convolution3DFilter[filterX + filterY * dataSizeFilter.dataWidth + filterZ * dataSizeFilter.dataWidth * dataSizeFilter.dataHeight] * tex3D(convolution3DTexture, x + xOffset + 0.5f, y + yOffset + 0.5f, z + zOffset + 0.5f);[/b]
xOffset++;
}
yOffset++;
}
zOffset++;
}
size_t offSet = x * sizeof(float) + y * d_FilterResponse.pitch + z * d_FilterResponse.pitch * dataSizeVolume.dataHeight;
*(float *)((char *)d_FilterResponse.ptr + offSet) = sum;
}
}
But this fails in the line with bold since a CUDA error is thrown. However if I remove + zOffset from the texture call then it works but produces an incorrect result. What am I doing wrong?
The texture is initialized with the following code:
texture<float, 3, cudaReadModeElementType> convolution3DTexture;
cudaArray *convolution3DArray;
void InitializeAndBindConvolution3DTexture(cudaPitchedPtr d_Input, DataSize dataSize) {
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc();
CheckCUDAError();
// Set texture parameters
convolution3DTexture.addressMode[0] = cudaAddressModeClamp;
convolution3DTexture.addressMode[1] = cudaAddressModeClamp;
convolution3DTexture.addressMode[2] = cudaAddressModeClamp;
convolution3DTexture.normalized = false;
convolution3DTexture.filterMode = cudaFilterModeLinear;
// Allocate 3D array for modified volume (for fast interpolation)
cudaExtent textSize = make_cudaExtent(dataSize.dataWidth, dataSize.dataHeight, dataSize.dataDepth);
cudaMalloc3DArray(&convolution3DArray, &channelDesc, textSize);
CheckCUDAError();
// Copy modified volume data to the 3D array
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = d_Input;
copyParams.dstArray = convolution3DArray;
copyParams.extent = textSize;
copyParams.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(©Params);
CheckCUDAError();
// Bind the array to the 3D texture
cudaBindTextureToArray(convolution3DTexture, convolution3DArray, channelDesc);
CheckCUDAError();
}
/Danne