I haven't experimented with filters directly, but I did run a related experiment. Here is the code:
#include <cuda_runtime.h>
#include <cstdlib>   // std::malloc / std::free
#include <cstring>   // memset
#include <iostream>
__global__ void transformKernelTexture(float *output, cudaTextureObject_t texObj, int width, int height, float theta)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= static_cast<unsigned int>(width) || y >= static_cast<unsigned int>(height))
        return;

    // Normalized coordinates, rotated by theta around the image center
    float u = x / static_cast<float>(width) - 0.5f;
    float v = y / static_cast<float>(height) - 0.5f;
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;

    // Hardware-filtered read through the texture object
    output[y * width + x] = tex2D<float>(texObj, tu, tv);
}
__global__ void transformKernelNoTexture(float *output, int width, int height, float theta)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= static_cast<unsigned int>(width) || y >= static_cast<unsigned int>(height))
        return;

    // Same coordinate transform, but no memory read: the product of the
    // transformed coordinates is stored as a stand-in workload
    float u = x / static_cast<float>(width) - 0.5f;
    float v = y / static_cast<float>(height) - 0.5f;
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;
    output[y * width + x] = tu * tv;
}
int main()
{
    const int height = 1024;
    const int width = 1024;
    const float angle = 0.5f;

    // Host image filled with a simple ramp
    float *h_data = static_cast<float *>(std::malloc(sizeof(float) * width * height));
    for (int i = 0; i < height * width; ++i)
    {
        h_data[i] = static_cast<float>(i);
    }
    // Copy the host image into a CUDA array that will back the texture object
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray_t cuArray;
    cudaMallocArray(&cuArray, &channelDesc, width, height);
    const size_t spitch = width * sizeof(float);
    cudaMemcpy2DToArray(cuArray, 0, 0, h_data, spitch, width * sizeof(float), height, cudaMemcpyHostToDevice);
    // Describe the resource (the CUDA array) and how the texture samples it
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeWrap;
    texDesc.addressMode[1] = cudaAddressModeWrap;
    texDesc.filterMode = cudaFilterModeLinear;   // hardware bilinear interpolation
    texDesc.readMode = cudaReadModeElementType;
    texDesc.normalizedCoords = 1;                // coordinates in [0, 1)

    cudaTextureObject_t texObj = 0;
    cudaCreateTextureObject(&texObj, &resDesc, &texDesc, nullptr);
    // Output buffers for both kernels
    float *outputTexture, *outputNoTexture;
    cudaMalloc(&outputTexture, width * height * sizeof(float));
    cudaMalloc(&outputNoTexture, width * height * sizeof(float));

    dim3 threadsperBlock(32, 32);
    dim3 numBlocks((width + threadsperBlock.x - 1) / threadsperBlock.x, (height + threadsperBlock.y - 1) / threadsperBlock.y);
    transformKernelTexture<<<numBlocks, threadsperBlock>>>(outputTexture, texObj, width, height, angle);
    transformKernelNoTexture<<<numBlocks, threadsperBlock>>>(outputNoTexture, width, height, angle);

    // Kernel launches are asynchronous; wait before tearing down the texture object
    cudaDeviceSynchronize();
    cudaDestroyTextureObject(texObj);
    cudaFreeArray(cuArray);
    cudaFree(outputTexture);
    cudaFree(outputNoTexture);
    std::free(h_data);
    return 0;
}
Profiling with Nsight Systems and Nsight Compute, I found that the version without the texture is faster. What puzzles me, though, is how prevalent textures seem to be in filtering code. Tracing OpenCV's CUDA path in C++ with Nsight Systems showed calls to cudaMallocPitch and cudaMemcpy2D, and CUVI appears to do something similar, judging by the pitch-related parameters in its API.
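For what it's worth, those pitch-related calls do not rule textures out: a buffer allocated with cudaMallocPitch can itself back a texture object through cudaResourceTypePitch2D, so seeing cudaMallocPitch/cudaMemcpy2D in a trace does not tell me whether the kernels read it through a texture or through plain pointers. Here is a minimal sketch of that combination, reusing width, height and h_data from my code above (this is my own illustration, not anything taken from OpenCV or CUVI):

    // Hypothetical sketch: pitched linear memory backing a texture object
    float *d_pitched = nullptr;
    size_t pitch = 0;
    cudaMallocPitch(&d_pitched, &pitch, width * sizeof(float), height);
    cudaMemcpy2D(d_pitched, pitch, h_data, width * sizeof(float),
                 width * sizeof(float), height, cudaMemcpyHostToDevice);

    cudaResourceDesc pitchResDesc;
    memset(&pitchResDesc, 0, sizeof(pitchResDesc));
    pitchResDesc.resType = cudaResourceTypePitch2D;
    pitchResDesc.res.pitch2D.devPtr = d_pitched;
    pitchResDesc.res.pitch2D.desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    pitchResDesc.res.pitch2D.width = width;
    pitchResDesc.res.pitch2D.height = height;
    pitchResDesc.res.pitch2D.pitchInBytes = pitch;

    cudaTextureDesc pitchTexDesc;
    memset(&pitchTexDesc, 0, sizeof(pitchTexDesc));
    pitchTexDesc.addressMode[0] = cudaAddressModeClamp;   // clamp addressing for simplicity
    pitchTexDesc.addressMode[1] = cudaAddressModeClamp;
    pitchTexDesc.filterMode = cudaFilterModeLinear;
    pitchTexDesc.readMode = cudaReadModeElementType;
    pitchTexDesc.normalizedCoords = 1;

    cudaTextureObject_t pitchedTexObj = 0;
    cudaCreateTextureObject(&pitchedTexObj, &pitchResDesc, &pitchTexDesc, nullptr);
    // pitchedTexObj can now be passed to transformKernelTexture just like texObj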
All of this makes me wonder whether textures have a significantly greater performance advantage that I am simply not aware of.
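One thing I realize about my own comparison: transformKernelNoTexture never reads the image at all, so it does not pay for the part the texture unit accelerates. Reading the image without a texture would mean doing the bilinear interpolation and border handling by hand, roughly like this (a hypothetical sketch, not part of the timed code above):

    // Hypothetical sketch: manual bilinear fetch from a plain global-memory image,
    // approximating what tex2D with cudaFilterModeLinear does in hardware
    __device__ float fetchBilinear(const float *img, int width, int height, float u, float v)
    {
        // Normalized coordinates to texel space (texel centers sit at half-integer positions)
        float x = u * width - 0.5f;
        float y = v * height - 0.5f;
        int x0 = static_cast<int>(floorf(x));
        int y0 = static_cast<int>(floorf(y));
        float fx = x - x0;
        float fy = y - y0;

        // Clamp addressing instead of wrap, to keep the sketch short
        int x1 = min(max(x0 + 1, 0), width - 1);
        int y1 = min(max(y0 + 1, 0), height - 1);
        x0 = min(max(x0, 0), width - 1);
        y0 = min(max(y0, 0), height - 1);

        // Four global loads plus the weighting arithmetic
        float c00 = img[y0 * width + x0];
        float c10 = img[y0 * width + x1];
        float c01 = img[y1 * width + x0];
        float c11 = img[y1 * width + x1];
        return (1.0f - fy) * ((1.0f - fx) * c00 + fx * c10) + fy * ((1.0f - fx) * c01 + fx * c11);
    }

In the texture version, a single tex2D call covers that whole fetch, including the four reads and the weighting.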