When I used txq’s texture query instruction as described by cuda ptx isa, I encountered a cuda error of 500.
I found I can successfully querywidth, height, depth
, but when I tried to query other object such as normalized_coords, channel_data_type, etc...
, my code will return an error code with 500: cuda named symbol not found.
I think maybe there is some difference between width
and normalized_coords
, cause in cuda spec the former is integer and the latter is enum, but in ptx isa define they are all .b32
type, and I can’t find out exactly how to access normalized_coords
with txq instrction in ptx isa spec.
Tried on both RTX3080 and RTX4060 and all tests are failed. I present a demo code texture_query_demo.cu
below, which can be reproduced by compiling instructions:
nvcc texture_query_demo.cu -o texture_query_demo.o
./texture_query_demo.o
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>
#include <sys/time.h>
#include <iostream>
__global__ void Texture_query(cudaTextureObject_t texObj, unsigned* output) {
uint32_t result;
#if 0
// success
asm("txq.width.b32 %0, [%1];" : "=r"(result) : "l"(texObj));
#else
// failed
asm("txq.normalized_coords.b32 %0, [%1];" : "=r"(result) : "l"(texObj));
#endif
*output = result;
}
int main() {
cudaError_t err = cudaSuccess;
int w = 0x10;
std::cout << "witdh is " << w << std::endl;
int h = 1;
int d = 1;
int numElements = w * h * d;
int dataLen = sizeof(float);
int size = numElements * dataLen;
cudaExtent extent;
extent.width = w;
extent.height = h == 1 ? 0 : h;
extent.depth = d == 1 ? 0 : d;
cudaExtent copyExtent;
copyExtent.width = w;
copyExtent.height = h;
copyExtent.depth = d;
uint cudaArrayFlag = cudaArrayDefault;
// uint cudaArrayFlag = cudaArrayLayered;
float *f_A = reinterpret_cast<float *>(malloc(size));
for (int i = 0; i < numElements; ++i) {
f_A[i] = static_cast<float>(i) * 1.0f + 1.0f;
}
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0,
cudaChannelFormatKindFloat);
cudaArray *cu_input_array_a;
// err = cudaMallocArray(&cu_input_array_a, &channelDesc, w, h);
err = cudaMalloc3DArray(&cu_input_array_a, &channelDesc, extent,
cudaArrayFlag);
if (err != cudaSuccess) {
std::cout << "Failed to allocate device cu_input_array_a (error code "
<< cudaGetErrorString(err) << ")!\n";
}
cudaMemcpy3DParms copyParams = { 0 };
copyParams.srcPtr = make_cudaPitchedPtr(reinterpret_cast<void *>(f_A),
copyExtent.width * dataLen, copyExtent.width, copyExtent.height);
copyParams.extent = copyExtent;
copyParams.kind = cudaMemcpyHostToDevice;
copyParams.dstArray = cu_input_array_a;
err = cudaMemcpy3D(©Params);
if (err != cudaSuccess) {
std::cout
<< "Failed to copy cu_input_array_a from host to device (error code "
<< cudaGetErrorString(err) << ")!\n";
}
// Specify surface
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cu_input_array_a;
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeMirror;
texDesc.filterMode = cudaFilterModePoint;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 1;
texDesc.borderColor[0] = 0.0f;
cudaTextureObject_t texobj = 0;
cudaCreateTextureObject(&texobj, &resDesc, &texDesc, NULL);
unsigned *output = (unsigned *) malloc(sizeof(unsigned) * 1);
unsigned* d_output;
cudaMalloc((unsigned**) &d_output, sizeof(unsigned) * 1);
// Launch the Vector Add CUDA Kernel
dim3 thread(1, h);
dim3 block(d, 1);
constexpr int execution_times = 1;
auto start_time = std::chrono::system_clock::now();
for (int execution_time = 0; execution_time < execution_times;
execution_time++) {
Texture_query<<<1, 1>>>(texobj, d_output);
cudaMemcpy(output, d_output, sizeof(unsigned) * 1,
cudaMemcpyDeviceToHost);
}
cudaDeviceSynchronize();
auto end_time = std::chrono::system_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
end_time - start_time);
std::cout << "total_time(ms)"
<< float(duration.count()) / float(execution_times) << std::endl;
cudaDeviceSynchronize();
err = cudaGetLastError();
if (err != cudaSuccess) {
std::cout << "Failed to launch Texture kernel (error code " << err
<< " : " << cudaGetErrorString(err) << ")!\n";
} else {
std::cout << "query result is " << *output << std::endl;
}
cudaFreeArray(cu_input_array_a);
cudaDestroySurfaceObject(texobj);
// Free host memory
free(f_A);
}