Got cuda error code 500 while using texture query instruction

When I used txq’s texture query instruction as described by cuda ptx isa, I encountered a cuda error of 500.
I found I can successfully query width, height, and depth, but when I tried to query other attributes such as normalized_coords, channel_data_type, etc., my code returned error code 500: cudaErrorSymbolNotFound ("named symbol not found").
I think there may be some difference between width and normalized_coords, because in the CUDA spec the former is an integer and the latter is an enum; yet in the PTX ISA definition they are both of type .b32, and I cannot work out from the PTX ISA spec exactly how to access normalized_coords with the txq instruction.
I tried this on both an RTX 3080 and an RTX 4060, and all tests failed. Below is a demo program, texture_query_demo.cu, with which the issue can be reproduced using the following build and run commands:

nvcc texture_query_demo.cu -o texture_query_demo.o
./texture_query_demo.o
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>

#include <sys/time.h>
#include <iostream>

// Queries a single attribute of `texObj` via the PTX `txq` instruction and
// writes the raw .b32 result to *output. Intended to be launched with a
// single thread (<<<1, 1>>>); every thread would write the same *output.
__global__ void Texture_query(cudaTextureObject_t texObj, unsigned* output) {
	uint32_t result;
#if 0
	// success: geometry queries (width/height/depth) return the expected value.
	asm("txq.width.b32 %0, [%1];" : "=r"(result) : "l"(texObj));
#else
	// failed: this sampler-attribute query makes the module fail to load with
	// CUDA error 500 ("named symbol not found") on RTX 3080 / RTX 4060.
	// NOTE(review): the PTX ISA lists normalized_coords under the queries that
	// apply to a .samplerref in independent texturing mode — confirm whether a
	// texture-object operand is actually valid for this form of txq.
	asm("txq.normalized_coords.b32 %0, [%1];" : "=r"(result) : "l"(texObj));
#endif
	*output = result;
}

/*
 * Repro driver for the txq query issue: allocates a 16-texel 1-D float CUDA
 * array, uploads host data, wraps it in a texture object created with
 * normalizedCoords = 1, launches the single-thread Texture_query kernel,
 * and prints either the queried value or the CUDA launch error.
 * Returns 0 on success, 1 if the device array cannot be allocated.
 */
int main() {
	cudaError_t err = cudaSuccess;
	int w = 0x10;  // 16 texels wide
	std::cout << "width is " << w << std::endl;
	int h = 1;
	int d = 1;
	int numElements = w * h * d;
	int dataLen = sizeof(float);
	int size = numElements * dataLen;

	// cudaMalloc3DArray treats height == 0 / depth == 0 as "dimension
	// absent", so collapse the degenerate dimensions for the allocation...
	cudaExtent extent;
	extent.width = w;
	extent.height = h == 1 ? 0 : h;
	extent.depth = d == 1 ? 0 : d;

	// ...while cudaMemcpy3D expects the full logical extent (>= 1 per axis).
	cudaExtent copyExtent;
	copyExtent.width = w;
	copyExtent.height = h;
	copyExtent.depth = d;

	uint cudaArrayFlag = cudaArrayDefault;
	// uint cudaArrayFlag = cudaArrayLayered;

	// Host source data: 1.0f, 2.0f, 3.0f, ...
	float *f_A = reinterpret_cast<float *>(malloc(size));
	for (int i = 0; i < numElements; ++i) {
		f_A[i] = static_cast<float>(i) + 1.0f;
	}

	// One 32-bit float channel per texel.
	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0,
			cudaChannelFormatKindFloat);

	cudaArray *cu_input_array_a = nullptr;

	err = cudaMalloc3DArray(&cu_input_array_a, &channelDesc, extent,
			cudaArrayFlag);
	if (err != cudaSuccess) {
		std::cout << "Failed to allocate device cu_input_array_a (error code "
				<< cudaGetErrorString(err) << ")!\n";
		// Nothing below can work without the array; bail out early.
		free(f_A);
		return 1;
	}

	cudaMemcpy3DParms copyParams = { 0 };
	copyParams.srcPtr = make_cudaPitchedPtr(reinterpret_cast<void *>(f_A),
			copyExtent.width * dataLen, copyExtent.width, copyExtent.height);
	copyParams.extent = copyExtent;
	copyParams.kind = cudaMemcpyHostToDevice;
	copyParams.dstArray = cu_input_array_a;
	err = cudaMemcpy3D(&copyParams);
	if (err != cudaSuccess) {
		std::cout
				<< "Failed to copy cu_input_array_a from host to device (error code "
				<< cudaGetErrorString(err) << ")!\n";
	}

	// Describe the texture resource (the CUDA array just filled).
	// Value-initialization zeroes the structs, so no memset is needed.
	cudaResourceDesc resDesc = {};
	resDesc.resType = cudaResourceTypeArray;
	resDesc.res.array.array = cu_input_array_a;

	// Sampler state: normalizedCoords = 1 is the attribute the kernel
	// attempts to read back with txq.normalized_coords.
	cudaTextureDesc texDesc = {};
	texDesc.addressMode[0] = cudaAddressModeMirror;
	texDesc.filterMode = cudaFilterModePoint;
	texDesc.readMode = cudaReadModeElementType;
	texDesc.normalizedCoords = 1;
	texDesc.borderColor[0] = 0.0f;

	cudaTextureObject_t texobj = 0;
	err = cudaCreateTextureObject(&texobj, &resDesc, &texDesc, NULL);
	if (err != cudaSuccess) {
		std::cout << "Failed to create texture object (error code "
				<< cudaGetErrorString(err) << ")!\n";
	}

	unsigned *output = reinterpret_cast<unsigned *>(malloc(sizeof(unsigned)));
	unsigned *d_output = nullptr;
	err = cudaMalloc(reinterpret_cast<void **>(&d_output), sizeof(unsigned));
	if (err != cudaSuccess) {
		std::cout << "Failed to allocate d_output (error code "
				<< cudaGetErrorString(err) << ")!\n";
	}

	constexpr int execution_times = 1;
	auto start_time = std::chrono::system_clock::now();
	for (int execution_time = 0; execution_time < execution_times;
			execution_time++) {
		Texture_query<<<1, 1>>>(texobj, d_output);
		// A blocking cudaMemcpy also synchronizes with the kernel above.
		cudaMemcpy(output, d_output, sizeof(unsigned),
				cudaMemcpyDeviceToHost);
	}
	cudaDeviceSynchronize();
	auto end_time = std::chrono::system_clock::now();
	auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
			end_time - start_time);
	std::cout << "total_time(ms)"
			<< float(duration.count()) / float(execution_times) << std::endl;

	// The error-500 failure surfaces here, after the synchronizing calls.
	err = cudaGetLastError();
	if (err != cudaSuccess) {
		std::cout << "Failed to launch Texture kernel (error code " << err
				<< " : " << cudaGetErrorString(err) << ")!\n";
	} else {
		std::cout << "query result is " << *output << std::endl;
	}

	// Cleanup. texobj is a *texture* object, so release it with
	// cudaDestroyTextureObject (the original mistakenly called
	// cudaDestroySurfaceObject). Also free the device and host buffers
	// the original leaked.
	cudaDestroyTextureObject(texobj);
	cudaFreeArray(cu_input_array_a);
	cudaFree(d_output);
	free(output);
	free(f_A);
	return 0;
}

Can you create C++ kernels using those queries and look at the resulting PTX?

According to my search results, NVIDIA does not provide a CUDA runtime- or driver-level API for these texture attribute queries, so the only way I can access them is through PTX.