i test this metrics lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex (# of bytes requested for TEX instructions)through this kernel:
__global__ void LtsTBytesEquivL1sectormissPipeTexMemTextureKernel0(float *output, cudaTextureObject_t tex_obj,
int width, int height, float theta) {
// Calculate normalized texture coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
float u = x / static_cast<float>(width);
float v = y / static_cast<float>(height);
// Transform coordinates
// u -= 0.5f;
// v -= 0.5f;
// float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
// float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;
// Read from texture and write to global memory
output[y * width + x] = tex2D<float>(tex_obj, u, v);
}
void LtsTBytesEquivL1sectormissPipeTexMemTexture() {
const int test_num = 1;
const int loop_num = 1;
int grid_num[test_num]{32};
int thread_num[test_num]{32};
float *h_src[test_num] = {nullptr};
float *d_res[test_num] = {nullptr};
float angle = 0.5;
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray_t cu_array[test_num];
std::srand(time(nullptr)); // use current time as seed for random generator
CUDA_CHECK(cudaSetDevice(0));
for (int i = 0; i < loop_num; i++) {
dim3 block(thread_num[i]);
dim3 grid(grid_num[i]);
// alloc cpu memory
h_src[i] = new float[thread_num[i] * grid_num[i]];
for (int j = 0; j < grid_num[i] * thread_num[i]; j++) {
h_src[i][j] = (std::rand() % 255);
}
// Allocate CUDA array in device memory
CUDA_CHECK(cudaMallocArray(&cu_array[i], &channel_desc, thread_num[i], grid_num[i]));
// Set pitch of the source (the width in memory in bytes of the 2D array pointed
// to by src, including padding), we dont have any padding
const size_t spitch = thread_num[i] * sizeof(float);
// Copy data located at address h_data in host memory to device memory
CUDA_CHECK(cudaMemcpy2DToArray(cu_array[i], 0, 0, h_src[i], spitch, thread_num[i] * sizeof(float), grid_num[i],
cudaMemcpyHostToDevice));
// Specify texture
struct cudaResourceDesc res_desc;
memset(&res_desc, 0, sizeof(res_desc));
res_desc.resType = cudaResourceTypeArray;
res_desc.res.array.array = cu_array[i];
// Specify texture object parameters
struct cudaTextureDesc tex_desc;
memset(&tex_desc, 0, sizeof(tex_desc));
tex_desc.addressMode[0] = cudaAddressModeWrap;
tex_desc.addressMode[1] = cudaAddressModeWrap;
tex_desc.filterMode = cudaFilterModeLinear;
tex_desc.readMode = cudaReadModeElementType;
tex_desc.normalizedCoords = 1;
// Create texture object
cudaTextureObject_t tex_obj = 0;
cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
// Allocate result of transformation in device memory
CUDA_CHECK(cudaMalloc(&d_res[i], grid_num[i] * thread_num[i] * sizeof(float)));
// kernel
switch (i) {
case 0: {
LtsTBytesEquivL1sectormissPipeTexMemTextureKernel0<<<grid, block>>>(d_res[i], tex_obj, thread_num[i],
grid_num[i], angle);
break;
}
default:
break;
}
CUDA_CHECK(cudaStreamSynchronize(0));
CUDA_CHECK(cudaGetLastError());
// Destroy texture object
cudaDestroyTextureObject(tex_obj);
cudaFreeArray(cu_array[i]);
delete[] h_src[i];
CUDA_FREE(d_res[i]);
}
}
i lanuch my kernel with (32,32) threads, and the kernel just read them ,and write to global memory(without Transform coordinates), here is the results:
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.avg byte 512
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.max byte nan
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.min byte nan
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.sum Kbyte 16.38
i want to ask why it is 16384byte , i only malloc 32324=4096byte?