I am trying to implement basic intrinsic function for streaming load/store operation for floating point and integer numbers. These functions are based on the examples given in the Inline PTX in CUDA guide and Cache operator PTX instruction givein Version 2.3, Section 8.7.5.1, Page 110).
__device__ inline int load_streaming_int(int* addr)
{
int res;
asm("ld.cs.global.s32 %0, [%1];" : "=r"(res) : "l"(addr));
return res;
}
__device__ inline void store_streaming_int(int* addr, int val)
{
asm("st.cs.global.s32 [%0], %1;" : "+l"(addr): "r"(val));
}
I ran a simple example kernel (given below) and used the NVIDIA profiler for measuring the load/store performance. Strangely, the profiler counters “uncached global load transaction” is always zero. I was expecting a non-zero value for this counter. I checked the generated PTX code and they seem to use the st.cs and ld.cs (cache streaming) instructions.
__device__ int d_global_mem[NUM_VALUES];
int h_global_mem[NUM_VALUES];
__global__ void test_streaming()
{
int gid=threadIdx.x+blockIdx.x*blockDim.x;
int nthreads = blockDim.x*gridDim.x;
for (int i = gid; i < NUM_VALUES; i+= nthreads) {
int val = load_streaming_int(&d_global_mem[i]);
val = val*val;
store_streaming_int(&d_global_mem[i], val);
}
}
Any suggestions would be appreciated.