I have a test kernel doing some basic memory copies from a source array to a destination array. When the array elements are 64-bit or 128-bit in size, the L2 memory counters look as I’d expect and show no excessive loads. However, when the array elements are 256-bit or 512-bit in size, I see substantial “Theoretical Sectors Global Excessive”. This is confusing because, from a review of both the C++ source and the assembly, the 256-bit and 512-bit loads/stores are split into multiple 128-bit loads/stores, so what’s the difference?
The kernel is very basic and looks like this. The array elements are 64, 128, 256, or 512 bits in size, depending on the test case:
for (int i = 0, idx = ThreadStartOffset; i < Iterations; ++i, idx += Stride)
{
    ptrout[idx] = ptrin[idx];
}
The loads and stores in the 256-bit case look like this (in practice the loop is unrolled):
|00000023 00ba6260| LDG.E.128.SYS R4, [R26+0xc00] |
|00000023 00ba6270| LDG.E.128.SYS R8, [R26+0xc10] |
…
|00000023 00ba6300| STG.E.128.SYS [R24+0xc00], R4 |
|00000023 00ba6310| STG.E.128.SYS [R24+0xc10], R8 |
Any ideas where the excessive sectors could be coming from, or how I can trace them? My understanding was that by using 128-bit accesses I would avoid sector under-utilization patterns due to striding or lack of coalescing.
Note that this is a test case distilled from a larger problem which exhibits the same large theoretical excessive sectors.
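In case it helps anyone reproduce the numbers: I’m reading “Theoretical Sectors Global Excessive” from the Nsight Compute memory/source tables. A CLI invocation along these lines should collect the counter per kernel launch (the exact metric name is my assumption for recent ncu versions; `ncu --query-metrics` will list what your version actually supports, and ./my_test stands in for whatever the test binary is called):

ncu --kernel-name regex:basic_test --metrics memory_l2_theoretical_sectors_global_excessive ./my_test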
What is shown in the post is merely a code snippet, not a complete kernel. In order to make your observations reproducible by third parties, it is necessary to post minimal and complete code that others can cut & paste from the forum, then build & run on their local systems.
Here’s a full test case which builds under VS 2022 and should be easy to get working on other platforms (see the portability note after the listing). The int256 and int512 memory-access sizes show the issue, and the other sizes do not.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <memory.h>
#include <random>
#include <cstdio>
// MSVC-specific 16-byte alignment; the structs are 16, 32, and 64 bytes.
__declspec(align(16)) struct int128
{
    __int64 a[2];
};
__declspec(align(16)) struct int256
{
    __int64 a[4];
};
__declspec(align(16)) struct int512
{
    __int64 a[8];
};
template<class T>
__global__ void basic_test(char* in, char* out, size_t ByteCount)
{
    // Split the buffer evenly across blocks, then across the threads of each block.
    const int ByteCountPerThread = ByteCount / gridDim.x / blockDim.x;
    const int ByteCountPerBlock = ByteCountPerThread * blockDim.x;
    const int BlockStartOffsetByteIndex = ByteCountPerBlock * blockIdx.x;
    const int ThreadStartOffsetByteIndex = BlockStartOffsetByteIndex + threadIdx.x * sizeof(T);
    const int ThreadStartOffset = ThreadStartOffsetByteIndex / sizeof(T);
    const int Iterations = ByteCountPerThread / sizeof(T);
    const int Stride = 32; // Elements; equals the 32 threads per block launched below
    T* ptrin = (T*)in;
    T* ptrout = (T*)out;
    for (int i = 0, idx = ThreadStartOffset; i < Iterations; ++i, idx += Stride)
    {
        ptrout[idx] = ptrin[idx];
    }
}
int main()
{
    cudaSetDevice(0);
    const int TEST_MEMORY_SIZE = 1024 * 1024 * 256;
    char* in = (char*)malloc(TEST_MEMORY_SIZE);
    char* out = (char*)malloc(TEST_MEMORY_SIZE);
    char* in_device;
    char* out_device;
    cudaMalloc(&in_device, TEST_MEMORY_SIZE);
    cudaMalloc(&out_device, TEST_MEMORY_SIZE);
    memset(in, 0, TEST_MEMORY_SIZE);
    memset(out, 0, TEST_MEMORY_SIZE);
    for (int i = 0; i < TEST_MEMORY_SIZE; ++i)
    {
        // Set some values
        in[i] = (char)i;
    }
    cudaMemcpy(out_device, out, TEST_MEMORY_SIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(in_device, in, TEST_MEMORY_SIZE, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    // One launch per element size; only int256 and int512 show excessive sectors.
    basic_test<int><<<1024, 32>>>(in_device, out_device, TEST_MEMORY_SIZE);
    basic_test<__int64><<<1024, 32>>>(in_device, out_device, TEST_MEMORY_SIZE);
    basic_test<int128><<<1024, 32>>>(in_device, out_device, TEST_MEMORY_SIZE);
    basic_test<int256><<<1024, 32>>>(in_device, out_device, TEST_MEMORY_SIZE);
    basic_test<int512><<<1024, 32>>>(in_device, out_device, TEST_MEMORY_SIZE);
    cudaDeviceSynchronize();
    cudaMemcpy(out, out_device, TEST_MEMORY_SIZE, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cudaFree(in_device);
    cudaFree(out_device);
    free(in);
    free(out);
    printf("Done\n");
}