I tried to test this code on a T4 graphics card, but I still got the result that the coalesced version of the memory access is no faster than the non-coalesced version. Can you help me explain this phenomenon?
non-coalesced code:
#include <algorithm>
#include <chrono>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#define CUDA_CHECK(status) \
do \
{ \
auto ret = (status); \
if (ret != 0) \
{ \
throw std::runtime_error("cuda failure: " + std::to_string(ret) + \
" (" + cudaGetErrorString(ret) + ")" + \
" at " + __FILE__ + ":" + \
std::to_string(__LINE__)); \
} \
} while (0)
#define CUDAOP_CHECK_CUDA_STATUS(status) \
do \
{ \
auto rst = status; \
if ((rst) != cudaSuccess) \
{ \
throw std::runtime_error("cuda err: " + \
std::to_string(static_cast<int>(rst)) + \
" (" + cudaGetErrorString(rst) + ")" + \
" at " + __FILE__ + ":" + \
std::to_string(__LINE__)); \
} \
} while (0)
#define CUDA_CHECK_AND_FREE(device_ptr) \
do \
{ \
if (device_ptr) \
{ \
cudaFree(device_ptr); \
} \
} while (0)
class Clock
{
public:
Clock() = delete;
Clock(std::string event) : _event(event)
{
_start_time = std::chrono::high_resolution_clock::now();
}
~Clock()
{
std::cout << _event << formatTime() << std::endl;
};
double DurationMs()
{
using namespace std::chrono;
typedef duration<double, std::ratio<1, 1000>> milliSecond;
milliSecond duration_ms =
duration_cast<milliSecond>(high_resolution_clock::now() - _start_time);
return duration_ms.count();
};
private:
std::string formatTime()
{
auto ms = DurationMs();
if (ms < 1.0)
{
return std::to_string(ms * 1000.0) + "μs";
}
if (ms > 1000.0)
{
return std::to_string(ms / 1000.0) + "s";
}
return std::to_string(ms) + "ms";
}
std::string _event;
std::chrono::high_resolution_clock::time_point _start_time;
};
#define Type float
inline void RandomInt8Vector(std::vector<unsigned char> &vec_uc)
{
srand((unsigned int)(time(NULL)));
std::for_each(vec_uc.begin(), vec_uc.end(), [](unsigned char &uc)
{ uc = rand() % 255; });
}
inline void RandomFloatVector(std::vector<float> &vec_f)
{
srand((unsigned int)(time(NULL)));
std::for_each(vec_f.begin(), vec_f.end(), [](float &f)
{ f = (rand() % 255) / 255.f; });
}
__global__ void kernel(void *src, void *dst, int height, int width, int size)
{
int block_top_left_x = blockIdx.x * blockDim.x;
int block_top_left_y = blockIdx.y * blockDim.y;
int index = threadIdx.x + block_top_left_x + (threadIdx.y + block_top_left_y) * width;
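// each thread copies the 3 channel values of element "index", so neighbouring threads
// in a warp access addresses 3 * sizeof(Type) = 12 bytes apart (strided, non-coalesced accesses)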
for (int i = 0; i < 3; i++)
{
static_cast<Type *>(dst)[index * 3 + i] = static_cast<Type *>(src)[index * 3 + i];
}
}
int main(int argc, char **argv)
{
void *src, *dst;
int height = 1024, width = 20480;
int size = height * width * 3;
std::vector<Type> input(size);
RandomFloatVector(input);
CUDA_CHECK(cudaMalloc(&src, size * sizeof(Type)));
CUDA_CHECK(cudaMalloc(&dst, size * sizeof(Type)));
CUDA_CHECK(cudaMemcpy(src, input.data(), size * sizeof(Type), cudaMemcpyHostToDevice));
dim3 block(32, 16);
int grid_x = (width + block.x - 1) / block.x;
int grid_y = (height + block.y - 1) / block.y;
dim3 grid(grid_x, grid_y);
for (int i = 0; i < 10; i++)
{
Clock clk("ori test ");
kernel<<<grid, block, 0, 0>>>(src, dst, height, width, size);
cudaDeviceSynchronize();
}
CUDAOP_CHECK_CUDA_STATUS(cudaGetLastError());
std::vector<Type> cuda_res(size);
CUDA_CHECK(cudaMemcpy(cuda_res.data(), dst, size * sizeof(Type), cudaMemcpyDeviceToHost));
int cnt = 0;
for (int i = 0; i < size; i++)
{
if (cuda_res[i] != input[i])
{
cnt++;
}
}
std::cout << "error rate = " << (float)cnt / size * 100 << "%" << std::endl;
CUDA_CHECK_AND_FREE(src);
CUDA_CHECK_AND_FREE(dst);
return 0;
}
coalesced code:
#include <algorithm>
#include <chrono>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#define CUDA_CHECK(status) \
do \
{ \
auto ret = (status); \
if (ret != 0) \
{ \
throw std::runtime_error("cuda failure: " + std::to_string(ret) + \
" (" + cudaGetErrorString(ret) + ")" + \
" at " + __FILE__ + ":" + \
std::to_string(__LINE__)); \
} \
} while (0)
#define CUDAOP_CHECK_CUDA_STATUS(status) \
do \
{ \
auto rst = status; \
if ((rst) != cudaSuccess) \
{ \
throw std::runtime_error("cuda err: " + \
std::to_string(static_cast<int>(rst)) + \
" (" + cudaGetErrorString(rst) + ")" + \
" at " + __FILE__ + ":" + \
std::to_string(__LINE__)); \
} \
} while (0)
#define CUDA_CHECK_AND_FREE(device_ptr) \
do \
{ \
if (device_ptr) \
{ \
cudaFree(device_ptr); \
} \
} while (0)
class Clock
{
public:
Clock() = delete;
Clock(std::string event) : _event(event)
{
_start_time = std::chrono::high_resolution_clock::now();
}
~Clock()
{
std::cout << _event << formatTime() << std::endl;
};
double DurationMs()
{
using namespace std::chrono;
typedef duration<double, std::ratio<1, 1000>> milliSecond;
milliSecond duration_ms =
duration_cast<milliSecond>(high_resolution_clock::now() - _start_time);
return duration_ms.count();
};
private:
std::string formatTime()
{
auto ms = DurationMs();
if (ms < 1.0)
{
return std::to_string(ms * 1000.0) + "μs";
}
if (ms > 1000.0)
{
return std::to_string(ms / 1000.0) + "s";
}
return std::to_string(ms) + "ms";
}
std::string _event;
std::chrono::high_resolution_clock::time_point _start_time;
};
#define Type float
inline void RandomInt8Vector(std::vector<unsigned char> &vec_uc)
{
srand((unsigned int)(time(NULL)));
std::for_each(vec_uc.begin(), vec_uc.end(), [](unsigned char &uc)
{ uc = rand() % 255; });
}
inline void RandomFloatVector(std::vector<float> &vec_f)
{
srand((unsigned int)(time(NULL)));
std::for_each(vec_f.begin(), vec_f.end(), [](float &f)
{ f = (rand() % 255) / 255.f; });
}
__global__ void kernel(void *src, void *dst, int height, int width)
{
int block_top_left_x = blockIdx.x * blockDim.x;
int block_top_left_y = blockIdx.y * blockDim.y;
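// for a fixed i, consecutive threadIdx.x values touch consecutive elements of src/dst,
// so the 32 accesses of a warp fall into one contiguous 128-byte segment (coalesced)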
for (int i = 0; i < 3; i++)
{
int index = threadIdx.x + block_top_left_x * 3 + i * blockDim.x + (threadIdx.y + block_top_left_y) * width * 3;
static_cast<Type *>(dst)[index] = static_cast<Type *>(src)[index];
}
}
int main(int argc, char **argv)
{
void *src, *dst;
int height = 1024, width = 20480;
int size = height * width * 3;
std::vector<Type> input(size);
RandomFloatVector(input);
CUDA_CHECK(cudaMalloc(&src, size * sizeof(Type)));
CUDA_CHECK(cudaMalloc(&dst, size * sizeof(Type)));
CUDA_CHECK(cudaMemcpy(src, input.data(), size * sizeof(Type), cudaMemcpyHostToDevice));
dim3 block(32, 16);
int grid_x = (width + block.x - 1) / block.x;
int grid_y = (height + block.y - 1) / block.y;
dim3 grid(grid_x, grid_y);
for (int i = 0; i < 10; i++)
{
Clock clk("coalesced test ");
kernel<<<grid, block, 0, 0>>>(src, dst, height, width);
cudaDeviceSynchronize();
}
CUDAOP_CHECK_CUDA_STATUS(cudaGetLastError());
std::vector<Type> cuda_res(size);
CUDA_CHECK(cudaMemcpy(cuda_res.data(), dst, size * sizeof(Type), cudaMemcpyDeviceToHost));
int cnt = 0;
for (int i = 0; i < size; i++)
{
if (cuda_res[i] != input[i])
{
cnt++;
}
}
std::cout << "error rate = " << (float)cnt / size * 100 << "%" << std::endl;
CUDA_CHECK_AND_FREE(src);
CUDA_CHECK_AND_FREE(dst);
return 0;
}
test result (1024*20480):
//ori version
ori test 1.018737ms
ori test 942.434000μs
ori test 941.261000μs
ori test 929.056000μs
ori test 932.231000μs
ori test 929.277000μs
ori test 929.843000μs
ori test 930.698000μs
ori test 929.305000μs
ori test 929.050000μs
//coalesced version
coalesced test 1.020968ms
coalesced test 951.732000μs
coalesced test 949.066000μs
coalesced test 954.971000μs
coalesced test 949.890000μs
coalesced test 950.264000μs
coalesced test 950.621000μs
coalesced test 950.881000μs
coalesced test 951.645000μs
coalesced test 948.397000μs
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03 Driver Version: 460.91.03 CUDA Version: 11.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 GeForce RTX 208... Off | 00000000:02:00.0 Off | N/A |
| 26% 50C P0 19W / 250W | 0MiB / 11019MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
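In case the measurement method matters: the Clock above times the launch plus cudaDeviceSynchronize on the host. Below is a minimal sketch of how I could instead time the kernel with CUDA events and derive an effective bandwidth, assuming it replaces the timing loop in main above (for the non-coalesced kernel the extra size argument has to be added to the launch):

cudaEvent_t start, stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));
for (int i = 0; i < 10; i++)
{
    // record events around the launch so only device execution time is measured
    CUDA_CHECK(cudaEventRecord(start, 0));
    kernel<<<grid, block, 0, 0>>>(src, dst, height, width);
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float ms = 0.f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    // one read + one write of the whole buffer
    double gbytes = 2.0 * size * sizeof(Type) / 1e9;
    std::cout << "event time " << ms << "ms, ~" << gbytes / (ms / 1000.0) << " GB/s" << std::endl;
}
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));

With that I could compare the GB/s achieved by the two kernels against the card's peak memory bandwidth.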