I tried to test this code on a T4 graphics card, but I still got the result that the coalesced version of the memory access is no faster than the non-coalesced version. Can you help me explain this phenomenon?
non-coalesced code:
#include <algorithm>
#include <chrono>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#define CUDA_CHECK(status) \
do \
{ \
auto ret = (status); \
if (ret != 0) \
{ \
throw std::runtime_error("cuda failure: " + std::to_string(ret) + \
" (" + cudaGetErrorString(ret) + ")" + \
" at " + __FILE__ + ":" + \
std::to_string(__LINE__)); \
} \
} while (0)
#define CUDAOP_CHECK_CUDA_STATUS(status) \
do \
{ \
auto rst = status; \
if ((rst) != cudaSuccess) \
{ \
throw std::runtime_error("cuda err: " + \
std::to_string(static_cast<int>(rst)) + \
" (" + cudaGetErrorString(rst) + ")" + \
" at " + __FILE__ + ":" + \
std::to_string(__LINE__)); \
} \
} while (0)
#define CUDA_CHECK_AND_FREE(device_ptr) \
do \
{ \
if (device_ptr) \
{ \
cudaFree(device_ptr); \
} \
} while (0)
class Clock
{
public:
Clock() = delete;
Clock(std::string event) : _event(event)
{
_start_time = std::chrono::high_resolution_clock::now();
}
~Clock()
{
std::cout << _event << formatTime() << std::endl;
};
double DurationMs()
{
using namespace std::chrono;
typedef duration<double, std::ratio<1, 1000>> milliSecond;
milliSecond duration_ms =
duration_cast<milliSecond>(high_resolution_clock::now() - _start_time);
return duration_ms.count();
};
private:
std::string formatTime()
{
auto ms = DurationMs();
if (ms < 1.0)
{
return std::to_string(ms * 1000.0) + "μs";
}
if (ms > 1000.0)
{
return std::to_string(ms / 1000.0) + "s";
}
return std::to_string(ms) + "ms";
}
std::string _event;
std::chrono::high_resolution_clock::time_point _start_time;
};
#define Type float
inline void RandomInt8Vector(std::vector<unsigned char> &vec_uc)
{
srand((unsigned int)(time(NULL)));
std::for_each(vec_uc.begin(), vec_uc.end(), [](unsigned char &uc)
{ uc = rand() % 255; });
}
inline void RandomFloatVector(std::vector<float> &vec_f)
{
srand((unsigned int)(time(NULL)));
std::for_each(vec_f.begin(), vec_f.end(), [](float &f)
{ f = (rand() % 255) / 255.f; });
}
__global__ void kernel(void *src, void *dst, int height, int width, int size)
{
int block_top_left_x = blockIdx.x * blockDim.x;
int block_top_left_y = blockIdx.y * blockDim.y;
int index = threadIdx.x + block_top_left_x + (threadIdx.y + block_top_left_y) * width;
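// each thread copies the 3 channel values of element "index", so neighbouring threads
// in a warp access addresses 3 * sizeof(Type) = 12 bytes apart (strided, non-coalesced accesses)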
for (int i = 0; i < 3; i++)
{
static_cast<Type *>(dst)[index * 3 + i] = static_cast<Type *>(src)[index * 3 + i];
}
}
int main(int argc, char **argv)
{
void *src, *dst;
int height = 1024, width = 20480;
int size = height * width * 3;
std::vector<Type> input(size);
RandomFloatVector(input);
CUDA_CHECK(cudaMalloc(&src, size * sizeof(Type)));
CUDA_CHECK(cudaMalloc(&dst, size * sizeof(Type)));
CUDA_CHECK(cudaMemcpy(src, input.data(), size * sizeof(Type), cudaMemcpyHostToDevice));
dim3 block(32, 16);
int grid_x = (width + block.x - 1) / block.x;
int grid_y = (height + block.y - 1) / block.y;
dim3 grid(grid_x, grid_y);
for (int i = 0; i < 10; i++)
{
Clock clk("ori test ");
kernel<<<grid, block, 0, 0>>>(src, dst, height, width, size);
cudaDeviceSynchronize();
}
CUDAOP_CHECK_CUDA_STATUS(cudaGetLastError());
std::vector<Type> cuda_res(size);
CUDA_CHECK(cudaMemcpy(cuda_res.data(), dst, size * sizeof(Type), cudaMemcpyDeviceToHost));
int cnt = 0;
for (int i = 0; i < size; i++)
{
if (cuda_res[i] != input[i])
{
cnt++;
}
}
std::cout << "error rate = " << (float)cnt / size * 100 << "%" << std::endl;
CUDA_CHECK_AND_FREE(src);
CUDA_CHECK_AND_FREE(dst);
return 0;
}
coalesced code:
#include <algorithm>
#include <chrono>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#define CUDA_CHECK(status) \
do \
{ \
auto ret = (status); \
if (ret != 0) \
{ \
throw std::runtime_error("cuda failure: " + std::to_string(ret) + \
" (" + cudaGetErrorString(ret) + ")" + \
" at " + __FILE__ + ":" + \
std::to_string(__LINE__)); \
} \
} while (0)
#define CUDAOP_CHECK_CUDA_STATUS(status) \
do \
{ \
auto rst = status; \
if ((rst) != cudaSuccess) \
{ \
throw std::runtime_error("cuda err: " + \
std::to_string(static_cast<int>(rst)) + \
" (" + cudaGetErrorString(rst) + ")" + \
" at " + __FILE__ + ":" + \
std::to_string(__LINE__)); \
} \
} while (0)
#define CUDA_CHECK_AND_FREE(device_ptr) \
do \
{ \
if (device_ptr) \
{ \
cudaFree(device_ptr); \
} \
} while (0)
class Clock
{
public:
Clock() = delete;
Clock(std::string event) : _event(event)
{
_start_time = std::chrono::high_resolution_clock::now();
}
~Clock()
{
std::cout << _event << formatTime() << std::endl;
};
double DurationMs()
{
using namespace std::chrono;
typedef duration<double, std::ratio<1, 1000>> milliSecond;
milliSecond duration_ms =
duration_cast<milliSecond>(high_resolution_clock::now() - _start_time);
return duration_ms.count();
};
private:
std::string formatTime()
{
auto ms = DurationMs();
if (ms < 1.0)
{
return std::to_string(ms * 1000.0) + "μs";
}
if (ms > 1000.0)
{
return std::to_string(ms / 1000.0) + "s";
}
return std::to_string(ms) + "ms";
}
std::string _event;
std::chrono::high_resolution_clock::time_point _start_time;
};
#define Type float
inline void RandomInt8Vector(std::vector<unsigned char> &vec_uc)
{
srand((unsigned int)(time(NULL)));
std::for_each(vec_uc.begin(), vec_uc.end(), [](unsigned char &uc)
{ uc = rand() % 255; });
}
inline void RandomFloatVector(std::vector<float> &vec_f)
{
srand((unsigned int)(time(NULL)));
std::for_each(vec_f.begin(), vec_f.end(), [](float &f)
{ f = (rand() % 255) / 255.f; });
}
__global__ void kernel(void *src, void *dst, int height, int width)
{
int block_top_left_x = blockIdx.x * blockDim.x;
int block_top_left_y = blockIdx.y * blockDim.y;
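// for a fixed i, consecutive threadIdx.x values touch consecutive elements of src/dst,
// so the 32 accesses of a warp fall into one contiguous 128-byte segment (coalesced)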
for (int i = 0; i < 3; i++)
{
int index = threadIdx.x + block_top_left_x * 3 + i * blockDim.x + (threadIdx.y + block_top_left_y) * width * 3;
static_cast<Type *>(dst)[index] = static_cast<Type *>(src)[index];
}
}
int main(int argc, char **argv)
{
void *src, *dst;
int height = 1024, width = 20480;
int size = height * width * 3;
std::vector<Type> input(size);
RandomFloatVector(input);
CUDA_CHECK(cudaMalloc(&src, size * sizeof(Type)));
CUDA_CHECK(cudaMalloc(&dst, size * sizeof(Type)));
CUDA_CHECK(cudaMemcpy(src, input.data(), size * sizeof(Type), cudaMemcpyHostToDevice));
dim3 block(32, 16);
int grid_x = (width + block.x - 1) / block.x;
int grid_y = (height + block.y - 1) / block.y;
dim3 grid(grid_x, grid_y);
for (int i = 0; i < 10; i++)
{
Clock clk("coalesced test ");
kernel<<<grid, block, 0, 0>>>(src, dst, height, width);
cudaDeviceSynchronize();
}
CUDAOP_CHECK_CUDA_STATUS(cudaGetLastError());
std::vector<Type> cuda_res(size);
CUDA_CHECK(cudaMemcpy(cuda_res.data(), dst, size * sizeof(Type), cudaMemcpyDeviceToHost));
int cnt = 0;
for (int i = 0; i < size; i++)
{
if (cuda_res[i] != input[i])
{
cnt++;
}
}
std::cout << "error rate = " << (float)cnt / size * 100 << "%" << std::endl;
CUDA_CHECK_AND_FREE(src);
CUDA_CHECK_AND_FREE(dst);
return 0;
}
test result (1024*20480):
//ori version
ori test 1.018737ms
ori test 942.434000μs
ori test 941.261000μs
ori test 929.056000μs
ori test 932.231000μs
ori test 929.277000μs
ori test 929.843000μs
ori test 930.698000μs
ori test 929.305000μs
ori test 929.050000μs
//coalesced version
coalesced test 1.020968ms
coalesced test 951.732000μs
coalesced test 949.066000μs
coalesced test 954.971000μs
coalesced test 949.890000μs
coalesced test 950.264000μs
coalesced test 950.621000μs
coalesced test 950.881000μs
coalesced test 951.645000μs
coalesced test 948.397000μs
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03 Driver Version: 460.91.03 CUDA Version: 11.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 GeForce RTX 208... Off | 00000000:02:00.0 Off | N/A |
| 26% 50C P0 19W / 250W | 0MiB / 11019MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
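In case the measurement method matters: the Clock above times the launch plus cudaDeviceSynchronize on the host. Below is a minimal sketch of how I could instead time the kernel with CUDA events and derive an effective bandwidth, assuming it replaces the timing loop in main above (for the non-coalesced kernel the extra size argument has to be added to the launch):

cudaEvent_t start, stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));
for (int i = 0; i < 10; i++)
{
    // record events around the launch so only device execution time is measured
    CUDA_CHECK(cudaEventRecord(start, 0));
    kernel<<<grid, block, 0, 0>>>(src, dst, height, width);
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float ms = 0.f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    // one read + one write of the whole buffer
    double gbytes = 2.0 * size * sizeof(Type) / 1e9;
    std::cout << "event time " << ms << "ms, ~" << gbytes / (ms / 1000.0) << " GB/s" << std::endl;
}
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));

With that I could compare the GB/s achieved by the two kernels against the card's peak memory bandwidth.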