I am testing how I can overlap cudaMemcpy calls with kernel executions without having to use cudaMallocHost, i.e. pinned memory (a rough sketch of the pinned-memory variant I am comparing against is at the end of the post). This is the example I have come up with:
#include <atomic>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <thread>
#include <vector>

#define gpuErrchk(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }
__host__ inline void gpuAssert(cudaError_t code, const char* file, int line,
                               bool abort = true) {
  if (code != cudaSuccess) {
    printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(EXIT_FAILURE);
  }
}

#define gpuErrchkInternal(ans, file, line) \
  { gpuAssert((ans), file, line); }
#define kernelStreamCheck(stream) \
  kernelStreamCheckFunc(stream, __FILE__, __LINE__)
__host__ inline void kernelStreamCheckFunc(cudaStream_t stream,
                                           const char* file, int line) {
  gpuErrchkInternal(cudaStreamSynchronize(stream), file, line);
  gpuErrchkInternal(cudaPeekAtLastError(), file, line);
}

// Kernel that busy-waits for roughly sleep_time clock cycles
__global__ void sleepKernel(size_t sleep_time) {
  int start = clock();
  int clock_offset = 0;
  while (clock_offset < sleep_time) {
    clock_offset = clock() - start;
  }
}

__host__ int main() {
  using ItemType = int;

  struct cudaDeviceProp prop;
  gpuErrchk(cudaGetDeviceProperties(&prop, 0));
  // Use a tenth of global memory worth of items
  size_t const num_items = prop.totalGlobalMem / (10 * sizeof(ItemType));
  std::vector<ItemType> vec(num_items, 0);

  // Divide indices into chunks
  uint32_t const num_chunks = 10;
  size_t const chunk_size = num_items / num_chunks;
  size_t const last_chunk_size = chunk_size + num_items % num_chunks;

  ItemType* d_items;
  gpuErrchk(cudaMalloc(&d_items, num_items * sizeof(ItemType)));
  ItemType* d_results;
  gpuErrchk(cudaMalloc(&d_results, num_items * sizeof(ItemType)));

  std::atomic<uint32_t> chunk_index(0);
  std::vector<ItemType> results(num_items);

  // Copy all chunks serially on a separate host thread
  std::thread t_HtoD(
      [=](std::atomic<uint32_t>& chunk_index) {
        for (uint32_t i = 0; i < num_chunks; ++i) {
          uint32_t const current_chunk_size =
              i == (num_chunks - 1) ? last_chunk_size : chunk_size;
          gpuErrchk(cudaMemcpy(
              d_items + chunk_size * i, vec.data() + chunk_size * i,
              current_chunk_size * sizeof(ItemType), cudaMemcpyHostToDevice));
          std::cout << "Chunk " << i << " of " << num_chunks << " copied"
                    << std::endl;
          chunk_index++;
        }
      },
      std::ref(chunk_index));

  // Launch a kernel for each chunk as soon as that chunk has been copied
  for (uint32_t i = 1; i <= num_chunks; ++i) {
    while (chunk_index < i) {
      std::this_thread::yield();
    }
    std::cout << "Chunk " << i << " of " << num_chunks << " started"
              << std::endl;
    std::chrono::high_resolution_clock::time_point start =
        std::chrono::high_resolution_clock::now();
    sleepKernel<<<1, 1>>>(10000000);
    kernelStreamCheck(0);
    std::chrono::high_resolution_clock::time_point end =
        std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> time_span =
        std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
    std::cout << "Chunk " << i << " of " << num_chunks << " slept"
              << " in " << time_span.count() << " seconds" << std::endl;
  }
  t_HtoD.join();

  // Copy results back to host
  gpuErrchk(cudaMemcpy(results.data(), d_results, num_items * sizeof(ItemType),
                       cudaMemcpyDeviceToHost));
  gpuErrchk(cudaFree(d_items));
  gpuErrchk(cudaFree(d_results));
  return 0;
}
This is compiled with nvcc -o example -arch=sm_75 --default-stream per-thread test.cu using CUDA 12.4, and run on an RTX 2080 Ti on Linux. When running by itself, I get output similar to this:
Chunk 0 of 10 copied
Chunk 1 of 10 started
Chunk 1 of 10 copied
Chunk 2 of 10 copied
Chunk 1 of 10 slept in 0.0680279 seconds
Chunk 2 of 10 started
Chunk 2 of 10 slept in 0.00734371 seconds
Chunk 3 of 10 started
Chunk 3 of 10 slept in 0.00734193 seconds
Chunk 3 of 10 copied
Chunk 4 of 10 started
Chunk 4 of 10 slept in 0.00734278 seconds
Chunk 4 of 10 copied
Chunk 5 of 10 started
Chunk 5 of 10 slept in 0.00734305 seconds
Chunk 5 of 10 copied
Chunk 6 of 10 started
Chunk 6 of 10 slept in 0.0054466 seconds
Chunk 6 of 10 copied
Chunk 7 of 10 started
Chunk 7 of 10 slept in 0.00544596 seconds
Chunk 7 of 10 copied
Chunk 8 of 10 started
Chunk 8 of 10 slept in 0.00544566 seconds
Chunk 8 of 10 copied
Chunk 9 of 10 started
Chunk 9 of 10 slept in 0.00544588 seconds
Chunk 9 of 10 copied
Chunk 10 of 10 started
Chunk 10 of 10 slept in 0.00543664 seconds
This is what I would expect. But when profiling with nvprof --export-profile "./test2.nvvp" -f --trace api,gpu ./example, the output tends to be much less interleaved, for example:
Chunk 0 of 10 copied
Chunk 1 of 10 started
Chunk 1 of 10 copied
Chunk 2 of 10 copied
Chunk 3 of 10 copied
Chunk 4 of 10 copied
Chunk 5 of 10 copied
Chunk 6 of 10 copied
Chunk 7 of 10 copied
Chunk 8 of 10 copied
Chunk 9 of 10 copied
Chunk 1 of 10 slept in 0.275931 seconds
Chunk 2 of 10 started
Chunk 2 of 10 slept in 0.00544324 seconds
Chunk 3 of 10 started
Chunk 3 of 10 slept in 0.00544005 seconds
Chunk 4 of 10 started
Chunk 4 of 10 slept in 0.00544078 seconds
Chunk 5 of 10 started
Chunk 5 of 10 slept in 0.00543852 seconds
Chunk 6 of 10 started
Chunk 6 of 10 slept in 0.00543788 seconds
Chunk 7 of 10 started
Chunk 7 of 10 slept in 0.00543785 seconds
Chunk 8 of 10 started
Chunk 8 of 10 slept in 0.00543824 seconds
Chunk 9 of 10 started
Chunk 9 of 10 slept in 0.00543945 seconds
Chunk 10 of 10 started
Chunk 10 of 10 slept in 0.00544015 seconds
This seems to defeat the purpose of profiling in this case, since from the profile alone I would have thought the program was not running correctly. Is this expected behaviour from nvprof?
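One cross-check I am considering, in case the host-side std::chrono numbers are simply distorted while the profiler is attached, is to time the kernel with CUDA events instead. Roughly like this (an untested sketch; all numbers shown above were taken with the std::chrono timing in the code):

// Sketch only: device-side timing of one launch with CUDA events.
cudaEvent_t ev_start, ev_stop;
gpuErrchk(cudaEventCreate(&ev_start));
gpuErrchk(cudaEventCreate(&ev_stop));

gpuErrchk(cudaEventRecord(ev_start));  // recorded into the (per-thread) default stream
sleepKernel<<<1, 1>>>(10000000);
gpuErrchk(cudaEventRecord(ev_stop));
gpuErrchk(cudaEventSynchronize(ev_stop));

float ms = 0.0f;
gpuErrchk(cudaEventElapsedTime(&ms, ev_start, ev_stop));  // elapsed GPU time in milliseconds
std::cout << "Kernel ran for " << ms << " ms on the device" << std::endl;

gpuErrchk(cudaEventDestroy(ev_start));
gpuErrchk(cudaEventDestroy(ev_stop));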
Thank you for the help!
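For reference, the pinned-memory variant mentioned at the top, which I am trying to avoid, would look roughly like the following untested sketch (h_pinned, copy_stream and compute_stream are hypothetical names, not part of the program above):

// Sketch only: "textbook" overlap with pinned host memory and cudaMemcpyAsync.
ItemType* h_pinned;
gpuErrchk(cudaMallocHost(&h_pinned, num_items * sizeof(ItemType)));  // pinned host buffer
for (size_t j = 0; j < num_items; ++j) h_pinned[j] = vec[j];

cudaStream_t copy_stream, compute_stream;
gpuErrchk(cudaStreamCreate(&copy_stream));
gpuErrchk(cudaStreamCreate(&compute_stream));

for (uint32_t i = 0; i < num_chunks; ++i) {
  size_t const current_chunk_size =
      i == (num_chunks - 1) ? last_chunk_size : chunk_size;
  // Asynchronous copy of chunk i; can overlap with kernels queued on compute_stream.
  gpuErrchk(cudaMemcpyAsync(d_items + chunk_size * i, h_pinned + chunk_size * i,
                            current_chunk_size * sizeof(ItemType),
                            cudaMemcpyHostToDevice, copy_stream));
  sleepKernel<<<1, 1, 0, compute_stream>>>(10000000);
}
gpuErrchk(cudaStreamSynchronize(copy_stream));
gpuErrchk(cudaStreamSynchronize(compute_stream));

gpuErrchk(cudaStreamDestroy(copy_stream));
gpuErrchk(cudaStreamDestroy(compute_stream));
gpuErrchk(cudaFreeHost(h_pinned));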