If you’d like help, I suggest providing more information.
When people ask for help without providing a complete, compilable test case that I can use directly, without having to add or change anything, it immediately becomes more difficult for me to help. Oftentimes I simply don't have the time to invest in such inquiries, and I will certainly prioritize other questions above those that are lacking important information.
I can't run your code. Moreover, you seem to be asking about benchmark performance, and the only description of that benchmark I can find in your post is this:
benchmark::DoNotOptimize(allocated_ptrs[count * 2]);
benchmark::DoNotOptimize(allocated_ptrs[count * 2 + 1]);
If that is supposed to be descriptive or informative, in my view it is not.
Additional useful information would be the CUDA version, the operating system, how you measured performance (host-based timing, profiler, etc.), and what the actual performance data was. For some folks, such a description might mean a 25% reduction in performance; for others, a 10x reduction.
Quite simply, if you want help, make it easy for others to help you.
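For example, something shaped like the following is the kind of thing that is easy to act on, because it compiles and runs as posted. This is only a sketch, assuming your code uses Google Benchmark (which the DoNotOptimize calls suggest); BM_PoolAlloc and the allocation inside the loop are hypothetical placeholders for whatever you are actually measuring:

#include <benchmark/benchmark.h>
#include <cuda_runtime.h>

// hypothetical benchmark body: the pool allocate/free calls under test
// would go inside the timed loop
static void BM_PoolAlloc(benchmark::State& state) {
  for (auto _ : state) {
    void* p = nullptr;
    cudaMalloc(&p, 1 << 20);       // stand-in for the allocation under test
    benchmark::DoNotOptimize(p);   // prevent the compiler from eliding p
    cudaFree(p);
  }
}
BENCHMARK(BM_PoolAlloc);
BENCHMARK_MAIN();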
I went ahead and wrote my own test case, attempting to follow as best I could what you have provided. Obviously I have no idea what your benchmark code is. Nevertheless, the baseline performance of the pool allocate/free mechanism plus a relatively simple memory test seems to be unaffected by kMemoryPoolSizeUnit. I have not done any sort of exhaustive testing; this is just one data point:
$ cat t1958.cu
#include <cstdint>
#include <iostream>
#include <cstdlib>
#include <cstring>      // for memset
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
// host-based timing helper: microseconds elapsed since `start`
unsigned long long dtime_usec(unsigned long long start=0){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
// simple memory test: each thread copies one int from the upper half
// of the allocation to the lower half
__global__ void copy_kernel(int *data, uint64_t sz){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < sz) data[idx] = data[idx+sz];
}
typedef unsigned char T;
// time one launch of the memory test kernel on the given allocation
uint64_t test(void *ptr){
  uint64_t t = dtime_usec(0);
  const int blocks = 160;
  const int threads = 1024;
  copy_kernel<<<blocks,threads>>>((int *)ptr, blocks*threads);
  cudaDeviceSynchronize();
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {std::cout << "err 100: " << cudaGetErrorString(err) << std::endl; return 0;}
  t = dtime_usec(t);
  return t;
}
int main(int argc, char *argv[]){
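  // query driver version and device attributes to confirm stream-ordered
  // memory pool (and IPC handle) support before proceeding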
  bool supported = false;
  int device = 0;
  int driverVersion = 0;
  int deviceSupportsMemoryPools = 0;
  int poolSupportedHandleTypes = 0;
  cudaDriverGetVersion(&driverVersion);
  if (driverVersion >= 11020) {
    cudaDeviceGetAttribute(&deviceSupportsMemoryPools,
                           cudaDevAttrMemoryPoolsSupported, device);
  }
  if (deviceSupportsMemoryPools != 0) {
    // `device` supports the Stream Ordered Memory Allocator
    supported = true;
    std::cout << "Memory Pools supported!" << std::endl;
  }
  if (driverVersion >= 11030) {
    cudaDeviceGetAttribute(&poolSupportedHandleTypes,
                           cudaDevAttrMemoryPoolSupportedHandleTypes, device);
  }
  if (poolSupportedHandleTypes & cudaMemHandleTypePosixFileDescriptor) {
    // Pools on the specified device can be created with posix file descriptor-based IPC
    std::cout << "including IPC!" << std::endl;
  }
  if (!supported) return 0;
  int loops = 2;
  int incr = 0;
  if (argc > 1) loops = atoi(argv[1]);
  if (argc > 2) incr = atoi(argv[2]);
  const int64_t data_length_0 = 1920 * 1080 * 3;
  const int64_t data_length_1 = 2560 * 1920 * 3;
  const int64_t num_allocations = 32;
  const int64_t allocation_size_in_bytes_0 = data_length_0 * sizeof(T);
  const int64_t allocation_size_in_bytes_1 = data_length_1 * sizeof(T);
  cudaStream_t stream;
  cudaStreamCreate(&stream);
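  // create an explicit memory pool on device 0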
  cudaMemPoolProps pool_props;
  memset(&pool_props, 0, sizeof(pool_props));
  pool_props.allocType = cudaMemAllocationTypePinned;
  pool_props.location.id = 0;
  pool_props.location.type = cudaMemLocationTypeDevice;
  cudaMemPool_t mem_pool;
  cudaError_t err = cudaMemPoolCreate(&mem_pool, &pool_props);
  if (err != cudaSuccess) {std::cout << "err 1: " << cudaGetErrorString(err) << std::endl; return 0;}
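  // round the expected total footprint up to a multiple of
  // kMemoryPoolSizeUnit before using it as the release threshold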
  int64_t kMemoryPoolSizeUnit = (1ul << 25)+incr; // 32 MiB, undocumented???
  uint64_t memory_pool_size =
      ((allocation_size_in_bytes_0 + allocation_size_in_bytes_1) * num_allocations +
       kMemoryPoolSizeUnit - 1) /
      kMemoryPoolSizeUnit * kMemoryPoolSizeUnit;
  // uint64_t memory_pool_size = (allocation_size_in_bytes_0 + allocation_size_in_bytes_1) * num_allocations; // much slower!!
  err = cudaMemPoolSetAttribute(mem_pool, cudaMemPoolAttrReleaseThreshold, &memory_pool_size);
  if (err != cudaSuccess) {std::cout << "err 2: " << cudaGetErrorString(err) << std::endl; return 0;}
  err = cudaStreamSynchronize(stream);
  if (err != cudaSuccess) {std::cout << "err 3: " << cudaGetErrorString(err) << std::endl; return 0;}
  void* allocated_ptrs[num_allocations * 2];
  uint64_t time_sum = 0;
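  // each outer loop iteration: allocate 2*num_allocations buffers from the
  // pool, run the memory test on each, then free everything in stream order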
  for (int qq = 0; qq < loops; qq++) {
    for (int count = 0; count < num_allocations; count++) {
      err = cudaMallocFromPoolAsync(&allocated_ptrs[count * 2],
                                    allocation_size_in_bytes_0,
                                    mem_pool,
                                    stream);
      if (err != cudaSuccess) {std::cout << "err 4: " << cudaGetErrorString(err) << std::endl; return 0;}
      err = cudaMallocFromPoolAsync(&allocated_ptrs[count * 2 + 1],
                                    allocation_size_in_bytes_1,
                                    mem_pool,
                                    stream);
      if (err != cudaSuccess) {std::cout << "err 5: " << cudaGetErrorString(err) << std::endl; return 0;}
      time_sum += test(allocated_ptrs[count * 2]);
      time_sum += test(allocated_ptrs[count * 2 + 1]);
    }
    err = cudaStreamSynchronize(stream);
    if (err != cudaSuccess) {std::cout << "err 6: " << cudaGetErrorString(err) << std::endl; return 0;}
    for (int count = 0; count < num_allocations * 2; count++) {
      err = cudaFreeAsync(allocated_ptrs[count], stream);
      if (err != cudaSuccess) {std::cout << "err 7: " << cudaGetErrorString(err) << std::endl; return 0;}
    }
    err = cudaStreamSynchronize(stream);
    if (err != cudaSuccess) {std::cout << "err 8: " << cudaGetErrorString(err) << std::endl; return 0;}
  }
  err = cudaMemPoolDestroy(mem_pool);
  if (err != cudaSuccess) {std::cout << "err 9: " << cudaGetErrorString(err) << std::endl; return 0;}
  std::cout << "elapsed time: " << time_sum << "us" << std::endl;
}
$ nvcc -o t1958 t1958.cu
$ ./t1958 10
Memory Pools supported!
including IPC!
elapsed time: 8786us
$ ./t1958 10 1024
Memory Pools supported!
including IPC!
elapsed time: 8373us
$ ./t1958 100
Memory Pools supported!
including IPC!
elapsed time: 79157us
$ ./t1958 100 1024
Memory Pools supported!
including IPC!
elapsed time: 77558us
$
Tesla V100, CentOS 7, CUDA 11.4, 470.57.02
The above data simply indicates the performance of the memory test. I observed the overall application performance as follows:
$ time ./t1958 10000
Memory Pools supported!
including IPC!
elapsed time: 6850860us
real 0m8.507s
user 0m6.916s
sys 0m1.586s
$ time ./t1958 10000 1024
Memory Pools supported!
including IPC!
elapsed time: 6817939us
real 0m8.517s
user 0m6.902s
sys 0m1.609s
$
The overall measured application wall-clock time showed no significant difference between the case where kMemoryPoolSizeUnit was 32 MiB and the case where it was 32 MiB + 1024. Therefore I conclude that there was no significant difference in the aggregate time for the allocation/free mechanism.
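If you wanted to probe further, you could also check whether the rounding changes the pool's actual footprint. A sketch, which could be dropped into my test case above just before the cudaMemPoolDestroy call:

  // query how much memory the pool has actually reserved, e.g. to check
  // whether rounding the release threshold changes the pool's footprint
  uint64_t reserved = 0;
  err = cudaMemPoolGetAttribute(mem_pool, cudaMemPoolAttrReservedMemCurrent, &reserved);
  if (err != cudaSuccess) {std::cout << "err 10: " << cudaGetErrorString(err) << std::endl; return 0;}
  std::cout << "pool reserved: " << reserved << " bytes" << std::endl;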