When I benchmark the write speed for 1,000,000 objects, it seems very slow; a million writes takes ~2 ms with the following kernel:
__global__ void write_test( int* edges, int size)
{
    int thid = blockIdx.x * blockDim.x + threadIdx.x;
    if (thid >= size){ return; }
    else { edges[thid] = 23; }
}

void some_func()
{
    int size = 1000000;
    int* edges;
    cudaMalloc((void **)&edges, (size)*sizeof(int) );
    // 1024 blocks x 1024 threads = 1,048,576 threads, enough to cover all 1,000,000 elements
    write_test<<<dim3(1024), dim3(1024)>>>( edges, size );
}
In contrast, timing a torch.nn.Conv2d with similar numbers of inputs takes less than 0.5ms:
import time
import torch

def test_conv(self):
    device = torch.cuda.current_device()
    mod = torch.nn.Conv2d(32, 32, 3).cuda().eval()
    data = torch.randint(0, 255, [40, 32, 32, 32], dtype=torch.float, device=device)
    torch.cuda.synchronize(device)
    tic = time.time()
    out = mod(data)
    torch.cuda.synchronize(device)
    print(time.time() - tic)
That timing includes however many multiplications the convolution performs, plus writing all of its outputs, yet it is still several times faster than my plain write kernel.
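For a rough sense of scale (my arithmetic, using Conv2d's default stride of 1 and no padding), that layer produces a [40, 32, 30, 30] output, i.e. roughly 1.15 million output values and about 0.33 billion multiply-accumulates:

#include <stdio.h>

int main (void)
{
    /* Conv2d(32, 32, 3) on a [40, 32, 32, 32] input, stride 1, no padding */
    long long outputs = 40LL * 32 * 30 * 30;    /* output elements: 1,152,000        */
    long long macs    = outputs * 32 * 3 * 3;   /* multiply-accumulates: 331,776,000 */
    printf ("outputs = %lld, MACs = %lld\n", outputs, macs);
    return 0;
}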
Is there some way for a kernel to write to global memory on the GPU faster?
1,000,000 objects * 4 bytes/object is ~4MB
4MB in 2ms is ~2GB/s
What GPU are you running this on?
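Spelling out that arithmetic as a quick sanity check (the ~2 ms figure comes from the question; the theoretical number in the comment is only an illustrative placeholder, since it depends on the specific GPU):

#include <stdio.h>

int main (void)
{
    const double bytes   = 1000000.0 * 4.0;  /* 1,000,000 ints * 4 bytes each  */
    const double seconds = 2.0e-3;           /* ~2 ms reported in the question */
    printf ("effective throughput = %.2f GB/sec\n", 1.0e-9 * bytes / seconds);
    /* Compare against theoretical bandwidth = data rate * bus width,          */
    /* e.g. a hypothetical 8 GT/s part with a 256-bit (32-byte) bus: 256 GB/s  */
    return 0;
}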
njuffa (November 17, 2019):
Robert Crovella already asked the question I wanted to ask. Below is a version of your program quickly reworked to follow good practices. With this, I see the following on my Quadro P2000 (theoretical throughput is 140 GB/sec at a memory clock of 1750 MHz):
write_test: data = 4.000e+006 bytes mintime = 0.091 msec
write_test: throughput = 44.01 GB/sec
write_test: data = 4.000e+007 bytes mintime = 0.387 msec
write_test: throughput = 103.42 GB/sec
write_test: data = 4.000e+008 bytes mintime = 3.204 msec
write_test: throughput = 124.84 GB/sec
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define THREADS_PER_BLOCK (256)
#define ITERATIONS        (10)

__global__ void write_test( int* edges, int size)
{
    int thid = blockIdx.x * blockDim.x + threadIdx.x;
    if (thid >= size){ return; }
    else { edges[thid] = 23; }
}

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// A routine to give access to a high precision timer on most systems.
#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
double second (void)
{
    LARGE_INTEGER t;
    static double oofreq;
    static int checkedForHighResTimer;
    static BOOL hasHighResTimer;

    if (!checkedForHighResTimer) {
        hasHighResTimer = QueryPerformanceFrequency (&t);
        oofreq = 1.0 / (double)t.QuadPart;
        checkedForHighResTimer = 1;
    }
    if (hasHighResTimer) {
        QueryPerformanceCounter (&t);
        return (double)t.QuadPart * oofreq;
    } else {
        return (double)GetTickCount() * 1.0e-3;
    }
}
#elif defined(__linux__) || defined(__APPLE__)
#include <stddef.h>
#include <sys/time.h>
double second (void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1.0e-6;
}
#else
#error unsupported platform
#endif

int main (void)
{
    double start, stop, elapsed, mintime;
    const double INFTY = exp (1000.0);
    const int size = 100000000;
    int* edges = 0;

    CUDA_SAFE_CALL (cudaMalloc ((void **)&edges, sizeof (edges[0]) * size));

    /* Compute execution configuration */
    dim3 dimBlock (THREADS_PER_BLOCK);
    int threadBlocks = (size + (dimBlock.x - 1)) / dimBlock.x;
    dim3 dimGrid (threadBlocks);

    mintime = INFTY;
    cudaDeviceSynchronize();
    for (int k = 0; k < ITERATIONS; k++) {
        start = second();
        write_test<<<dimGrid, dimBlock>>> (edges, size);
        CHECK_LAUNCH_ERROR();
        stop = second();
        elapsed = stop - start;
        if (elapsed < mintime) mintime = elapsed;
    }
    printf ("write_test: data = %.3e bytes mintime = %.3f msec\n",
            (double)sizeof (edges[0]) * size, 1.0e3 * mintime);
    printf ("write_test: throughput = %.2f GB/sec\n",
            (1.0e-9 * sizeof (edges[0]) * size) / mintime);
    CUDA_SAFE_CALL (cudaFree (edges));
    cudaDeviceSynchronize();
    return EXIT_SUCCESS;
}
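One optional variant sometimes tried for a pure fill like this (a hedged sketch, not part of the program above) is to issue 128-bit stores through int4. It assumes size is a multiple of 4; pointers returned by cudaMalloc are sufficiently aligned:

// Hypothetical variant: each thread writes one int4 (four ints, 16 bytes).
__global__ void write_test_vec4 (int4* edges4, int size4)
{
    int thid = blockIdx.x * blockDim.x + threadIdx.x;
    if (thid < size4) {
        edges4[thid] = make_int4 (23, 23, 23, 23);
    }
}

// Launch sketch: size4 = size / 4 elements of int4.
//   int size4 = size / 4;
//   int blocks = (size4 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
//   write_test_vec4<<<blocks, THREADS_PER_BLOCK>>> ((int4*)edges, size4);

For large sizes the plain kernel above is already close to the theoretical peak, so any gain from this is likely to show up mainly at the smaller sizes, if at all.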
Having known-good best-practices code is extremely helpful; many thanks!
After testing on my GTX 1050 ti Max-Q and my GTX 1080 with your code, I can be pretty sure my bottlenecks are elsewhere in my application.
I’m writing a CUDA extension for Python, and there is most likely quite a bit of overhead in whatever mechanisms PyTorch and PyBind employ to switch contexts, support object type conversions, etc. Lacking a good C++ timing mechanism, I was timing from my Python code and probably capturing a bunch of other time-intensive work at that interface (see the CUDA-event timing sketch at the end of this post).
It may also be that my CUDA code elsewhere, with other read/write/compute patterns, is poorly optimized, but with better timings (and this sanity check) I can figure that out.
As such:
GTX 1080 (320.3 GB/s theoretical memory throughput):
write_test: data = 4.000e+06 bytes mintime = 0.019 msec
write_test: throughput = 205.59 GB/sec
write_test: data = 4.000e+07 bytes mintime = 0.157 msec
write_test: throughput = 254.48 GB/sec
write_test: data = 4.000e+08 bytes mintime = 1.540 msec
write_test: throughput = 259.77 GB/sec
GTX 1050 ti Max-Q (112.1 GB/s theoretical memory throughput):
write_test: data = 4.000e+06 bytes mintime = 0.043 msec
write_test: throughput = 92.46 GB/sec
write_test: data = 4.000e+07 bytes mintime = 0.394 msec
write_test: throughput = 101.59 GB/sec
write_test: data = 4.000e+08 bytes mintime = 3.895 msec
write_test: throughput = 102.69 GB/sec
which looks reasonable given the theoretical bandwidths and your runs on the P2000.