The amount of data copied is around 3 KiB. This is the minimal reproducible code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <algorithm>
#include <iostream>
#include <stdio.h>
#include <string>
#include <vector>
// Drop-in error check: synchronizes the device, then reports any pending
// CUDA error tagged with the call site's file and line. By default the
// process exits with the error code; pass abort=false to only log.
#define gpuErrchk() { gpuAssert(__FILE__, __LINE__); }
inline void gpuAssert(const char* file, int line, bool abort = true)
{
    // Synchronize first so errors from asynchronously launched kernels
    // are surfaced here rather than at some later, unrelated call.
    cudaDeviceSynchronize();
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        return;
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(status), file, line);
    if (abort)
        exit(status);
}
// Computes the dot product of in1 and in2 (each of length len) and writes
// the scalar result to out[0].
//
// Launch requirements:
//   - exactly ONE block (out[0] is written unconditionally by thread 0,
//     so multiple blocks would race);
//   - blockDim.x must be a power of two and at most 1024 (the static
//     shared-memory cache holds 1024 floats, and the tree reduction
//     halves the active count each step).
//
// Each thread first accumulates a strided partial sum over the input
// (stride = blockDim.x, so any len is handled), then the partials are
// folded in shared memory and thread 0 publishes the total.
__global__ void dotProduct_v1(float* out, const float* __restrict__ in1,
                              const float* __restrict__ in2, const int len)
{
    __shared__ float cache[1024];
    int i = threadIdx.x;
    int cacheIdx = threadIdx.x;

    // Per-thread partial sum over elements threadIdx.x, +blockDim.x, ...
    float tmp{};
    while (i < len)
    {
        tmp += in1[i] * in2[i];
        i += blockDim.x;
    }
    cache[cacheIdx] = tmp;
    __syncthreads();  // all partials must be in shared memory before reducing

    // Pairwise tree reduction; requires power-of-two blockDim.x.
    // __syncthreads() is outside the divergent branch so every thread
    // in the block reaches it.
    i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIdx < i)
            cache[cacheIdx] += cache[cacheIdx + i];
        __syncthreads();
        i /= 2;
    }

    if (cacheIdx == 0)
        out[0] = cache[0];
}
// Host-side driver: builds two length-824 vectors of ones, computes their
// dot product on the GPU with a single-block launch of dotProduct_v1, and
// prints the result (expected: 824).
void dotProductExample()
{
    const int inputLen = 1024 - 200;  // deliberately not a multiple of blockDim
    // Construct directly filled with 1.0f (float literal — avoids the
    // double->float conversion of std::fill with 1.0).
    std::vector<float> in1(inputLen, 1.0f), in2(inputLen, 1.0f);
    float innerProduct = 0.0f;  // initialized so a failed copy-back isn't UB to print

    float* d_in1 = nullptr;
    float* d_in2 = nullptr;
    float* d_out = nullptr;
    cudaMalloc(&d_in1, inputLen * sizeof(float));
    cudaMalloc(&d_in2, inputLen * sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    gpuErrchk();

    cudaMemcpy(d_in1, in1.data(), inputLen * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_in2, in2.data(), inputLen * sizeof(float), cudaMemcpyHostToDevice);
    // cudaMemset takes an int byte value; 0 zeroes a float correctly.
    // (The original passed 0.0, a double, silently truncated to int.)
    cudaMemset(d_out, 0, sizeof(float));

    dotProduct_v1<<<1, 1024>>>(d_out, d_in1, d_in2, inputLen);
    gpuErrchk();

    // Blocking cudaMemcpy also synchronizes with the kernel above.
    cudaMemcpy(&innerProduct, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "dotProduct result: " << innerProduct << std::endl;

    // BUG FIX: the original leaked all three device allocations.
    cudaFree(d_in1);
    cudaFree(d_in2);
    cudaFree(d_out);
}
// Entry point: binds to device 0, runs the dot-product example, then
// resets the device so profilers capture complete traces.
int main()
{
    // Choose which GPU to run on; change the index on a multi-GPU system.
    if (cudaSetDevice(0) != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        return 1;
    }

    dotProductExample();

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    if (cudaDeviceReset() != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}
That makes sense. I increased the input length to 2 MiB and see these in the timeline (I had to increase it to at least 500 KiB to see some overlap between the API call and the actual execution on the GPU):
I’ll file a bug and request a more detailed explanation of cudaMemcpy behavior in the docs. Thank you.