I’m trying to figure out why a number of cudaMemcpyHostToDevice and cudaMemcpyDeviceToHost calls are significantly slower than the rest of my transfers. I have a program that copies a matrix to the gpu, finds the inverse of each element, and then copies it back which then repeats this process on the cpu. It does this 1000 times, printing out the time taken for each matrix. For the most part, all of the memory transfers max out my 970M’s bandwidth except for around 10 percent of them.
Here’s the code
#include <iostream>
#include <numeric>
#include <stdlib.h>
static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
/**
* CUDA kernel that computes reciprocal values for a given vector
*/
__global__ void reciprocalKernel(float *data, unsigned vectorSize) {
unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < vectorSize)
data[idx] = 1.0/data[idx];
}
/**
* Host function that copies the data and launches the work on GPU
*/
float *gpuReciprocal(float *data, unsigned size)
{
float *rc = new float;
float *gpuData;
CUDA_CHECK_RETURN(cudaMalloc((void **)&gpuData, sizeof(float)*size));
CUDA_CHECK_RETURN(cudaMemcpy(gpuData, data, sizeof(float)*size, cudaMemcpyHostToDevice));
static const int BLOCK_SIZE = 256;
const int blockCount = (size+BLOCK_SIZE-1)/BLOCK_SIZE;
reciprocalKernel<<<blockCount, BLOCK_SIZE>>> (gpuData, size);
CUDA_CHECK_RETURN(cudaMemcpy(rc, gpuData, sizeof(float)*size, cudaMemcpyDeviceToHost));
CUDA_CHECK_RETURN(cudaFree(gpuData));
return rc;
}
float *cpuReciprocal(float *data, unsigned size)
{
float *rc = new float;
for (unsigned cnt = 0; cnt < size; ++cnt) rc[cnt] = 1.0/data[cnt];
return rc;
}
void initialize(float *data, unsigned size)
{
for (unsigned i = 0; i < size; ++i)
data[i] = .5*(i+1);
}
int main(void)
{
static const int WORK_SIZE = 65530;
float *data = new float[WORK_SIZE];
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
for (int i = 0; i < 1000; i++)
{
cudaEventRecord(start, 0);
initialize (data, WORK_SIZE);
float *recCpu = cpuReciprocal(data, WORK_SIZE);
float *recGpu = gpuReciprocal(data, WORK_SIZE);
float cpuSum = std::accumulate (recCpu, recCpu+WORK_SIZE, 0.0);
float gpuSum = std::accumulate (recGpu, recGpu+WORK_SIZE, 0.0);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
/* Verify the results */
std::cout << i << ") gpuSum = "<<gpuSum<< " cpuSum = " <<cpuSum<< " time = " <<time<< std::endl;
delete[] recCpu;
delete[] recGpu;
}
/* Free memory */
delete[] data;
return 0;
}
/**
* Check the return value of the CUDA runtime API call and exit
* the application if the call has failed.
*/
static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
{
if (err == cudaSuccess)
return;
std::cerr << statement<<" returned " << cudaGetErrorString(err) << "("<<err<< ") at "<<file<<":"<<line << std::endl;
exit (1);
}
Has anyone experienced this before? I’ve uploaded some screenshots (Sli Mag) of the data I’ve collected. The first two graphs show data from a 970, then a 960 and then a 745. What’s interesting is this curve is apparent on all the 900 series GPU’s I’ve tested, but not in the 745.