Hello CUDA Performance Forum,

first of all I would like to apologize for every mistake I make, since English is not my native language. Second of all, the code below might be a little bit confusing. I was responsible to implement MPI and CUDA for our university project and recently switched to running some benchmark tests(The .cu file gets compiled apart and is linked to the C++ compiler who is compiling a .cpp file with the main routine)

Well, I am currently comparing the memory usage options as know as unified memory and zero copy running on the same kernel. The kernel at hand is a matrix vector multiplication with the matrix being stored in the ELLPack storage format. Since the matrix consists of 7 diagonals the data and indices array have a dimension of dim*7. I get the following run time results:

zero copy vs unified memory run time

On the left side we see the kernel run time where the unified memory kernel performance x3 better than the zero copy kernel. I hit a peak bandwidth of close to 12GB/s out of the possible 14.9GB/s using unified memory. Overall both methods don’t fall apart very much.

Now, please compare my results to those graphs (page 53 and page 54) found in a presentation hold at the GPU Technology Conference 2014. Here we see the kernel using zero copy score better or equal than the kernel using unified memory which is a discrepancy to my results. Overall the zero copy method has a slight offset compared to other options but that does not seem to significant and is somewhat the same result i got.

So (finally) here are my questions:

- Why is the zero copy option slower than the unified memory method in my example?
- Why is the zero copy option better or equal in the Tech.Conference example?
- Why is there a discrepancy in the two above?
- Do I do time everything right or is there a mistake in my code?

As stated above, all CUDA functions are in a different .cu file than the main routine. In between the following allocation and kernel call functions, data is copied from a CPU computed sample matrix. I do multiple runs to guaranty I run long enough for smaller dimensions. Standard is 1000 iterations.

**Kernel:**

```
template<typename type>
__global__ void gpu_ax(type* data, type* fvec, type* result, int* indices, int max_row_length, int dim_local)
{
int idx = threadIdx.x+blockDim.x*blockIdx.x;
if(idx<dim_local)
{
int col;
type svalue = 0, value;
for(int i = 0;i < max_row_length; i++)
{
value = data[i*dim_local+idx];
col = indices[i*dim_local+idx];
svalue += value*fvec[col];
}
result[idx]=svalue;
}
}
```

Unified Memory:

Allocation:

```
template<typename Scalar>
void alloc_unified(Scalar **data, Scalar **fvec, Scalar **result, int **indices, int max_row_length, int dim_local,int dim_fvec)
{
cudaMallocManaged((void **)data, sizeof(Scalar)*dim_local*max_row_length);
cudaMallocManaged((void **)fvec, sizeof(Scalar)*dim_fvec);
cudaMallocManaged((void **)result, sizeof(Scalar)*dim_local);
cudaMallocManaged((void **)indices, sizeof(int)*dim_local*max_row_length);
}
template void alloc_unified<int>(int **data, int **fvec, int **result, int **indices, int max_row_length, int dim_local, int dim_fvec);
template void alloc_unified<float>(float **data, float **fvec, float **result, int **indices, int max_row_length, int dim_local, int dim_fvec);
template void alloc_unified<double>(double **data, double **fvec, double **result, int **indices, int max_row_length, int dim_local, int dim_fvec);
```

Kernel setup and call:

```
template<typename Scalar>
float mult_vec_unified_time(Scalar *data, Scalar *fvec, Scalar *result, int *indices, int max_row_length, int dim_local, int dim_fvec, int runs)
{
cudaEvent_t start_unified, stop_unified;
cudaEventCreate(&start_unified);
cudaEventCreate(&stop_unified);
int num_blocks = ceil((double)dim_local/1024);
int num_threads = ceil(((double)dim_local/num_blocks)/32)*32;
cudaEventRecord(start_unified);
for (int i = 0; i < runs; i++)
{
gpu_ax<<<num_blocks,num_threads>>>(data,fvec,result,indices,max_row_length, dim_local);
}
cudaEventRecord(stop_unified);
cudaEventSynchronize(stop_unified);
float elapsedTime_unified = 0.0;
cudaEventElapsedTime(&elapsedTime_unified, start_unified, stop_unified);
cudaEventDestroy(start_unified);
cudaEventDestroy(stop_unified);
cudaDeviceSynchronize();
return (elapsedTime_unified / (float)runs);
}
template float mult_vec_unified_time<int>(int* data, int* fvec, int* result, int* indices, int max_row_length, int dim_local,int dim_fvec, int runs);
template float mult_vec_unified_time<float>(float* data, float* fvec, float* result, int* indices, int max_row_length, int dim_local, int dim_fvec, int runs);
template float mult_vec_unified_time<double>(double* data, double* fvec, double* restult, int* indices, int max_row_length, int dim_local, int dim_fvec, int runs);
```

**Zero Copy:**

Allocation:

```
template<typename Scalar>
void alloc_zero(Scalar **data, Scalar **fvec, Scalar **result, int ** indices, int max_row_length, int dim_local, int dim_fvec)
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop,0);
if(prop.canMapHostMemory)
{
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaHostAlloc((void **)data, sizeof(Scalar)*max_row_length*dim_local, cudaHostAllocMapped);
cudaHostAlloc((void **)fvec, sizeof(Scalar)*dim_fvec, cudaHostAllocMapped);
cudaHostAlloc((void **)result, sizeof(Scalar)*dim_local, cudaHostAllocMapped);
cudaHostAlloc((void **)indices, sizeof(int)*max_row_length*dim_local, cudaHostAllocMapped);
}
}
template void alloc_zero<int>(int **data, int **fvec, int **result, int **indices, int max_row_length, int dim_local, int dim_fvec);
template void alloc_zero<float>(float **data, float **fvec, float **result, int **indices, int max_row_length, int dim_local, int dim_fvec);
template void alloc_zero<double>(double **data, double **fvec, double **result, int **indices, int max_row_length, int dim_local, int dim_fvec);
```

Kernel setup and call:

```
template<typename Scalar>
float mult_vec_zero_time(Scalar *data, Scalar *fvec, Scalar *result, int *indices, int max_row_length, int dim_local, int dim_fvec, int runs)
{
cudaEvent_t start_zero, stop_zero;
cudaEventCreate(&start_zero);
cudaEventCreate(&stop_zero);
Scalar *d_data, *d_fvec, *d_result;
int *d_indices;
cudaHostGetDevicePointer((void **)&d_data,(void *)data, 0);
cudaHostGetDevicePointer((void **)&d_fvec, (void *)fvec, 0);
cudaHostGetDevicePointer((void **)&d_result, (void *)result, 0);
cudaHostGetDevicePointer((void **)&d_indices, (void *)indices, 0);
int num_blocks = ceil((double)dim_local/1024);
int num_threads = ceil(((double)dim_local/num_blocks)/32)*32;
cudaEventRecord(start_zero);
for (int i=0;i<runs;i++)
{
gpu_ax<<<num_blocks,num_threads>>>(d_data, d_fvec, d_result, d_indices, max_row_length, dim_local);
}
cudaEventRecord(stop_zero);
cudaEventSynchronize(stop_zero);
float elapsedTime_zero = 0.0;
cudaEventElapsedTime(&elapsedTime_zero, start_zero, stop_zero);
cudaEventDestroy(start_zero);
cudaEventDestroy(stop_zero);
cudaDeviceSynchronize();
cleanup(d_data, d_fvec, d_result, d_indices, 0);
return (elapsedTime_zero /(float)runs);
}
template float mult_vec_zero_time<int>(int* data, int* fvec, int* result, int* indices, int max_row_length, int dim_local, int dim_fvec, int runs);
template float mult_vec_zero_time<float>(float* data, float* fvec, float* result, int* indices, int max_row_length, int dim_local, int dim_fvec, int runs);
template float mult_vec_zero_time<double>(double* data, double* fvec, double* restult, int* indices, int max_row_length, int dim_local, int dim_fvec, int runs);
```

I am looking forward to any discussion and answer and would like to thank you in advance!