I am taking the paid course and learning about prefetch for vector-add.cu
The output is like
/*
1 prefetch
CUDA Memory Operation Statistics (by time):
Time(%) Total Time (ns) Operations Average Minimum Maximum Operation
------- --------------- ---------- ------- ------- ------- ---------------------------------
100.0 21203394 768 27608.6 1599 160828 [CUDA Unified Memory memcpy DtoH]
CUDA Kernel Statistics:
Time(%) Total Time (ns) Instances Average Minimum Maximum Name
------- --------------- --------- ---------- ------- -------- -------------------------------------------
96.0 40732060 3 13577353.3 619599 20614807 initWith(float, float*, int)
4.0 1710960 1 1710960.0 1710960 1710960 addVectorsInto(float*, float*, float*, int)
2 prefetches
CUDA Kernel Statistics:
Time(%) Total Time (ns) Instances Average Minimum Maximum Name
------- --------------- --------- --------- ------- -------- -------------------------------------------
92.9 22514595 3 7504865.0 620718 21269383 initWith(float, float*, int)
7.1 1712015 1 1712015.0 1712015 1712015 addVectorsInto(float*, float*, float*, int)
40732060
CUDA Memory Operation Statistics (by time):
Time(%) Total Time (ns) Operations Average Minimum Maximum Operation
------- --------------- ---------- ------- ------- ------- ---------------------------------
100.0 21200406 768 27604.7 1599 160123 [CUDA Unified Memory memcpy DtoH]
21203394
3 prefetches
CUDA Kernel Statistics:
Time(%) Total Time (ns) Instances Average Minimum Maximum Name
------- --------------- --------- --------- ------- ------- -------------------------------------------
52.2 1867563 3 622521.0 618639 624558 initWith(float, float*, int)
47.8 1711696 1 1711696.0 1711696 1711696 addVectorsInto(float*, float*, float*, int)
40732060
CUDA Memory Operation Statistics (by time):
Time(%) Total Time (ns) Operations Average Minimum Maximum Operation
------- --------------- ---------- ------- ------- ------- ---------------------------------
100.0 21201498 768 27606.1 1599 160028 [CUDA Unified Memory memcpy DtoH]
prefetch back to cpu
CUDA Kernel Statistics:
Time(%) Total Time (ns) Instances Average Minimum Maximum Name
------- --------------- --------- --------- ------- ------- -------------------------------------------
52.3 1871467 3 623822.3 620335 627342 initWith(float, float*, int)
47.7 1706448 1 1706448.0 1706448 1706448 addVectorsInto(float*, float*, float*, int)
CUDA Memory Operation Statistics (by time):
Time(%) Total Time (ns) Operations Average Minimum Maximum Operation
------- --------------- ---------- ------- ------- ------- ---------------------------------
21201498
100.0 21007255 592 35485.2 1599 319735 [CUDA Unified Memory memcpy DtoH]
*/
cudaMallocManaged() allocates in Unified Memory, so the pages may not initially be resident on either the CPU or the GPU. Since in the code I am initializing the cudaMallocManaged()'ed arrays in a GPU kernel, should the output above say something about that? Why does it show only "device to host"? I am also prefetching back to the CPU for the host function that confirms that the vector-add result array is correct.
The code
#include <stdio.h>
#include <stdlib.h>
// Kernel: fill a[0..N) with the constant value num.
// Grid-stride loop: any <<<blocks, threads>>> configuration covers all N
// elements, so the launch size only affects performance, not correctness.
__global__
void initWith(float num, float *a, int N)
{
    const int start = blockIdx.x * blockDim.x + threadIdx.x;
    const int step  = gridDim.x * blockDim.x;
    for (int i = start; i < N; i += step)
        a[i] = num;
}
// Kernel: element-wise vector add, result[i] = a[i] + b[i] for i in [0, N).
// Grid-stride loop, so the kernel is correct for any launch configuration.
__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
    const int start = blockIdx.x * blockDim.x + threadIdx.x;
    const int step  = gridDim.x * blockDim.x;
    for (int i = start; i < N; i += step)
        result[i] = a[i] + b[i];
}
// Host-side verification: confirms every element of vector equals target.
// Prints a failure message and exits with status 1 on the first mismatch;
// otherwise prints a success message. Caller must ensure the data is
// accessible from the host (synchronized / migrated) before calling.
void checkElementsAre(float target, float *vector, int N)
{
    int i;
    for (i = 0; i < N; ++i)
    {
        if (vector[i] == target)
            continue;
        printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
        exit(1);
    }
    printf("Success! All values calculated correctly.\n");
}
// Vector add with Unified Memory and explicit prefetching:
//  1. allocate a, b, c with cudaMallocManaged,
//  2. prefetch all three to the GPU before the init kernels touch them
//     (avoids on-demand page migration during the kernels),
//  3. init + add on the device,
//  4. prefetch c back to the CPU for host-side verification.
int main()
{
    int deviceId;
    int numberOfSMs;
    cudaGetDevice(&deviceId);
    cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
    printf("num SMs = %d\n", numberOfSMs);

    const int N = 2<<24;              // 2^25 elements
    size_t size = N * sizeof(float);  // per-array size in BYTES

    float *a;
    float *b;
    float *c;
    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);

    // Prefetch to the device so the init kernels do not page-fault.
    cudaMemPrefetchAsync(a, size, deviceId);
    cudaMemPrefetchAsync(b, size, deviceId);
    cudaMemPrefetchAsync(c, size, deviceId);

    size_t threadsPerBlock = 256;
    size_t numberOfBlocks = 32 * numberOfSMs;

    cudaError_t addVectorsErr;
    cudaError_t asyncErr;

    initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
    initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
    initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);

    addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

    // Launch-configuration errors surface here...
    addVectorsErr = cudaGetLastError();
    if(addVectorsErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addVectorsErr));

    // ...and asynchronous execution errors surface at the sync.
    asyncErr = cudaDeviceSynchronize();
    if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));

    // BUG FIX: cudaMemPrefetchAsync takes a BYTE count, not an element count.
    // The original passed N (elements), prefetching only size/4 bytes of c —
    // likely why the last profile shows fewer (592 vs 768) migration ops.
    cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);
    // Ensure the async prefetch has finished before the host reads c.
    asyncErr = cudaDeviceSynchronize();
    if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));

    checkElementsAre(7, c, N);

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}
/*
1 prefetch
CUDA Memory Operation Statistics (by time):
Time(%) Total Time (ns) Operations Average Minimum Maximum Operation
------- --------------- ---------- ------- ------- ------- ---------------------------------
100.0 21203394 768 27608.6 1599 160828 [CUDA Unified Memory memcpy DtoH]
CUDA Kernel Statistics:
Time(%) Total Time (ns) Instances Average Minimum Maximum Name
------- --------------- --------- ---------- ------- -------- -------------------------------------------
96.0 40732060 3 13577353.3 619599 20614807 initWith(float, float*, int)
4.0 1710960 1 1710960.0 1710960 1710960 addVectorsInto(float*, float*, float*, int)
2 prefetches
CUDA Kernel Statistics:
Time(%) Total Time (ns) Instances Average Minimum Maximum Name
------- --------------- --------- --------- ------- -------- -------------------------------------------
92.9 22514595 3 7504865.0 620718 21269383 initWith(float, float*, int)
7.1 1712015 1 1712015.0 1712015 1712015 addVectorsInto(float*, float*, float*, int)
40732060
CUDA Memory Operation Statistics (by time):
Time(%) Total Time (ns) Operations Average Minimum Maximum Operation
------- --------------- ---------- ------- ------- ------- ---------------------------------
100.0 21200406 768 27604.7 1599 160123 [CUDA Unified Memory memcpy DtoH]
21203394
3 prefetches
CUDA Kernel Statistics:
Time(%) Total Time (ns) Instances Average Minimum Maximum Name
------- --------------- --------- --------- ------- ------- -------------------------------------------
52.2 1867563 3 622521.0 618639 624558 initWith(float, float*, int)
47.8 1711696 1 1711696.0 1711696 1711696 addVectorsInto(float*, float*, float*, int)
40732060
CUDA Memory Operation Statistics (by time):
Time(%) Total Time (ns) Operations Average Minimum Maximum Operation
------- --------------- ---------- ------- ------- ------- ---------------------------------
100.0 21201498 768 27606.1 1599 160028 [CUDA Unified Memory memcpy DtoH]
prefetch back to cpu
CUDA Kernel Statistics:
Time(%) Total Time (ns) Instances Average Minimum Maximum Name
------- --------------- --------- --------- ------- ------- -------------------------------------------
52.3 1871467 3 623822.3 620335 627342 initWith(float, float*, int)
47.7 1706448 1 1706448.0 1706448 1706448 addVectorsInto(float*, float*, float*, int)
CUDA Memory Operation Statistics (by time):
Time(%) Total Time (ns) Operations Average Minimum Maximum Operation
------- --------------- ---------- ------- ------- ------- ---------------------------------
21201498
100.0 21007255 592 35485.2 1599 319735 [CUDA Unified Memory memcpy DtoH]
*/