NVPROF showing GPU page faults even though I am using cudaMemPrefetchAsync

for (i = 0; i < NI; i++)
{
	for (j = 0; j < NL; j++)
	{
		// D[i*NL + j] = ((DATA_TYPE) i*(j+2)) / NK;
		D_gpu[i*NL + j] = ((DATA_TYPE) i*(j+2)) / NK;
	}
}
cudaMemAdvise(D_gpu, sizeof(DATA_TYPE) * NI * NL, cudaMemAdviseSetReadMostly, 0);
cudaMemPrefetchAsync(D_gpu, sizeof(DATA_TYPE) * NI * NL, 0, s4);

cudaDeviceSynchronize();
mm2_kernel1<<<grid1, block, 0, s5>>>(A_gpu, B_gpu, C_gpu);
cudaDeviceSynchronize();
mm2_kernel2<<<grid2, block, 0, s5>>>(C_gpu, D_gpu, E_gpu);
cudaDeviceSynchronize();

Consider the above code snippet as an example. Whether I run it with or without the cudaMemAdvise call, the execution time is the same. Note that in the kernels, D_gpu is only ever read.
Moreover, when I profile the same code using nvprof, it reports that Host-To-Device page faults have occurred on the GPU side. But since all the pages should already have been transferred to the GPU by the prefetch, there should not be any page faults. Can someone please help me understand this?

Without a complete example, for me, anyway, I can only speculate.

Perhaps one or more of A_gpu, B_gpu, C_gpu, or E_gpu are managed allocations that have not been fully prefetched.

Perhaps your kernel is accessing the D_gpu allocation outside of the apparent size.

Perhaps your usage of cudaMemAdvise is incorrect. There are sub-questions that could be asked, but I’d prefer to just have a complete example.
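For reference, this is the general pattern I would expect for a read-mostly managed allocation, with return codes checked so that a silently failing call doesn't go unnoticed. This is just a sketch; the sizes, device 0, and stream s4 mirror your snippet:

// Sketch only, assuming device 0 and an already-created stream s4, as in the snippet above.
DATA_TYPE *D_gpu;
size_t bytes = sizeof(DATA_TYPE) * NI * NL;
cudaError_t err = cudaMallocManaged(&D_gpu, bytes);
assert(err == cudaSuccess);
// ... populate D_gpu on the host ...
err = cudaMemAdvise(D_gpu, bytes, cudaMemAdviseSetReadMostly, 0);  // advise: read-mostly on device 0
assert(err == cudaSuccess);
err = cudaMemPrefetchAsync(D_gpu, bytes, 0, s4);                   // migrate pages to device 0
assert(err == cudaSuccess);
err = cudaStreamSynchronize(s4);                                   // the prefetch is asynchronous
assert(err == cudaSuccess);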

Here is the full example:

__global__ void mm2_kernel1(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C)
{
	int j = blockIdx.x * blockDim.x + threadIdx.x;
	int i = blockIdx.y * blockDim.y + threadIdx.y;

	if ((i < NI) && (j < NJ))
	{ 
		int k;
		for (k = 0; k < NK; k++)
		{
			C[i * NJ + j] += A[i * NK + k] * B[k * NJ + j];
		}
	}
}


__global__ void mm2_kernel2(DATA_TYPE *C, DATA_TYPE *D, DATA_TYPE *E)
{
	int j = blockIdx.x * blockDim.x + threadIdx.x;
	int i = blockIdx.y * blockDim.y + threadIdx.y;

	if ((i < NI) && (j < NL))
	{ 
		int k;
		for (k = 0; k < NJ; k++)
		{
			E[i * NL + j] += C[i * NJ + k] * D[k * NL + j];
		}
	}
}


	int i, j;

	for (i = 0; i < NI; i++)
	{
		for (j = 0; j < NK; j++)
		{
			// A[i*NI + j] = ((DATA_TYPE) i*j) / NI;
			A_gpu[i*NI + j] = ((DATA_TYPE) i*j) / NI;
		}
	}

	cudaMemAdvise(A_gpu, sizeof(DATA_TYPE) * NI * NK , cudaMemAdviseSetReadMostly, 0);
	cudaMemPrefetchAsync(A_gpu, sizeof(DATA_TYPE) * NI * NK, 0, s1);

	for (i = 0; i < NK; i++)
	{
		for (j = 0; j < NJ; j++)
		{
			// B[i*NK + j] = ((DATA_TYPE) i*(j+1)) / NJ;
			B_gpu[i*NK + j] = ((DATA_TYPE) i*(j+1)) / NJ;
		}
	}
	cudaMemAdvise(B_gpu, sizeof(DATA_TYPE) * NK * NJ , cudaMemAdviseSetReadMostly, 0);
	cudaMemPrefetchAsync(B_gpu, sizeof(DATA_TYPE) * NK * NJ, 0, s2);
	for (i = 0; i < NL; i++)
	{
		for (j = 0; j < NJ; j++)
		{
			// C[i*NL + j] = ((DATA_TYPE) i*(j+3)) / NL;
			C_gpu[i*NL + j] = ((DATA_TYPE) i*(j+3)) / NL;
		}
	}
	cudaMemPrefetchAsync(C_gpu, sizeof(DATA_TYPE) * NL * NJ, 0, s3);
	for (i = 0; i < NI; i++)
	{
		for (j = 0; j < NL; j++)
		{
			// D[i*NL + j] = ((DATA_TYPE) i*(j+2)) / NK;	
			D_gpu[i*NL + j] = ((DATA_TYPE) i*(j+2)) / NK;
		}
	}
	cudaMemAdvise(D_gpu, sizeof(DATA_TYPE) * NI * NL , cudaMemAdviseSetReadMostly, 0);
	cudaMemPrefetchAsync(D_gpu, sizeof(DATA_TYPE) * NI * NL, 0, s4);

	cudaDeviceSynchronize();
	mm2_kernel1<<<grid1, block, 0, s5>>>(A_gpu, B_gpu, C_gpu);
	cudaDeviceSynchronize();
	mm2_kernel2<<<grid2, block, 0, s5>>>(C_gpu, D_gpu, E_gpu);
	cudaDeviceSynchronize();

The nvprof output is the following:

==3942511== Unified Memory profiling result:
Device "Tesla P100-PCIE-16GB (0)"
   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
     128  2.0000MB  2.0000MB  2.0000MB  256.0000MB  37.47997ms  Host To Device
     219         -         -         -           -  37.55766ms  Gpu page fault groups
Total CPU Page faults: 768

If you want to provide an example that I can copy, paste, compile, and run, without having to add anything or change anything, I’ll take a look as time permits.

Please use the following code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <unistd.h>
#include <sys/time.h>
#include <cuda.h>
#include <iostream>
#include <chrono>

#define PREF 1

// #include "../../../common/polybenchUtilFuncts.h"

//define the error threshold for the results "not matching"
#define PERCENT_DIFF_ERROR_THRESHOLD 0.05

#define GPU_DEVICE 0

/* Problem size. */
# define NI 4096
# define NJ 4096
# define NK 4096
# define NL 4096

/* Thread block dimensions */
#define DIM_THREAD_BLOCK_X 32
#define DIM_THREAD_BLOCK_Y 8

/* Can switch DATA_TYPE between float and double */
typedef float DATA_TYPE;

cudaStream_t s1, s2, s3, s4, s5;

void init_array(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, DATA_TYPE* D, DATA_TYPE* A_gpu, DATA_TYPE* B_gpu, DATA_TYPE* C_gpu, DATA_TYPE* D_gpu)
{
	int i, j;

	for (i = 0; i < NI; i++)
	{
		for (j = 0; j < NK; j++)
		{
			// A[i*NI + j] = ((DATA_TYPE) i*j) / NI;
			A_gpu[i*NI + j] = ((DATA_TYPE) i*j) / NI;
		}
	}

	for (i = 0; i < NK; i++)
	{
		for (j = 0; j < NJ; j++)
		{
			// B[i*NK + j] = ((DATA_TYPE) i*(j+1)) / NJ;
			B_gpu[i*NK + j] = ((DATA_TYPE) i*(j+1)) / NJ;
		}
	}

	for (i = 0; i < NL; i++)
	{
		for (j = 0; j < NJ; j++)
		{
			// C[i*NL + j] = ((DATA_TYPE) i*(j+3)) / NL;
			C_gpu[i*NL + j] = ((DATA_TYPE) i*(j+3)) / NL;
		}
	}

	for (i = 0; i < NI; i++)
	{
		for (j = 0; j < NL; j++)
		{
			// D[i*NL + j] = ((DATA_TYPE) i*(j+2)) / NK;	
			D_gpu[i*NL + j] = ((DATA_TYPE) i*(j+2)) / NK;
		}
	}
}


// void compareResults(DATA_TYPE *E, DATA_TYPE *E_outputFromGpu)
// {
// 	int i,j,fail;
// 	fail = 0;

// 	for (i=0; i < NL; i++)
// 	{
// 		for (j=0; j < NI; j++)
// 		{
// 			if (percentDiff(E[i*NI + j], E_outputFromGpu[i*NI + j]) > PERCENT_DIFF_ERROR_THRESHOLD)
// 			{
// 				fail++;
// 			}
// 		}
// 	}
	
// 	// print results
// 	printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %d\n", PERCENT_DIFF_ERROR_THRESHOLD, fail);
// }


void GPU_argv_init()
{
	cudaDeviceProp deviceProp;
	cudaGetDeviceProperties(&deviceProp, GPU_DEVICE);
	printf("setting device %d with name %s\n",GPU_DEVICE,deviceProp.name);
	cudaSetDevice( GPU_DEVICE );
}


// mm2_kernel1: C += A * B
__global__ void mm2_kernel1(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C)
{
	int j = blockIdx.x * blockDim.x + threadIdx.x;
	int i = blockIdx.y * blockDim.y + threadIdx.y;

	if ((i < NI) && (j < NJ))
	{ 
		int k;
		for (k = 0; k < NK; k++)
		{
			C[i * NJ + j] += A[i * NK + k] * B[k * NJ + j];
		}
	}
}


// mm2_kernel2: E += C * D
__global__ void mm2_kernel2(DATA_TYPE *C, DATA_TYPE *D, DATA_TYPE *E)
{
	int j = blockIdx.x * blockDim.x + threadIdx.x;
	int i = blockIdx.y * blockDim.y + threadIdx.y;

	if ((i < NI) && (j < NL))
	{ 
		int k;
		for (k = 0; k < NJ; k++)
		{
			E[i * NL + j] += C[i * NJ + k] * D[k * NL + j];
		}
	}
}


void mm2_cpu(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* C, DATA_TYPE* D, DATA_TYPE* E)
{
	int i, j, k;
	
  	for (i = 0; i < NI; i++)
	{
		for (j = 0; j < NJ; j++)
		{
			C[i*NJ + j] = 0.0;
			for (k = 0; k < NK; ++k)
			{
				C[i*NJ + j] += A[i*NK + k] * B[k*NJ + j];
			}
		}
	}
	
	for (i = 0; i < NI; i++)
	{
		for (j = 0; j < NL; j++)
		{
			E[i*NL + j] = 0.0;
			for (k = 0; k < NJ; ++k)
			{
				E[i*NL + j] += C[i*NJ + k] * D[k*NL + j];
			}
		}
	}
}


void mm2Cuda(DATA_TYPE* A_gpu, DATA_TYPE* B_gpu, DATA_TYPE* C_gpu, DATA_TYPE* D_gpu, DATA_TYPE* E_gpu)
{
	double t_start, t_end;
	dim3 block(DIM_THREAD_BLOCK_X, DIM_THREAD_BLOCK_Y);
	dim3 grid1((size_t)ceil( ((float)NJ) / ((float)block.x) ), (size_t)ceil( ((float)NI) / ((float)block.y)) );
	dim3 grid2((size_t)ceil( ((float)NL) / ((float)block.x) ), (size_t)ceil( ((float)NI) / ((float)block.y)) );
	// t_start = rtclock();
	mm2_kernel1<<<grid1,block, 0, s5>>>(A_gpu, B_gpu, C_gpu);
	cudaDeviceSynchronize();
	mm2_kernel2<<<grid2,block, 0, s5>>>(C_gpu, D_gpu, E_gpu);
	cudaDeviceSynchronize();
	// t_end = rtclock();
	// fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
}


int main(int argc, char** argv)
{
	double t_start, t_end;
	
	DATA_TYPE* C;
	DATA_TYPE* A;
	DATA_TYPE* B;
	DATA_TYPE* D;
	DATA_TYPE* E;

	DATA_TYPE *A_gpu;
	DATA_TYPE *B_gpu;
	DATA_TYPE *C_gpu;
	DATA_TYPE *D_gpu;
	DATA_TYPE *E_gpu;

	C = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));
	A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE));
	B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE));
	D = (DATA_TYPE*)malloc(NJ*NL*sizeof(DATA_TYPE));
	E = (DATA_TYPE*)malloc(NI*NL*sizeof(DATA_TYPE));


	cudaMallocManaged(&A_gpu, sizeof(DATA_TYPE) * NI * NK);
	cudaMallocManaged(&B_gpu, sizeof(DATA_TYPE) * NK * NJ);
	cudaMallocManaged(&C_gpu, sizeof(DATA_TYPE) * NI * NJ);
	cudaMallocManaged(&D_gpu, sizeof(DATA_TYPE) * NJ * NL);
	cudaMallocManaged(&E_gpu, sizeof(DATA_TYPE) * NI * NL);

	cudaStreamCreate(&s1);
	cudaStreamCreate(&s2);
	cudaStreamCreate(&s3);
	cudaStreamCreate(&s4);
	cudaStreamCreate(&s5);


	auto start = std::chrono::high_resolution_clock::now();
  	// init_array(A, B, C, D, A_gpu, B_gpu, C_gpu, D_gpu);
	#if PREF
	int i, j;

	for (i = 0; i < NI; i++)
	{
		for (j = 0; j < NK; j++)
		{
			// A[i*NI + j] = ((DATA_TYPE) i*j) / NI;
			A_gpu[i*NI + j] = ((DATA_TYPE) i*j) / NI;
		}
	}

	cudaMemAdvise(A_gpu, sizeof(DATA_TYPE) * NI * NK , cudaMemAdviseSetReadMostly, 0);
	cudaMemPrefetchAsync(A_gpu, sizeof(DATA_TYPE) * NI * NK, 0, s1);

	for (i = 0; i < NK; i++)
	{
		for (j = 0; j < NJ; j++)
		{
			// B[i*NK + j] = ((DATA_TYPE) i*(j+1)) / NJ;
			B_gpu[i*NK + j] = ((DATA_TYPE) i*(j+1)) / NJ;
		}
	}
	cudaMemAdvise(B_gpu, sizeof(DATA_TYPE) * NK * NJ , cudaMemAdviseSetReadMostly, 0);
	cudaMemPrefetchAsync(B_gpu, sizeof(DATA_TYPE) * NK * NJ, 0, s2);
	for (i = 0; i < NL; i++)
	{
		for (j = 0; j < NJ; j++)
		{
			// C[i*NL + j] = ((DATA_TYPE) i*(j+3)) / NL;
			C_gpu[i*NL + j] = ((DATA_TYPE) i*(j+3)) / NL;
		}
	}
	cudaMemPrefetchAsync(C_gpu, sizeof(DATA_TYPE) * NL * NJ, 0, s3);
	for (i = 0; i < NI; i++)
	{
		for (j = 0; j < NL; j++)
		{
			// D[i*NL + j] = ((DATA_TYPE) i*(j+2)) / NK;	
			D_gpu[i*NL + j] = ((DATA_TYPE) i*(j+2)) / NK;
		}
	}
	cudaMemAdvise(D_gpu, sizeof(DATA_TYPE) * NI * NL , cudaMemAdviseSetReadMostly, 0);
	cudaMemPrefetchAsync(D_gpu, sizeof(DATA_TYPE) * NI * NL, 0, s4);
	#else
	int i, j;

	for (i = 0; i < NI; i++)
	{
		for (j = 0; j < NK; j++)
		{
			// A[i*NI + j] = ((DATA_TYPE) i*j) / NI;
			A_gpu[i*NI + j] = ((DATA_TYPE) i*j) / NI;
		}
	}

	for (i = 0; i < NK; i++)
	{
		for (j = 0; j < NJ; j++)
		{
			// B[i*NK + j] = ((DATA_TYPE) i*(j+1)) / NJ;
			B_gpu[i*NK + j] = ((DATA_TYPE) i*(j+1)) / NJ;
		}
	}

	for (i = 0; i < NL; i++)
	{
		for (j = 0; j < NJ; j++)
		{
			// C[i*NL + j] = ((DATA_TYPE) i*(j+3)) / NL;
			C_gpu[i*NL + j] = ((DATA_TYPE) i*(j+3)) / NL;
		}
	}

	for (i = 0; i < NI; i++)
	{
		for (j = 0; j < NL; j++)
		{
			// D[i*NL + j] = ((DATA_TYPE) i*(j+2)) / NK;	
			D_gpu[i*NL + j] = ((DATA_TYPE) i*(j+2)) / NK;
		}
	}
	#endif

	GPU_argv_init();

	cudaDeviceSynchronize();
	mm2Cuda(A_gpu, B_gpu, C_gpu, D_gpu, E_gpu);

	auto end = std::chrono::high_resolution_clock::now();
	auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
	std::cout << "Elapsed Time: " << duration << " ms\n";

	// t_start = rtclock();
	// mm2_cpu(A, B, C, D, E);
	// t_end = rtclock();
	// fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

	// compareResults(E, E_gpu);

	free(C);
	free(A);
	free(B);
	free(D);
	free(E);
	cudaFree(A_gpu);
	cudaFree(B_gpu);
	cudaFree(C_gpu);
	cudaFree(D_gpu);
	cudaFree(E_gpu);
	
  	return 0;
}

To use the prefetch path, set #define PREF 1; otherwise set #define PREF 0.

No, it doesn't.

It shows host-to-device migrations of 256MB total, corresponding to your four arrays A_gpu through D_gpu (4 × 4096 × 4096 × 4 bytes = 256MB). These are migrations:

     128  2.0000MB  2.0000MB  2.0000MB  256.0000MB  37.47997ms  Host To Device

and they are triggered by the cudaMemPrefetchAsync() calls. The words "page fault" do not appear anywhere on that line, up through and including the "Host To Device" label. That line does not represent page faults.

These are page faults:

     219         -         -         -           -  37.55766ms  Gpu page fault groups

Those page faults do not represent migration of data, but rather instantiation of storage/pages: your mm2 kernel uses E_gpu, but you never prefetch it or otherwise instantiate it before its first use in that kernel.
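As a quick check (my suggestion; not something the profiler requires), prefetching E_gpu as well should make those fault groups disappear, since its pages would then be instantiated on the device up front:

// Sketch: instantiate/prefetch E_gpu on device 0 before the kernel launches,
// so first-touch page instantiation no longer shows up as GPU page fault groups.
cudaMemPrefetchAsync(E_gpu, sizeof(DATA_TYPE) * NI * NL, 0, s5);
cudaDeviceSynchronize();  // prefetches are asynchronous; wait before launching the kernels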

This question/answer may be of interest.

If you want to see page-fault-driven migration of data, comment out one of the prefetch lines. For example, when I comment out the prefetch of A_gpu, the profiler output changes to this:

==2274== Unified Memory profiling result:
Device "NVIDIA GeForce GTX 1660 SUPER (0)"
   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
    1704  153.84KB  4.0000KB  2.0000MB  256.0000MB  22.81851ms  Host To Device

We now see that instead of 128 migrations of 2MB chunks, there is a mixture of chunk sizes, including 2MB and 4KB. The 4KB entries represent page-fault-triggered migration.
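Concretely, the experiment above is just this one-line change in the PREF path (A_gpu was an arbitrary choice; disabling any of the four prefetches would show the same effect):

cudaMemAdvise(A_gpu, sizeof(DATA_TYPE) * NI * NK, cudaMemAdviseSetReadMostly, 0);
// cudaMemPrefetchAsync(A_gpu, sizeof(DATA_TYPE) * NI * NK, 0, s1);  // disabled: A_gpu now migrates on demand, largely in 4KB faulted pages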

That really helps, thank you.
