Problem with cusolver pivoting

I am trying to use cuSolver’s LU to solve a linear equation. My code is basically copied from the sample code in the manual.
When I run the same code WITHOUT pivot, it produces the correct solution to the linear equation. However, I want to be able to run with pivoting. When I run it WITH pivot , it produces a pivot list which does not make any sense.

Not sure if I am following forum etiquette here so if the question needs to be improved please tell me how. Code follows:

#define EQNS_TOTAL 2

CallMAC(cudaMalloc((void**)&d_Ipiv512, sizeof(int) * 4*EQNS_TOTAL));
CallMAC(cudaMalloc((void**)&p_RHS, sizeof(double) * 4 * EQNS_TOTAL));
CallMAC(cudaMalloc((void**)&d_info, sizeof(int) * 4 * EQNS_TOTAL));
CallMAC(cudaMalloc((void**)&p_eqns, sizeof(double) * 4 * EQNS_TOTAL * 4 * EQNS_TOTAL));

	cusolverDnHandle_t cusolverH = NULL;
	cudaStream_t stream = NULL;
	cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
	cudaError_t cudaStat1 = cudaSuccess;
	cudaError_t cudaStat2 = cudaSuccess;
	cudaError_t cudaStat3 = cudaSuccess;
	cudaError_t cudaStat4 = cudaSuccess;
	const int m = 4 * EQNS_TOTAL;
	const int lda = m;
	const int ldb = m;
	int lwork = 0;     /* size of workspace */

	const int pivot_on = 1;

// 4. Solve equations : cuSolver parallel solve…
//====================================================

		// Spit out matrix eqn for purpose of query on forums:
		cudaMemcpy(p_eqns_host, p_eqns, 4 * EQNS_TOTAL * 4 * EQNS_TOTAL * sizeof(f64),
			cudaMemcpyDeviceToHost);
		cudaMemcpy(p_temphost5, p_RHS, 4 * EQNS_TOTAL * sizeof(f64), cudaMemcpyDeviceToHost);
		for (i = 0; i < m; i++)
		{
			for (j = 0; j < m; j++)	printf("%1.3E  ", p_eqns_host[i + j*m]);
			printf(" | %1.4E\n", p_temphost5[i]);
		};
		printf("\n");

		if (pivot_on) {
			printf("pivot is on : compute P*A = L*U \n");
		} else {
			printf("pivot is off: compute A = L*U (not numerically stable)\n");
		};

		// step 1: create cusolver handle, bind a stream 
		status = cusolverDnCreate(&cusolverH);
		if (CUSOLVER_STATUS_SUCCESS != status) { printf("cusolverDnCreate(&cusolverH) failed.\n"); getch(); }
		cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
		if (cudaSuccess != cudaStat1) { printf("cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); failed.\n"); getch(); }
		status = cusolverDnSetStream(cusolverH, stream);
		if (CUSOLVER_STATUS_SUCCESS != status) {printf("cusolverDnSetStream(cusolverH, stream) failed.\n");	getch();}
		
		// step 2: 'copy A to device'
		// p_eqns is allocated on device, and so is p_RHS, d_Ipiv512, d_info
		// step 3: query working space of getrf 	
		//	status = cusolverDnDgetrf_bufferSize(
		//		cusolverH,	m,	m,	d_A,lda,&lwork);			
		status = cusolverDnDgetrf_bufferSize(
			cusolverH,
			4 * EQNS_TOTAL,	4 * EQNS_TOTAL,
			p_eqns,
			4 * EQNS_TOTAL,	&lwork);
		if (CUSOLVER_STATUS_SUCCESS != status) {
			printf("cusolverDnDgetrf_bufferSize failed.\n"); getch();
		} else {
			printf("Success:cusolverDnDgetrf_bufferSize\n");
		};
		CallMAC(cudaMalloc((void**)&d_work, sizeof(double)*lwork));

		// step 4: LU factorization 
		if (pivot_on) {
			status = cusolverDnDgetrf(
				cusolverH,
				4 * EQNS_TOTAL,	4 * EQNS_TOTAL,
				p_eqns,	lda, d_work, d_Ipiv512,	d_info);

// status = cusolverDnDgetrf(
// cusolverH, m, m, p_eqns, // 4N4N
// lda,d_work, d_Ipiv512, // 128
4 = 512
// d_info);
} else {
status = cusolverDnDgetrf(
cusolverH,
4*EQNS_TOTAL, 4 * EQNS_TOTAL,
p_eqns, lda, d_work, NULL, d_info);
}
if (CUSOLVER_STATUS_SUCCESS != status) {
printf(“cusolverDnDgetrf failed.\n”); getch();
} else {
printf(“cusolverDnDgetrf : CUSOLVER_STATUS_SUCCESS == status\n”);
}
CallMAC(cudaDeviceSynchronize());

		int info;
		if (pivot_on) {
			CallMAC(cudaMemcpy(Ipiv512, d_Ipiv512, sizeof(int)*m, cudaMemcpyDeviceToHost));
		} 
	//	cudaStat2 = cudaMemcpy(LU, p_eqns, sizeof(double)*4*4*EQNS_TOTAL, cudaMemcpyDeviceToHost);
		CallMAC(cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
		if (0 > info) {
			printf("%d-th parameter is wrong \n", -info);
			while (1) getch();
		}
		if ((pivot_on)) {
			printf("pivoting sequence: (m = %d; lda = %d)\n", m, lda);
			for (int j = 0; j < 4*EQNS_TOTAL; j++) {
				printf("Ipiv[%d] = %d\n", j, Ipiv512[j]);
				//if (j % 4 == 0) printf("");
			};
		}
		
		while (1) getch();

The results of this code are as follows:

Solving equations.
8.572E+01 -3.108E+00 -8.500E-03 -3.278E-01 -2.540E+01 -2.573E+00 2.961E-03 1.070E-01 | 1.0897E-01
-4.214E+00 2.537E+00 3.783E-03 1.868E-02 -3.014E+00 2.921E-01 -6.221E-04 1.185E-02 | -8.3694E-03
-9.300E-03 -3.135E-03 1.000E+00 0.000E+00 1.882E-03 9.359E-04 -2.495E-07 0.000E+00 | 2.9530E-04
-9.054E-05 4.613E-07 0.000E+00 1.000E+00 3.362E-05 3.446E-06 0.000E+00 -1.279E-05 | -5.3642E-05
-2.496E+01 -1.681E+00 2.863E-03 1.051E-01 6.689E+01 -2.609E+00 -6.629E-03 -2.823E-01 | 5.7258E-02
-4.116E+00 6.478E-01 -4.760E-04 1.659E-02 -2.502E+00 4.028E+00 3.267E-03 1.449E-02 | -8.0247E-03
2.367E-03 7.722E-04 -3.022E-07 0.000E+00 -5.390E-03 -2.485E-03 1.000E+00 0.000E+00 | 4.3246E-05
3.516E-05 5.624E-06 0.000E+00 -1.336E-05 -9.226E-05 2.015E-06 0.000E+00 1.000E+00 | -4.3637E-04

pivot is on : compute PA = LU
Success:cusolverDnDgetrf_bufferSize
Success: cudaMalloc((void**)&d_work, sizeof(double)*lwork) |||
cusolverDnDgetrf : CUSOLVER_STATUS_SUCCESS == status
Success: cudaDeviceSynchronize() |||
pivoting sequence: (m = 8; lda = 8)
Ipiv[0] = 1
Ipiv[1] = 5
Ipiv[2] = 3
Ipiv[3] = 4
Ipiv[4] = 5
Ipiv[5] = 6
Ipiv[6] = 7
Ipiv[7] = 8

To me, that does not make any sense. The solution vector also comes out incorrect. Thanks for any suggestions.