compilation with nvcc 9 takes 4 hours instead of 2 seconds.

Hi.
In my library, I experience a very long compilation time when I compile the code with NVCC 9,
which prevents me from implementing CUDA 9 stuff.

I have tracked down the problem to a specific kernel which is included in the minimal example below.
When I compile with NVCC V9.0.176, it takes 255 minutes to compile. Most of the time is spent by tool cicc.
With NVCC V8.0.61, compilation takes 2 seconds.

I use this command to compile. Host compiler is gcc 5.4.1:

nvcc -std=c++11 -O2 -gencode arch=compute_60,code=sm_60 -Xcudafe "--diag_suppress=subscript_out_of_range" -Xcompiler "-Wall" -Xptxas "" file.cu -o file

For further tests, I added the two defines in the beginning of the code.
If BASIS is not defined, part of the code is disabled.
If MEMBERELEM is not defined, the function getPitchedElement(…) is a global function rather than a const member function.

The following compilation times were observed with NVCC 9:
!BASIS MEMBERELEM 16s
!BASIS !MEMBERELEM 2s
BASIS !MEMBERELEM 15m15s
BASIS MEMBERELEM 255m16s

If I change the kernel template parameter in the main function to 3 instead of 4, compilation with nvcc 9 takes only 8 seconds for both BASIS and MEMBERELEM defined.

Can somebody explain to me what is going wrong? Could this be a compiler bug?

#define BASIS
#define MEMBERELEM

enum NeutrinoType { Neutrino = 0, Antineutrino = 1, Both = 2};

#ifdef BASIS
enum Basis {Mass = 0, Flavor = 1};
#endif

__device__
constexpr double HI_constants(){
	return 1.41421356237310 * 1.16639e-23 * 6.0221415e+23 / (1.0e-2*5.06773093741e6) / (1.0e-2*5.06773093741e6) / (1.0e-2*5.06773093741e6);
}

#ifndef MEMBERELEM
	template<typename T>
	__device__
	T* getPitchedElement(T* base, size_t row, size_t col, size_t pitch){
		return (T*)((char*)base + row * pitch) + col;
	}	
#endif

template<unsigned int N>
__device__
void iCommutator(double* result, const double* left, const double* right){
	static_assert(3 <= N && N <= 4, "3 <= N && N <= 4");

	switch(N){
	case 3:
		result[0] = 0;
		result[1] = left[7]*right[2] + 2.*left[4]*right[3] 
			- 2.*left[3]*right[4] + left[6]*right[5] 
			- left[5]*right[6] - left[2]*right[7];
		result[2] = -(left[7]*right[1]) - left[5]*right[3] 
			+ left[3]*right[5] + (left[4] + sqrt((double)3)*left[8])*right[6] 
			+ left[1]*right[7] - left[6]*(right[4] + sqrt((double)3)*right[8]);
		result[3] = -2.*left[4]*right[1] + left[5]*right[2] 
			+ 2*left[1]*right[4] - left[2]*right[5] 
			+ left[7]*right[6] - left[6]*right[7];
		result[4] = 2.*left[3]*right[1] + left[6]*right[2] 
			- 2.*left[1]*right[3] - left[7]*right[5] 
			- left[2]*right[6] + left[5]*right[7];
		result[5] = -(left[6]*right[1]) - left[3]*right[2] 
			+ left[2]*right[3] + left[1]*right[6] 
			+ (-left[4] + sqrt((double)3)*left[8])*right[7] 
			+ left[7]*(right[4] - sqrt((double)3)*right[8]);
		result[6] = left[5]*right[1] - (left[4] + sqrt((double)3)*left[8])*right[2] 
			- left[7]*right[3] - left[1]*right[5] + left[3]*right[7] 
			+ left[2]*(right[4] + sqrt((double)3)*right[8]);
		result[7] = left[2]*right[1] - left[1]*right[2] 
			+ left[6]*right[3] + (left[4] - sqrt((double)3)*left[8])*right[5] 
			- left[3]*right[6] + left[5]*(-right[4] + sqrt((double)3)*right[8]);
		result[8] = sqrt((double)3)*(left[6]*right[2] + left[7]*right[5] - left[2]*right[6] - left[5]*right[7]);
		break;
	case 4:
		result[0] = 0;
		result[1] = left[9]*right[2] 
			+ left[13]*right[3] 
			+ 2*left[5]*right[4] 
			- 2*left[4]*right[5] 
			+ left[8]*right[6] 
			+ left[12]*right[7] 
			- left[6]*right[8] 
			- left[2]*right[9] 
			- left[7]*right[12] 
			- left[3]*right[13];
		result[2] = -(left[9]*right[1]) 
			+ left[14]*right[3] 
			- left[6]*right[4] 
			+ left[4]*right[6] 
			+ (left[5] + sqrt((double)3)*left[10])*right[8] 
			+ left[1]*right[9] 
			- left[8]*(right[5] + sqrt((double)3)*right[10]) 
			+ left[12]*right[11] 
			- left[11]*right[12] 
			- left[3]*right[14];
		result[3] = -(left[13]*right[1]) 
			- left[14]*right[2] 
			- left[7]*right[4] 
			+ left[4]*right[7] 
			- left[11]*right[8] 
			+ left[8]*right[11] 
			+ ((3*left[5] + sqrt((double)3)*left[10] + 2*sqrt((double)6)*left[15])*right[12])/3. 
			+ left[1]*right[13] 
			+ left[2]*right[14] 
			- (left[12]*(3*right[5] + sqrt((double)3)*right[10] + 2*sqrt((double)6)*right[15]))/3.;
		result[4] = -2*left[5]*right[1] 
			+ left[6]*right[2] 
			+ left[7]*right[3] 
			+ 2*left[1]*right[5] 
			- left[2]*right[6] 
			- left[3]*right[7] 
			+ left[9]*right[8] 
			- left[8]*right[9] 
			+ left[13]*right[12] 
			- left[12]*right[13];
		result[5] = 2*left[4]*right[1] 
			+ left[8]*right[2] 
			+ left[12]*right[3] 
			- 2*left[1]*right[4] 
			- left[9]*right[6] 
			- left[13]*right[7] 
			- left[2]*right[8] 
			+ left[6]*right[9] 
			- left[3]*right[12] 
			+ left[7]*right[13];
		result[6] = -(left[8]*right[1]) 
			- left[4]*right[2] 
			+ left[2]*right[4] 
			+ left[14]*right[7] 
			+ left[1]*right[8] 
			+ (-left[5] + sqrt((double)3)*left[10])*right[9] 
			+ left[9]*(right[5] - sqrt((double)3)*right[10]) 
			+ left[13]*right[11] 
			- left[11]*right[13] 
			- left[7]*right[14];
		result[7] = -(left[12]*right[1]) 
			- left[4]*right[3] 
			+ left[3]*right[4] 
			- left[14]*right[6] 
			- left[11]*right[9] 
			+ left[9]*right[11] 
			+ left[1]*right[12] 
			+ ((-3*left[5] + sqrt((double)3)*left[10] + 2*sqrt((double)6)*left[15])*right[13])/3. 
			+ left[6]*right[14] 
			+ left[13]*(right[5] - (right[10] + 2*sqrt((double)2)*right[15])/sqrt((double)3));
		result[8] = left[6]*right[1] 
			- (left[5] + sqrt((double)3)*left[10])*right[2] 
			+ left[11]*right[3] 
			- left[9]*right[4] 
			- left[1]*right[6] 
			+ left[4]*right[9] 
			+ left[2]*(right[5] + sqrt((double)3)*right[10]) 
			- left[3]*right[11] 
			+ left[14]*right[12] 
			- left[12]*right[14];
		result[9] = left[2]*right[1] 
			- left[1]*right[2] 
			+ left[8]*right[4] 
			+ (left[5] - sqrt((double)3)*left[10])*right[6] 
			+ left[11]*right[7] 
			- left[4]*right[8] 
			+ left[6]*(-right[5] + sqrt((double)3)*right[10]) 
			- left[7]*right[11] 
			+ left[14]*right[13] 
			- left[13]*right[14];
		result[10] = (3*left[8]*right[2] 
			+ left[12]*right[3] 
			+ 3*left[9]*right[6] 
			+ left[13]*right[7] 
			- 3*left[2]*right[8] 
			- 3*left[6]*right[9] 
			- 2*left[14]*right[11] 
			- left[3]*right[12] 
			- left[7]*right[13] 
			+ 2*left[11]*right[14])
			/sqrt((double)3);
		result[11] = -(left[12]*right[2]) 
			- left[8]*right[3] 
			- left[13]*right[6] 
			- left[9]*right[7] 
			+ left[3]*right[8] 
			+ left[7]*right[9] 
			+ (3*left[2]*right[12] + 3*left[6]*right[13] - 2*sqrt((double)3)
				*(left[10] - sqrt((double)2)*left[15])*right[14] 
				+ 2*sqrt((double)3)*left[14]
				*(right[10] - sqrt((double)2)*right[15])
			)/3.;
		result[12] = left[7]*right[1] 
			+ (3*left[11]*right[2] 
				- (3*left[5] + sqrt((double)3)*left[10] + 2*sqrt((double)6)*left[15])*right[3] 
				- 3*left[13]*right[4] 
				- 3*(left[1]*right[7] 
					+ left[14]*right[8] 
					+ left[2]*right[11] 
					- left[4]*right[13] 
					- left[8]*right[14]) 
				+ left[3]*(3*right[5] + sqrt((double)3)*right[10] + 2*sqrt((double)6)*right[15])
			)/3.;
		result[13] = left[3]*right[1] 
			- left[1]*right[3] 
			+ left[12]*right[4] 
			+ left[11]*right[6] 
			+ left[5]*right[7] 
			- ((left[10] + 2*sqrt((double)2)*left[15])*right[7])/sqrt((double)3) 
			- left[14]*right[9] 
			- left[6]*right[11] 
			- left[4]*right[12] 
			+ left[9]*right[14] 
			+ left[7]*(-right[5] + (right[10] + 2*sqrt((double)2)*right[15])/sqrt((double)3));
		result[14] = left[3]*right[2] 
			- left[2]*right[3] 
			+ left[7]*right[6] 
			+ (-3*left[6]*right[7] 
				+ 3*left[12]*right[8] 
				+ 3*left[13]*right[9] 
				+ 2*sqrt((double)3)*(left[10] - sqrt((double)2)*left[15])*right[11] 
				- 3*(left[8]*right[12] + left[9]*right[13]) -
				 2*sqrt((double)3)*left[11]*(right[10] - sqrt((double)2)*right[15])
			)/3.;
		result[15] = 2*sqrt((double)0.6666666666666666)
			*(left[12]*right[3] 
				+ left[13]*right[7] 
				+ left[14]*right[11] 
				- left[3]*right[12] 
				- left[7]*right[13] 
				- left[11]*right[14]);
		break;
	}
}

template<unsigned int N>
struct Physics{

	static_assert(3 <= N && N <= 4, "3 <= N && N <= 4");

	size_t n_rhos; 
	size_t n_energies;	
	double* states;
	const double* cstates;
	const double* y;
	double* y_derived;	
	double density;
	double electronFraction;
	double t;
	double* evolB1proj;	
	const double* H0_array;
	size_t evolB1pitch; 
	size_t evoloffset; 
	size_t h0pitch; 
	size_t h0offset;
	size_t statesOffset;
	size_t statesPitch;
	NeutrinoType neutrinoType;

#ifdef BASIS
	Basis basis;
#endif

	__host__ __device__
	Physics(){}

	__device__
	void barrier_path() const{
		__syncthreads();
	}
	
#ifdef MEMBERELEM
	template<typename T>
	__device__
	T* getPitchedElement(T* base, size_t row, size_t col, size_t pitch) const{
		return (T*)((char*)base + row * pitch) + col;
	}	
#endif
        
	__device__
	void endDerive(){
		if(threadIdx.x + blockIdx.x * blockDim.x == 0){
			cstates = states;
		}
		barrier_path();
	}

	__device__
	double getNucleonNumber() const{
		return 1.0e-10;
	};

        __device__
        void HI(double out[], size_t index_rho, size_t index_energy) const{

	    #pragma unroll
            for(size_t i = 0; i < N * N; ++i){
                out[i] = 0;
            }

	    const double* evolB1projCosRho = getPitchedElement(evolB1proj, 
                                        index_rho * N * N * N,
                                        0, 
                                        evolB1pitch);

            double CC = HI_constants() * density * electronFraction;
            double NC;

            if (electronFraction < 1.0e-10){
                NC = HI_constants() * density;
            }else {
                NC = CC*(-0.5*(1.0-electronFraction)/electronFraction);
            }

            if((index_rho == 1 && neutrinoType == Both) 
			|| neutrinoType == Antineutrino){
                CC *= -1;
                NC *= -1;
            }

const double CCNC = CC + NC;
            
            #pragma unroll
            for(unsigned int j = 0; j < 3; j++){
            
                double proj[N * N];
                
                const double* evoldata = getPitchedElement(evolB1proj, 
							index_rho * N * N * N + j * N * N,
                                                        index_energy, 
                                                        evolB1pitch); 

                #pragma unroll
                for(int i = 0; i < N * N; i++){
                    proj[i] = evoldata[i * evoloffset];
                }           

                #pragma unroll
                for(size_t i = 0; i < N * N; ++i){
                    if(j == 0)
                        out[i] += CCNC * proj[i];
                    else
                        out[i] += NC * proj[i];
                }

#ifdef BASIS
		if(basis == Mass){

		    const double* h0data = getPitchedElement(H0_array, 0, index_energy, h0pitch);

		    #pragma unroll
		    for(size_t i = 0; i < N * N; ++i){
			out[i] += h0data[i * h0offset];
		    }
		} 
#endif
            }

}

__device__
    void derive_osc(){

        double result[N*N];
        double tmp1[N*N];
        double myCurrentState[N*N];
 
        for(size_t index_rho = 0; index_rho < n_rhos; index_rho++){

            for(size_t index_energy = threadIdx.x + blockDim.x * blockIdx.x; 
                index_energy < n_energies; 
                index_energy += blockDim.x * gridDim.x){
            
                #pragma unroll
                for(int i = 0; i < N * N; i++){
                    result[i] = 0; 
                    tmp1[i] = 0;
                }
    
                const double* statedataIn = getPitchedElement(y, 
                                                index_rho * N * N,
                                                index_energy, 
                                                statesPitch); 
                    
                #pragma unroll
                for(int i = 0; i < N * N; i++)
                    myCurrentState[i] = statedataIn[i * statesOffset];
                
                HI(tmp1, index_rho, index_energy);
                               
		iCommutator<N>(result, myCurrentState, tmp1);

                double* statedataOut = getPitchedElement(y_derived, 
                                                index_rho * N * N,
                                                index_energy, 
                                                statesPitch);                            
                
                #pragma unroll
                for(int i = 0; i < N * N; i++)
                    statedataOut[i * statesOffset] = result[i];                      
            }
        }

        barrier_path();
    
        endDerive();
    }          
  
};

template<unsigned int N>
__global__
void deriveOscKernel(const size_t* const activePaths, const size_t nPaths,
                            Physics<N>** pptr){
	for(size_t index_y = blockIdx.y; index_y < nPaths; index_y += gridDim.y){

		const size_t index_cosine = activePaths[index_y];

		pptr[index_cosine]->derive_osc();
	}
}

int main(){
	deriveOscKernel<4><<<1, 1, 0, 0>>>(nullptr, 0, nullptr);
	cudaDeviceSynchronize();
	return 0;
}

I have filed a bug internally at NVIDIA to track this.

I note that if you change the compile target from 60 to 30, the compile time drops to a reasonable level. I’m not suggesting this is a workaround, just an observation.

That observation would suggest that the problem is inside PTXAS which does architecture-specific code generation (i.e. different code generators for architectures < sm_50 and >= sm_50).

Based on that, a recommended experiment for a possible workaround is to reduce the PTXAS optimization level (default: -O3), e.g. -Xptxas -O2 or -Xptxas -O1. That would have the advantage of leaving all architectural CUDA features accessible, with the trade-off of reduced performance.