Hi.
In my library, I experience a very long compilation time when I compile the code with NVCC 9,
which prevents me from implementing CUDA 9 stuff.
I have tracked down the problem to a specific kernel which is included in the minimal example below.
When I compile with NVCC V9.0.176, it takes 255 minutes to compile. Most of the time is spent by tool cicc.
With NVCC V8.0.61, compilation takes 2 seconds.
I use this command to compile. Host compiler is gcc 5.4.1:
nvcc -std=c++11 -O2 -gencode arch=compute_60,code=sm_60 -Xcudafe "--diag_suppress=subscript_out_of_range" -Xcompiler "-Wall" -Xptxas "" file.cu -o file
For further tests, I added the two defines in the beginning of the code.
If BASIS is not defined, part of the code is disabled.
If MEMBERELEM is not defined, the function getPitchedElement(…) is a global function rather than a const member function.
The following compilation times were observed with NVCC 9:
!BASIS MEMBERELEM 16s
!BASIS !MEMBERELEM 2s
BASIS !MEMBERELEM 15m15s
BASIS MEMBERELEM 255m16s
If I change the kernel template parameter in the main function to 3 instead of 4, compilation with nvcc 9 takes only 8 seconds for both BASIS and MEMBERELEM defined.
Can somebody explain to me what is going wrong? Could this be a compiler bug?
#define BASIS
#define MEMBERELEM
enum NeutrinoType { Neutrino = 0, Antineutrino = 1, Both = 2};
#ifdef BASIS
enum Basis {Mass = 0, Flavor = 1};
#endif
__device__
constexpr double HI_constants(){
return 1.41421356237310 * 1.16639e-23 * 6.0221415e+23 / (1.0e-2*5.06773093741e6) / (1.0e-2*5.06773093741e6) / (1.0e-2*5.06773093741e6);
}
#ifndef MEMBERELEM
template<typename T>
__device__
T* getPitchedElement(T* base, size_t row, size_t col, size_t pitch){
return (T*)((char*)base + row * pitch) + col;
}
#endif
template<unsigned int N>
__device__
void iCommutator(double* result, const double* left, const double* right){
static_assert(3 <= N && N <= 4, "3 <= N && N <= 4");
switch(N){
case 3:
result[0] = 0;
result[1] = left[7]*right[2] + 2.*left[4]*right[3]
- 2.*left[3]*right[4] + left[6]*right[5]
- left[5]*right[6] - left[2]*right[7];
result[2] = -(left[7]*right[1]) - left[5]*right[3]
+ left[3]*right[5] + (left[4] + sqrt((double)3)*left[8])*right[6]
+ left[1]*right[7] - left[6]*(right[4] + sqrt((double)3)*right[8]);
result[3] = -2.*left[4]*right[1] + left[5]*right[2]
+ 2*left[1]*right[4] - left[2]*right[5]
+ left[7]*right[6] - left[6]*right[7];
result[4] = 2.*left[3]*right[1] + left[6]*right[2]
- 2.*left[1]*right[3] - left[7]*right[5]
- left[2]*right[6] + left[5]*right[7];
result[5] = -(left[6]*right[1]) - left[3]*right[2]
+ left[2]*right[3] + left[1]*right[6]
+ (-left[4] + sqrt((double)3)*left[8])*right[7]
+ left[7]*(right[4] - sqrt((double)3)*right[8]);
result[6] = left[5]*right[1] - (left[4] + sqrt((double)3)*left[8])*right[2]
- left[7]*right[3] - left[1]*right[5] + left[3]*right[7]
+ left[2]*(right[4] + sqrt((double)3)*right[8]);
result[7] = left[2]*right[1] - left[1]*right[2]
+ left[6]*right[3] + (left[4] - sqrt((double)3)*left[8])*right[5]
- left[3]*right[6] + left[5]*(-right[4] + sqrt((double)3)*right[8]);
result[8] = sqrt((double)3)*(left[6]*right[2] + left[7]*right[5] - left[2]*right[6] - left[5]*right[7]);
break;
case 4:
result[0] = 0;
result[1] = left[9]*right[2]
+ left[13]*right[3]
+ 2*left[5]*right[4]
- 2*left[4]*right[5]
+ left[8]*right[6]
+ left[12]*right[7]
- left[6]*right[8]
- left[2]*right[9]
- left[7]*right[12]
- left[3]*right[13];
result[2] = -(left[9]*right[1])
+ left[14]*right[3]
- left[6]*right[4]
+ left[4]*right[6]
+ (left[5] + sqrt((double)3)*left[10])*right[8]
+ left[1]*right[9]
- left[8]*(right[5] + sqrt((double)3)*right[10])
+ left[12]*right[11]
- left[11]*right[12]
- left[3]*right[14];
result[3] = -(left[13]*right[1])
- left[14]*right[2]
- left[7]*right[4]
+ left[4]*right[7]
- left[11]*right[8]
+ left[8]*right[11]
+ ((3*left[5] + sqrt((double)3)*left[10] + 2*sqrt((double)6)*left[15])*right[12])/3.
+ left[1]*right[13]
+ left[2]*right[14]
- (left[12]*(3*right[5] + sqrt((double)3)*right[10] + 2*sqrt((double)6)*right[15]))/3.;
result[4] = -2*left[5]*right[1]
+ left[6]*right[2]
+ left[7]*right[3]
+ 2*left[1]*right[5]
- left[2]*right[6]
- left[3]*right[7]
+ left[9]*right[8]
- left[8]*right[9]
+ left[13]*right[12]
- left[12]*right[13];
result[5] = 2*left[4]*right[1]
+ left[8]*right[2]
+ left[12]*right[3]
- 2*left[1]*right[4]
- left[9]*right[6]
- left[13]*right[7]
- left[2]*right[8]
+ left[6]*right[9]
- left[3]*right[12]
+ left[7]*right[13];
result[6] = -(left[8]*right[1])
- left[4]*right[2]
+ left[2]*right[4]
+ left[14]*right[7]
+ left[1]*right[8]
+ (-left[5] + sqrt((double)3)*left[10])*right[9]
+ left[9]*(right[5] - sqrt((double)3)*right[10])
+ left[13]*right[11]
- left[11]*right[13]
- left[7]*right[14];
result[7] = -(left[12]*right[1])
- left[4]*right[3]
+ left[3]*right[4]
- left[14]*right[6]
- left[11]*right[9]
+ left[9]*right[11]
+ left[1]*right[12]
+ ((-3*left[5] + sqrt((double)3)*left[10] + 2*sqrt((double)6)*left[15])*right[13])/3.
+ left[6]*right[14]
+ left[13]*(right[5] - (right[10] + 2*sqrt((double)2)*right[15])/sqrt((double)3));
result[8] = left[6]*right[1]
- (left[5] + sqrt((double)3)*left[10])*right[2]
+ left[11]*right[3]
- left[9]*right[4]
- left[1]*right[6]
+ left[4]*right[9]
+ left[2]*(right[5] + sqrt((double)3)*right[10])
- left[3]*right[11]
+ left[14]*right[12]
- left[12]*right[14];
result[9] = left[2]*right[1]
- left[1]*right[2]
+ left[8]*right[4]
+ (left[5] - sqrt((double)3)*left[10])*right[6]
+ left[11]*right[7]
- left[4]*right[8]
+ left[6]*(-right[5] + sqrt((double)3)*right[10])
- left[7]*right[11]
+ left[14]*right[13]
- left[13]*right[14];
result[10] = (3*left[8]*right[2]
+ left[12]*right[3]
+ 3*left[9]*right[6]
+ left[13]*right[7]
- 3*left[2]*right[8]
- 3*left[6]*right[9]
- 2*left[14]*right[11]
- left[3]*right[12]
- left[7]*right[13]
+ 2*left[11]*right[14])
/sqrt((double)3);
result[11] = -(left[12]*right[2])
- left[8]*right[3]
- left[13]*right[6]
- left[9]*right[7]
+ left[3]*right[8]
+ left[7]*right[9]
+ (3*left[2]*right[12] + 3*left[6]*right[13] - 2*sqrt((double)3)
*(left[10] - sqrt((double)2)*left[15])*right[14]
+ 2*sqrt((double)3)*left[14]
*(right[10] - sqrt((double)2)*right[15])
)/3.;
result[12] = left[7]*right[1]
+ (3*left[11]*right[2]
- (3*left[5] + sqrt((double)3)*left[10] + 2*sqrt((double)6)*left[15])*right[3]
- 3*left[13]*right[4]
- 3*(left[1]*right[7]
+ left[14]*right[8]
+ left[2]*right[11]
- left[4]*right[13]
- left[8]*right[14])
+ left[3]*(3*right[5] + sqrt((double)3)*right[10] + 2*sqrt((double)6)*right[15])
)/3.;
result[13] = left[3]*right[1]
- left[1]*right[3]
+ left[12]*right[4]
+ left[11]*right[6]
+ left[5]*right[7]
- ((left[10] + 2*sqrt((double)2)*left[15])*right[7])/sqrt((double)3)
- left[14]*right[9]
- left[6]*right[11]
- left[4]*right[12]
+ left[9]*right[14]
+ left[7]*(-right[5] + (right[10] + 2*sqrt((double)2)*right[15])/sqrt((double)3));
result[14] = left[3]*right[2]
- left[2]*right[3]
+ left[7]*right[6]
+ (-3*left[6]*right[7]
+ 3*left[12]*right[8]
+ 3*left[13]*right[9]
+ 2*sqrt((double)3)*(left[10] - sqrt((double)2)*left[15])*right[11]
- 3*(left[8]*right[12] + left[9]*right[13]) -
2*sqrt((double)3)*left[11]*(right[10] - sqrt((double)2)*right[15])
)/3.;
result[15] = 2*sqrt((double)0.6666666666666666)
*(left[12]*right[3]
+ left[13]*right[7]
+ left[14]*right[11]
- left[3]*right[12]
- left[7]*right[13]
- left[11]*right[14]);
break;
}
}
template<unsigned int N>
struct Physics{
static_assert(3 <= N && N <= 4, "3 <= N && N <= 4");
size_t n_rhos;
size_t n_energies;
double* states;
const double* cstates;
const double* y;
double* y_derived;
double density;
double electronFraction;
double t;
double* evolB1proj;
const double* H0_array;
size_t evolB1pitch;
size_t evoloffset;
size_t h0pitch;
size_t h0offset;
size_t statesOffset;
size_t statesPitch;
NeutrinoType neutrinoType;
#ifdef BASIS
Basis basis;
#endif
__host__ __device__
Physics(){}
__device__
void barrier_path() const{
__syncthreads();
}
#ifdef MEMBERELEM
template<typename T>
__device__
T* getPitchedElement(T* base, size_t row, size_t col, size_t pitch) const{
return (T*)((char*)base + row * pitch) + col;
}
#endif
__device__
void endDerive(){
if(threadIdx.x + blockIdx.x * blockDim.x == 0){
cstates = states;
}
barrier_path();
}
__device__
double getNucleonNumber() const{
return 1.0e-10;
};
__device__
void HI(double out[], size_t index_rho, size_t index_energy) const{
#pragma unroll
for(size_t i = 0; i < N * N; ++i){
out[i] = 0;
}
const double* evolB1projCosRho = getPitchedElement(evolB1proj,
index_rho * N * N * N,
0,
evolB1pitch);
double CC = HI_constants() * density * electronFraction;
double NC;
if (electronFraction < 1.0e-10){
NC = HI_constants() * density;
}else {
NC = CC*(-0.5*(1.0-electronFraction)/electronFraction);
}
if((index_rho == 1 && neutrinoType == Both)
|| neutrinoType == Antineutrino){
CC *= -1;
NC *= -1;
}
const double CCNC = CC + NC;
#pragma unroll
for(unsigned int j = 0; j < 3; j++){
double proj[N * N];
const double* evoldata = getPitchedElement(evolB1proj,
index_rho * N * N * N + j * N * N,
index_energy,
evolB1pitch);
#pragma unroll
for(int i = 0; i < N * N; i++){
proj[i] = evoldata[i * evoloffset];
}
#pragma unroll
for(size_t i = 0; i < N * N; ++i){
if(j == 0)
out[i] += CCNC * proj[i];
else
out[i] += NC * proj[i];
}
#ifdef BASIS
if(basis == Mass){
const double* h0data = getPitchedElement(H0_array, 0, index_energy, h0pitch);
#pragma unroll
for(size_t i = 0; i < N * N; ++i){
out[i] += h0data[i * h0offset];
}
}
#endif
}
}
__device__
void derive_osc(){
double result[N*N];
double tmp1[N*N];
double myCurrentState[N*N];
for(size_t index_rho = 0; index_rho < n_rhos; index_rho++){
for(size_t index_energy = threadIdx.x + blockDim.x * blockIdx.x;
index_energy < n_energies;
index_energy += blockDim.x * gridDim.x){
#pragma unroll
for(int i = 0; i < N * N; i++){
result[i] = 0;
tmp1[i] = 0;
}
const double* statedataIn = getPitchedElement(y,
index_rho * N * N,
index_energy,
statesPitch);
#pragma unroll
for(int i = 0; i < N * N; i++)
myCurrentState[i] = statedataIn[i * statesOffset];
HI(tmp1, index_rho, index_energy);
iCommutator<N>(result, myCurrentState, tmp1);
double* statedataOut = getPitchedElement(y_derived,
index_rho * N * N,
index_energy,
statesPitch);
#pragma unroll
for(int i = 0; i < N * N; i++)
statedataOut[i * statesOffset] = result[i];
}
}
barrier_path();
endDerive();
}
};
template<unsigned int N>
__global__
void deriveOscKernel(const size_t* const activePaths, const size_t nPaths,
Physics<N>** pptr){
for(size_t index_y = blockIdx.y; index_y < nPaths; index_y += gridDim.y){
const size_t index_cosine = activePaths[index_y];
pptr[index_cosine]->derive_osc();
}
}
int main(){
deriveOscKernel<4><<<1, 1, 0, 0>>>(nullptr, 0, nullptr);
cudaDeviceSynchronize();
return 0;
}