Hello,
I have a kernel that is much slower when compiling it without the “-G” flag. The kernel is executed in debug mode in only 60ms, but in release mode it takes the GPU more than 400 ms to execute the same kernel. In order to make the two executables, debug and release, more comparable, I have deactivated any kind of optimization within each of the builds. The two compiles call commands look exactly the same, except for the “-G” attached to the debug compiler call.
I’m using CUDA 8.0 on Windows 7 with a Quadro M6000. Previously I used a GeForce GTX TITAN which only supported SM 3.5, but when compiling for this architecture, the kernel was executed in only 23ms, that was even faster. When I tried to launch the code compiled for SM 3.5 on the Quadro M6000, the code immediately crashed at the first cuda call which was a simple cudaFuncSetCacheConfig. I was thinking that there was some kind of backwards compability, am I wrong about this? Anyway, after changing to SM 5.2 the problem was fixed and all cuda calls worked again, except for the small performance problem.
I’m wondering how to debug this kind of error since I got no idea on how to fix this kind of bug. Btw: The rest of the code is changing as expected, being about 2 to 5 times faster when changing from debug to release, it is only this kernel.
This is the kernel I was talking about. I am sure it is far away from being perfectly optimized, but I still cannot explain the performance hit when changing removing the “-G” from the compiler call:
__global__ void gpu::jacobiSVD::calculateJacobiSVD2(float * const matR, float const * const scale,
unsigned int const covCnt,
float * const tmpStateMat, float * const stateMat, float * const tmpTrans,
float * const trans, float * const matL, float * const singVal) {
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < covCnt; i += gridDim.x * blockDim.x) {
unsigned int matOffsetl = i * 3 * 3;
unsigned int matOffsetm = i * 2 * 2;
//Step 1 is removed since covariances are always square matrices (atm. 3x3 matrices)
//Continuing with step 2, the Jacobi SVD iteration
bool running = true;
while (running) {
running = false;
// do a sweep: for all index pairs (p,q), perform SVD of the corresponding 2x2 sub-matrix
#pragma unroll 2
for (unsigned int p = 1; p < 3; ++p) {
#pragma unroll 3
for (unsigned int q = 0; q < p; ++q) {
//Threshold to make sure the loop ends!
float threshold = max(almostZero_d_, precision_d_ * max(abs(stateMat[matOffsetl + (p * 3 + p)]), abs(stateMat[matOffsetl + (q * 3 + q)]))) * 100;
if (abs(stateMat[matOffsetl + (q * 3 + p)]) > threshold || abs(stateMat[matOffsetl + (p * 3 + q)]) > threshold) {
running = true;
calculate2x2JacobiSVD(stateMat, matR, p, q, matOffsetl, matOffsetm, matL);
//Sum up resulting 2x2 Jacobi rotations
//Regarding State
multiplyState1(stateMat, matL, p, q, matOffsetl, matOffsetm, tmpStateMat);
multiplyState2(tmpStateMat, matR, p, q, matOffsetl, matOffsetm, stateMat);
//Regarding Transformation
multiplyTrans(trans, matL, p, q, matOffsetl, matOffsetm, tmpTrans);
memcpy(trans + matOffsetl, tmpTrans + matOffsetl, sizeof(float) * (3 * 3));
}
}
}
}
}
};
These are the device functions called by the kernel:
__device__ void gpu::jacobiSVD::multiplyState1(float const * const stateMat, float const * const matL,
unsigned int const p, unsigned int const q, unsigned int const matOffset1, unsigned int const matOffset2,
float * const tmpStateMat) {
tmpStateMat[matOffset1 + (0 * 3 + p)] = stateMat[matOffset1 + (0 * 3 + p)] * matL[matOffset2 + (0 * 2 + 0)]
+ stateMat[matOffset1 + (0 * 3 + q)] * matL[matOffset2 + (1 * 2 + 0)];
tmpStateMat[matOffset1 + (1 * 3 + p)] = stateMat[matOffset1 + (1 * 3 + p)] * matL[matOffset2 + (0 * 2 + 0)]
+ stateMat[matOffset1 + (1 * 3 + q)] * matL[matOffset2 + (1 * 2 + 0)];
tmpStateMat[matOffset1 + (2 * 3 + p)] = stateMat[matOffset1 + (2 * 3 + p)] * matL[matOffset2 + (0 * 2 + 0)]
+ stateMat[matOffset1 + (2 * 3 + q)] * matL[matOffset2 + (1 * 2 + 0)];
tmpStateMat[matOffset1 + (0 * 3 + q)] = stateMat[matOffset1 + (0 * 3 + p)] * matL[matOffset2 + (0 * 2 + 1)]
+ stateMat[matOffset1 + (0 * 3 + q)] * matL[matOffset2 + (1 * 2 + 1)];
tmpStateMat[matOffset1 + (1 * 3 + q)] = stateMat[matOffset1 + (1 * 3 + p)] * matL[matOffset2 + (0 * 2 + 1)]
+ stateMat[matOffset1 + (1 * 3 + q)] * matL[matOffset2 + (1 * 2 + 1)];
tmpStateMat[matOffset1 + (2 * 3 + q)] = stateMat[matOffset1 + (2 * 3 + p)] * matL[matOffset2 + (0 * 2 + 1)]
+ stateMat[matOffset1 + (2 * 3 + q)] * matL[matOffset2 + (1 * 2 + 1)];
unsigned int x = 3u - p - q; //Calc missing element
tmpStateMat[matOffset1 + (0 * 3 + x)] = stateMat[matOffset1 + (0 * 3 + x)];
tmpStateMat[matOffset1 + (1 * 3 + x)] = stateMat[matOffset1 + (1 * 3 + x)];
tmpStateMat[matOffset1 + (2 * 3 + x)] = stateMat[matOffset1 + (2 * 3 + x)];
}
__device__ void gpu::jacobiSVD::multiplyState2(float const * const stateMat, float const * const matR,
unsigned int const p, unsigned int const q, unsigned int const matOffset1, unsigned int const matOffset2,
float * const tmpStateMat) {
tmpStateMat[matOffset1 + (p * 3 + 0)] = stateMat[matOffset1 + (p * 3 + 0)] * matR[matOffset2 + (0 * 2 + 0)]
+ stateMat[matOffset1 + (q * 3 + 0)] * matR[matOffset2 + (0 * 2 + 1)];
tmpStateMat[matOffset1 + (p * 3 + 1)] = stateMat[matOffset1 + (p * 3 + 1)] * matR[matOffset2 + (0 * 2 + 0)]
+ stateMat[matOffset1 + (q * 3 + 1)] * matR[matOffset2 + (0 * 2 + 1)];
tmpStateMat[matOffset1 + (p * 3 + 2)] = stateMat[matOffset1 + (p * 3 + 2)] * matR[matOffset2 + (0 * 2 + 0)]
+ stateMat[matOffset1 + (q * 3 + 2)] * matR[matOffset2 + (0 * 2 + 1)];
tmpStateMat[matOffset1 + (q * 3 + 0)] = stateMat[matOffset1 + (p * 3 + 0)] * matR[matOffset2 + (1 * 2 + 0)]
+ stateMat[matOffset1 + (q * 3 + 0)] * matR[matOffset2 + (1 * 2 + 1)];
tmpStateMat[matOffset1 + (q * 3 + 1)] = stateMat[matOffset1 + (p * 3 + 1)] * matR[matOffset2 + (1 * 2 + 0)]
+ stateMat[matOffset1 + (q * 3 + 1)] * matR[matOffset2 + (1 * 2 + 1)];
tmpStateMat[matOffset1 + (q * 3 + 2)] = stateMat[matOffset1 + (p * 3 + 2)] * matR[matOffset2 + (1 * 2 + 0)]
+ stateMat[matOffset1 + (q * 3 + 2)] * matR[matOffset2 + (1 * 2 + 1)];
unsigned int x = 3u - p - q; //Calc missing element
tmpStateMat[matOffset1 + (x * 3 + 0)] = stateMat[matOffset1 + (x * 3 + 0)];
tmpStateMat[matOffset1 + (x * 3 + 1)] = stateMat[matOffset1 + (x * 3 + 1)];
tmpStateMat[matOffset1 + (x * 3 + 2)] = stateMat[matOffset1 + (x * 3 + 2)];
}
__device__ void gpu::jacobiSVD::multiplyTrans(float const * const trans, float const * const matL,
unsigned int const p, unsigned int const q, unsigned int const matOffset1, unsigned int const matOffset2,
float * const tmpTrans) {
tmpTrans[matOffset1 + (p * 3 + 0)] = trans[matOffset1 + (p * 3 + 0)] * matL[matOffset2 + (0 * 2 + 0)]
+ trans[matOffset1 + (q * 3 + 0)] * matL[matOffset2 + (1 * 2 + 0)];
tmpTrans[matOffset1 + (p * 3 + 1)] = trans[matOffset1 + (p * 3 + 1)] * matL[matOffset2 + (0 * 2 + 0)]
+ trans[matOffset1 + (q * 3 + 1)] * matL[matOffset2 + (1 * 2 + 0)];
tmpTrans[matOffset1 + (p * 3 + 2)] = trans[matOffset1 + (p * 3 + 2)] * matL[matOffset2 + (0 * 2 + 0)]
+ trans[matOffset1 + (q * 3 + 2)] * matL[matOffset2 + (1 * 2 + 0)];
tmpTrans[matOffset1 + (q * 3 + 0)] = trans[matOffset1 + (p * 3 + 0)] * matL[matOffset2 + (0 * 2 + 1)]
+ trans[matOffset1 + (q * 3 + 0)] * matL[matOffset2 + (1 * 2 + 1)];
tmpTrans[matOffset1 + (q * 3 + 1)] = trans[matOffset1 + (p * 3 + 1)] * matL[matOffset2 + (0 * 2 + 1)]
+ trans[matOffset1 + (q * 3 + 1)] * matL[matOffset2 + (1 * 2 + 1)];
tmpTrans[matOffset1 + (q * 3 + 2)] = trans[matOffset1 + (p * 3 + 2)] * matL[matOffset2 + (0 * 2 + 1)]
+ trans[matOffset1 + (q * 3 + 2)] * matL[matOffset2 + (1 * 2 + 1)];
unsigned int x = 3u - p - q; //Calc missing element
tmpTrans[matOffset1 + (x * 3 + 0)] = trans[matOffset1 + (x * 3 + 0)];
tmpTrans[matOffset1 + (x * 3 + 1)] = trans[matOffset1 + (x * 3 + 1)];
tmpTrans[matOffset1 + (x * 3 + 2)] = trans[matOffset1 + (x * 3 + 2)];
}