Hi, I have the following kernel.
__global__ void gCalcNNDists(float *tss, double *nnDistsComplete, int *numShAll,
                             const int numTrain, const int numPLabeled, const int tsLen, const int numSLens,
                             const int minSLen, const int maxSLen, const int sLenStep){
    int tid = threadIdx.x;
    int blockId = blockIdx.x;
    int blockSize = blockDim.x;

    // Dynamically allocated shared memory, reused with two overlaid layouts:
    // (sharedSeed, sharedNumShAll, sharedDistMem) and (ts1, ts2, buf, buf_0).
    extern __shared__ float array[];
    int *sharedSeed = (int *)array;
    int *sharedNumShAll = (int *)&sharedSeed[numPLabeled];
    float *sharedDistMem = (float *)&sharedNumShAll[numSLens];
    float *ts1 = (float *)array;
    float *ts2 = (float *)&ts1[tsLen];
    int *buf = (int *)&ts2[tsLen];
    int *buf_0 = (int *)&buf[tsLen * 3];

    int i, j, k, start, numIters = ceil((double)numSLens / blockSize);
    // Cooperatively stage numShAll in shared memory.
    for (i = 0; i < numIters; i++){
        start = i * blockSize + tid;
        if (start < numSLens)
            sharedNumShAll[start] = numShAll[start];
    }
    __syncthreads();

    double nnDist, term1, term2, s1, s1_2, mu1, sigma1, mu1_0, mu1_1, sigma1_0, sigma1_1,
           s2, s2_2, mu2, sigma2, corr, dotPr, dotPr_0, dotPr_1, nnCorr, nnCorr_0, nnCorr_1;
    int sLen, numSh, numSh_0, shId, idx, cnt, nextSh, idx_1, idx_2, numCharPerSh, elm_0, elm_1, elm_2;
    numCharPerSh = ceil((double)numTrain / (BITS_PER_BYTE * sizeof(char)));
    numSh_0 = sharedNumShAll[0];
    cnt = nextSh = 0;
    // One pass per subsequence length.
    for (sLen = minSLen; sLen <= maxSLen; sLen += sLenStep){
        numSh = sharedNumShAll[cnt++];
        numIters = ceil((double)tsLen / blockSize);

        // Load this block's time series into shared memory.
        for (i = 0; i < numIters; i++){
            start = i * blockSize + tid;
            if (start < tsLen)
                ts1[start] = tss[blockId * tsLen + start];
        }
        __syncthreads();

        numSh = gridDim.x * (tsLen - sLen + 1);

        // Mean and standard deviation of each subsequence of ts1, kept in
        // registers (the _0/_1 variants hold one value per iteration of the i loop).
        for (i = 0; i < numIters; i++){
            start = i * blockSize + tid;
            if (start < tsLen - sLen + 1){
                s1 = s1_2 = 0;
                for (k = 0; k < sLen; k++){
                    term1 = ts1[start + k];
                    s1 += term1;
                    s1_2 += term1 * term1;
                }
                mu1 = s1 / sLen;
                sigma1 = s1_2 / sLen > mu1 * mu1 ? sqrt(s1_2 / sLen - mu1 * mu1) : 1;
                if (i == 0){
                    mu1_0 = mu1;
                    sigma1_0 = sigma1;
                }
                else{
                    mu1_1 = mu1;
                    sigma1_1 = sigma1;
                }
            }
        }
        // Compare against every training series.
        for (j = 0; j < numTrain; j++){
            // Load training series j into shared memory.
            for (i = 0; i < numIters; i++){
                start = i * blockSize + tid;
                if (start < tsLen)
                    ts2[start] = tss[j * tsLen + start];
            }
            __syncthreads();

            for (i = 0; i < numIters; i++){
                start = i * blockSize + tid;
                if (start < tsLen - sLen + 1){
                    // Mean and standard deviation of the first window of ts2.
                    s2 = s2_2 = 0;
                    for (k = 0; k < sLen; k++){
                        term2 = ts2[k];
                        s2 += term2;
                        s2_2 += term2 * term2;
                    }
                    mu2 = s2 / sLen;
                    sigma2 = s2_2 / sLen > mu2 * mu2 ? sqrt(s2_2 / sLen - mu2 * mu2) : 1;

                    // Dot product of the first window of ts1 with the window of ts2 at "start",
                    // stored in buf_0 as three ints via decomposeDouble.
                    dotPr = 0;
                    for (k = 0; k < sLen; k++){
                        term1 = ts1[k];
                        term2 = ts2[start + k];
                        dotPr += term1 * term2;
                    }
                    decomposeDouble(dotPr, elm_0, elm_1, elm_2);
                    buf_0[3 * start] = elm_0;
                    buf_0[3 * start + 1] = elm_1;
                    buf_0[3 * start + 2] = elm_2;

                    // Dot product of the window of ts1 at "start" with the first window of ts2,
                    // stored in buf.
                    dotPr = 0;
                    for (k = 0; k < sLen; k++){
                        term1 = ts1[start + k];
                        term2 = ts2[k];
                        dotPr += term1 * term2;
                    }
                    decomposeDouble(dotPr, elm_0, elm_1, elm_2);
                    buf[3 * start] = elm_0;
                    buf[3 * start + 1] = elm_1;
                    buf[3 * start + 2] = elm_2;

                    if (i == 0)
                        nnCorr_0 = (dotPr - sLen * mu1_0 * mu2) / (sLen * sigma1_0 * sigma2);
                    else
                        nnCorr_1 = (dotPr - sLen * mu1_1 * mu2) / (sLen * sigma1_1 * sigma2);
                }
            }
            __syncthreads();

            // Slide the window over ts2, updating the running sums and dot products incrementally.
            for (k = 1; k < tsLen - sLen + 1; k++){
                term1 = ts2[k + sLen - 1];
                term2 = ts2[k - 1];
                s2 = s2 + term1 - term2;
                s2_2 = s2_2 + term1 * term1 - term2 * term2;
                mu2 = s2 / sLen;
                sigma2 = s2_2 / sLen > mu2 * mu2 ? sqrt(s2_2 / sLen - mu2 * mu2) : 1;

                for (i = 0; i < numIters; i++){
                    start = i * blockSize + tid;
                    if (start < tsLen - sLen + 1){
                        if (!start){
                            // First ts1 window: recover the dot product precomputed in buf_0.
                            elm_0 = buf_0[3 * k];
                            elm_1 = buf_0[3 * k + 1];
                            elm_2 = buf_0[3 * k + 2];
                            dotPr = elm_0 + (double)elm_1 / 1e8 + (double)elm_2 / 1e16;
                        }
                        else{
                            // Advance the neighbouring dot product by one sliding step.
                            elm_0 = buf[3 * (start - 1)];
                            elm_1 = buf[3 * (start - 1) + 1];
                            elm_2 = buf[3 * (start - 1) + 2];
                            term1 = ts1[start + sLen - 1];
                            term2 = ts2[k + sLen - 1];
                            dotPr = elm_0 + (double)elm_1 / 1e8 + (double)elm_2 / 1e16 + term1 * term2;
                            term1 = ts1[start - 1];
                            term2 = ts2[k - 1];
                            dotPr -= term1 * term2;
                        }
                        if (i == 0)
                            dotPr_0 = dotPr;
                        else
                            dotPr_1 = dotPr;
                    }
                    __syncthreads();
                }
                for (i = 0; i < numIters; i++){
                    start = i * blockSize + tid;
                    if (start < tsLen - sLen + 1){
                        if (i == 0)
                            dotPr = dotPr_0;
                        else
                            dotPr = dotPr_1;
                        decomposeDouble(dotPr, elm_0, elm_1, elm_2);
                        buf[3 * start] = elm_0;
                        buf[3 * start + 1] = elm_1;
                        buf[3 * start + 2] = elm_2;
                    }
                    __syncthreads();

                    if (i == 0)
                        nnCorr = nnCorr_0;
                    else
                        nnCorr = nnCorr_1;
                    if (start < tsLen - sLen + 1){
                        if (i == 0){
                            mu1 = mu1_0;
                            sigma1 = sigma1_0;
                        }
                        else{
                            mu1 = mu1_1;
                            sigma1 = sigma1_1;
                        }
                        // Keep the largest correlation seen so far.
                        corr = (dotPr - sLen * mu1 * mu2) / (sLen * sigma1 * sigma2);
                        if (corr > nnCorr)
                            nnCorr = corr;
                    }
                    if (i == 0)
                        nnCorr_0 = nnCorr;
                    else
                        nnCorr_1 = nnCorr;
                }
            }
            // Convert the best correlation to a distance and write it back to global memory.
            for (i = 0; i < numIters; i++){
                start = i * blockSize + tid;
                if (i == 0)
                    nnCorr = nnCorr_0;
                else
                    nnCorr = nnCorr_1;
                if (start < tsLen - sLen + 1){
                    shId = blockId * (tsLen - sLen + 1) + start;
                    idx = j * numSh + shId;
                    if (nnCorr > 1 || blockId == j)
                        nnDist = 0;
                    else
                        nnDist = sqrt(2 * (1 - nnCorr));
                    nnDistsComplete[nextSh * numTrain + idx] = nnDist;
                }
            }
        }
        __syncthreads();
        nextSh += numSh;
    }
}
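For context, this is roughly how I launch the kernel (a minimal sketch, not my exact code: the grid and block sizes and the device pointer names d_tss, d_nnDistsComplete and d_numShAll are placeholders, and the dynamic shared-memory size shown only accounts for the ts1/ts2/buf/buf_0 layout):

// Sketch of the launch configuration (placeholder sizes and pointer names).
// The dynamic shared-memory request has to cover the larger of the two layouts
// overlaid on "array"; the ts1/ts2/buf/buf_0 layout alone needs
// 2 * tsLen * sizeof(float) + 6 * tsLen * sizeof(int) bytes.
size_t smemBytes = 2 * tsLen * sizeof(float) + 6 * tsLen * sizeof(int);
dim3 grid(numTrain);   // placeholder grid size
dim3 block(256);       // placeholder block size
gCalcNNDists<<<grid, block, smemBytes>>>(d_tss, d_nnDistsComplete, d_numShAll,
    numTrain, numPLabeled, tsLen, numSLens, minSLen, maxSLen, sLenStep);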
I ran this kernel on a card with compute capability 5.0. I noticed that the write back to global memory, i.e. the line nnDistsComplete[nextSh * numTrain + idx] = nnDist;, has a large impact on the running time. In one test case the kernel took about 2 seconds without that store and around 25 seconds with it. I measured the effective bandwidth and it turned out to be extremely low. Is there a way to improve the efficiency of this write?
Apart from this, are there any other improvements I could make to further boost performance?
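For reference, this is roughly how I timed the kernel and estimated the effective write bandwidth (again a sketch; grid, block and smemBytes are the placeholders from the launch above, and totalWrites stands for the total number of doubles the kernel stores into nnDistsComplete):

// Time the kernel with CUDA events and derive an effective write bandwidth.
cudaEvent_t startEv, stopEv;
cudaEventCreate(&startEv);
cudaEventCreate(&stopEv);
cudaEventRecord(startEv);
gCalcNNDists<<<grid, block, smemBytes>>>(d_tss, d_nnDistsComplete, d_numShAll,
    numTrain, numPLabeled, tsLen, numSLens, minSLen, maxSLen, sLenStep);
cudaEventRecord(stopEv);
cudaEventSynchronize(stopEv);
float ms = 0.0f;
cudaEventElapsedTime(&ms, startEv, stopEv);
// totalWrites = number of doubles written to nnDistsComplete over the whole run (placeholder).
double gbPerSec = totalWrites * sizeof(double) / (ms * 1.0e6);
printf("kernel time: %.1f ms, effective write bandwidth: %.3f GB/s\n", ms, gbPerSec);
cudaEventDestroy(startEv);
cudaEventDestroy(stopEv);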
Thank you!