CUDA C kernel implementation design for a big function

After initial gprof, I have found the following function in our protein alignment biological C++ tool code as taking most time overall:

void assignAlignedResiduePairs( vector< Atom * > vStruct1, vector< Atom * > vStruct2, int align_mode ){
    int n_1 = vStruct1.size();    // vStruct1.size() >= vStruct2.size();
    int n_2 = vStruct2.size();
    int n;

    if( n_1 >= n_2 )
        n = n_1;
    else
        n = n_2;

    float d_2 = 0;
    float d0_2 = d0 * d0;

    for( int i = 1 ; i <= n_1 ; i++ ){
        for( int j = 1 ; j <= n_2 ; j++ ){
            cost[i][j] = 2;
        }
    }

    if( align_mode == 1 ){
        for( int i = 0 ; i < n_1 ; i++ ){
            for( int j = 0 ; j < n_2 ; j++ ){
                d_2 = pow( vStruct1[i]->R[0] - vStruct2[j]->R_sup[0], 2 ) + pow( vStruct1[i]->R[1] - vStruct2[j]->R_sup[1], 2 ) + pow( vStruct1[i]->R[2] - vStruct2[j]->R_sup[2], 2 );
                score[i+1][j+1] = 1 / ( 1 + d_2 / d0_2 );               
            }
        }
    }   
    else if( align_mode == 2 ){
        for( int i = 0 ; i < n_1 ; i++ ){
            for( int j = 0 ; j < n_2 ; j++ ){
                d_2 = pow( vStruct1[i]->R_sup[0] - vStruct2[j]->R[0], 2 ) + pow( vStruct1[i]->R_sup[1] - vStruct2[j]->R[1], 2 ) + pow( vStruct1[i]->R_sup[2] - vStruct2[j]->R[2], 2 );
                score[i+1][j+1] = 1 / ( 1 + d_2 / d0_2 );               
            }
        }
    }


    for( int i = 1 ; i <= n_1 ; i++ ){
        for( int j = 1 ; j <= n_2 ; j++ ){
            cost[i][j] = 2 - score[i][j];
        }
    }


    // **********************************************
    //   solve Linear Sum Assignment Problem (LSAP)
    // **********************************************

    double eps = 1E-10;
    double sup = 1E+9;
    double cc, d, ui, vgl, vj, z;
    double dminus[3000];
    double dplus[3000];
    double ys[3000];
    double yt[3000];
    int i, j, u, w, ind, index, j0;
    int zeile[3000];
    int vor[3000];
    int label[3000];


    // construct an initial partial assignment
    for( i = 1 ; i <= n ; i++ ){
        zeile[i] = 0;
        spalte[i] = 0;
        vor[i] = 0;
        ys[i] = 0.0E+00;
        yt[i] = 0.0E+00;
    }

    for( i = 1 ; i <= n_2 ; i++ ){
        for( j = 1 ; j <= n_1 ; j++ ){
            cc = cost[j][i];

            if( j != 1 ){
                if( ( cc - ui ) >= eps ){
                }
                else{
                    ui = cc;
                    j0 = j;
                }
            }
            else{
                ui = cc;
                j0 = j;
            }
        }   

        ys[i] = ui;

        if( zeile[j0] == 0 ){
            zeile[j0] = i;
            spalte[i] = j0;
        }
    }

    for( j = 1 ; j <= n_1 ; j++ ){
        yt[j] = 0;

        if( zeile[j] == 0 ){
            yt[j] = sup; 
        }
    }


    for( i = 1 ; i <= n_2 ; i++ ){
        ui = ys[i];

        for( j = 1 ; j <= n_1 ; j++ ){
            vj = yt[j];

            if( eps < vj ){
                cc = cost[j][i] - ui;

                if( cc + eps < vj ){                
                    yt[j] = cc;
                    vor[j] = i;
                }
            }
        }
    }


    for( j = 1 ; j <= n_1 ; j++ ){
        i = vor[j];

        if( i != 0 ){
            if( spalte[i] == 0 ){
                spalte[i] = j;
                zeile[j] = i;
            }
        }       
    }


    for( i = 1 ; i <= n_2 ; i++ ){
        if( spalte[i] == 0 ){
            ui = ys[i];

            for( j = 1 ; j <= n_1 ; j++ ){
                if ( zeile[j] == 0 ){
                    cc = cost[j][i];

                    if ( cc - ui - yt[j] <= -eps ){
                        spalte[i] = j;
                        zeile[j] = i;
                    } 
                }
            }       
        }
    }


    for( u = 1 ; u <= n_2 ; u++ ){
        if( spalte[u] == 0 ){
            for( i = 1 ; i <= n_1 ; i++ ){
                vor[i] = u;
                label[i] = 0;
                dplus[i] = sup;
                dminus[i] = cost[i][u] - ys[u] - yt[i];
            }

            dplus[u] = 0.0E+00;

            while( 1 ){
                d = sup;
                index = 0;

                for( i = 1 ; i <= n_1 ; i++ ){
                    if( label[i] == 0 ){
                        if( dminus[i] + eps < d ){
                            d = dminus[i];
                            index = i;
                        }
                    }
                }

                if( index == 0 ){
                    //cout << "LSAPR - Fatal error! No unlabeled node with DMINUS" << "\n";
                    return;
                }

                if( zeile[index] <= 0 ){
                    while( 1 ){
                        w = vor[index];
                        zeile[index] = w;
                        ind = spalte[w];
                        spalte[w] = index;

                        if( w != u ){
                            index = ind;
                        }
                        else{
                            break;
                        }
                    }

                    for( i = 1 ; i <= n_2 ; i++ ){
                        if( dplus[i] != sup ){
                            ys[i] = ys[i] + d - dplus[i];
                        }

                        if( dminus[i] + eps < d ){
                            yt[i] = yt[i] + dminus[i] - d;
                        }
                    }

                    break;
                }
                else{
                    label[index] = 1;
                    w = zeile[index];
                    dplus[w] = d;

                    for( i = 1 ; i <= n_1 ; i++ ){
                        if( label[i] == 0 ){
                            vgl = d + cost[i][w] - ys[w] - yt[i];

                            if( vgl + eps < dminus[i] ){
                                dminus[i] = vgl;
                                vor[i] = w;
                            }
                        }
                    }
                }                           
            }
        }
    }   
}

I am having trouble in deciding how to approach this function for kernel implementation. This function has several kernel implementation opportunities. Since this sequential function has data dependencies on the previous operations, I’m also fearing that parallel execution in some part might jeopardize the accuracy. I have made several other posts on StackOverflow on this function based on my different attempts but no one seems to respond. I will appreciate if someone can help me with this function with kernel design decisions, or at least give me a direction/reference. I have read “Programming Massively Parallel Processors: A Hands-on Approach” by David Kirk and Wen-mei Hwu. As a beginner, while I have a basic to moderate experience with the implementations and other examples, this is really getting challenging for me to decide on an approach for the function that will preserve accuracy. I don’t know of any other site or group to discuss massively parallel programming other than this site. If you cannot give an answer, please at least direct me to forums or resources that are helpful for new CUDA developers to get hands-on help. Thank you for your support in advance!

Well the while(1) on line 165 and 184 is a bad start, I will take a look later

Thank you. I will appreciate your review and time very much.