Hi Mat,
This is a continuation of my previous post :
The code is shown below. I am running this on Tesla K40c, Ubuntu 14.04
I run it as :
pgcc -acc fitzhugh.c
./a.out
Enter grid size :
I am trying to see how the performance of this code can be improved.
For a grid of size 1, cpu runs the code in 0.6s (pgcc fitzhugh.c; a.out ) whereas on the gpu, the same code takes ~56s.
Since the cpu is at ~3.1GHz and the cuda cores are at ~750MHz, it appears to me that it must be possible to speedup the gpu code to a large extent. Most of the time is spent in running the sequential routine.
Any pointers would be greatly helpful.
Thanks,
Krishna.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define N_EQ1 2
#define NP1 6
#define N_EQ3 (3*N_EQ1)
#define NP3 (3*NP1)
#define N_EQ4 (4*N_EQ1)
#define NP4 (4*NP1)
#define NC 6
#define COUPLING_THRESHOLD 0. //
#define THRESHOLD_SLOPE 100. //
#pragma acc routine seq
double boltzmann(const double V, const double V_0, const double k)
{
return 1./(1.+exp(-k*(V-V_0)));
}
#pragma acc routine seq
void derivs_one(double* y, double* dxdt, const double* p)
{
dxdt[0] = p[5]*(y[0]-y[0]*y[0]*y[0])-y[1]+p[0]; // x' = m (x-x^3)-y+I
dxdt[1] = p[1]*(boltzmann(y[0], p[2], p[3])-y[1]); // y' = e (Bfun(x, x_0, k_2) - y)
}
void integrate_one_rk4(double* y, const double* params, double* output, const double dt, const unsigned N, const unsigned stride)
{
unsigned i, j, k;
double dt2, dt6;
double y1[N_EQ1], y2[N_EQ1], k1[N_EQ1], k2[N_EQ1], k3[N_EQ1], k4[N_EQ1];
dt2 = dt/2.; dt6 = dt/6.;
for(j=0; j<N_EQ1; j++) output[j] = y[j];
for(i=1; i<N; i++)
{
for(j=0; j<stride; j++)
{
derivs_one(y, k1, params);
for(k=0; k<N_EQ1; k++) y1[k] = y[k]+k1[k]*dt2;
derivs_one(y1, k2, params);
for(k=0; k<N_EQ1; k++) y2[k] = y[k]+k2[k]*dt2;
derivs_one(y2, k3, params);
for(k=0; k<N_EQ1; k++) y2[k] = y[k]+k3[k]*dt;
derivs_one(y2, k4, params);
for(k=0; k<N_EQ1; k++) y[k] += dt6*(k1[k]+2.*(k2[k]+k3[k])+k4[k]);
}
for(j=0; j<N_EQ1; j++) output[N_EQ1*i+j] = y[j];
}
}
/*
This method nullifies the need to have derivs_three, derivs_four etc.
Computes the following derivatives for each neuron and also adds the input from other neurons :
x' = m (x-x^3)-y+I
y' = e (Bfun(x, x_0, k_2) - y)
n => number of neurons
y => array with structure x0,y0,x1,y1,x2,y2,...x{n-1},y{n-1} for the above equations
dydt => derivatives of y i.e., x0',y0',x1',y1',x2',y2'...x{n-1}',y{n-1}'
p => parameter array storing NP1 parameters for each neuron i.e., p[0] to p[NP1-1] for x0',y0' followed by
p[NP] to p[2*NP-1] for x1',y1' and so on
coupling_strengths => array of size n*n storing coupling strengths in the order (0->0),(0->1),(0->2) ... (0->n-1) followed by
(1->0),(1->1),(1->2),(1->3)...(1->n-1) and so on
xi'_coupled = xi' + (E0 - xi) * ( coupling_strength(0->i)*bm_factor(x0) + coupling_strength(1->i)*bm_factor(x1) + ... )
yi'_coupled = yi'
*/
#pragma acc routine seq
void derivs_n_variablecoupling(const unsigned n, double* y, double* dydt, const double* p, const double* coupling_strengths, double* bm_factor)
{
unsigned i, j;
for(i=0;i <n; i++){
bm_factor[i] = boltzmann(y[i*N_EQ1], COUPLING_THRESHOLD, THRESHOLD_SLOPE);
derivs_one(y+i*N_EQ1, dydt+i*N_EQ1, p+i*NP1);
}
for(i=0; i<n; i++)
{
double bm_factor_sum = 0;
for(j=0;j<n;j++){
bm_factor_sum += coupling_strengths[j*n+i]*bm_factor[j];
}
dydt[i*N_EQ1] += (p[4+i*NP1]-y[i*N_EQ1])*bm_factor_sum;
}
}
/*
This method finds the rk4 approximation of the curve for a system of 'n' neurons obeying the following equations :
x' = m (x-x^3)-y+I
y' = e (Bfun(x, x_0, k_2) - y)
networkSize => number of neurons
y => array with structure x0,y0,x1,y1,x2,y2,...x{n-1},y{n-1} for the above equations
dydt => derivatives of y i.e., x0',y0',x1',y1',x2',y2'...x{n-1}',y{n-1}'
p => parameter array storing NP1 parameters for each neuron i.e., p[0] to p[NP1-1] for x0',y0' followed by
p[NP] to p[2*NP-1] for x1',y1' and so on
coupling_strengths => array of size n*n storing coupling strengths in the order (0->0),(0->1),(0->2) ... (0->n-1) followed by
(1->0),(1->1),(1->2),(1->3)...(1->n-1) and so on
output => array of size n*N in the following order x0,x1,...x{n-1} representing initial point on the curve
followed by x0-1,x1-1,...x{n-1}-1 representing the next point and so on
dt => step size
N => number of points to compute
stride => number of strides to make while computing each of the N points
*/
void integrate_n_rk4(const unsigned networkSize, const double* initial_state, const double* p, const double* coupling_strengths, double* output,
const double dt, const unsigned N, const unsigned stride) {
unsigned i, j, m, point_dimension = networkSize*N_EQ1;
double current_state[point_dimension], temp_state[point_dimension], k1[point_dimension], k2[point_dimension], k3[point_dimension], k4[point_dimension];
double bm_factor[networkSize];
for(m=0; m<point_dimension; m++){
current_state[m]=initial_state[m];
if(m%2==0)output[m/2]=current_state[m];
//output[m] = current_state[m];
}
for(i=1; i<N; i++)
{
for(j=0; j<stride; j++)
{
derivs_n_variablecoupling(networkSize, current_state, k1, p, coupling_strengths,bm_factor);
for(m=0; m<point_dimension; m++)
temp_state[m] = current_state[m]+k1[m]*dt/2;
derivs_n_variablecoupling(networkSize, temp_state, k2, p, coupling_strengths,bm_factor);
for(m=0; m<point_dimension; m++)
temp_state[m] = current_state[m]+k2[m]*dt/2;
derivs_n_variablecoupling(networkSize, temp_state, k3, p, coupling_strengths,bm_factor);
for(m=0; m<point_dimension; m++)
temp_state[m] = current_state[m]+k3[m]*dt;
derivs_n_variablecoupling(networkSize, temp_state, k4, p, coupling_strengths,bm_factor);
for(m=0; m<point_dimension; m++)
current_state[m] += (k1[m]+2.*(k2[m]+k3[m])+k4[m])*dt/6;
}
for(m=0; m<point_dimension; m++){
if(m%2==0)output[i*point_dimension/2+m/2]=current_state[m];
//output[i*point_dimension+m] = current_state[m];
}
}
}
/*
do everything in integrate_n_rk4
also in the output for each time point 1 to N, check crossings for each neuron
for each neuron store all the crossings and also the number of crossings present
compute phase differences of all cells w.r.t the first cell
return phase differences
*/
#pragma acc routine seq
void integrate_n_rk4_phasedifferences(const unsigned networkSize, const double* initial_state, const double* p, const double* coupling_strengths,
double* output, double* phase_differences_output, unsigned* numberOfPhaseDifferences,
double* current_state, double* temp_state, double*k1, double*k2, double*k3, double*k4,
int* lastCrossings, int* numberOfCrossingsToEdit, double *bm_factor,
const double dt, const unsigned N, const unsigned stride, const unsigned initial_states_start) {
unsigned i, j, m, point_dimension = networkSize*N_EQ1;
int threshhold = 0, allCellsHadZeroPhaseBefore = 0, numberOfCrossingsUnedited = 0, phase_differences_output_index=0;
for(m=0; m<point_dimension; m++){
current_state[m]=initial_state[m+initial_states_start];
if(m%2==0) {
output[m/2]=current_state[m];
lastCrossings[m/2]=N+1;
numberOfCrossingsToEdit[m/2] = 0;
}
}
for(i=1; i<N; i++)
{
for(j=0; j<stride; j++)
{
derivs_n_variablecoupling(networkSize, current_state, k1, p, coupling_strengths,bm_factor);
for(m=0; m<point_dimension; m++)
temp_state[m] = current_state[m]+k1[m]*dt/2;
derivs_n_variablecoupling(networkSize, temp_state, k2, p, coupling_strengths,bm_factor);
for(m=0; m<point_dimension; m++)
temp_state[m] = current_state[m]+k2[m]*dt/2;
derivs_n_variablecoupling(networkSize, temp_state, k3, p, coupling_strengths,bm_factor);
for(m=0; m<point_dimension; m++)
temp_state[m] = current_state[m]+k3[m]*dt;
derivs_n_variablecoupling(networkSize, temp_state, k4, p, coupling_strengths,bm_factor);
for(m=0; m<point_dimension; m++)
current_state[m] += (k1[m]+2.*(k2[m]+k3[m])+k4[m])*dt/6;
}
for(m=0; m<point_dimension; m++){
if(m%2==0) {
//Outputs only voltages of each cell.
output[i*point_dimension/2+m/2]=current_state[m];
// computing the crossings here itself
if(m==0){ // first cell
if(current_state[m]>=threshhold && output[(i-1)*point_dimension/2+m/2] < threshhold){
//implies a crossing in the first cell
if(!allCellsHadZeroPhaseBefore){
int allCellsHadZeroPhaseBefore1 = 1;
for(j=1;j<point_dimension/2;j++){
if(i<lastCrossings[j]) {
allCellsHadZeroPhaseBefore1=0;
}
}
allCellsHadZeroPhaseBefore = allCellsHadZeroPhaseBefore1;
}
if(allCellsHadZeroPhaseBefore){
for(j=1;j<point_dimension/2;j++){
phase_differences_output[phase_differences_output_index*(point_dimension/2 -1) + j-1] = i - lastCrossings[j];
numberOfCrossingsToEdit[j]=numberOfCrossingsToEdit[j]+1;
}
phase_differences_output_index++;
}
}
} else{ // cells other than first cell
if(current_state[m]>=threshhold && output[(i-1)*point_dimension/2+m/2] < threshhold){
if(!allCellsHadZeroPhaseBefore){
lastCrossings[m/2]=i;
}else{
j=numberOfCrossingsToEdit[m/2];
while(j>0){
phase_differences_output[(phase_differences_output_index-j)*(point_dimension/2-1)+m/2-1]/=i-lastCrossings[m/2];
j--;
}
lastCrossings[m/2]=i;
numberOfCrossingsToEdit[m/2]=0;
}
}
}
}
}
}
for(m=0; m<point_dimension; m++){
if(m%2==0) {
if(numberOfCrossingsToEdit[m/2]>numberOfCrossingsUnedited){
numberOfCrossingsUnedited = numberOfCrossingsToEdit[m/2];
}
}
}
//number of the phase differences in the array
*numberOfPhaseDifferences = phase_differences_output_index - numberOfCrossingsUnedited;
}
/*
do everything in integrate_n_rk4_phasedifferences
just doesn't need output array to be returned
*/
#pragma acc routine seq
void integrate_n_rk4_phasedifferences_nooutput(const unsigned networkSize, const double* initial_state, const double* p, const double* coupling_strengths,
double* output, double* phase_differences_output, unsigned* numberOfPhaseDifferences,
double* current_state, double* temp_state, double*k1, double*k2, double*k3, double*k4,
int* lastCrossings, int* numberOfCrossingsToEdit, double *bm_factor,
const double dt, const unsigned N, const unsigned stride, const unsigned initial_states_start) {
unsigned i, j, m, point_dimension = networkSize*N_EQ1;
int threshhold = 0, allCellsHadZeroPhaseBefore = 0, numberOfCrossingsUnedited = 0, phase_differences_output_index=0;
for(m=0; m<point_dimension; m++){
current_state[m]=initial_state[m+initial_states_start];
if(m%2==0) {
output[m/2]=current_state[m];
lastCrossings[m/2]=N+1;
numberOfCrossingsToEdit[m/2] = 0;
}
}
for(i=1; i<N; i++)
{
for(j=0; j<stride; j++)
{
derivs_n_variablecoupling(networkSize, current_state, k1, p, coupling_strengths,bm_factor);
for(m=0; m<point_dimension; m++)
temp_state[m] = current_state[m]+k1[m]*dt/2;
derivs_n_variablecoupling(networkSize, temp_state, k2, p, coupling_strengths,bm_factor);
for(m=0; m<point_dimension; m++)
temp_state[m] = current_state[m]+k2[m]*dt/2;
derivs_n_variablecoupling(networkSize, temp_state, k3, p, coupling_strengths,bm_factor);
for(m=0; m<point_dimension; m++)
temp_state[m] = current_state[m]+k3[m]*dt;
derivs_n_variablecoupling(networkSize, temp_state, k4, p, coupling_strengths,bm_factor);
for(m=0; m<point_dimension; m++)
current_state[m] += (k1[m]+2.*(k2[m]+k3[m])+k4[m])*dt/6;
}
for(m=0; m<point_dimension; m++){
if(m%2==0) {
//Outputs only voltages of each cell.
output[(i%2)*point_dimension/2+m/2]=current_state[m];
// computing the crossings here itself
if(m==0){ // first cell
if(current_state[m]>=threshhold && output[((i-1)%2)*point_dimension/2+m/2] < threshhold){
//implies a crossing in the first cell
if(!allCellsHadZeroPhaseBefore){
int allCellsHadZeroPhaseBefore1 = 1;
for(j=1;j<point_dimension/2;j++){
if(i<lastCrossings[j]) {
allCellsHadZeroPhaseBefore1=0;
}
}
allCellsHadZeroPhaseBefore = allCellsHadZeroPhaseBefore1;
}
if(allCellsHadZeroPhaseBefore){
for(j=1;j<point_dimension/2;j++){
phase_differences_output[phase_differences_output_index*(point_dimension/2 -1) + j-1] = i - lastCrossings[j];
numberOfCrossingsToEdit[j]=numberOfCrossingsToEdit[j]+1;
}
phase_differences_output_index++;
}
}
} else{ // cells other than first cell
if(current_state[m]>=threshhold && output[((i-1)%2)*point_dimension/2+m/2] < threshhold){
if(!allCellsHadZeroPhaseBefore){
lastCrossings[m/2]=i;
}else{
j=numberOfCrossingsToEdit[m/2];
while(j>0){
phase_differences_output[(phase_differences_output_index-j)*(point_dimension/2-1)+m/2-1]/=i-lastCrossings[m/2];
j--;
}
lastCrossings[m/2]=i;
numberOfCrossingsToEdit[m/2]=0;
}
}
}
}
}
}
for(m=0; m<point_dimension; m++){
if(m%2==0) {
if(numberOfCrossingsToEdit[m/2]>numberOfCrossingsUnedited){
numberOfCrossingsUnedited = numberOfCrossingsToEdit[m/2];
}
}
}
//number of the phase differences in the array
*numberOfPhaseDifferences = phase_differences_output_index - numberOfCrossingsUnedited;
}
/*
for an array of initial conditions, do everything in integrate_n_rk4_phasedifferences and return just the last point in phase differences trajectory for each initial condition
*/
void sweeptraces(const long networkSize, const double* initial_states_array, const double* p, const double* coupling_strengths,
double* trajectory_targets_phasedifferences,
const long noOfInitialStates, const double dt, const long N, const long stride) {
int l,m, point_dimension = networkSize*N_EQ1;
//double output[N*networkSize]; not working when 'output' copied privately since size exceeds 2.3GB for all threads together; As of pgcc v15.5
//double output[N*networkSize*noOfInitialStates];
//experimenting with stateless computations; not using full output array since we are only using phase_difference array; also making it private since
// size won't be greater than 2.3GB anymore
double output[2*networkSize];
//double phase_differences_output[(networkSize-1)*(N/2+1)];
double phase_differences_output[(networkSize-1)*(N/2+1)*noOfInitialStates];
unsigned numberOfPhaseDifferences=0;
double current_state[point_dimension], temp_state[point_dimension], k1[point_dimension], k2[point_dimension], k3[point_dimension], k4[point_dimension];
int lastCrossings[point_dimension/2], numberOfCrossingsToEdit[point_dimension/2];
double bm_factor[networkSize];
struct timeval start_time, stop_time, elapsed_time;
gettimeofday(&start_time,NULL);
/*
In order to avoid memory issues, the number of threads, regulated here by num_gangs(16) vector_length(128), could also be dynamic, based on the size of N*networkSize*noOfInitialStates .
Also, for very large resolutions, say 150x150, rather than clogging memory and thereby slowing down everything, it is instead better to run in multiple batches of say, 50x50 -
something that makes a reasonable use of GPU memory.
*/
/*#pragma omp parallel for \
private(phase_differences_output,numberOfPhaseDifferences),\
private(current_state,temp_state,k1,k2,k3,k4,lastCrossings,numberOfCrossingsToEdit),\
private(bm_factor)*/
//copy(output)
/*
private(output) ; not working for >2.3GB
num_gangs(16) vector_length(128) ; Can set the maximum no.of parallel threads with this; Could adopt dynamically based on sizes but not worth the wait.
Rather, if memory requirement's too great to be fulfilled, split the input array and run in batches
private(phase_differences_output,numberOfPhaseDifferences),\
*/
#pragma acc parallel loop independent ,\
pcopyin(initial_states_array[0:point_dimension*noOfInitialStates],p[0:6*networkSize],coupling_strengths[0:networkSize*networkSize]), \
copy(trajectory_targets_phasedifferences[0:(networkSize-1)*noOfInitialStates]), \
private(current_state,temp_state,k1,k2,k3,k4,lastCrossings,numberOfCrossingsToEdit),\
private(bm_factor),\
private(output,numberOfPhaseDifferences),\
copy(phase_differences_output)
for (l=0;l<noOfInitialStates;l++){
/*
integrate_n_rk4_phasedifferences_nooutput(networkSize,initial_states_array, p, coupling_strengths,
output+l*N*networkSize, phase_differences_output, &numberOfPhaseDifferences,
current_state,temp_state,k1,k2,k3,k4,
lastCrossings,numberOfCrossingsToEdit,bm_factor,
dt, N, stride, l*point_dimension);
*/
integrate_n_rk4_phasedifferences_nooutput(networkSize,initial_states_array, p, coupling_strengths,
output, phase_differences_output+l*(networkSize-1)*(N/2+1), &numberOfPhaseDifferences,
current_state,temp_state,k1,k2,k3,k4,
lastCrossings,numberOfCrossingsToEdit,bm_factor,
dt, N, stride, l*point_dimension);
for(m=0;m<networkSize-1;m++){
if(numberOfPhaseDifferences>0)
trajectory_targets_phasedifferences[l*(networkSize-1)+m]=
phase_differences_output[(numberOfPhaseDifferences-1)*(networkSize-1)+m];
}
}
gettimeofday(&stop_time,NULL);
timersub(&stop_time, &start_time, &elapsed_time); // Unix time subtract routine
printf("Total time was %f seconds.\n", elapsed_time.tv_sec+elapsed_time.tv_usec/1000000.0);
}
int main(){
// test case for 4 cells
double *initial_states_array;
double p[] = {0.4,0.3,0.,10.,-1.5,1.,0.4,0.3,0.,10.,-1.5,1.,0.4,0.3,0.,10.,-1.5,1.,0.4,0.3,0.,10.,-1.5,1.};
double coupling_strengths[] = {0.,0.001,0.001,0.001,0.001,0.,0.001,0.,0.001,0.001,0.,0.,0.001,0.,0.,0.};
double *trajectory_targets_phasedifferences = (double*)malloc(3*200*200*sizeof(double));
void *ptr ;
int i,j,n;
printf("Enter size of grid\n");
scanf("%d",&n);
ptr = malloc(8*n*n*sizeof(double));
if(!ptr)
{
printf("malloc returns NULL\n");
return 1;
}
initial_states_array = (double*)ptr;
double initial_state[] = {-0.026541128437339426, 0.07017957749966305, -0.026541128437339426, 0.07017957749966305, -0.026541128437339426, 0.07017957749966305, -0.026541128437339426, 0.07017957749966305};
for(i=0;i<n*n;i++){
for(j=0;j<8;j++){
initial_states_array[i*8+j]=initial_state[j];
}
}
sweeptraces(4, initial_states_array, p, coupling_strengths,trajectory_targets_phasedifferences,n*n,
(double)3/1000, 21275, 30);
}