Hi,
I have written this kind of code to update the seismic sources in a finite-difference (FD) seismic propagation code. The code is mostly generated by a script and uses a lot of ‘if’ statements.
I’m not asking here whether this kind of code is smart; that’s not my problem for the moment. My problem is that I don’t have enough resources to run the code on my 8800GTX.
What surprises me is the number of registers used by the compiler:
ptxas info : Used 52 registers, 308+280 bytes lmem, 100+96 bytes smem, 2064 bytes cmem[0], 80 bytes cmem[1]
Here is the code (the c_* arrays are in constant memory):
[codebox]global void updateSources( int nbsources, int it, float dt, float* d_fx, float* d_fy, float* d_fz,
int sizex, int sizey, int sizez, int pitchx, int sizex_tab, int sizey_tab, int sizez_tab, int pitchx_tab,
int xshift, int yshift, int zshift)
{
// Adds the contribution of every active source to the force arrays d_fx, d_fy and d_fz.
// Each thread takes one (i, j) pair from the x/y block and thread indices, then walks every
// k in [0, sizez) for every source index in [0, nbsources).
// A source is applied only when c_insrc[is] == 1. Its cell (isource, jsource, ksource) is
// c_ixhypo/c_iyhypo/c_izhypo shifted by xshift/yshift/zshift. Only cells whose index offsets
// from that cell lie in -1..+2 along each axis can be written; every other cell is left untouched.
// mo is c_vel[is][it] * dt; pxx..pxz come from the rad* helpers (defined outside this block)
// evaluated on the source's strike, dip and rake.
// NOTE(review): `global` is presumably `__global__` with the underscores eaten by the forum markup - confirm.
// NOTE(review): sizex, sizey, pitchx, sizex_tab and sizez_tab are never read in this body.
// NOTE(review): there is no range check on i, j or offset here; presumably the launch grid and the
// source positions keep every write inside the arrays - verify against the caller.
// NOTE(review): literals such as 0.125 and 1.0 carry no f suffix, so they are double constants in C/C++.
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
for (int is = 0; is<nbsources; is++) {
for (int k = 0; k < sizez; k++) {
// Sources whose c_insrc flag is not 1 are skipped entirely.
if ( c_insrc[is] == 1 ){
// None of the per-source values below depend on k, although they are evaluated inside the k loop.
int isource = c_ixhypo[is] - xshift;
int jsource = c_iyhypo[is] - yshift;
int ksource = c_izhypo[is] - zshift;
float x_weight = c_xweight[is];
float y_weight = c_yweight[is];
float z_weight = c_zweight[is];
float mo = c_vel[is][it] * dt;
float pxx = radxx(c_strike[is], c_dip[is], c_rake[is]);
float pyy = radyy(c_strike[is], c_dip[is], c_rake[is]);
float pzz = radzz(c_strike[is], c_dip[is], c_rake[is]);
float pxy = radxy(c_strike[is], c_dip[is], c_rake[is]);
float pyz = radyz(c_strike[is], c_dip[is], c_rake[is]);
float pxz = radxz(c_strike[is], c_dip[is], c_rake[is]);
// Linear index of cell (i, j, k): consecutive j are pitchx_tab elements apart,
// consecutive k are pitchx_tab*sizey_tab elements apart.
int offset = k*(pitchx_tab)*(sizey_tab) + j*(pitchx_tab) + i;
// ---- Cells with i == isource-1 (j from jsource-1 to jsource+2) ----
if (i == isource-1) {
if (j == jsource-1) {
if (k == ksource-1) {
d_fy[offset] -= (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)));
d_fz[offset] -= (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)));
}
else if (k == ksource) {
d_fy[offset] -= (0.5 * mo * pxy * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pyy * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight));
d_fz[offset] -= (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight));
}
else if (k == ksource+1) {
d_fy[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pxy * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.5 * mo * pyy * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight));
d_fz[offset] -= (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight));
}
else if (k == ksource+2) {
d_fy[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight));
}
}
else if (j == jsource) {
if (k == ksource-1) {
d_fy[offset] -= (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)));
d_fz[offset] -= (0.5 * mo * pxz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pzz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)));
}
else if (k == ksource) {
d_fx[offset] -= (0.5 * mo * pxx * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)));
d_fy[offset] -= (0.5 * mo * pxy * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pyy * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pxy * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.5 * mo * pyy * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight));
d_fz[offset] -= (0.5 * mo * pxz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pzz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.5 * mo * pxz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.5 * mo * pzz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight));
}
else if (k == ksource+1) {
d_fx[offset] -= (0.5 * mo * pxx * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight));
d_fy[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))+(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.5 * mo * pxy * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))+(0.5 * mo * pyy * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.5 * mo * pxy * ((1.0 - x_weight)*y_weight*z_weight))-(0.5 * mo * pyy * ((1.0 - x_weight)*y_weight*z_weight));
d_fz[offset] -= (0.5 * mo * pxz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))+(0.5 * mo * pzz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight));
}
else if (k == ksource+2) {
d_fy[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))+(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight));
}
}
else if (j == jsource+1) {
if (k == ksource-1) {
d_fy[offset] -= (0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)));
d_fz[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pxz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.5 * mo * pzz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)));
}
else if (k == ksource) {
d_fx[offset] -= (0.5 * mo * pxx * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)));
d_fy[offset] -= (0.5 * mo * pxy * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))+(0.5 * mo * pyy * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight));
d_fz[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pxz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))+(0.5 * mo * pzz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))+(0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.5 * mo * pxz * ((1.0 - x_weight)*y_weight*z_weight))-(0.5 * mo * pzz * ((1.0 - x_weight)*y_weight*z_weight));
}
else if (k == ksource+1) {
d_fx[offset] -= (0.5 * mo * pxx * ((1.0 - x_weight)*y_weight*z_weight));
d_fy[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.5 * mo * pxy * ((1.0 - x_weight)*y_weight*z_weight))+(0.5 * mo * pyy * ((1.0 - x_weight)*y_weight*z_weight));
d_fz[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.5 * mo * pxz * ((1.0 - x_weight)*y_weight*z_weight))+(0.5 * mo * pzz * ((1.0 - x_weight)*y_weight*z_weight));
}
else if (k == ksource+2) {
d_fy[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight));
}
}
else if (j == jsource+2) {
if (k == ksource-1) {
d_fz[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)));
}
else if (k == ksource) {
d_fz[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))+(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight));
}
else if (k == ksource+1) {
d_fz[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight));
}
}
}
// ---- Cells with i == isource (j from jsource-1 to jsource+2) ----
else if (i == isource) {
if (j == jsource-1) {
if (k == ksource-1) {
d_fy[offset] -= (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)));
d_fz[offset] -= (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)));
}
else if (k == ksource) {
d_fx[offset] -= (0.5 * mo * pxy * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)));
d_fy[offset] += (0.5 * mo * pxy * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pyy * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pxy * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pyy * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight));
d_fz[offset] -= (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight));
}
else if (k == ksource+1) {
d_fx[offset] -= (0.5 * mo * pxy * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight));
d_fy[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))+(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pxy * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.5 * mo * pyy * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.5 * mo * pxy * (x_weight*(1.0 - y_weight)*z_weight))-(0.5 * mo * pyy * (x_weight*(1.0 - y_weight)*z_weight));
d_fz[offset] -= (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight));
}
else if (k == ksource+2) {
d_fy[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))+(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight));
}
}
else if (j == jsource) {
if (k == ksource-1) {
d_fx[offset] -= (0.5 * mo * pxz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)));
d_fy[offset] -= (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)));
d_fz[offset] += (0.5 * mo * pxz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pzz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pxz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pzz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)));
}
else if (k == ksource) {
d_fx[offset] -= (0.5 * mo * pxx * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pxy * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.5 * mo * pxz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight));
d_fy[offset] += (0.5 * mo * pxy * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pyy * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pxy * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pyy * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pxy * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.5 * mo * pyy * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.5 * mo * pxy * (x_weight*y_weight*(1.0 - z_weight)))-(0.5 * mo * pyy * (x_weight*y_weight*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight))-(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
d_fz[offset] += (0.5 * mo * pxz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pzz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pxz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pzz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)))+(0.5 * mo * pxz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.5 * mo * pzz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.5 * mo * pxz * (x_weight*(1.0 - y_weight)*z_weight))-(0.5 * mo * pzz * (x_weight*(1.0 - y_weight)*z_weight))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight))-(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
}
else if (k == ksource+1) {
d_fx[offset] += (0.5 * mo * pxz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pxx * (x_weight*(1.0 - y_weight)*z_weight))-(0.5 * mo * pxy * ((1.0 - x_weight)*y_weight*z_weight));
d_fy[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))+(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))+(0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)))+(0.5 * mo * pxy * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))+(0.5 * mo * pyy * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.5 * mo * pxy * (x_weight*(1.0 - y_weight)*z_weight))+(0.5 * mo * pyy * (x_weight*(1.0 - y_weight)*z_weight))+(0.5 * mo * pxy * ((1.0 - x_weight)*y_weight*z_weight))-(0.5 * mo * pyy * ((1.0 - x_weight)*y_weight*z_weight))-(0.5 * mo * pxy * (x_weight*y_weight*z_weight))-(0.5 * mo * pyy * (x_weight*y_weight*z_weight));
d_fz[offset] += (0.5 * mo * pxz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))+(0.5 * mo * pzz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.5 * mo * pxz * (x_weight*(1.0 - y_weight)*z_weight))+(0.5 * mo * pzz * (x_weight*(1.0 - y_weight)*z_weight))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight))-(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
}
else if (k == ksource+2) {
d_fx[offset] += (0.5 * mo * pxz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight));
d_fy[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))+(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight))+(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight))+(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
}
}
else if (j == jsource+1) {
if (k == ksource-1) {
d_fx[offset] -= (0.5 * mo * pxz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)));
d_fy[offset] -= (0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)));
d_fz[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))+(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pxz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.5 * mo * pzz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.5 * mo * pxz * (x_weight*y_weight*(1.0 - z_weight)))-(0.5 * mo * pzz * (x_weight*y_weight*(1.0 - z_weight)));
}
else if (k == ksource) {
d_fx[offset] += (0.5 * mo * pxy * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pxx * (x_weight*y_weight*(1.0 - z_weight)))-(0.5 * mo * pxz * ((1.0 - x_weight)*y_weight*z_weight));
d_fy[offset] += (0.5 * mo * pxy * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))+(0.5 * mo * pyy * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.5 * mo * pxy * (x_weight*y_weight*(1.0 - z_weight)))+(0.5 * mo * pyy * (x_weight*y_weight*(1.0 - z_weight)))-(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight))-(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
d_fz[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))+(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pxz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))+(0.5 * mo * pzz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.5 * mo * pxz * (x_weight*y_weight*(1.0 - z_weight)))+(0.5 * mo * pzz * (x_weight*y_weight*(1.0 - z_weight)))+(0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))+(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight))+(0.5 * mo * pxz * ((1.0 - x_weight)*y_weight*z_weight))-(0.5 * mo * pzz * ((1.0 - x_weight)*y_weight*z_weight))-(0.5 * mo * pxz * (x_weight*y_weight*z_weight))-(0.5 * mo * pzz * (x_weight*y_weight*z_weight));
}
else if (k == ksource+1) {
d_fx[offset] += (0.5 * mo * pxz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))+(0.5 * mo * pxy * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.5 * mo * pxx * (x_weight*y_weight*z_weight));
d_fy[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))+(0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)))+(0.5 * mo * pxy * ((1.0 - x_weight)*y_weight*z_weight))+(0.5 * mo * pyy * ((1.0 - x_weight)*y_weight*z_weight))-(0.5 * mo * pxy * (x_weight*y_weight*z_weight))+(0.5 * mo * pyy * (x_weight*y_weight*z_weight));
d_fz[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))+(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight))+(0.5 * mo * pxz * ((1.0 - x_weight)*y_weight*z_weight))+(0.5 * mo * pzz * ((1.0 - x_weight)*y_weight*z_weight))-(0.5 * mo * pxz * (x_weight*y_weight*z_weight))+(0.5 * mo * pzz * (x_weight*y_weight*z_weight));
}
else if (k == ksource+2) {
d_fx[offset] += (0.5 * mo * pxz * ((1.0 - x_weight)*y_weight*z_weight));
d_fy[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight))+(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
}
}
else if (j == jsource+2) {
if (k == ksource-1) {
d_fz[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))+(0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)));
}
else if (k == ksource) {
d_fx[offset] += (0.5 * mo * pxy * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)));
d_fz[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))+(0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)))+(0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight))+(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
}
else if (k == ksource+1) {
d_fx[offset] += (0.5 * mo * pxy * ((1.0 - x_weight)*y_weight*z_weight));
d_fz[offset] += (0.125 * mo * pyz * ((1.0 - x_weight)*y_weight*z_weight))+(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
}
}
}
// ---- Cells with i == isource+1 (j from jsource-1 to jsource+2) ----
else if (i == isource+1) {
if (j == jsource-1) {
if (k == ksource-1) {
d_fy[offset] -= (0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)));
d_fz[offset] -= (0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)));
}
else if (k == ksource) {
d_fx[offset] -= (0.5 * mo * pxy * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)));
d_fy[offset] += (0.5 * mo * pxy * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pyy * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight));
d_fz[offset] -= (0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight));
}
else if (k == ksource+1) {
d_fx[offset] -= (0.5 * mo * pxy * (x_weight*(1.0 - y_weight)*z_weight));
d_fy[offset] += (0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pxy * (x_weight*(1.0 - y_weight)*z_weight))-(0.5 * mo * pyy * (x_weight*(1.0 - y_weight)*z_weight));
d_fz[offset] -= (0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight));
}
else if (k == ksource+2) {
d_fy[offset] += (0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight));
}
}
else if (j == jsource) {
if (k == ksource-1) {
d_fx[offset] -= (0.5 * mo * pxz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)));
d_fy[offset] -= (0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)));
d_fz[offset] += (0.5 * mo * pxz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pzz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)));
}
else if (k == ksource) {
d_fx[offset] += (0.5 * mo * pxx * ((1.0 - x_weight)*(1.0 - y_weight)*(1.0 - z_weight)))-(0.5 * mo * pxy * (x_weight*y_weight*(1.0 - z_weight)))-(0.5 * mo * pxz * (x_weight*(1.0 - y_weight)*z_weight));
d_fy[offset] += (0.5 * mo * pxy * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pyy * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pxy * (x_weight*y_weight*(1.0 - z_weight)))-(0.5 * mo * pyy * (x_weight*y_weight*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight))-(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
d_fz[offset] += (0.5 * mo * pxz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pzz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)))+(0.5 * mo * pxz * (x_weight*(1.0 - y_weight)*z_weight))-(0.5 * mo * pzz * (x_weight*(1.0 - y_weight)*z_weight))-(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
}
else if (k == ksource+1) {
d_fx[offset] += (0.5 * mo * pxz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pxx * ((1.0 - x_weight)*(1.0 - y_weight)*z_weight))-(0.5 * mo * pxy * (x_weight*y_weight*z_weight));
d_fy[offset] += (0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)))+(0.5 * mo * pxy * (x_weight*(1.0 - y_weight)*z_weight))+(0.5 * mo * pyy * (x_weight*(1.0 - y_weight)*z_weight))+(0.5 * mo * pxy * (x_weight*y_weight*z_weight))-(0.5 * mo * pyy * (x_weight*y_weight*z_weight));
d_fz[offset] += (0.5 * mo * pxz * (x_weight*(1.0 - y_weight)*z_weight))+(0.5 * mo * pzz * (x_weight*(1.0 - y_weight)*z_weight))-(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
}
else if (k == ksource+2) {
d_fx[offset] += (0.5 * mo * pxz * (x_weight*(1.0 - y_weight)*z_weight));
d_fy[offset] += (0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight))+(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
}
}
else if (j == jsource+1) {
if (k == ksource-1) {
d_fx[offset] -= (0.5 * mo * pxz * (x_weight*y_weight*(1.0 - z_weight)));
d_fy[offset] -= (0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)));
d_fz[offset] += (0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pxz * (x_weight*y_weight*(1.0 - z_weight)))-(0.5 * mo * pzz * (x_weight*y_weight*(1.0 - z_weight)));
}
else if (k == ksource) {
d_fx[offset] += (0.5 * mo * pxy * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pxx * ((1.0 - x_weight)*y_weight*(1.0 - z_weight)))-(0.5 * mo * pxz * (x_weight*y_weight*z_weight));
d_fy[offset] += (0.5 * mo * pxy * (x_weight*y_weight*(1.0 - z_weight)))+(0.5 * mo * pyy * (x_weight*y_weight*(1.0 - z_weight)))-(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
d_fz[offset] += (0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)))+(0.5 * mo * pxz * (x_weight*y_weight*(1.0 - z_weight)))+(0.5 * mo * pzz * (x_weight*y_weight*(1.0 - z_weight)))+(0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight))+(0.5 * mo * pxz * (x_weight*y_weight*z_weight))-(0.5 * mo * pzz * (x_weight*y_weight*z_weight));
}
else if (k == ksource+1) {
d_fx[offset] += (0.5 * mo * pxz * (x_weight*y_weight*(1.0 - z_weight)))+(0.5 * mo * pxy * (x_weight*(1.0 - y_weight)*z_weight))+(0.5 * mo * pxx * ((1.0 - x_weight)*y_weight*z_weight));
d_fy[offset] += (0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)))+(0.5 * mo * pxy * (x_weight*y_weight*z_weight))+(0.5 * mo * pyy * (x_weight*y_weight*z_weight));
d_fz[offset] += (0.125 * mo * pyz * (x_weight*(1.0 - y_weight)*z_weight))+(0.5 * mo * pxz * (x_weight*y_weight*z_weight))+(0.5 * mo * pzz * (x_weight*y_weight*z_weight));
}
else if (k == ksource+2) {
d_fx[offset] += (0.5 * mo * pxz * (x_weight*y_weight*z_weight));
d_fy[offset] += (0.125 * mo * pyz * (x_weight*y_weight*z_weight));
}
}
else if (j == jsource+2) {
if (k == ksource-1) {
d_fz[offset] += (0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)));
}
else if (k == ksource) {
d_fx[offset] += (0.5 * mo * pxy * (x_weight*y_weight*(1.0 - z_weight)));
d_fz[offset] += (0.125 * mo * pyz * (x_weight*y_weight*(1.0 - z_weight)))+(0.125 * mo * pyz * (x_weight*y_weight*z_weight));
}
else if (k == ksource+1) {
d_fx[offset] += (0.5 * mo * pxy * (x_weight*y_weight*z_weight));
d_fz[offset] += (0.125 * mo * pyz * (x_weight*y_weight*z_weight));
}
}
}
// ---- Cells with i == isource+2: only d_fx is updated, and only for
// j in {jsource, jsource+1} and k in {ksource, ksource+1} ----
else if (i == isource+2) {
if (j == jsource) {
if (k == ksource) {
d_fx[offset] += (0.5 * mo * pxx * (x_weight*(1.0 - y_weight)*(1.0 - z_weight)));
}
else if (k == ksource+1) {
d_fx[offset] += (0.5 * mo * pxx * (x_weight*(1.0 - y_weight)*z_weight));
}
}
else if (j == jsource+1) {
if (k == ksource) {
d_fx[offset] += (0.5 * mo * pxx * (x_weight*y_weight*(1.0 - z_weight)));
}
else if (k == ksource+1) {
d_fx[offset] += (0.5 * mo * pxx * (x_weight*y_weight*z_weight));
}
}
}
}
}
// Block-wide barrier executed once per source, outside every conditional in this body.
// NOTE(review): no __shared__ data is declared in this body and each thread only writes
// its own (i, j, k) offsets, so the purpose of this barrier is unclear - confirm whether it is needed.
__syncthreads();
}
}[/codebox]
As you can see, I use few variables: just constant-memory arrays for reading and three global-memory arrays for reading and writing.
It’s the first time I have used so many if statements in CUDA code, and the first time I have needed so many registers and so much local memory…
Is this a consequence of the massive use of if statements? How can I reduce the number of registers and the amount of local memory used?
thanks.