I have a kernel that is reported as having uncoalesced global load/store by NSight compute. I am trying to go through the SASS file and was wondering how can the uncoalesced load pattern be detected from it.
The simple kernel is:
typedef struct { float x, y, z, vx, vy, vz; } Body;
void randomizeBodies(float *data, int n) {
for (int i = 0; i < n; i++) {
data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
}
}
__global__
void bodyForce(Body *p, float dt, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
for (int j = 0; j < n; j++) { // line 26
float dx = p[j].x - p[i].x; // line 27
float dy = p[j].y - p[i].y; // line 28
float dz = p[j].z - p[i].z; // line 29
float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
float invDist = rsqrtf(distSqr);
float invDist3 = invDist * invDist * invDist;
Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
}
p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
}
}
int main(const int argc, const char** argv) {
int nBodies = 30000;
if (argc > 1) nBodies = atoi(argv[1]);
const float dt = 0.01f; // time step
const int nIters = 10; // simulation iterations
int bytes = nBodies*sizeof(Body);
float *buf = (float*)malloc(bytes);
Body *p = (Body*)buf;
randomizeBodies(buf, 6*nBodies); // Init pos / vel data
float *d_buf;
cudaMalloc(&d_buf, bytes);
Body *d_p = (Body*)d_buf;
int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
double totalTime = 0.0;
for (int iter = 1; iter <= nIters; iter++) {
StartTimer();
cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p, dt, nBodies); // compute interbody forces
cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
}
The corresponding SASS lines for global load:
//## File "nbody_simple.cu", line 26
/*00b0*/ IMAD.WIDE R2, R2, R7, c[0x0][0x160] ;
/*00c0*/ @!P0 BRA `(.L_x_0) ;
/*00d0*/ LDG.E.SYS R0, [R2] ;
/*00e0*/ LDG.E.SYS R4, [R2+0x4] ;
/*00f0*/ LDG.E.SYS R5, [R2+0x8] ;
.....
.L_x_2:
//## File "nbody_simple.cu", line 28
/*0190*/ LDG.E.SYS R15, [UR4+0x4] ;
//## File "nbody_simple.cu", line 27
/*01a0*/ LDG.E.SYS R13, [UR4] ;
//## File "nbody_simple.cu", line 29
/*01b0*/ LDG.E.SYS R12, [UR4+0x8] ;
//## File "nbody_simple.cu", line 28
/*01c0*/ LDG.E.SYS R19, [UR4+0x1c] ;
//## File "nbody_simple.cu", line 27
/*01d0*/ LDG.E.SYS R17, [UR4+0x18] ;
//## File "nbody_simple.cu", line 28
/*01e0*/ LDG.E.SYS R23, [UR4+0x34] ;
//## File "nbody_simple.cu", line 29
/*01f0*/ LDG.E.SYS R20, [UR4+0x20] ;
//## File "nbody_simple.cu", line 27
/*0200*/ LDG.E.SYS R21, [UR4+0x30] ;
//## File "nbody_simple.cu", line 28
/*0210*/ LDG.E.SYS R29, [UR4+0x4c] ;
//## File "nbody_simple.cu, line 29
/*0220*/ LDG.E.SYS R28, [UR4+0x38] ;
//## File "nbody_simple.cu", line 27
/*0230*/ LDG.E.SYS R25, [UR4+0x48] ;
//## File "nbody_simple.cu", line 29
/*0240*/ LDG.E.SYS R34, [UR4+0x50] ;
************** The part above has no warnings for uncoalesced global load/store ******************
.L_x_0:
//## File "nbody_simple.cu", line 37
/*0870*/ LDG.E.SYS R0, [R2+0xc] ;
/*0880*/ LDG.E.SYS R4, [R2+0x10] ;
/*0890*/ LDG.E.SYS R5, [R2+0x14] ;
/*08a0*/ FFMA R11, R11, c[0x0][0x168], R0 ;
/*08b0*/ FFMA R9, R9, c[0x0][0x168], R4 ;
/*08c0*/ FFMA R5, R10, c[0x0][0x168], R5 ;
/*08d0*/ STG.E.SYS [R2+0xc], R11 ;
/*08e0*/ STG.E.SYS [R2+0x10], R9 ;
/*08f0*/ STG.E.SYS [R2+0x14], R5 ;
***************** This part above has uncoalesced global load and store warnings ******************