float total = 0;
000000013F85B7B3 xorps xmm5,xmm5
000000013F85B7B6 movss dword ptr [total],xmm5
000000013F85B7BC mov dword ptr [i],0
000000013F85B7C4 jmp _Z26blockMultiplyColumnPerGridPfS_S_PiS0_+221h (13F85B7D1h)
000000013F85B7C6 mov eax,dword ptr [i]
000000013F85B7CA add eax,1
000000013F85B7CD mov dword ptr [i],eax
000000013F85B7D1 mov eax,dword ptr [columnSize_ (13F8D8EF8h)]
000000013F85B7D7 cmp dword ptr [i],eax
000000013F85B7DB jge _Z26blockMultiplyColumnPerGridPfS_S_PiS0_+28Eh (13F85B83Eh)
//Multiply column vector with our input vector
#pragma unroll
for (int i = 0; i < columnSize_; i++)
{
total += (float)(device_inputVector[(shared_rowIDs[threadIdx.x*columnSize_ + i] - 1 )]*shared_values[threadIdx.x*columnSize_ + i]);
000000013F85B7DD mov eax,dword ptr [threadIdx (13F8D8EB8h)]
000000013F85B7E3 imul eax,dword ptr [columnSize_ (13F8D8EF8h)]
000000013F85B7EA add eax,dword ptr [i]
000000013F85B7EE mov ecx,eax
000000013F85B7F0 mov rax,qword ptr [shared_rowIDs]
000000013F85B7F5 mov eax,dword ptr [rax+rcx*4]
000000013F85B7F8 sub eax,1
000000013F85B7FB movsxd rcx,eax
000000013F85B7FE mov rax,qword ptr [device_inputVector]
000000013F85B806 movss xmm1,dword ptr [rax+rcx*4]
000000013F85B80B mov eax,dword ptr [threadIdx (13F8D8EB8h)]
000000013F85B811 imul eax,dword ptr [columnSize_ (13F8D8EF8h)]
000000013F85B818 add eax,dword ptr [i]
000000013F85B81C mov ecx,eax
000000013F85B81E mov rax,qword ptr [shared_values]
000000013F85B823 movss xmm0,dword ptr [rax+rcx*4]
000000013F85B828 mulss xmm1,xmm0
000000013F85B82C movss xmm0,dword ptr [total]
000000013F85B832 addss xmm0,xmm1
000000013F85B836 movss dword ptr [total],xmm0
}
000000013F85B83C jmp _Z26blockMultiplyColumnPerGridPfS_S_PiS0_+216h (13F85B7C6h)
device_outputVector[0] += (float)total;
000000013F85B83E mov rax,qword ptr [device_outputVector]
000000013F85B846 movss xmm0,dword ptr [rax]
000000013F85B84A addss xmm0,dword ptr [total]
000000013F85B850 mov rax,qword ptr [device_outputVector]
000000013F85B858 movss dword ptr [rax],xmm0
000000013F85B85C mov dword ptr [i],0
000000013F85B864 jmp _Z26blockMultiplyColumnPerGridPfS_S_PiS0_+2C1h (13F85B871h)
000000013F85B866 mov eax,dword ptr [i]
000000013F85B86A add eax,1
000000013F85B86D mov dword ptr [i],eax
000000013F85B871 mov eax,dword ptr [columnSize_ (13F8D8EF8h)]
000000013F85B877 cmp dword ptr [i],eax
000000013F85B87B jge _Z26blockMultiplyColumnPerGridPfS_S_PiS0_+33Fh (13F85B8EFh)
//atomicAdd(&device_outputVector[0], (total) );
for (int i = 0; i < columnSize_; i++)
{
//device_outputVector[(shared_resultOrder[threadIdx.x*columnSize_ + i])] += device_inputVector[(shared_rowIDs[threadIdx.x*columnSize_ + i] - 1 )]*shared_values[threadIdx.x*columnSize_ + i];
device_outputVector[1] += device_inputVector[(shared_rowIDs[threadIdx.x*columnSize_ + i] - 1 )]*shared_values[threadIdx.x*columnSize_ + i];
000000013F85B87D mov eax,dword ptr [threadIdx (13F8D8EB8h)]
000000013F85B883 imul eax,dword ptr [columnSize_ (13F8D8EF8h)]
000000013F85B88A add eax,dword ptr [i]
000000013F85B88E mov ecx,eax
000000013F85B890 mov rax,qword ptr [shared_rowIDs]
000000013F85B895 mov eax,dword ptr [rax+rcx*4]
000000013F85B898 sub eax,1
000000013F85B89B movsxd rcx,eax
000000013F85B89E mov rax,qword ptr [device_inputVector]
000000013F85B8A6 movss xmm1,dword ptr [rax+rcx*4]
000000013F85B8AB mov eax,dword ptr [threadIdx (13F8D8EB8h)]
000000013F85B8B1 imul eax,dword ptr [columnSize_ (13F8D8EF8h)]
000000013F85B8B8 add eax,dword ptr [i]
000000013F85B8BC mov ecx,eax
000000013F85B8BE mov rax,qword ptr [shared_values]
000000013F85B8C3 movss xmm0,dword ptr [rax+rcx*4]
000000013F85B8C8 mulss xmm1,xmm0
000000013F85B8CC mov rax,qword ptr [device_outputVector]
000000013F85B8D4 movss xmm0,dword ptr [rax+4]
000000013F85B8D9 addss xmm0,xmm1
000000013F85B8DD mov rax,qword ptr [device_outputVector]
000000013F85B8E5 movss dword ptr [rax+4],xmm0
}
000000013F85B8EA jmp _Z26blockMultiplyColumnPerGridPfS_S_PiS0_+2B6h (13F85B866h)
000000013F85B8EF mov dword ptr [i],0
000000013F85B8F7 jmp _Z26blockMultiplyColumnPerGridPfS_S_PiS0_+354h (13F85B904h)
000000013F85B8F9 mov eax,dword ptr [i]
000000013F85B8FD add eax,1
000000013F85B900 mov dword ptr [i],eax
000000013F85B904 mov eax,dword ptr [columnSize_ (13F8D8EF8h)]
000000013F85B90A cmp dword ptr [i],eax
000000013F85B90E jge 000000013F85B9DB
That's the assembly, but I'm pretty much lost in it.