Hi there,
I have been trying, without success, to port the following snippet, which performs multiple SAXPY calculations, to the GPU.
for (int inDX = n; --inDX >= 0; )
{
for (outDX = m; --outDX >= 0; )
{
R[outDX] += A[inDX][outDX] * B[inDX];
}
}
I have worked for several days trying to write this code based on the SDK examples. I thought it would be easier, since I am very comfortable using OpenMP.
I would be very grateful if I could get some sample(s) that are similar. From previous posts, I know this is a function that a lot of people could be interested in.
I am looking forward to having some comments.
Thanks,
Tim
t i m k r i e g AT h o t m a i l
Just a note: here it is in SIMD (SSE inline assembly):
/* SSE inline-assembly version of the loop  r[j] += a[i][j] * b[i]. */
/* Reads the C variables m, a, b, r and n; writes back to n and a (see notes below). */
/* Register roles: ebx = read cursor into a, eax = read cursor into b, edi = cursor into r, */
/* xmm7 = current b value broadcast to 4 lanes, xmm2..xmm5 = running sums for r[0..15]. */
/* NOTE(review): presumably a, b and r are float pointers and a is stored row by row with no padding -- confirm. */
_asm{
/* ecx = m. It is only read later to derive the tail-pair count; the outer loop itself counts down n in memory. */
mov ecx, m;
mov ebx, a;
// prefetchnta [ebx];
mov eax, b;
mov edi, r;
/* zero the four accumulators (4 floats each = 16 partial sums) */
pxor xmm2, xmm2;
pxor xmm3, xmm3;
pxor xmm4, xmm4;
pxor xmm5, xmm5;
ALIGN 16;
/* ---- outer loop: one pass per element of b ---- */
loop0:
/* load one float from b, copy it into all four lanes of xmm7, then step eax to the next float */
movss xmm7,[eax];
shufps xmm7,xmm7,0;
add eax, 0x4;
ALIGN 16;
/* Multiply 16 consecutive floats at [ebx] by xmm7 and add them into xmm2..xmm5. */
/* NOTE(review): nothing in this block jumps back to loop16, so exactly one 16-float chunk is handled per pass. */
loop16:
/* prefetch hint for the address 0x20 bytes ahead of the read cursor */
prefetchnta [ebx+20h];
/* floats 0..7: unaligned loads, scale, accumulate (xmm2 = floats 0..3, xmm3 = floats 4..7) */
movups xmm1,[ebx+0x10];
movups xmm0,[ebx];
mulps xmm1,xmm7;
mulps xmm0,xmm7;
addps xmm3,xmm1;
addps xmm2,xmm0;
/* floats 8..15: same pattern (xmm4 = floats 8..11, xmm5 = floats 12..15) */
movups xmm6,[ebx+0x30];
movups xmm0,[ebx+0x20];
mulps xmm6,xmm7;
mulps xmm0,xmm7;
addps xmm5,xmm6;
addps xmm4,xmm0;
/* advance the read cursor past the 16 floats just consumed */
add ebx, 0x40;
endloop16:
/* Tail: edx = m & 0xE; if zero skip the tail, otherwise halve it to get the number of float pairs. */
/* NOTE(review): bit 0 and all bits above bit 3 of m are ignored, so this looks like it expects */
/* m == 16 + an even remainder of at most 14 -- confirm against the caller. */
mov edx, ecx;
and edx, 0xE;
jz endloop2;
shr edx, 1
ALIGN 16;
/* edi still holds r here, so this points it at r[16], the first tail element */
add edi, 0x40
/* Tail loop: two floats per pass, accumulated directly in memory rather than in registers. */
loop2:
movss xmm1,[ebx+0x04]
movss xmm0,[ebx]
add ebx, 0x8;
/* scalar multiply by lane 0 of xmm7 (the current b value) */
mulss xmm0,xmm7
mulss xmm1,xmm7
/* read-modify-write of the two r elements at the cursor */
addss xmm0,[edi]
addss xmm1,[edi+0x04]
movss [edi],xmm0
movss [edi+0x04],xmm1
add edi, 0x8
dec edx;
jnz loop2;
/* rewind edi to the start of r for the next pass and for the final store */
mov edi, r;
endloop2:
/* Decrement the C variable n in place and repeat until it reaches zero (n is 0 afterwards). */
/* NOTE(review): the test happens after the decrement, so n == 0 on entry would wrap instead of skipping the loop. */
dec n;
jnz loop0;
/* write the advanced read cursor back into the C variable a */
mov a, ebx;
/* Fold the register sums into r[0..15]: add the existing r contents, then store. */
/* addps with a memory operand and movaps both require [edi] to be 16-byte aligned, */
/* unlike the movups loads used for a. NOTE(review): confirm r is 16-byte aligned. */
addps xmm5,[edi+0x30];
movaps [edi+0x30],xmm5;
addps xmm4,[edi+0x20];
movaps [edi+0x20],xmm4;
addps xmm3,[edi+0x10];
movaps [edi+0x10],xmm3;
addps xmm2,[edi];
movaps [edi],xmm2;
/* endnow is not referenced by any jump in this block */
endnow:
/* NOTE(review): emms clears MMX state, but this block only uses xmm registers -- probably unnecessary; confirm. */
emms;
}