Sample needed: looping array operation (multiple SAXPY-style array calculations)

Hi there,

I have tried, without success, to upgrade the following snippet so that it performs multiple SAXPY calculations on the GPU.

  for (int inDX = n; --inDX >= 0; )
 {
       for (outDX = m; --outDX >= 0; )
       {
              R[outDX] += A[inDX][outDX] * B[inDX];
       }
 }

I have spent several days trying to write this code based on the SDK examples. I thought it would be easier, since I am very comfortable using OpenMP.

I would be very grateful if I could get some sample(s) that are similar. From previous posts, I know this is a function that a lot of people could be interested in.

I am looking forward to having some comments.

Thanks,

Tim

t i m k r i e g AT h o t m a i l

Just a note: here it is in SIMD (SSE inline assembly):

/*
 * SSE inline-assembly version of the accumulation
 *     r[j] += a[i][j] * b[i]
 * repeated for n rows (the _asm{ } form looks like MSVC 32-bit inline
 * assembly -- confirm the toolchain).
 *
 * The C variables m, a, b, r and n are referenced by name. On exit n has
 * been decremented to zero and a has been overwritten with the advanced
 * read pointer.
 *
 * One pass of loop0 handles one row:
 *   - 16 floats are multiplied by the broadcast b value and summed into
 *     xmm2..xmm5; those sums are added into r only once, after the last row.
 *   - then (m & 0xE) / 2 pairs of floats, starting at byte offset 0x40 of r,
 *     are updated in memory directly.
 *   ebx advances by 0x40 bytes plus 8 bytes per tail pair and is never
 *   reset, so rows are read back to back.
 *
 * NOTE(review): nothing branches back to loop16 and the tail count uses only
 *   bits 1..3 of m, so this appears to cover only even m from 16 to 30 --
 *   confirm the intended range.
 * NOTE(review): r is accessed with movaps and with addps memory operands,
 *   which require a 16-byte-aligned address; a is read with movups.
 * NOTE(review): n <= 0 on entry is not guarded; the first "dec n" would not
 *   produce zero.
 */
_asm{

	/* ecx = m; only read after endloop16 to derive the tail pair count */
	mov ecx, m;
	/* ebx = running read pointer into a */
	mov ebx, a;
	// prefetchnta [ebx]; 
	/* eax = pointer to the current element of b */
	mov eax, b;
	/* edi = base address of r */
	mov edi, r;

	/* zero the four packed accumulators (16 float lanes in total) */
	pxor xmm2, xmm2;
	pxor xmm3, xmm3;
	pxor xmm4, xmm4;
	pxor xmm5, xmm5;

    ALIGN       16; 

loop0:
	/* load one float from b, broadcast it to all four lanes of xmm7,
	   then step eax to the next float */
	movss    xmm7,[eax];
	shufps   xmm7,xmm7,0;
	add eax, 0x4;

    ALIGN       16;

loop16:
	/* Despite the label, no instruction jumps back here: this section runs
	   once per pass of loop0 and consumes 16 floats at [ebx]. */
	prefetchnta [ebx+20h]; 
	/* floats 0..7: unaligned load, scale by xmm7, add into xmm2/xmm3 */
	movups    xmm1,[ebx+0x10];                                       
	movups    xmm0,[ebx];  

	mulps     xmm1,xmm7;                                             
	mulps     xmm0,xmm7;      
                                   
	addps     xmm3,xmm1;                                       
	addps     xmm2,xmm0;   

	/* floats 8..15: same pattern, added into xmm4/xmm5 */
	movups    xmm6,[ebx+0x30];                                       
	movups    xmm0,[ebx+0x20];  
	                                    
	mulps     xmm6,xmm7;                                             
	mulps     xmm0,xmm7;      
                                   
	addps     xmm5,xmm6;                                       
	addps     xmm4,xmm0;   

	/* advance the read pointer by 0x40 bytes (16 floats) */
	add       ebx, 0x40;                                    

endloop16:

	/* edx = (m & 0xE) >> 1 = number of two-float tail iterations;
	   the jz uses the flags from the and, so a zero count skips the tail */
	mov edx, ecx;
	and edx, 0xE;
	jz endloop2;
	shr edx, 1

    ALIGN       16;

	/* point edi past the 16 floats that are kept in the accumulators */
	add		edi, 0x40
loop2:
	/* two scalar elements per iteration: load from [ebx], scale by the low
	   lane of xmm7, add the current values at [edi] and store them back */
	movss    xmm1,[ebx+0x04]
	movss    xmm0,[ebx]

	add      ebx, 0x8;
	mulss    xmm0,xmm7   
	mulss    xmm1,xmm7   
	addss    xmm0,[edi]   
	addss    xmm1,[edi+0x04]  

	movss    [edi],xmm0  
	movss    [edi+0x04],xmm1   
	add		 edi, 0x8

	dec edx;
	jnz loop2;
	/* restore edi to the base of r */
	mov edi, r;
	
endloop2:

	/* next row: decrement n in place and repeat until it reaches zero */
	dec n;
	jnz loop0;

	/* write the advanced read pointer back to the C variable a */
	mov a, ebx;

	/* flush: add the accumulated sums into the 16 floats at byte offsets
	   0x00..0x3F of r and store them (aligned accesses) */
	addps     xmm5,[edi+0x30];                                       
	movaps    [edi+0x30],xmm5;

	addps     xmm4,[edi+0x20];                                       
	movaps    [edi+0x20],xmm4;  

	addps     xmm3,[edi+0x10];                                       
	movaps    [edi+0x10],xmm3;  

	addps     xmm2,[edi];                                       
	movaps    [edi],xmm2;   

	/* NOTE(review): emms clears the MMX/x87 tag state, but no MMX instruction
	   appears in this block, and nothing in it jumps to endnow -- confirm
	   whether either is still needed. */
endnow:
	emms;
}

hmm, let each block calculate one element of R.

Then, within a block, each thread can calculate one product A[inDX][outDX] * B[inDX] (inDX plays the role of threadIdx), and you do a reduction at the end of the kernel.