; inner_prod_asm_func.asm 
; Inner product (sum of element-wise products) of two halfword arrays.
; Called from inner_prod_asm.c
; A4=x address,B4=y address,A6=count(size of array),B3=return address
; uses parallel instructions to optimize code.
        
        .def            _inprod_asm_func_opt    ; export entry point for the C caller
        .text                                   ; code goes in the text section

;-----------------------------------------------------------------------
; _inprod_asm_func_opt -- inner product of two halfword (16-bit) arrays
;
; In:    A4 = address of x          B4 = address of y
;        A6 = element count         B3 = return address
; Out:   A4 = sum of x[i]*y[i], accumulated with plain ADD (no saturation)
; Clobb: A1, A2, A3, A7, B2; B4 is left pointing past the last y element
;
; A count of 0 returns 0 without reading memory. The loop only exits when
; the counter reaches exactly 0, so the count must not be negative.
;
; Loop schedule (8 cycles per element):
;   1: LDH || LDH   2: SUB   3: [A1] B   4-5: NOP 2   6: MPY   7: NOP   8: ADD
; The instruction order is load-bearing: it is what satisfies the LDH
; (4 slots), MPY (1 slot) and B (5 slots) delay-slot requirements.
;-----------------------------------------------------------------------
_inprod_asm_func_opt    MV      .L1     A6,A1   ; A1 = loop counter (count)
                ||      ZERO    .S1     A7      ; A7 = accumulator = 0
        [!A1]           B       .S2     B3      ; count == 0: return right away
        [!A1]           MV      .L1     A7,A4   ; ...with result 0 (A7 just cleared)
                        NOP     4               ; remaining delay slots of the guard branch,
                                                ; kept empty so no load/branch issues in them
LOOP                    LDH     .D1     *A4++,A2 ; A2 = *x++ (halfword load)
                ||      LDH     .D2     *B4++,B2 ; B2 = *y++ (halfword load, in parallel)
                        SUB     .S1     A1,1,A1 ; counter--  (LDH delay slot 1)
        [A1]            B       .S2     LOOP    ; loop while counter != 0 (LDH delay slot 2);
                                                ; takes effect after the ADD below
                        NOP     2               ; LDH delay slots 3-4 / branch delay slots 1-2
                        MPY     .M1x    B2,A2,A3 ; A3 = x * y (loaded data valid now; B2 via cross path)
                        NOP                     ; MPY needs 1 delay slot before A3 is valid
                        ADD     .L1     A3,A7,A7 ; A7 += A3; last branch delay slot
                        B       .S2     B3      ; return to caller (address in B3)
                        MV      .L1     A7,A4   ; A4 = result, done in the first delay slot
                        NOP     4               ; remaining 4 of the 5 branch delay slots
