|
SprinterSB wrote:
Please notice that this code is completely untested and might contain typos or thinkos. It just passed my brain v0.9beta simulator.
Below please find some code to square a single uint32_t into a uint64_t result. It is basically a rewrite of the code provided by nick.parker and later optimised by SprinterSB. I have not done exhaustive testing, but the testing I have done suggests that it is sound, and I believe it to be so. Nevertheless, use at your own risk.
I realise this is a pretty specific piece of code, but it will serve my purposes. Currently, under my build environment, GCC 64-bit mult costs 870 ticks (and 502 bytes, not counting mulsi3). The below code costs 79 cycles and 128 bytes. Since the time-critical parts of my application require only 64bit = 32bit^2, this is an excellent solution for me.
Of course, I invite comments and criticism.
Many thanks to nick.parker, SprinterSB, et al.
Cheers,
jj
Code:
;; Symbolic names for r1 and r0.  MUL leaves its 16-bit product in r1:r0,
;; so the routine below clears __zero_reg__ (r1) again before it returns.
;; __tmp_reg__ is defined here but not referenced by the code shown below.
#define __zero_reg__ r1
#define __tmp_reg__ r0
;; DEFUN name: open a function -- export the symbol, start a .func region
;; for it and place its label.  Every DEFUN is closed by a matching ENDF.
.macro DEFUN name
.global \name
.func \name
\name:
.endm
;; ENDF name: close a function opened with DEFUN -- record the symbol size
;; (current location minus the function label) and end the .func region.
.macro ENDF name
.size \name, .-\name
.endfunc
.endm
.text
;; Register numbers are given as plain integers so that "X+1" style
;; definitions expand to an expression the assembler can evaluate.
;;
;; A[0..3]: In:  the 32-bit operand in R25:R22, least significant byte in A0.
;;          Out: the same registers receive the upper half of the result
;;               (they coincide with T[4..7]).
#define A0 22
#define A1 A0+1
#define A2 A0+2
#define A3 A0+3
;; P[0..7]: Product accumulator, least significant byte in P0.
;; P0..P3 are R18..R21; P4,P5 are R26,R27; P6,P7 are R30,R31.
;; Each pair (P0/P1, P2/P3, P4/P5, P6/P7) starts on an even register so
;; that a single movw can copy it.
#define P0 18
#define P1 P0+1
#define P2 20
#define P3 P2+1
#define P4 26
#define P5 P4+1
#define P6 30
#define P7 P6+1
;; T[0..7]: Result as returned to the caller, R25:R18.
;; Note T[0..3] are never referenced, as they are aliases for P[0..3];
;; only T4 and T6 are used, as movw destinations for P4/P5 and P6/P7.
#define T0 18
#define T1 T0+1
#define T2 20
#define T3 T2+1
#define T4 22
#define T5 T4+1
#define T6 24
#define T7 T6+1
;; Local register held at zero so that "adc Pn,zero" adds only the carry.
;; The routine saves and restores it on the stack.
#define zero 17
;; __umulsidi3 -- square a 32-bit unsigned value.
;;
;;   R25:R18 = (uint64_t) R25:R22 ^ 2
;;
;; In:    A[3..0] (R25:R22)  operand, least significant byte in A0
;; Out:   T[7..0] (R25:R18)  64-bit square
;; Uses:  R17 as a local zero register (pushed on entry, popped on exit);
;;        R1:R0 as the MUL result pair, R1 is cleared again before ret;
;;        R26, R27, R30, R31 as accumulators for product bytes 4..7.
;; Cost:  79 cycles, 64 words (128 bytes).
;; Ordinary ABI function.
;;
;; Method: the square is the sum of all byte products A[i]*A[j], each
;; weighted by 2^(8*(i+j)).  A product with i != j occurs twice in that
;; sum, so it is computed with one MUL and accumulated twice.  Products
;; are handled in order of increasing byte offset i+j.
DEFUN __umulsidi3
    ;; ---- prologue: set up the local zero register ----
    push zero
    clr  zero

    ;; ---- clear product bytes P2..P7 (P0/P1 are loaded directly below) ----
    clr  P2
    clr  P3
    movw P4, P2
    movw P6, P2

    ;; ---- offset 0: A0*A0 ----
    mul  A0, A0
    movw P0, r0
    ;; P0 is final.

    ;; ---- offset 1: 2 * A0*A1 ----
    mul  A0, A1
    add  P1, r0
    ;; P2 is still zero and the high byte of an 8x8 product is at most
    ;; 0xFE, so this adc cannot carry out: no adc into P3 is needed.
    adc  P2, r1
    ;; second copy of the same product
    add  P1, r0
    adc  P2, r1
    adc  P3, zero
    ;; P1..P0 are final.

    ;; ---- offset 2: 2 * A0*A2 + A1*A1 ----
    mul  A0, A2
    add  P2, r0
    ;; The sum accumulated so far plus this product stays below 2^32,
    ;; so this adc cannot carry out of P3.
    adc  P3, r1
    ;; second copy of A0*A2
    add  P2, r0
    adc  P3, r1
    adc  P4, zero
    ;; the diagonal term is added once only
    mul  A1, A1
    add  P2, r0
    adc  P3, r1
    adc  P4, zero
    ;; P2..P0 are final.

    ;; ---- offset 3: 2 * A0*A3 + 2 * A1*A2 ----
    mul  A0, A3
    add  P3, r0
    adc  P4, r1
    adc  P5, zero
    add  P3, r0
    adc  P4, r1
    adc  P5, zero
    ;; A0 is not read again after this point.
    mul  A1, A2
    add  P3, r0
    adc  P4, r1
    adc  P5, zero
    add  P3, r0
    adc  P4, r1
    adc  P5, zero
    ;; P3..P0 are final.

    ;; ---- offset 4: 2 * A1*A3 + A2*A2 ----
    mul  A1, A3
    add  P4, r0
    adc  P5, r1
    adc  P6, zero
    add  P4, r0
    adc  P5, r1
    adc  P6, zero
    ;; A1 is not read again after this point.
    mul  A2, A2
    add  P4, r0
    adc  P5, r1
    adc  P6, zero
    ;; P4..P0 are final.

    ;; ---- offset 5: 2 * A2*A3 ----
    mul  A2, A3
    add  P5, r0
    adc  P6, r1
    adc  P7, zero
    add  P5, r0
    adc  P6, r1
    adc  P7, zero
    ;; A2 is not read again after this point; P5..P0 are final.

    ;; ---- offset 6: A3*A3 ----
    ;; Last term: the full square fits in 64 bits, so nothing can carry
    ;; out of P7 here.
    mul  A3, A3
    add  P6, r0
    adc  P7, r1
    ;; All of A[3..0] have been consumed; P7..P0 hold the complete product.

    ;; ---- move the result into the return registers T[7..0] (R25:R18) ----
    ;; P[3..0] already sit in T[3..0]; only the upper half has to be copied,
    ;; which overwrites the operand registers.
    movw T4, P4
    movw T6, P6

    ;; ---- epilogue: restore R17 and re-establish the zero register ----
    pop  zero
    clr  __zero_reg__
    ret
ENDF __umulsidi3
|