## Fast division of 32 bits by 5, code in assembly

22 posts / 0 new
Author
Message

As you know, a function can be made faster if more registers are used in its algorithm.
Based on this, I guess the following code (div32b_5: in assembly) is the fastest one for dividing 32 bits by the constant 5.

Edit:

Sorry, my guess happened to be wrong (see next post)

It starts with 32 bits, followed by 16 bits then 8 bits and end up at 4 bits.
Minimum and Maximum cycles are 62 [0x00000000/5] and 254 [0xFFFFFFFF/5].
Note: "div16b_5:" and "div8b_5:" could also be the function entry with a minor addition.

;=====================================

;=====================================

Symbolic Code of "div32b_5:"

x in Y[x] is the byte number of the register Y

N the original 32-bit number
R the result of N/5
A, B, C and K temporary registers

Bhi <==> B3:B2
A<==> A1:A0
B<==> B1:B0
C<==> C1:C0

;======================

div32b_5:
; PUSH registers

A= N
C= R= 0

loop32b:
A= A-C

if A<2^16, div16b_5

B= (A/2^16)*13107
R= R+B
C= B*5
goto loop32b

;==========

div16b_5:
C= B= 0

loop16b:
A= A-C

if A<2^8, div8b_5

B  = (A/2^8)*51
Bhi= Bhi+B
C  = B*5
goto loop8b

;==========

div8b_5:
B1=B0=C0=0

loop4b:
A0= A0-C0

if A0<2^4, div2b_5

B0= (A0/16)*3
B1= B1+B0
C0= B0*5
goto loop4b

;==========

div2b_5:
K0=0

loop2b:
A0=A0-5

if A1<0, goto div_end

K0= K0+1
goto loop2b

;==========

div_end:
K0  = K0+B1
K= K0+Bhi
R= R+K

; POP registers
RET

;=====================================

;=====================================

```;==============================
; Test div32b_5
; Division 32-bit register by 5
;==============================

.def K_1 = r25     ; temporary for constants
.def K_0 = r24

.def N_3 = r23     ; the original 32-bit number
.def N_2 = r22
.def N_1 = r21
.def N_0 = r20

.def A_3 = r19     ; temporary
.def A_2 = r18
.def A_1 = r17
.def A_0 = r16

.def R_3 = r15     ; result of N_3:N_0 / 5
.def R_2 = r14
.def R_1 = r13
.def R_0 = r12

.def B_3 = r11     ; temporary
.def B_2 = r10
.def B_1 = r9
.def B_0 = r8

.def C_3 = r7      ; temporary
.def C_2 = r6
.def C_1 = r5
.def C_0 = r4

; divident , cycles [minimum 62, maximum 254]
.equ dd_Q = 0xFF
.equ dd_U = 0xFF
.equ dd_H = 0xFF
.equ dd_L = 0xFF

REPEAT:
LDI  N_3, dd_Q
LDI  N_2, dd_U
LDI  N_1, dd_H
LDI  N_0, dd_L

RCALL div32b_5

RJMP  REPEAT

;====================

div32b_5:
; PUSH registers

; A= N
MOV   A_0, N_0
MOV   A_1, N_1
MOV   A_2, N_2
MOV   A_3, N_3

; C= R= 0
CLR   C_0
CLR   C_1
CLR   C_2
CLR   C_3
CLR   R_0
CLR   R_1
CLR   R_2
CLR   R_3

loop32b:
; A= A-C
SUB   A_0, C_0
SBC   A_1, C_1
SBC   A_2, C_2
SBC   A_3, C_3

; if A<2^16, div16b_5
TST   A_3
BRNE  cnt_32b

TST   A_2
BREQ  div16b_5

cnt_32b:
; B= (A/2^16)*13107
LDI   K_1, high(13107)
LDI   K_0,  low(13107)

; Mul_16x16
MUL   A_3, K_1              ; Hi1 * Hi2
MOVW  B_3:B_2, r1:r0
MUL   A_2, K_0              ; Lo1 * Lo2
MOVW  B_1:B_0, r1:r0
MUL   A_3, K_0              ; Hi1 * Lo2
CLR	  K_0
MUL	  K_1, A_2              ; Hi2 * Lo1

; R= R+B

; C= B*5
LDI   K_0, 5
MUL   B_0, K_0
MOV   C_0, r0
MOV   C_1, r1

MUL   B_1, K_0
CLR   C_2

MUL   B_2, K_0
CLR   C_3

MUL   B_3, K_0

; goto loop1
RJMP  loop32b

;=============

div16b_5:
; C= B= 0
CLR   C_0
CLR   C_1
CLR   B_0
CLR   B_1
CLR   B_2
CLR   B_3

loop16b:
; A= A-C
SUB   A_0, C_0
SBC   A_1, C_1

; if A<2^8, div8b_5
TST   A_1
BREQ  div8b_5

; B= (A/2^8)*51
LDI   K_0, 51
MUL   A_1, K_0
MOV   B_0, r0
MOV   B_1, r1

; Bhi= Bhi+B

; C= B*5
LDI   K_0, 5
MUL   B_0, K_0
MOV   C_0, r0
MOV   C_1, r1

MUL   B_1, K_0

; goto loop16b
RJMP  loop16b

;=============

div8b_5:
; B1=B0=C0=0
CLR   C_0
CLR   B_0

loop4b:
; A0= A0-C0
SUB   A_0, C_0

; if A0<2^4, div2b_5
CPI   A_0, 16
BRLO  div2b_5

; B0= (A0/16)*3
MOV   A_1, A_0
SWAP  A_1
ANDI  A_1, 0x0F
LDI   K_0, 3
MUL   A_1, K_0
MOV   B_0, r0

; B1= B1+B0

; C0= B0*5
LDI   K_0, 5
MUL   B_0, K_0
MOV   C_0, r0

; goto loop4b
RJMP  loop4b

;=============

div2b_5:
; K0=0
CLR   K_0

loop2b:
; A0=A0-5
SUBI  A_0, 5

; if A0<0, goto div_end
BRLO  div_end

; K0= K0+1
INC   K_0

; goto loop2b
RJMP  loop2b

;=============

div_end:
; K0= K0+B1

; K= K0+Bhi
CLR   K_1

; R= R+K
CLR   K_0

; POP registers

RET
```

This topic has a solution.

Last Edited: Tue. Mar 31, 2020 - 07:08 AM
This reply has been marked as the solution.

A full 32bit*32bit=64bit take less than 100 clk (if everything is in registers)  , so a mul with 0XCCCC.... where only the top 32 bit are used and then some shifts should take about the same amount of clk so about 100 clk as both min and max.

Sorry, sparrow2, I couldn't get you well.

Is it about a faster algorithm that uses 32bit*32bit?

Edit1:

I guess you mean

N*0x33333333/2^32

Edit2:

You are right, sparrow2. It seems I like complicating things to have more fun in solving a problem, I guess ... like using SPI to generate sinewave signals Last Edited: Mon. Mar 30, 2020 - 01:27 PM

I think you should use a lookup table for the inner loop to divide an arbitrary 8-bit value by 5. Costs 256 bytes of flash, saves a LOT of cycles.

(0xFFFFFFFF * 0xCCCCCCCD) >> 18

When I wrote this in assembler, it was 146 clocks from call to return.

Last Edited: Tue. Mar 31, 2020 - 06:33 AM
```;***************************************************
;* Mutiply 32x32 -> 64 bit
;*
;*  R24:R21 x R5:R2 -> R15:R8
;*
;*  88 cycles + 4 (RET) = 92 Cycles
;*  86 cycles + 4 (RET) = 90 Cycles
;*
mult32:
clr	R16
mul	R21,R2
movw	R8,R0
clr	R10
clr	R11
movw R12,R10
movw R14,R10
;		clr	R12
;		clr	R13
;		clr	R14
;		clr	R15
mul	R22,R2
mul	R23,R2
mul	R24,R2

mul	R21,R3
mul	R22,R3
mul	R23,R3
mul	R24,R3

mul	R21,R4
mul	R22,R4
mul	R23,R4
mul	R24,R4

mul	R21,R5
mul	R22,R5
mul	R23,R5
mul	R24,R5
ret
```

Last Edited: Tue. Mar 31, 2020 - 07:45 AM

And remember that this is a full 32*32=64

And it really spring in my eyes that there are 6 clr in a row, it could have been 2 clr and 2 movw, save two clk and 4 bytes.

Last Edited: Tue. Mar 31, 2020 - 07:42 AM

pleasant.

```;***************************************************
;* Mutiply 32x32 -> 64 bit
;*
;*  R24:R21 x R5:R2 -> R15:R8
;*
;*  76 cycles + 4 (RET) = 80 Cycles
;*
mult32:
clr R16
mul     R21,R2
movw    R8,R0
mul     R22,R3
movw    R10,R0
mul     R23,R4
movw    R12,R0
mul     R24,R5
movw    R14,R0

mul     R22,R2
mul     R23,R2
mul     R24,R2

mul     R21,R3
mul     R23,R3
mul     R24,R3

mul     R21,R4
mul     R22,R4
mul     R24,R4

mul     R21,R5
mul     R22,R5
mul     R23,R5
ret```

Long division gives 58 cycles in a C-callable function:

```  .global divide32u5
divide32u5:
; input is R27:24, result goes back there
; uses long division with byte-sized digits
; R21:18 holds the quotient before being moved to R27:24
; Of the registers used, the avr-gcc ABI only requires R1 to be restored
.def C5 = R30
LDI C5, 5
.def C51 = R31
LDI C51, 51

.macro div i1, i0, q, flag
; q = i1:0/5, i0-=5*q
MUL \i0, C51
MOV \q, R1
.if \flag
; i1<=4
MUL \i1, C51
.ifend
MUL \q, C5
SUB \i0, R0
; q<=9
CP \i0, C5
BRLT 1f
SUB \i0, CR
INC \q
1: ; i0 <= 4
.endm

div  -1, R27, R21, 0
div R27, R26, R20, 1
div R26, R25, R19, 1
div R25, R24, R18, 1

MOVW R24, R18
MOVW R26, R20

CLR R1

RET```

The 58 is an upper bound that includes the ret, but not the call.

Moderation in all things. -- ancient proverb

This is my entry for the division by 5 "contest", numeric quality may not be the best even though I gave it some thought, but anyways... should take about 40 cycles if tweaked to be a C called function, I just used the first registers that came to mind.

```
.equ	testval = 100000000
.def	zero = r25
.def	magic = r24

start:
ldi	r19, byte4(testval)
ldi	r18, byte3(testval)
ldi	r17, byte2(testval)
ldi	r16, low(testval)

ldi	magic, 0xCC
clr	zero

mov	r1, r16

ror	r19
ror	r18
ror	r17
ror	r16
ror	r1

mul	r19, magic
movw	r22, r0
mul	r17, magic
movw	r20, r0

mul	r18, magic
movw	r2, r0
mul	r16, magic

sec

ror	r23
ror	r22
ror	r21
ror	r20

rjmp	start```

edit: forgot to say, the result will be in r20:r23

Last Edited: Wed. Apr 1, 2020 - 10:44 PM

forgot to say, the result will be in r20:r23

why not do this:

.def result_H = r20

.def result_L = r23

Then there is less confusion.  Also, the assembler will issue a warning if the same register is used elsewhere with another name---so potential conflicts can be spotted and checked (allowed or disallowed)

When in the dark remember-the future looks brighter than ever.   I look forward to being able to predict the future!

Thank you, El Tangas.

It happens that, in my actual application, I need just 16-bit / 10 which is somehow equivalent to 16-bit / 5.

I will try to deduce its algorithm (in assembly) based on yours on #10.

I just like to point out that, in my first test of this very fast algorithm (post #10), 0xFFFFFF0/5 gave 0x3333332F (the right answer – 1).

You're right it's messing up large numbers. Well, I expected precision would not be good, I guess I cut too many corners.

No because that is wrong, the result is 4 byte in size (r20:r23)

And personally I hate to use .def  that make it hard to read. And for an AVR you need to remember if it's a high or low register.

But relevant comments are always welcome.

I should have been more precise, the result is in r23:r22:r21:r20.

Anyway, I translated the algo to C so that I can test it better on a PC. I think I see some ways to improve precision without much cost.

```uint64_t div5(uint64_t value) {
uint64_t tmp;

value += value << 8;
tmp = value & 1;
value >>= 1;
value += (value >> 16) + tmp;
value >>= 8;
value *= 0xCC;
value += 0x100;
value >>= 9;

return value;
}```

The uint64_t are just to provide room for intermediate calculations, the algorithm is for 32 bit.

edit: first incorrect result for multiples of 5 is 0x8C0001FE (2348810750 decimal).

Last Edited: Thu. Apr 2, 2020 - 01:02 PM

Ok, I fixed it at the cost of 5 cycles ```.equ	testval = 0xFFFFFFF0
.def	zero = r25
.def	magic = r24

start:
ldi	r19, byte4(testval)
ldi	r18, byte3(testval)
ldi	r17, byte2(testval)
ldi	r16, low(testval)

ldi	magic, 0xCC
clr	zero

mov	r1, r16

ror	r19
ror	r18
ror	r17
ror	r16
ror	r1

;extra rounding step to correct numeric errors

mul	r19, magic
movw	r22, r0
mul	r17, magic
movw	r20, r0

mul	r18, magic
movw	r2, r0
mul	r16, magic

sec

ror	r23
ror	r22
ror	r21
ror	r20

rjmp	start
```

Have you checked for all values ?

It would not surprise me if you don't need all the adc's with "zero", because there never is a carry :)

sparrow2 wrote:
Have you checked for all values ?

Yes but I'm checking with C on a PC, to check in such detail would be a bit of a pain. Checking on an AVR with the actual code would probably take an hour or more.

on real cpu.

Check that the result get 1 bigger every 5th time.

that would give about 65 clk loop time or about 5 hours on a 16 MHz AVR.

So show the PC code you use, and see if we can "hack" some kind of check into that :)

To make 100% sure that the code in #17 work you should actually run all combinations ;)

It's a nice fast way to divide with 5 (and also 10).

Last Edited: Thu. Apr 2, 2020 - 02:47 PM

sparrow2 wrote:

So show the PC code you use, and see if we can "hack" some kind of check into that :)

Here, it's functionally equivalent but of course the arithmetic ops are not 8 bit. That's the problem, to do fine testing the operations need to be decomposed.

```
// test program for div5 algo

#include <stdio.h>
#include <stdint.h>

uint64_t div5(uint64_t);

int main()
{
for (uint64_t testval = 0; testval <= 0x33333333; testval ++) {
if (testval != div5(testval * 5)) {
printf("Incorrect result for n=%x\n", testval *5);
break;
}
}
return 0;
}

uint64_t div5(uint64_t value) {
uint64_t tmp;

value += value << 8;
tmp = value & 1;
value >>= 1;
value += (value >> 16) + tmp;
value += value & 0xFF;
value >>= 8;
value *= 0xCC;
value += 0x100;
value >>= 9;

return value;
}```

edit: C was a previous version. Replaced by more accurate representation of the asm code.

Last Edited: Thu. Apr 2, 2020 - 03:48 PM

Ah, I devised a simple hack to tilt the calculations in the right direction, I changed the lower order 0xCC into 0xCD. Now, I don't think this is actually mathematically equivalent to multiply by 0xCCCCCCCD, but it seems to be close enough to give the illusion of valid results Values that are not multiples of 5 will sometimes be round to nearest, sometimes not.

C test version:

```
// test program for div5 algo

#include <stdio.h>
#include <stdint.h>

uint64_t div5(uint64_t);

int main()
{
for (uint64_t testval = 0; testval <= 0x33333333; testval ++) {
if (testval != div5(testval * 5)) {
printf("Incorrect result for n=%x\n", testval *5);
break;
}
}
return 0;
}

uint64_t div5(uint64_t value) {
uint64_t tmp;
uint64_t mul0;
uint64_t mul1;
uint64_t mul2;
uint64_t mul3;

value += value << 8;
tmp = value & 1;
value >>= 1;
value += (value >> 16) + tmp;
value >>= 8;

mul0 = (value & 0xFF) * 0xCD;
mul1 = (value & 0xFF00) * 0xCC;
mul2 = (value & 0xFF0000) * 0xCC;
mul3 = (value & 0xFF000000) * 0xCC;

value = mul0 + mul1 + mul2 + mul3;
value += 0x100;
value >>= 9;

return value;
}```

Asm version:

```.equ	testval = 0xFFFFFFF0
.def	zero = r25
.def	magic = r24

start:
ldi	r19, byte4(testval)
ldi	r18, byte3(testval)
ldi	r17, byte2(testval)
ldi	r16, low(testval)

ldi	magic, 0xCC
clr	zero

mov	r1, r16

ror	r19
ror	r18
ror	r17
ror	r16
ror	r1

mul	r19, magic
movw	r22, r0
mul	r17, magic
movw	r20, r0

mul	r18, magic
movw	r2, r0
inc	magic
mul	r16, magic

sec