Fast division of 32 bits by 5, code in assembly

Go To Last Post
22 posts / 0 new
Author
Message
#1
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

As you know, a function can be made faster if more registers are used in its algorithm.
Based on this, I guess the following code (div32b_5: in assembly) is the fastest one for dividing 32 bits by the constant 5.

 

 

Edit:

Sorry, my guess happened to be wrong sad (see next post)

 

 

It starts with 32 bits, followed by 16 bits then 8 bits and end up at 4 bits.
Minimum and Maximum cycles are 62 [0x00000000/5] and 254 [0xFFFFFFFF/5].
Note: "div16b_5:" and "div8b_5:" could also be the function entry with a minor addition.
As usual, your comments are welcomed.

 

;=====================================

;=====================================

 

Symbolic Code of "div32b_5:"

x in Y[x] is the byte number of the register Y

 

N[4] the original 32-bit number
R[4] the result of N[4]/5
A[4], B[4], C[4] and K[2] temporary registers

 

B[4]hi <==> B3:B2
A[2]<==> A1:A0
B[2]<==> B1:B0
C[2]<==> C1:C0

 

;======================

 

div32b_5:
; PUSH registers

    A[4]= N[4]
    C[4]= R[4]= 0

loop32b:
    A[4]= A[4]-C[4]

  if A[4]<2^16, div16b_5

    B[4]= (A[4]/2^16)*13107
    R[4]= R[4]+B[4]
    C[4]= B[4]*5
    goto loop32b

 

;==========

 

div16b_5:
    C[2]= B[4]= 0

loop16b:
    A[2]= A[2]-C[2]

  if A[2]<2^8, div8b_5

    B[2]  = (A[2]/2^8)*51
    B[4]hi= B[4]hi+B[2]
    C[2]  = B[2]*5
    goto loop8b

 

;==========

 

div8b_5:
    B1=B0=C0=0

loop4b:
    A0= A0-C0

  if A0<2^4, div2b_5

    B0= (A0/16)*3
    B1= B1+B0
    C0= B0*5
    goto loop4b

 

;==========

 

div2b_5:
    K0=0

loop2b:
    A0=A0-5

  if A1<0, goto div_end

    K0= K0+1
    goto loop2b

 

;==========

 

div_end:
    K0  = K0+B1
    K[2]= K0+B[4]hi
    R[4]= R[4]+K[2]

; POP registers
    RET

 

;=====================================

;=====================================

 

;==============================
; Test div32b_5
; Division 32-bit register by 5
;==============================

.def K_1 = r25     ; temporary for constants
.def K_0 = r24

.def N_3 = r23     ; the original 32-bit number
.def N_2 = r22
.def N_1 = r21
.def N_0 = r20

.def A_3 = r19     ; temporary
.def A_2 = r18
.def A_1 = r17
.def A_0 = r16

.def R_3 = r15     ; result of N_3:N_0 / 5
.def R_2 = r14
.def R_1 = r13
.def R_0 = r12

.def B_3 = r11     ; temporary
.def B_2 = r10
.def B_1 = r9
.def B_0 = r8

.def C_3 = r7      ; temporary
.def C_2 = r6
.def C_1 = r5
.def C_0 = r4

; divident , cycles [minimum 62, maximum 254]
.equ dd_Q = 0xFF
.equ dd_U = 0xFF
.equ dd_H = 0xFF
.equ dd_L = 0xFF

REPEAT:
    LDI  N_3, dd_Q
    LDI  N_2, dd_U
    LDI  N_1, dd_H
    LDI  N_0, dd_L

    RCALL div32b_5

    RJMP  REPEAT

;====================

div32b_5:
; PUSH registers

; A[4]= N[4]
    MOV   A_0, N_0
    MOV   A_1, N_1
    MOV   A_2, N_2
    MOV   A_3, N_3

; C[4]= R[4]= 0
    CLR   C_0
    CLR   C_1
    CLR   C_2
    CLR   C_3
    CLR   R_0
    CLR   R_1
    CLR   R_2
    CLR   R_3

loop32b:
; A[4]= A[4]-C[4]
    SUB   A_0, C_0
    SBC   A_1, C_1
    SBC   A_2, C_2
    SBC   A_3, C_3

; if A[4]<2^16, div16b_5
    TST   A_3
    BRNE  cnt_32b

    TST   A_2
    BREQ  div16b_5

cnt_32b:
; B[4]= (A[4]/2^16)*13107
    LDI   K_1, high(13107)
    LDI   K_0,  low(13107)

; Mul_16x16
    MUL   A_3, K_1              ; Hi1 * Hi2
    MOVW  B_3:B_2, r1:r0
    MUL   A_2, K_0              ; Lo1 * Lo2
    MOVW  B_1:B_0, r1:r0
    MUL   A_3, K_0              ; Hi1 * Lo2
    CLR	  K_0
    ADD	  B_1, r0
    ADC	  B_2, r1
    ADC	  B_3, K_0
    MUL	  K_1, A_2              ; Hi2 * Lo1
    ADD	  B_1, r0
    ADC	  B_2, r1
    ADC	  B_3, K_0

; R[4]= R[4]+B[4]
    ADD   R_0, B_0
    ADC   R_1, B_1
    ADC   R_2, B_2
    ADC   R_3, B_3

; C[4]= B[4]*5
    LDI   K_0, 5
    MUL   B_0, K_0
    MOV   C_0, r0
    MOV   C_1, r1

    MUL   B_1, K_0
    CLR   C_2
    ADD   C_1, r0
    ADC   C_2, r1

    MUL   B_2, K_0
    CLR   C_3
    ADD   C_2, r0
    ADC   C_3, r1

    MUL   B_3, K_0
    ADD   C_3, r0

; goto loop1
    RJMP  loop32b

;=============

div16b_5:
; C[2]= B[4]= 0
    CLR   C_0
    CLR   C_1
    CLR   B_0
    CLR   B_1
    CLR   B_2
    CLR   B_3

loop16b:
; A[2]= A[2]-C[2]
    SUB   A_0, C_0
    SBC   A_1, C_1

; if A[2]<2^8, div8b_5
    TST   A_1
    BREQ  div8b_5

; B[2]= (A[2]/2^8)*51
    LDI   K_0, 51
    MUL   A_1, K_0
    MOV   B_0, r0
    MOV   B_1, r1

; B[4]hi= B[4]hi+B[2]
    ADD   B_2, B_0
    ADC   B_3, B_1

; C[2]= B[2]*5
    LDI   K_0, 5
    MUL   B_0, K_0
    MOV   C_0, r0
    MOV   C_1, r1

    MUL   B_1, K_0
    ADD   C_1, r0

; goto loop16b
    RJMP  loop16b

;=============

div8b_5:
; B1=B0=C0=0
    CLR   C_0
    CLR   B_0

loop4b:
; A0= A0-C0
    SUB   A_0, C_0

; if A0<2^4, div2b_5
    CPI   A_0, 16
    BRLO  div2b_5

; B0= (A0/16)*3
    MOV   A_1, A_0
    SWAP  A_1
    ANDI  A_1, 0x0F
    LDI   K_0, 3
    MUL   A_1, K_0
    MOV   B_0, r0

; B1= B1+B0
    ADD   B_1, B_0

; C0= B0*5
    LDI   K_0, 5
    MUL   B_0, K_0
    MOV   C_0, r0

; goto loop4b
    RJMP  loop4b

;=============

div2b_5:
; K0=0
    CLR   K_0

loop2b:
; A0=A0-5
    SUBI  A_0, 5

; if A0<0, goto div_end
    BRLO  div_end

; K0= K0+1
    INC   K_0

; goto loop2b
    RJMP  loop2b

;=============

div_end:
; K0= K0+B1
    ADD   K_0, B_1

; K[2]= K0+B[4]hi
    CLR   K_1
    ADD   K_0, B_2
    ADC   K_1, B_3

; R[4]= R[4]+K[2]
    ADD   R_0, K_0
    ADC   R_1, K_1
    CLR   K_0
    ADC   R_2, K_0
    ADC   R_3, K_0

; POP registers

    RET

 

This topic has a solution.

Last Edited: Tue. Mar 31, 2020 - 07:08 AM
This reply has been marked as the solution. 
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

A full 32bit*32bit=64bit take less than 100 clk (if everything is in registers)  , so a mul with 0XCCCC.... where only the top 32 bit are used and then some shifts should take about the same amount of clk so about 100 clk as both min and max.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Sorry, sparrow2, I couldn't get you well.

Is it about a faster algorithm that uses 32bit*32bit?

 

Edit1:

I guess you mean

N*0x33333333/2^32

 

Edit2:

You are right, sparrow2. It seems I like complicating things to have more fun in solving a problem, I guess laugh... like using SPI to generate sinewave signals sad  

 

 

Last Edited: Mon. Mar 30, 2020 - 01:27 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I think you should use a lookup table for the inner loop to divide an arbitrary 8-bit value by 5. Costs 256 bytes of flash, saves a LOT of cycles.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

(0xFFFFFFFF * 0xCCCCCCCD) >> 18

When I wrote this in assembler, it was 146 clocks from call to return.

Last Edited: Tue. Mar 31, 2020 - 06:33 AM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0
;***************************************************
;* Mutiply 32x32 -> 64 bit
;*
;*  R24:R21 x R5:R2 -> R15:R8
;*
;*  88 cycles + 4 (RET) = 92 Cycles
;*  86 cycles + 4 (RET) = 90 Cycles
;*
;* http://www.vfx.hu/avr/download/mult32.asm
mult32:
		clr	R16
		mul	R21,R2
		movw	R8,R0
		clr	R10
		clr	R11
                movw R12,R10
                movw R14,R10
;		clr	R12
;		clr	R13
;		clr	R14
;		clr	R15
		mul	R22,R2
		add	R9,R0
		adc	R10,R1
		mul	R23,R2
		add	R10,R0
		adc	R11,R1
		mul	R24,R2
		add	R11,R0
		adc	R12,R1

		mul	R21,R3
		add	R9,R0
		adc	R10,R1
		adc	R11,R16
		adc	R12,R16
		adc	R13,R16
		mul	R22,R3
		add	R10,R0
		adc	R11,R1
		adc	R12,R16
		adc	R13,R16
		mul	R23,R3
		add	R11,R0
		adc	R12,R1
		adc	R13,R16
		mul	R24,R3
		add	R12,R0
		adc	R13,R1

		mul	R21,R4
		add	R10,R0
		adc	R11,R1
		adc	R12,R16
		adc	R13,R16
		adc	R14,R16
		mul	R22,R4
		add	R11,R0
		adc	R12,R1
		adc	R13,R16
		adc	R14,R16
		mul	R23,R4
		add	R12,R0
		adc	R13,R1
		adc	R14,R16
		mul	R24,R4
		add	R13,R0
		adc	R14,R1

		mul	R21,R5
		add	R11,R0
		adc	R12,R1
		adc	R13,R16
		adc	R14,R16
		adc	R15,R16
		mul	R22,R5
		add	R12,R0
		adc	R13,R1
		adc	R14,R16
		adc	R15,R16
		mul	R23,R5
		add	R13,R0
		adc	R14,R1
		adc	R15,R16
		mul	R24,R5
		add	R14,R0
		adc	R15,R1
		ret

 

Last Edited: Tue. Mar 31, 2020 - 07:45 AM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

And remember that this is a full 32*32=64

 

And it really spring in my eyes that there are 6 clr in a row, it could have been 2 clr and 2 movw, save two clk and 4 bytes.

 

 

Last Edited: Tue. Mar 31, 2020 - 07:42 AM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

pleasant.

 

;***************************************************
;* Mutiply 32x32 -> 64 bit
;* 
;*  R24:R21 x R5:R2 -> R15:R8
;*
;*  76 cycles + 4 (RET) = 80 Cycles
;*
mult32:
        clr R16
        mul     R21,R2
        movw    R8,R0
        mul     R22,R3
        movw    R10,R0
        mul     R23,R4
        movw    R12,R0
        mul     R24,R5
        movw    R14,R0

        mul     R22,R2
        add     R9,R0
        adc     R10,R1
        mul     R23,R2
        add     R10,R0
        adc     R11,R1
        mul     R24,R2
        add     R11,R0
        adc     R12,R1

        mul     R21,R3
        add     R9,R0
        adc     R10,R1
        adc     R11,R16
        adc     R12,R16
        adc     R13,R16
        mul     R23,R3
        add     R11,R0
        adc     R12,R1
        adc     R13,R16
        mul     R24,R3
        add     R12,R0
        adc     R13,R1

        mul     R21,R4
        add     R10,R0
        adc     R11,R1
        adc     R12,R16
        adc     R13,R16
        adc     R14,R16
        mul     R22,R4
        add         R11,R0
        adc     R12,R1
        adc     R13,R16
        adc     R14,R16
        mul     R24,R4
        add     R13,R0
        adc     R14,R1

        mul     R21,R5
        add     R11,R0
        adc     R12,R1
        adc     R13,R16
        adc     R14,R16
        adc     R15,R16
        mul     R22,R5
        add     R12,R0
        adc     R13,R1
        adc     R14,R16
        adc     R15,R16
        mul     R23,R5
        add     R13,R0
        adc     R14,R1
        adc     R15,R16
        ret

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Long division gives 58 cycles in a C-callable function:

  .global divide32u5
  divide32u5:
  ; input is R27:24, result goes back there
  ; uses long division with byte-sized digits
  ; R21:18 holds the quotient before being moved to R27:24
  ; Of the registers used, the avr-gcc ABI only requires R1 to be restored
  .def C5 = R30
  LDI C5, 5
  .def C51 = R31
  LDI C51, 51

  .macro div i1, i0, q, flag
     ; q = i1:0/5, i0-=5*q
     MUL \i0, C51
     MOV \q, R1
     .if \flag
       ; i1<=4
       MUL \i1, C51
       ADD \q, R0
     .ifend
     MUL \q, C5
     SUB \i0, R0
     ; q<=9
     CP \i0, C5
     BRLT 1f
       SUB \i0, CR
       INC \q
     1: ; i0 <= 4
  .endm

  div  -1, R27, R21, 0
  div R27, R26, R20, 1
  div R26, R25, R19, 1
  div R25, R24, R18, 1

  MOVW R24, R18
  MOVW R26, R20

  CLR R1

  RET

The 58 is an upper bound that includes the ret, but not the call.

Moderation in all things. -- ancient proverb

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

This is my entry for the division by 5 "contest", numeric quality may not be the best even though I gave it some thought, but anyways... should take about 40 cycles if tweaked to be a C called function, I just used the first registers that came to mind.

 


.equ	testval = 100000000
.def	zero = r25
.def	magic = r24

start:
	ldi	r19, byte4(testval)
	ldi	r18, byte3(testval)
	ldi	r17, byte2(testval)
	ldi	r16, low(testval)

	ldi	magic, 0xCC
	clr	zero

	mov	r1, r16
	add	r16, r17
	adc	r17, r18
	adc	r18, r19
	adc	r19, zero

	ror	r19
	ror	r18
	ror	r17
	ror	r16
	ror	r1

	adc	r1, r17
	adc	r16, r18
	adc	r17, r19
	adc	r18, zero
	adc	r19, zero

	mul	r19, magic
	movw	r22, r0
	mul	r17, magic
	movw	r20, r0

	mul	r18, magic
	movw	r2, r0
	mul	r16, magic

	sec
	adc	r20, r1
	adc	r21, r2
	adc	r22, r3
	adc	r23, zero

	ror	r23
	ror	r22
	ror	r21
	ror	r20

	rjmp	start

 

edit: forgot to say, the result will be in r20:r23

Last Edited: Wed. Apr 1, 2020 - 10:44 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

 forgot to say, the result will be in r20:r23

why not do this:

 

.def result_H = r20

.def result_L = r23

 

Then there is less confusion.  Also, the assembler will issue a warning if the same register is used elsewhere with another name---so potential conflicts can be spotted and checked (allowed or disallowed)

When in the dark remember-the future looks brighter than ever.   I look forward to being able to predict the future!

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Thank you, El Tangas.

It happens that, in my actual application, I need just 16-bit / 10 which is somehow equivalent to 16-bit / 5.

I will try to deduce its algorithm (in assembly) based on yours on #10.

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I just like to point out that, in my first test of this very fast algorithm (post #10), 0xFFFFFF0/5 gave 0x3333332F (the right answer – 1).

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

You're right it's messing up large numbers. Well, I expected precision would not be good, I guess I cut too many corners.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

No because that is wrong, the result is 4 byte in size (r20:r23)  

 

And personally I hate to use .def  that make it hard to read. And for an AVR you need to remember if it's a high or low register.

 

But relevant comments are always welcome.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I should have been more precise, the result is in r23:r22:r21:r20.

 

Anyway, I translated the algo to C so that I can test it better on a PC. I think I see some ways to improve precision without much cost.

 

uint64_t div5(uint64_t value) {
	uint64_t tmp;

	value += value << 8;
	tmp = value & 1;
	value >>= 1;
	value += (value >> 16) + tmp;
	value >>= 8;
	value *= 0xCC;
	value += 0x100;
	value >>= 9;

	return value;
}

 

The uint64_t are just to provide room for intermediate calculations, the algorithm is for 32 bit.

 

edit: first incorrect result for multiples of 5 is 0x8C0001FE (2348810750 decimal).

Last Edited: Thu. Apr 2, 2020 - 01:02 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Ok, I fixed it at the cost of 5 cycles crying

 

.equ	testval = 0xFFFFFFF0
.def	zero = r25
.def	magic = r24

start:
        ldi	r19, byte4(testval)
        ldi	r18, byte3(testval)
        ldi	r17, byte2(testval)
        ldi	r16, low(testval)

        ldi	magic, 0xCC
        clr	zero

        mov	r1, r16
        add	r16, r17
        adc	r17, r18
        adc	r18, r19
        adc	r19, zero

        ror	r19
        ror	r18
        ror	r17
        ror	r16
        ror	r1

        adc	r1, r17
        adc	r16, r18
        adc	r17, r19
        adc	r18, zero
        adc	r19, zero

        ;extra rounding step to correct numeric errors
        add	r1, r1
        adc	r16, zero
        adc	r17, zero
        adc	r18, zero
        adc	r19, zero

        mul	r19, magic
        movw	r22, r0
        mul	r17, magic
        movw	r20, r0

        mul	r18, magic
        movw	r2, r0
        mul	r16, magic

        sec
        adc	r20, r1
        adc	r21, r2
        adc	r22, r3
        adc	r23, zero

        ror	r23
        ror	r22
        ror	r21
        ror	r20

        rjmp	start
        

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Have you checked for all values ?

 

It would not surprise me if you don't need all the adc's with "zero", because there never is a carry :) 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

sparrow2 wrote:
Have you checked for all values ?

 

Yes but I'm checking with C on a PC, to check in such detail would be a bit of a pain. Checking on an AVR with the actual code would probably take an hour or more.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

on real cpu. 

Check that the result get 1 bigger every 5th time.

 

that would give about 65 clk loop time or about 5 hours on a 16 MHz AVR.

 

So show the PC code you use, and see if we can "hack" some kind of check into that :) 

 

 

To make 100% sure that the code in #17 work you should actually run all combinations ;)

It's a nice fast way to divide with 5 (and also 10). 

Last Edited: Thu. Apr 2, 2020 - 02:47 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

sparrow2 wrote:

 

So show the PC code you use, and see if we can "hack" some kind of check into that :) 

 

Here, it's functionally equivalent but of course the arithmetic ops are not 8 bit. That's the problem, to do fine testing the operations need to be decomposed.

 


// test program for div5 algo

#include <stdio.h>
#include <stdint.h>

uint64_t div5(uint64_t);

int main()
{
	for (uint64_t testval = 0; testval <= 0x33333333; testval ++) {
		if (testval != div5(testval * 5)) {
			printf("Incorrect result for n=%x\n", testval *5);
			break;
		}
	}
    return 0;
}

uint64_t div5(uint64_t value) {
	uint64_t tmp;

	value += value << 8;
	tmp = value & 1;
	value >>= 1;
	value += (value >> 16) + tmp;
	value += value & 0xFF;
	value >>= 8;
	value *= 0xCC;
	value += 0x100;
	value >>= 9;

	return value;
}

 

edit: C was a previous version. Replaced by more accurate representation of the asm code.

Last Edited: Thu. Apr 2, 2020 - 03:48 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Ah, I devised a simple hack to tilt the calculations in the right direction, I changed the lower order 0xCC into 0xCD. Now, I don't think this is actually mathematically equivalent to multiply by 0xCCCCCCCD, but it seems to be close enough to give the illusion of valid results cheeky

Values that are not multiples of 5 will sometimes be round to nearest, sometimes not.

 

C test version:

 


// test program for div5 algo

#include <stdio.h>
#include <stdint.h>

uint64_t div5(uint64_t);

int main()
{
	for (uint64_t testval = 0; testval <= 0x33333333; testval ++) {
		if (testval != div5(testval * 5)) {
			printf("Incorrect result for n=%x\n", testval *5);
			break;
		}
	}
    return 0;
}

uint64_t div5(uint64_t value) {
	uint64_t tmp;
	uint64_t mul0;
	uint64_t mul1;
	uint64_t mul2;
	uint64_t mul3;

	value += value << 8;
	tmp = value & 1;
	value >>= 1;
	value += (value >> 16) + tmp;
	value >>= 8;

	mul0 = (value & 0xFF) * 0xCD;
	mul1 = (value & 0xFF00) * 0xCC;
	mul2 = (value & 0xFF0000) * 0xCC;
	mul3 = (value & 0xFF000000) * 0xCC;

	value = mul0 + mul1 + mul2 + mul3;
	value += 0x100;
	value >>= 9;

	return value;
}

 

Asm version:

.equ	testval = 0xFFFFFFF0
.def	zero = r25
.def	magic = r24

start:
	ldi	r19, byte4(testval)
	ldi	r18, byte3(testval)
	ldi	r17, byte2(testval)
	ldi	r16, low(testval)

	ldi	magic, 0xCC
	clr	zero

	mov	r1, r16
	add	r16, r17
	adc	r17, r18
	adc	r18, r19
	adc	r19, zero

	ror	r19
	ror	r18
	ror	r17
	ror	r16
	ror	r1

	adc	r1, r17
	adc	r16, r18
	adc	r17, r19
	adc	r18, zero
	adc	r19, zero

	mul	r19, magic
	movw	r22, r0
	mul	r17, magic
	movw	r20, r0

	mul	r18, magic
	movw	r2, r0
	inc	magic
	mul	r16, magic

	sec
	adc	r20, r1
	adc	r21, r2
	adc	r22, r3
	adc	r23, zero

	ror	r23
	ror	r22
	ror	r21
	ror	r20

	rjmp	start

 

Now the cost to give correct results is just 1 cycle, so I'm happy.