As you know, a function can be made faster if more registers are used in its algorithm.
Based on this, I guess the following code (div32b_5: in assembly) is the fastest one for dividing 32 bits by the constant 5.
Edit:
Sorry, my guess happened to be wrong (see next post)
It starts with 32 bits, followed by 16 bits, then 8 bits, and ends at 4 bits.
Minimum and Maximum cycles are 62 [0x00000000/5] and 254 [0xFFFFFFFF/5].
Note: "div16b_5:" and "div8b_5:" could also be the function entry with a minor addition.
As usual, your comments are welcome.
;=====================================
;=====================================
Symbolic Code of "div32b_5:"
x in Y[x] is the number of bytes in register Y (e.g. N[4] is the 4-byte register N_3:N_0)
N[4] the original 32-bit number
R[4] the result of N[4]/5
A[4], B[4], C[4] and K[2] temporary registers
B[4]hi <==> B3:B2
A[2]<==> A1:A0
B[2]<==> B1:B0
C[2]<==> C1:C0
;======================
div32b_5:
; PUSH registers
A[4]= N[4]
C[4]= R[4]= 0
loop32b:
A[4]= A[4]-C[4]
if A[4]<2^16, div16b_5
B[4]= (A[4]/2^16)*13107
R[4]= R[4]+B[4]
C[4]= B[4]*5
goto loop32b
;==========
div16b_5:
C[2]= B[4]= 0
loop16b:
A[2]= A[2]-C[2]
if A[2]<2^8, div8b_5
B[2] = (A[2]/2^8)*51
B[4]hi= B[4]hi+B[2]
C[2] = B[2]*5
goto loop16b
;==========
div8b_5:
B1=B0=C0=0
loop4b:
A0= A0-C0
if A0<2^4, div2b_5
B0= (A0/16)*3
B1= B1+B0
C0= B0*5
goto loop4b
;==========
div2b_5:
K0=0
loop2b:
A0=A0-5
if A0<0, goto div_end
K0= K0+1
goto loop2b
;==========
div_end:
K0 = K0+B1
K[2]= K0+B[4]hi
R[4]= R[4]+K[2]
; POP registers
RET
;=====================================
;=====================================
;==============================
; Test div32b_5
; Division 32-bit register by 5
;==============================
; Syntax: AVR assembler (.def / .equ directives, one instruction per line).
;
; div32b_5
;   In:    N_3:N_0  unsigned 32-bit dividend (only read, never written)
;   Out:   R_3:R_0  = N_3:N_0 / 5 (unsigned, truncated)
;   Clobb: r1:r0 (MUL result), K_1:K_0, A_3:A_0, B_3:B_0, C_3:C_0, SREG
;          Nothing is saved/restored; the "PUSH/POP registers" markers
;          below are the author's placeholders for that.
;
; Method: the remainder A is reduced in stages. Each stage multiplies the
; upper part of A by floor((2^k - 1) / 5), adds that partial quotient to
; the result, and subtracts 5 * (partial quotient) from A, repeating until
; A fits in the next narrower width:
;   32-bit stage: (A >> 16) * 13107     (13107 * 5 = 65535)
;   16-bit stage: (A >>  8) * 51        (51    * 5 = 255)
;    8-bit stage: (A >>  4) * 3         (3     * 5 = 15)
;   last stage  : repeated subtraction of 5 while A >= 5 (A < 16 here)

.def K_1 = r25                  ; temporary for constants
.def K_0 = r24

.def N_3 = r23                  ; the original 32-bit number
.def N_2 = r22
.def N_1 = r21
.def N_0 = r20

.def A_3 = r19                  ; temporary: running remainder
.def A_2 = r18
.def A_1 = r17
.def A_0 = r16

.def R_3 = r15                  ; result of N_3:N_0 / 5
.def R_2 = r14
.def R_1 = r13
.def R_0 = r12

.def B_3 = r11                  ; temporary: partial quotients
.def B_2 = r10
.def B_1 = r9
.def B_0 = r8

.def C_3 = r7                   ; temporary: 5 * partial quotient
.def C_2 = r6
.def C_1 = r5
.def C_0 = r4

; dividend. Author-reported cycle counts for the original listing were
; minimum 62 and maximum 254; the CLR B_1 added at div8b_5 lies on every
; path and costs one more cycle.
.equ dd_Q = 0xFF
.equ dd_U = 0xFF
.equ dd_H = 0xFF
.equ dd_L = 0xFF

REPEAT:
        LDI     N_3, dd_Q
        LDI     N_2, dd_U
        LDI     N_1, dd_H
        LDI     N_0, dd_L
        RCALL   div32b_5
        RJMP    REPEAT

;====================
div32b_5:
        ; PUSH registers

        ; A[4] = N[4]
        MOV     A_0, N_0
        MOV     A_1, N_1
        MOV     A_2, N_2
        MOV     A_3, N_3

        ; C[4] = R[4] = 0
        CLR     C_0
        CLR     C_1
        CLR     C_2
        CLR     C_3
        CLR     R_0
        CLR     R_1
        CLR     R_2
        CLR     R_3

loop32b:
        ; A[4] = A[4] - C[4]
        SUB     A_0, C_0
        SBC     A_1, C_1
        SBC     A_2, C_2
        SBC     A_3, C_3

        ; if A[4] < 2^16 (upper word is zero), continue with the 16-bit stage
        TST     A_3
        BRNE    cnt_32b
        TST     A_2
        BREQ    div16b_5

cnt_32b:
        ; B[4] = (A[4] / 2^16) * 13107      -- 16x16 -> 32-bit multiply
        LDI     K_1, high(13107)
        LDI     K_0, low(13107)

        MUL     A_3, K_1                ; Hi1 * Hi2
        MOVW    B_3:B_2, r1:r0
        MUL     A_2, K_0                ; Lo1 * Lo2
        MOVW    B_1:B_0, r1:r0
        MUL     A_3, K_0                ; Hi1 * Lo2
        CLR     K_0                     ; K_0 = 0, used to propagate carry only
        ADD     B_1, r0
        ADC     B_2, r1
        ADC     B_3, K_0
        MUL     K_1, A_2                ; Hi2 * Lo1
        ADD     B_1, r0
        ADC     B_2, r1
        ADC     B_3, K_0

        ; R[4] = R[4] + B[4]
        ADD     R_0, B_0
        ADC     R_1, B_1
        ADC     R_2, B_2
        ADC     R_3, B_3

        ; C[4] = B[4] * 5
        ; Each byte product is at most 255 * 5 = 0x04FB, so r1 <= 4 and
        ; "r1 + carry" written into a freshly cleared byte cannot overflow.
        LDI     K_0, 5
        MUL     B_0, K_0
        MOV     C_0, r0
        MOV     C_1, r1
        MUL     B_1, K_0
        CLR     C_2
        ADD     C_1, r0
        ADC     C_2, r1
        MUL     B_2, K_0
        CLR     C_3
        ADD     C_2, r0
        ADC     C_3, r1
        MUL     B_3, K_0
        ADD     C_3, r0

        ; goto loop32b
        RJMP    loop32b

;=============
div16b_5:
        ; C[2] = B[4] = 0
        ; From here on B_3:B_2 accumulates the 16-bit-stage quotient and
        ; B_1:B_0 holds the partial quotient of the current iteration.
        CLR     C_0
        CLR     C_1
        CLR     B_0
        CLR     B_1
        CLR     B_2
        CLR     B_3

loop16b:
        ; A[2] = A[2] - C[2]
        SUB     A_0, C_0
        SBC     A_1, C_1

        ; if A[2] < 2^8 (high byte is zero), continue with the 8-bit stage
        TST     A_1
        BREQ    div8b_5

        ; B[2] = (A[2] / 2^8) * 51
        LDI     K_0, 51
        MUL     A_1, K_0
        MOV     B_0, r0
        MOV     B_1, r1

        ; B[4]hi = B[4]hi + B[2]
        ADD     B_2, B_0
        ADC     B_3, B_1

        ; C[2] = B[2] * 5
        LDI     K_0, 5
        MUL     B_0, K_0
        MOV     C_0, r0
        MOV     C_1, r1
        MUL     B_1, K_0
        ADD     C_1, r0

        ; goto loop16b
        RJMP    loop16b

;=============
div8b_5:
        ; B1 = B0 = C0 = 0
        ; B_1 becomes the accumulator of the 8-bit stage and is added to the
        ; result in div_end. It must be cleared here: after the 16-bit stage
        ; it still holds the high byte of the last partial quotient, which is
        ; already contained in B_3:B_2 (without this, 1536 / 5 yields 308).
        CLR     C_0
        CLR     B_0
        CLR     B_1

loop4b:
        ; A0 = A0 - C0
        SUB     A_0, C_0

        ; if A0 < 2^4, finish with repeated subtraction
        CPI     A_0, 16
        BRLO    div2b_5

        ; B0 = (A0 / 16) * 3
        ; A_1 is zero at this point (that is why this stage was entered), so
        ; it is reused as scratch for the high nibble of A_0.
        MOV     A_1, A_0
        SWAP    A_1
        ANDI    A_1, 0x0F
        LDI     K_0, 3
        MUL     A_1, K_0
        MOV     B_0, r0

        ; B1 = B1 + B0
        ADD     B_1, B_0

        ; C0 = B0 * 5
        LDI     K_0, 5
        MUL     B_0, K_0
        MOV     C_0, r0

        ; goto loop4b
        RJMP    loop4b

;=============
div2b_5:
        ; K0 = 0
        CLR     K_0

loop2b:
        ; A0 = A0 - 5
        SUBI    A_0, 5

        ; if A0 < 0 (borrow), goto div_end
        BRLO    div_end

        ; K0 = K0 + 1
        INC     K_0

        ; goto loop2b
        RJMP    loop2b

;=============
div_end:
        ; K0 = K0 + B1          (last-stage count + 8-bit-stage quotient)
        ADD     K_0, B_1

        ; K[2] = K0 + B[4]hi    (+ 16-bit-stage quotient)
        CLR     K_1
        ADD     K_0, B_2
        ADC     K_1, B_3

        ; R[4] = R[4] + K[2]
        ADD     R_0, K_0
        ADC     R_1, K_1
        CLR     K_0                     ; CLR leaves the carry flag untouched
        ADC     R_2, K_0
        ADC     R_3, K_0

        ; POP registers
        RET