Versatile AVR Code for Dividing 32-Bit Number by 24-Bit Constant

Go To Last Post
17 posts / 0 new
Author
Message
#1
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

The fast algorithm used to divide 16 bits by 8-bit constant works also for 24bits-16bits, 32bits-24bits... etc.

 

The two previous related threads are:

https://www.avrfreaks.net/forum/versatile-avr-code-dividing-16-bits-8-bit-constant

https://www.avrfreaks.net/forum/fast-avr-code-dividing-16-bit-number-8-bit-number

 

Here is, how to divide 32-bit number [0 to 4,294,967,295] by 24-bit constant [2 to 16,777,215] in 142 clocks maximum:

 

;===============================================================
; *R* Division of 32-bit register {A} by 24-bit constant {N} ***
;===============================================================

; {R} = {A}/{N} = r31,r30,r29,r28 / {N}
;     = r31,r30,r29,r28 * {Kd} /256/256/256/256 then rounding
; {A} dividend in r19,r18,r17,r16 [0 to 2^32-1]
; {N} constant divisor [N=2 to 2^24-1]
; {R} result in r31,r30,r29,r28 = {A}/{N} rounded
; {Kd} = round(256*256*256*256/{N},0) , the division constant
; {Kr} = INT[ (1-{N})/2 ] , the rounding constant
; not used registers: r25:r24, r12:r2
; 110 words , excluding RET and RCALL
; 134, 139, 140, or 142 cycles = 127c|132c|133c|135c code + 4c RET + 3c RCALL

.set reg_N = 70000              ; {N}
.set reg_Kd=(2*2^32/reg_N+1)/2  ; {Kd}
.set reg_Kr=(1-reg_N)/2         ; {Kr}

D32_nnn:
    LDI   r20,   low(reg_Kd)    ;1abcd
    LDI   r21,  high(reg_Kd)    ;1abcd
    LDI   r22, byte3(reg_Kd)    ;1abcd
    LDI   r23, byte4(reg_Kd)    ;1abcd, 4abcd
    LDI   r23, byte3(reg_Kd)    ;1abcd, 4abcd
; r23:r20 = reg_Kd

; multiplicand in r19,r18,r17,r16 = {A}
; multiplier   in r23,r22,r21,r20 = {Kd}
; mul. result  in r31:r26, r14:r13
; valid result in r31,r30,r29,r28
    CLR r15                 ;1abcd
    MUL r16,r20             ;2abcd
    MOV r13,r0              ;1abcd
    MOV r14,r1              ;1abcd
    MUL r17,r21             ;2abcd
    MOVW r27:r26,r1:r0      ;1abcd
    MOV r26,r0              ;1abcd
    MOV r27,r1              ;1abcd
    MUL r18,r22             ;2abcd
    MOVW r29:r28,r1:r0      ;1abcd
    MOV r28,r0              ;1abcd 
    MOV r29,r1              ;1abcd
    MUL r19,r23             ;2abcd
    MOVW r31:r30,r1:r0      ;1abcd
    MOV r30,r0              ;1abcd
    MOV r31,r1              ;1abcd

    MUL r17,r20             ;2abcd
    ADD r14,r0              ;1abcd
    ADC r26,r1              ;1abcd
    MUL r18,r20             ;2abcd
    ADD r26,r0              ;1abcd
    ADC r27,r1              ;1abcd
    MUL r19,r20             ;2abcd
    ADD r27,r0              ;1abcd
    ADC r28,r1              ;1abcd

    MUL r16,r21             ;2abcd
    ADD r14,r0              ;1abcd
    ADC r26,r1              ;1abcd
    ADC r27,r15             ;1abcd
    ADC r28,r15             ;1abcd
    ADC r29,r15             ;1abcd
    MUL r18,r21             ;2abcd
    ADD r27,r0              ;1abcd
    ADC r28,r1              ;1abcd
    ADC r29,r15             ;1abcd
    MUL r19,r21             ;2abcd
    ADD r28,r0              ;1abcd
    ADC r29,r1              ;1abcd

    MUL r16,r22             ;2abcd
    ADD r26,r0              ;1abcd
    ADC r27,r1              ;1abcd
    ADC r28,r15             ;1abcd
    ADC r29,r15             ;1abcd
    ADC r30,r15             ;1abcd
    MUL r17,r22             ;2abcd
    ADD r27,r0              ;1abcd
    ADC r28,r1              ;1abcd
    ADC r29,r15             ;1abcd
    ADC r30,r15             ;1abcd
    MUL r19,r22             ;2abcd
    ADD r29,r0              ;1abcd
    ADC r30,r1              ;1abcd

    MUL r16,r23             ;2abcd
    ADD r27,r0              ;1abcd
    ADC r28,r1              ;1abcd
    ADC r29,r15             ;1abcd
    ADC r30,r15             ;1abcd
    ADC r31,r15             ;1abcd
    MUL r17,r23             ;2abcd
    ADD r28,r0              ;1abcd
    ADC r29,r1              ;1abcd
    ADC r30,r15             ;1abcd
    ADC r31,r15             ;1abcd
    MUL r18,r23             ;2abcd
    ADD r29,r0              ;1abcd
    ADC r30,r1              ;1abcd
    ADC r31,r15             ;1abcd +80abcd=84abcd
; {B} = {A}*{Kd}/256/256/256/256
; {R} = {B} or {B}+1

; for rounding
    LDI   r26, byte3(reg_N)     ;1abcd
    LDI   r25,  high(reg_N)     ;1abcd
    LDI   r24,   low(reg_N)     ;1abcd, +3abcd=  87abcd
; r26:r24 = {N}

; multiplicand in r31,r28 = {B}
; multiplier   in r26,r24 = {N}
; valid result in r23,r20
    MUL r28,r24             ;2abcd
    MOVW r21:r20,r1:r0      ;1abcd
    MOV r20,r0              ;1abcd
    MOV r21,r1              ;1abcd
    MUL r29,r25             ;2abcd
    MOVW r23:r22,r1:r0      ;1abcd
    MOV r22,r0              ;1abcd
    MOV r23,r1              ;1abcd

    MUL r29,r24             ;2abcd
    ADD r21,r0              ;1abcd
    ADC r22,r1              ;1abcd
    MUL r30,r24             ;2abcd
    ADD r22,r0              ;1abcd
    ADC r23,r1              ;1abcd
    MUL r31,r24             ;2abcd
    ADD r23,r0              ;1abcd

    MUL r28,r25             ;2abcd
    ADD r21,r0              ;1abcd
    ADC r22,r1              ;1abcd
    ADC r23,r15             ;1abcd
    MUL r30,r25             ;2abcd
    ADD r23,r0              ;1abcd

    MUL r28,r26             ;2abcd
    ADD r22,r0              ;1abcd
    ADC r23,r1              ;1abcd
    MUL r29,r26             ;2abcd
    ADD r23,r0              ;1abcd +34abcd=121abcd
; {C} = r23:r20 = {B}*{N} = r31:r28 * r26:r24

; the following conditions were deduced empirically
; if( Carry_1=0, {R}={B}, if( Zero_2=1 OR Carry_2=0, {R}={B}, {R}={B}+1 ) )

    SUB   r20, r16              ;1abcd
    SBC   r21, r17              ;1abcd
    SBC   r22, r18              ;1abcd
    SBC   r23, r19              ;1abcd
; {D1} = r23:r20 = {C} - {A} = r23:r20 - r19:r16

    BRCC  DIV32_R               ;2a|1bcd +6a=[127a]
; if Carry_1=0, {R}={B}

    SUBI  r20,   low(reg_Kr)    ;1bcd
    SBCI  r21,  high(reg_Kr)    ;1bcd
    SBCI  r22, byte3(reg_Kr)    ;1bcd
    SBCI  r23, byte4(reg_Kr)    ;1bcd
; {D2} = r23:r20 = {D1} - {Kr} = r23:r20 - {Kr}

    BRPL  DIV32_R               ;2b|1cd, +11b=[132b]
; if {D2} positive, {R}={B} 

    BREQ  DIV32_R               ;2b|1cd, +11b=[132b]
; if Zero_2=1, {R}={B}

    BRCC  DIV32_R               ;2c|1d, +12c=[133c]
; if Carry_2=0, {R}={B}

    ADIW  r29:r28, 1            ;1d
    SUBI  r28,   low(-1)        ;1d
    SBCI  r29,  high(-1)        ;1d
    SBCI  r30, byte3(-1)        ;1d
    SBCI  r31, byte4(-1)        ;1d, +14d=[135d]
; {R}={B}+1

DIV32_R:
    RET                         ;4
; {R} = r31:r28 = {B} or {B}+1 [ {A}/{N} rounded ]

 

Edited: (thanks to 'grohote' post #2)

One bug fixed [byte3 to byte4]

Code cycles decreased by 3 cycles.

Code words decreased by 3 words.

 

Edited:

Code cycles decreased by another 2 cycles.

Code words decreased by another 2 words.

 

Edited:

 

Edited: (thanks to 'grohote' post #6)

Code cycles decreased by another 1 cycle.

Code words decreased by another 1 word.

 

Edited: (thanks to 'grohote' post #12)

Code cycles increased by 1 cycle.

Code words increased by 1 word.

 

Last Edited: Fri. Feb 25, 2022 - 10:02 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Welcome back, with new goodies!

My question is related to this line:

 

     LDI r23, byte3(reg_Kd) ; byte4 ?

 

also, some movw saves here:

 

    MOV r26,r0  -> movw r26,r0
    MOV r27,r1              
    
    MOV r28,r0               
    MOV r29,r1               
    
    MOV r30,r0              
    MOV r31,r1               

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

grohote wrote:

Welcome back, with new goodies!

My question is related to this line:

 

     LDI r23, byte3(reg_Kd) ; byte4 ?

 

also, some movw saves here:

 

    MOV r26,r0  -> movw r26,r0
    MOV r27,r1              
    
    MOV r28,r0               
    MOV r29,r1               
    
    MOV r30,r0              
    MOV r31,r1               

 

 

And welcome back with good remarks. Thank you.

I hope this proves that I am human, not a robot, who may do silly mistakes anyrtime :)

 

By the way, for every new program I wrote, I had to fix about 100 bugs when I am real lucky; otherwise it would be many hundreds instead, before I let the program/code run as I like it to do. So I used, since always, to be very very patient anytime I work on a new project.

For example, about 45 years ago, when I liked to prove, by an MS thesis, that demodulating two symmetrical side bands (as in AM double side-band suppressed carrier system, DSB-SC) has to be, based on logic, much simpler and more reliable than demodulation just one side band (as in AM single side-band suppressed carrier system, SSB-SC), I almost lived about 3 months at the university's laboratory. During these 90 days, I lived a failure in every new experiment, almost daily. But, on the last day on which I was allowed to be at the lab, I did my last experiment while I was more than sure that it will be another failure though it was my last hope. To my big surprise (and of those who worked at the lab), it worked as in magic! Unfortunately, the scientific world, even in these days, has no right to recognize the existence of this simple reliable 1-PLL topology for demodulating DSB-SC (actually any AM signal). I couldn't submit my work as a thesis for financial reasons. So, when I returned home, I just used it in my scrambled private short-range RF links, for a few years, till I got a phone line at home.

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Not good

    MUL r18,r22             ;2abcd
    MOV r28,r0         <-
    MOVW r29:r28,r1:r0      ;1abcd
    

Why discarded
    MUL r19,r23             ;2abcd    

 

Not gradual mul (??) (Just I wonder)
    MUL r16,r21             ;2abcd
    ...            ;<- missing r17  
             
    MUL r17,r22             ;2abcd
    ...            ;<- missing r18  
    

    MUL r28,r25             ;2abcd
    ...            ;<- missing r29  
    

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

grohote wrote:

Not good

    MUL r18,r22             ;2abcd
    MOV r28,r0         <-
    MOVW r29:r28,r1:r0      ;1abcd

 

Corrected, thanks

 

grohote wrote:

Not gradual mul (??) (Just I wonder)
    MUL r16,r21             ;2abcd
    ...            ;<- missing r17  
             
    MUL r17,r22             ;2abcd
    ...            ;<- missing r18  
    

 

They are made at the beginning of the multiplication

 

grohote wrote:

    MUL r28,r25             ;2abcd
    ...            ;<- missing r29  

 

Also it is made at the beginning of the multiplication

 

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Not necessary to mystify those two, just use 255 instead.
    SBCI  r30, byte3(-1)     
    SBCI  r31, byte4(-1)    
   
    
Perhaps this can be reduced (?)
    BREQ  DIV32_R   
    BRCC  DIV32_R  

To:
    BRPL  DIV32_R     

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

grohote wrote:

Not necessary to mystify those two, just use 255 instead.
    SBCI  r30, byte3(-1)     
    SBCI  r31, byte4(-1)    
   

 

I agree with you. And if use 0xFF or 255, I won't be surprised hearing someone: "From where you got your 255?" smiley 

 

grohote wrote:

Perhaps this can be reduced (?)
    BREQ  DIV32_R   
    BRCC  DIV32_R  

To:
    BRPL  DIV32_R     

 

You are likely right.

Truth be said, after reading the AVR instruction manual (long ago), I had no clear/sure idea when the negative N is set exactly because zero could be seen as a positive or a negative number (two's complement of 0 is 0). Then I forgot to test it later.

But now, after hearing your suggestion, it seems that zero here is seen by AVR as a positive number. Thank you.

 

Last Edited: Thu. Feb 24, 2022 - 09:36 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

KerimF wrote:
I won't be surprised hearing someone: "From where you got your 255?"

 

Agree on this, too. The lack of 'adi' function pushes us to think about the best substitution.

 

But, your byte3(-1) is definitely mystifying. How this number can have byte3, or byte4?

Are we stretching too much the value -1=255 (seems that the compiler is clever enough to guess what is needed).

 

Perhaps just '-1' can be enough, without 'byte': sbci   r30, -1

 

Personally, I am using macros for adding, but here any macro is not welcome.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

grohote wrote:

Perhaps just '-1' can be enough, without 'byte': sbci   r30, -1

 

I am afraid that [ sbci   r30, -1 ] is compiled as [ sbci   r30, low(-1) ] not as [ sbci   r30, 255 ].

 

By the way, I heard about 'byte3' and 'byte4', from AVR Studio help, only lately.

I used instead in case of 32-bit operation:

low(number/256/256) for byte3

and

high(number/256/256) for byte4

 

Last Edited: Thu. Feb 24, 2022 - 11:38 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

KerimF wrote:
I am afraid that [ sbci   r30, -1 ] is compiled as [ sbci   r30, low(-1) ] not as [ sbci   r30, 255 ].

 

But, but, there is no difference between  sbci   r30, low(-1) and[ sbci   r30, 255. Seems that the problem is much worse.

 

Try this:

    ldi    r30, 0
    clc
    sbci   r30, 255           
    sbci   r30, -1   
    sbci   r30, low(-1)  
    sbci   r30, high(-1)  
    sbci   r30, byte1(-1)
    sbci   r30, byte2(-1)
    sbci   r30, byte3(-1)
    sbci   r30, byte4(-1)  

 

When Carry is not set, each and every sbci increases by 1 . But, each and every sbci does set carry flag, I am afraid.

So, the result is 1.

This is your problem to solve.

 

Now, try ldi r30,255

Result is 1 again, because first sbci will not set carry.

Another problem, you see.

 

 

Solution: Use 32 b math

    ldi    r30, 1  
    clc
    add    r26, r30           
    adc    r27, r30
    adc    r28, r30
    adc    r29, r30

 

Last Edited: Fri. Feb 25, 2022 - 08:07 AM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

grohote wrote:

KerimF wrote:

I am afraid that [ sbci   r30, -1 ] is compiled as [ sbci   r30, low(-1) ] not as [ sbci   r30, 255 ].

 

 

But, but, there is no difference between  sbci   r30, low(-1) and[ sbci   r30, 255. Seems that the problem is much worse.

 

You are right. I was going to sleep when I read your remark. I forgot that we are talking of the special case which is (-1).

Yes, (-1)=0x....FFFFFFFF; low(-1)=high(-1)=byte2(-1)=byte3(-1)=... etc=0xFF

  

grohote wrote:

Try this:

    ldi    r30, 0
    clc
    sbci   r30, 255           
    sbci   r30, -1   
    sbci   r30, low(-1)  
    sbci   r30, high(-1)  
    sbci   r30, byte1(-1)
    sbci   r30, byte2(-1)
    sbci   r30, byte3(-1)
    sbci   r30, byte4(-1)  

 

When Carry is not set, each and every sbci increases by 1 . But, each and every sbci does set carry flag, I am afraid.

So, the result is 1.

This is your problem to solve.

 

I am not sure yet if byte1==low and byte2==high. I will check this later.

 

I can't see the problem, sorry.

if we like increasing the value of a register (assumed of 32-bit as r16 to r19) by 1, we just do this:

    sbc    r16,   low(-1)
    sbci   r17,  high(-1)
    sbci   r18, byte3(-1)
    sbci   r19, byte4(-1)
  

Therefore, if r31=r30=r29=r28=0 r19=r18=r17=r16=0, the result would be 1 as expected.

    sbc    r16,   low(-1)   ; r16 = 0 - 0xFF         = 1 & carry=1
    sbci   r17,  high(-1)   ; r17 = 0 - 0xFF - carry = 0 & carry=1
    sbci   r18, byte3(-1)   ; r18 = 0 - 0xFF - carry = 0 & carry=1
    sbci   r19, byte4(-1)   ; r19 = 0 - 0xFF - carry = 0 & carry=1

 

grohote wrote:

Now, try ldi r30,255

Result is 1 again, because first sbci will not set carry.

Another problem, you see.

      

What about the values of the other registers with r30. We are working here with 32-bit numbers, right?

 

grohote wrote:

Solution: Use 32 b math

    ldi    r30, 1  
    clc
    add    r26, r30           
    adc    r27, r30
    adc    r28, r30
    adc    r29, r30

 

Yes, it works because 1 in r30 is no more a constant.

By the way, 'clc' here is redundant because 'add' ignores 'carry'.

As you see, your example needs 5 MCU cycles and 5 words.

While 

    ADIW   r27:r26, 1

    SBCI   r28, byte3(-1) or 255

    SBCI   r29, byte4(-1) or 255

needs 3 MCU cycles and 3 words only.

 

Last Edited: Fri. Feb 25, 2022 - 09:44 AM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

You should test my examples and then write a comment.

You are wasting /our/ time because you NEVER tested this adiw-sbci solution/

 

Do test this:  

    ldi    r26,0

    clr    r27

    clr    r28

    clr    r29
    adiw   r27:r26, 1       WILL NOT SET Cy
    sbci   r28, byte3(-1)   WILL SET Cy
    sbci   r29, byte4(-1)   WILL Set Cy

 

Result is  h'01010001.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

grohote wrote:

You should test my examples and then write a comment.

You are wasting /our/ time because you NEVER tested this adiw-sbci solution/

 

Do test this:  

    ldi    r26,0

    clr    r27

    clr    r28

    clr    r29
    adiw   r27:r26, 1       WILL NOT SET Cy
    sbci   r28, byte3(-1)   WILL SET Cy
    sbci   r29, byte4(-1)   WILL Set Cy

 

Result is  h'01010001.

 

Now your point is very clear, thank you. (Sorry, in these days I am rather busy in finishing a project. The above algorithm is for the next one... with a limited table).

It is my bad; I liked to play the smart by using ADIW as we did previously with the 16-bit number to replace SUBI and SBCI. I did it without thinking about it carefully.

Anyway, it should be as I used doing always:

    ldi    r26,0

    clr    r27

    clr    r28

    clr    r29
    subi   r26,   low(-1)
    sbci   r27,  high(-1)
    sbci   r28, byte3(-1)
    sbci   r29, byte4(-1)

 

Thank you again for being patient.

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Perfect, just get rid of 'ornaments', like this:

    subi   r26, -1
    sbci   r27, -1
    sbci   r28, -1
    sbci   r29, -1

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Perfect, just get rid of 'ornaments', like this:

Keep in mind the ornaments serve a purpose & prevent some mistakes..  In the special case of creating -1, they have no effect & could be put in the trashcan.  But if you came back later and decided you want to add 543, you could not simply swap out -1 for -543 , unless you were using the ornaments (so a chance for mistake).  Don't slip!

When in the dark remember-the future looks brighter than ever.   I look forward to being able to predict the future!

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

avrcandies wrote:

Keep in mind the ornaments serve a purpose & prevent some mistakes.

 

And, even by using ornaments always, I couldn't avoid making silly mistakes. So I can imagine what could be the case if I become lazy and stop using them sometimes smiley

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0


 

Seems that -1 means for AS just 32b. Thus, 'byting' them is possible: