Fast dividing by 5 and by 10 for uint32_t input data for ATMEGA16A

Go To Last Post
93 posts / 0 new

Pages

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

GccApplication2.elf:     file format elf32-avr

Sections:
Idx Name          Size      VMA       LMA       File off  Algn
  0 .text         00000104  00000000  00000000  00000054  2**1
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
  1 .data         00000000  00800060  00800060  00000158  2**0
                  CONTENTS, ALLOC, LOAD, DATA
  2 .comment      00000030  00000000  00000000  00000158  2**0
                  CONTENTS, READONLY
  3 .note.gnu.avr.deviceinfo 0000003c  00000000  00000000  00000188  2**2
                  CONTENTS, READONLY
  4 .debug_aranges 00000038  00000000  00000000  000001c4  2**0
                  CONTENTS, READONLY, DEBUGGING
  5 .debug_info   000007da  00000000  00000000  000001fc  2**0
                  CONTENTS, READONLY, DEBUGGING
  6 .debug_abbrev 00000631  00000000  00000000  000009d6  2**0
                  CONTENTS, READONLY, DEBUGGING
  7 .debug_line   000002b9  00000000  00000000  00001007  2**0
                  CONTENTS, READONLY, DEBUGGING
  8 .debug_frame  00000074  00000000  00000000  000012c0  2**2
                  CONTENTS, READONLY, DEBUGGING
  9 .debug_str    00000337  00000000  00000000  00001334  2**0
                  CONTENTS, READONLY, DEBUGGING
 10 .debug_loc    0000044a  00000000  00000000  0000166b  2**0
                  CONTENTS, READONLY, DEBUGGING
 11 .debug_ranges 00000040  00000000  00000000  00001ab5  2**0
                  CONTENTS, READONLY, DEBUGGING

Disassembly of section .text:

00000000 <__vectors>:
   0:    0c 94 2a 00     jmp    0x54    ; 0x54 <__ctors_end>
   4:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
   8:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
   c:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  10:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  14:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  18:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  1c:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  20:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  24:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  28:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  2c:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  30:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  34:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  38:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  3c:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  40:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  44:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  48:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  4c:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  50:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>

00000054 <__ctors_end>:
  54:    11 24           eor    r1, r1
  56:    1f be           out    0x3f, r1    ; 63
  58:    cf e5           ldi    r28, 0x5F    ; 95
  5a:    d4 e0           ldi    r29, 0x04    ; 4
  5c:    de bf           out    0x3e, r29    ; 62
  5e:    cd bf           out    0x3d, r28    ; 61
  60:    0e 94 36 00     call    0x6c    ; 0x6c <main>
  64:    0c 94 80 00     jmp    0x100    ; 0x100 <_exit>

00000068 <__bad_interrupt>:
  68:    0c 94 00 00     jmp    0    ; 0x0 <__vectors>

0000006c <main>:
return    ;    
}

uint32_t DivideBy10(uint32_t inp)
{
return (uint32_t) (inp/10);     
  6c:    0f 2e           mov    r0, r31
  6e:    fa e0           ldi    r31, 0x0A    ; 10
  70:    cf 2e           mov    r12, r31
  72:    d1 2c           mov    r13, r1
  74:    e1 2c           mov    r14, r1
  76:    f1 2c           mov    r15, r1
  78:    f0 2d           mov    r31, r0
     
    // GetDigitsFromUint32_t(  in);
    uint32_t out;    
       out=   DivideBy10(in);
       //for example, out bytes to port D  
       DDRD=0xff;
  7a:    cf ef           ldi    r28, 0xFF    ; 255
    /* Replace with your application code */
    while (1) 
    {
     uint32_t in =0; //=  1599999;  // -> 15 9 9 9 9 9    
     //for example, input from port B
     DDRB=0;
  7c:    17 ba           out    0x17, r1    ; 23
     DDRB=0x00;
  7e:    17 ba           out    0x17, r1    ; 23
     in|=(uint32_t)((uint32_t)PINB<<0);
  80:    26 b3           in    r18, 0x16    ; 22
     in|=(uint32_t)((uint32_t)PINB<<8);
  82:    36 b3           in    r19, 0x16    ; 22
     in|=(uint32_t)((uint32_t)PINB<<16);
  84:    66 b3           in    r22, 0x16    ; 22
  86:    86 2f           mov    r24, r22
  88:    90 e0           ldi    r25, 0x00    ; 0
  8a:    a0 e0           ldi    r26, 0x00    ; 0
  8c:    b0 e0           ldi    r27, 0x00    ; 0
  8e:    dc 01           movw    r26, r24
  90:    99 27           eor    r25, r25
  92:    88 27           eor    r24, r24
  94:    93 2b           or    r25, r19
  96:    82 2b           or    r24, r18
     in|=(uint32_t)((uint32_t)PINB<<24);
  98:    26 b3           in    r18, 0x16    ; 22
return    ;    
}

uint32_t DivideBy10(uint32_t inp)
{
return (uint32_t) (inp/10);     
  9a:    bc 01           movw    r22, r24
  9c:    cd 01           movw    r24, r26
  9e:    92 2b           or    r25, r18
  a0:    a7 01           movw    r20, r14
  a2:    96 01           movw    r18, r12
  a4:    0e 94 5e 00     call    0xbc    ; 0xbc <__udivmodsi4>
     
    // GetDigitsFromUint32_t(  in);
    uint32_t out;    
       out=   DivideBy10(in);
       //for example, out bytes to port D  
       DDRD=0xff;
  a8:    c1 bb           out    0x11, r28    ; 17
       PORTD=(uint8_t)((out&0x000000ff)>>0);
  aa:    22 bb           out    0x12, r18    ; 18
       PORTD=(uint8_t)((out&0x0000ff00)>>8);
  ac:    32 bb           out    0x12, r19    ; 18
       PORTD=(uint8_t)((out&0x00ff0000)>>16);
  ae:    42 bb           out    0x12, r20    ; 18
       PORTD=(uint8_t)((out&0xff000000)>>24);
  b0:    85 2f           mov    r24, r21
  b2:    99 27           eor    r25, r25
  b4:    aa 27           eor    r26, r26
  b6:    bb 27           eor    r27, r27
  b8:    82 bb           out    0x12, r24    ; 18
  ba:    e0 cf           rjmp    .-64         ; 0x7c <main+0x10>

000000bc <__udivmodsi4>:
  bc:    a1 e2           ldi    r26, 0x21    ; 33
  be:    1a 2e           mov    r1, r26
  c0:    aa 1b           sub    r26, r26
  c2:    bb 1b           sub    r27, r27
  c4:    fd 01           movw    r30, r26
  c6:    0d c0           rjmp    .+26         ; 0xe2 <__udivmodsi4_ep>

000000c8 <__udivmodsi4_loop>:
  c8:    aa 1f           adc    r26, r26
  ca:    bb 1f           adc    r27, r27
  cc:    ee 1f           adc    r30, r30
  ce:    ff 1f           adc    r31, r31
  d0:    a2 17           cp    r26, r18
  d2:    b3 07           cpc    r27, r19
  d4:    e4 07           cpc    r30, r20
  d6:    f5 07           cpc    r31, r21
  d8:    20 f0           brcs    .+8          ; 0xe2 <__udivmodsi4_ep>
  da:    a2 1b           sub    r26, r18
  dc:    b3 0b           sbc    r27, r19
  de:    e4 0b           sbc    r30, r20
  e0:    f5 0b           sbc    r31, r21

000000e2 <__udivmodsi4_ep>:
  e2:    66 1f           adc    r22, r22
  e4:    77 1f           adc    r23, r23
  e6:    88 1f           adc    r24, r24
  e8:    99 1f           adc    r25, r25
  ea:    1a 94           dec    r1
  ec:    69 f7           brne    .-38         ; 0xc8 <__udivmodsi4_loop>
  ee:    60 95           com    r22
  f0:    70 95           com    r23
  f2:    80 95           com    r24
  f4:    90 95           com    r25
  f6:    9b 01           movw    r18, r22
  f8:    ac 01           movw    r20, r24
  fa:    bd 01           movw    r22, r26
  fc:    cf 01           movw    r24, r30
  fe:    08 95           ret

00000100 <_exit>:
 100:    f8 94           cli

00000102 <__stop_program>:
 102:    ff cf           rjmp    .-2          ; 0x102 <__stop_program>
 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

/*
 * GccApplication2.c
 *
 * Created: 29.03.2020 10:38:02
 * Author : USERPC01
 */ 

#include <avr/io.h>
#include <math.h>
/*
uint8_t  div32bit_mod (uint32_t  in, uint32_t  div,    uint32_t *mod    )
{
  uint32_t res;
      res=(uint32_t)in /div;
    
 *mod=(uint32_t)   (in - (res*div)) ;
*mod=(uint32_t)(in%div); //  (in - (res*div)) ;

return (uint8_t)(in/div) ;// res;
}*/

uint8_t divmod( uint32_t inp, uint32_t  div,  uint32_t *mod )
{
 //return (uint8_t) div32bit_mod( inp,  div, reminder  );
     *mod=(uint32_t)(inp%div); //  (in - (res*div)) ;

     return (uint8_t)(inp/div) ;// res;
}

void  GetDigitsFromUint32_t( uint32_t in )  //"uint24_t" 
{
 uint8_t digits[5];
    /*
    digits[0]=(uint8_t)(in/100000) ;  in=in%100000;     
    digits[1]=(uint8_t)(in/10000) ;  in=in%10000;  
    digits[2]=(uint8_t)(in/1000) ;  in=in%1000;
    digits[3]=(uint8_t)(in/100) ;  in=in%100;
    digits[4]=(uint8_t)(in/10) ;  in=in%10;
    digits[5]=(uint8_t) (in);
        */    
   digits[0]=(uint8_t) divmod(  in , 0x000186A0 /*100000*/ ,  &in );
   digits[1]=(uint8_t) divmod(  in , 0x00002710 /*10000*/,  &in );
   digits[2]=(uint8_t) divmod(  in , 0x000003e8 /*1000*/,  &in );
   digits[3]=(uint8_t) divmod(  in , 0x00000064 /*100*/,  &in );
   digits[4]=(uint8_t) divmod(  in , 0x0000000A /*10*/,  &in ); 
   digits[5]=(uint8_t) divmod(  in , 0x00000001 /* 1*/ ,  &in );   
     
     
     //for example, for output to PORTD 
          DDRD=0xFF;
         PORTD=digits[0];
         PORTD=digits[1];
         PORTD=digits[2];
         PORTD=digits[3];
         PORTD=digits[4];
         PORTD=digits[5];
     
return    ;    
}

uint32_t DivideBy5(uint32_t inp)
{
return (uint32_t) (inp/5);     
    
}
//for debugger test only 

int main(void)
{
    /* Replace with your application code */
    while (1) 
    {
     uint32_t in =0; //=  1599999;  // -> 15 9 9 9 9 9    
     //for example, input from port B
     DDRB=0;
     DDRB=0x00;
     in|=(uint32_t)((uint32_t)PINB<<0);
     in|=(uint32_t)((uint32_t)PINB<<8);
     in|=(uint32_t)((uint32_t)PINB<<16);
     in|=(uint32_t)((uint32_t)PINB<<24);
     
    // GetDigitsFromUint32_t(  in);
    uint32_t out;    
       out=   DivideBy5(in);
       //for example, out bytes to port D  
       DDRD=0xff;
       PORTD=(uint8_t)((out&0x000000ff)>>0);
       PORTD=(uint8_t)((out&0x0000ff00)>>8);
       PORTD=(uint8_t)((out&0x00ff0000)>>16);
       PORTD=(uint8_t)((out&0xff000000)>>24);
       
    }
}

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

GccApplication2.elf:     file format elf32-avr

Sections:
Idx Name          Size      VMA       LMA       File off  Algn
  0 .text         00000196  00000000  00000000  00000054  2**1
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
  1 .data         00000000  00800060  00800060  000001ea  2**0
                  CONTENTS, ALLOC, LOAD, DATA
  2 .comment      00000030  00000000  00000000  000001ea  2**0
                  CONTENTS, READONLY
  3 .note.gnu.avr.deviceinfo 0000003c  00000000  00000000  0000021c  2**2
                  CONTENTS, READONLY
  4 .debug_aranges 00000038  00000000  00000000  00000258  2**0
                  CONTENTS, READONLY, DEBUGGING
  5 .debug_info   000007da  00000000  00000000  00000290  2**0
                  CONTENTS, READONLY, DEBUGGING
  6 .debug_abbrev 00000610  00000000  00000000  00000a6a  2**0
                  CONTENTS, READONLY, DEBUGGING
  7 .debug_line   000002b3  00000000  00000000  0000107a  2**0
                  CONTENTS, READONLY, DEBUGGING
  8 .debug_frame  000000a4  00000000  00000000  00001330  2**2
                  CONTENTS, READONLY, DEBUGGING
  9 .debug_str    00000336  00000000  00000000  000013d4  2**0
                  CONTENTS, READONLY, DEBUGGING
 10 .debug_loc    000004d4  00000000  00000000  0000170a  2**0
                  CONTENTS, READONLY, DEBUGGING
 11 .debug_ranges 00000028  00000000  00000000  00001bde  2**0
                  CONTENTS, READONLY, DEBUGGING

Disassembly of section .text:

00000000 <__vectors>:
   0:    0c 94 2a 00     jmp    0x54    ; 0x54 <__ctors_end>
   4:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
   8:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
   c:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  10:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  14:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  18:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  1c:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  20:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  24:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  28:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  2c:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  30:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  34:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  38:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  3c:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  40:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  44:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  48:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  4c:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>
  50:    0c 94 34 00     jmp    0x68    ; 0x68 <__bad_interrupt>

00000054 <__ctors_end>:
  54:    11 24           eor    r1, r1
  56:    1f be           out    0x3f, r1    ; 63
  58:    cf e5           ldi    r28, 0x5F    ; 95
  5a:    d4 e0           ldi    r29, 0x04    ; 4
  5c:    de bf           out    0x3e, r29    ; 62
  5e:    cd bf           out    0x3d, r28    ; 61
  60:    0e 94 36 00     call    0x6c    ; 0x6c <main>
  64:    0c 94 c9 00     jmp    0x192    ; 0x192 <_exit>

00000068 <__bad_interrupt>:
  68:    0c 94 00 00     jmp    0    ; 0x0 <__vectors>

0000006c <main>:
     
    // GetDigitsFromUint32_t(  in);
    uint32_t out;    
       out=   DivideBy5(in);
       //for example, out bytes to port D  
       DDRD=0xff;
  6c:    cf ef           ldi    r28, 0xFF    ; 255
    /* Replace with your application code */
    while (1) 
    {
     uint32_t in =0; //=  1599999;  // -> 15 9 9 9 9 9    
     //for example, input from port B
     DDRB=0;
  6e:    17 ba           out    0x17, r1    ; 23
     DDRB=0x00;
  70:    17 ba           out    0x17, r1    ; 23
     in|=(uint32_t)((uint32_t)PINB<<0);
  72:    26 b3           in    r18, 0x16    ; 22
     in|=(uint32_t)((uint32_t)PINB<<8);
  74:    36 b3           in    r19, 0x16    ; 22
     in|=(uint32_t)((uint32_t)PINB<<16);
  76:    66 b3           in    r22, 0x16    ; 22
  78:    86 2f           mov    r24, r22
  7a:    90 e0           ldi    r25, 0x00    ; 0
  7c:    a0 e0           ldi    r26, 0x00    ; 0
  7e:    b0 e0           ldi    r27, 0x00    ; 0
  80:    dc 01           movw    r26, r24
  82:    99 27           eor    r25, r25
  84:    88 27           eor    r24, r24
  86:    93 2b           or    r25, r19
  88:    82 2b           or    r24, r18
     in|=(uint32_t)((uint32_t)PINB<<24);
  8a:    26 b3           in    r18, 0x16    ; 22
return    ;    
}

uint32_t DivideBy5(uint32_t inp)
{
return (uint32_t) (inp/5);     
  8c:    bc 01           movw    r22, r24
  8e:    cd 01           movw    r24, r26
  90:    92 2b           or    r25, r18
  92:    2d ec           ldi    r18, 0xCD    ; 205
  94:    3c ec           ldi    r19, 0xCC    ; 204
  96:    4c ec           ldi    r20, 0xCC    ; 204
  98:    5c ec           ldi    r21, 0xCC    ; 204
  9a:    0e 94 68 00     call    0xd0    ; 0xd0 <__umulsidi3>
  9e:    00 e2           ldi    r16, 0x20    ; 32
  a0:    0e 94 95 00     call    0x12a    ; 0x12a <__lshrdi3>
  a4:    82 2e           mov    r8, r18
  a6:    93 2e           mov    r9, r19
  a8:    a4 2e           mov    r10, r20
  aa:    b5 2e           mov    r11, r21
  ac:    b6 94           lsr    r11
  ae:    a7 94           ror    r10
  b0:    97 94           ror    r9
  b2:    87 94           ror    r8
  b4:    b6 94           lsr    r11
  b6:    a7 94           ror    r10
  b8:    97 94           ror    r9
  ba:    87 94           ror    r8
     
    // GetDigitsFromUint32_t(  in);
    uint32_t out;    
       out=   DivideBy5(in);
       //for example, out bytes to port D  
       DDRD=0xff;
  bc:    c1 bb           out    0x11, r28    ; 17
       PORTD=(uint8_t)((out&0x000000ff)>>0);
  be:    82 ba           out    0x12, r8    ; 18
       PORTD=(uint8_t)((out&0x0000ff00)>>8);
  c0:    92 ba           out    0x12, r9    ; 18
       PORTD=(uint8_t)((out&0x00ff0000)>>16);
  c2:    a2 ba           out    0x12, r10    ; 18
       PORTD=(uint8_t)((out&0xff000000)>>24);
  c4:    8b 2c           mov    r8, r11
  c6:    99 24           eor    r9, r9
  c8:    aa 24           eor    r10, r10
  ca:    bb 24           eor    r11, r11
  cc:    82 ba           out    0x12, r8    ; 18
  ce:    cf cf           rjmp    .-98         ; 0x6e <main+0x2>

000000d0 <__umulsidi3>:
  d0:    e8 94           clt

000000d2 <__umulsidi3_helper>:
  d2:    df 93           push    r29
  d4:    cf 93           push    r28
  d6:    fc 01           movw    r30, r24
  d8:    db 01           movw    r26, r22
  da:    0e 94 b1 00     call    0x162    ; 0x162 <__umulhisi3>
  de:    7f 93           push    r23
  e0:    6f 93           push    r22
  e2:    e9 01           movw    r28, r18
  e4:    9a 01           movw    r18, r20
  e6:    ac 01           movw    r20, r24
  e8:    bf 93           push    r27
  ea:    af 93           push    r26
  ec:    3f 93           push    r19
  ee:    2f 93           push    r18
  f0:    df 01           movw    r26, r30
  f2:    0e 94 b1 00     call    0x162    ; 0x162 <__umulhisi3>
  f6:    26 f4           brtc    .+8          ; 0x100 <__umulsidi3_helper+0x2e>
  f8:    6c 1b           sub    r22, r28
  fa:    7d 0b           sbc    r23, r29
  fc:    82 0b           sbc    r24, r18
  fe:    93 0b           sbc    r25, r19
 100:    9e 01           movw    r18, r28
 102:    eb 01           movw    r28, r22
 104:    fc 01           movw    r30, r24
 106:    0e 94 c0 00     call    0x180    ; 0x180 <__muldi3_6>
 10a:    af 91           pop    r26
 10c:    bf 91           pop    r27
 10e:    2f 91           pop    r18
 110:    3f 91           pop    r19
 112:    0e 94 c0 00     call    0x180    ; 0x180 <__muldi3_6>
 116:    be 01           movw    r22, r28
 118:    cf 01           movw    r24, r30
 11a:    f9 01           movw    r30, r18
 11c:    2f 91           pop    r18
 11e:    3f 91           pop    r19
 120:    cf 91           pop    r28
 122:    df 91           pop    r29
 124:    08 95           ret

00000126 <__ashrdi3>:
 126:    97 fb           bst    r25, 7
 128:    10 f8           bld    r1, 0

0000012a <__lshrdi3>:
 12a:    16 94           lsr    r1
 12c:    00 08           sbc    r0, r0
 12e:    0f 93           push    r16
 130:    08 30           cpi    r16, 0x08    ; 8
 132:    98 f0           brcs    .+38         ; 0x15a <__lshrdi3+0x30>
 134:    08 50           subi    r16, 0x08    ; 8
 136:    23 2f           mov    r18, r19
 138:    34 2f           mov    r19, r20
 13a:    45 2f           mov    r20, r21
 13c:    56 2f           mov    r21, r22
 13e:    67 2f           mov    r22, r23
 140:    78 2f           mov    r23, r24
 142:    89 2f           mov    r24, r25
 144:    90 2d           mov    r25, r0
 146:    f4 cf           rjmp    .-24         ; 0x130 <__lshrdi3+0x6>
 148:    05 94           asr    r0
 14a:    97 95           ror    r25
 14c:    87 95           ror    r24
 14e:    77 95           ror    r23
 150:    67 95           ror    r22
 152:    57 95           ror    r21
 154:    47 95           ror    r20
 156:    37 95           ror    r19
 158:    27 95           ror    r18
 15a:    0a 95           dec    r16
 15c:    aa f7           brpl    .-22         ; 0x148 <__lshrdi3+0x1e>
 15e:    0f 91           pop    r16
 160:    08 95           ret

00000162 <__umulhisi3>:
 162:    a2 9f           mul    r26, r18
 164:    b0 01           movw    r22, r0
 166:    b3 9f           mul    r27, r19
 168:    c0 01           movw    r24, r0
 16a:    a3 9f           mul    r26, r19
 16c:    70 0d           add    r23, r0
 16e:    81 1d           adc    r24, r1
 170:    11 24           eor    r1, r1
 172:    91 1d           adc    r25, r1
 174:    b2 9f           mul    r27, r18
 176:    70 0d           add    r23, r0
 178:    81 1d           adc    r24, r1
 17a:    11 24           eor    r1, r1
 17c:    91 1d           adc    r25, r1
 17e:    08 95           ret

00000180 <__muldi3_6>:
 180:    0e 94 b1 00     call    0x162    ; 0x162 <__umulhisi3>
 184:    46 0f           add    r20, r22
 186:    57 1f           adc    r21, r23
 188:    c8 1f           adc    r28, r24
 18a:    d9 1f           adc    r29, r25
 18c:    08 f4           brcc    .+2          ; 0x190 <__muldi3_6+0x10>
 18e:    31 96           adiw    r30, 0x01    ; 1
 190:    08 95           ret

00000192 <_exit>:
 192:    f8 94           cli

00000194 <__stop_program>:
 194:    ff cf           rjmp    .-2          ; 0x194 <__stop_program>
 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

What is the point of posting all of his oddball code?  At least explain what exactly you are doing (methods, how, why)  (this is a discussion forum)...otherwise it will likely all be unused (even if it is great) and just swept into the dust pan and put in the dumpster.   

What is the advantage(s) of all of this code?

Why have you completely forgotten to at least comment any of your code?  

When in the dark remember-the future looks brighter than ever.   I look forward to being able to predict the future!

Last Edited: Sun. Mar 29, 2020 - 09:18 AM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I have a pretty strong feeling we're being trolled. At the very least OP seems incapable of taking onboard any advice being given. #1 talked about an assembler solution, many excellent ideas were proposed, all ignored. When OP showed code using / and % I pointed out that there was no point in doing TWO divides when it could be done with one. Also ignored. Don't see the point in trying to advise any more.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

The OP does appear to have a history of asking questions and not replying.

#1 Hardware Problem? https://www.avrfreaks.net/forum/...

#2 Hardware Problem? Read AVR042.

#3 All grounds are not created equal

#4 Have you proved your chip is running at xxMHz?

#5 "If you think you need floating point to solve the problem then you don't understand the problem. If you really do need floating point then you have a problem you do not understand."

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Anyway, I wrote and tested the code below for a fast division by 5. Sorry if it was done already.

 

Symbolic code:

 

x in Y[x] is the byte number of the register Y

 

N[4] the original 32-bit number

R[4] the result of N[4]/5

A[4], B[4], C[4] and K[1] temporary registers

 

div32_5:

; PUSH registers

    A[4]= N[4]

    C[4]= R[4]= 0

 

loop1:

    A[4]= A[4]-C[4]

 

  if A[4]<256, break1

 

    B[4]= (A[4]/256)*51

    R[4]= R[4]+B[4]

    C[4]= B[4]*5

    goto loop1

 

break1:

    K[1]=0

 

loop2:

    A[1]=A[1]-5

 

  if A[1]<0, goto break2

 

    K[1]=K[1]+1

    goto loop2

 

break2:

   R[4]= R[4]+K[1]

 

; POP registers

    RET

 

 

 

Last Edited: Sun. Mar 29, 2020 - 04:33 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0
.def Kst = r24

.def N_3 = r23
.def N_2 = r22
.def N_1 = r21
.def N_0 = r20

.def A_3 = r19
.def A_2 = r18
.def A_1 = r17
.def A_0 = r16

.def R_3 = r15
.def R_2 = r14
.def R_1 = r13
.def R_0 = r12

.def B_3 = r11
.def B_2 = r10
.def B_1 = r9
.def B_0 = r8

.def C_3 = r7
.def C_2 = r6
.def C_1 = r5
.def C_0 = r4

; divident
.equ dd_Q = 0xFF
.equ dd_U = 0xFF
.equ dd_H = 0xFF
.equ dd_L = 0xF0

REPEAT:
    LDI  N_3, dd_Q
    LDI  N_2, dd_U
    LDI  N_1, dd_H
    LDI  N_0, dd_L

    RCALL div32_5

    RJMP  REPEAT


div32_5:
; PUSH registers
; A[4]= N[4]
    MOV   A_0, N_0              ;1
    MOV   A_1, N_1              ;1
    MOV   A_2, N_2              ;1
    MOV   A_3, N_3              ;1

; C[4]= R[4]= 0
    CLR   C_0                   ;1
    CLR   C_1                   ;1
    CLR   C_2                   ;1
    CLR   C_3                   ;1
    CLR   R_0                   ;1
    CLR   R_1                   ;1
    CLR   R_2                   ;1
    CLR   R_3                   ;1

loop1:
; A[4]= A[4]-C[4]
    SUB   A_0, C_0              ;1
    SBC   A_1, C_1              ;1
    SBC   A_2, C_2              ;1
    SBC   A_3, C_3              ;1

; if A[4]<256, break1
    TST   A_3                   ;1
    BRNE  cnt                   ;2

    TST   A_2                   ;1
    BRNE  cnt                   ;2

    TST   A_1                   ;1
    BREQ  break1                ;1,2

cnt:
; B[4]= (A[4]/256)*51
    LDI   Kst, 51               ;1
    MUL   A_1, Kst              ;2
    MOV   B_0, r0               ;1
    MOV   B_1, r1               ;1

    MUL   A_2, Kst              ;2
    CLR   B_2                   ;1
    ADD   B_1, r0               ;1
    ADC   B_2, r1               ;1

    MUL   A_3, Kst              ;2
    CLR   B_3                   ;1
    ADD   B_2, r0               ;1
    ADC   B_3, r1               ;1

; R[4]= R[4]+B[4]
    ADD   R_0, B_0              ;1
    ADC   R_1, B_1              ;1
    ADC   R_2, B_2              ;1
    ADC   R_3, B_3              ;1

; C[4]= B[4]*5
    LDI   Kst, 5                ;1
    MUL   B_0, Kst              ;2
    MOV   C_0, r0               ;1
    MOV   C_1, r1               ;1

    MUL   B_1, Kst              ;2
    CLR   C_2                   ;1
    ADD   C_1, r0               ;1
    ADC   C_2, r1               ;1

    MUL   B_2, Kst              ;2
    CLR   C_3                   ;1
    ADD   C_2, r0               ;1
    ADC   C_3, r1               ;1

    MUL   B_3, Kst              ;2
    ADD   C_3, r0               ;1

; goto loop1
    RJMP  loop1                 ;2

break1:
; K[1]=0
    CLR   Kst                   ;1

loop2:
; A[1]=A[1]-5
    SUBI  A_0, 5                ;1

; if A[1]<0, goto break2 
    BRLO  break2                ;2

; K[1]=K[1]+1
    INC   Kst                   ;1

; goto loop2
    RJMP  loop2


break2:
; R[4]= R[4]+K[1]
    ADD   R_0, Kst              ;1
    CLR   Kst                   ;1
    ADC   R_1, Kst              ;1
    ADC   R_2, Kst              ;1
    ADC   R_3, Kst              ;1

; POP registers

    RET                         ;4

Sorry, after I minimized the registers names for clarity, some of them were already defined by AS.

I will edit it very soon.

 

Updated:

 

Last Edited: Sun. Mar 29, 2020 - 04:11 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Anyway, I wrote and tested the code below for a fast division by 5. Sorry if it was done already.

Do you usually post code with no explanation?  What is K, What is B?  Nobody will use it in the future if you don't state how to use it 

 

 

When in the dark remember-the future looks brighter than ever.   I look forward to being able to predict the future!

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

You are right, it seems I erased them by mistake while editing :(

 

Symbolic Code:

 

x in Y[x] is the byte number of the register Y

 

N[4] the original 32-bit number

R[4] the result of N[4]/5

A[4], B[4], C[4] and K[1] temporary registers

 

(#58)

 

But the slowest part is last division after "break1:", I used the simplest method (subtraction)

I couldn't find division of 8-bit by 8-bit (or 4-bit since it is 5 here).

 

Last Edited: Sun. Mar 29, 2020 - 04:36 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0
;===========================================
; Dividing 32-bit register by the constant 5
;===========================================
;
; By using many registers, its cycles maximum is 289 only.
; Perhaps, there is a faster code which was done already (or will be done).
;

.def Kst = r24                  ; for constants

.def N_3 = r23                  ; the original 32-bit number
.def N_2 = r22
.def N_1 = r21
.def N_0 = r20

.def A_3 = r19                  ; temporary
.def A_2 = r18
.def A_1 = r17
.def A_0 = r16

.def R_3 = r15                  ; result of N_3:N_0 / 5
.def R_2 = r14
.def R_1 = r13
.def R_0 = r12

.def B_3 = r11                  ; temporary
.def B_2 = r10
.def B_1 = r9
.def B_0 = r8

.def C_3 = r7                   ; temporary
.def C_2 = r6
.def C_1 = r5
.def C_0 = r4

; divident , max cycles 289
.equ dd_Q = 0xFF
.equ dd_U = 0xFF
.equ dd_H = 0xFF
.equ dd_L = 0xFF

REPEAT:
    LDI  N_3, dd_Q
    LDI  N_2, dd_U
    LDI  N_1, dd_H
    LDI  N_0, dd_L

    RCALL div32_5

    RJMP  REPEAT

div32_5:
; PUSH registers
; A[4]= N[4]
    MOV   A_0, N_0              ;1
    MOV   A_1, N_1              ;1
    MOV   A_2, N_2              ;1
    MOV   A_3, N_3              ;1

; C[4]= R[4]= 0
    CLR   C_0                   ;1
    CLR   C_1                   ;1
    CLR   C_2                   ;1
    CLR   C_3                   ;1
    CLR   R_0                   ;1
    CLR   R_1                   ;1
    CLR   R_2                   ;1
    CLR   R_3                   ;1

loop1:
; A[4]= A[4]-C[4]
    SUB   A_0, C_0              ;1
    SBC   A_1, C_1              ;1
    SBC   A_2, C_2              ;1
    SBC   A_3, C_3              ;1

; if A[4]<256, break1
    TST   A_3                   ;1
    BRNE  cnt                   ;2

    TST   A_2                   ;1
    BRNE  cnt                   ;2

    TST   A_1                   ;1
    BREQ  break1                ;1,2

cnt:
; B[4]= (A[4]/256)*51
    LDI   Kst, 51               ;1
    MUL   A_1, Kst              ;2
    MOV   B_0, r0               ;1
    MOV   B_1, r1               ;1

    MUL   A_2, Kst              ;2
    CLR   B_2                   ;1
    ADD   B_1, r0               ;1
    ADC   B_2, r1               ;1

    MUL   A_3, Kst              ;2
    CLR   B_3                   ;1
    ADD   B_2, r0               ;1
    ADC   B_3, r1               ;1

; R[4]= R[4]+B[4]
    ADD   R_0, B_0              ;1
    ADC   R_1, B_1              ;1
    ADC   R_2, B_2              ;1
    ADC   R_3, B_3              ;1

; C[4]= B[4]*5
    LDI   Kst, 5                ;1
    MUL   B_0, Kst              ;2
    MOV   C_0, r0               ;1
    MOV   C_1, r1               ;1

    MUL   B_1, Kst              ;2
    CLR   C_2                   ;1
    ADD   C_1, r0               ;1
    ADC   C_2, r1               ;1

    MUL   B_2, Kst              ;2
    CLR   C_3                   ;1
    ADD   C_2, r0               ;1
    ADC   C_3, r1               ;1

    MUL   B_3, Kst              ;2
    ADD   C_3, r0               ;1

; goto loop1
    RJMP  loop1                 ;2

break1:
; A1=A0
    MOV   A_1, A_0
; C2=B2=0
    CLR   C_2
    CLR   B_2

loop2:
; A1= A1-C2
    SUB   A_1, C_2

; if A1<16, break2
    CPI   A_1, 16
    BRLO  break2

; B1= (A1/16)*3
    MOV   A_2, A_1
    SWAP  A_2
    ANDI  A_2, 0x0F
    LDI   Kst, 3
    MUL   A_2, Kst
    MOV   B_1, r0

; B2= B2+B1
    ADD   B_2, B_1

; C2= B1*5
    LDI   Kst, 5
    MUL   B_1, Kst
    MOV   C_2, r0

; goto loop2
    RJMP  loop2

break2:
; K[1]=0
    CLR   Kst                   ;1

loop3:
; A1=A1-5
    SUBI  A_1, 5                ;1

; if A1<0, goto break3
    BRLO  break3                ;2

; K[1]=K[1]+1
    INC   Kst                   ;1

; goto loop3
    RJMP  loop3

break3:
; K[1]= K[1]+B2
    ADD   Kst, B_2

; R[4]= R[4]+K[1]
    ADC   R_0, Kst              ;1
    CLR   Kst                   ;1
    ADC   R_1, Kst              ;1
    ADC   R_2, Kst              ;1
    ADC   R_3, Kst              ;1

; POP registers

    RET                         ;4

 

Last Edited: Sun. Mar 29, 2020 - 07:49 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

https://gmplib.org/~tege/divcnst...

 

So, for instance, consider:

 

#define BUFSIZE 50
char *decimal (unsigned int x)
{
    static char buf[BUFSIZE];
    char *bp = buf + BUFSIZE - 1;
    *bp = 0;
    do {
        *--bp = ’0’ + x % 10;
        x /= 10;
    } while (x != 0);
    return bp; /* Return pointer to first digit */
}

With this research, first published quite some time ago, the actual implementation can be... quite a bit different.

 

The paper gives samples in assembly for common targets, and whaddya know, it's very similar to the code you already listed, only with the constant 0xcccccccd because it's for 32-bit values.

 

 

That said: I don't think this is gonna work out. Looking at your most recent code, you have four consecutive reads from PINB, and then in another function, four consecutive writes to PORTD. That is not how anything works. I don't think detailed discussions of the multiplication algorithm are going to do you any good if you don't know how to produce working code, and can't articulate what you want to do with it.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Suggestion: I'd guess you could compute X/5, where X is an 8-bit value, using a lookup table. Progmem, obviously.

static const uint8_t over5[256] = {
    0, 0, 0, 0, 0,
    1, 1, 1, 1, 1,
    /* ... */
    50, 50, 50, 50, 50,
    51,
};

this is almost certainly faster than the loop, and it does impose any code size at all, but it seems pretty cheap.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I'd guess you could compute X/5, where X is an 8-bit value,

We want 32bits, not 8bits.  That makes a lookup table impractical.

An interesting question is whether you can take advantage of the CCCCCCCC... bit pattern to implement the multiplication faster or smaller than you could do a generic 32x32 multiply with an 8bit multiplier (and, I guess: do you need all 32bits of the constant to get an accurate divisor, for these specific cases?

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

???

does impose any code size at all

Before you start to code you have spend 256 byte of code space.  

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

westfw wrote:

I'd guess you could compute X/5, where X is an 8-bit value,

We want 32bits, not 8bits.  That makes a lookup table impractical.

An interesting question is whether you can take advantage of the CCCCCCCC... bit pattern to implement the multiplication faster or smaller than you could do a generic 32x32 multiply with an 8bit multiplier (and, I guess: do you need all 32bits of the constant to get an accurate divisor, for these specific cases?

 

That was directed specifically at KerimF's thing, which has an inner loop to compute X/5 on an 8-bit value.

 

So at a smallish overhead in progmem, that inner loop could go away.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

westfw wrote:
An interesting question is whether you can take advantage of the CCCCCCCC... bit pattern to implement the multiplication faster or smaller than you could do a generic 32x32 multiply with an 8bit multiplier (and, I guess: do you need all 32bits of the constant to get an accurate divisor, for these specific cases?

We can, we only need to do a 32*8 mul, the rest is just sums.

This is a test program:

#include <stdio.h>
#include<stdint.h>


int main(int argc, char *argv[])
{
	uint32_t number = 12345;
	uint64_t tmp;

	tmp = number * 0xcc;
	tmp += tmp >> 8;
	tmp += tmp >> 16;
	tmp += 512;
	tmp >>= 10;

	return tmp;
}

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Some samples for generator control :

Attachment(s): 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

>> take advantage of the CCCCCCCC ??

We can, we only need to do a 32*8 mul, the rest is just sums.

 32*4bits?  2bits?

a * 0xCC = a * 0xC + ((a * 0xC)<<4) = ( (a*3) + ((a*3)<<4)<<2  = ( (a+a+a) + ((a+a+a)<<4)<<2

But I guess once you get more than 8bits, a shift-by-4 doesn't optimize easily, and if you have an x8 multiply instruction, you might as well use it...

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Thought sure I'd posted this earlier:  Edit: I had, in another divide by 5 thread.

  ; C-callable returns result of dividing 32-bit unsigned integer by 5
  .global divide32u5
  divide32u5:
  ; input is R27:24, result goes back there
  ; uses long division with byte-sized digits
  ; R21:18 holds the quotient before being moved to R27:24
  ; Of the registers used, the avr-gcc ABI only requires R1 to be restored
  .def C5 = R30
  LDI C5, 5
  .def C51 = R31
  LDI C51, 51

  .macro div i1, i0, q, flag
     ; q = i1:0/5, i0-=5*q
     MUL \i0, C51
     MOV \q, R1
     .if \flag
       ; requires i1<=4
       MUL \i1, C51
       ADD \q, R0
     .ifend
     MUL \q, C5
     SUB \i0, R0
     ; i0<=9
     CP \i0, C5
     BRLT 1f
       SUB \i0, CR
       INC \q
     1: ; i0 <= 4
  .endm

  div  -1, R27, R21, 0
  div R27, R26, R20, 1
  div R26, R25, R19, 1
  div R25, R24, R18, 1

  MOVW R24, R18
  MOVW R26, R20

  CLR R1

  RET

44 words and up to 58 cycles, counting the return, but not the call.

Done in assembly, I suspect the 0xCCCCCCCC method would be less than 50 cycles.

Iluvatar is the better part of Valar.

Last Edited: Fri. Apr 3, 2020 - 04:29 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Edit: 3:00 AM work. NOT good.

C-callable, of course.

; x-= x % 5  ; x= x*0xCCCCCCCC+x
; 0xCCCCCCCD * 5 = 1 mod 2**32

  .global div5u32
  div5u32:

  ; the input registers, original x
  .def I0 = R24
  .def I1 = R25
  .def I2 = R26
  .def I3 = R27

  ; becomes x * 0xCC
  .def XCC0 = R20
  .def XCC1 = R21
  .def XCC2 = R22
  .def XCC3 = R23

  ; Holds 0xCC or x % 5
  .def CC = R19
  .def Xmod5 = R19

  ; Find x % 5
  MOV Xmod5, I0
  ADC Xmod5, I1
  ADC Xmod5, I2
  ADC Xmod5, I3
  ADC Xmod5, R1

  ; subtract
  SUB I0, Xmod5
  SBC I1, R1
  SBC I2, R1
  SBC I3, R1

  ; compute x * 0xCC
  PUSH CC
  LDI CC, 0xCC
  MUL CC, I0
  MOVW XCC0, R0
  MUL CC, I2
  MOVW XCC2, R0
  MUL CC, I3
  ADD XCC3, R0
  MUL CC, I1
  ADD XCC1, R0
  ADC XCC2, R1
  BRCC 1f
    INC XCC3
  1:

  ;  x+= XCC + (XCC<<8) + (XCC<<16) + (XCC<<24)
  ADD I0, XCC0
  ADC I1, XCC1
  ADC I2, XCC2
  ADC I3, XCC3
  ADD I1, XCC0
  ADC I2, XCC1
  ADC I3, XCC2
  ADD I2, XCC0
  ADD I3, XCC1
  ADD I3, XCC0

  CLR R1
  POP CC

  RET

44 cycles counting the return, but not the call.

Iluvatar is the better part of Valar.

Last Edited: Sat. Apr 4, 2020 - 04:02 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

skeeve wrote:

C-callable, of course.

; x-= x % 5  ; x= x*0xCCCCCCCC+x
; 0xCCCCCCCD * 5 = 1 mod 2**32

  .global div5u32
  div5u32:

  ; the input registers, original x
  .def I0 = R24
  .def I1 = R25
  .def I2 = R26
  .def I3 = R27

  ; becomes x * 0xCC
  .def XCC0 = R20
  .def XCC1 = R21
  .def XCC2 = R22
  .def XCC3 = R23

  ; Holds 0xCC or x % 5
  .def CC = R19
  .def Xmod5 = R19

  ; Find x % 5
  MOV Xmod5, I0
  ADC Xmod5, I1
  ADC Xmod5, I2
  ADC Xmod5, I3
  ADC Xmod5, R1

  ; subtract
  SUB I0, Xmod5
  SBC I1, R1
  SBC I2, R1
  SBC I3, R1

  ; compute x * 0xCC
  PUSH CC
  LDI CC, 0xCC
  MUL CC, I0
  MOVW XCC0, R0
  MUL CC, I2
  MOVW XCC2, R0
  MUL CC, I3
  ADD XCC3, R0
  MUL CC, I1
  ADD XCC1, R0
  ADC XCC2, R1
  BRCC 1f
    INC XCC3
  1:

  ;  x+= XCC + (XCC<<8) + (XCC<<16) + (XCC<<24)
  ADD I0, XCC0
  ADC I1, XCC1
  ADC I2, XCC2
  ADC I3, XCC3
  ADD I1, XCC0
  ADC I2, XCC1
  ADC I3, XCC2
  ADD I2, XCC0
  ADD I3, XCC1
  ADD I3, XCC0

  CLR R1
  POP CC

  RET

44 cycles counting the return, but not the call.

 

I don't think this works.  First, if it follows the C calling convention, x should be in R25:R22.  Second, it doesn't calculate x mod 5.  Running through the code in my head, for x in {0..255}, Xmod5 (R19) = x.  For x in {256..511}, Xmod5 = x mod 255.

 

 

I have no special talents.  I am only passionately curious. - Albert Einstein

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

ralphd wrote:
I don't think this works.  First, if it follows the C calling convention, x should be in R25:R22.  Second, it doesn't calculate x mod 5.  Running through the code in my head, for x in {0..255}, Xmod5 (R19) = x.  For x in {256..511}, Xmod5 = x mod 255.
This should work.

I've repaired the calling convention and added the code for Xmod5 %= 5.

; x-= x % 5  ; x= x*0xCCCCCCCC+x
; 0xCCCCCCCD * 5 = 1 mod 2**32

  .global div5u32
  div5u32:

  ; the input registers, original x
  .def I0 = R24
  .def I1 = R25
  .def I2 = R26
  .def I3 = R27

  ; becomes x * 0xCC
  .def XCC0 = R20
  .def XCC1 = R21
  .def XCC2 = R22
  .def XCC3 = R23

  .def Xmod5 = R19
  .def Rsave = R18
  .def CC    = R31

  ; Find x % 5
  MOV Xmod5, I0
  ADC Xmod5, I1
  ADC Xmod5, I2
  ADC Xmod5, I3
  ADC Xmod5, R1
  ; if you have a spare 256 bytes, the following 11 cycles can be replaced by 7 cycles
  MOV Rsave, Xmod5
  SWAP Rsave
  ADD Xmod5, Rsave  ; half carry same as carry
  ADC Xmod5, R1  ; no carry out
  ; carry clear
  ANDI Xmod5, 0xF
  SBRC Xmod5, 3
    SUBI Xmod5, 10
  BRCS add5
  ; Xmod5 in 0..7
  SUBI Xmod5, 5
  BRCC 1f
  add5:
  ADDI Xmod5, 5
  1:

  ; subtract
  SUB I0, Xmod5
  SBC I1, R1
  SBC I2, R1
  SBC I3, R1

  ; compute x * 0xCC
  LDI CC, 0xCC
  MUL CC, I0
  MOVW XCC0, R0
  MUL CC, I2
  MOVW XCC2, R0
  MUL CC, I3
  ADD XCC3, R0
  MUL CC, I1
  ADD XCC1, R0
  ADC XCC2, R1
  BRCC 1f
    INC XCC3
  1:

  ;  x+= XCC + (XCC<<8) + (XCC<<16) + (XCC<<24)
  ADD I0, XCC0
  ADC I1, XCC1
  ADC I2, XCC2
  ADC I3, XCC3
  ADD I1, XCC0
  ADC I2, XCC1
  ADC I3, XCC2
  ADD I2, XCC0
  ADD I3, XCC1
  ADD I3, XCC0

  CLR R1

  RET

The added code costs 11 cycles.  Better register selection save 4 cycles.

52 cycles isn't less than 50, but is slightly better than long division.

I expect that it does not scale: Long division would beat modular arithmetic even for 40-bit unsigneds.

Iluvatar is the better part of Valar.

Last Edited: Sat. Apr 4, 2020 - 06:09 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Hmm. So, idle thought: 2^24 is 16M. The OP's examples appeared to strongly suggest only needing 7-digit values... which means that you possibly don't need to actually compute the top byte of the result.

 

For extra credit: The real reason to do this is specifically to produce BCD encoding. Can it be made faster given the knowledge that the real goal is the BCD encoding?

 

 

 

Consider, if you will, an algorithm which does divide/remainder by 100, instead of by 5 or 10. So your basic loop is conceptually:

 

// converts 0-99 into digit pairs in hex. you could also use
// short and do 0x0502 for [52], for instance.
unsigned char bcdigits[100] = {
        0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
};

// this part is horribly inefficient; it's a placeholder for where
// you'd put fancy hand-tuned assembly for computing these
// partial results.
int divrem100(int x) {
        return ((x % 100) << 24) | (x / 100);
}

// called with a buffer of at least 10 characters, and a value
// that fits in that range, this unpacks it.
char *bcdinto(int x, char buf[10]) {
        char *s = buf+10;
        int digitpair;
        *--s = '\0';
        while (x > 0) {
                x = divrem100(x);
                digitpair = bcdigits[(x >> 24) & 0xFF];
                x = x & 0xFFFFFF;
                *--s = '0' + (digitpair & 0xF);
                *--s = '0' + (digitpair >> 4);
        }
        *--s = '0';
        while (s[0] == '0' && s[1] != '\0') {
                ++s;
        }
        return s;
}

The idea here is that we can perform half as many division-and-remainder ops to get pairs of digits, then extract the digits in a few cycles each. So we add I dunno, maybe 10-15 cycles per pair of digits, but remove half the iterations of the loop. (And you could drop the '0'+ parts from the lines if you really want 0x1 instead of '1' for the digits.)

 

 

Last Edited: Sun. Apr 5, 2020 - 06:17 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

westfw wrote:

I'd guess you could compute X/5, where X is an 8-bit value,

We want 32bits, not 8bits.  That makes a lookup table impractical.

 

Yes and no.  devil

 

Given a 32-bit input value, that's ~= 4 billion locations, each of which needs four bytes of space to store the result (less two bits, but who cares), so about 16G-Bytes of memory.  Digikey offers 16GB memory modules for US$200-300-ish:

 

https://www.digikey.com/product-...

 

But Digikey doesn't use the case-format for Bytes vs. bits, so you might need eight of them.  Still, not entirely unreasonable, if you really really have to do very fast look-up table division (and have awhile to load up the lookup table to start with). 

 

You could also compress the whole thing by only storing every other, or every fourth, result - you'll get some error, but all these ways do, and save some more table space - which means you might be able to use fewer memory modules.  Digikey also sells bigger ones.

 

And Yes, I Know, There Are Better Ways.  Get a chip with a hardware divider, for one.

 

Memory chips are just stupidly large these days...  laugh  S.

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Ah, modular arithmetic... 4%5 = 4 but we can interpret it as -1, therefore (4^n)%5 = (-1)^n.

This means the decimal representation of powers of 4 ends in either 4 or 6 (alternates). Yeah, OT but I figured this out thanks to this thread (thinking about the methods posted by skeeve).

I never studied number theory formally, I can just comprehend this basic stuff, but still I find it very interesting.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

This is the slowest fast division, 2 weeks and still calculating.

John Samperi

Ampertronics Pty. Ltd.

www.ampertronics.com.au

* Electronic Design * Custom Products * Contract Assembly

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

the_real_seebs wrote:

westfw wrote:

 

I'd guess you could compute X/5, where X is an 8-bit value,

We want 32bits, not 8bits.  That makes a lookup table impractical.

An interesting question is whether you can take advantage of the CCCCCCCC... bit pattern to implement the multiplication faster or smaller than you could do a generic 32x32 multiply with an 8bit multiplier (and, I guess: do you need all 32bits of the constant to get an accurate divisor, for these specific cases?

 

 

 

That was directed specifically at KerimF's thing, which has an inner loop to compute X/5 on an 8-bit value.

My long division algorithm could be sped up with a 0x500-byte table of quotients.

That would make it 47 cycles.  My modular arithmetic algorithm beats it.

A remainder table is actually slower.

If the table can be placed in RAM, that would save 4 cycles and beat my modular arithmetic algorithm.

Iluvatar is the better part of Valar.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I did confirm, by the way, that "divide by 100 instead of by 10" wins significantly.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Some programs for appilcation   (rebuilf for Atmega32A )

Attachment(s): 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>

int32_t  div_const( int32_t N , uint32_t D  )
{
int32_t Q=0;
int32_t R=(int32_t )N; 

while(R>=D){ Q++; R-=D; }
printf ("\nN=%ld R=%ld  Q=%ld  D=%ld",N,R,Q,D); 
return  R;
}

int main()
{
int32_t n=1597501;
int32_t r; 

r =div_const(n , 100000   );
r =div_const(r , 10000   );
r =div_const(r , 1000   );
r =div_const(r , 100   );
r =div_const(r , 10   );
printf ("\nr= %d ", (int)r);
return 0;
}

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Please see Tip #1 in my signature (below) for how to properly post source code!

Top Tips:

  1. How to properly post source code - see: https://www.avrfreaks.net/comment... - also how to properly include images/pictures
  2. "Garbage" characters on a serial terminal are (almost?) invariably due to wrong baud rate - see: https://learn.sparkfun.com/tutorials/serial-communication
  3. Wrong baud rate is usually due to not running at the speed you thought; check by blinking a LED to see if you get the speed you expected
  4. Difference between a crystal, and a crystal oscillatorhttps://www.avrfreaks.net/comment...
  5. When your question is resolved, mark the solution: https://www.avrfreaks.net/comment...
  6. Beginner's "Getting Started" tips: https://www.avrfreaks.net/comment...
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>

/*
int32_t  div_const( int32_t N , uint32_t D  )
{
uint8_t Q=0;
int32_t R=(int32_t )N; 
while(R>=D){ Q++; R-=D; }
printf ("\nN=%ld R=%ld  Q=%ld  D=%ld",N,R,Q,D); 


return  R;
}
*/
uint8_t digits[6];

void GetDigits ( int32_t N )
{
uint8_t Q=0;
int32_t R=(int32_t )N;


//Q=0; 
//while(R>=1000000){ Q++; R-=1000000; }
//printf ("\n  R=%ld  Q=%d   " ,R,Q );
//digits[6]=Q;


 
while(R>=100000){ Q++; R-=100000; }
printf ("\n  R=%ld  Q=%d   " ,R,Q );
digits[5]=Q;
Q=0;     
while(R>=10000){ Q++; R-=10000; }
printf ("\n  R=%ld  Q=%d   " ,R,Q );
digits[4]=Q;
Q=0;     
while(R>=1000){ Q++; R-=1000; }
printf ("\n  R=%ld  Q=%d   " ,R,Q );
digits[3]=Q;    
Q=0;     
while(R>=100){ Q++; R-=100; }
printf ("\n  R=%ld  Q=%d   " ,R,Q );
digits[2]=Q;
Q=0;     
while(R>=10){ Q++; R-=10; }
printf ("\n  R=%ld  Q=%d   " ,R,Q );
digits[1]=Q;

digits[0]=(uint8_t) R;
printf ("\n    R=%d   ", (int)R  );

}

int main()
{
int32_t n=1599999;
//int32_t r=n;  

GetDigits ( n );

//r =div_const(n , 1000000   );
/*
r =div_const(r , 100000   );
r =div_const(r , 10000   );
r =div_const(r , 1000   );
r =div_const(r , 100   );
r =div_const(r , 10   );
printf ("\nr= %d ", (int)r);
*/
return 0;
}

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

rebuild  for AVR

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

You don't believe in indentation in your code, then?

 

frown

Top Tips:

  1. How to properly post source code - see: https://www.avrfreaks.net/comment... - also how to properly include images/pictures
  2. "Garbage" characters on a serial terminal are (almost?) invariably due to wrong baud rate - see: https://learn.sparkfun.com/tutorials/serial-communication
  3. Wrong baud rate is usually due to not running at the speed you thought; check by blinking a LED to see if you get the speed you expected
  4. Difference between a crystal, and a crystal oscillatorhttps://www.avrfreaks.net/comment...
  5. When your question is resolved, mark the solution: https://www.avrfreaks.net/comment...
  6. Beginner's "Getting Started" tips: https://www.avrfreaks.net/comment...
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

awneil wrote:

You don't believe in indentation in your code, then?

 

frown

 

It'sAllAConspiracyByBigTab,andtheirfellowkeyboardFatCatstheSpacebar.Cutthemdowntosize!!11231!!enlightenedS.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Beware the Shift!

David

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>

/*
int16_t  div_const( uint16_t N , uint16_t D  )
{
uint16_t Q=0;
int16_t R=(int16_t )N; 
while(R>=D){ Q++; R-=D; }
printf ("\nN=%ld R=%ld  Q=%ld  D=%ld",N,R,Q,D); 


return  R;
}
*/
 
int16_t  div_5( uint16_t N   )
{
int16_t Q=0;
int16_t R=(int16_t ) N; 
while(R>=0x0005){ Q++; R-=0x0005; }
//printf ("\nN=%ld R=%ld  Q=%ld  ",N,R,Q ); 
return  (int16_t)Q;
}

  
 

int main()
{
 uint16_t n ;
 int delta; 
 for(n=0; n<=16000; n++)
 {

 delta= (int)(n/5)-div_5(n);
 if(delta!=0)
  { printf ("\ndelta=% d , n=%d  ",delta,(int)n  ); }
 }



return 0;
}

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>

/*
int32_t  div_const( int32_t N , uint32_t D  )
{
uint8_t Q=0;
int32_t R=(int32_t )N; 
while(R>=D){ Q++; R-=D; }
printf ("\nN=%ld R=%ld  Q=%ld  D=%ld",N,R,Q,D); 


return  R;
}
*/
uint8_t digits[6];

void GetDigits ( int32_t N )
{
uint8_t Q=0;
int32_t R=(int32_t )N;


//Q=0; 
//while(R>=1000000){ Q++; R-=1000000; }
//printf ("\n  R=%ld  Q=%d   " ,R,Q );
//digits[6]=Q;

 
while(R>=100000){ Q++; R-=100000; }
//printf ("\n  R=%ld  Q=%d   " ,R,Q );
 digits[5]=Q;
Q=0; 	
while(R>=10000){ Q++; R-=10000; }
//printf ("\n  R=%ld  Q=%d   " ,R,Q );
digits[4]=Q;
Q=0; 	
while(R>=1000){ Q++; R-=1000; }
//printf ("\n  R=%ld  Q=%d   " ,R,Q );
digits[3]=Q;	
Q=0; 	
while(R>=100){ Q++; R-=100; }
//printf ("\n  R=%ld  Q=%d   " ,R,Q );
digits[2]=Q;
Q=0; 	
while(R>=10){ Q++; R-=10; }
//printf ("\n  R=%ld  Q=%d   " ,R,Q );
digits[1]=Q;

digits[0]=(uint8_t) R;
//printf ("\n    R=%d   ", (int)R  );

}

int main()
{
int32_t n=0;
//int32_t r=n;  
for (n=0;n<=1599999;n++){

GetDigits ( n );

//r =div_const(n , 1000000   );
/*
r =div_const(r , 100000   );
r =div_const(r , 10000   );
r =div_const(r , 1000   );
r =div_const(r , 100   );
r =div_const(r , 10   );
printf ("\nr= %d ", (int)r);
*/
//printf ("\ndigit= %d ", (int)digits[5]  ); 
//printf ("\ndigit= %d ", (int)digits[4] ); 
//printf ("\ndigit= %d ", (int)digits[3] ); 
//printf ("\ndigit= %d ", (int)digits[2] ); 
//printf ("\ndigit= %d ", (int)digits[1] );
//printf ("\ndigit= %d ", (int)digits[0] );

//for test
 int32_t res=0;
res+=(int32_t)digits[0];
res+=(int32_t)digits[1]*10;
res+=(int32_t)digits[2]*100;
res+=(int32_t)digits[3]*1000;
res+=(int32_t)digits[4]*10000;
res+=(int32_t)digits[5]*100000;
if (res-(n) !=0) {printf  ("\n result= %ld ", res-(n));}
}
 


return 0;
}

(I use  0-15  for  MSB and 0-9  for other in my circuit )

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

and still no indentation.

 

frown

Top Tips:

  1. How to properly post source code - see: https://www.avrfreaks.net/comment... - also how to properly include images/pictures
  2. "Garbage" characters on a serial terminal are (almost?) invariably due to wrong baud rate - see: https://learn.sparkfun.com/tutorials/serial-communication
  3. Wrong baud rate is usually due to not running at the speed you thought; check by blinking a LED to see if you get the speed you expected
  4. Difference between a crystal, and a crystal oscillatorhttps://www.avrfreaks.net/comment...
  5. When your question is resolved, mark the solution: https://www.avrfreaks.net/comment...
  6. Beginner's "Getting Started" tips: https://www.avrfreaks.net/comment...
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I can't stand it any more!

#include <stdio.h>
#include <stdint.h>

uint8_t digits[6];

void GetDigits (int32_t N)
{

  int32_t R = N;
  uint8_t Q;

  Q = 0;
  while (R >= 100000)
  {
    Q++;
    R -= 100000;
  }
  digits[5] = Q;

  Q = 0;
  while (R >= 10000)
  {
    Q++;
    R-=10000;
  }
  digits[4] = Q;

  Q = 0;
  while (R >= 1000)
  {
    Q++;
    R-=1000;
  }
  digits[3] = Q;

  Q = 0;
  while (R >= 100)
  {
    Q++;
    R -= 100;
  }
  digits[2] = Q;

  Q = 0;
  while (R >= 10)
  {
    Q++;
    R -= 10;
  }
  digits[1] = Q;

  digits[0] = R;

}



int main()
{

  int32_t n = 0;
  for (n=0; n<=1599999; n++)
  {

    GetDigits (n);

    int32_t res=0;
    res += digits[0];
    res += digits[1] * 10;
    res += digits[2] * 100;
    res += digits[3] * 1000;
    res += digits[4] * 10000;
    res += digits[5] * 100000;
    if ((res - n) != 0)
    {
      printf("\n result= %ld ", res - n);
    }

  }

  return 0;
}

Wasn't this supposed to be 'Fast dividing by 5 and 10?'  What is fast about this code?

"Experience is what enables you to recognise a mistake the second time you make it."

"Good judgement comes from experience.  Experience comes from bad judgement."

"Wisdom is always wont to arrive late, and to be a little approximate on first possession."

"When you hear hoofbeats, think horses, not unicorns."

"Fast.  Cheap.  Good.  Pick two."

"We see a lot of arses on handlebars around here." - [J Ekdahl]

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

which still cries out for a macro or inline function

 

frown

 

and what does this have to do with "Fast dividing by 5 and by 10 for uint32_t input data for ATMEGA16A" ?

Top Tips:

  1. How to properly post source code - see: https://www.avrfreaks.net/comment... - also how to properly include images/pictures
  2. "Garbage" characters on a serial terminal are (almost?) invariably due to wrong baud rate - see: https://learn.sparkfun.com/tutorials/serial-communication
  3. Wrong baud rate is usually due to not running at the speed you thought; check by blinking a LED to see if you get the speed you expected
  4. Difference between a crystal, and a crystal oscillatorhttps://www.avrfreaks.net/comment...
  5. When your question is resolved, mark the solution: https://www.avrfreaks.net/comment...
  6. Beginner's "Getting Started" tips: https://www.avrfreaks.net/comment...

Pages