int main(void)
{
/* Replace with your application code */
while (1)
{
uint32_t in =0; //= 1599999; // -> 15 9 9 9 9 9
//for example, input from port B
DDRB=0;
DDRB=0x00;
in|=(uint32_t)((uint32_t)PINB<<0);
in|=(uint32_t)((uint32_t)PINB<<8);
in|=(uint32_t)((uint32_t)PINB<<16);
in|=(uint32_t)((uint32_t)PINB<<24);
// GetDigitsFromUint32_t( in);
uint32_t out;
out= DivideBy5(in);
//for example, out bytes to port D
DDRD=0xff;
PORTD=(uint8_t)((out&0x000000ff)>>0);
PORTD=(uint8_t)((out&0x0000ff00)>>8);
PORTD=(uint8_t)((out&0x00ff0000)>>16);
PORTD=(uint8_t)((out&0xff000000)>>24);
Posted by avrcandies: Sun. Mar 29, 2020 - 09:14 AM
1
2
3
4
5
Total votes: 0
What is the point of posting all of his oddball code? At least explain what exactly you are doing (methods, how, why) (this is a discussion forum)...otherwise it will likely all be unused (even if it is great) and just swept into the dust pan and put in the dumpster.
What is the advantage(s) of all of this code?
Why have you completely forgotten to at least comment any of your code?
When in the dark remember-the future looks brighter than ever. I look forward to being able to predict the future!
I have a pretty strong feeling we're being trolled. At the very least OP seems incapable of taking onboard any advice being given. #1 talked about an assembler solution, many excellent ideas were proposed, all ignored. When OP showed code using / and % I pointed out that there was no point in doing TWO divides when it could be done with one. Also ignored. Don't see the point in trying to advise any more.
#5 "If you think you need floating point to solve the problem then you don't understand the problem. If you really do need floating point then you have a problem you do not understand."
#define BUFSIZE 50
char *decimal (unsigned int x)
{
static char buf[BUFSIZE];
char *bp = buf + BUFSIZE - 1;
*bp = 0;
do {
*--bp = ’0’ + x % 10;
x /= 10;
} while (x != 0);
return bp; /* Return pointer to first digit */
}
With this research, first published quite some time ago, the actual implementation can be... quite a bit different.
The paper gives samples in assembly for common targets, and whaddya know, it's very similar to the code you already listed, only with the constant 0xcccccccd because it's for 32-bit values.
That said: I don't think this is gonna work out. Looking at your most recent code, you have four consecutive reads from PINB, and then in another function, four consecutive writes to PORTD. That is not how anything works. I don't think detailed discussions of the multiplication algorithm are going to do you any good if you don't know how to produce working code, and can't articulate what you want to do with it.
I'd guess you could compute X/5, where X is an 8-bit value,
We want 32bits, not 8bits. That makes a lookup table impractical.
An interesting question is whether you can take advantage of the CCCCCCCC... bit pattern to implement the multiplication faster or smaller than you could do a generic 32x32 multiply with an 8bit multiplier (and, I guess: do you need all 32bits of the constant to get an accurate divisor, for these specific cases?
I'd guess you could compute X/5, where X is an 8-bit value,
We want 32bits, not 8bits. That makes a lookup table impractical.
An interesting question is whether you can take advantage of the CCCCCCCC... bit pattern to implement the multiplication faster or smaller than you could do a generic 32x32 multiply with an 8bit multiplier (and, I guess: do you need all 32bits of the constant to get an accurate divisor, for these specific cases?
That was directed specifically at KerimF's thing, which has an inner loop to compute X/5 on an 8-bit value.
So at a smallish overhead in progmem, that inner loop could go away.
An interesting question is whether you can take advantage of the CCCCCCCC... bit pattern to implement the multiplication faster or smaller than you could do a generic 32x32 multiply with an 8bit multiplier (and, I guess: do you need all 32bits of the constant to get an accurate divisor, for these specific cases?
We can, we only need to do a 32*8 mul, the rest is just sums.
But I guess once you get more than 8bits, a shift-by-4 doesn't optimize easily, and if you have an x8 multiply instruction, you might as well use it...
Thought sure I'd posted this earlier: Edit: I had, in another divide by 5 thread.
; C-callable returns result of dividing 32-bit unsigned integer by 5
.global divide32u5
divide32u5:
; input is R27:24, result goes back there
; uses long division with byte-sized digits
; R21:18 holds the quotient before being moved to R27:24
; Of the registers used, the avr-gcc ABI only requires R1 to be restored
.def C5 = R30
LDI C5, 5
.def C51 = R31
LDI C51, 51
.macro div i1, i0, q, flag
; q = i1:0/5, i0-=5*q
MUL \i0, C51
MOV \q, R1
.if \flag
; requires i1<=4
MUL \i1, C51
ADD \q, R0
.ifend
MUL \q, C5
SUB \i0, R0
; i0<=9
CP \i0, C5
BRLT 1f
SUB \i0, CR
INC \q
1: ; i0 <= 4
.endm
div -1, R27, R21, 0
div R27, R26, R20, 1
div R26, R25, R19, 1
div R25, R24, R18, 1
MOVW R24, R18
MOVW R26, R20
CLR R1
RET
44 words and up to 58 cycles, counting the return, but not the call.
Done in assembly, I suspect the 0xCCCCCCCC method would be less than 50 cycles.
; x-= x % 5 ; x= x*0xCCCCCCCC+x
; 0xCCCCCCCD * 5 = 1 mod 2**32
.global div5u32
div5u32:
; the input registers, original x
.def I0 = R24
.def I1 = R25
.def I2 = R26
.def I3 = R27
; becomes x * 0xCC
.def XCC0 = R20
.def XCC1 = R21
.def XCC2 = R22
.def XCC3 = R23
; Holds 0xCC or x % 5
.def CC = R19
.def Xmod5 = R19
; Find x % 5
MOV Xmod5, I0
ADC Xmod5, I1
ADC Xmod5, I2
ADC Xmod5, I3
ADC Xmod5, R1
; subtract
SUB I0, Xmod5
SBC I1, R1
SBC I2, R1
SBC I3, R1
; compute x * 0xCC
PUSH CC
LDI CC, 0xCC
MUL CC, I0
MOVW XCC0, R0
MUL CC, I2
MOVW XCC2, R0
MUL CC, I3
ADD XCC3, R0
MUL CC, I1
ADD XCC1, R0
ADC XCC2, R1
BRCC 1f
INC XCC3
1:
; x+= XCC + (XCC<<8) + (XCC<<16) + (XCC<<24)
ADD I0, XCC0
ADC I1, XCC1
ADC I2, XCC2
ADC I3, XCC3
ADD I1, XCC0
ADC I2, XCC1
ADC I3, XCC2
ADD I2, XCC0
ADD I3, XCC1
ADD I3, XCC0
CLR R1
POP CC
RET
44 cycles counting the return, but not the call.
I don't think this works. First, if it follows the C calling convention, x should be in R25:R22. Second, it doesn't calculate x mod 5. Running through the code in my head, for x in {0..255}, Xmod5 (R19) = x. For x in {256..511}, Xmod5 = x mod 255.
I have no special talents. I am only passionately curious. - Albert Einstein
I don't think this works. First, if it follows the C calling convention, x should be in R25:R22. Second, it doesn't calculate x mod 5. Running through the code in my head, for x in {0..255}, Xmod5 (R19) = x. For x in {256..511}, Xmod5 = x mod 255.
This should work.
I've repaired the calling convention and added the code for Xmod5 %= 5.
; x-= x % 5 ; x= x*0xCCCCCCCC+x
; 0xCCCCCCCD * 5 = 1 mod 2**32
.global div5u32
div5u32:
; the input registers, original x
.def I0 = R24
.def I1 = R25
.def I2 = R26
.def I3 = R27
; becomes x * 0xCC
.def XCC0 = R20
.def XCC1 = R21
.def XCC2 = R22
.def XCC3 = R23
.def Xmod5 = R19
.def Rsave = R18
.def CC = R31
; Find x % 5
MOV Xmod5, I0
ADC Xmod5, I1
ADC Xmod5, I2
ADC Xmod5, I3
ADC Xmod5, R1
; if you have a spare 256 bytes, the following 11 cycles can be replaced by 7 cycles
MOV Rsave, Xmod5
SWAP Rsave
ADD Xmod5, Rsave ; half carry same as carry
ADC Xmod5, R1 ; no carry out
; carry clear
ANDI Xmod5, 0xF
SBRC Xmod5, 3
SUBI Xmod5, 10
BRCS add5
; Xmod5 in 0..7
SUBI Xmod5, 5
BRCC 1f
add5:
ADDI Xmod5, 5
1:
; subtract
SUB I0, Xmod5
SBC I1, R1
SBC I2, R1
SBC I3, R1
; compute x * 0xCC
LDI CC, 0xCC
MUL CC, I0
MOVW XCC0, R0
MUL CC, I2
MOVW XCC2, R0
MUL CC, I3
ADD XCC3, R0
MUL CC, I1
ADD XCC1, R0
ADC XCC2, R1
BRCC 1f
INC XCC3
1:
; x+= XCC + (XCC<<8) + (XCC<<16) + (XCC<<24)
ADD I0, XCC0
ADC I1, XCC1
ADC I2, XCC2
ADC I3, XCC3
ADD I1, XCC0
ADC I2, XCC1
ADC I3, XCC2
ADD I2, XCC0
ADD I3, XCC1
ADD I3, XCC0
CLR R1
RET
The added code costs 11 cycles. Better register selection save 4 cycles.
52 cycles isn't less than 50, but is slightly better than long division.
I expect that it does not scale: Long division would beat modular arithmetic even for 40-bit unsigneds.
Hmm. So, idle thought: 2^24 is 16M. The OP's examples appeared to strongly suggest only needing 7-digit values... which means that you possibly don't need to actually compute the top byte of the result.
For extra credit: The real reason to do this is specifically to produce BCD encoding. Can it be made faster given the knowledge that the real goal is the BCD encoding?
Consider, if you will, an algorithm which does divide/remainder by 100, instead of by 5 or 10. So your basic loop is conceptually:
// converts 0-99 into digit pairs in hex. you could also use
// short and do 0x0502 for [52], for instance.
unsigned char bcdigits[100] = {
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
};
// this part is horribly inefficient; it's a placeholder for where
// you'd put fancy hand-tuned assembly for computing these
// partial results.
int divrem100(int x) {
return ((x % 100) << 24) | (x / 100);
}
// called with a buffer of at least 10 characters, and a value
// that fits in that range, this unpacks it.
char *bcdinto(int x, char buf[10]) {
char *s = buf+10;
int digitpair;
*--s = '\0';
while (x > 0) {
x = divrem100(x);
digitpair = bcdigits[(x >> 24) & 0xFF];
x = x & 0xFFFFFF;
*--s = '0' + (digitpair & 0xF);
*--s = '0' + (digitpair >> 4);
}
*--s = '0';
while (s[0] == '0' && s[1] != '\0') {
++s;
}
return s;
}
The idea here is that we can perform half as many division-and-remainder ops to get pairs of digits, then extract the digits in a few cycles each. So we add I dunno, maybe 10-15 cycles per pair of digits, but remove half the iterations of the loop. (And you could drop the '0'+ parts from the lines if you really want 0x1 instead of '1' for the digits.)
I'd guess you could compute X/5, where X is an 8-bit value,
We want 32bits, not 8bits. That makes a lookup table impractical.
Yes and no.
Given a 32-bit input value, that's ~= 4 billion locations, each of which needs four bytes of space to store the result (less two bits, but who cares), so about 16G-Bytes of memory. Digikey offers 16GB memory modules for US$200-300-ish:
But Digikey doesn't use the case-format for Bytes vs. bits, so you might need eight of them. Still, not entirely unreasonable, if you really really have to do very fast look-up table division (and have awhile to load up the lookup table to start with).
You could also compress the whole thing by only storing every other, or every fourth, result - you'll get some error, but all these ways do, and save some more table space - which means you might be able to use fewer memory modules. Digikey also sells bigger ones.
And Yes, I Know, There Are Better Ways. Get a chip with a hardware divider, for one.
Memory chips are just stupidly large these days... S.
Ah, modular arithmetic... 4%5 = 4 but we can interpret it as -1, therefore (4^n)%5 = (-1)^n.
This means the decimal representation of powers of 4 ends in either 4 or 6 (alternates). Yeah, OT but I figured this out thanks to this thread (thinking about the methods posted by skeeve).
I never studied number theory formally, I can just comprehend this basic stuff, but still I find it very interesting.
I'd guess you could compute X/5, where X is an 8-bit value,
We want 32bits, not 8bits. That makes a lookup table impractical.
An interesting question is whether you can take advantage of the CCCCCCCC... bit pattern to implement the multiplication faster or smaller than you could do a generic 32x32 multiply with an 8bit multiplier (and, I guess: do you need all 32bits of the constant to get an accurate divisor, for these specific cases?
That was directed specifically at KerimF's thing, which has an inner loop to compute X/5 on an 8-bit value.
My long division algorithm could be sped up with a 0x500-byte table of quotients.
That would make it 47 cycles. My modular arithmetic algorithm beats it.
A remainder table is actually slower.
If the table can be placed in RAM, that would save 4 cycles and beat my modular arithmetic algorithm.
GccApplication2.elf: file format elf32-avr
Sections:
Idx Name Size VMA LMA File off Algn
0 .text 00000104 00000000 00000000 00000054 2**1
CONTENTS, ALLOC, LOAD, READONLY, CODE
1 .data 00000000 00800060 00800060 00000158 2**0
CONTENTS, ALLOC, LOAD, DATA
2 .comment 00000030 00000000 00000000 00000158 2**0
CONTENTS, READONLY
3 .note.gnu.avr.deviceinfo 0000003c 00000000 00000000 00000188 2**2
CONTENTS, READONLY
4 .debug_aranges 00000038 00000000 00000000 000001c4 2**0
CONTENTS, READONLY, DEBUGGING
5 .debug_info 000007da 00000000 00000000 000001fc 2**0
CONTENTS, READONLY, DEBUGGING
6 .debug_abbrev 00000631 00000000 00000000 000009d6 2**0
CONTENTS, READONLY, DEBUGGING
7 .debug_line 000002b9 00000000 00000000 00001007 2**0
CONTENTS, READONLY, DEBUGGING
8 .debug_frame 00000074 00000000 00000000 000012c0 2**2
CONTENTS, READONLY, DEBUGGING
9 .debug_str 00000337 00000000 00000000 00001334 2**0
CONTENTS, READONLY, DEBUGGING
10 .debug_loc 0000044a 00000000 00000000 0000166b 2**0
CONTENTS, READONLY, DEBUGGING
11 .debug_ranges 00000040 00000000 00000000 00001ab5 2**0
CONTENTS, READONLY, DEBUGGING
Disassembly of section .text:
00000000 <__vectors>:
0: 0c 94 2a 00 jmp 0x54 ; 0x54 <__ctors_end>
4: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
8: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
c: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
10: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
14: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
18: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
1c: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
20: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
24: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
28: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
2c: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
30: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
34: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
38: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
3c: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
40: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
44: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
48: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
4c: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
50: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
00000054 <__ctors_end>:
54: 11 24 eor r1, r1
56: 1f be out 0x3f, r1 ; 63
58: cf e5 ldi r28, 0x5F ; 95
5a: d4 e0 ldi r29, 0x04 ; 4
5c: de bf out 0x3e, r29 ; 62
5e: cd bf out 0x3d, r28 ; 61
60: 0e 94 36 00 call 0x6c ; 0x6c <main>
64: 0c 94 80 00 jmp 0x100 ; 0x100 <_exit>
00000068 <__bad_interrupt>:
68: 0c 94 00 00 jmp 0 ; 0x0 <__vectors>
0000006c <main>:
return ;
}
uint32_t DivideBy10(uint32_t inp)
{
return (uint32_t) (inp/10);
6c: 0f 2e mov r0, r31
6e: fa e0 ldi r31, 0x0A ; 10
70: cf 2e mov r12, r31
72: d1 2c mov r13, r1
74: e1 2c mov r14, r1
76: f1 2c mov r15, r1
78: f0 2d mov r31, r0
// GetDigitsFromUint32_t( in);
uint32_t out;
out= DivideBy10(in);
//for example, out bytes to port D
DDRD=0xff;
7a: cf ef ldi r28, 0xFF ; 255
/* Replace with your application code */
while (1)
{
uint32_t in =0; //= 1599999; // -> 15 9 9 9 9 9
//for example, input from port B
DDRB=0;
7c: 17 ba out 0x17, r1 ; 23
DDRB=0x00;
7e: 17 ba out 0x17, r1 ; 23
in|=(uint32_t)((uint32_t)PINB<<0);
80: 26 b3 in r18, 0x16 ; 22
in|=(uint32_t)((uint32_t)PINB<<8);
82: 36 b3 in r19, 0x16 ; 22
in|=(uint32_t)((uint32_t)PINB<<16);
84: 66 b3 in r22, 0x16 ; 22
86: 86 2f mov r24, r22
88: 90 e0 ldi r25, 0x00 ; 0
8a: a0 e0 ldi r26, 0x00 ; 0
8c: b0 e0 ldi r27, 0x00 ; 0
8e: dc 01 movw r26, r24
90: 99 27 eor r25, r25
92: 88 27 eor r24, r24
94: 93 2b or r25, r19
96: 82 2b or r24, r18
in|=(uint32_t)((uint32_t)PINB<<24);
98: 26 b3 in r18, 0x16 ; 22
return ;
}
uint32_t DivideBy10(uint32_t inp)
{
return (uint32_t) (inp/10);
9a: bc 01 movw r22, r24
9c: cd 01 movw r24, r26
9e: 92 2b or r25, r18
a0: a7 01 movw r20, r14
a2: 96 01 movw r18, r12
a4: 0e 94 5e 00 call 0xbc ; 0xbc <__udivmodsi4>
// GetDigitsFromUint32_t( in);
uint32_t out;
out= DivideBy10(in);
//for example, out bytes to port D
DDRD=0xff;
a8: c1 bb out 0x11, r28 ; 17
PORTD=(uint8_t)((out&0x000000ff)>>0);
aa: 22 bb out 0x12, r18 ; 18
PORTD=(uint8_t)((out&0x0000ff00)>>8);
ac: 32 bb out 0x12, r19 ; 18
PORTD=(uint8_t)((out&0x00ff0000)>>16);
ae: 42 bb out 0x12, r20 ; 18
PORTD=(uint8_t)((out&0xff000000)>>24);
b0: 85 2f mov r24, r21
b2: 99 27 eor r25, r25
b4: aa 27 eor r26, r26
b6: bb 27 eor r27, r27
b8: 82 bb out 0x12, r24 ; 18
ba: e0 cf rjmp .-64 ; 0x7c <main+0x10>
000000bc <__udivmodsi4>:
bc: a1 e2 ldi r26, 0x21 ; 33
be: 1a 2e mov r1, r26
c0: aa 1b sub r26, r26
c2: bb 1b sub r27, r27
c4: fd 01 movw r30, r26
c6: 0d c0 rjmp .+26 ; 0xe2 <__udivmodsi4_ep>
000000c8 <__udivmodsi4_loop>:
c8: aa 1f adc r26, r26
ca: bb 1f adc r27, r27
cc: ee 1f adc r30, r30
ce: ff 1f adc r31, r31
d0: a2 17 cp r26, r18
d2: b3 07 cpc r27, r19
d4: e4 07 cpc r30, r20
d6: f5 07 cpc r31, r21
d8: 20 f0 brcs .+8 ; 0xe2 <__udivmodsi4_ep>
da: a2 1b sub r26, r18
dc: b3 0b sbc r27, r19
de: e4 0b sbc r30, r20
e0: f5 0b sbc r31, r21
000000e2 <__udivmodsi4_ep>:
e2: 66 1f adc r22, r22
e4: 77 1f adc r23, r23
e6: 88 1f adc r24, r24
e8: 99 1f adc r25, r25
ea: 1a 94 dec r1
ec: 69 f7 brne .-38 ; 0xc8 <__udivmodsi4_loop>
ee: 60 95 com r22
f0: 70 95 com r23
f2: 80 95 com r24
f4: 90 95 com r25
f6: 9b 01 movw r18, r22
f8: ac 01 movw r20, r24
fa: bd 01 movw r22, r26
fc: cf 01 movw r24, r30
fe: 08 95 ret
00000100 <_exit>:
100: f8 94 cli
00000102 <__stop_program>:
102: ff cf rjmp .-2 ; 0x102 <__stop_program>
- Log in or register to post comments
Top/*
* GccApplication2.c
*
* Created: 29.03.2020 10:38:02
* Author : USERPC01
*/
#include <avr/io.h>
#include <math.h>
/*
uint8_t div32bit_mod (uint32_t in, uint32_t div, uint32_t *mod )
{
uint32_t res;
res=(uint32_t)in /div;
*mod=(uint32_t) (in - (res*div)) ;
*mod=(uint32_t)(in%div); // (in - (res*div)) ;
return (uint8_t)(in/div) ;// res;
}*/
uint8_t divmod( uint32_t inp, uint32_t div, uint32_t *mod )
{
//return (uint8_t) div32bit_mod( inp, div, reminder );
*mod=(uint32_t)(inp%div); // (in - (res*div)) ;
return (uint8_t)(inp/div) ;// res;
}
void GetDigitsFromUint32_t( uint32_t in ) //"uint24_t"
{
uint8_t digits[5];
/*
digits[0]=(uint8_t)(in/100000) ; in=in%100000;
digits[1]=(uint8_t)(in/10000) ; in=in%10000;
digits[2]=(uint8_t)(in/1000) ; in=in%1000;
digits[3]=(uint8_t)(in/100) ; in=in%100;
digits[4]=(uint8_t)(in/10) ; in=in%10;
digits[5]=(uint8_t) (in);
*/
digits[0]=(uint8_t) divmod( in , 0x000186A0 /*100000*/ , &in );
digits[1]=(uint8_t) divmod( in , 0x00002710 /*10000*/, &in );
digits[2]=(uint8_t) divmod( in , 0x000003e8 /*1000*/, &in );
digits[3]=(uint8_t) divmod( in , 0x00000064 /*100*/, &in );
digits[4]=(uint8_t) divmod( in , 0x0000000A /*10*/, &in );
digits[5]=(uint8_t) divmod( in , 0x00000001 /* 1*/ , &in );
//for example, for output to PORTD
DDRD=0xFF;
PORTD=digits[0];
PORTD=digits[1];
PORTD=digits[2];
PORTD=digits[3];
PORTD=digits[4];
PORTD=digits[5];
return ;
}
uint32_t DivideBy5(uint32_t inp)
{
return (uint32_t) (inp/5);
}
//for debugger test only
int main(void)
{
/* Replace with your application code */
while (1)
{
uint32_t in =0; //= 1599999; // -> 15 9 9 9 9 9
//for example, input from port B
DDRB=0;
DDRB=0x00;
in|=(uint32_t)((uint32_t)PINB<<0);
in|=(uint32_t)((uint32_t)PINB<<8);
in|=(uint32_t)((uint32_t)PINB<<16);
in|=(uint32_t)((uint32_t)PINB<<24);
// GetDigitsFromUint32_t( in);
uint32_t out;
out= DivideBy5(in);
//for example, out bytes to port D
DDRD=0xff;
PORTD=(uint8_t)((out&0x000000ff)>>0);
PORTD=(uint8_t)((out&0x0000ff00)>>8);
PORTD=(uint8_t)((out&0x00ff0000)>>16);
PORTD=(uint8_t)((out&0xff000000)>>24);
}
}
- Log in or register to post comments
TopGccApplication2.elf: file format elf32-avr
Sections:
Idx Name Size VMA LMA File off Algn
0 .text 00000196 00000000 00000000 00000054 2**1
CONTENTS, ALLOC, LOAD, READONLY, CODE
1 .data 00000000 00800060 00800060 000001ea 2**0
CONTENTS, ALLOC, LOAD, DATA
2 .comment 00000030 00000000 00000000 000001ea 2**0
CONTENTS, READONLY
3 .note.gnu.avr.deviceinfo 0000003c 00000000 00000000 0000021c 2**2
CONTENTS, READONLY
4 .debug_aranges 00000038 00000000 00000000 00000258 2**0
CONTENTS, READONLY, DEBUGGING
5 .debug_info 000007da 00000000 00000000 00000290 2**0
CONTENTS, READONLY, DEBUGGING
6 .debug_abbrev 00000610 00000000 00000000 00000a6a 2**0
CONTENTS, READONLY, DEBUGGING
7 .debug_line 000002b3 00000000 00000000 0000107a 2**0
CONTENTS, READONLY, DEBUGGING
8 .debug_frame 000000a4 00000000 00000000 00001330 2**2
CONTENTS, READONLY, DEBUGGING
9 .debug_str 00000336 00000000 00000000 000013d4 2**0
CONTENTS, READONLY, DEBUGGING
10 .debug_loc 000004d4 00000000 00000000 0000170a 2**0
CONTENTS, READONLY, DEBUGGING
11 .debug_ranges 00000028 00000000 00000000 00001bde 2**0
CONTENTS, READONLY, DEBUGGING
Disassembly of section .text:
00000000 <__vectors>:
0: 0c 94 2a 00 jmp 0x54 ; 0x54 <__ctors_end>
4: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
8: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
c: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
10: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
14: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
18: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
1c: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
20: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
24: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
28: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
2c: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
30: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
34: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
38: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
3c: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
40: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
44: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
48: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
4c: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
50: 0c 94 34 00 jmp 0x68 ; 0x68 <__bad_interrupt>
00000054 <__ctors_end>:
54: 11 24 eor r1, r1
56: 1f be out 0x3f, r1 ; 63
58: cf e5 ldi r28, 0x5F ; 95
5a: d4 e0 ldi r29, 0x04 ; 4
5c: de bf out 0x3e, r29 ; 62
5e: cd bf out 0x3d, r28 ; 61
60: 0e 94 36 00 call 0x6c ; 0x6c <main>
64: 0c 94 c9 00 jmp 0x192 ; 0x192 <_exit>
00000068 <__bad_interrupt>:
68: 0c 94 00 00 jmp 0 ; 0x0 <__vectors>
0000006c <main>:
// GetDigitsFromUint32_t( in);
uint32_t out;
out= DivideBy5(in);
//for example, out bytes to port D
DDRD=0xff;
6c: cf ef ldi r28, 0xFF ; 255
/* Replace with your application code */
while (1)
{
uint32_t in =0; //= 1599999; // -> 15 9 9 9 9 9
//for example, input from port B
DDRB=0;
6e: 17 ba out 0x17, r1 ; 23
DDRB=0x00;
70: 17 ba out 0x17, r1 ; 23
in|=(uint32_t)((uint32_t)PINB<<0);
72: 26 b3 in r18, 0x16 ; 22
in|=(uint32_t)((uint32_t)PINB<<8);
74: 36 b3 in r19, 0x16 ; 22
in|=(uint32_t)((uint32_t)PINB<<16);
76: 66 b3 in r22, 0x16 ; 22
78: 86 2f mov r24, r22
7a: 90 e0 ldi r25, 0x00 ; 0
7c: a0 e0 ldi r26, 0x00 ; 0
7e: b0 e0 ldi r27, 0x00 ; 0
80: dc 01 movw r26, r24
82: 99 27 eor r25, r25
84: 88 27 eor r24, r24
86: 93 2b or r25, r19
88: 82 2b or r24, r18
in|=(uint32_t)((uint32_t)PINB<<24);
8a: 26 b3 in r18, 0x16 ; 22
return ;
}
uint32_t DivideBy5(uint32_t inp)
{
return (uint32_t) (inp/5);
8c: bc 01 movw r22, r24
8e: cd 01 movw r24, r26
90: 92 2b or r25, r18
92: 2d ec ldi r18, 0xCD ; 205
94: 3c ec ldi r19, 0xCC ; 204
96: 4c ec ldi r20, 0xCC ; 204
98: 5c ec ldi r21, 0xCC ; 204
9a: 0e 94 68 00 call 0xd0 ; 0xd0 <__umulsidi3>
9e: 00 e2 ldi r16, 0x20 ; 32
a0: 0e 94 95 00 call 0x12a ; 0x12a <__lshrdi3>
a4: 82 2e mov r8, r18
a6: 93 2e mov r9, r19
a8: a4 2e mov r10, r20
aa: b5 2e mov r11, r21
ac: b6 94 lsr r11
ae: a7 94 ror r10
b0: 97 94 ror r9
b2: 87 94 ror r8
b4: b6 94 lsr r11
b6: a7 94 ror r10
b8: 97 94 ror r9
ba: 87 94 ror r8
// GetDigitsFromUint32_t( in);
uint32_t out;
out= DivideBy5(in);
//for example, out bytes to port D
DDRD=0xff;
bc: c1 bb out 0x11, r28 ; 17
PORTD=(uint8_t)((out&0x000000ff)>>0);
be: 82 ba out 0x12, r8 ; 18
PORTD=(uint8_t)((out&0x0000ff00)>>8);
c0: 92 ba out 0x12, r9 ; 18
PORTD=(uint8_t)((out&0x00ff0000)>>16);
c2: a2 ba out 0x12, r10 ; 18
PORTD=(uint8_t)((out&0xff000000)>>24);
c4: 8b 2c mov r8, r11
c6: 99 24 eor r9, r9
c8: aa 24 eor r10, r10
ca: bb 24 eor r11, r11
cc: 82 ba out 0x12, r8 ; 18
ce: cf cf rjmp .-98 ; 0x6e <main+0x2>
000000d0 <__umulsidi3>:
d0: e8 94 clt
000000d2 <__umulsidi3_helper>:
d2: df 93 push r29
d4: cf 93 push r28
d6: fc 01 movw r30, r24
d8: db 01 movw r26, r22
da: 0e 94 b1 00 call 0x162 ; 0x162 <__umulhisi3>
de: 7f 93 push r23
e0: 6f 93 push r22
e2: e9 01 movw r28, r18
e4: 9a 01 movw r18, r20
e6: ac 01 movw r20, r24
e8: bf 93 push r27
ea: af 93 push r26
ec: 3f 93 push r19
ee: 2f 93 push r18
f0: df 01 movw r26, r30
f2: 0e 94 b1 00 call 0x162 ; 0x162 <__umulhisi3>
f6: 26 f4 brtc .+8 ; 0x100 <__umulsidi3_helper+0x2e>
f8: 6c 1b sub r22, r28
fa: 7d 0b sbc r23, r29
fc: 82 0b sbc r24, r18
fe: 93 0b sbc r25, r19
100: 9e 01 movw r18, r28
102: eb 01 movw r28, r22
104: fc 01 movw r30, r24
106: 0e 94 c0 00 call 0x180 ; 0x180 <__muldi3_6>
10a: af 91 pop r26
10c: bf 91 pop r27
10e: 2f 91 pop r18
110: 3f 91 pop r19
112: 0e 94 c0 00 call 0x180 ; 0x180 <__muldi3_6>
116: be 01 movw r22, r28
118: cf 01 movw r24, r30
11a: f9 01 movw r30, r18
11c: 2f 91 pop r18
11e: 3f 91 pop r19
120: cf 91 pop r28
122: df 91 pop r29
124: 08 95 ret
00000126 <__ashrdi3>:
126: 97 fb bst r25, 7
128: 10 f8 bld r1, 0
0000012a <__lshrdi3>:
12a: 16 94 lsr r1
12c: 00 08 sbc r0, r0
12e: 0f 93 push r16
130: 08 30 cpi r16, 0x08 ; 8
132: 98 f0 brcs .+38 ; 0x15a <__lshrdi3+0x30>
134: 08 50 subi r16, 0x08 ; 8
136: 23 2f mov r18, r19
138: 34 2f mov r19, r20
13a: 45 2f mov r20, r21
13c: 56 2f mov r21, r22
13e: 67 2f mov r22, r23
140: 78 2f mov r23, r24
142: 89 2f mov r24, r25
144: 90 2d mov r25, r0
146: f4 cf rjmp .-24 ; 0x130 <__lshrdi3+0x6>
148: 05 94 asr r0
14a: 97 95 ror r25
14c: 87 95 ror r24
14e: 77 95 ror r23
150: 67 95 ror r22
152: 57 95 ror r21
154: 47 95 ror r20
156: 37 95 ror r19
158: 27 95 ror r18
15a: 0a 95 dec r16
15c: aa f7 brpl .-22 ; 0x148 <__lshrdi3+0x1e>
15e: 0f 91 pop r16
160: 08 95 ret
00000162 <__umulhisi3>:
162: a2 9f mul r26, r18
164: b0 01 movw r22, r0
166: b3 9f mul r27, r19
168: c0 01 movw r24, r0
16a: a3 9f mul r26, r19
16c: 70 0d add r23, r0
16e: 81 1d adc r24, r1
170: 11 24 eor r1, r1
172: 91 1d adc r25, r1
174: b2 9f mul r27, r18
176: 70 0d add r23, r0
178: 81 1d adc r24, r1
17a: 11 24 eor r1, r1
17c: 91 1d adc r25, r1
17e: 08 95 ret
00000180 <__muldi3_6>:
180: 0e 94 b1 00 call 0x162 ; 0x162 <__umulhisi3>
184: 46 0f add r20, r22
186: 57 1f adc r21, r23
188: c8 1f adc r28, r24
18a: d9 1f adc r29, r25
18c: 08 f4 brcc .+2 ; 0x190 <__muldi3_6+0x10>
18e: 31 96 adiw r30, 0x01 ; 1
190: 08 95 ret
00000192 <_exit>:
192: f8 94 cli
00000194 <__stop_program>:
194: ff cf rjmp .-2 ; 0x194 <__stop_program>
- Log in or register to post comments
TopWhat is the point of posting all of his oddball code? At least explain what exactly you are doing (methods, how, why) (this is a discussion forum)...otherwise it will likely all be unused (even if it is great) and just swept into the dust pan and put in the dumpster.
What is the advantage(s) of all of this code?
Why have you completely forgotten to at least comment any of your code?
When in the dark remember-the future looks brighter than ever. I look forward to being able to predict the future!
- Log in or register to post comments
TopI have a pretty strong feeling we're being trolled. At the very least OP seems incapable of taking onboard any advice being given. #1 talked about an assembler solution, many excellent ideas were proposed, all ignored. When OP showed code using / and % I pointed out that there was no point in doing TWO divides when it could be done with one. Also ignored. Don't see the point in trying to advise any more.
- Log in or register to post comments
TopThe OP does appear to have a history of asking questions and not replying.
#1 Hardware Problem? https://www.avrfreaks.net/forum/...
#2 Hardware Problem? Read AVR042.
#3 All grounds are not created equal
#4 Have you proved your chip is running at xxMHz?
#5 "If you think you need floating point to solve the problem then you don't understand the problem. If you really do need floating point then you have a problem you do not understand."
- Log in or register to post comments
TopAnyway, I wrote and tested the code below for a fast division by 5. Sorry if it was done already.
Symbolic code:
x in Y[x] is the byte number of the register Y
N[4] the original 32-bit number
R[4] the result of N[4]/5
A[4], B[4], C[4] and K[1] temporary registers
div32_5:
; PUSH registers
A[4]= N[4]
C[4]= R[4]= 0
loop1:
A[4]= A[4]-C[4]
if A[4]<256, break1
B[4]= (A[4]/256)*51
R[4]= R[4]+B[4]
C[4]= B[4]*5
goto loop1
break1:
K[1]=0
loop2:
A[1]=A[1]-5
if A[1]<0, goto break2
K[1]=K[1]+1
goto loop2
break2:
R[4]= R[4]+K[1]
; POP registers
RET
- Log in or register to post comments
TopSorry, after I minimized the registers names for clarity, some of them were already defined by AS.
I will edit it very soon.
Updated:
- Log in or register to post comments
TopDo you usually post code with no explanation? What is K, What is B? Nobody will use it in the future if you don't state how to use it
When in the dark remember-the future looks brighter than ever. I look forward to being able to predict the future!
- Log in or register to post comments
TopYou are right, it seems I erased them by mistake while editing :(
Symbolic Code:
x in Y[x] is the byte number of the register Y
N[4] the original 32-bit number
R[4] the result of N[4]/5
A[4], B[4], C[4] and K[1] temporary registers
(#58)
But the slowest part is last division after "break1:", I used the simplest method (subtraction)
I couldn't find division of 8-bit by 8-bit (or 4-bit since it is 5 here).
- Log in or register to post comments
Top- Log in or register to post comments
Tophttps://gmplib.org/~tege/divcnst...
So, for instance, consider:
With this research, first published quite some time ago, the actual implementation can be... quite a bit different.
The paper gives samples in assembly for common targets, and whaddya know, it's very similar to the code you already listed, only with the constant 0xcccccccd because it's for 32-bit values.
That said: I don't think this is gonna work out. Looking at your most recent code, you have four consecutive reads from PINB, and then in another function, four consecutive writes to PORTD. That is not how anything works. I don't think detailed discussions of the multiplication algorithm are going to do you any good if you don't know how to produce working code, and can't articulate what you want to do with it.
- Log in or register to post comments
TopSuggestion: I'd guess you could compute X/5, where X is an 8-bit value, using a lookup table. Progmem, obviously.
this is almost certainly faster than the loop, and it does impose any code size at all, but it seems pretty cheap.
- Log in or register to post comments
TopWe want 32bits, not 8bits. That makes a lookup table impractical.
An interesting question is whether you can take advantage of the CCCCCCCC... bit pattern to implement the multiplication faster or smaller than you could do a generic 32x32 multiply with an 8bit multiplier (and, I guess: do you need all 32bits of the constant to get an accurate divisor, for these specific cases?
- Log in or register to post comments
Top???
Before you start to code you have spend 256 byte of code space.
- Log in or register to post comments
TopThat was directed specifically at KerimF's thing, which has an inner loop to compute X/5 on an 8-bit value.
So at a smallish overhead in progmem, that inner loop could go away.
- Log in or register to post comments
TopWe can, we only need to do a 32*8 mul, the rest is just sums.
This is a test program:
- Log in or register to post comments
TopSome samples for generator control :
Attachment(s):
- Log in or register to post comments
Top32*4bits? 2bits?
a * 0xCC = a * 0xC + ((a * 0xC)<<4) = ( (a*3) + ((a*3)<<4)<<2 = ( (a+a+a) + ((a+a+a)<<4)<<2
But I guess once you get more than 8bits, a shift-by-4 doesn't optimize easily, and if you have an x8 multiply instruction, you might as well use it...
- Log in or register to post comments
TopThought sure I'd posted this earlier: Edit: I had, in another divide by 5 thread.
44 words and up to 58 cycles, counting the return, but not the call.
Done in assembly, I suspect the 0xCCCCCCCC method would be less than 50 cycles.
Iluvatar is the better part of Valar.
- Log in or register to post comments
TopEdit: 3:00 AM work. NOT good.
C-callable, of course.
44 cycles counting the return, but not the call.
Iluvatar is the better part of Valar.
- Log in or register to post comments
TopI don't think this works. First, if it follows the C calling convention, x should be in R25:R22. Second, it doesn't calculate x mod 5. Running through the code in my head, for x in {0..255}, Xmod5 (R19) = x. For x in {256..511}, Xmod5 = x mod 255.
I have no special talents. I am only passionately curious. - Albert Einstein
- Log in or register to post comments
TopI've repaired the calling convention and added the code for Xmod5 %= 5.
The added code costs 11 cycles. Better register selection save 4 cycles.
52 cycles isn't less than 50, but is slightly better than long division.
I expect that it does not scale: Long division would beat modular arithmetic even for 40-bit unsigneds.
Iluvatar is the better part of Valar.
- Log in or register to post comments
TopHmm. So, idle thought: 2^24 is 16M. The OP's examples appeared to strongly suggest only needing 7-digit values... which means that you possibly don't need to actually compute the top byte of the result.
For extra credit: The real reason to do this is specifically to produce BCD encoding. Can it be made faster given the knowledge that the real goal is the BCD encoding?
Consider, if you will, an algorithm which does divide/remainder by 100, instead of by 5 or 10. So your basic loop is conceptually:
The idea here is that we can perform half as many division-and-remainder ops to get pairs of digits, then extract the digits in a few cycles each. So we add I dunno, maybe 10-15 cycles per pair of digits, but remove half the iterations of the loop. (And you could drop the '0'+ parts from the lines if you really want 0x1 instead of '1' for the digits.)
- Log in or register to post comments
TopYes and no.
Given a 32-bit input value, that's ~= 4 billion locations, each of which needs four bytes of space to store the result (less two bits, but who cares), so about 16G-Bytes of memory. Digikey offers 16GB memory modules for US$200-300-ish:
https://www.digikey.com/product-...
But Digikey doesn't use the case-format for Bytes vs. bits, so you might need eight of them. Still, not entirely unreasonable, if you really really have to do very fast look-up table division (and have awhile to load up the lookup table to start with).
You could also compress the whole thing by only storing every other, or every fourth, result - you'll get some error, but all these ways do, and save some more table space - which means you might be able to use fewer memory modules. Digikey also sells bigger ones.
And Yes, I Know, There Are Better Ways. Get a chip with a hardware divider, for one.
Memory chips are just stupidly large these days...
S.
- Log in or register to post comments
TopAh, modular arithmetic... 4%5 = 4 but we can interpret it as -1, therefore (4^n)%5 = (-1)^n.
This means the decimal representation of powers of 4 ends in either 4 or 6 (alternates). Yeah, OT but I figured this out thanks to this thread (thinking about the methods posted by skeeve).
I never studied number theory formally, I can just comprehend this basic stuff, but still I find it very interesting.
- Log in or register to post comments
TopThis is the slowest fast division, 2 weeks and still calculating.
John Samperi
Ampertronics Pty. Ltd.
www.ampertronics.com.au
* Electronic Design * Custom Products * Contract Assembly
- Log in or register to post comments
TopThat would make it 47 cycles. My modular arithmetic algorithm beats it.
A remainder table is actually slower.
If the table can be placed in RAM, that would save 4 cycles and beat my modular arithmetic algorithm.
Iluvatar is the better part of Valar.
- Log in or register to post comments
TopI did confirm, by the way, that "divide by 100 instead of by 10" wins significantly.
- Log in or register to post comments
TopSome programs for appilcation (rebuilf for Atmega32A )
Attachment(s):
- Log in or register to post comments
Top#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
int32_t div_const( int32_t N , uint32_t D )
{
int32_t Q=0;
int32_t R=(int32_t )N;
while(R>=D){ Q++; R-=D; }
printf ("\nN=%ld R=%ld Q=%ld D=%ld",N,R,Q,D);
return R;
}
int main()
{
int32_t n=1597501;
int32_t r;
r =div_const(n , 100000 );
r =div_const(r , 10000 );
r =div_const(r , 1000 );
r =div_const(r , 100 );
r =div_const(r , 10 );
printf ("\nr= %d ", (int)r);
return 0;
}
- Log in or register to post comments
TopPlease see Tip #1 in my signature (below) for how to properly post source code!
Top Tips:
- Log in or register to post comments
Top- Log in or register to post comments
Toprebuild for AVR
- Log in or register to post comments
TopYou don't believe in indentation in your code, then?
Top Tips:
- Log in or register to post comments
TopIt'sAllAConspiracyByBigTab,andtheirfellowkeyboardFatCatstheSpacebar.Cutthemdowntosize!!11231!!
S.
- Log in or register to post comments
TopBeware the Shift!
David
- Log in or register to post comments
Top- Log in or register to post comments
Top(I use 0-15 for MSB and 0-9 for other in my circuit )
- Log in or register to post comments
Topand still no indentation.
Top Tips:
- Log in or register to post comments
TopI can't stand it any more!
Wasn't this supposed to be 'Fast dividing by 5 and 10?' What is fast about this code?
"Experience is what enables you to recognise a mistake the second time you make it."
"Good judgement comes from experience. Experience comes from bad judgement."
"Wisdom is always wont to arrive late, and to be a little approximate on first possession."
"When you hear hoofbeats, think horses, not unicorns."
"Fast. Cheap. Good. Pick two."
"We see a lot of arses on handlebars around here." - [J Ekdahl]
- Log in or register to post comments
Topwhich still cries out for a macro or inline function
and what does this have to do with "Fast dividing by 5 and by 10 for uint32_t input data for ATMEGA16A" ?
Top Tips:
- Log in or register to post comments
TopPages