Smallest and fastest binary to bcd conversion

Go To Last Post
57 posts / 0 new
Author
Message
#1
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

As a hobby project during Christmas I have been trying to make a better bin2bcd conversion than what you get if you use the two most typical methods, using itoa or mod (%). I have found a third method that is normally used in hardware. (Xilinx application
note XAPP029. It is not smaller (and will probably not be) but it is faster. Smallest is using mod. But I have not yet tried to make the code better than it is below, this is my first running version. Maybe someone could help me do it smaller (without causing it to be more than marginally slower) before I post the totall results.

unsigned short Short2BCD3(unsigned short inValue)
// Overflow conversions (>=5). 1 is carry to next accumulator
// 5 => 10
// 6 => 12
// 7 => 14
// 8 => 16
// 9 => 18
//Note: Only works with nibbles.
{
  char BCDValue[5];	// 16 bits unsigned short can have a maximum of 5 digits
										// but currently only 4 are supported
  unsigned char loop;
	unsigned char acc0=0;
	unsigned char acc1=0;
	unsigned char acc2=0;
	unsigned char acc3=0;
	unsigned char carry0=0;
	unsigned char carry1=0;
	unsigned char carry2=0;
	unsigned char carry3=0;
	unsigned char carry4=0;
	unsigned char cIn0;

	for (loop=0;loop<=15;loop++)
	{
		// Put next bit of inValue in carry and shift up inValue one step.
		if (( inValue & 0x8000) != 0)	// Check if MSB = 1
			cIn0 = 1;	// Yes, add one
		else
			cIn0=0;
		inValue <<= 1;


		// If carry generated to next digit, convert and add carry.
		// If no carry generated to next digit, shift left 1 step and add carry.
		if (carry1 == 1)
			acc0 = ((acc0-5)*2) + cIn0;
		else
			acc0 = (acc0 << 1) + cIn0;

		if (carry2 == 1)
			acc1 = ((acc1-5)*2) + carry1;
		else
			acc1 = (acc1 << 1) + carry1;

		if (carry3 == 1)
			acc2 = ((acc2-5)*2) + carry2;
		else
			acc2 = (acc2 << 1) + carry2;

		if (carry4 == 1)
			acc3 = ((acc3-5)*2) + carry3;
		else
			acc3 = (acc3 << 1) + carry3;

		// Check each accumulator if >= 5. Generate carry if so.
		if (acc0 >= 5)
			carry1 = 1;
		else
			carry1 = 0;

		if (acc1 >= 5)
			carry2 = 1;
		else
			carry2 = 0;

		if (acc2 >= 5)
			carry3 = 1;
		else
			carry3 = 0;

		if (acc3 >= 5)
			carry4 = 1;
		else
			carry4 = 0;

	}	//END for

	return acc3<<12|acc2<<8|acc1<<4|acc0;
}

My favorites:
1. My oscilloscope, Yokogawa DLM2024.
2. My soldering iron, Weller WD2M, WMRP+WMRT.
3. JTAGICE3 debugger.

Last Edited: Thu. Jan 8, 2009 - 03:23 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

So the input is an unsigned short in xilinx? Is that 8 or 16 bits to us sw guys?
And the output is 5 values 0-15 each in a byte? Or you want the 5 values packed into 3 bytes? (I normally think of packed nibbles in a byte when someone mentions binary coded decimal, otherwise the nibble is just regular old binary). Repeated subtraction of powers of 10 is faster if you dont have any hw mult or div.... which AVR is this for? A real small one? Got a target to beat in bytes or cycles?

Imagecraft compiler user

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

One method that is pretty fast is subtraction loops using decreasing powers of 10.

For example, lets say that your integer limit is 65535. Start off by subtracting 10000 until the remainder is less than 10000 (no more than 6 loops). Once you have this value, you can add 0x30 to the result to get the ASCII value, then store the ASCII value. Continue conversion by repeating the process with each remainder by subtracting 1000, 100, 10. Of course, when you get to less than ten, add 0x30 to the last value and you're done.

It's quick because there is no division.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Something is not right.

You define

bengtr wrote:

  char BCDValue[5];	// 16 bits unsigned short can have 


But you don't use it. Instead you return
bengtr wrote:

	return acc3<<12|acc2<<8|acc1<<4|acc0;
}

As for the C code, I would have used a bunch of other idioms. But you would have to check for each idiom if the compiler manages to convert that into shorter or faster code.

I'll post my version in a few minutes, without checking if the algorithm indeed works, and without any guarantee that my version improves anything.

Stealing Proteus doesn't make you an engineer.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Small and fast:

;*************************************************************************
;*                                                                       *
;*                      Convert unsigned 16 bit to 5 digit ASCII         *
;*                                                                       *
;*              Author: Peter Dannegger                                  *
;*                                                                       *
;*************************************************************************
;
;input: R17, R16 = 16 bit value 0 ... 65535
;output: R20, R19, R18, R17, R16 = 5 digits (ASCII)
;cycle: 20 ... 170
;
bin16_ascii:

        ldi     r20, -1 + '0'
_bcd1:  inc     r20
        subi    r16, low(10000)         ;-10000
        sbci    r17, high(10000)
        brcc    _bcd1

        ldi     r19, 10 + '0'
_bcd2:  dec     r19
        subi    r16, low(-1000)         ;+1000
        sbci    r17, high(-1000)
        brcs    _bcd2

        ldi     r18, -1 + '0'
_bcd3:  inc     r18
        subi    r16, low(100)           ;-100
        sbci    r17, high(100)
        brcc    _bcd3

        ldi     r17, 10 + '0'
_bcd4:  dec     r17
        subi    r16, -10                ;+10
        brcs    _bcd4

        subi    r16, -'0'
        ret
;-------------------------------------------------------------------------

And in C:

char digit[5];


void bin2bcd( unsigned int val )
{
  char i;

  i = '0' - 1;
  do
    i++;
  while( !((val -= 10000) & 0x8000) );
  digit[4] = i;

  i = '0' + 10;
  do
    i--;
  while( (val += 1000) & 0x8000 );
  digit[3] = i;

  i = '0' - 1;
  do
    i++;
  while( !((val -= 100) & 0x8000) );
  digit[2] = i;

  i = '0' + 10;
  do
    i--;
  while( (val += 10) & 0x8000 );
  digit[1] = i;

  digit[0] = val | '0';
}

Peter

Last Edited: Thu. Jan 1, 2009 - 09:52 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Without much thinking about it (might contain typing errors and, there is for sure more room for improvement):

unsigned short Short2BCD3(unsigned short inValue) {
        int loop;
	unsigned char acc0 = 0;
	unsigned char acc1 = 0;
	unsigned char acc2 = 0;
	unsigned char acc3 = 0;
	unsigned char cIn0;

	for (loop = 0;loop < 16; loop++)
	{
                cIn0 = !!(inValue & 0x8000);
                inValue <<= 1;

                if(acc0 >= 5) { acc0 -= 5; }
                acc0 = acc0 * 2 + cIn0;

                if(acc1 >= 5) { acc1 -= 5; }
                acc1 = acc1 * 2 + cIn0;

                if(acc2 >= 5) { acc2 -= 5; }
                acc2 = acc2 * 2 + cIn0;

                if(acc3 >= 5) { acc3 -= 5; }
                acc3 = acc3 * 2 + cIn0;
        }

	return acc3 << 12 | acc2 << 8 | acc1 << 4 | acc0;
}

Stealing Proteus doesn't make you an engineer.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Still need to know if smallest trumps fastest. If size doesnt count, you can unroll everything to make it bigger but faster. Lets say stdwords=the itoa way and stdcycles=the itoa way, then points=words/stdwords + cycles/stdcycles, bigger is better? Since words/stdwords is dimensionless, the actual number of words doesnt penalize you. Can you verify you want output to be binary, not ascii?

Imagecraft compiler user

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I've always used this but its only for 8-bit types. It should be easy enough to extend but I rekcon it may be faster to handle each byte of larger types individually with this method.
Steve

unsigned char char2bcd(unsigned char hex)
{
   unsigned char MSD = 0;

   while (hex >= 10)
   {
      hex -= 10;	// hex becomes 1s (LSD)
      MSD += 0x10;	// add 1 to 10s (MSD)
   }
   return MSD + hex;	// pack BCD into char
}

unsigned char bcd2char(unsigned char bcd)
{
   unsigned char hex = 10;	// set up to be multiplied by MSD

   hex *= (bcd & 0xF0) >> 4;	// set 10s
   hex += (bcd & 0x0F);		// add 1s
   return hex;
}
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Thanks for pointing out so many things. Bob, I do not remember the format in the App note but it's just about principles. Short will always be 16 bits for me anyway. And yes, I output in binary coded decimal, not ASCII (coded decimal?). But the question is good because I didn't thought about it. Maybe it is more typical that what you want is output in ASCII. Displays usually is used this way, right?

The unused array is from another version, just forgot to delete it.

Goal was to find one version that uses least flash and (if not the same) one version that uses least clock cycles. (But I do not like the idea to roll out everything, that is to waste to much flash and readability without winning very much)

Target I am testing on is a mega88. Do that means any limitations?

Peters version was really fast. This may be the best version as it is also small. It was also very easy to change to non ASCII output if that is requested.
/edit: Speed is depending on number to convert but I think it will always be fast. Slowest number to convert some one?

Arnolds has some error in it, it doesn't work.

My favorites:
1. My oscilloscope, Yokogawa DLM2024.
2. My soldering iron, Weller WD2M, WMRP+WMRT.
3. JTAGICE3 debugger.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I have reach some sort of conclusion in this matter.

I have tested four different methods. Se the table below for flash usage and speed in clock cycles on a mega88.
1: (itoa) 408 2136
2: (modulus) 164 2178
3: (Shifting) ~400 ~1700
4: (Subtract) 288 249 (max, nr of cycles depend on number converted)

So the subtraction method is without comparission the fastest but the modulus method is smaller.

My favorites:
1. My oscilloscope, Yokogawa DLM2024.
2. My soldering iron, Weller WD2M, WMRP+WMRT.
3. JTAGICE3 debugger.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

(max, nr of cycles depend on number converted)

I looked at my compiler's itoa() and it uses the subtract. So I think you need to do your tests with at least a few values and show those results for us to compare. Which compiler are you using?

Note that danni's routines are for unsigned 16-bit; itoa() is for signed so there is some overhead in handling negative numbers. To get best range I often use a utoa() as danni does.

What sample value did you use for itoa()? I'd be interested in trying with my compiler, just for comparison. The implementation doesn't seem to be "optimal" but it isn't "ugly" either.

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

but the modulus method is smaller.

I don't see any examples above of the "modulus" approach--can you show one?

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

So input is 16 bit unsigned, output is 5 binary bytes in an array? I was working on this at home I'll revisit it now I have a spec...

Imagecraft compiler user

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

5 binary bytes in an array?

5 ASCII bytes surely? (well that's the expected outcome of itoa() anyway)

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

That is also a "variable". The thread title talked of BCD. If we are looking for superlatives (smallEST, fastEST) then the rules need to be clear on the output -- ASCII, packed BCD, unpacked BCD -- and the input -- single byte, 16 bits, 32 bits, extensible, and signed or not.

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Right I'm now totally confused. To try and understand what was actually required here I simply went back to the first post and built the routine into a test program:

int main (void) 
{ 
	unsigned short in = 1234, out;

	out = Short2BCD3(in);

which ends with 'out' holding 4,660 which is 0x1234 so this is a binary to packed BCD routine. What I cannot get my brain around is exactly what invocation of itoa() could give you this result? Sure I can use:

char buff[8];
itoa(in, buff, 10);

and end up with:

buff[0] = '1'
buff[1] = '2'
buff[3] = '3'
buff[4] = '4'
buff[5] = 0

but how does that help me to create 'out' containing 0x1234? Or is the suggestion that the itoa() version is actually followed by:

char buff[8];
itoa(in, buff, 10);
out = 0;
for(int i=0; i

Very confused.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

If you want to put em on an lcd you add 0x20 not 0x30 right? If you want to put em on a 7seg display, you need raw binary to look up segments, right? You need packed bcd to put em on a pair of TIL311 displays packed in a byte on a port.... so 4 choices....

Imagecraft compiler user

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

Very confused.

Exactly--what are the rules for this contest?

Referring to some earlier threads on a past life on Burroughs medium systems COBOL machines, we "thought" in packed BCD--PIC 9999 COMP. ;)

In practice, I use BCD on 7-segment apps. I can only recall using packed BCD on one AVR app, a tight '4433 app with 7-seg and lots of display "strings". Since the appearance of the Mega8 (and now 48/88) there are enough resources to do things more straightforward (and faster) with unpacked.

In a "real" 7-seg app, I usually have a few similar routines hanging about to make display easier for me. Placing implied decimal points, converting from internal units to display units, signed or not, right-justified (usually), leading-zero suppression or not. IME one-size-does-not-fit-all: the -est routine if generic may require "post processing" for the particular result.

I always enjoy the "contests" as a learning experience, to be amazed at the solutions that fellow 'Freaks come up wit, and to compare my toolchain's performance. But I can't play the game if I don't know the rules.

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

OK, I am feeling a little "pushed" right now. I will try to explain some things.

The "rules" a very loose. This was just for fun. I wanted to find the fastest and smallest method respectively and was playing around with what I found here at AVRfreaks and elsewere. I had no ambition to make this a scientic level test, just rough numbers so I did not rewrote anything to make them use the same in and output that of course will affect the result a little. As Bob mentioned I even missed that I sometimes not even had the same output.

So first I want to aplogize to Bob, Lee and others that like this sort of games as I made it impossible as not setting up clear rules. I think I can do that now.

- Create one or two (see below) functions that do Binary to BCD conversions.
- Input in binary, unsigned short.
- Output is 5 digits BCD in ASCII but also show what needs to change if output should be hex BCD instead. The digits should be in a string with one digit per cell. Unsigned char. Not any packed BCD.
- The functions should be entire written in C.
- Should be as small as possible OR as fast as possible. Thats the reason that it may be two functions.
- If as fast as possible, do not roll out any loops.
- I use Imagecraft 7.15 but use what you have.
- Target mega88 (131 instruction set)

Some examples of what I have so far that is not already above:

1. Code example if using itoa:
This also returns number of digits (in case of leading zeros)

unsigned char Short2BCD(unsigned short inValue, char *outValue)
{ 
  char BCDValue[5];	// 16 bits unsigned short can have a maximum of 5 digits 
  unsigned char loop; 

  itoa(BCDValue, inValue, 10);
  loop = 0;
  while(BCDValue[loop])	// Just loop through BCDValue and check for end of file
  {
    outValue[loop] = BCDValue[loop] - '0';
    loop++;
  }
  return loop;
}

2. Code example using modulus

void Short2BCD(unsigned short in, unsigned char *digit)
// returns BCD-converted value. Max 5 digits
{
	unsigned short val;
	unsigned char i;

	for (i=0;i<=4;i++)
	{
		val = in % 10;
		digit[i]=val; //add value of '0' for ASCII
		in /= 10;
	}
}

/Bengt /edit: spelling

My favorites:
1. My oscilloscope, Yokogawa DLM2024.
2. My soldering iron, Weller WD2M, WMRP+WMRT.
3. JTAGICE3 debugger.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Problem with the rules:
The size and speed will depend of compiler!
I will not give you code bacause I write in ASM but here is what I think about the problem:
The smallest and slowest is to make a counter with a base of 10 and just inc the number every time you dec the 16(2) base number.
For bytes this is very small and 255 loops will not kill the speed.
For 16 bit I use a version close to Peters but only with positiver numbers 10000,1000,100,10 that way the numbers can be placed in a table, and the code can be a loop of 4.
Because you use a micro with HW mul there should be a way to make a fast div with 10 by mul with 1/10 (mul with 26 and only use high byte is about the same as DIV with 10).

Have fun

Jens

Edit I had a extra look at Peters ASM code and I like it bacause it don't need to change any of XYZ.

Last Edited: Tue. Jan 6, 2009 - 07:54 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

A few comments:

Nearly always I want my display numbers to be right-justified. The biggest exception might be free--format "dump" strings, but then size/speed don't matter that much. So my home-built routines are almost always adaptations of "itoa()"-type.

The same goes for leading-zero suppression. Usually I want it for display, but not for manipulating setpoints and a few other situations. So I've got an option for that.

If you say that "mod is the shortest", that >>might<< be true. You cannot just look at the size of your modulus routine. Rather, you have to start with a "null" program and then add it in. While your routine may be the shortest in your quest, you need to count 16-bit divide and mod code.

[Note that if you already use printf() for something else, the size penalty is 0. [[addressing your yah-but responses to the size of divide and mod. ;) ]]

And a bit more on the mod routine later...

Summary: We are looking for the b-est utoa() -- itoa() for unsigned only. Note that with some methods we need to have specific input values. I'll start with 9999 as a near-worst-case.

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:
The size and speed will depend of compiler!

Of course. But if everyone tells what they use we can still compare.
Quote:
Rather, you have to start with a "null" program and then add it in.

That was exactly the method I used.
Quote:
[Note that if you already use printf() for something else, the size penalty is 0

Sure? I noticed that if I include a file but do not use anything in it (itoa) size did not increase. I guess that I am wrong but I hoped that it just included what it needed. Good note!

I have noticed that some routines can not handle the full unsigned range, only half as if it was signed.

I agree that a really useful routine should have at least an option to report nr of digits (up to the first zero). I would prefere it that way but some of the routines may be a little complicated to adapt to it.

My favorites:
1. My oscilloscope, Yokogawa DLM2024.
2. My soldering iron, Weller WD2M, WMRP+WMRT.
3. JTAGICE3 debugger.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Baseline numbers for a recent CodeVision version:

itoa (9999, buffer);
sprintf (buffer, "%u", 9999);

itoa (0, buffer);
sprintf (buffer, "%u", 0);

itoa -- 104 bytes; 113 cycles for "0"; 341 cycles for "9999"
sprintf -- 1036 bytes; 890 cycles for "0"; 2195 cycles for "9999".

Notes on the above: The routines themselves are actually a bit smaller and faster than given, as the results were obtained by >>adding a call to the routine to have it linked in<<. So it includes the itoa() etc. invocation line itelf, and call-ret overhead.

Quote:

- I use Imagecraft 7.15 but use what you have.

In some past "contests" some of the Imagecraft library/intrinsics didn't compare too well, so I was leery of your 2136 cycles for itoa(). My CV numbers are more in line with your subtraction numbers as I would expect.

As I mentioned earlier, itoa() has to handle the signed checks/adjustments, so it won't be as fast as danni's routine. His is also [more nearly?] optimal for subtracting/adding.

Let's see how my unoptimized not the best-est workhorse utoaz() routine--right-justified, optional leading-zero suppression via a parameter-- stacks up:

//
// **************************************************************************
// *
// *        U T O A Z
// *
// **************************************************************************
//
//
//    Transform a 16-bit unsigned value into unpacked ASCII.
//
//    Only the significant characters are output.  The number is right-justified.
//
//    There are 5 characters output with this version.
//
//    The resulting string is null-terminated.

eeprom unsigned int    powers[] = {10000, 1000, 100, 10};

void                        utoaz                (    unsigned char *outstring,
                                                    unsigned int number,
                                                    unsigned char zero)
{
unsigned int        work;
unsigned char        digit;    // this BCD digit to be output
unsigned char        index;    // which character are we working on.
unsigned int        power;

    index = 0;                // start at the left-most character

    work = (unsigned int) number;

    for (index = 0; index < 4; index++)
        {

           digit = 0;
           power = powers
; while (work >= power) { work -= power; digit++; } zero += digit; if (zero > 0) { *outstring++ = digit | 0x30; } else { *outstring++ = ' '; } } *outstring++ = ((unsigned char)work) | 0x30; // last character, even if 0. *outstring++ = 0; // null termination } ... utoaz (buffer, 9999, 0);

utoaz() -- 196 bytes; 502 cycles for "0"; 724 cycles for "9999".

Not too bad, 'cause it does more "stuff". Now, EEPROM for the table is going to be a bit ugly. Also, using the outstring and zero parameters right on the stack will be uglier in CV than perhaps other compilers. Let's poke at it a bit: simply change the table to flash, and
utoaz() -- 192 bytes; 337 cycles for "0"; 560 cycles for "9999".

Not bad again. Pulling the two mentioned variables off the stack and into register variables saves about 10%:
utoaz() -- 182 bytes; 285 cycles for "0"; 511 cycles for "9999".

My summary: With itoa() library performance as you found, I can see why this would be an area of interest to you. My compiler's itoa() is small enough/fast enough compared to your results of different methods so I might as well use it if the output is applicable.

My workhorse routine does more, and IMO for real work right-justify and leading-zero suppression are necessary features. The size is fine, and with a few obvious tweaks the performance is acceptable (remember it does more).

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

I agree that a really useful routine should have at least an option to report nr of digits (up to the first zero). I would prefere it that way but some of the routines may be a little complicated to adapt to it.

My zero-suppression as shown above takes only a few cycles per digit, and IMO is quite clever. I cannot remember, though, if I was the clever one or saw it someplace.

Quote:

Quote:
Rather, you have to start with a "null" program and then add it in.

That was exactly the method I used.


I'm getting to that one, but I'm finding it hard to believe that a null program with no divide or mod capability, then adding your routine, would come up with that small an increase. It implies that 16-bit divide takes only a few words? Perhaps. Continuing ... ;)

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

void Short2BCD(unsigned short in, unsigned char *digit)
// returns BCD-converted value. Max 5 digits
{
unsigned short val;
unsigned char i;

for (i=0;i<=4;i++)
{
val = in % 10;
digit[i]=val; //add value of '0' for ASCII
in /= 10;
}
}


You do realize that your routine has backwards output, don't you? "9876" input gives "6789" output.

Quote:

I'm finding it hard to believe

I am now a beleiver! Your routine including the call sequence added 140 bytes to my null program. Small: yes indeed, and elegant in that smallness. Assuming 16-bit divide is somewhere else in the app, I would think so in most real apps, the size is even smaller.

Speed? I came up with 2549 cycles. Even uglier than your results. ;) Let's see what we can do about that. ...

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

Let's see what we can do about that. ...

... and the simple answer was quite obvious--there is a "better" way to do /n and %n. The internal divide routine has the remainder--the modulus--right there for the taking.

The standard C library has a series of div() routines that take advantage of that fact, returning a structure that has quot and rem all waiting. Now, CodeVision has not seen fit to provide that routine, and it would be for signed arithmetic anyway limiting our range. However, inspection of the primitives in the code shows that mod does nothing but call divide, and return the rem instead of the quot.

And in this case we >>know<< that we are going to divide by 10, so the remainder is an 8-bit value living in a single register. So I cut the 2549 cycles down to 1315: cut in half (virtually) as I expected. [I didn't change the backwards part to make the comparison fair.]

That makes the modulus approach quite attractive as a size/speed tradeoff. Also, the LSD-first makes right-justification easy, but zero-suppression may be problematic. I guess I'd stick to my utoaz() unless really tight on space, and the cycles are still a fraction of the modulus.

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

4: (Subtract) 288 249 (max, nr of cycles depend on number converted)

So the subtraction method is without comparission the fastest but the modulus method is smaller.


I thought that a bit odd as my utoaz() with extra stuff isn't that big. So I built danni's optimized bin2bcd() above using the global buffer. (I think this, with hard-coded subscripts as shown, will always give the fastest and smallest versus a pointer solution. While may not as useful when building (say) part of a character LCD line a 7-seg is very likely to have a small "hard" buffer.)

As expected with danni code, it is both small and fast:
bin2bcd(): 128 bytes; 237 cycles for 0 [strange!]; 165 cycles for "9999".

So at least with my toolchain danni's subtraction is indeed both smallest and fastest. I'll have to see if I can adapt my zero-suppression technique...

danni--May I steal your routine for my work? [I'll keep my debounce, though, as it was proven superior. [[LOL--a couple years back we went back-and-forth on our horizontal vs. vertical parallel debounce performance and IIRC it came out very nearly even.]] ]

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

theusch wrote:
danni--May I steal your routine for my work?

O.k.

But the C code can only handle numbers up to 42767.

Peter

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

But the C code can only handle numbers up to 42767.

Ah, yes, I see that now. In practice that may not be a problem. And thanks.

I'll have to think about that first loop a bit. Maybe unroll it, or simply add a check on the high nibble?

In looking for that limitation, now I see why I got the higher cycle counts for 0, because of the "overshoot" with the pin-pong add/subtract. Overall, though, perhaps that tends to even out the cycle counts over the entire range of numbers? I guess that one could add a loop counter and return it, and then cycle through a series of values...

================================
Side note, referring to the many first project "read a voltage with ADC and display the voltage on the LCD" threads:

We often see the float conversions and the simple program with floating-point sprintf() sucks up half of a Mega8. If you examine danni's routine or my utoaz() you can see how easy it is to do everything in "millivolts" and simply adapt one of these routines to spit out a decimal point after the whole volts. And only output as many decimal digits as are desired/meaningful. And round the number at the head of the routine. And you can see that the routines are small--a few hundred bytes-- and fast--a few hundred cycles. 800 cycles at 8MHz is 100us.

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I took a look at how codevision (an old free version) do div. And it make 16 loops that take about 12 clk = 192 clk.
If you write a div by 10 routine in C it will take about 13-14 clk pr loop but because you know the number you only have to the loop 12 times = 162 clk , and you don't have to make tricks to get the reminder.
This is my test code (sorry it's delphi)

procedure TForm1.Button1Click(Sender: TObject);
var
 i : word;// the org number after div it holds the reminder
 j : word;// 10 shifted
 k,kk : word;// loop counters
 r : word; //result
begin

 for k:= 0 to 65535 do begin // try all numbers
   i:=k;

   j:=$5000; //10 shifted up
   r:=0;
   for kk:=1 to 12 do begin
     if j<=i then begin
       inc(r);//set low bit
       i:=i-j;
     end;
     j:= j shr 1 ;
     r:= r shl 1 ;
   end;
   r:= r shr 1 ;// one to many shifts
   memo1.Lines.Append(inttostr(k)+' '+inttostr(r)+' '+inttostr(i));
 end;

end;

Jens

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Do any one know if GNU c for the AVR use the HW mul when it div with a constant?
The ARM7 version does, it make a unsigned div in 6 instructions (no loop) for 32 bit.

Jens

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

But the C code can only handle numbers up to 42767.

A fast fix could be to temporary div the number by 2
and find the digit by add/sub 5000.(5000 is even so it should work).

Edit:
And because 1000 100 and 10 all are even you can do all the calcluations with 5000,500,50,5. It's only the last digit that need to know if LSB was high (then add 1).

Jens

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0
char bcd_CHAR2BCD2(char input)
{
    char high = 0;
    
    
    while (input >= 10)                 // Count tens
    {
        high++;
        input -= 10;
    }

    return  (high << 4) | input;        // Add ones and return answer
}
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

theusch wrote:
Quote:

But the C code can only handle numbers up to 42767.

Ah, yes, I see that now. In practice that may not be a problem.

I use mostly signed numbers and then after extracting the "-" sign, the value was always below 32769.

But if you want the full unsigned range, you can check the MSB (means above 32767) and then substract 30000 and add 3 to the digit value.

Peter

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

theusch wrote:
Baseline numbers for a recent CodeVision version:

itoa (9999, buffer);
sprintf (buffer, "%u", 9999);

itoa (0, buffer);
sprintf (buffer, "%u", 0);


You didn't say which AVR you compiled for - obviously if MUL can be used then it may affect the speed/size but I tried the following in GCC.

First I built a "null" program for a mega16 with nothing but while(1) in main() and this gave 152 bytes of code. I then added:

	itoa(9999, buff, 10);

which increased to 328 bytes so I conclude the call and lib function added 176 bytes (note that this itoa takes a third "radix" parm and can do other bases - so has some unnecessary overhead). The CALL alone with '9999' took 943 cycles. With a value of '0' it took 256 cycles.

Using the minimal printf lib with:

	sprintf(buff, "%u", 0);

The program was 1314 bytes and the CALL here took 776 cycles. With a value of 9999 it took 1463 cycles.

So it looks like GCC beats CV on the sprintf() but loses on the itoa()

Cliff

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

You didn't say which AVR you compiled for

I used Mega88 as specified in the contest "rules". ;)

I guess my summary would be that if your compiler does a decent itoa() job and itoa() fits the needs, then just go ahead and use it.

I find I rarely use itoa(), for reasons I've already outlined. For "fancy" formatting work where I feel that sprintf() can be justified for the app, then I'll use that. (If I can only "justify" one or two uses of sprintf() then I'll usually work around it and "hand build" those strings.)

But many of my apps use a variation on the utoaz() that I posted above. Right-justify; leading zero suppression; choice of precision; decimal point (or other formatting) insertion. Some apps have several variations; at ~200 bytes a pop it is not too painful.

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

http://www.piclist.com/techref/microchip/math/radix/index.htm
is a fantastic site for ideas/routines/tips/tricks... Yes I know it's PIC stuff, but the same ideas work for almost all uProc.

<º))))><

I am only one lab accident away from becoming a super villain.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

An update with the result using Imagecraft 7.15. I had some other functions in my previous tests and as some pointed out, they may use code that the BCD function use and therefore the test will be wrong. I haven't rechecked the speed as it should be the same. Flash usage is calculated with an empty program compared with the same program plus a call to the function.

Method flash
itoa 488
sub 116
mod 156

So the subtract method is even for me both fastest and smallest but most likely the modulus method will win in size in most real programs as some maths may already be used. On the other hand the subtract method is so small so there is not much to win. It still looks like Imagecraft itoa is not very good.

Also I must had some other error in my previous calculations as all functions now is even smaller.

My favorites:
1. My oscilloscope, Yokogawa DLM2024.
2. My soldering iron, Weller WD2M, WMRP+WMRT.
3. JTAGICE3 debugger.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

Using the minimal printf lib with:
Code:
sprintf(buff, "%u", 0);

The program was 1314 bytes and the CALL here took 776 cycles. With a value of 9999 it took 1463 cycles.

So it looks like GCC beats CV on the sprintf() but loses on the itoa()

Recalling my "baseline" numbers

Quote:

itoa -- 104 bytes; 113 cycles for "0"; 341 cycles for "9999"
sprintf -- 1036 bytes; 890 cycles for "0"; 2195 cycles for "9999".

I was a bit surprised at the cycle difference, but shrugged it off. In working on another project today I found that the default printf type of "int,width" is not actually the skinniest and that "int [only]" is also offered.

Re-running the experiment with that selection, the size of printf is 800 bytes plus 32 for the call sequence. "9999" is 1674 cycles and "0" is 783 cycles. So CV size is fine, and "9999" processing is ~10% slower.

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Here's my bgitoa

char tmpstr[7];
//              0   1    2   3
int pow10[4]={10000,1000,100,10};
//--------------------------------
void bgitoa(char *tmpstr,int num){
//cvt num to 5 ascii chars in tmpstr using repeated subtraction by powers of 10
char i,dig;
int pow10i;

  for(i=0; i<4; i++){
	  dig=0;
    pow10i=pow10[i]; 
	  while((num - pow10i) > 0){
      num -= pow10i;
      dig++;
    }
  	*tmpstr++ =dig+0x30;
  }
  *tmpstr++ =num+0x30; 
	*tmpstr=0;
}

1000 loops of itoa(tmpstr,29999,10) takes .192 sec
1000 loops of bgitoa(tmpstr,29999) takes .042 sec.... 4.5x faster 42usec ea.... 630 inst??

itoa is 135 words, bgitoa is 44 words.... 1/3rd the size

xtal is 14.7456mhz

Imagecraft compiler user

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

bgitoa is 44 words

I'd like to see that listing.

Given that you used an SRAM table for speed (gains you a few cycles each access) you also have to factor in the words used to store the constants in flash (and/or the init code is done piecemeal).

Why don't you use 9999 and 0 like us other mortals, and run it through AVRStudio simulator, so we don't have to guess at the cycle count?

The "contest" is for unsigned int. So technically your version doesn't qualify as it compares against 0.

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Fot this kind of numbers you all should use the numbers for best case and worst case and AVG. (and care should be taken about worst case it will not allways be 59999).

Jens

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Darn. I meant to put a __flash in front of the pow10 table...
OK.. back at work Thurs... 1000 itoa of 9999 takes .156 sec, 1000 bgitoa takes .041 sec. Thats 41 usec each x 14.7456mhz = 605 inst?? Size is still 44 words...

_bgitoa:
  i                    --> R20
  dig                  --> R22
  pow10i               --> R10
  num                  --> R18
  tmpstr               --> R16
     1CD 940E 0965 CALL	push_xgsetF00C
(0126) }
(0127) 
(0128) char tmpstr[7];
(0129) //              0   1    2   3
(0130) __flash int pow10[4]={10000,1000,100,10};
(0131) //--------------------------------
(0132) void bgitoa(char *tmpstr,int num){
(0133) //cvt num to 5 ascii chars in tmpstr using repeated subtraction by powers of 10
(0134) char i,dig;
(0135) int pow10i;
(0136) 
(0137)   for(i=0; i<4; i++){
     1CF 2744      CLR	R20
     1D0 C01D      RJMP	0x01EE
(0138) 	  dig=0;
     1D1 2766      CLR	R22
(0139)     pow10i=pow10[i]; 
     1D2 E586      LDI	R24,0x56
     1D3 E090      LDI	R25,0
     1D4 2FE4      MOV	R30,R20
     1D5 27FF      CLR	R31
     1D6 0FEE      LSL	R30
     1D7 1FFF      ROL	R31
     1D8 0FE8      ADD	R30,R24
     1D9 1FF9      ADC	R31,R25
     1DA 90A5      LPM	R10,Z+
     1DB 90B4      LPM	R11,0(Z)
     1DC C003      RJMP	0x01E0
(0140) 	  while((num - pow10i) > 0){
(0141)       num -= pow10i;
     1DD 192A      SUB	R18,R10
     1DE 093B      SBC	R19,R11
(0142)       dig++;
     1DF 9563      INC	R22
     1E0 0119      MOVW	R2,R18
     1E1 182A      SUB	R2,R10
     1E2 083B      SBC	R3,R11
     1E3 2444      CLR	R4
     1E4 2455      CLR	R5
     1E5 1442      CP	R4,R2
     1E6 0453      CPC	R5,R3
     1E7 F3AC      BLT	0x01DD
(0143)     }
(0144)   	*tmpstr++ =dig+0x30;
     1E8 2F86      MOV	R24,R22
     1E9 5D80      SUBI	R24,0xD0
     1EA 01F8      MOVW	R30,R16
     1EB 9381      ST	R24,Z+
     1EC 018F      MOVW	R16,R30
     1ED 9543      INC	R20
     1EE 3044      CPI	R20,4
     1EF F308      BCS	0x01D1
(0145)   }
(0146)   *tmpstr++ =num+0x30; 
     1F0 01C9      MOVW	R24,R18
     1F1 96C0      ADIW	R24,0x30
     1F2 01F8      MOVW	R30,R16
     1F3 9381      ST	R24,Z+
     1F4 018F      MOVW	R16,R30
(0147) 	*tmpstr=0;
     1F5 2422      CLR	R2
     1F6 8220      STD	Z+0,R2
     1F7 940C 096C JMP	pop_xgsetF00C

Imagecraft compiler user

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Lee, is there any reason to prefere right-adjustment instead of having LSB in digit[0]? I returning number of digits and therefore I thought it more natural to start from [0].

My favorites:
1. My oscilloscope, Yokogawa DLM2024.
2. My soldering iron, Weller WD2M, WMRP+WMRT.
3. JTAGICE3 debugger.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Then your job isn't done--you've got to >>do<< something with the result to make it anything more than an academic exercise.

If I'm doing this in a 7-segment app showing a value on the display I want it right-justified and nearly always suppression of leading zeros. And perhaps decimal point insertion, and limiting the displayed precision. I think that these additions to my "smallest fastest" routine have all the information "right there", and will be more efficient than the absolute smallest conversion to BCD and a post-processing pass. So the output puts a character right into the display buffer. (For my 7-segment apps it is usually a 2-step process, placing the character index into the shadow buffer and putting the right segment mask into the actual display buffer.)

Similarly, if I'm building a refresh for a character-LCD display that might have labels and values, I still want it right-justified so the number doesn't "jump around" when the magnitude changes. I point to the right spot in the display buffer.

Depending on the app, I either put out a null at the end or not. If the null is at the end then I can use C string-handling routines. With your output you have to do like

for (i=numdigits; i; i--) putchar(buf[i]);

which again will negate advantages of smallest/fastest.

So, I make the routine to do as much of the job as practical.

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Hi.

I couldn't resist to take another turn in this. I have added right-adjustment and supression of leading zeros plus NULL termination to both of my routines. I even added a return parameter with number of digits so I can use either method. I think your suggested reasons are well motivated for this Lee. Thanks. The code did of course went up a bit and it costs some clock cycles but I think it is worth it. I haven't tried to squeeze it down to smallest and fastest possible by for example using predefined values but I prefere it this way for readability. Adding theese extra features take the code out of this "contest" anyway.

Code Size Cycles
Subtract 204 269 (just tested with one number, much faster than modulus anyway)
Modulus 278 1492 /edit. Corrected 410 to 278

A problem with this type of contest is that it is very theoretical. In a real project, it will also depend on if you use maths elsewere in the code. (Or other shared code). To test this I defines each routine two times, one as an dummy that was not used. Now I could compare how much the extra dummy routine took and calculate the cost of the routines part from the shared maths.

Code Size
Subtract' 168
Modulus' 160

As you can see the Subtract method is after adding the features the winner. Modulus is just 8 bytes smaller if common code are exluded and the subtract are so much faster. Oh, slowest number with my code is 30000. (First digit as large as possible and the rest zeros). Don't know how this change if the code ara adapted to support larger range.

Thanks all for all the terific education!

Bengt

My favorites:
1. My oscilloscope, Yokogawa DLM2024.
2. My soldering iron, Weller WD2M, WMRP+WMRT.
3. JTAGICE3 debugger.

Last Edited: Fri. Jan 9, 2009 - 04:26 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

In a real project for display purposes, speed isn't critical. Display contents are updated a few times a second. [I have never updated one as fast as possible--the value can't then be read anyway 'cause too much flicker.]

Size might be critical in a "little meter" with maybe a packed Mega48 or Tiny2xxx. then you just tune the routine as needed. In that type of app there are probably very few different display formats anyway.

So as I mentioned earlier, mine is fast enough and small enough even with the "slow" EEPROM table, etc.

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

Oh, slowest number with my code is 30000.

Are you sure?
If it's the sub version with positive numbers 29999 should be slower than 30000 !
Peter's code that is a add/sub combo is slow with thise numbers 909(for <10000) 30909(for<32768) 50909(for <65535 only work in the ASM version).
I took a look to see if the div by mul with 1/x will gain anything. And the result (I compare ASM code) is the code will be bigger than peters add/sub. But the code will be able to produce a better worst case speed (Peters worst = 176 clk(without call/ret)). And the other way will be about 80 clk.
I will post some code when it works but my estimate come from this:
first digit (do normal sub) about 20 clk
div the 0..9999 reminder by 100 about 30 clk
break the two 0..99 numbers down about 2*10 clk
about 10 clk for extra problems :)

Jens

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Anybody tried 65535 yet? That wont work with itoa, have to use utoa. Spec says 'unsigned it' so I bet you can force utoa to be used if you put a U after it......

Imagecraft compiler user

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Sometimes even speed of int -> BCD matters. This could be very usefull if the conversion is inside an ISR or if the controller is very busy otherwise. For one programm a used BCD arithmatics to avoid a slow bin->bcd conversion. Though with these fast routines, i might think about rewriting that programm.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Lee, speed is absolutely important. I talk about power here and I want everything be done as fast as possible so I can go back to sleep.

About if 30000 is slowest for subtract, no I am not sure but it was slower than 99999, 0 , 0xFFFF and I think it is also slower than 29999, I do not have time to test it right now (it's morning here and I am not even at work).

My favorites:
1. My oscilloscope, Yokogawa DLM2024.
2. My soldering iron, Weller WD2M, WMRP+WMRT.
3. JTAGICE3 debugger.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Ok if speed realy matter then I can tell that it looks like I can keep my word about a ASM version that use 80 clk.
I Just solved the biggest problem , div 0..9999 with 100, and calc the reminder, that can be done in 28 clk (24 instructions).(no jumps so it's allways 28 clk)

Split the 0..99 up to two digits take 13 clk (11 instructions)

But I have to add some move instructions before I put it together. But now it's bed time here.

So I think that even with the overhead in C's mul function this way can be fastest in C aswell.

Jens
Are there any C lib functions for 8bit*8bit=16bit and
8*16=24 in any of the used compilers.
I remember that in the past I asked for a 8*8=16 for the codevision and 2 hours later he showed how to write the function.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

What about this code?

	RSEG CODE:CODE:NOROOT(1)
	PUBLIC _i2a
//   13 __z void i2a(char *s, UINT16 v)
//R16:R17 - v
//R30:R31 - *s
//
_i2a:
//   14 {
//   15   UINT8 m0; //R16
//   16   UINT8 m1; //R17
//R18-R20 - 24bit fmul result
//R21 - c,b,a ->06 8D B9
//R22 - zero reg
	CLR	R22
	LDI	R21,0x06
//  v=__multiply_unsigned(m0,0x06)+3;
	MUL	R16,R21
	MOVW	R19:R18,R1:R0
	SUBI	R18,0xFD
	SBCI	R19,0xFF
//  v+=__multiply_unsigned(m1,0x06)<<8;
	MUL	R17,R21
	MOV	R20,R1
	ADD	R19,R0
	ADC	R20,R22
//  v+=__multiply_unsigned(m1,0x8D);
        LDI     R21, 0x8D
        MUL     R17, R21
        ADD     R18, R0
        ADC     R19, R1
	ADC	R20, R22
//  v+=__multiply_unsigned(m0,0x8D)>>8;
        MUL     R16, R21
        ADD     R18, R1
        ADC     R19, R22
	ADC	R20, R22
//  v+=__multiply_unsigned(m1,0xB9)>>8;
	LDI	R16,0x10		; Counter & flags
	LDI	R21,0xB9
        MUL     R17, R21
        LDI     R21, 10			; Next multiplicand
        ADD     R18, R1
        ADC     R19, R22
	ADC	R20, R22
	BREQ	??i2a_0
	SUBI	R20,208
	ST	Z+,R20
	INC	R16
??i2a_0:
//   39     UINT16 hv;
//   40     UINT8 bv;
//   41     bv=v>>8;
        MOV     R17, R19
//   42     v=__multiply_unsigned(v,10);
        MUL     R18, R21
        MOVW    R19:R18, R1:R0
//   43     hv=__multiply_unsigned(bv,10);
        MUL     R17, R21
//   44     v+=(hv&0xFF)<<8;
        ADD     R19, R0
//   45     if (SREG_Bit0) hv+=0x100;
	ADC	R1, R22
//   46     bv=hv>>8;
        MOV     R17, R1
//   47     if ((i|bv)&0x8F)
        MOV     R20, R1
        OR      R20, R16
        ANDI    R20, 0x8F
        BREQ    ??i2a_1
//   48     {
//   49       *s++=bv+'0';
	SUBI	R17,208
	ST	Z+,R17
//   50       i|=1;
//        ORI     R18, 0x01
??i2a_1:
//   51     }
//   52     i<<=1;
	ROL	R16
//   54   while(!SREG_Bit0);
        BRBC    0, ??i2a_0
//   55   *s=0;
        ST      Z, R22
//   56 }
        RET
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:

Lee, speed is absolutely important.

There is also "good enough". If you are going to agonize about 10 cycles in displaying the value, then you'd better dig through EVERY compiler primitive (e.g., mul & div routines; EEPROM read; EEPROM write; flash read, shift) and agonize about a possible cycle here and two there. And you had better know your compiler's code generation model intimately so that your C code translates int as-near-to-optimal sequences as you can get. [That actually is a good idea. But the way I might write the "best" function in Codevision may not be the same as the "best" way in ImageCraft or GCC as the code generation models are different.]

Meanwhile, your project never gets "done". Once you get your average current draw down below the typical battery-leakage value--the shelf life-- it doesn't matter anyway as you are going to get variations from battery to battery and brand to brand.

Now, if you are tuning the ISR for a high-speed operation such as encoder or industrial counter with "trip" comparison or pixel output or similar, then close examination and tweaking and cycle-shaving is certainly justified.

Many of my production apps have no C library includes at all. (Well, I almost have CV's delay.h that I use for my startup delay but that is trivial.) My "bigger" apps are Mega32/Mega64 class and might have a multi-line display and a menu system and different mode displays. those might have a few memcpy/strcpy uses, and some might have sprintf(). PErhaps a couple from math.h here-and-there. But I suspect a lot less than many apps that are of different types.

Oh, yeah, back to your agonizing: When you app is "done", then you have to carefully evaluate, FOR THAT PARTICULAR APP, whether you are better off going fast as heck to go back to sleep earlier or to plod along very sedately towards your bed conserving energy in the process. That answer isn't always obvious, and much will have to do with peripheral subsystems enabled when awake.

Lee

You can put lipstick on a pig, but it is still a pig.

I've never met a pig I didn't like, as long as you have some salt and pepper.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

We designer must also think about the evironment. One less clock cycle and one byte less flash means we will use slightly less wafer and battery. Just image how much of a Nuclear Power Plant and size of garbage one byte and one clock cycle in every computer in the world would mean?

Don't take this to seriously, remember that this started as a Christmas hobby project.

My favorites:
1. My oscilloscope, Yokogawa DLM2024.
2. My soldering iron, Weller WD2M, WMRP+WMRT.
3. JTAGICE3 debugger.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Code length is only important if you are short on flash and every byte count. I would guess that on evarage 1000s of bytes of flash are unused with the controllers.

A special case may be bootloaders, but there speed is of no importance.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

OK now I have a result of the uint16 to 5 BCD bytes.
This is ASM code, but can be put into a C function.
This code is optimized for worst case speed.
The code is not optimal but at least it show that the DIV by MUL by 1/X can be done fast.
The code can run on any AVR with a HW multiplier.
Just by moving the possible error correction on first digit down to the div by 100 code there can be saved 2 clk, but I don’t want to make it to messy.

Worst case the conversion take 69 CLK. Best case 67

The code is solving the problem this way.

Find the first digit.

Then div with 100, and calc the reminder

Take those 2 numbers and split them up to 2 digits.

This is the code. It can be pasted into AVR studio, I kept my test loop
It looks like there is a problem with the use of tabs.


.include "m88def.inc"
;
;********************************
; unsigned 16 bit to 5 byte BCD 
; Author   Jens Norgaard-Larsen
;******************************** 
; code size without loop
; 58 *2 = 116 byte 
; speed without loop update
; best case : 67 clk
; worst case : 69 clk
// just a part of the test code
	ldi		r16,low(10000)
	ldi		r17,high(10000)
	movw	r10,r16
// in:	16 bit UINT in R17:R16
// out: R24:R23:R22:R21:R20
// change R16,R17,R18,R19,R0,R1
// change R10,R11 only in test loop
top:
// find first digit by mul high byte with 6 this give
// the result in the high byte (can be 1 to small))
	ldi		r18,6
	mul		r17,r18
	mov		r24,r1 ;r24 now calcluated
;find reminder by sub 10000*result from R17:R16
	ldi		r18,low(10000)
	mul		r24,r18 
	movw		r20,r0
	ldi		r18,high(10000)
	mul		r24,r18
	add		r21,r0
	sub		r16,r20
	sbc		r17,r21
; if reminder >=10000 then inc result and 
; sub 10000 from R17:R16
	cpi		r16,low(10000)
	cpc		r17,r18
	brmi		L000
	inc		r24
	subi		r16,low(10000)
	sbci		r17,high(10000)
L000:
; DIV with 100 result in R22
; reminder in R20
; function used is result = (R17:R16*41-R17:R16>>10*41)>>12
; MUL by 41
	ldi		r18,41
	ldi		r22,0
	mul		r16,r18
	movw		r20,r0
	mul		r17,r18
	add		r21,r0
	adc		r22,r1
;>>10 is the same as highbyte >>2
	lsr		r17
	lsr		r17
;do mul and sub result
	mul		r17,r18
	sub		r20,r0
	sbc		r21,r1
	sbci		r22,0
;>>12 is the same as <<4 we know that result
; only is 1 byte
	swap		r22
	swap		r21
	eor		r22,r21
	andi		r22,240
	eor		r22,r21
;find reminder by number-result*100 
	ldi		r20,100
	mul		r20,r22
	movw		r20,r16
	sub		r20,r0

;split the value in R22 into 2 digits.
;formular y=(number*51+20)>>9
	ldi		r16,51
	mul		r22,r16
	movw		r18,r0
	subi		r18,low(-20)
	sbci		r19,high(-20)
	lsr		r19
;calc the reminder
	ldi		r17,10
	mul		r19,r17
	sub		r22,r0
	mov		r21,r19

;this is a repeat on the other 2 digits
	mul		r20,r16
	movw		r18,r0
	subi		r18,low(-20)
	sbci		r19,high(-20)
	lsr		r19
	mul		r19,r17
	sub		r20,r0
;this is just move so everything stays in order
	mov		r23,r21
	mov		r21,r19

; this is just a part of my test code
	nop
	movw		r16,r10
	subi		r16,1
	sbci		r17,0
	movw		r10,r16
	rjmp		top