code space saving tricks.

Go To Last Post
57 posts / 0 new
Author
Message
#1
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I have noticed there are some tricks one can apply to save some code space, compiler options in make, using define where appropriate, inilne keywords, proper size data types, etc.. But I think it goes without saying the best practices in code design, will help the most. What I'm curios about is what experienced FM developers have to say on the matter. I figure there was a site or post devote to this alone. I'm looking for a reference where one could get a few pointers to this sort of thing.

 

For example I have some code that I call maybe 100 times.

static void sendData(unsigned char data)
{

    oscilator = 0x2f;//starts with red high white low,
    for ( char loop=7;loop>=0;loop--)
    {    
        if ( (data >> loop) & 1 ) //its a 1
        { PORTC = oscilator;PORTC = 0x3f;/*for compat*/PORTC = ~oscilator; }// on the falling edge we will get a 1
        else /*its a 0*/{ PORTC = oscilator; PORTC = 0; }// on the falling edge we will get a 0
        oscilator = ~oscilator;//flip it.
    }
} 

or

static void sendStart(void)
{
	DDRC = 0x3F;//set to out
	PORTC= 0x3F;//both high
	WAIT1
	PORTC = 0x1f;WAIT2;//set red  low, keep white is high
	PORTC = 0;	 WAIT2;// bring white low /1
	PORTC = 0x1f;WAIT2;// bring white high 
	PORTC = 0;	 WAIT2;// bring white low /2
	PORTC = 0x1f;WAIT2;// bring white high
	PORTC = 0;	 WAIT2;// bring white low/2
	PORTC = 0x1f;WAIT2;// bring white high
	PORTC = 0;	 WAIT2;// bring white low/4
	PORTC = 0x1f;WAIT2;// bring white high
	PORTC = 0x3F;//red now must be high
}

 

I can't seem to save code space with this no matter how its defined but feel there is a more efficient way. Would putting this in a S file and writing asm be best? Just trying to learn ways to program more efficiently.

 

 

 

 

Last Edited: Fri. Jul 31, 2015 - 01:24 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Can you show the ASM code the function generate?

 

But one bad thing in your code is data >> loop

roll data by one each time.
 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Here this may help

static void sendData(unsigned char data)
{
                 		.loc 1 44 0
                 		.cfi_startproc
                 	.LVL0:
                 	/* prologue: function */
                 	/* frame size = 0 */
                 	/* stack size = 0 */
                 	.L__stack_usage = 0
**** 	oscilator = 0x2f;//starts with red high white low, 
                 		.loc 1 45 0
   0000 9FE2      		ldi r25,lo8(47)
   0002 9093 0000 		sts oscilator,r25
                 	.LVL1:
   0006 27E0      		ldi r18,lo8(7)
   0008 30E0      		ldi r19,0
                 	.LBB22:
**** 	for ( char loop=7;loop>=0;loop--)
**** 	{	
**** 		if ( (data >> loop) & 1 ) //its a 1
                 		.loc 1 48 0
   000a 90E0      		ldi r25,0
**** 		{ PORTC = oscilator;PORTC = 0x3f;/*for compat*/PORTC = ~oscilator; }// on the falling edge we wil
**** 		else /*its a 0*/{ PORTC = oscilator; PORTC = 0x0f; }// on the falling edge we will get a 0
                 		.loc 1 50 0
   000c 7FE0      		ldi r23,lo8(15)
**** 		{ PORTC = oscilator;PORTC = 0x3f;/*for compat*/PORTC = ~oscilator; }// on the falling edge we wil
                 		.loc 1 49 0
   000e EFE3      		ldi r30,lo8(63)
                 	.LVL2:
                 	.L5:
   0010 6091 0000 		lds r22,oscilator
**** 		{ PORTC = oscilator;PORTC = 0x3f;/*for compat*/PORTC = ~oscilator; }// on the falling edge we wil
                 		.loc 1 48 0
   0014 AC01      		movw r20,r24
   0016 022E      		mov r0,r18
   0018 00C0      		rjmp 2f
                 		1:
   001a 5595      		asr r21
   001c 4795      		ror r20
                 		2:
   001e 0A94      		dec r0
   0020 02F4      		brpl 1b
**** 		{ PORTC = oscilator;PORTC = 0x3f;/*for compat*/PORTC = ~oscilator; }// on the falling edge we wil
                 		.loc 1 49 0
   0022 68B9      		out 0x8,r22
**** 		{ PORTC = oscilator;PORTC = 0x3f;/*for compat*/PORTC = ~oscilator; }// on the falling edge we wil
                 		.loc 1 48 0
   0024 40FF      		sbrs r20,0
   0026 00C0      		rjmp .L2
**** 		{ PORTC = oscilator;PORTC = 0x3f;/*for compat*/PORTC = ~oscilator; }// on the falling edge we wil
                 		.loc 1 49 0
   0028 E8B9      		out 0x8,r30
   002a 4091 0000 		lds r20,oscilator
   002e 4095      		com r20
   0030 48B9      		out 0x8,r20
   0032 00C0      		rjmp .L3
                 	.L2:
                 		.loc 1 50 0
   0034 78B9      		out 0x8,r23
                 	.L3:
**** 		oscilator = ~oscilator;//flip it.
                 		.loc 1 51 0
   0036 4091 0000 		lds r20,oscilator
   003a 4095      		com r20
   003c 4093 0000 		sts oscilator,r20
                 	.LVL3:
                 	.LVL4:
   0040 2150      		subi r18,1
   0042 3109      		sbc r19,__zero_reg__
   0044 00F4      		brcc .L5
                 	/* epilogue start */
                 	.LBE22:
**** 	}
**** }
                 		.loc 1 53 0
   0046 0895      		ret
                 		.cfi_endproc
                 	.LFE7:
                 		.section	.text.sendStart,"ax",@progbits
                 	sendStart:
                 	.LFB8:

but really I'm looking for tips on storage. I read up on extern but its not helping. As I understand it all my global vars can stay in my main header file that is included throughout the project but I can add the extern key word. Then I can declare them in the main file. This works but saves me nothing (same percentage of code ).

 

 

Last Edited: Fri. Jul 31, 2015 - 02:28 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I see nothing bad in the generated code, and the >> code is small (but a bit slow, I feared a function call).

the only real saver would be if oscilator could be made local, or perhaps as a register variable.

 

And if size really matter don't look inside this function (you could same 10-20 bytes), but perhaps look at how data gets assigned before a function call, one saver could be to have data

as a register variable, so the function call don't need to handle a parameter.

 

Last Edited: Fri. Jul 31, 2015 - 03:30 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 1
        PORTC = 0x1f;WAIT2;//set red  low, keep white is high
	PORTC = 0;	 WAIT2;// bring white low /1
	PORTC = 0x1f;WAIT2;// bring white high 
	PORTC = 0;	 WAIT2;// bring white low /2
	PORTC = 0x1f;WAIT2;// bring white high
	PORTC = 0;	 WAIT2;// bring white low/2
	PORTC = 0x1f;WAIT2;// bring white high
	PORTC = 0;	 WAIT2;// bring white low/4
	PORTC = 0x1f;WAIT2;// bring white high

Are you really saying it doesn't pay to wrap a for() loop around this repetitive code?

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

change to this

static void sendData(unsigned char data)
{
	char oscilator = 0x2f;//starts with red high white low, 
	for ( unsigned char loop=0x80;loop > 0;loop >>= 1)
	{	
		if ( data & loop ) //its a 1
		{ PORTC = oscilator;PORTC = 0x3f;/*for compat*/PORTC = ~oscilator; }// on the falling edge we will get a 1
		else /*its a 0*/{ PORTC = oscilator; PORTC = 0x0f; }// on the falling edge we will get a 0
		oscilator = ~oscilator;//flip it.
	}
}

but here is what I was looking for  http://www.atmel.com/images/doc8...

Anything along these lines will help me a great deal.

 

 

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Guidelines are all very well but it's the practical realities that matter. If I build code that is too big for the target device and there is some reason why I can't simply trade up to the next model in the series where it would fit then I start with avr-nm and use --size-sort to get it to output a list of al the functions in the program with the largest listed first and so on. The big ones are the ones with most scope to be trimmed so then I would look at the .lss and see if the code generated for them was as compact as I could have made it if I wrote the thing in Asm myself. If not I look at restructuring the code, casting things and so on that might aid the compiler to come up with a "tighter" solution that still achieved the same job. Obviously if you ever find what's effectively the same sequence of code repeated more than once you have to ask yourself if space could be trimmed by putting it in a for() loop - though that obviously carries a small overhead itself. Obviously use the smallest/simplest types possible. The compiler much prefers adding uint8_t's rather than uint64_t's and so on.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

clawson wrote:

        PORTC = 0x1f; WAIT2;//set red  low, keep white is high
	PORTC = 0;    WAIT2;// bring white low /1

	PORTC = 0x1f; WAIT2;// bring white high 
	PORTC = 0;    WAIT2;// bring white low /2

	PORTC = 0x1f; WAIT2;// bring white high
	PORTC = 0;    WAIT2;// bring white low/2

	PORTC = 0x1f; WAIT2;// bring white high
	PORTC = 0;    WAIT2;// bring white low/4

	PORTC = 0x1f; WAIT2;// bring white high

Are you really saying it doesn't pay to wrap a for() loop around this repetitive code?

+1

 

Or, possibly, rather than trying to code the sequence in-line do it in a timer handler - so it just has to toggle the bits ...

 

However, as clawson's other post suggests, your really need to be looking at The Big Picture - rather than trying to micro-manage the little stuff like this

Top Tips:

  1. How to properly post source code - see: https://www.avrfreaks.net/comment... - also how to properly include images/pictures
  2. "Garbage" characters on a serial terminal are (almost?) invariably due to wrong baud rate - see: https://learn.sparkfun.com/tutorials/serial-communication
  3. Wrong baud rate is usually due to not running at the speed you thought; check by blinking a LED to see if you get the speed you expected
  4. Difference between a crystal, and a crystal oscillatorhttps://www.avrfreaks.net/comment...
  5. When your question is resolved, mark the solution: https://www.avrfreaks.net/comment...
  6. Beginner's "Getting Started" tips: https://www.avrfreaks.net/comment...
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I will just add:

Your function is about 70 bytes in size, with oscilator local it would be about 50 bytes, and compact ASM would be about 30 bytes.

So not much to gain.

But if you say the that the function are called 100 times, I will ask why?

Perhaps think of a way to reduce the number of calls with some small general functions.

 

And perhaps have data in a register, so the function call don't need a parameter, if you save 2 bytes for each call that would gain 200 bytes :)

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 1

And for sendStart I would make a table in flash holding

0x3f,0x1f,0x00,0x1f,0x00,0x1f,0x00,0x1f,0x00,0x1f,0x3f

and then a loop that output on the port.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

sendData is called every time I need to send data, and the code has to call it many times for communication. The single is very complicate and not SPI like. I like the table idea...

 

good info clawson, thx.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

When I was trying to squeeze the ASF's USB code into a 4k bootloader I found that using whole program optimization at the linker stage helped.

 

Try something like:

 

-fuse-linker-plugin -flto -flto-partition=max -Os

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Against the meaning with c's structure, avoid more than one c-file if you want optimal code (if you insist on more files use include of those), that way

the compiler can see everything and make a better global optimizing.

And with the speed of today's PC's who care about compile/link time.   

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

In a well-structured program, there shouldn't be much calling between modules - so this shouldn't make a significant difference ?

 

Of course, proper us of 'static' within files helps...

 

with the speed of today's PC's who care about compile/link time

With the cost of Flash on today's chips, is it really worth the effort?

 

Top Tips:

  1. How to properly post source code - see: https://www.avrfreaks.net/comment... - also how to properly include images/pictures
  2. "Garbage" characters on a serial terminal are (almost?) invariably due to wrong baud rate - see: https://learn.sparkfun.com/tutorials/serial-communication
  3. Wrong baud rate is usually due to not running at the speed you thought; check by blinking a LED to see if you get the speed you expected
  4. Difference between a crystal, and a crystal oscillatorhttps://www.avrfreaks.net/comment...
  5. When your question is resolved, mark the solution: https://www.avrfreaks.net/comment...
  6. Beginner's "Getting Started" tips: https://www.avrfreaks.net/comment...
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

With the cost of Flash on today's chips, is it really worth the effort?

If your implying one could always upgrade, that is not an option here. I'm using a 89/88/168/328 form factory chip. AFAIK 328 is the largest. It never really was an issue until I merged two projects. With the first few tips I received I when from %88 data o %80 data. mojo-chan (Paul guessing from the name?) and sparrow2 do have good points. I could join a few files but use I do use globals with static.

 

My compiler option I do

 

CFLAGS += -fno-inline-small-functions
CFLAGS += -ffunction-sections
CFLAGS += --save-temps

 

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Have you tried all of -O1, -O2, -O3 and -Os?

 

Although -Os says "optimize for size" in reality some have found that things like -O2 an d-O3 have sometimes delivered the "tighter" code.

 

As I mentioned above avr-nm --size-sort can be invaluable here - did you try it. Here's a very simple example:

:~/windows/sdboot-trunk/Debug$ avr-nm --size-sort -r pfboot.elf | grep " T "
000002fc T main
0000016c T disk_initialize
000000be T disk_readp
00000082 T UART_dumpsector
0000006a T crc_app_ok
00000056 T disk_read
00000022 T __eeupd_byte_m32
00000020 T __mulsi3
00000020 T __eerd_block_m32
0000001c T UART_puts
00000018 T __umulhisi3
00000018 T UART_putsP
00000016 T __muluhisi3
00000016 T UART_puthex
00000012 T UART_puthex16
0000000e T UART_putnibble
0000000c T __eeupd_word_m32
0000000c T __eerd_word_m32
0000000c T UART_newline
0000000a T UART_init
00000008 T UART_put
00000002 T start

Of these clearly main() and disk_initialize() are the "biggies" and where I should be concentrating my effort initially if I want to reduce the overall size.

 

2FC=764 bytes, 16C=364 bytes: clearly there's more "room for movement" in those than in the 8 bytes of UART_put() !

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

S_K_U_N_X wrote:
If your implying one could always upgrade, that is not an option here

You have to recognise that, at some point, your code is just going to be straight-up too large for a given chip - and there is then no option but to upgrade.

 

The question is when it comes more productive to expend the effort on "upgrading" than to spend it on trying to cram it into a chip which is, essentially, "too small".

 

I do use globals with static.

If they're static, they're not global - they have only file scope.

 

Note that static can be applied to both data and functions.

 

CFLAGS += --save-temps

That has no effect on generated code size.

Top Tips:

  1. How to properly post source code - see: https://www.avrfreaks.net/comment... - also how to properly include images/pictures
  2. "Garbage" characters on a serial terminal are (almost?) invariably due to wrong baud rate - see: https://learn.sparkfun.com/tutorials/serial-communication
  3. Wrong baud rate is usually due to not running at the speed you thought; check by blinking a LED to see if you get the speed you expected
  4. Difference between a crystal, and a crystal oscillatorhttps://www.avrfreaks.net/comment...
  5. When your question is resolved, mark the solution: https://www.avrfreaks.net/comment...
  6. Beginner's "Getting Started" tips: https://www.avrfreaks.net/comment...
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

No this is not a project in design it's a firmware in design, the device is already developed. The idea with this device is that the firmware can grow but if it reaches maximum then that is far as it will grow.

 

By static I'm referring to this.

http://www.atmel.com/images/doc8...

3.6 Tip #6  "For global data, use the static keyword whenever possible"

 

here are the percents I get with each level.

#s %80.07
#1 %86.9
#2 %85.5
#3 %110.4

 

when trying the  avr-nm --size-sort it kicks out all of my set up make options

 

avr-nm --size-sort  -c -mmcu=atmega328p -I. -gdwarf-2 -DF_CPU=12000000L -Os -Wall -fno-inline-small-functions -ffunction-sections  --save-temps -Wstrict-prototypes -Wa,-adhlns=./usbdrv/usbdrv.lst -Iusbdrv -std=gnu99 -Wundef -MMD -MP -MF .dep/usbdrv.o.d usbdrv/usbdrv.c -o usbdrv/usbdrv.o
        avr-nm: invalid option -- c

 

where normally I do

 

avr-gcc  -c -mmcu=atmega328p -I. -gdwarf-2 -DF_CPU=12000000L -Os -Wall -fno-inline-small-functions -ffunction-sections  --save-temps -Wstrict-prototypes -Wa,-adhlns=./usbdrv/usbdrv.lst -Iusbdrv -std=gnu99 -Wundef -MMD -MP -MF .dep/usbdrv.o.d usbdrv/usbdrv.c -o usbdrv/usbdrv.o

 

So I'm not sure how to fit that all in together.

 

I have this in my make

# Define programs and commands.
SHELL = sh
CC = avr-gcc
OBJCOPY = avr-objcopy
OBJDUMP = avr-objdump
SIZE = avr-size
AR = avr-ar rcs
NM = avr-nm
AVRDUDE = avrdude
REMOVE = rm -f
REMOVEDIR = rm -rf
COPY = cp
WINSHELL = cmd

 

Last Edited: Mon. Aug 3, 2015 - 03:05 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

S_K_U_N_X wrote:
(Paul guessing from the name?)

 

How did you come to that conclusion, out of interest?

 

Anyway, as well as the compiler options you use, you should use the linker options for whole program optimization. Try different combinations, like -O1 for the compiler and -Os for the linker, or even -O0 for the compiler maybe. I can't remember, I experimented with various options.

 

Maybe you can generate some data at run-time as well. Things like lookup tables can be calculated if you have free RAM, rather than stored. Do you use __flash / PROGMEM / PSTR much?

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

S_K_U_N_X wrote:
The idea with this device is that the firmware can grow but if it reaches maximum then that is far as it will grow.

laugh

Top Tips:

  1. How to properly post source code - see: https://www.avrfreaks.net/comment... - also how to properly include images/pictures
  2. "Garbage" characters on a serial terminal are (almost?) invariably due to wrong baud rate - see: https://learn.sparkfun.com/tutorials/serial-communication
  3. Wrong baud rate is usually due to not running at the speed you thought; check by blinking a LED to see if you get the speed you expected
  4. Difference between a crystal, and a crystal oscillatorhttps://www.avrfreaks.net/comment...
  5. When your question is resolved, mark the solution: https://www.avrfreaks.net/comment...
  6. Beginner's "Getting Started" tips: https://www.avrfreaks.net/comment...
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

when trying the  avr-nm --size-sort it kicks out all of my set up make options

Sorry. I sort of assumed most people know how to use nm just as they know how to use objdump, objcopy, readlef, ranlinb and so on.

 

avr-nm is NOT a replacement for avr-gcc. It's a tool to run on the .elf file once it has been built in the normal way. In my example above you can see:

~/windows/sdboot-trunk/Debug$ avr-nm --size-sort -r pfboot.elf | grep " T "

In this I have already built pfboot.elf. I then pass it as an input to avr-nm and when invoking it I use --size-sort (which does exactly what it says on the tin) and -r which means "sort from largest to smallest" rather than the default which is "smallest to largest". This is simply so the big ones appear near the top of the output. If I run it without the additional grep the output looks like this:

$ avr-nm --size-sort -r pfboot.elf 
000002fc T main
00000200 B Buff
0000016c T disk_initialize
000000be T disk_readp
00000082 T UART_dumpsector
0000007c t send_cmd
0000006a T crc_app_ok
00000056 T disk_read
00000022 t updcrc
00000022 T __eeupd_byte_m32
00000020 T __mulsi3
00000020 T __eerd_block_m32
0000001c T UART_puts
00000018 T __umulhisi3
00000018 T UART_putsP
00000016 T __muluhisi3
00000016 T UART_puthex
00000012 T UART_puthex16
0000000e T UART_putnibble
0000000c T __eeupd_word_m32
0000000c T __eerd_word_m32
0000000c t __c.2051
0000000c T UART_newline
0000000a t __c.2049
0000000a t __c.2004
0000000a T UART_init
00000008 T UART_put
00000006 t __c.2009
00000005 t __c.2006
00000002 T start
00000001 B CardType

That includes " t " and " B " entries as well as the " T " ones I'm actually interested. So I used the grep filter to just get the " T " entries I was particularly interested in.

 

If you don't know about GNU binutils it is well worth exploring this manual:

 

https://sourceware.org/binutils/...

 

As well as objdump and objcopy that your build is probably already using (all prefixed with "avr-") there are lots of useful tools supplied with GCC. nm (that is "avr-nm") is just one of them. The manual for nm specifically is here:

 

https://sourceware.org/binutils/...

 

That will also tell you what " T ", " t ", " B ", etc. are all about too.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 1

merge 2 projects, and optimize make me think, look to see if some of the function in those projects are the same (or close), to avoid some of them.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

If you want fast and compact code, rewrite your programs in assembler language.

 

----------------------------

Here is (more or less) your example in assembler:

.def temp   =  r16    
.def temp2  =  r17
.def temp3  =  r18
.def temp4  =  r19
.def temp5  =  r20     
.def temp6  =  r21
.def ZeroReg = r15

.dseg                           
oscilator:    .byte 1    
data:           .byte 1

.cseg       

sendData:  ; receive data from calling function  
        clr  ZeroReg
        ldi  temp6, 0x07f  ; for complimenting logic using EOR instruction
        ldi  temp, 7
        ldi  temp4, oscilator
        ldi  temp3, 0b01000000
    
loop: 

        lds  temp2, data
        and  temp2, temp3
        breq doZero
doOne: 

        out PORTC, temp4
        ldi temp5, 0x3f
        out PORTC, temp5
        eor temp4, temp6
        out PORTC, temp4
doChk: 

        lsr temp
        bcc loop
        rjmp exit   
doZero:

        out  PORTC, temp4    
        out  PORTC, ZeroReg
        rjmp doChk
        
exit:   ret
   -----------------

 

It's not exact and would require debugging, but its size is about 30-35 flash bytes.  Fast as hell.

 

My experience with non-trivial program code going between Arduino C++ (with libraries) and assembler is that the assembler code is about 1/3th of the final size as the Arduino C++ code.

I've seen examples of Nokia 5110 graphic LCD display code library that added about 4000 bytes to an Arduino program and about 350 bytes when rewritten for an assembler program.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

1/3th of the final size as the Arduino C++ code

The usual ratio between C/C++ and hand crafted Asm is more like 10% not 33%. If you get that kind of savings it's just because of the extremely inefficient way the Arduino authors chose to write their library code like pinMode() and digitalWrite() (which are notorious!)

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

But that code don't flip oscilator for zero!

And are 42 bytes of size!

And you use 6 high registers a real pain to "glue" together with the rest of the program.

 

And if OP had oscilator as local I guess that the C version only would be 50 bytes and close to same speed.

 

So I think that your code are a bad example

 

But a quick look at problem give a solution with something like 32 byte ASM code and a total of 3 high registers used (and like you one low that are zero)

 

Add:

and are missing a rjmp from init to the LSR, (first loop you check with 0b01000000 and not 0b10000000!)

 

Last Edited: Tue. Aug 4, 2015 - 10:00 AM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

We have been here before so only short, it's not a problem to gain 33% (50%) using ASM, but you will need to restructure the problem.

I guess that the 10% are correct if you just hand optimize the output from the C compiler.
 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

 

How did you come to that conclusion, out of interest?

 

mojo-chan I'm familiar with super-play and your retro adapter.

 

 

clawson, I understand now, ill check that out, thx.

Last Edited: Tue. Aug 4, 2015 - 01:56 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

This is very confusing - you appear to address a comment to me but you are quoting someone else?!?

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

yeah that quote didn't include the "quoter", ill fix it up.

Last Edited: Tue. Aug 4, 2015 - 10:28 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I just tried to compile this code from #1

static void sendData(unsigned char data)
{

    oscilator = 0x2f;//starts with red high white low,
    for ( char loop=7;loop>=0;loop--)
    {    
        if ( (data >> loop) & 1 ) //its a 1
        { PORTC = oscilator;PORTC = 0x3f;/*for compat*/PORTC = ~oscilator; }// on the falling edge we will get a 1
        else /*its a 0*/{ PORTC = oscilator; PORTC = 0; }// on the falling edge we will get a 0
        oscilator = ~oscilator;//flip it.
    }
} 

And had a problem with output, because the compiler didn't make a RET at the end of the function, and the forever loop in main was broken!

Then I found the problem :)  loop>=0 are always true so it's a forever loop!

So how did you get the compiler to make the code in #3 ?

I use default compiler with 6.2 (optimizer setting don't matter)

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

sparrow2 wrote:

I just tried to compile this code from #1

static void sendData(unsigned char data)
{

    oscilator = 0x2f;//starts with red high white low,
    for ( char loop=7;loop>=0;loop--)
    {    
        if ( (data >> loop) & 1 ) //its a 1
        { PORTC = oscilator;PORTC = 0x3f;/*for compat*/PORTC = ~oscilator; }// on the falling edge we will get a 1
        else /*its a 0*/{ PORTC = oscilator; PORTC = 0; }// on the falling edge we will get a 0
        oscilator = ~oscilator;//flip it.
    }
} 

And had a problem with output, because the compiler didn't make a RET at the end of the function, and the forever loop in main was broken!

Then I found the problem :)  loop>=0 are always true so it's a forever loop!

So how did you get the compiler to make the code in #3 ?

I use default compiler with 6.2 (optimizer setting don't matter)

 

How so?

 

loop starts with 7

then 6,5,4,3,2,1,0,-1 <- is less then 0

 

My second version shifts bits instead and seems to do a better job.

 

 

 

 

Last Edited: Thu. Aug 6, 2015 - 12:05 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

What is the numeric range of a "char" variable (and no I don't mean "signed char" and I don't mean "unsigned char" either)?

 

What is the range when the -funsigned-char build option is used? (as it is by default in Studio).

 

Only ever use "char" for things which are characters like 'A', 'M', 'x', etc. In all other cases use int8_t or uint8_t to show your exact intention to the reader (and compiler).

 

In your example code it looks like you were planning for it to be int8_t.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

ok I made char signed :)

 

And that code make 70 bytes.

first big change is by makeing oscilator local (from your code I see no reason not to ) then it go to 50 bytes.

 

your 2. code are 50 bytes as well but don't have the bad >> loop.

I keep that one for the next changes.

PORTC = oscilator; can go outside the if (both 0 and 1 start with that) down to 44 bytes.

 

Now I see that you have changed the code for 0 do you want to write 0 or 0x0f to the port?

 

now I changed so the  oscilator=~oscilator; are done in both 1 and 0, it don't really make any difference in C

and still come up with 44 bytes.

 

But that is what I have based a ASM program on and it takes 32 bytes. I use no loop counter because it shift direct on data, and use

carry to check when done.

So about 27% smaller and it's about 20% faster.

 

 

 

 

 

 

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

sparrow2, what did you change, code on your side? I dont see any change in that last post I didnt already do.  If you have some code you think is smaller I'm happy to try it.

 

clawson, I always used char since most code examples do that, but you do bring a valid point. I may need to go thru all of my code some time and replace char and unsigned char with a better type name but if char is used and understood by the developer it will not go over 128 (decimal) is there any other advantage? char, uchar, uint8_t, or int8_t are all the same size right? Are we talking purely code readability here? If so how does one make the avr IDE color code the others blue.

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

char, uchar, uint8_t, or int8_t are all the same size right?

They are for most AVR 8bit C compilers that I know of but one further thing you should note is that char is not guaranteed to be 8 bits on all CPUs. A C compiler should provide a symbol called CHAR_BIT. Usual;ly/often that is defined as 8 but it could be something else. Yet another reason to only use "char" itself when dealing with char(acters).

 

As it happens:

$ avr-gcc -mmcu=atmega16 -E -dM avr.c | grep CHAR_BIT
#define __CHAR_BIT__ 8
$

<stdint.h> is about code readability - which also means maintainability - the likelihood of the next programmer who deals with the code (could be you in 3 years!) to understand the original authors intentions but it is also (very importantly) about portability too.

 

If you see "unsigned char" in a C program you *might* expect the author meant this variable to hold 0 .. 255 but if you see uint8_t, where the "8" stresses that this really is 8 bits, you KNOW the author means a range of 0 .. 255.

 

This becomes even more important when you get beyond char to short/int. How wide are "short" and "int"? If you are talking about AVR code then both short and int are 16 bit. If you are talking about 32bit (x86) code then short is 16 bit and int is 32 bit. For a 64bit PC like AMD64 it might mean double again. If, however you used int16_t or uint32_t in a program then you could build it for AVR or x86 or AMD64 (or ARM or anything) and it would work identically because int16_t ALWAYS asks for a variable with a -32768..+32767 range and uint32_t always delivers a variable with a 0 .. 4,294,967,296 range. An AVR maps int16_t to "singed int" and uint32_t to "unsigned long" while an x86 maps the first to "signed short" and the second to "unsigned int". The "mapping" is done by way of a typedef.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

All points understood and agreed.

 

Makes it hard when other projects use char though, for example

 

PROGMEM const char usbDescriptorConfiguration[] = 

is part of a usb firmware driver, like many authors they use char (agreed this is unwise for the reasons above), so when implementing it with int8 I may get incomparable types. 

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

ok I found that if the for loop is a simple (char) counter it don't make that stupid 16 bit code.

so this code should do the same and are 38 bytes, (but start to use a lot of registors!)

static void sendData(unsigned char data)
{
	unsigned char loop=0x80;
	char oscilator = 0x2f;//starts with red high white low,
	for ( signed char count=7;count--;)
	{
		PORTC = oscilator;// both do this
		oscilator = ~oscilator;// if we do it here 1 can use it direct
		if ( data & loop ) //its a 1
		{
			PORTC = 0x3f;
			PORTC = oscilator;
		}
		else
		{
			PORTC = 0x0f;// in org code you had 0 here
		}
		loop >>=1;
		//		oscilator = ~oscilator;//flip it.
	}
}

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

First I didn't see any mention of profiling or code analysis. I just see a swag as to where the code is being used.

Second a function that is called 100 times uses the same space as if it's called once. You're looking in the wrong haystack for that needle.

Third assembly language cant fix bad design.

The way to truly save code space is by re-writing big chunks to use a more compact algorithm. I can save a lot of code space by writing better code, C++ or C.

( I haven't written asm, except some pic12 stuff to see how to do it, in over 20 years for a good reason )

 

If you're not looking at the MAP file and optimizing for size that will help, but your debugging will be  orders of magnitude harder.

 

 

Keith Vasilakes

Firmware engineer

Minnesota

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I would be interesting (and quite dangerous) to have a "super smart" compiler that would actually rewrite code to make it more optimized (in terms of size, speed, or other weighted parameters).  So for example, it might recognize that a for loop would be better replaced with an if else if else pattern.   Or that some complicated function be replaced with a state sequence.  Of course, like any form of over-automation, guiding the thing becomes impossible, since it will  (in)conveniently ignore what is not readily described, yet vital.  You see similar things when using autorouters (swearing at one to avoid certain layout routes rarely works).

When in the dark remember-the future looks brighter than ever.   I look forward to being able to predict the future!

Last Edited: Sat. Aug 8, 2015 - 02:28 AM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

When I look at to ASM output of the compiler I always get surprised, how smart it is at some things, and total lame generating code for others.

 

for code like this :

for ( signed char count=7;count--;

And count isn't used in the code it loads count with 8 not 7 !

 

but for code like this

for ( unsigned char loop=0x80;loop > 0;loop >>= 1)

it insist on making 16 bit update  and compare code!

 

But as Keith point out the real space/speed optimizer are you! by makeing a good plan, and if you have special needs then

plan after it by using a structure that fit the job, (perhaps even use an other compiler language if needed ).

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

 

But in this case "char" *IS* the right type here. It is an array of characters!

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

sparrow2, that code does work and saved 0.1% :) Every bit does help (pun was not intended).

 

Could make this an official how to crunch code topic, albeit it does show your knowledge of ASM and coding, its still fun to do.

 

for fun here is one I had issues with (Original code was complete unrolled) because of the timing. But found skipping data (hence the 500 us ) did the job.

long data=0;
long bit=0;

_delay_us(43);    //jump in to start bit.

for (bit=0x80;bit >       0;bit >>= 1) {_delay_us(100); if (PINC & 0x20 ) data |= bit;} //BYTE 0 (bits 8-1)
for (bit=0x8000;bit >       0x80;bit >>= 1) {_delay_us(99); if (PINC & 0x20 ) data |= bit;} //BYTE 1 (bits 16-9)
for (bit=0x800000;bit >   0x8000;bit >>= 1) {_delay_us(98); if (PINC & 0x20 ) data |= bit;} //BYTE 2 (bits 24-17)
_delay_us(500);//jump to data skipping the 4 that messes up the timing.
for (bit=0x8000000;bit > 0x800000;bit >>= 1) { if (PINC & 0x20 ) data |= bit;_delay_us(100);} //BYTE 3 (bits 32-25)

 

because of that timing issue this didn't work

 

for (bit=0x8000000;bit > 0;bit >>= 1){ _delay_us(100); if (PINC & 0x20 ) data |= bit; }

 

I'm guessing this would be near impossible to do with out the wave form.

 

 

 

 

Last Edited: Sat. Aug 8, 2015 - 06:51 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Don't you have a extra timer that can auto reload at baud rate ? (you don't need a ISR just make a while loop until timer run out), that should do the sampling with a jitter of max 1 clk.

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

yeah I normally make a pin change or just a simple while loop for most signals , just didnt for this one mainly because the timing "looked" so consistent. It may be a good candidate for a redesign in the fashion.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I'm glad that you saved 0.1% :)
but at my post #4 & #9 and Keith in #38 :
The real saving could be to make better code where the function are called from.

Can you show some (perhaps all in a file) of the code round where the function are called, I would guess that
there could be made bigger savings there (100 bytes+).
 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Actually here is one that I struggle with. I have a lot of code segments that run a loop and unload a shift register but the results must be in a certain order.  Take this for example.

 

    

   PORTC |= 0x02;PORTC &= ~0x02;_delay_us(1);//shift  high and low
        if (!(PINB & 0x20)) { reportBuffer[BUTTON_ROW_1] |= 0x02; } // blue

        PORTC |= 0x02;PORTC &= ~0x02;_delay_us(1);//shift  high and low
        if (!(PINB & 0x20)) { reportBuffer[BUTTON_ROW_1] |= 0x01; } // red

        PORTC |= 0x02;PORTC &= ~0x02;_delay_us(1);//shift  high and low
        if (!(PINB & 0x20)) { reportBuffer[BUTTON_ROW_1] |= 0x08; } // yellow

        PORTC |= 0x02;PORTC &= ~0x02;_delay_us(1);//shift  high and low
        if (!(PINB & 0x20)) { reportBuffer[BUTTON_ROW_1] |= 0x04; } // green

        PORTC |= 0x02;PORTC &= ~0x02;_delay_us(1);//shift  high and low
        if (!(PINB & 0x20)) { reportBuffer[BUTTON_ROW_1] |= 0x80; } // shoulder right
        
        PORTC |= 0x02;PORTC &= ~0x02;_delay_us(1);//shift  high and low
        if (!(PINB & 0x20)) { reportBuffer[BUTTON_ROW_1] |= 0x40; } // shoulder left
        
        PORTC |= 0x02;PORTC &= ~0x02;_delay_us(1);//shift  high and low
        if (!(PINB & 0x20)) { reportBuffer[BUTTON_ROW_1] |= 0x20; } // pause


Now one could certainly do this in a loop but to reassemble the reportBuffer I would need to if or switch all data. In the end its about the same either way. So  I was thinking maybe there is a way to make a look up table or something a head of time to pre assemble the output better.

 

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Something like this. (yes this does help)

		int8_t bits[7] = {0x02,0x01,0x08,0x04,0x80,0x40,0x20,0x20};//last two are the same button
		for (int8_t i=0; i<8; i++)
		{
			PORTC |= 0x02;PORTC &= ~0x02;_delay_us(1);//shift  high and low
			if (!(PINB & 0x20)) { reportBuffer[BUTTON_ROW_1] |= bits[i]; } 
		}

 

Last Edited: Sun. Aug 9, 2015 - 03:39 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

The only unique part of each of those 'if' statements is the RHS of the |=

 

Why not just assign the purpose of the bits in reportBuffer[BUTTON_ROW_1] linearly?:

  // bit 6: blue
  // bit 5: red
  // bit 4: yellow
  // bit 3: green
  // bit 2: shoulder right
  // bit 1: shoulder left
  // bit 0: pause

  uint8_t mask = 0x40;
  for (uint8_t bit=0; bit<7; bit++, mask>>=1) {
    PORTC |= 0x02;PORTC &= ~0x02;_delay_us(1);
    if (!(PINB & 0x20)) { reportBuffer[BUTTON_ROW_1] |= mask; }
  }

If you can't, just deconvolve them after the fact:

  reportBuffer[BUTTON_ROW_1] =
    __builtin_avr_insert_bits (0x210F4365,
                               reportBuffer[BUTTON_ROW_1],
                               reportBuffer[BUTTON_ROW_1]);

If bit 4 is completely unused, then you can make it even simpler:

  reportBuffer[BUTTON_ROW_1] =
    __builtin_avr_insert_bits (0x210F4365,
                               reportBuffer[BUTTON_ROW_1], 0);

 

"Experience is what enables you to recognise a mistake the second time you make it."

"Good judgement comes from experience.  Experience comes from bad judgement."

"Wisdom is always wont to arrive late, and to be a little approximate on first possession."

"When you hear hoofbeats, think horses, not unicorns."

"Fast.  Cheap.  Good.  Pick two."

"We see a lot of arses on handlebars around here." - [J Ekdahl]

 

Last Edited: Sun. Aug 9, 2015 - 07:32 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Why not just assign the purpose of the bits in reportBuffer[BUTTON_ROW_1] linearly?:

 

Because I have somewhere around 200 devices I read bits from. These bits represent the physical appearance of the device switches(or buttons). Not all hardware loads the register in the same way, thus I must correct some to match a global standard.

 

I didnt know about __builtin_avr_insert_bits that is going to help thx! I can't find where this function is documented but can anyone explain the parameters?  guessing return value = (bit order, in data, ?)

? being what is not clear.

Last Edited: Sun. Aug 9, 2015 - 11:43 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

"Experience is what enables you to recognise a mistake the second time you make it."

"Good judgement comes from experience.  Experience comes from bad judgement."

"Wisdom is always wont to arrive late, and to be a little approximate on first possession."

"When you hear hoofbeats, think horses, not unicorns."

"Fast.  Cheap.  Good.  Pick two."

"We see a lot of arses on handlebars around here." - [J Ekdahl]

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Google works I just had no luck using it :) I saw that page but it didn't look like a doc so a passed it by.

 

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

The docs are minimal, true... but not much more is required I'd say.

 

CTRL-F in your preferred browser is also handy for zeroing in on what you're looking for:

 

 

 

Truth be told, I myself only became aware of this builtin in the last year.

"Experience is what enables you to recognise a mistake the second time you make it."

"Good judgement comes from experience.  Experience comes from bad judgement."

"Wisdom is always wont to arrive late, and to be a little approximate on first possession."

"When you hear hoofbeats, think horses, not unicorns."

"Fast.  Cheap.  Good.  Pick two."

"We see a lot of arses on handlebars around here." - [J Ekdahl]

 

Last Edited: Mon. Aug 10, 2015 - 01:13 AM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

if we still talk about code size, then how are :

_delay_us(1);

implemented ?

I know it's not a function called with 1, but is it a function call or a short inline loop?

if it's inline, it will eat more a flash than an "almost" empty function! (for all delays over 7 clk that should work)

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

implemented ?

I know it's not a function called with 1, but is it a function call or a short inline loop?

if it's inline, it will eat more a flash than an "almost" empty function! (for all delays over 7 clk that should work)

_delay_us() (and _ms()) is a compile time macro wrapper for a call to __builtin_avr_delay_cycles(). It relies on optimisation so the calcultion based on the required time and the defined F_CPU are optimised away and it ends up just inserting __builtin_avr_delay_cycles(N) into the code where N is the result the compiler has calculated.

 

__builtin_avr_delay_cycles() is itself a "built-in" (clue in the name!), that is it is a compiler "intrinsic" which means that it is not a CALL to library code but that the compiler replaces this invocation with a code sequence to implement it. For example:

$ cat avr.c
#define F_CPU 1234567UL
#include <avr/io.h>
#include <util/delay.h>

int main(void) {
    DDRB = 0xFF;
    while (1) {
	PORTB ^= 0xFF;
	_delay_us(45.67);
    }
}
$ avr-gcc -mmcu=atmega16 -Os -save-temps avr.c -o avr.elf
$ cat avr.s
	.file	"avr.c"
__SREG__ = 0x3f
__SP_H__ = 0x3e
__SP_L__ = 0x3d
__CCP__ = 0x34
__tmp_reg__ = 0
__zero_reg__ = 1
	.text
.global	main
	.type	main, @function
main:
/* prologue: function */
/* frame size = 0 */
/* stack size = 0 */
.L__stack_usage = 0
	ldi r24,lo8(-1)
	out 55-32,r24
.L2:
	in r24,56-32
	com r24
	out 56-32,r24
	 ldi r24,lo8(19)
    1:dec r24
    brne 1b
	rjmp .L2
	.size	main, .-main

The compiler has determined that those 3 opcodes (looping twice) will get me as close as possible to 45.67us on 1.23456MHz. No CALL overhead is involved.

 

At 1.234567MHz the cycle time is 0.81us. For 45.67us I therefore need 56(.4) cycles. The LDI is 1 cycle. The DEC is 1 cycle and the BRNE is 2 cycles for all but the last. So I guess it is giving me 1 + 18 *( 1 + 2) + 1 = 56 cycles.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

yes that was how I remembered it.

And that means that OP can save 4 bytes for each 1us delay by making a function (if it can use rcall, only 2 bytes if call )

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

And presumably then reducing the length of the delay in the CALL'd version because of the RCALL/RET overhead itself?

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I just tryid this, I have not checked what the delay should be changed to

void __attribute__ ((noinline)) del(void)
{
	_delay_us(0.9);
}


static void sendData(unsigned char mybyte)
{
	if ((mybyte & (1<<7)) == 0) { PORTD &= ~(1 << 7); } else { PORTD |= (1 << 7); }
//_delay_us(1);	
del();
	if ((mybyte & (1<<6)) == 0) { PORTD &= ~(1 << 6); } else { PORTD |= (1 << 6); }
//_delay_us(1);
del();
	if ((mybyte & (1<<5)) == 0) { PORTD &= ~(1 << 5); } else { PORTD |= (1 << 5); }
//_delay_us(1);
del();
	if ((mybyte & (1<<4)) == 0) { PORTD &= ~(1 << 4); } else { PORTD |= (1 << 4); }
//_delay_us(1);
del();

}

but main thing it saved 34 bytes!