Hand optimisation of Byte Array Serialisation Code

Go To Last Post
21 posts / 0 new
Author
Message
#1
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Dear all,

I am currently attempting to optimise (for flash space) some byte array serialisation code. That is, take a float/unsigned int/whatever and put it into a byte array for transmission.

My old C routines were:

/*
 * Rebuild a 32-bit unsigned value from four consecutive bytes of `array`,
 * least significant byte first: array[0] supplies bits 0-7, array[1] bits
 * 8-15, array[2] bits 16-23 and array[3] bits 24-31.
 *
 * Each byte is widened to uint32_t before it is shifted so that no bits are
 * lost. The four shifted fields do not overlap, so the additions never carry
 * from one byte into the next.
 */
uint32_t Deserialise_uint32_t_old(uint8_t* array)
{
    return  (((uint32_t) array[0])      ) +
           	(((uint32_t) array[1]) <<  8) + 
           	(((uint32_t) array[2]) << 16) + 
           	(((uint32_t) array[3]) << 24);
}

/*
 * Store `data` into array[0..3], least significant byte first.
 * Each cast to uint8_t keeps only the low eight bits of the shifted value,
 * so array[n] receives bits 8n .. 8n+7 of `data`.
 */
void Serialise_uint32_t_old(uint8_t *array, uint32_t data)
{
    array[0] = (uint8_t) ((data & 0xFF));
    array[1] = (uint8_t) ((data) >> 8);
    array[2] = (uint8_t) ((data) >> 16);
    array[3] = (uint8_t) ((data) >> 24);
}

Which generated the following code:

uint32_t Deserialise_uint32_t_old(uint8_t* array)
{
   20a1a:	fc 01       	movw	r30, r24
   20a1c:	21 81       	ldd	r18, Z+1	; 0x01
   20a1e:	30 e0       	ldi	r19, 0x00	; 0
   20a20:	40 e0       	ldi	r20, 0x00	; 0
   20a22:	50 e0       	ldi	r21, 0x00	; 0
   20a24:	54 2f       	mov	r21, r20
   20a26:	43 2f       	mov	r20, r19
   20a28:	32 2f       	mov	r19, r18
   20a2a:	22 27       	eor	r18, r18
   20a2c:	82 81       	ldd	r24, Z+2	; 0x02
   20a2e:	90 e0       	ldi	r25, 0x00	; 0
   20a30:	a0 e0       	ldi	r26, 0x00	; 0
   20a32:	b0 e0       	ldi	r27, 0x00	; 0
   20a34:	dc 01       	movw	r26, r24
   20a36:	99 27       	eor	r25, r25
   20a38:	88 27       	eor	r24, r24
   20a3a:	28 0f       	add	r18, r24
   20a3c:	39 1f       	adc	r19, r25
   20a3e:	4a 1f       	adc	r20, r26
   20a40:	5b 1f       	adc	r21, r27
   20a42:	80 81       	ld	r24, Z
   20a44:	28 0f       	add	r18, r24
   20a46:	31 1d       	adc	r19, r1
   20a48:	41 1d       	adc	r20, r1
   20a4a:	51 1d       	adc	r21, r1
   20a4c:	83 81       	ldd	r24, Z+3	; 0x03
   20a4e:	90 e0       	ldi	r25, 0x00	; 0
   20a50:	a0 e0       	ldi	r26, 0x00	; 0
   20a52:	b0 e0       	ldi	r27, 0x00	; 0
   20a54:	b8 2f       	mov	r27, r24
   20a56:	aa 27       	eor	r26, r26
   20a58:	99 27       	eor	r25, r25
   20a5a:	88 27       	eor	r24, r24
   20a5c:	28 0f       	add	r18, r24
   20a5e:	39 1f       	adc	r19, r25
   20a60:	4a 1f       	adc	r20, r26
   20a62:	5b 1f       	adc	r21, r27
    return  (((uint32_t) array[0])      ) +
           	(((uint32_t) array[1]) <<  8) + 
           	(((uint32_t) array[2]) << 16) + 
           	(((uint32_t) array[3]) << 24);
}
   20a64:	b9 01       	movw	r22, r18
   20a66:	ca 01       	movw	r24, r20
   20a68:	08 95       	ret

00020a06 :

void Serialise_uint32_t_old(uint8_t *array, uint32_t data)
{
   20a06:	fc 01       	movw	r30, r24
    array[0] = (uint8_t) ((data & 0xFF));
   20a08:	40 83       	st	Z, r20
    array[1] = (uint8_t) ((data) >> 8);
   20a0a:	bb 27       	eor	r27, r27
   20a0c:	a7 2f       	mov	r26, r23
   20a0e:	96 2f       	mov	r25, r22
   20a10:	85 2f       	mov	r24, r21
   20a12:	81 83       	std	Z+1, r24	; 0x01
    array[2] = (uint8_t) ((data) >> 16);
   20a14:	cb 01       	movw	r24, r22
   20a16:	aa 27       	eor	r26, r26
   20a18:	bb 27       	eor	r27, r27
   20a1a:	82 83       	std	Z+2, r24	; 0x02
    array[3] = (uint8_t) ((data) >> 24);
   20a1c:	47 2f       	mov	r20, r23
   20a1e:	55 27       	eor	r21, r21
   20a20:	66 27       	eor	r22, r22
   20a22:	77 27       	eor	r23, r23
   20a24:	43 83       	std	Z+3, r20	; 0x03
}
   20a26:	08 95       	ret

As one would expect, writing some assembler drastically reduces the size:

.section .text
.global Serialise_uint32_t

// void Serialise_uint32_t(uint8_t *array, uint32_t data)
//
// In:    r25:r24         = array (destination pointer)
//        r23:r22:r21:r20 = data  (r20 holds the least significant byte)
// Out:   array[0..3]     = data, least significant byte first
// Clobb: r31:r30 (Z)
//
// The symbol is exported with .global so that C code in other translation
// units can link against it, matching Deserialise_uint32_t.
Serialise_uint32_t:

	// Z = array
	movw	r30,	r24

	// Store the four bytes of data at Z+0 .. Z+3
	st	Z,	r20
	std	Z+1,	r21
	std	Z+2,	r22
	std	Z+3,	r23
	ret

;-------------------------------------

.section .text
.global Deserialise_uint32_t

// uint32_t Deserialise_uint32_t(uint8_t *array)
//
// In:    r25:r24         = array (source pointer)
// Out:   r25:r24:r23:r22 = array[3]:array[2]:array[1]:array[0]
// Clobb: r31:r30 (Z); Z is left pointing at array[0]
Deserialise_uint32_t:

	// Z = array. This must happen before r24 is reused for the result.
	movw	r30,	r24

	// Fetch each byte with a fixed displacement from Z straight into the
	// return registers, least significant byte first.
	ld	r22,	Z
	ldd	r23,	Z+1
	ldd	r24,	Z+2
	ldd	r25,	Z+3
	ret


However, what's gnawing at me is that there is still quite a bit of overhead in shuffling registers around by gcc before the function calls.

Apart from further hand-optimisation, is there anything else that I could/should consider to make it a little smaller overall? For example, inline assembly in an inline C function/macro?

-- Damien

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

You can use unions.
Something like this should work:

/*
 * Overlay a 32-bit value with its four constituent bytes so that each byte
 * can be read or written directly. d1 is the byte at the lowest address and
 * d4 the byte at the highest; which of them is the least significant byte of
 * `d` depends on the byte order of the target.
 */
union data {
	uint32_t d;                     /* the value as a whole */
	struct {                        /* the same storage, one byte at a time */
		uint8_t d1, d2, d3, d4;
	};
};
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

damien_d wrote:

Apart from further hand-optiminsation, is there anything else that I could/should consider to make it a little smaller overall? For example, inline assembly in an inline C function/macro?

inline asm as a macro

JW

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0
// NOTE relies on LITTLE-ENDIAN
// Reads the four bytes starting at `array` as one uint32_t by reinterpreting
// the byte pointer, so the result uses the target's native byte order.
// NOTE(review): assumes the compiler/target allows a uint8_t buffer to be
// accessed through a uint32_t pointer (alignment and aliasing) - confirm.
uint32_t Deserialise_uint32_t_old(uint8_t* array)
{
    return  *((uint32_t *) array);
}

// NOTE relies on LITTLE-ENDIAN
// Writes `data` to the four bytes starting at `array` with a single 32-bit
// store through a reinterpreted pointer, so the bytes land in the target's
// native byte order.
// NOTE(review): assumes the compiler/target allows a uint8_t buffer to be
// accessed through a uint32_t pointer (alignment and aliasing) - confirm.
void Serialise_uint32_t_old(uint8_t *array, uint32_t data)
{
    uint32_t *p32 = (uint32_t *)array;
    *p32 = data; 
}

No, I have not tested or simulated this. At a guess the Compiler will produce fairly efficient code. And of course both functions could be written as macros. In which case you do not have to use any non-portable "__inline__" sort of constructs.

David.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

What does it generate if you | the bytes instead of + them? After all, you specify an addition, not a copy of individual bytes.

While we humans easily recognize the possible optimization, I think it's very difficult for a computer. Or, more correctly, it's very difficult for us humans to come up with an algorithm to detect such cases :)

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

TFrancuz wrote:
You can use unions.
Something like this should work:

union data
{
	uint32_t d;
	struct
{
	uint8_t d1;
	uint8_t d2;
	uint8_t	d3;
	uint8_t d4;
};
};

I thought I had tried this some time ago to bad results (though I did do it a little differently).

However, now it looks pretty good: This is what gets generated:

void Serialise_uint32_t_union_struct(uint8_t *buffer, uint32_t data)
{
   209f8:	fc 01       	movw	r30, r24
			uint8_t byData3;
		} sData;
	} uData;

	uData.uiData = data;
	buffer[0] = uData.sData.byData0;
   209fa:	40 83       	st	Z, r20
	buffer[1] = uData.sData.byData1;
   209fc:	51 83       	std	Z+1, r21	; 0x01
	buffer[2] = uData.sData.byData2;
   209fe:	62 83       	std	Z+2, r22	; 0x02
	buffer[3] = uData.sData.byData3;
   20a00:	73 83       	std	Z+3, r23	; 0x03
}
   20a02:	08 95       	ret

00020a04 :

uint32_t Deserialise_uint32_t_union_struct(uint8_t* buffer)
{
   20a04:	fc 01       	movw	r30, r24
			uint8_t byData2;
			uint8_t byData3;
		} sData;
	} uData;

	uData.sData.byData0 = buffer[0];
   20a06:	60 81       	ld	r22, Z
	uData.sData.byData1 = buffer[1];
   20a08:	71 81       	ldd	r23, Z+1	; 0x01
	uData.sData.byData2 = buffer[2];
	uData.sData.byData3 = buffer[3];

	return uData.uiData;
}
   20a0a:	82 81       	ldd	r24, Z+2	; 0x02
   20a0c:	93 81       	ldd	r25, Z+3	; 0x03
   20a0e:	08 95       	ret

Which is the equivalent of my hand-assembler above. It was generated from the following C code:

/*
 * Copy the four bytes of `data` into buffer[0..3] by storing the value in a
 * union and reading it back one byte at a time.
 *
 * byData0 is the byte at the lowest address of uiData and byData3 the byte
 * at the highest, so the order of the bytes written to `buffer` is the
 * target's native byte order.
 */
void Serialise_uint32_t_union_struct(uint8_t *buffer, uint32_t data)
{
	/* The same four bytes viewed either as one value or as separate bytes */
	union
	{
		uint32_t uiData;
		struct
		{
		    uint8_t byData0;
			uint8_t byData1;
			uint8_t byData2;
			uint8_t byData3;
		} sData;
	} uData;

	/* Store the whole value, then copy it out byte by byte */
	uData.uiData = data;
	buffer[0] = uData.sData.byData0;
	buffer[1] = uData.sData.byData1;
	buffer[2] = uData.sData.byData2;
	buffer[3] = uData.sData.byData3;
}

/*
 * Build a uint32_t from buffer[0..3] by writing the bytes into a union one
 * at a time and then reading the union back as a single value.
 *
 * buffer[0] goes to the byte at the lowest address of uiData and buffer[3]
 * to the byte at the highest, so the bytes are interpreted in the target's
 * native byte order.
 */
uint32_t Deserialise_uint32_t_union_struct(uint8_t* buffer)
{
	/* The same four bytes viewed either as one value or as separate bytes */
	union
	{
		uint32_t uiData;
		struct
		{
		    uint8_t byData0;
			uint8_t byData1;
			uint8_t byData2;
			uint8_t byData3;
		} sData;
	} uData;

	/* Fill in all four bytes so that uiData is fully initialised */
	uData.sData.byData0 = buffer[0];
	uData.sData.byData1 = buffer[1];
	uData.sData.byData2 = buffer[2];
	uData.sData.byData3 = buffer[3];

	return uData.uiData;
}
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

david.prentice wrote:

// NOTE relies on LITTLE-ENDIAN
uint32_t Deserialise_uint32_t_old(uint8_t* array)
{
    return  *((uint32_t *) array);
}

// NOTE relies on LITTLE-ENDIAN
void Serialise_uint32_t_old(uint8_t *array, uint32_t data)
{
    uint32_t *p32 = (uint32_t *)array;
    *p32 = data; 
}

No, I have not tested or simulated this. At a guess the Compiler will produce fairly efficient code. And of course both functions could be written as macros. In which case you do not have to use any non-portable "__inline__" sort of constructs.

David.

Both are the equivalent to my hand-assembler.

00020a10 :

void Serialise_uint32_t_pointer_recast(uint8_t* buffer, uint32_t data)
{
   20a10:	fc 01       	movw	r30, r24
    uint32_t *p32 = (uint32_t *)buffer;
    *p32 = data; 
   20a12:	40 83       	st	Z, r20
   20a14:	51 83       	std	Z+1, r21	; 0x01
   20a16:	62 83       	std	Z+2, r22	; 0x02
   20a18:	73 83       	std	Z+3, r23	; 0x03
}
   20a1a:	08 95       	ret

00020a1c :

uint32_t Deserialise_uint32_t_pointer_recast(uint8_t* buffer)
{
   20a1c:	fc 01       	movw	r30, r24
   20a1e:	60 81       	ld	r22, Z
   20a20:	71 81       	ldd	r23, Z+1	; 0x01
	return *((uint32_t*) buffer);
}
   20a22:	82 81       	ldd	r24, Z+2	; 0x02
   20a24:	93 81       	ldd	r25, Z+3	; 0x03
   20a26:	08 95       	ret

Generated from the following C:

/*
 * Write `data` to the four bytes starting at `buffer` with a single 32-bit
 * store through a reinterpreted pointer. The bytes land in memory in the
 * target's native byte order.
 * NOTE(review): assumes the compiler/target allows a uint8_t buffer to be
 * accessed through a uint32_t pointer (alignment and aliasing) - confirm.
 */
void Serialise_uint32_t_pointer_recast(uint8_t* buffer, uint32_t data)
{
    uint32_t *p32 = (uint32_t *)buffer;
    *p32 = data; 
}

/*
 * Read the four bytes starting at `buffer` as one uint32_t by reinterpreting
 * the byte pointer. The bytes are interpreted in the target's native byte
 * order.
 * NOTE(review): assumes the compiler/target allows a uint8_t buffer to be
 * accessed through a uint32_t pointer (alignment and aliasing) - confirm.
 */
uint32_t Deserialise_uint32_t_pointer_recast(uint8_t* buffer)
{
	return *((uint32_t*) buffer);
}

I have no idea why, when I originally tried them, they were not as good as the byte-shift. I must have changed something in the meantime.

-- Damien

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

The latter function does not compile in ICCAVR. The return should read 'return *(uint32_t *)buffer;'.

void Serialise_uint32_t_pointer_recast(char* buffer, unsigned long data)
(0002) {
(0003)     unsigned long *p32 = (unsigned long *)buffer;
    00099 0158      MOVW	R10,R16
(0004)     *p32 = data;
    0009A 802A      LDD	R2,Y+2
    0009B 803B      LDD	R3,Y+3
    0009C 804C      LDD	R4,Y+4
    0009D 805D      LDD	R5,Y+5
    0009E 01F5      MOVW	R30,R10
    0009F 8220      ST	Z,R2
    000A0 8231      STD	Z+1,R3
    000A1 8242      STD	Z+2,R4
    000A2 8253      STD	Z+3,R5
    000A3 90B9      LD	R11,Y+
    000A4 90A9      LD	R10,Y+
    000A5 9622      ADIW	R28,2
    000A6 9508      RET
(0005) }
(0006) 
(0007) unsigned long Deserialise_uint32_t_pointer_recast(char* buffer)
(0008) {
(0009)    return *(unsigned long *) buffer;
_Deserialise_uint32_t_pointer_recast:
  buffer               --> R16
    000A7 01F8      MOVW	R30,R16
    000A8 8100      LD	R16,Z
    000A9 8111      LDD	R17,Z+1
    000AA 8122      LDD	R18,Z+2
    000AB 8133      LDD	R19,Z+3
    000AC 9508      RET
(0010) } 
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

jayjay1974 wrote:
The latter function does not compile in ICCAVR. The return should read 'return *(uint32_t *)buffer;'.

Correct... my mistake. Fixed and reposted above.

-- Damien.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

OK, I've just changed back from a couple of similar assembler routines to declaring the union-based C functions as "static inline".

The theory was that I thought there was far too much shuffling around of registers to actually do the work I wanted.

Once they were inlined, here's the result:

Original assembler optimisation = 4368 bytes
Inlined union-based functions   = 4304 bytes
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

My experience in trying to generate the smallest possible code is to massage your code until you get what you want, and the resulting code is not always the prettiest. The bad thing is that it is compiler-dependent.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

jayjay1974 wrote:
My experience in trying to generate the smallest possible code is to massage your code until you get what you want and the resulting code is not always the most pretty. Bad thing is that it compiler dependant.

I would say the complete opposite.

If it looks neat and tidy, it probably is clear and efficient.

I note that Damien's code was 4368 bytes.

So he has another 3824 bytes of coding before he has to buy a bigger AVR.

Incidentally, either the use of unions or dereferencing pointers will work fine on any single platform. The original topic was about serialising, which in my mind means being able to re-assemble on another platform. This is where you have to worry about big-endian etc...

David.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

One particular case with ICCAVR when you use a switch statement. Somehow it always generates 16 bit compares even if the switch argument is only a byte wasting lots of flash. If you're tight on flash you have to resort to an if-else-if construct.

Sometimes it take only a minor modification to coerce the compiler to generate smaller code.

Smaller code is not always more efficient in terms of speed.

The endianness is no problem with the union or the explicit shift and or/add solution, but it is with just plain pointer magic.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I am fairly certain that C specifies that the switch statement takes an int argument.

Yes, nested if-else statements are often more efficient.

Ok. When you want to squeeze something into a Tiny2313, you have to think of tricks. But just how often is it that your code just takes 8194 bytes? or just takes 16386 bytes ?

David.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Quote:
I note that Damien's code was 4368 bytes.

This does beg the question why you are trying to reduce the code size. As far as I know there are no 4k AVRs. Saving 64 bytes doesn't seem worth the trouble if you still have thousands to spare.

Regards,
Steve A.

The Board helps those that help themselves.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Koshchi wrote:
Quote:
I note that Damien's code was 4368 bytes.

This does beg the question why you are trying to reduce the code size. As far as I know there are no 4k AVRs. Saving 64 bytes doesn't seem worth the trouble if you still have thousands to spare.

??

There are plenty of 4K AVR's

look at the tiny4xxx devices as well as the mega4x

Writing code is like having sex.... make one little mistake, and you're supporting it for life.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

david.prentice wrote:
I am fairly certain that C specifies that the switch statement takes an int argument.

Yes, nested if-else statements are often more efficient.

Ok. When you want to squeeze something into a Tiny2313, you have to think of tricks. But just how often is it that your code just takes 8194 bytes? or just takes 16386 bytes ?

David.

But on a 8 bit target with a compiler that's non-standard anyway, it would have been simple to change it to generate 8 bit compares instead of 16 bit ones if the argument is only 8 bit anyway.

But I agree that needing to massage your code for either speed or size is not required that often. Still, it can be useful to inspect the generated code, sometimes a small change can make quite a change.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Koshchi wrote:
Quote:
I note that Damien's code was 4368 bytes.

This does beg the question why you are trying to reduce the code size. As far as I know there are no 4k AVRs. Saving 64 bytes doesn't seem worth the trouble if you still have thousands to spare.

It needs to get under 4K - Bootloader space on an xmega16A4 that can't be adjusted. It started at 4900 bytes or thereabouts and I'm picking the low-hanging fruit - especially bits that can be re-used.

Yes, there are small bootloaders out there, but this one is a semi-complete program in itself to prevent bricking of the board, regardless of a failed upload (or worse, a dodgy upload).

-- Damien

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

jayjay1974 wrote:
My experience in trying to generate the smallest possible code is to massage your code until you get what you want and the resulting code is not always the most pretty. Bad thing is that it compiler dependant.

Agreed. See the example in this thread - there is a compiler difference in the generated code for: void Serialise_uint32_t_pointer_recast(char* buffer, unsigned long data).

Personally, I much, much, much prefer optimising for code maintenance :) Speed and size for when I don't have a choice.

I may launch another thread on another optimisation problem when I get the chance which will probably go on this exact topic.

-- Damien

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

damien_d wrote:
OK, I've just changed back from a couple of similar assember routines to declaring the union-based C Functions as "static inline".

The theory was that I though there was far too much shuffing around of registers to actually do the work I wanted.

Once they were inlined, here's the result:

Orignal assembler optimisation = 4368 bytes
Inlined union-based functions  = 4304 bytes


Can we see the result of compilation of a relevant snippet?

JW

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

wek wrote:

Can we see the result of compilation of a relevant snippet?

That aggregate change (4368 -> 4304) was the overall result in changing to static inline in the various functions I have implemented, i.e.

static inline void Serialise_uint16_t(...);
static inline void Serialise_uint32_t(...);

static inline uint32_t Deserialise_uint32_t(...);
static inline uint16_t Deserialise_uint16_t(...);

// etc.

In actual fact, just testing with Serialise_uint32_t (inline vs non-inline) literally makes no difference as it currently stands (4304 stays). In the course of this thread, I have seen it **increase** from 4368 to 4374, but I have also seen it decrease from 4368 -> 4356.

For interest, there are quite a few functions that look something like:

/*
 * Fill `packet` with a "reply stored CRC" message.
 *
 * The header gets the reply packet ID and payload length. The payload is the
 * stored CRC serialised at packet->data, followed by the stored write-cycle
 * count serialised four bytes further on.
 */
void DataPacket_ReplyStoredCRC(SBootloaderPacket *packet)
{
    // Populate the header
    packet->packetID = PACKET_ID_REPLY_STORED_CRC;
    packet->dataLength = PACKET_LENGTH_REPLY_STORED_CRC;

    // Get the stored CRC and number of write cycles.
    // The two arguments must be separated by a comma.
    Serialise_uint32_t(packet->data, CRC_GetStoredCRC());
    Serialise_uint32_t(packet->data + 4, CRC_GetStoredWriteCycles());
}

Which generates (with inlines):


void DataPacket_CreateReplyStoredCRC(SBootloaderPacket *packet)
{
   208a2:	ef 92       	push	r14
   208a4:	ff 92       	push	r15
   208a6:	0f 93       	push	r16
   208a8:	1f 93       	push	r17
   208aa:	7c 01       	movw	r14, r24
    // Populate the header
    packet->packetID = PACKET_ID_REPLY_STORED_CRC;
   208ac:	84 e9       	ldi	r24, 0x94	; 148
   208ae:	f7 01       	movw	r30, r14
   208b0:	80 83       	st	Z, r24
    packet->dataLength = PACKET_LENGTH_REPLY_STORED_CRC;
   208b2:	88 e0       	ldi	r24, 0x08	; 8
   208b4:	90 e0       	ldi	r25, 0x00	; 0
   208b6:	81 83       	std	Z+1, r24	; 0x01
   208b8:	92 83       	std	Z+2, r25	; 0x02

    // Get the stored CRC and number of write cycles
    Serialise_uint32_t(packet->data, CRC_GetStoredCRC());
   208ba:	04 81       	ldd	r16, Z+4	; 0x04
   208bc:	15 81       	ldd	r17, Z+5	; 0x05
   208be:	0f 94 ed 02 	call	0x205da	; 0x205da 
			uint8_t byData3;
		} sData;
	} uData;

	uData.uiData = data;
	buffer[0] = uData.sData.byData0;
   208c2:	f8 01       	movw	r30, r16
   208c4:	60 83       	st	Z, r22
	buffer[1] = uData.sData.byData1;
   208c6:	71 83       	std	Z+1, r23	; 0x01
	buffer[2] = uData.sData.byData2;
   208c8:	82 83       	std	Z+2, r24	; 0x02
	buffer[3] = uData.sData.byData3;
   208ca:	93 83       	std	Z+3, r25	; 0x03
    Serialise_uint32_t(packet->data + 4, CRC_GetStoredWriteCycles());
   208cc:	f7 01       	movw	r30, r14
   208ce:	e4 80       	ldd	r14, Z+4	; 0x04
   208d0:	f5 80       	ldd	r15, Z+5	; 0x05
   208d2:	87 01       	movw	r16, r14
   208d4:	0c 5f       	subi	r16, 0xFC	; 252
   208d6:	1f 4f       	sbci	r17, 0xFF	; 255
   208d8:	0f 94 e8 02 	call	0x205d0	; 0x205d0 
			uint8_t byData3;
		} sData;
	} uData;

	uData.uiData = data;
	buffer[0] = uData.sData.byData0;
   208dc:	f7 01       	movw	r30, r14
   208de:	64 83       	std	Z+4, r22	; 0x04
	buffer[1] = uData.sData.byData1;
   208e0:	f8 01       	movw	r30, r16
   208e2:	71 83       	std	Z+1, r23	; 0x01
	buffer[2] = uData.sData.byData2;
   208e4:	82 83       	std	Z+2, r24	; 0x02
	buffer[3] = uData.sData.byData3;
   208e6:	93 83       	std	Z+3, r25	; 0x03
}
   208e8:	1f 91       	pop	r17
   208ea:	0f 91       	pop	r16
   208ec:	ff 90       	pop	r15
   208ee:	ef 90       	pop	r14
   208f0:	08 95       	ret

000208f2 :
}

And without inlines (overall program 12 bytes more, not necessarily in this function):



void DataPacket_CreateReplyStoredCRC(SBootloaderPacket *packet)
{
   208be:	ef 92       	push	r14
   208c0:	ff 92       	push	r15
   208c2:	0f 93       	push	r16
   208c4:	1f 93       	push	r17
   208c6:	8c 01       	movw	r16, r24
    // Populate the header
    packet->packetID = PACKET_ID_REPLY_STORED_CRC;
   208c8:	84 e9       	ldi	r24, 0x94	; 148
   208ca:	f8 01       	movw	r30, r16
   208cc:	80 83       	st	Z, r24
    packet->dataLength = PACKET_LENGTH_REPLY_STORED_CRC;
   208ce:	88 e0       	ldi	r24, 0x08	; 8
   208d0:	90 e0       	ldi	r25, 0x00	; 0
   208d2:	81 83       	std	Z+1, r24	; 0x01
   208d4:	92 83       	std	Z+2, r25	; 0x02

    // Get the stored CRC and number of write cycles
    Serialise_uint32_t(packet->data, CRC_GetStoredCRC());
   208d6:	e4 80       	ldd	r14, Z+4	; 0x04
   208d8:	f5 80       	ldd	r15, Z+5	; 0x05
   208da:	0f 94 ed 02 	call	0x205da	; 0x205da 
   208de:	ab 01       	movw	r20, r22
   208e0:	bc 01       	movw	r22, r24
   208e2:	c7 01       	movw	r24, r14
   208e4:	0f 94 e9 04 	call	0x209d2	; 0x209d2 
    Serialise_uint32_t(packet->data + 4, CRC_GetStoredWriteCycles());
   208e8:	f8 01       	movw	r30, r16
   208ea:	04 81       	ldd	r16, Z+4	; 0x04
   208ec:	15 81       	ldd	r17, Z+5	; 0x05
   208ee:	0c 5f       	subi	r16, 0xFC	; 252
   208f0:	1f 4f       	sbci	r17, 0xFF	; 255
   208f2:	0f 94 e8 02 	call	0x205d0	; 0x205d0 
   208f6:	ab 01       	movw	r20, r22
   208f8:	bc 01       	movw	r22, r24
   208fa:	c8 01       	movw	r24, r16
   208fc:	0f 94 e9 04 	call	0x209d2	; 0x209d2 
}
   20900:	1f 91       	pop	r17
   20902:	0f 91       	pop	r16
   20904:	ff 90       	pop	r15
   20906:	ef 90       	pop	r14
   20908:	08 95       	ret

0002090a :

}

Result of function above (assuming I've calculated correctly):
With inlines: 80 bytes
Without inlines: 88 bytes (76 bytes in this function plus the 12-byte out-of-line Serialise_uint32_t).

I think I'm better off looking elsewhere for savings.

-- Damien

EDIT: Added specific example.