Hello everyone.
Here is something that I found on one of my programming endeavor's. There was a task where I had to clear/fill an array with zero's as fast as possible for some refreshing purposes and to avoid a race condition. As I program only in C, I knew the fastest way would be to do this.
uint8_t data[64]; data[0] = 0; data[1] = 0; data[2] = 0; data[3] = 0; data[4] = 0; data[5] = 0; data[6] = 0; data[7] = 0; // . // . // . // and so on.
My compiler was AVR-GCC. Now looking at the generated assembly in simulator.
data[0] = 0; data[1] = 0; data[2] = 0; data[3] = 0; data[4] = 0; data[5] = 0; data[6] = 0; data[7] = 0; 00000054 STD Y+0,R1 Store indirect with displacement 00000055 STD Y+1,R1 Store indirect with displacement 00000056 STD Y+2,R1 Store indirect with displacement 00000057 STD Y+3,R1 Store indirect with displacement 00000058 STD Y+4,R1 Store indirect with displacement 00000059 STD Y+5,R1 Store indirect with displacement 0000005A STD Y+6,R1 Store indirect with displacement 0000005B STD Y+7,R1 Store indirect with displacement // . // . // . // and so on.
The compiler uses STD instruction which takes 1 word of program space and 2 clock cycles to complete. All fine until here. (this is about as fast as it can get on megaAVR)
But if I increase the array size to more than 64 bytes. Now how would complier handle this one? Because due to limitation of STD instruction, which can only add up to 63 to the address that it has.
To see let's increase the array size to 256 bytes. Here is dummy program. (I am writing statements horizontally to avoid huge scrolling)
static uint8_t data[256]; int main(void) { while (1) { for(uint16_t x = 0; x < 256; x++) // Filling the array with 0xFF { data[x] = 0xff; } data[0] = 0; data[1] = 0; data[2] = 0; data[3] = 0; data[4] = 0; data[5] = 0; data[6] = 0; data[7] = 0; data[8] = 0; data[9] = 0; data[10] = 0; data[11] = 0; data[12] = 0; data[13] = 0; data[14] = 0; data[15] = 0; data[16] = 0; data[17] = 0; data[18] = 0; data[19] = 0; data[20] = 0; data[21] = 0; data[22] = 0; data[23] = 0; data[24] = 0; data[25] = 0; data[26] = 0; data[27] = 0; data[28] = 0; data[29] = 0; data[30] = 0; data[31] = 0; data[32] = 0; data[33] = 0; data[34] = 0; data[35] = 0; data[36] = 0; data[37] = 0; data[38] = 0; data[39] = 0; data[40] = 0; data[41] = 0; data[42] = 0; data[43] = 0; data[44] = 0; data[45] = 0; data[46] = 0; data[47] = 0; data[48] = 0; data[49] = 0; data[50] = 0; data[51] = 0; data[52] = 0; data[53] = 0; data[54] = 0; data[55] = 0; data[56] = 0; data[57] = 0; data[58] = 0; data[59] = 0; data[60] = 0; data[61] = 0; data[62] = 0; data[63] = 0; data[64] = 0; data[65] = 0; data[66] = 0; data[67] = 0; data[68] = 0; data[69] = 0; data[70] = 0; data[71] = 0; data[72] = 0; data[73] = 0; data[74] = 0; data[75] = 0; data[76] = 0; data[77] = 0; data[78] = 0; data[79] = 0; data[80] = 0; data[81] = 0; data[82] = 0; data[83] = 0; data[84] = 0; data[85] = 0; data[86] = 0; data[87] = 0; data[88] = 0; data[89] = 0; data[90] = 0; data[91] = 0; data[92] = 0; data[93] = 0; data[94] = 0; data[95] = 0; data[96] = 0; data[97] = 0; data[98] = 0; data[99] = 0; data[100] = 0; data[101] = 0; data[102] = 0; data[103] = 0; data[104] = 0; data[105] = 0; data[106] = 0; data[107] = 0; data[108] = 0; data[109] = 0; data[110] = 0; data[111] = 0; data[112] = 0; data[113] = 0; data[114] = 0; data[115] = 0; data[116] = 0; data[117] = 0; data[118] = 0; data[119] = 0; data[120] = 0; data[121] = 0; data[122] = 0; data[123] = 0; data[124] = 0; data[125] = 0; data[126] = 0; data[127] = 0; data[128] = 0; data[129] = 0; data[130] = 0; data[131] = 0; data[132] = 0; data[133] = 0; data[134] = 0; data[135] = 0; data[136] = 0; data[137] = 0; data[138] = 0; data[139] = 0; data[140] = 0; data[141] = 0; data[142] = 0; data[143] = 0; data[144] = 0; data[145] = 0; data[146] = 0; data[147] = 0; data[148] = 0; data[149] = 0; data[150] = 0; data[151] = 0; data[152] = 0; data[153] = 0; data[154] = 0; data[155] = 0; data[156] = 0; data[157] = 0; data[158] = 0; data[159] = 0; data[160] = 0; data[161] = 0; data[162] = 0; data[163] = 0; data[164] = 0; data[165] = 0; data[166] = 0; data[167] = 0; data[168] = 0; data[169] = 0; data[170] = 0; data[171] = 0; data[172] = 0; data[173] = 0; data[174] = 0; data[175] = 0; data[176] = 0; data[177] = 0; data[178] = 0; data[179] = 0; data[180] = 0; data[181] = 0; data[182] = 0; data[183] = 0; data[184] = 0; data[185] = 0; data[186] = 0; data[187] = 0; data[188] = 0; data[189] = 0; data[190] = 0; data[191] = 0; data[192] = 0; data[193] = 0; data[194] = 0; data[195] = 0; data[196] = 0; data[197] = 0; data[198] = 0; data[199] = 0; data[200] = 0; data[201] = 0; data[202] = 0; data[203] = 0; data[204] = 0; data[205] = 0; data[206] = 0; data[207] = 0; data[208] = 0; data[209] = 0; data[210] = 0; data[211] = 0; data[212] = 0; data[213] = 0; data[214] = 0; data[215] = 0; data[216] = 0; data[217] = 0; data[218] = 0; data[219] = 0; data[220] = 0; data[221] = 0; data[222] = 0; data[223] = 0; data[224] = 0; data[225] = 0; data[226] = 0; data[227] = 0; data[228] = 0; data[229] = 0; data[230] = 0; data[231] = 0; data[232] = 0; data[233] = 0; data[234] = 0; data[235] = 0; data[236] = 0; data[237] = 0; data[238] = 0; data[239] = 0; data[240] = 0; data[241] = 0; data[242] = 0; data[243] = 0; data[244] = 0; data[245] = 0; data[246] = 0; data[247] = 0; data[248] = 0; data[249] = 0; data[250] = 0; data[251] = 0; data[252] = 0; data[253] = 0; data[254] = 0; data[255] = 0; for(uint16_t x = 0; x < 256; x++) // Ignore, this is just so that compiler won't optimize everything away. { GPIOR0 = data[x]; } } }
And the assembly :-
data[56] = 0; data[57] = 0; data[58] = 0; data[59] = 0; data[60] = 0; data[61] = 0; data[62] = 0; data[63] = 0; 000000B9 STD Y+56,R1 Store indirect with displacement 000000BA STD Y+57,R1 Store indirect with displacement 000000BB STD Y+58,R1 Store indirect with displacement 000000BC STD Y+59,R1 Store indirect with displacement 000000BD STD Y+60,R1 Store indirect with displacement 000000BE STD Y+61,R1 Store indirect with displacement 000000BF STD Y+62,R1 Store indirect with displacement 000000C0 STD Y+63,R1 Store indirect with displacement data[64] = 0; data[65] = 0; data[66] = 0; data[67] = 0; data[68] = 0; data[69] = 0; data[70] = 0; data[71] = 0; 000000C1 MOVW R26,R12 Copy register pair 000000C2 ST X,R1 Store indirect 000000C3 MOVW R26,R14 Copy register pair 000000C4 ST X,R1 Store indirect 000000C5 MOVW R26,R16 Copy register pair 000000C6 ST X,R1 Store indirect 000000C7 MOVW R26,R22 Copy register pair 000000C8 ST X,R1 Store indirect 000000C9 MOVW R26,R20 Copy register pair 000000CA ST X,R1 Store indirect 000000CB MOVW R26,R18 Copy register pair 000000CC ST X,R1 Store indirect 000000CD MOVW R26,R10 Copy register pair 000000CE ST X,R1 Store indirect 000000CF MOVW R26,R8 Copy register pair 000000D0 ST X,R1 Store indirect // . // . // . // and so on.
This program on ATmega328P takes :-
Notice how after clearing first 64 bytes the compiler goes bonkers and generates messy assembly (and we loose our taking 2 clock cycles, clearing per byte). And this is with optimization level -O1
Let's bump up the optimization to -O3 and see.
data[56] = 0; data[57] = 0; data[58] = 0; data[59] = 0; data[60] = 0; data[61] = 0; data[62] = 0; data[63] = 0; 000000C0 STS 0x0138,R1 Store direct to data space 000000C2 STS 0x0139,R1 Store direct to data space 000000C4 STS 0x013A,R1 Store direct to data space 000000C6 STS 0x013B,R1 Store direct to data space 000000C8 STS 0x013C,R1 Store direct to data space 000000CA STS 0x013D,R1 Store direct to data space 000000CC STS 0x013E,R1 Store direct to data space 000000CE STS 0x013F,R1 Store direct to data space data[64] = 0; data[65] = 0; data[66] = 0; data[67] = 0; data[68] = 0; data[69] = 0; data[70] = 0; data[71] = 0; 000000D0 STS 0x0140,R1 Store direct to data space 000000D2 STS 0x0141,R1 Store direct to data space 000000D4 STS 0x0142,R1 Store direct to data space 000000D6 STS 0x0143,R1 Store direct to data space 000000D8 STS 0x0144,R1 Store direct to data space 000000DA STS 0x0145,R1 Store direct to data space 000000DC STS 0x0146,R1 Store direct to data space 000000DE STS 0x0147,R1 Store direct to data space data[72] = 0; data[73] = 0; data[74] = 0; data[75] = 0; data[76] = 0; data[77] = 0; data[78] = 0; data[79] = 0; 000000E0 STS 0x0148,R1 Store direct to data space 000000E2 STS 0x0149,R1 Store direct to data space 000000E4 STS 0x014A,R1 Store direct to data space 000000E6 STS 0x014B,R1 Store direct to data space 000000E8 STS 0x014C,R1 Store direct to data space 000000EA STS 0x014D,R1 Store direct to data space 000000EC STS 0x014E,R1 Store direct to data space 000000EE STS 0x014F,R1 Store direct to data space // . // . // . // and so on.
And this takes even more memory :-
The compiler just went STS all the way. Notice now STS takes 2 words of program memory and 2 clock cycles.
This got me thinking that I should use the ST instruction. Since we are just placing zero in consecutive memory addresses. That would be perfect. Why compiler is not using ST instead of STS? And the good thing about it that it also takes 2 clock cycles but takes 1 word of program memory. It's a win win situation.
It's equivalent of doing STD instruction over the whole array which that compiler can't do. (1 word instruction for clearing 1 byte in SRAM in 2 clock cycles)
I tried writing code in every different way I could in C, so that compiler would use the ST instruction. But nope, that won't happen.
Then finally I threw some inline assembly in C. (never written assembly before and hoped for the best)
static uint8_t data[256]; int main(void) { while (1) { for(uint16_t x = 0; x < 256; x++) { data[x] = 0xff; } __asm__ __volatile__( "push r30 \n\t" "push r31 \n\t" "push r16 \n\t" "ldi r30,lo8(data) \n\t" "ldi r31,hi8(data) \n\t" "ldi r16,0 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "st z+,r16 \n\t" "pop r16 \n\t" "pop r30 \n\t" "pop r31 \n\t" ); for(uint16_t x = 0; x < 256; x++) // Ignore, this is just so that compiler won't optimize everything away. { GPIOR0 = data[x]; } } }
And this takes :-
732 bytes . That's so much better than before. So I think compiler does not like generating assembly code where there is a very long repetition of same instruction even though knowing that, it is best for speed and also saving some space at the same time.