After investigating some issues with a board based on a SAME54P20 (Cortex-M4), I realized that loops need to be aligned to 16-byte boundaries to ensure optimal execution.
I found that the _delay_cycles() function in hpl_core_m4.c is affected, as it only aligns the loop to an 8-byte boundary. Depending on the rest of the code in the project, the function either executes as expected or produces much longer delays. This affects the functions in hal_delay and any other code using _delay_cycles(), delay_ms(), or delay_us().
This can be fixed (at least in gcc) by changing the function from:
/**
 * \brief Delay loop to delay n number of cycles
 *
 * Busy-waits in a two-instruction inline-assembly loop: "subs" decrements the
 * counter by one and updates the flags, and "bhi" branches back to the label
 * for as long as the unsigned "higher" condition holds after the subtraction.
 *
 * In this version the GNU-syntax branches place the loop label on an 8-byte
 * boundary (".align 3"); the __CC_ARM and __ICCARM__ branches contain no
 * alignment directive.
 *
 * \param[in] hw     Not used by this implementation; explicitly discarded.
 * \param[in] cycles Loop counter. NOTE(review): the number of CPU clock cycles
 *                   consumed per loop iteration is not visible here - confirm
 *                   against the callers that compute this value.
 */
void _delay_cycles(void *const hw, uint32_t cycles)
{
/* The whole body is compiled out when _UNIT_TEST_ is defined. */
#ifndef _UNIT_TEST_
(void)hw;
/* cycles is never named in the r1-based asm branches below, so it is cast to
 * void here. Those branches presumably rely on the second argument arriving
 * in r1 - TODO confirm against the calling convention in use. */
(void)cycles;
#if defined(__GNUC__) && (__ARMCOMPILER_VERSION > 6000000) /* Keil MDK with ARM Compiler 6 */
/* ".align 3": align the loop label to 2^3 = 8 bytes. */
__asm(".align 3 \n"
"__delay:\n"
"subs r1, r1, #1\n"
"bhi __delay\n");
#elif defined __GNUC__
/* Same loop as above, temporarily switching the assembler to unified syntax
 * and back to divided syntax afterwards. ".align 3" = 8-byte alignment. */
__asm(".syntax unified\n"
".align 3 \n"
"__delay:\n"
"subs r1, r1, #1\n"
"bhi __delay\n"
".syntax divided");
#elif defined __CC_ARM
/* This branch refers to the parameter by its C name instead of r1 and has no
 * alignment directive. */
__asm("__delay:\n"
"subs cycles, cycles, #1\n"
"bhi __delay\n");
#elif defined __ICCARM__
/* IAR branch: r1-based loop with no alignment directive. */
__asm("__delay:\n"
"subs r1, r1, #1\n"
"bhi.n __delay\n");
#endif
#endif
}
to:
/**
 * \brief Delay loop to delay n number of cycles
 *
 * Busy-waits in a two-instruction inline-assembly loop: "subs" decrements the
 * counter by one and updates the flags, and "bhi" branches back to the label
 * for as long as the unsigned "higher" condition holds after the subtraction.
 *
 * In this version the GNU-syntax branches place the loop label on a 16-byte
 * boundary (".align 4"); the __CC_ARM and __ICCARM__ branches contain no
 * alignment directive.
 *
 * \param[in] hw     Not used by this implementation; explicitly discarded.
 * \param[in] cycles Loop counter. NOTE(review): the number of CPU clock cycles
 *                   consumed per loop iteration is not visible here - confirm
 *                   against the callers that compute this value.
 */
void _delay_cycles(void *const hw, uint32_t cycles)
{
/* The whole body is compiled out when _UNIT_TEST_ is defined. */
#ifndef _UNIT_TEST_
(void)hw;
/* cycles is never named in the r1-based asm branches below, so it is cast to
 * void here. Those branches presumably rely on the second argument arriving
 * in r1 - TODO confirm against the calling convention in use. */
(void)cycles;
#if defined(__GNUC__) && (__ARMCOMPILER_VERSION > 6000000) /* Keil MDK with ARM Compiler 6 */
/* ".align 4": align the loop label to 2^4 = 16 bytes. */
__asm(".align 4 \n"
"__delay:\n"
"subs r1, r1, #1\n"
"bhi __delay\n");
#elif defined __GNUC__
/* Same loop as above, temporarily switching the assembler to unified syntax
 * and back to divided syntax afterwards. ".align 4" = 16-byte alignment. */
__asm(".syntax unified\n"
".align 4 \n"
"__delay:\n"
"subs r1, r1, #1\n"
"bhi __delay\n"
".syntax divided");
#elif defined __CC_ARM
/* This branch refers to the parameter by its C name instead of r1 and has no
 * alignment directive. */
__asm("__delay:\n"
"subs cycles, cycles, #1\n"
"bhi __delay\n");
#elif defined __ICCARM__
/* IAR branch: r1-based loop with no alignment directive. */
__asm("__delay:\n"
"subs r1, r1, #1\n"
"bhi.n __delay\n");
#endif
#endif
}