Help locating newlib source code

Go To Last Post
13 posts / 0 new
Author
Message
#1
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I'm tracking down a problem in my project from code generated by Atmel START using the USBHS module for a SAM E70 chip.  It's hard faulting inside the hpl_usbhs.c during a call to the memcpy() library function.  The weird part is it only fails with newlib but not newlib-nano.  I'm suspecting a byte alignment issue and want to step through it with the debugger.  But of course, I don't have a local copy of the library source code.

 

Searching a bit I know newlib is open source and can be found at sourceware.org/newlib.  But I don't know exactly which revision was included with Atmel Studio.  Any promising google searches run into dead Atmel website links.

 

So my question: what revision of newlib is included with Atmel Studio 7.0.2397?  Or better yet, I'd like a copy of the source code used to build both the newlib and newlib-nano libraries provided with AS7.0.2397.

 

Thanks!

This topic has a solution.

Last Edited: Wed. Mar 11, 2020 - 08:10 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Digging more I've found _newlib_version.h located at "C:\Program Files (x86)\Atmel\Studio\7.0\toolchain\arm\arm-gnu-toolchain\arm-none-eabi\include".  It's showing 2.5.0 but no further details

 

/* _newlib_version.h.  Generated from _newlib_version.hin by configure.  */
/* Version macros for internal and downstream use. */
#ifndef _NEWLIB_VERSION_H__
#define _NEWLIB_VERSION_H__ 1

#define _NEWLIB_VERSION "2.5.0"
#define __NEWLIB__ 2
#define __NEWLIB_MINOR__ 5
#define __NEWLIB_PATCHLEVEL__ 0

#endif /* !_NEWLIB_VERSION_H__ */

 

Here's the versions available via ftp://sourceware.org/pub/newlib

 

 

Any guidance to narrow it down further 2.5.0.2017XXXX ??

Last Edited: Tue. Mar 10, 2020 - 10:51 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

So I've found a smoking gun but have run out of time today to dig deeper.  I pulled down the newlib-2.5.0.tar.gz, extracted memcpy.c, cut and pasted the contents into hpl_usbhs.c while renaming the function to hpl_usbhs_memcpy, and surrounded the function with these #pragma directives to change the optimization level:

 

#pragma GCC push_options
#pragma GCC optimize("O3")      // fails, causes a hard fault
//#pragma GCC optimize("O2")      // works, no hard fault


... insert the contents of the newlib memcpy() function here ...


#pragma GCC pop_options


 

Here's the entire function

 

#pragma GCC push_options
#pragma GCC optimize("O3")      // fails, causes a hard fault
//#pragma GCC optimize("O2")      // works, no hard fault

/*
FUNCTION
        <<hpl_usbhs_memcpy>>---copy memory regions

ANSI_SYNOPSIS
        #include <string.h>
        void* hpl_usbhs_memcpy(void *restrict <[out]>, const void *restrict <[in]>,
                     size_t <[n]>);

TRAD_SYNOPSIS
        #include <string.h>
        void *hpl_usbhs_memcpy(<[out]>, <[in]>, <[n]>
        void *<[out]>;
        void *<[in]>;
        size_t <[n]>;

DESCRIPTION
        This function copies <[n]> bytes from the memory region
        pointed to by <[in]> to the memory region pointed to by
        <[out]>.

        If the regions overlap, the behavior is undefined.

RETURNS
        <<hpl_usbhs_memcpy>> returns a pointer to the first byte of the <[out]>
        region.

PORTABILITY
<<hpl_usbhs_memcpy>> is ANSI C.

<<hpl_usbhs_memcpy>> requires no supporting OS subroutines.

QUICKREF
        hpl_usbhs_memcpy ansi pure
	*/

#include <_ansi.h>
#include <string.h>

/* Nonzero if either X or Y is not aligned on a "long" boundary.  */
#define UNALIGNED(X, Y) \
  (((long)X & (sizeof (long) - 1)) | ((long)Y & (sizeof (long) - 1)))

/* How many bytes are copied each iteration of the 4X unrolled loop.  */
#define BIGBLOCKSIZE    (sizeof (long) << 2)

/* How many bytes are copied each iteration of the word copy loop.  */
#define LITTLEBLOCKSIZE (sizeof (long))

/* Threshhold for punting to the byte copier.  */
#define TOO_SMALL(LEN)  ((LEN) < BIGBLOCKSIZE)

/* Local copy of newlib's memcpy(), renamed so it can be compiled under the
   surrounding #pragma GCC optimize setting and stepped through in the debugger.

   Copies len0 bytes from src0 to dst0 and returns dst0.

   Two implementations are selected at compile time:
     - PREFER_SIZE_OVER_SPEED or __OPTIMIZE_SIZE__ defined: plain byte loop.
     - otherwise: copy whole longs when the length is at least BIGBLOCKSIZE
       and both pointers are long-aligned, then finish byte by byte.

   _PTR, _DEFUN, _AND and _CONST come from <_ansi.h>; presumably they expand
   to an ordinary ANSI prototype -- confirm against the installed header.  */
_PTR
_DEFUN (hpl_usbhs_memcpy, (dst0, src0, len0),
	_PTR __restrict dst0 _AND
	_CONST _PTR __restrict src0 _AND
	size_t len0)
{
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
  /* Size-optimized variant: one byte per iteration, no alignment logic.  */
  char *dst = (char *) dst0;
  char *src = (char *) src0;

  /* Remember the original destination; it is the return value.  */
  _PTR save = dst0;

  while (len0--)
    {
      *dst++ = *src++;
    }

  return save;
#else
  /* Speed-optimized variant.  */
  char *dst = dst0;
  _CONST char *src = src0;
  long *aligned_dst;
  _CONST long *aligned_src;

  /* If the size is small, or either SRC or DST is unaligned,
     then punt into the byte copy loop.  This should be rare.  */
  if (!TOO_SMALL(len0) && !UNALIGNED (src, dst))
    {
      aligned_dst = (long*)dst;
      aligned_src = (long*)src;

      /* Copy 4X long words at a time if possible.  */
      while (len0 >= BIGBLOCKSIZE)
        {
          *aligned_dst++ = *aligned_src++;
          *aligned_dst++ = *aligned_src++;
          *aligned_dst++ = *aligned_src++;
          *aligned_dst++ = *aligned_src++;
          len0 -= BIGBLOCKSIZE;
        }

      /* Copy one long word at a time if possible.  */
      while (len0 >= LITTLEBLOCKSIZE)
        {
          *aligned_dst++ = *aligned_src++;
          len0 -= LITTLEBLOCKSIZE;
        }

       /* Pick up any residual with a byte copier.  */
      dst = (char*)aligned_dst;
      src = (char*)aligned_src;
    }
    
    
    
  //////////////////////////////////////////////////////////////////////////
  /* Byte-copy tail.  This is the only path taken when the pointers are
     unaligned or the length is below BIGBLOCKSIZE, and it is the loop the
     author has flagged as faulting when built with -O3.  */
  #warning "this code snipet fails with optimize -O3 but works with -O2, investigate further"
  while (len0--)
    *dst++ = *src++;
  //////////////////////////////////////////////////////////////////////////


  return dst0;
#endif /* not PREFER_SIZE_OVER_SPEED */
}

#pragma GCC pop_options

 

Next up is to compare the compiler generated assembly code.  This is exactly why I tend to leave optimizations off in production code, what a pain in the arse...

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

With that level of conditional abstraction, won't your "stepping" need to be at the assembly level anyway?

 

Are you sure you have the right source?  The source has a lot of overriding that happens, and I'd expect a CM7 chip to have a .S file for memcpy()

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

westfw wrote:

 

Are you sure you have the right source?

 

 

No I'm not.  When I first created this thread I had no source files.  Some sleuthing and poking around the Atmel directory on my PC led to the open source newlib-2.5.0 git repo on the web.  The code above is memcpy.c from that code set.  I really want the exact files Atmel used along with their setup during compile time instead.  It's open source yes, but I can't find a link anywhere on Microchip's website to it.

 

Any hints where I can find Atmel's source files for libc.a/libm.a included with  AS7.0.2397?

 

 

 

 

Last Edited: Wed. Mar 11, 2020 - 12:52 PM
This reply has been marked as the solution. 
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Answering my own questions here.

 

1.  Atmel Studio v7.0.2397 uses ARM GNU Toolchain 6.3.1.508.  See the "arm-gnu-toolchain-6.3.1.508-readme.pdf" file in the Atmel install directory "C:\Program Files (x86)\Atmel\Studio\7.0\toolchain\arm\arm-gnu-toolchain"

 

2.  Deeper in the toolchain directory is a "readme.txt" file at "C:\Program Files (x86)\Atmel\Studio\7.0\toolchain\arm\arm-gnu-toolchain\share\doc\gcc-arm-none-eabi" with the following excerpt:

 

* C Libraries usage *

 

This toolchain is released with two prebuilt C libraries based on newlib:
one is the standard newlib and the other is newlib-nano for code size.
To distinguish them, we rename the size optimized libraries as:

 

  libc.a --> libc_nano.a
  libg.a --> libg_nano.a

 

To use newlib-nano, users should provide additional gcc compile and link time
option:
 --specs=nano.specs

 

 

3.  In that same directory are release notes in the "release.txt" file that has this:

 

Release notes for
*************************************************
GNU Tools for ARM Embedded Processors 6
                               - Q2 2017
*************************************************

 

<snip>

 

  * newlib and newlib-nano :
    git://sourceware.org/git/newlib-cygwin.git commit 0d79b021a4ec4e6b9aa1a9f6db0e29a137005ce7

 

 

4. From the release notes it's clear that the Atmel supplied toolchain is based off "GNU Arm Embedded Toolchain: 6-2017-q2-update".  This has all the precompiled binaries like gcc, gdb, etc, include files, and the precompiled newlib libraries (libc.a, libc_nano.a) located here:  https://developer.arm.com/tools-and-software/open-source-software/developer-tools/gnu-toolchain/gnu-rm/downloads.  What's not included in this download is the .c source files for newlib.  Only the precompiled libraries are included.

 

5.  The source for newlib and newlib-nano comes from sourceware.org/newlib, specifically the git commit SHA-1 0d79b021a4ec4e6b9aa1a9f6db0e29a137005ce7 listed above.  <=== THIS IS THE ANSWER I NEEDED

 

6.  Compiling newlib and newlib-nano to match Atmel's distribution is a bit difficult.  I didn't bother following through as I only needed the source files to pick out a few files for my project.  But if you want to experiment start here for clues:  https://stackoverflow.com/questions/50154137/how-to-rebuild-newlib-and-newlib-nano-of-gnu-arm-embedded-toolchain.  You need to build it twice, once for newlib, then again for newlib-nano

 

 

 

Last Edited: Wed. Mar 11, 2020 - 08:26 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

And for those of you playing along at home, the whole reason for this journey was to trace down a hard fault in the standard C library function memcpy() when called from within Atmel START's SAM E70 hpl_usbhs.c generated source code.  I've been able to duplicate the problem calling a "simple" version of memcpy() for debugging, called hpl_usbhs_memcpy() so that it doesn't collide with the memcpy() inside the library.  Here's the code:

 

#pragma GCC push_options
#pragma GCC optimize("O3")      // fails, causes a hard fault
//#pragma GCC optimize("O2")      // works, no hard fault

#include <stdint.h>
#include <stdlib.h>

/*
 * Minimal byte-wise stand-in for memcpy(), used to reproduce the hard fault
 * under the surrounding #pragma GCC optimize setting.
 *
 * Copies n bytes from src to dest, one byte at a time in ascending order.
 * The regions must not overlap.
 *
 * Returns dest (the start of the destination region), matching the
 * standard memcpy() contract.
 */
void *hpl_usbhs_memcpy(void *dest, const void *src, size_t n)
{
    const uint8_t *s = src;     /* keep the source const-qualified */
    uint8_t *d = dest;

    //////////////////////////////////////////////////////////////////////////
    #warning "this code snipet fails with optimize -O3 but works with -O2, investigate further"
    while (n--)
        *d++ = *s++;
    //////////////////////////////////////////////////////////////////////////

    /* d now points one past the last byte written; callers expect dest. */
    return dest;
}

#pragma GCC pop_options

 

What happens is the gcc compiler is generating code with -O3 optimizations that causes a hard fault.  But with -O2 optimizations it works fine.  I only found it because I happened to switch my project to the full blown newlib from newlib-nano to include long long printf support.  It looks like Atmel precompiles newlib with -O3 but newlib-nano with -Os.

 

Still digging as to why this hard faults, stay tuned.

Last Edited: Wed. Mar 11, 2020 - 08:28 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Could you provide the compile command for your demo version of the code?

 

Also, do you happen to know for sure which binary libc.a your code ends up linking?

 

Last Edited: Thu. Mar 12, 2020 - 06:04 AM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

So I've found the root of the problem.  The Atmel supplied START file hpl_usbhs.c calls memcpy() with a destination address of USBHS RAM.  When linking in the non-size optimized precompiled library supplied with Atmel Studio (libc.a aka newlib), it causes a Hard Fault due to memory access misalignment.  It looks like libc.a was built using -O3.  But if you link in the size optimized library (libc_nano.a aka newlib-nano), it works without issues presumably because it was compiled with -Os.

 

Here's the test code confirming everything.  Feel free to use it as needed.

 

Bottom line - don't use the C Standard Library functions with USBHS RAM - I'm talking to you Atmel :)

 

//////////////////////////////////////////////////////////////////////////
// USBHS RAM memcpy() testing with misaligned pointers and various compiler
// optimization levels on a Atmel/Microchip SAM E70 processor
//
// (c) ScottMN 2020 - released to the public domain without restriction
//
// This example program creates a replacement function for the Standard C Library
// memcpy() function:
//
//      #pragma GCC optimize("O2")  // very aggressive -O2 compiler optimization level
//      //#pragma GCC optimize("O3")  // most aggressive -O3 compiler optimization level
//      void *my_memcpy(void *dest, const void *src, size_t n)
//
// Compile and run the program with the very aggressive -O2 compiler option, then repeat
// with the most aggressive -O3 compiler option.
//
// The main() function calls the replacement memcpy() function while varying
// the input parameters <src> and <n>.  The destination address is always USBHS RAM
// on the SAM E70 processor.  Here are the scenarios:
//      1.  success: -O2 with any value for <n> and any alignment for <src>
//      2.  success: -O3 with <n> less than or equal to 11
//      3.  success: -O3 with <n> greater than or equal to 12 and <src> aligned to 32-bits
//      4.  HARD FAULT: -O3 with <n> greater than or equal to 12 and <src> NOT aligned to 32-bits
//
// As shown above, the very aggressive -O2 optimization option has no unexpected
// side effects when copying byte data to USBHS RAM.  On the other hand, the
// most aggressive -O3 optimization option causes a Hard Fault when byte copying
// 12 or more bytes to USBHS RAM and the source address is not 32-bit aligned.
//
// Looking at the compiler generated assembly code, the -O3 option both unravels
// the loop three times and uses 32-bit transfers.  This is where the 12 byte
// limit and non-32-bit aligned issues arise.
//
// The regular precompiled Standard C Library (libc.a aka newlib) supplied with
// Atmel Studio v7.0.2397 was built using the -O3 option.  The size optimized version
// supplied by Atmel (libc_nano.a aka newlib-nano) does not have this problem.
// You can select which library to link against by checking or unchecking the
// "Use size optimized library (--specs=nano.specs)" checkbox in the project properties.
//
// Bottom line:  Do not use the Standard C Library function calls with USBHS RAM
// memory addresses if the vendor supplied pre-compiled libraries libc.a or libc_nano.a
// were built with the -O3 optimization level.
//////////////////////////////////////////////////////////////////////////

#include <atmel_start.h>
#include <stdio.h>
#include <string.h>

#pragma GCC push_options

//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
// SELECT O2 or O3, recompile, and retest

#pragma GCC optimize("O2")  // very aggressive -O2 compiler optimization level
//#pragma GCC optimize("O3")  // most aggressive -O3 compiler optimization level

//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
/*
 * Byte-by-byte replacement for memcpy(), compiled at whichever optimization
 * level the preceding #pragma GCC optimize selects.
 *
 * Copies n bytes from src to dest in ascending address order and returns
 * dest. The regions must not overlap.
 */
void *my_memcpy(void *dest, const void *src, size_t n)
{
    char *out = dest;
    const char *in = src;
    size_t i;

    for (i = 0; i != n; ++i) {
        out[i] = in[i];
    }

    return dest;
}
#pragma GCC pop_options

// replacement Hard Fault handler (overrides the weak symbol in the vector table)
// Hard Fault handler that replaces the weak default in the vector table.
// It never returns: execution parks in an empty loop so the fault can be
// inspected with a debugger.
void HardFault_Handler(void)
{
    for (;;) {
        // put a breakpoint here during a debug session
    }
}

#define USBHS_RAM_ADDR              0xA0100000u

uint8_t my_buffer[128] __attribute__ ((aligned (4)));

/*
 * Test driver: copies byte runs from my_buffer into USBHS RAM with my_memcpy().
 *
 * After atmel_start_init(), my_buffer is filled with 0x55 and used as the
 * source; the destination is always USBHS_RAM_ADDR. Two passes are made,
 * n = 0..11 and n = 12..20, each copying from source offsets 0 through 4.
 * my_buffer is declared 4-byte aligned, so offsets 0 and 4 are 32-bit
 * aligned and offsets 1..3 are not.
 *
 * Progress is printed with printf(); a "PASSED!" line is only reached if no
 * Hard Fault occurred in that pass. The function never returns.
 */
int main(void)
{
    char *ps, *pd;
    size_t n;

    atmel_start_init();

    printf("Hard Fault testing memcpy() to USBHS RAM\r\n");
    printf("\r\n");

    // setup the source buffer
    memset(my_buffer, 0x55, sizeof(my_buffer)); // 0x55 is an alternating bit pattern
    ps = (char *)my_buffer;
    // %lX takes an unsigned long: convert via uintptr_t so the argument type
    // matches the format on every toolchain, not only where uint32_t is long
    printf(" src=0x%08lX\r\n", (unsigned long)(uintptr_t)ps);

    // setup the destination pointer
    pd = (char *)USBHS_RAM_ADDR;
    printf("dest=0x%08lX (USBHS RAM)\r\n", (unsigned long)(uintptr_t)pd);
    printf("\r\n");

    // test transferring 0 to 11 bytes to USBHS RAM from various aligned/misaligned addresses
    printf("Testing  0 to 11 bytes...");
    fflush(stdout); // wait for all bytes to be emitted before continuing
    for (n = 0; n <= 11; n++) {
        // test with -O2, success always
        // test with -O3, success always since less than 12 bytes transferred
        my_memcpy(pd, ps + 0, n);   // source is     32-bit aligned
        my_memcpy(pd, ps + 1, n);   // source is NOT 32-bit aligned
        my_memcpy(pd, ps + 2, n);   // source is NOT 32-bit aligned
        my_memcpy(pd, ps + 3, n);   // source is NOT 32-bit aligned
        my_memcpy(pd, ps + 4, n);   // source is     32-bit aligned
    }
    printf("PASSED!\r\n");
    fflush(stdout); // wait for all bytes to be emitted before continuing

    // test transferring 12 to 20 bytes to USBHS RAM from various aligned/misaligned addresses
    printf("Testing 12 to 20 bytes...");
    fflush(stdout); // wait for all bytes to be emitted before continuing
    for (n = 12; n <= 20; n++) {
        // test with -O2, success always
        // test the -O3 version of memcpy(): FAILS!! when source is NOT 32-bit aligned
        my_memcpy(pd, ps + 0, n);   // source is     32-bit aligned

        //////////////////////////////////////////////////////////////////////////
        // these function calls FAIL with a Hard Fault and will never return
        my_memcpy(pd, ps + 1, n);   // source is NOT 32-bit aligned
        my_memcpy(pd, ps + 2, n);   // source is NOT 32-bit aligned
        my_memcpy(pd, ps + 3, n);   // source is NOT 32-bit aligned
        //////////////////////////////////////////////////////////////////////////

        my_memcpy(pd, ps + 4, n);   // source is     32-bit aligned
    }
    printf("PASSED!\r\n");
    fflush(stdout); // wait for all bytes to be emitted before continuing

    // display a final message
    printf("\r\n");
    printf("Testing complete\r\n");
    while (1)
        ;
}

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

westfw wrote:

Could you provide the compile command for your demo version of the code?

 

Also, do you happen to know for sure which binary libc.a your code ends up linking?

 

 

Attached is a simple Atmel START project with stdio support via the EDBG port on a SAM E70 Xplained board.  You can see all the compiler and linker options in the .zip project. Connect a terminal program at 115200 8N1 and use the debugger to check it out.  Be sure to enable and disable "Use size optimized library (--specs=nano.specs)" in the linker options to cause the Hard Fault.

 

When this checkbox is enabled (checked), the program succeeds because it links against the toolchain supplied precompiled Standard C Library newlib-nano (libc_nano.a).  newlib-nano is built using the -Os optimization to optimize for size and does not care about incoming pointer alignment to 32-bit addresses.

 

When this checkbox is disabled (unchecked), the program fails with a Hard Fault because it links against the toolchain supplied precompiled Standard C Library newlib (libc.a).  newlib is built using the -O3 optimization which is the most aggressive optimization.  Unfortunately there are side effects in Atmel's supplied library - incoming pointers need to be aligned to 32-bit addresses.  I've proven the -O3 problem above with some test code.  -O3 unravels the tight copy loop and tries to use 32-bit memory accesses to increase performance.  That's great if the addresses are 32-bit aligned.  It Hard Faults when they aren't.

 

 

 

Here's the main() function from the attached project for those that don't want to try it themselves.  Nothing special, just calling memcpy() while linking to two different Atmel toolchain libraries.
 

#include <atmel_start.h>
#include <string.h>
#include <stdio.h>

#define USBHS_RAM_ADDR 0xA0100000u  // starting address for SAM E70 USBHS dual ported RAM

/*
 * Minimal reproduction: two library memcpy() calls into USBHS RAM from a
 * source pointer offset by one byte into a local buffer. Which libc the
 * project links against determines whether the second call faults.
 * Prints a success line if both calls return, then spins forever.
 */
int main(void)
{
    // NOTE(review): src_buffer is never initialized; presumably only the
    // addresses and lengths matter for this test, not the bytes copied.
    char src_buffer[64];
    char *src, *dst;

	atmel_start_init();

    // NOTE(review): src_buffer has no alignment attribute, so the misalignment
    // of &src_buffer[1] relies on where the compiler places the array.
    src = &src_buffer[1];           // source pointer NOT aligned to 32-bit addresses
    dst = (char *)USBHS_RAM_ADDR;

    // success always
    memcpy(dst, src, 11);

    // FAILURE with a Hard Fault when linked against the standard newlib library (libc.a)
    // success when linked to the size optimized newlib-nano library (libc_nano.a)
    memcpy(dst, src, 12);

    // only reached if neither memcpy() call raised a Hard Fault
    printf("Success, no Hard Faults occurred!!\r\n");
    while (1)
        ;
}

 

And by the way, using -c99 or -gnu99 doesn't matter either.  All it does is link against libc.a / libg.a when using newlib, or libc_nano.a / libg_nano.a when using newlib-nano via the linker checkbox.

 

 

Attachment(s): 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Final analysis:  the problem lies in newlib's memcpy() function when compiled with certain optimization settings.  Here's the most recent memcpy.c file from sourceware.org/newlib:

 

/*
FUNCTION
        <<memcpy>>---copy memory regions

SYNOPSIS
        #include <string.h>
        void* memcpy(void *restrict <[out]>, const void *restrict <[in]>,
                     size_t <[n]>);

DESCRIPTION
        This function copies <[n]> bytes from the memory region
        pointed to by <[in]> to the memory region pointed to by
        <[out]>.

        If the regions overlap, the behavior is undefined.

RETURNS
        <<memcpy>> returns a pointer to the first byte of the <[out]>
        region.

PORTABILITY
<<memcpy>> is ANSI C.

<<memcpy>> requires no supporting OS subroutines.

QUICKREF
        memcpy ansi pure
	*/

#include <_ansi.h>
#include <string.h>
#include "local.h"

/* Nonzero if either X or Y is not aligned on a "long" boundary.  */
#define UNALIGNED(X, Y) \
  (((long)X & (sizeof (long) - 1)) | ((long)Y & (sizeof (long) - 1)))

/* How many bytes are copied each iteration of the 4X unrolled loop.  */
#define BIGBLOCKSIZE    (sizeof (long) << 2)

/* How many bytes are copied each iteration of the word copy loop.  */
#define LITTLEBLOCKSIZE (sizeof (long))

/* Threshhold for punting to the byte copier.  */
#define TOO_SMALL(LEN)  ((LEN) < BIGBLOCKSIZE)

/* Copy len0 bytes from src0 to dst0 and return dst0.

   Two implementations are selected at compile time:
     - PREFER_SIZE_OVER_SPEED or __OPTIMIZE_SIZE__ defined: plain byte loop.
     - otherwise: when len0 is at least BIGBLOCKSIZE and both pointers are
       aligned on a long boundary, copy whole longs (four per iteration,
       then one per iteration) and finish the remainder byte by byte.
       Any other case uses the byte loop for the entire copy.

   __inhibit_loop_to_libcall is not defined in this file; presumably it comes
   from "local.h" and stops the compiler from turning these loops back into
   a memcpy() call -- confirm against that header.  */
void *
__inhibit_loop_to_libcall
memcpy (void *__restrict dst0,
	const void *__restrict src0,
	size_t len0)
{
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
  /* Size-optimized variant: one byte per iteration, no alignment logic.  */
  char *dst = (char *) dst0;
  char *src = (char *) src0;

  /* Remember the original destination; it is the return value.  */
  void *save = dst0;

  while (len0--)
    {
      *dst++ = *src++;
    }

  return save;
#else
  /* Speed-optimized variant.  dst/src start as byte-pointer views of the
     arguments, so the alignment test below checks the caller's addresses.  */
  char *dst = dst0;
  const char *src = src0;
  long *aligned_dst;
  const long *aligned_src;

  /* If the size is small, or either SRC or DST is unaligned,
     then punt into the byte copy loop.  This should be rare.  */
  if (!TOO_SMALL(len0) && !UNALIGNED (src, dst))
    {
      aligned_dst = (long*)dst;
      aligned_src = (long*)src;

      /* Copy 4X long words at a time if possible.  */
      while (len0 >= BIGBLOCKSIZE)
        {
          *aligned_dst++ = *aligned_src++;
          *aligned_dst++ = *aligned_src++;
          *aligned_dst++ = *aligned_src++;
          *aligned_dst++ = *aligned_src++;
          len0 -= BIGBLOCKSIZE;
        }

      /* Copy one long word at a time if possible.  */
      while (len0 >= LITTLEBLOCKSIZE)
        {
          *aligned_dst++ = *aligned_src++;
          len0 -= LITTLEBLOCKSIZE;
        }

       /* Pick up any residual with a byte copier.  */
      dst = (char*)aligned_dst;
      src = (char*)aligned_src;
    }

  /* Byte-copy tail: the remainder after the word loops, or the whole
     request when the fast path above was skipped.  */
  while (len0--)
    *dst++ = *src++;

  return dst0;
#endif /* not PREFER_SIZE_OVER_SPEED */
}

 

Take a closer look here, and notice the local variables 'src' and 'dst' have different names than the arguments 'src0' and 'dst0' passed to the function:

 

  /* If the size is small, or either SRC or DST is unaligned,
     then punt into the byte copy loop.  This should be rare.  */
  if (!TOO_SMALL(len0) && !UNALIGNED (src, dst))
    {

 

With the wrong combination of optimization options such as -O3, 'src' and 'dst' get optimized out.  This in turn optimizes out the pointer alignment check.  Is it a compiler bug?  Maybe.  Perhaps it's related to using a macro for the alignment check and the pre-processor is messing up.  Either way the code snippet above becomes this:

 

  /* If the size is small, or either SRC or DST is unaligned,
     then punt into the byte copy loop.  This should be rare.  */
  if (!TOO_SMALL(len0))
    {

The result is a Hard Fault when passing misaligned pointers.

 

I'd bet a very large sum of money that Atmel built libc.a with -O3 or some other combination of compiler options that caused the memcpy() alignment check above to be optimized out.  This only affects Atmel's newlib library.  Atmel's newlib-nano is not affected, presumably because that was built using -Os for size optimization instead.

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0
/*
 * Byte-by-byte copy loop, the shape of code a high optimization level may
 * recognize and replace with a call to the library memcpy().
 *
 * Copies n bytes from src to dest and returns dest.
 */
void *my_memcpy(void *dest, const void *src, size_t n)
{
    const char *s = src;    /* byte cursors over the two regions */
    char *d = dest;

    while (n--)
        *d++ = *s++;

    return dest;
}

IIRC, one of the higher-level optimizations has the compiler "noticing" that a segment of code can be replaced by a standard function (ie memcpy())...

 

So what is the actual code produced, and on which instruction does it actually crash?

 

I see the following in a SAMe70 example project; it certainly LOOKS like the alignment checks have not been optimized away...

004061c0 <memcpy>:
  4061c0:	4684      	mov	ip, r0
  4061c2:	ea41 0300 	orr.w	r3, r1, r0
  4061c6:	f013 0303 	ands.w	r3, r3, #3     ;; check if both ptrs aligned
  4061ca:	d16d      	bne.n	4062a8 <memcpy+0xe8>
  4061cc:	3a40      	subs	r2, #64	; 0x40
  4061ce:	d341      	bcc.n	406254 <memcpy+0x94>
  4061d0:	f851 3b04 	ldr.w	r3, [r1], #4
  4061d4:	f840 3b04 	str.w	r3, [r0], #4
  4061d8:	f851 3b04 	ldr.w	r3, [r1], #4
  4061dc:	f840 3b04 	str.w	r3, [r0], #4
  4061e0:	f851 3b04 	ldr.w	r3, [r1], #4
  4061e4:	f840 3b04 	str.w	r3, [r0], #4
  4061e8:	f851 3b04 	ldr.w	r3, [r1], #4
  4061ec:	f840 3b04 	str.w	r3, [r0], #4
  4061f0:	f851 3b04 	ldr.w	r3, [r1], #4
  4061f4:	f840 3b04 	str.w	r3, [r0], #4
  4061f8:	f851 3b04 	ldr.w	r3, [r1], #4
  4061fc:	f840 3b04 	str.w	r3, [r0], #4
  406200:	f851 3b04 	ldr.w	r3, [r1], #4
  406204:	f840 3b04 	str.w	r3, [r0], #4
  406208:	f851 3b04 	ldr.w	r3, [r1], #4
  40620c:	f840 3b04 	str.w	r3, [r0], #4
  406210:	f851 3b04 	ldr.w	r3, [r1], #4
  406214:	f840 3b04 	str.w	r3, [r0], #4
  406218:	f851 3b04 	ldr.w	r3, [r1], #4
  40621c:	f840 3b04 	str.w	r3, [r0], #4
  406220:	f851 3b04 	ldr.w	r3, [r1], #4
  406224:	f840 3b04 	str.w	r3, [r0], #4
  406228:	f851 3b04 	ldr.w	r3, [r1], #4
  40622c:	f840 3b04 	str.w	r3, [r0], #4
  406230:	f851 3b04 	ldr.w	r3, [r1], #4
  406234:	f840 3b04 	str.w	r3, [r0], #4
  406238:	f851 3b04 	ldr.w	r3, [r1], #4
  40623c:	f840 3b04 	str.w	r3, [r0], #4
  406240:	f851 3b04 	ldr.w	r3, [r1], #4
  406244:	f840 3b04 	str.w	r3, [r0], #4
  406248:	f851 3b04 	ldr.w	r3, [r1], #4
  40624c:	f840 3b04 	str.w	r3, [r0], #4
  406250:	3a40      	subs	r2, #64	; 0x40
  406252:	d2bd      	bcs.n	4061d0 <memcpy+0x10>
  406254:	3230      	adds	r2, #48	; 0x30
  406256:	d311      	bcc.n	40627c <memcpy+0xbc>
  406258:	f851 3b04 	ldr.w	r3, [r1], #4
  40625c:	f840 3b04 	str.w	r3, [r0], #4
  406260:	f851 3b04 	ldr.w	r3, [r1], #4
  406264:	f840 3b04 	str.w	r3, [r0], #4
  406268:	f851 3b04 	ldr.w	r3, [r1], #4
  40626c:	f840 3b04 	str.w	r3, [r0], #4
  406270:	f851 3b04 	ldr.w	r3, [r1], #4
  406274:	f840 3b04 	str.w	r3, [r0], #4
  406278:	3a10      	subs	r2, #16
  40627a:	d2ed      	bcs.n	406258 <memcpy+0x98>
  40627c:	320c      	adds	r2, #12
  40627e:	d305      	bcc.n	40628c <memcpy+0xcc>
  406280:	f851 3b04 	ldr.w	r3, [r1], #4
  406284:	f840 3b04 	str.w	r3, [r0], #4
  406288:	3a04      	subs	r2, #4
  40628a:	d2f9      	bcs.n	406280 <memcpy+0xc0>
  40628c:	3204      	adds	r2, #4
  40628e:	d008      	beq.n	4062a2 <memcpy+0xe2>
  406290:	07d2      	lsls	r2, r2, #31
  406292:	bf1c      	itt	ne
  406294:	f811 3b01 	ldrbne.w	r3, [r1], #1
  406298:	f800 3b01 	strbne.w	r3, [r0], #1
  40629c:	d301      	bcc.n	4062a2 <memcpy+0xe2>
  40629e:	880b      	ldrh	r3, [r1, #0]
  4062a0:	8003      	strh	r3, [r0, #0]
  4062a2:	4660      	mov	r0, ip
  4062a4:	4770      	bx	lr
  4062a6:	bf00      	nop
  
  ;; One or more pointer is unaligned.
  ;; 8byte-chunks? (BIGBLOCKSIZE)
  4062a8:	2a08      	cmp	r2, #8
  4062aa:	d313      	bcc.n	4062d4 <memcpy+0x114>
  4062ac:	078b      	lsls	r3, r1, #30
  4062ae:	d08d      	beq.n	4061cc <memcpy+0xc>
  4062b0:	f010 0303 	ands.w	r3, r0, #3
  4062b4:	d08a      	beq.n	4061cc <memcpy+0xc>
  4062b6:	f1c3 0304 	rsb	r3, r3, #4
  4062ba:	1ad2      	subs	r2, r2, r3
  4062bc:	07db      	lsls	r3, r3, #31
  4062be:	bf1c      	itt	ne
  4062c0:	f811 3b01 	ldrbne.w	r3, [r1], #1
  4062c4:	f800 3b01 	strbne.w	r3, [r0], #1
  4062c8:	d380      	bcc.n	4061cc <memcpy+0xc>
  4062ca:	f831 3b02 	ldrh.w	r3, [r1], #2
  4062ce:	f820 3b02 	strh.w	r3, [r0], #2
  4062d2:	e77b      	b.n	4061cc <memcpy+0xc>
  4062d4:	3a04      	subs	r2, #4
  4062d6:	d3d9      	bcc.n	40628c <memcpy+0xcc>
  ;; bytewise loop?
  4062d8:	3a01      	subs	r2, #1
  4062da:	f811 3b01 	ldrb.w	r3, [r1], #1
  4062de:	f800 3b01 	strb.w	r3, [r0], #1
  4062e2:	d2f9      	bcs.n	4062d8 <memcpy+0x118>
  ;; final bytes ??
  4062e4:	780b      	ldrb	r3, [r1, #0]
  4062e6:	7003      	strb	r3, [r0, #0]
  4062e8:	784b      	ldrb	r3, [r1, #1]
  4062ea:	7043      	strb	r3, [r0, #1]
  4062ec:	788b      	ldrb	r3, [r1, #2]
  4062ee:	7083      	strb	r3, [r0, #2]
  4062f0:	4660      	mov	r0, ip
  4062f2:	4770      	bx	lr

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

westfw wrote:

 

So what is the actual code produced, and on which instruction does it actually crash?

 

 

See post #10 above, there is an example SAM E70 Xplained project attached.  It works when linked to libc_nano.a, but crashes when linked to libc.a inside the library's memcpy() function.  Both these libraries come from Atmel / ARM.