AVR GCC inline assembly help

Go To Last Post
12 posts / 0 new
Author
Message
#1
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Good day to all those smarter than I am. I'm pretty new to the inline assembly game and would like some help. I found a working function that drives the WS2812 RGB LEDs on an 8MHz AVR, but it's written as a standalone assembler function that you call in your C main. I would like to make it inline because I need to pass it more parameters than it's already accepting and because the 16MHz version is inline. I'll share the existing assembler function as well as my attempt at an inline version. My inline version doesn't work, the LED stays off until I remove the data line after which it shows a colour, but not the one I told it to.

 

Original:

#define __SFR_OFFSET 0 
#include <avr/io.h> 

;extern void output_grb(u8 * ptr, u16 count)
;
; r18 = data byte
; r19 = 7-bit count
; r20 = 1 output
; r21 = 0 output
; r22 = SREG save
; r24:25 = 16-bit count
; r26:27 (X) = data pointer

.equ      OUTBIT,   0


.global output_grb
output_grb:
         movw   r26, r24      ;r26:27 = X = p_buf
         movw   r24, r22      ;r24:25 = count
         in     r22, SREG     ;save SREG (global int state)
         cli                  ;no interrupts from here on, we're cycle-counting
         in     r20, PORTB
         ori    r20, (1<<OUTBIT)         ;our '1' output
         in     r21, PORTB
         andi   r21, ~(1<<OUTBIT)        ;our '0' output
         ldi    r19, 7        ;7 bit counter (8th bit is different)
         ld     r18,X+        ;get first data byte
loop1:
         out    PORTB, r20    ; 1   +0 start of a bit pulse
         lsl    r18           ; 1   +1 next bit into C, MSB first
         brcs   L1            ; 1/2 +2 branch if 1
         out    PORTB, r21    ; 1   +3 end hi for '0' bit (3 clocks hi)
         nop                  ; 1   +4
         bst    r18, 7        ; 1   +5 save last bit of data for fast branching
         subi   r19, 1        ; 1   +6 how many more bits for this byte?
         breq   bit8          ; 1/2 +7 last bit, do differently
         rjmp   loop1         ; 2   +8, 10 total for 0 bit
L1:
         nop                  ; 1   +4
         bst    r18, 7        ; 1   +5 save last bit of data for fast branching
         subi   r19, 1        ; 1   +6 how many more bits for this byte
         out    PORTB, r21    ; 1   +7 end hi for '1' bit (7 clocks hi)
         brne   loop1         ; 2/1 +8 10 total for 1 bit (fall thru if last bit)
bit8:
         ldi    r19, 7        ; 1   +9 bit count for next byte
         out    PORTB, r20    ; 1   +0 start of a bit pulse
         brts   L2            ; 1/2 +1 branch if last bit is a 1
         nop                  ; 1   +2
         out    PORTB, r21    ; 1   +3 end hi for '0' bit (3 clocks hi)
         ld     r18, X+       ; 2   +4 fetch next byte
         sbiw   r24, 1        ; 2   +6 dec byte counter
         brne   loop1         ; 2   +8 loop back or return
         out    SREG, r22     ; restore global int flag
         ret
L2:
         ld     r18, X+       ; 2   +3 fetch next byte
         sbiw   r24, 1        ; 2   +5 dec byte counter
         out    PORTB, r21   ; 1   +7 end hi for '1' bit (7 clocks hi)
         brne   loop1         ; 2   +8 loop back or return
         out    SREG, r22     ; restore global int flag
         ret

My attempt:

void writePixel(uint8_t red, uint8_t green, uint8_t blue)
{
	uint8_t buff[3] = {green, red, blue};
	volatile uint16_t i = 3;
	//Save SREG
	uint8_t sreg=SREG;
	uint8_t hi = pixelPort | (1<<pixelPin);
	uint8_t low = pixelPort & ~(1<<pixelPin);
	//Clear interrupts
	cli();
	asm volatile(
	"ldi   r19, 7              \n\t"   // 7 bit counter (8th bit is different)
	"ld    r18,%a[ptr]+        \n\t"   // get first data byte
	"loop1:                    \n\t"
	"out   %[port], %[hi]      \n\t"   // 1   +0 start of a bit pulse
	"lsl   r18                 \n\t"   // 1   +1 next bit into C, MSB first
	"brcs  L1                  \n\t"   // 1/2 +2 branch if 1
	"out   %[port], %[lo]      \n\t"   // 1   +3 end hi for '0' bit (3 clocks hi)
	"nop                       \n\t"   // 1   +4
	"bst   r18, 7              \n\t"   // 1   +5 save last bit of data for fast branching
	"subi  r19, 1              \n\t"   // 1   +6 how many more bits for this byte?
	"breq  bit8                \n\t"   // 1/2 +7 last bit, do differently
	"rjmp  loop1               \n\t"   // 2   +8, 10 total for 0 bit
	"L1:                       \n\t"
	"nop                       \n\t"   // 1   +4
	"bst   r18, 7              \n\t"   // 1   +5 save last bit of data for fast branching
	"subi  r19, 1              \n\t"   // 1   +6 how many more bits for this byte
	"out   %[port], %[lo]      \n\t"   // 1   +7 end hi for '1' bit (7 clocks hi)
	"brne  loop1               \n\t"   // 2/1 +8 10 total for 1 bit (fall through if last bit)
	"bit8:                     \n\t"
	"ldi   r19, 7              \n\t"   // 1   +9 bit count for next byte
	"out   %[port], %[hi]      \n\t"   // 1   +0 start of a bit pulse
	"brts  L2                  \n\t"   // 1/2 +1 branch if last bit is a 1
	"nop                       \n\t"   // 1   +2
	"out   %[port], %[lo]      \n\t"   // 1   +3 end hi for '0' bit (3 clocks hi)
	"ld    r18, %a[ptr]+       \n\t"   // 2   +4 fetch next byte
	"sbiw  %[count], 1         \n\t"   // 1   +6 dec byte counter
	"brne  loop1               \n\t"   // 2   +8 loop back or return
	"rjmp  end                 \n\t"
	"L2:                       \n\t"
	"ld    r18, %a[ptr]+       \n\t"   // 2   +3 fetch next byte
	"sbiw  %[count], 1         \n\t"   // 1   +5 dec byte counter
	"out   %[port], %[lo]      \n\t"   // 1   +7 end hi for '1' bit (7 clocks hi)
	"brne  loop1               \n\t"   // 2   +8 loop back or return
	"end:                      \n\t"
	"nop                         \n"
	: [count] "+w" (i)
	: [port]  "I" (_SFR_IO_ADDR(pixelPort)),
	[ptr]   "e" (buff),
	[hi]    "r" (hi),
	[lo]    "r" (low));
	SREG = sreg;
}

I kept the three loops exactly the same in structure, all I changed was I made r24 a variable called "count", I made X a pointer variable called "ptr", I'm passing the port to be used as a variable and I made r20 and r21 variables called "hi" and "lo". pixelPort and pixelPin are #defined so there is no data type clashes. Something that puzzles me about inline assembler is when to pass a variable as input and when to pass it as output.

 

Thanks in advance!!

This topic has a solution.
Last Edited: Thu. Aug 1, 2019 - 11:57 AM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Give in, why does the simple requirement for more parameters call for a switch from standalone to inline?

 

About the only upside of inline is the removal of a CALL/RET overhead but otherwise it's pretty much all downhill.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Because figuring out which registers the arguments are passed into has been giving me a headache. I need to pass in the port and the port bit. If you can help with figuring out which registers they'll be store in that'd be it.

This reply has been marked as the solution. 
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

The ABI is in the manual?

 

https://gcc.gnu.org/wiki/avr-gcc

 

But there's a "trick" anyway. Write the function interface in C first (and make some volatile use of the parms in the function) then build that with -save-temps and the .s file will show you exactly where the parameters are.

 

EDIT for example

void func(volatile uint8_t * port, uint8_t bit)

would pass the pointer in R25:R24. The next parameter down is R23:R22 but as it's only 8 bit only R22 is relevant. 

 

EDIT2 just realised that's exactly what was in the top post anyway (well apart from uint16_t) so presumably you want more than 2 parameters. Next will be R21:R20 and so on.

Last Edited: Wed. Jul 31, 2019 - 07:33 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Thanks so much. What does this mean:

 

clawson wrote:

 

build that with -save-temps

 

 

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

What did your compiler manual tell you?

 

Anyway when build a .c file the preprocessor executes first and turns it into a .i file which is the raw C. The C compiler then reads the .i file and converts to Asm in a .s file. Finally the assembler reads the .s and creates .o 

 

Usually you only get to see .o but if you use -save-temps the .i and .s files are left behind after the build so you can see exactly what Asm the compiler created. When doi g this the -fverbose-asm is another good option to add.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I'm passing the port to be used as a variable

Hopefully, you mean that you want the port to be an argument, rather than a variable?

Because OUT and IN don't work with variable ports; the port has to be compiled into the instruction...

("pixelport" is presumably a constant, or you'd get compile errors rather than non-working code.  I think.)

(You could change the IN/OUT to LD/ST via another index register.  But you might need to adjust the timing.)

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Actually the whole point of doing WS2812 code in hand crafted Asm in the first place as it needs to be as fast and as efficient as possible. You will blow that out of the water if you start messing with run time passed ports and bit positions as then you'll have to do shifting and indirection and stuff. You should simply hard code the port and output bits for the most efficient code. (I assume the LEDs aren't going to jump around from one pin to another at run time??)

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Yes, sorry I mean argument. I rewrote the code to use ST to send an output to the port which I store in the Z register now, but ST adds an extra cycle that might be a problem. I'm just hoping it's still within the timing limits of the WS2812B. If passing the port becomes too much of a problem I'll just switch case the port in C and call a routine that is port specific.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

BdT141 wrote:
I'm just hopin
But surely this is the point. The fastest way to set an output bit is:

SBI PORTB, 3

but this relies on knowing it's going to be B and bit 3 when you build the code. Slightly slower is:

LDI R16, 0x08
OUT PORTB, R16

this still needs to know B.3 up front.

 

If you try to make a function that is called with B.3 one time and D.5 another you create all kinds of issues. Now you probably need the port address in an indexing register and perhaps some kind of LD, OR, ST sequence. The fact that it could not nbe.3 one time and .5 another is even worse. So on one occasion you OR (and perhaps AND) mask is 0x08 (0xF7) and 0x20 (0xDF) another. So how do you achieve that - are you going to use a 1 << N to get a 1 bit into the bit 3 or bit 5 position? I suppose you might simplify things a bit by passing the bit mask in as 0x08/0x20 in the first place?

 

The bottom line though is, for the very fastest code, if you need one version for B.3 and another for D.5 then just duplicate the entire code and use SBI PORTB,3 in one and SBI PORTD,5 in the other. It's trading flash space for speed. If speed is the driving force here (for WS 28182 it is!) then forget about fancy parameterisation to cope with different ports/bits. Just have light_string_1() and light_string_2() functions and accept the fact you have a lot of repeated code.

 

(in the good old days of RAM based code you could do dreadful things like self-modifying code - sadly/fortunately(?) the AVR precludes this).

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Yes, I agree with the

sbi PORTB, 3

point and that speed is the driving force, but I would like to make a one size fits all type library where you don't need to go look for or change the function that has the PORTB pin 3 case. I've changed the original function to the one below:

.global output_grb
output_grb:
         movw   r26, r24                  ; r26:27 = X = p_buf
         movw   r24, r22                  ; r24:25 = count
         movw   r30, r20                  ; move port pointer to Z register
         mov    r23, r18                  ; move pin to r23

         in     r22, SREG                 ; save SREG (global int state)
         cli                              ; no interrupts from here on, we're cycle-counting
         ld     r20, Z                    ; store port val in r20
         ld     r21, Z                    ; store port val in r21
         ldi    r18,lo8(1)                ; r18 = 1
         ldi    r19,0                     ; r19 = 0
         rjmp   2f                        ;
1:
         lsl    r18                       ; r18 << 1
         rol    r19                       ; r19 << 1 (basically does nothing to r19)
2:
         dec    r23                       ; r23 (pin num) = r23 - 1
         brpl   1b                        ; branch if r23 >= 0
         or     r20, r18                  ; our '1' output
         com    r18                       ; invert pin mask to get "0" value
         and    r21, r18                  ; our '0' output
         ldi    r19, 7                    ; 7 bit counter (8th bit is different)
         ld     r18, X+                   ; get first data byte
loop1:
         st     Z, r20                    ; 1   +0 start of a bit pulse
         lsl    r18                       ; 1   +2 next bit into C, MSB first
         brcs   L1                        ; 1/2 +3 branch if 1
         st     Z, r21                    ; 1   +4 end hi for '0' bit (3 clocks hi)
         bst    r18, 7                    ; 1   +6 save last bit of data for fast branching
         subi   r19, 1                    ; 1   +7 how many more bits for this byte?
         breq   bit8                      ; 1/2 +8 last bit, do differently
         rjmp   loop1                     ; 2   +9, 11 total for 0 bit
L1:
         bst    r18, 7                    ; 1   +5 save last bit of data for fast branching
         subi   r19, 1                    ; 1   +6 how many more bits for this byte
         st     Z, r21                    ; 1   +7 end hi for '1' bit (7 clocks hi)
         brne   loop1                     ; 2/1 +9, 11 total for 1 bit (fall thru if last bit)
bit8:
         ldi    r19, 7                    ; 1   +10 bit count for next byte
         st     Z, r20                    ; 1   +0 start of a bit pulse
         brts   L2                        ; 1/2 +2 branch if last bit is a 1
         out    Z, r21                    ; 1   +3 end hi for '0' bit (3 clocks hi)
         ld     r18, X+                   ; 2   +4 fetch next byte
         sbiw   r24, 1                    ; 2   +6 dec byte counter
         brne   loop1                     ; 2   +8 loop back or return
         out    SREG, r22                 ; restore global int flag
         ret
L2:
         ld     r18, X+                   ; 2   +3 fetch next byte
         sbiw   r24, 1                    ; 2   +5 dec byte counter
         st     Z, r21                    ; 1   +7 end hi for '1' bit (7 clocks hi)
         brne   loop1                     ; 2   +9 loop back or return
         out    SREG, r22                 ; restore global int flag
         ret

Everything between "cli" and "ldi r19, 7" is code I got from using your trick of having the compiler save the .S intermediate file. That's how the compiler handles passing a port and a pin as arguments. It stores the port in Z and does a little loop to get the right pin mask. Then it uses st to write to Z, which then writes to the port. From there I just replaced all the "out"s with "st"s, which adds one cycle to the mix. I'm going to test it later and if it doesn't work I'll write a routine for each port and just pass in the pin to assembler.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

BdT141 wrote:
but I would like to make a one size fits all type library where you don't need to go look for or change the function that has the PORTB pin 3 case.
So make it a compile time #define'd configuration.

#define LEDPORT PORTB
#define LEDBIT 3


         in     r20, LEDPORT
         ori    r20, (1<<LEDBIT)         ;our '1' output
         in     r21, LEDPORT
         andi   r21, ~(1<<LEDBIT)        ;our '0' output
...