Good day to all those smarter than I am. I'm fairly new to inline assembly and would like some help. I found a working function that drives WS2812 RGB LEDs from an 8MHz AVR, but it is written as a standalone assembler function that you call from your C main. I would like to convert it to inline assembly, both because I need to pass it more parameters than it currently accepts and because the 16MHz version I have is already inline. Below are the existing assembler function and my attempt at an inline version. My inline version doesn't work: the LED stays off until I disconnect the data line, after which it shows a colour, but not the one I asked for.

Original:

#define __SFR_OFFSET 0
#include <avr/io.h>

;extern void output_grb(u8 * ptr, u16 count)
;
; Bit-bangs count bytes starting at ptr out of PORTB bit OUTBIT, MSB first.
; Every bit takes 10 clocks: the line is high for 3 clocks for a 0 bit and
; for 7 clocks for a 1 bit. Interrupts are disabled while sending and SREG
; is restored before returning. A count of 0 returns immediately without
; touching the port.
;
; r18 = data byte
; r19 = 7-bit count
; r20 = 1 output
; r21 = 0 output
; r22 = SREG save
; r24:25 = 16-bit count
; r26:27 (X) = data pointer

.equ      OUTBIT, 0

.global output_grb
output_grb:
         movw   r26, r24      ;r26:27 = X = p_buf
         movw   r24, r22      ;r24:25 = count
         sbiw   r24, 0        ;sets Z when count == 0
         brne   grb_start     ;non-zero count: go send
         ret                  ;count == 0: the loop below would wrap to 65536 bytes
grb_start:
         in     r22, SREG     ;save SREG (global int state)
         cli                  ;no interrupts from here on, we are cycle-counting
         in     r20, PORTB
         ori    r20, (1<<OUTBIT)         ;our 1 output
         in     r21, PORTB
         andi   r21, ~(1<<OUTBIT)        ;our 0 output
         ldi    r19, 7        ;7 bit counter (8th bit is different)
         ld     r18, X+       ;get first data byte
loop1:
         out    PORTB, r20    ; 1   +0 start of a bit pulse
         lsl    r18           ; 1   +1 next bit into C, MSB first
         brcs   L1            ; 1/2 +2 branch if 1
         out    PORTB, r21    ; 1   +3 end hi for 0 bit (3 clocks hi)
         nop                  ; 1   +4
         bst    r18, 7        ; 1   +5 save last bit of data for fast branching
         subi   r19, 1        ; 1   +6 how many more bits for this byte?
         breq   bit8          ; 1/2 +7 last bit, do differently
         rjmp   loop1         ; 2   +8, 10 total for 0 bit
L1:
         nop                  ; 1   +4
         bst    r18, 7        ; 1   +5 save last bit of data for fast branching
         subi   r19, 1        ; 1   +6 how many more bits for this byte
         out    PORTB, r21    ; 1   +7 end hi for 1 bit (7 clocks hi)
         brne   loop1         ; 2/1 +8 10 total for 1 bit (fall thru if last bit)
bit8:
         ldi    r19, 7        ; 1   +9 bit count for next byte
         out    PORTB, r20    ; 1   +0 start of a bit pulse
         brts   L2            ; 1/2 +1 branch if last bit is a 1
         nop                  ; 1   +2
         out    PORTB, r21    ; 1   +3 end hi for 0 bit (3 clocks hi)
         ld     r18, X+       ; 2   +4 fetch next byte
         sbiw   r24, 1        ; 2   +6 dec byte counter
         brne   loop1         ; 2   +8 loop back or return
         out    SREG, r22     ; restore global int flag
         ret
L2:
         ld     r18, X+       ; 2   +3 fetch next byte
         sbiw   r24, 1        ; 2   +5 dec byte counter
         out    PORTB, r21    ; 1   +7 end hi for 1 bit (7 clocks hi)
         brne   loop1         ; 2   +8 loop back or return
         out    SREG, r22     ; restore global int flag
         ret

My attempt:

void writePixel(uint8_t red, uint8_t green, uint8_t blue) { uint8_t buff[3] = {green, red, blue}; volatile uint16_t i = 3; //Save SREG uint8_t sreg=SREG; uint8_t hi = pixelPort | (1<<pixelPin); uint8_t low = pixelPort & ~(1<<pixelPin); //Clear interrupts cli(); asm volatile( "ldi r19, 7 \n\t" // 7 bit counter (8th bit is different) "ld r18,%a[ptr]+ \n\t" // get first data byte "loop1: \n\t" "out %[port], %[hi] \n\t" // 1 +0 start of a bit pulse "lsl r18 \n\t" // 1 +1 next bit into C, MSB first "brcs L1 \n\t" // 1/2 +2 branch if 1 "out %[port], %[lo] \n\t" // 1 +3 end hi for '0' bit (3 clocks hi) "nop \n\t" // 1 +4 "bst r18, 7 \n\t" // 1 +5 save last bit of data for fast branching "subi r19, 1 \n\t" // 1 +6 how many more bits for this byte? "breq bit8 \n\t" // 1/2 +7 last bit, do differently "rjmp loop1 \n\t" // 2 +8, 10 total for 0 bit "L1: \n\t" "nop \n\t" // 1 +4 "bst r18, 7 \n\t" // 1 +5 save last bit of data for fast branching "subi r19, 1 \n\t" // 1 +6 how many more bits for this byte "out %[port], %[lo] \n\t" // 1 +7 end hi for '1' bit (7 clocks hi) "brne loop1 \n\t" // 2/1 +8 10 total for 1 bit (fall through if last bit) "bit8: \n\t" "ldi r19, 7 \n\t" // 1 +9 bit count for next byte "out %[port], %[hi] \n\t" // 1 +0 start of a bit pulse "brts L2 \n\t" // 1/2 +1 branch if last bit is a 1 "nop \n\t" // 1 +2 "out %[port], %[lo] \n\t" // 1 +3 end hi for '0' bit (3 clocks hi) "ld r18, %a[ptr]+ \n\t" // 2 +4 fetch next byte "sbiw %[count], 1 \n\t" // 1 +6 dec byte counter "brne loop1 \n\t" // 2 +8 loop back or return "rjmp end \n\t" "L2: \n\t" "ld r18, %a[ptr]+ \n\t" // 2 +3 fetch next byte "sbiw %[count], 1 \n\t" // 1 +5 dec byte counter "out %[port], %[lo] \n\t" // 1 +7 end hi for '1' bit (7 clocks hi) "brne loop1 \n\t" // 2 +8 loop back or return "end: \n\t" "nop \n" : [count] "+w" (i) : [port] "I" (_SFR_IO_ADDR(pixelPort)), [ptr] "e" (buff), [hi] "r" (hi), [lo] "r" (low)); SREG = sreg; }

I kept the three loops exactly the same in structure. The only changes are: the r24:25 byte counter became a variable called "count", the X pointer became a pointer operand called "ptr", the port is passed in as an operand, and r20 and r21 became variables called "hi" and "lo". pixelPort and pixelPin are #defined, so there are no data type clashes. Something that puzzles me about inline assembler is when to pass a variable as an input operand and when to pass it as an output operand.

Thanks in advance!!