Skip to content

Interfacing with C++ and GPIO Control

Introduction

In this final tutorial, we'll bring together everything we've learned and apply it to real-world Raspberry Pi programming. We'll explore how to seamlessly integrate assembly code with C++, access hardware directly through GPIO registers, and optimize performance-critical sections of your programs.

Understanding assembly-C++ integration is valuable for:

  • Performance Optimization: Hand-optimize critical inner loops
  • Hardware Access: Direct control of memory-mapped peripherals
  • Learning: Understand what the compiler generates
  • Legacy Code: Interface with existing assembly libraries
  • Embedded Systems: Write bare-metal code when needed

This tutorial covers calling conventions between C++ and assembly, inline assembly syntax, direct GPIO manipulation, and complete practical examples.

Calling Assembly from C++

External Assembly Function

C++ Declaration (main.cpp):

// main.cpp - calls routines implemented in functions.s
#include <stdio.h>   // printf

// extern "C" gives these declarations C linkage (no C++ name mangling), so
// the linker resolves them to the plain symbols exported by the assembly file.
extern "C" {
    long add_numbers(long a, long b);
    void print_message(const char* msg);
    long factorial(long n);
}

int main() {
    long result = add_numbers(10, 20);
    printf("10 + 20 = %ld\n", result);

    print_message("Hello from assembly!\n");

    long fact = factorial(5);
    printf("5! = %ld\n", fact);

    return 0;
}

Assembly Implementation (functions.s):

// functions.s - Assembly functions callable from C++

.global add_numbers
.global print_message
.global factorial

.section .text

// long add_numbers(long a, long b);
// Adds two 64-bit integers.
// Parameters: a=x0, b=x1
// Returns: x0 = a + b
// Leaf routine: makes no calls and does not touch the stack.
add_numbers:
    add     x0, x0, x1
    ret

// void print_message(const char* msg);
// In:  x0 = pointer to a NUL-terminated string
// Writes the string to file descriptor 1 with the write syscall (x8 = 64).
print_message:
    // Single 32-byte frame: [sp] = x29/x30, [sp, #16] = saved x19
    stp     x29, x30, [sp, #-32]!
    mov     x29, sp
    str     x19, [sp, #16]

    mov     x19, x0         // keep msg in a callee-saved register across the call
    bl      strlen          // x0 = length (local helper in this file)

    mov     x2, x0          // arg 3: byte count
    mov     x1, x19         // arg 2: buffer
    mov     x0, #1          // arg 1: stdout
    mov     x8, #64         // syscall number: write
    svc     #0

    ldr     x19, [sp, #16]
    ldp     x29, x30, [sp], #32
    ret

// long factorial(long n);
// In:  x0 = n
// Out: x0 = n! computed recursively; any n <= 1 yields 1
factorial:
    cmp     x0, #1
    b.gt    fact_recurse
    mov     x0, #1          // base case
    ret

fact_recurse:
    // Frame: [sp] = x29/x30, [sp, #16] = saved x19
    stp     x29, x30, [sp, #-32]!
    mov     x29, sp
    str     x19, [sp, #16]

    mov     x19, x0         // remember n across the recursive call
    sub     x0, x19, #1
    bl      factorial       // x0 = (n - 1)!
    mul     x0, x0, x19     // x0 = (n - 1)! * n

    ldr     x19, [sp, #16]
    ldp     x29, x30, [sp], #32
    ret

// Helper: file-local strlen (the label is not exported with .global)
// In:  x0 = pointer to a NUL-terminated string
// Out: x0 = number of bytes before the terminating NUL
// Scratch: x1, w2
strlen:
    mov     x1, x0              // x1 = scan cursor, x0 keeps the start
strlen_scan:
    ldrb    w2, [x1], #1        // read a byte, then advance the cursor
    cbnz    w2, strlen_scan
    sub     x0, x1, x0          // cursor is one past the NUL ...
    sub     x0, x0, #1          // ... so drop the NUL from the count
    ret

Build:

# Compile C++ and assembly separately
g++ -c main.cpp -o main.o
as -o functions.o functions.s

# Link together
g++ main.o functions.o -o program

# Run
./program

Output:

10 + 20 = 30
Hello from assembly!
5! = 120

Passing Complex Types

C++ Code:

#include <stdio.h>   // printf

// Two 64-bit integers (16 bytes). The assembly side receives each Point
// argument in a pair of general-purpose registers.
struct Point {
    long x;
    long y;
};

// Three doubles. The assembly side receives the members in d0, d1 and d2.
struct Vector3 {
    double x;
    double y;
    double z;
};

// Implemented in assembly; C linkage so the symbol names are not mangled.
extern "C" {
    long point_distance_squared(Point p1, Point p2);
    double vector3_length(Vector3 v);
}

int main() {
    Point p1 = {3, 4};
    Point p2 = {6, 8};
    long dist_sq = point_distance_squared(p1, p2);
    printf("Distance squared: %ld\n", dist_sq);  // Output: 25

    Vector3 v = {3.0, 4.0, 0.0};
    double len = vector3_length(v);
    printf("Vector length: %f\n", len);  // Output: 5.000000

    return 0;
}

Assembly Code:

.global point_distance_squared
.global vector3_length

// long point_distance_squared(Point p1, Point p2);
// Returns (p2.x - p1.x)^2 + (p2.y - p1.y)^2 in x0.
// Both structs arrive by value in register pairs:
// p1.x=x0, p1.y=x1, p2.x=x2, p2.y=x3
// Scratch: x4, x5. Leaf routine, no stack use.
point_distance_squared:
    sub     x4, x2, x0      // dx = p2.x - p1.x
    sub     x5, x3, x1      // dy = p2.y - p1.y
    mul     x4, x4, x4      // dx²
    madd    x0, x5, x5, x4  // x0 = dy * dy + dx²
    ret

// double vector3_length(Vector3 v);
// Returns sqrt(v.x² + v.y² + v.z²) in d0.
// The three members arrive in floating-point registers:
// v.x=d0, v.y=d1, v.z=d2
// Scratch: d3. Leaf routine, no stack use.
vector3_length:
    fmul    d3, d0, d0      // x²
    fmadd   d3, d1, d1, d3  // x² + y² (fused multiply-add)
    fmadd   d3, d2, d2, d3  // x² + y² + z²
    fsqrt   d0, d3          // sqrt(x² + y² + z²)
    ret

Calling C++ from Assembly

Calling C Standard Library

// Entry point is `main`, not `_start`: this program is linked with gcc, whose
// C runtime startup code already defines `_start`, initialises libc and then
// calls main. Defining `_start` here would be a duplicate symbol at link time.
.global main

.section .data
format: .asciz "Result: %d\n"
value:  .word  42

.section .text
// int main(void);
// Prints "Result: 42" through printf and returns 0.
main:
    // Frame record: save FP/LR because `bl printf` overwrites x30
    stp     x29, x30, [sp, #-16]!
    mov     x29, sp

    // Call printf(format, value)
    ldr     x0, =format     // arg 1: address of the format string
    ldr     x1, =value
    ldr     w1, [x1]        // arg 2: the 32-bit value itself (matches %d)
    bl      printf

    // Return 0 to the C runtime, which flushes stdio buffers and exits.
    mov     w0, #0
    ldp     x29, x30, [sp], #16
    ret

Build with C runtime:

as -o program.o program.s
gcc program.o -o program -no-pie
./program

Calling C++ Member Functions

C++ Class:

// calculator.hpp
// Minimal arithmetic class used to demonstrate calling C++ from assembly.
class Calculator {
public:
    long add(long a, long b);
    long multiply(long a, long b);
};

// calculator.cpp
long Calculator::add(long a, long b) {
    const long sum = a + b;
    return sum;
}

long Calculator::multiply(long a, long b) {
    const long product = a * b;
    return product;
}

// C-linkage shims: assembly passes the object as an opaque pointer in the
// first argument, and each shim forwards to the matching member function.
extern "C" {
    long calculator_add(void* obj, long a, long b) {
        return static_cast<Calculator*>(obj)->add(a, b);
    }

    long calculator_multiply(void* obj, long a, long b) {
        return static_cast<Calculator*>(obj)->multiply(a, b);
    }
}

Assembly Code:

.global use_calculator

.extern calculator_add
.extern calculator_multiply

// void use_calculator(void* calc);
// Calls the C wrappers calculator_add and calculator_multiply on the object
// whose pointer arrives in x0. The result of the first call is overwritten
// by the second; whatever calculator_multiply leaves in x0 is still there
// when this routine returns.
use_calculator:
    // Frame: [sp] = x29/x30, [sp, #16] = saved x19
    stp     x29, x30, [sp, #-32]!
    mov     x29, sp
    str     x19, [sp, #16]

    mov     x19, x0         // Save calc object in a callee-saved register

    // Call calc->add(10, 20)
    mov     x0, x19         // this pointer
    mov     x1, #10         // a
    mov     x2, #20         // b
    bl      calculator_add
    // x0 now contains 30

    // Call calc->multiply(5, 6)
    mov     x0, x19         // this pointer (x0 was overwritten by the result above)
    mov     x1, #5
    mov     x2, #6
    bl      calculator_multiply
    // x0 now contains 30 (5 * 6)

    ldr     x19, [sp, #16]
    ldp     x29, x30, [sp], #32
    ret

Inline Assembly in C++

Basic Inline Assembly Syntax

#include <stdio.h>   // printf

int main() {
    long a = 10, b = 20, result;

    // Basic inline assembly: %0, %1, %2 are replaced by the registers the
    // compiler picks for result, a and b respectively.
    asm("add %0, %1, %2"
        : "=r" (result)      // Output: result in register
        : "r" (a), "r" (b)   // Inputs: a and b in registers
    );

    printf("Result: %ld\n", result);  // Output: 30
    return 0;
}

Extended Assembly Examples

Arithmetic Operations:

// Demonstrates named asm operands: %[res], %[x] and %[y] refer to the
// operands tagged [res], [x] and [y] in the constraint lists, so the template
// does not depend on operand order. Each statement overwrites `result`.
void inline_examples() {
    long a = 100, b = 50, result;

    // Addition
    asm("add %[res], %[x], %[y]"
        : [res] "=r" (result)
        : [x] "r" (a), [y] "r" (b)
    );
    printf("Add: %ld\n", result);  // 150

    // Subtraction
    asm("sub %[res], %[x], %[y]"
        : [res] "=r" (result)
        : [x] "r" (a), [y] "r" (b)
    );
    printf("Sub: %ld\n", result);  // 50

    // Multiplication
    asm("mul %[res], %[x], %[y]"
        : [res] "=r" (result)
        : [x] "r" (a), [y] "r" (b)
    );
    printf("Mul: %ld\n", result);  // 5000
}

Clobbered Registers:

// Shows the clobber list: the template names x9 and x10 explicitly, so they
// are listed after the third colon to tell the compiler their contents are
// overwritten by this statement.
void clobber_example() {
    long result;

    asm volatile(
        "mov x9, #100\n"
        "mov x10, #200\n"
        "add %0, x9, x10\n"
        : "=r" (result)
        :                // no input operands
        : "x9", "x10"  // Tell compiler these are clobbered
    );

    printf("Result: %ld\n", result);  // 300
}

Memory Operations:

// Sums a 4-element array inside one asm statement.
// %0 is the output (sum) and %1 holds the address of array[0]. x9 (running
// sum), x10 (index) and x11 (loaded element) are hard-coded scratch registers
// and are therefore listed as clobbers, together with "memory".
void memory_example() {
    long array[4] = {10, 20, 30, 40};
    long sum = 0;

    asm volatile(
        "mov x9, #0\n"           // sum = 0
        "mov x10, #0\n"          // i = 0
        "1:\n"                   // loop label (numeric local label)
        "ldr x11, [%1, x10, lsl #3]\n"  // load array[i]; index scaled by 8 bytes
        "add x9, x9, x11\n"      // sum += array[i]
        "add x10, x10, #1\n"     // i++
        "cmp x10, #4\n"          // i < 4?
        "b.lt 1b\n"              // loop back to the nearest preceding label 1
        "mov %0, x9\n"           // output sum (the only write to %0)
        : "=r" (sum)
        : "r" (array)
        : "x9", "x10", "x11", "memory"
    );

    printf("Sum: %ld\n", sum);  // 100
}

Volatile Keyword:

// `volatile` tells the compiler not to delete, merge or hoist the asm
// statement. A non-volatile asm with outputs may be removed when none of its
// outputs are used.
void volatile_example() {
    long counter = 0;

    // The output is consumed by printf below, so this statement would be kept
    // even without volatile; the qualifier matters when an asm is executed
    // for its side effects rather than for its outputs.
    asm volatile("mov %0, #42" : "=r" (counter));

    printf("Counter: %ld\n", counter);  // Prints 42
}

GPIO Hardware Access

Raspberry Pi GPIO Memory Map

// GPIO register base addresses (physical address of the GPIO block)
// Raspberry Pi 4 (BCM2711):             0xFE200000
// Raspberry Pi 2/3 (BCM2836/BCM2837):   0x3F200000
// Raspberry Pi 1/Zero (BCM2835):        0x20200000
// NOTE(review): Raspberry Pi 5 routes its GPIO through the separate RP1 I/O
// controller, which has a different address and register layout, so these
// definitions are not expected to apply there - confirm against the RP1 docs.

#define BCM2711_PERI_BASE    0xFE000000  // Pi 4 (BCM2711)
#define GPIO_BASE            (BCM2711_PERI_BASE + 0x200000)

// GPIO Function Select Registers (0-5; only the first three are defined here)
#define GPFSEL0              (GPIO_BASE + 0x00)
#define GPFSEL1              (GPIO_BASE + 0x04)
#define GPFSEL2              (GPIO_BASE + 0x08)

// GPIO Pin Output Set Registers (0-1)
#define GPSET0               (GPIO_BASE + 0x1C)
#define GPSET1               (GPIO_BASE + 0x20)

// GPIO Pin Output Clear Registers (0-1)
#define GPCLR0               (GPIO_BASE + 0x28)
#define GPCLR1               (GPIO_BASE + 0x2C)

// GPIO Pin Level Registers (0-1)
#define GPLEV0               (GPIO_BASE + 0x34)
#define GPLEV1               (GPIO_BASE + 0x38)

GPIO Assembly Functions

gpio_functions.s:

.global gpio_init
.global gpio_set_output
.global gpio_set_high
.global gpio_set_low
.global gpio_read

// Base address for GPIO (Raspberry Pi 4, BCM2711)
.equ GPIO_BASE, 0xFE200000

// Offsets
.equ GPFSEL0, 0x00
.equ GPSET0,  0x1C
.equ GPCLR0,  0x28
.equ GPLEV0,  0x34

.section .text

// void gpio_init(void);
// Placeholder: performs no work and returns immediately.
// It is intended as the place where GPIO memory would be mapped
// (e.g. via /dev/mem or /dev/gpiomem), but no mapping is done here.
gpio_init:
    // In practice, you'd use mmap() from C
    // This is a placeholder
    ret

// void gpio_set_output(int pin);
// Configures a GPIO pin as an output by writing function code 001 into the
// pin's 3-bit field of the matching GPFSEL register (10 pins per register).
// In:  w0 = pin number (int, so only the low 32 bits of x0 are meaningful)
// Scratch: x1-x5
// NOTE: dereferences the fixed address 0xFE200000 directly, so it only works
// where that address is accessible as-is (e.g. bare metal). Under Linux use a
// pointer obtained from mmap instead.
gpio_set_output:
    // Locate the field: register index = pin / 10, bit offset = (pin % 10) * 3
    mov     w2, #10
    udiv    w1, w0, w2          // w1 = pin / 10 (upper half of x1 is zeroed)
    msub    w2, w1, w2, w0      // w2 = pin % 10
    add     w2, w2, w2, lsl #1  // w2 = (pin % 10) * 3

    // Load GPIO base address
    movz    x3, #0xFE20, lsl #16    // x3 = 0xFE200000

    // Read current GPFSEL value (registers are 4 bytes apart)
    ldr     w4, [x3, x1, lsl #2]

    // Clear the 3 bits for this pin
    mov     w5, #7              // 0b111
    lsl     w5, w5, w2          // move the mask onto the pin's field
    bic     w4, w4, w5

    // Set to output (001)
    mov     w5, #1
    lsl     w5, w5, w2
    orr     w4, w4, w5

    // Write back
    str     w4, [x3, x1, lsl #2]
    ret

// void gpio_set_high(int pin);
// Drives a pin high by writing a one-hot mask to GPSET0 (base + 0x1C).
// In: x0 = pin number
gpio_set_high:
    // Build the mask first: 1 << pin
    mov     x2, #1
    lsl     x2, x2, x0

    // GPIO base 0xFE200000; GPSET0 is addressed by the store offset
    movz    x1, #0xFE20, lsl #16
    str     w2, [x1, #0x1C]
    ret

// void gpio_set_low(int pin);
// Drives a pin low by writing a one-hot mask to GPCLR0 (base + 0x28).
// In: x0 = pin number
gpio_set_low:
    // Build the mask first: 1 << pin
    mov     x2, #1
    lsl     x2, x2, x0

    // GPIO base 0xFE200000; GPCLR0 is addressed by the store offset
    movz    x1, #0xFE20, lsl #16
    str     w2, [x1, #0x28]
    ret

// int gpio_read(int pin);
// Returns the current level (0 or 1) of a pin, read from GPLEV0 (base + 0x34).
// In:  w0 = pin number
// Out: w0 = pin level
gpio_read:
    movz    x1, #0xFE20, lsl #16    // GPIO base 0xFE200000
    ldr     w2, [x1, #0x34]         // snapshot of GPLEV0

    lsr     w2, w2, w0              // bring the pin's bit down to bit 0
    and     w0, w2, #1              // keep only that bit
    ret

led_blink.cpp:

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>

#define BCM2711_PERI_BASE 0xFE000000
#define GPIO_BASE (BCM2711_PERI_BASE + 0x200000)
#define PAGE_SIZE 4096
#define BLOCK_SIZE 4096

// Assembly functions
extern "C" {
    void gpio_setup_output_asm(volatile unsigned int* gpio, int pin);
    void gpio_write_asm(volatile unsigned int* gpio, int pin, int value);
}

volatile unsigned int *gpio = nullptr;

// Maps one BLOCK_SIZE window of /dev/gpiomem and publishes it through the
// global `gpio` pointer. On any failure an error is reported with perror and
// `gpio` ends up null, which is how the caller detects the failure.
void setup_gpio() {
    const int fd = open("/dev/gpiomem", O_RDWR | O_SYNC);
    if (fd < 0) {
        perror("Failed to open /dev/gpiomem");
        return;
    }

    void* const mapping =
        mmap(nullptr, BLOCK_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

    // The descriptor is closed right after the mmap call, before its result
    // is examined.
    close(fd);

    if (mapping == MAP_FAILED) {
        perror("mmap failed");
        gpio = nullptr;
        return;
    }

    gpio = static_cast<volatile unsigned int*>(mapping);
}

// Blinks an LED on GPIO 17 forever: 500 ms on, 500 ms off.
// Exits with status 1 if the GPIO registers could not be mapped.
int main() {
    const int LED_PIN = 17;  // GPIO 17

    setup_gpio();
    if (gpio == nullptr) {
        return 1;
    }

    // Switch the pin to output mode (assembly routine)
    gpio_setup_output_asm(gpio, LED_PIN);

    printf("Blinking LED on GPIO %d (Ctrl+C to stop)\n", LED_PIN);

    for (;;) {
        // level 1 = LED on, level 0 = LED off; hold each for 500 ms
        for (int level = 1; level >= 0; --level) {
            gpio_write_asm(gpio, LED_PIN, level);
            usleep(500000);
        }
    }
}

led_blink.s:

.global gpio_setup_output_asm
.global gpio_write_asm

// void gpio_setup_output_asm(volatile unsigned int* gpio, int pin);
// Selects "output" (function code 001) for one pin in its GPFSEL register.
// In: x0 = mapped GPIO register base, w1 = pin number
// Scratch: x9-x14
gpio_setup_output_asm:
    // Ten pins per GPFSEL register, three bits per pin
    mov     w9, #10
    udiv    w10, w1, w9             // w10 = register index = pin / 10
    msub    w11, w10, w9, w1        // w11 = pin % 10
    add     w11, w11, w11, lsl #1   // w11 = bit offset = (pin % 10) * 3

    // Prepare both masks before touching the register
    mov     w12, #7
    lsl     w12, w12, w11           // field mask: 0b111 at the pin's field
    mov     w13, #1
    lsl     w13, w13, w11           // output code: 0b001 at the pin's field

    // Read-modify-write the selected GPFSEL register
    ldr     w14, [x0, x10, lsl #2]
    bic     w14, w14, w12
    orr     w14, w14, w13
    str     w14, [x0, x10, lsl #2]
    ret

// void gpio_write_asm(volatile unsigned int* gpio, int pin, int value);
// Drives a pin high (value != 0) or low (value == 0).
// In: x0 = mapped GPIO register base, w1 = pin number, w2 = value
// Scratch: w3
//
// Store offsets are in BYTES, not 32-bit word indices: GPSET0 is at byte
// offset 0x1C and GPCLR0 at byte offset 0x28.
gpio_write_asm:
    mov     w3, #1
    lsl     w3, w3, w1          // w3 = 1 << pin (one-hot mask)
    cbz     w2, gpio_clear

gpio_set:
    str     w3, [x0, #0x1C]     // Write to GPSET0
    ret

gpio_clear:
    str     w3, [x0, #0x28]     // Write to GPCLR0
    ret

Build and Run:

g++ -c led_blink.cpp -o led_blink_cpp.o
as -o led_blink_asm.o led_blink.s
g++ led_blink_cpp.o led_blink_asm.o -o led_blink
sudo ./led_blink

Performance Optimization Examples

Example 1: Optimized Memory Copy

C++ Version:

// Copies n bytes from src to dest one byte at a time, lowest address first.
void memcpy_cpp(void* dest, const void* src, size_t n) {
    char* out = static_cast<char*>(dest);
    const char* in = static_cast<const char*>(src);
    while (n != 0) {
        *out++ = *in++;
        --n;
    }
}

Optimized Assembly:

// void memcpy_asm(void* dest, const void* src, size_t n);
// Copies n bytes from src to dest in three phases: 64-byte chunks, then
// 8-byte words, then single bytes.
// In: dest=x0, src=x1, n=x2
// Scratch: x3-x10
// n is an unsigned size_t, so every size comparison uses the unsigned
// conditions lo / hs rather than the signed lt / ge.
.global memcpy_asm

memcpy_asm:
    cbz     x2, copy_done       // Return if n == 0

    // Copy 64 bytes at a time if possible
    cmp     x2, #64
    b.lo    copy_small

copy_64:
    // Four paired loads, then four paired stores (8 x 8 bytes)
    ldp     x3, x4, [x1], #16
    ldp     x5, x6, [x1], #16
    ldp     x7, x8, [x1], #16
    ldp     x9, x10, [x1], #16

    stp     x3, x4, [x0], #16
    stp     x5, x6, [x0], #16
    stp     x7, x8, [x0], #16
    stp     x9, x10, [x0], #16

    sub     x2, x2, #64
    cmp     x2, #64
    b.hs    copy_64

copy_small:
    cbz     x2, copy_done

    // Copy 8 bytes at a time
    cmp     x2, #8
    b.lo    copy_bytes

copy_8:
    ldr     x3, [x1], #8
    str     x3, [x0], #8
    sub     x2, x2, #8
    cmp     x2, #8
    b.hs    copy_8

copy_bytes:
    // Tail: fewer than 8 bytes remain
    cbz     x2, copy_done
    ldrb    w3, [x1], #1
    strb    w3, [x0], #1
    sub     x2, x2, #1
    b       copy_bytes

copy_done:
    ret

Example 2: SIMD Vector Addition

C++ Version:

// Element-wise sum: result[i] = a[i] + b[i] for i in [0, n), in ascending
// index order. Does nothing when n <= 0.
void add_arrays_cpp(float* result, const float* a, const float* b, int n) {
    int idx = 0;
    while (idx < n) {
        const float lhs = a[idx];
        const float rhs = b[idx];
        result[idx] = lhs + rhs;
        ++idx;
    }
}

NEON SIMD Assembly:

// void add_arrays_simd(float* result, const float* a, const float* b, int n);
// Element-wise float addition: four lanes per iteration with NEON, then a
// scalar loop for the remaining 0-3 elements.
// In: result=x0, a=x1, b=x2, n=w3 (signed int)
// Scratch: v0, v1
.global add_arrays_simd

add_arrays_simd:
    // Nothing to do for n <= 0. A negative count must not reach the scalar
    // loop, which only stops when its counter hits exactly zero.
    cmp     w3, #0
    b.le    add_done

    // Process 4 floats at a time using NEON
    cmp     w3, #4
    b.lt    add_remainder

add_simd_loop:
    ld1     {v0.4s}, [x1], #16  // Load 4 floats from a
    ld1     {v1.4s}, [x2], #16  // Load 4 floats from b
    fadd    v0.4s, v0.4s, v1.4s // Add vectors
    st1     {v0.4s}, [x0], #16  // Store result

    sub     w3, w3, #4
    cmp     w3, #4
    b.ge    add_simd_loop

add_remainder:
    cbz     w3, add_done

    // Process remaining floats one by one (w3 is 1..3 here)
add_scalar_loop:
    ldr     s0, [x1], #4
    ldr     s1, [x2], #4
    fadd    s0, s0, s1
    str     s0, [x0], #4
    subs    w3, w3, #1
    b.ne    add_scalar_loop

add_done:
    ret

Example 3: Dot Product Optimization

Unoptimized:

// Dot product of two length-n arrays, accumulated left to right.
// Returns 0.0 when n <= 0.
double dot_product_cpp(const double* a, const double* b, int n) {
    double sum = 0.0;
    int idx = 0;
    while (idx < n) {
        sum += a[idx] * b[idx];
        ++idx;
    }
    return sum;
}

Optimized with Loop Unrolling:

// double dot_product_asm(const double* a, const double* b, int n);
// Dot product with the main loop unrolled four elements per iteration and a
// scalar loop for the remaining 0-3 elements.
// In:  a=x0, b=x1, n=w2 (signed int)
// Out: d0 = sum of a[i] * b[i]; 0.0 when n <= 0
// Scratch: d1-d4
.global dot_product_asm

dot_product_asm:
    fmov    d0, xzr             // sum = 0.0

    // Nothing to do for n <= 0. A negative count must not reach the
    // remainder loop, which only stops when its counter hits exactly zero.
    cmp     w2, #0
    b.le    dot_done

    // Unroll loop by 4
    cmp     w2, #4
    b.lt    dot_remainder

dot_unrolled:
    ldp     d1, d2, [x0], #16   // Load a[i], a[i+1]
    ldp     d3, d4, [x1], #16   // Load b[i], b[i+1]
    fmul    d1, d1, d3          // a[i] * b[i]
    fmadd   d0, d2, d4, d0      // sum += a[i+1] * b[i+1]
    fadd    d0, d0, d1          // sum += a[i] * b[i]

    ldp     d1, d2, [x0], #16   // Load a[i+2], a[i+3]
    ldp     d3, d4, [x1], #16   // Load b[i+2], b[i+3]
    fmadd   d0, d1, d3, d0      // sum += a[i+2] * b[i+2]
    fmadd   d0, d2, d4, d0      // sum += a[i+3] * b[i+3]

    sub     w2, w2, #4
    cmp     w2, #4
    b.ge    dot_unrolled

dot_remainder:
    // w2 is 0..3 here
    cbz     w2, dot_done
    ldr     d1, [x0], #8
    ldr     d2, [x1], #8
    fmadd   d0, d1, d2, d0
    subs    w2, w2, #1
    b.ne    dot_remainder

dot_done:
    ret

Debugging Mixed C++ and Assembly

Using GDB

# Compile with debug symbols
g++ -g -c main.cpp -o main.o
as -g -o functions.o functions.s
g++ -g main.o functions.o -o program

# Debug
gdb ./program

# Useful commands:
(gdb) break main                # Break at C++ main
(gdb) break add_numbers         # Break at assembly function
(gdb) run
(gdb) stepi                     # Step one assembly instruction
(gdb) info registers            # Show all registers
(gdb) x/10i $pc                 # Disassemble next 10 instructions
(gdb) layout asm                # Show assembly in TUI mode
(gdb) layout split              # Show source and assembly

Inline Assembly Debugging

// Adds a and b inside an asm statement bracketed by two nop instructions.
// The nops do nothing at run time; they make the add easy to spot in a
// disassembly and give convenient instructions to set breakpoints on.
// The long result is converted to int on return.
int debug_example() {
    long a = 10, b = 20, result;

    asm volatile(
        "nop\n"              // Breakpoint marker (before the add)
        "add %[res], %[x], %[y]\n"
        "nop\n"              // Breakpoint marker (after the add)
        : [res] "=r" (result)
        : [x] "r" (a), [y] "r" (b)
    );

    return result;
}

Best Practices

1. Use extern "C" for Assembly Functions

extern "C" {
    void my_asm_function();  // Prevents C++ name mangling
}

2. Document Register Usage

// Function: process_data
// Inputs:
//   x0: pointer to data
//   x1: data length
// Outputs:
//   x0: result
// Clobbers:
//   x2, x3, x4
process_data:
    // ...

3. Preserve Stack Alignment

// Always maintain 16-byte alignment
stp     x29, x30, [sp, #-16]!   // Good
str     x29, [sp, #-8]!          // Bad - misaligns stack!

4. Use Appropriate Optimization Flags

# For C++
g++ -O2 -c main.cpp

# Don't over-optimize assembly
as -o functions.o functions.s  # No -O flags needed

5. Benchmark Your Code

#include <chrono>
#include <cstdio>   // std::printf

// Times a single call to asm_function() and prints the elapsed wall time in
// microseconds.
void benchmark() {
    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by system clock adjustments made while the function runs.
    using Clock = std::chrono::steady_clock;

    const auto start = Clock::now();
    // Call your assembly function
    asm_function();
    const auto end = Clock::now();

    const auto elapsed =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    // count() returns an implementation-defined integer type; convert it
    // explicitly so the format specifier is correct on every platform.
    std::printf("Time: %lld μs\n", static_cast<long long>(elapsed.count()));
}

Summary

In this tutorial, we covered:

C++ Integration

  • ✅ Calling assembly from C++ (extern "C")
  • ✅ Calling C++ from assembly
  • ✅ Passing structures and complex types
  • ✅ C++ member function calls

Inline Assembly

  • ✅ Basic syntax and constraints
  • ✅ Input/output operands
  • ✅ Clobbered registers
  • ✅ Memory operations
  • ✅ Volatile keyword usage

GPIO Control

  • ✅ Memory-mapped I/O
  • ✅ GPIO register layout
  • ✅ Setting pins as output
  • ✅ Reading and writing GPIO
  • ✅ Complete LED blink example

Performance Optimization

  • ✅ Optimized memory copy
  • ✅ SIMD/NEON vector operations
  • ✅ Loop unrolling
  • ✅ Dot product optimization

Debugging

  • ✅ GDB with mixed code
  • ✅ Breakpoints in assembly
  • ✅ Register inspection

Best Practices

  • ✅ extern "C" linkage
  • ✅ Documentation
  • ✅ Stack alignment
  • ✅ Optimization strategies
  • ✅ Benchmarking

Conclusion

Congratulations! You've completed the Arm64 Assembly tutorial series. You now have the knowledge to:

  • Write efficient assembly code for Raspberry Pi
  • Interface seamlessly between C++ and assembly
  • Control hardware directly through GPIO
  • Optimize performance-critical code sections
  • Debug complex mixed-language programs

Where to Go Next

  1. Practice: Rewrite portions of your existing projects in assembly
  2. Experiment: Try controlling different peripherals (SPI, I2C, PWM)
  3. Optimize: Profile your code and optimize hotspots
  4. Learn More: Study the ARM Architecture Reference Manual
  5. Contribute: Share your assembly libraries with the community

Remember that assembly should be used judiciously - only for performance-critical sections or hardware access where C++ isn't sufficient. Modern compilers are excellent at optimization, so measure before you optimize!

Happy coding!