Interfacing with C++ and GPIO Control¶

Introduction¶

In this final tutorial, we'll bring together everything we've learned and apply it to real-world Raspberry Pi programming. We'll explore how to seamlessly integrate assembly code with C++, access hardware directly through GPIO registers, and optimize performance-critical sections of your programs.

Understanding assembly-C++ integration is valuable for:

Performance Optimization: Hand-optimize critical inner loops
Hardware Access: Direct control of memory-mapped peripherals
Learning: Understand what the compiler generates
Legacy Code: Interface with existing assembly libraries
Embedded Systems: Write bare-metal code when needed

This tutorial covers calling conventions between C++ and assembly, inline assembly syntax, direct GPIO manipulation, and complete practical examples.

Calling Assembly from C++¶

External Assembly Function¶

C++ Declaration (main.cpp):

// main.cpp - calls the routines implemented in functions.s
#include <cstdio>   // printf, fflush

// The assembly routines export plain (unmangled) symbols, so they are
// declared with C linkage.
extern "C" {
    long add_numbers(long a, long b);       // returns a + b
    void print_message(const char* msg);    // writes the NUL-terminated msg to stdout
    long factorial(long n);                 // returns n! (1 when n <= 1)
}

int main() {
    long result = add_numbers(10, 20);
    printf("10 + 20 = %ld\n", result);

    // print_message() issues a raw write syscall that bypasses stdio's buffer.
    // Flush first so the lines keep their order when stdout is a pipe or file.
    fflush(stdout);
    print_message("Hello from assembly!\n");

    long fact = factorial(5);
    printf("5! = %ld\n", fact);

    return 0;
}

Assembly Implementation (functions.s):

// functions.s - Assembly functions callable from C++

.global add_numbers
.global print_message
.global factorial

.section .text

// long add_numbers(long a, long b);
// Leaf function: needs no stack frame.
// In:  x0 = a, x1 = b
// Out: x0 = a + b
add_numbers:
    add     x0, x1, x0              // sum goes straight into the return register
    ret

// void print_message(const char* msg);
// Writes the NUL-terminated string msg to stdout with a raw write syscall.
// In: x0 = msg
// x19 (callee-saved) keeps msg alive across the strlen call and is restored
// before returning.
print_message:
    // One 32-byte frame: frame record at [sp], x19 spill slot at [sp, #16].
    stp     x29, x30, [sp, #-32]!
    mov     x29, sp
    str     x19, [sp, #16]

    mov     x19, x0                 // msg must survive the call below
    bl      strlen                  // x0 = length (helper defined in this file)

    // write(1, msg, length)
    mov     x2, x0                  // count
    mov     x1, x19                 // buffer
    mov     x0, #1                  // fd 1 = stdout
    mov     x8, #64                 // syscall number: write
    svc     #0

    ldr     x19, [sp, #16]
    ldp     x29, x30, [sp], #32
    ret

// long factorial(long n);
// Recursive factorial; any n <= 1 (including negatives) yields 1.
// In:  x0 = n
// Out: x0 = n!
factorial:
    cmp     x0, #1
    b.gt    .Lfact_recurse          // signed compare: n > 1 needs recursion

    // Base case: no frame is needed.
    mov     x0, #1
    ret

.Lfact_recurse:
    // 32-byte frame: frame record plus a slot for callee-saved x19.
    stp     x29, x30, [sp, #-32]!
    mov     x29, sp
    str     x19, [sp, #16]

    mov     x19, x0                 // keep n across the recursive call
    sub     x0, x19, #1
    bl      factorial               // x0 = (n - 1)!
    mul     x0, x0, x19             // x0 = (n - 1)! * n

    ldr     x19, [sp, #16]
    ldp     x29, x30, [sp], #32
    ret

// Helper: file-local strlen
// In:  x0 = pointer to a NUL-terminated string
// Out: x0 = number of bytes before the terminating NUL
// Clobbers: x1, w2
strlen:
    mov     x1, x0                  // x1 = cursor, x0 stays at the start
.Lstrlen_scan:
    ldrb    w2, [x1], #1            // fetch byte, advance cursor
    cbnz    w2, .Lstrlen_scan
    sub     x0, x1, x0              // bytes consumed, NUL included
    sub     x0, x0, #1              // drop the NUL from the count
    ret

Build:

# Compile C++ and assembly separately
g++ -c main.cpp -o main.o
as -o functions.o functions.s

# Link together
g++ main.o functions.o -o program

# Run
./program

Output:

10 + 20 = 30
Hello from assembly!
5! = 120

Passing Complex Types¶

C++ Code:

#include <cstdio>   // printf

// Two 64-bit integers; the assembly side receives them in x0/x1 (first
// argument) and x2/x3 (second argument).
struct Point {
    long x;
    long y;
};

// Three doubles; the assembly side receives them in d0, d1, d2.
struct Vector3 {
    double x;
    double y;
    double z;
};

// Implemented in assembly; C linkage keeps the symbol names unmangled.
extern "C" {
    long point_distance_squared(Point p1, Point p2);   // dx*dx + dy*dy
    double vector3_length(Vector3 v);                  // sqrt(x*x + y*y + z*z)
}

int main() {
    Point p1 = {3, 4};
    Point p2 = {6, 8};
    long dist_sq = point_distance_squared(p1, p2);
    printf("Distance squared: %ld\n", dist_sq);  // Output: 25

    Vector3 v = {3.0, 4.0, 0.0};
    double len = vector3_length(v);
    printf("Vector length: %f\n", len);  // Output: 5.0

    return 0;
}

Assembly Code:

.global point_distance_squared
.global vector3_length

// long point_distance_squared(Point p1, Point p2);
// In:  x0 = p1.x, x1 = p1.y, x2 = p2.x, x3 = p2.y
// Out: x0 = (p2.x - p1.x)^2 + (p2.y - p1.y)^2
point_distance_squared:
    sub     x2, x2, x0              // x2 = dx
    sub     x3, x3, x1              // x3 = dy
    mul     x0, x2, x2              // x0 = dx * dx
    madd    x0, x3, x3, x0          // x0 = dy * dy + dx * dx
    ret

// double vector3_length(Vector3 v);
// In:  d0 = v.x, d1 = v.y, d2 = v.z
// Out: d0 = sqrt(x*x + y*y + z*z)
vector3_length:
    fmul    d0, d0, d0              // acc = x * x
    fmadd   d0, d1, d1, d0          // acc = y * y + acc
    fmadd   d0, d2, d2, d0          // acc = z * z + acc
    fsqrt   d0, d0
    ret

Calling C++ from Assembly¶

Calling C Standard Library¶

// Calls printf from assembly.
// The program is linked through gcc, whose C runtime already provides _start
// and calls main, so the entry point here must be main: defining _start again
// would clash with the runtime's symbol. Returning from main (instead of
// issuing the exit syscall directly) also lets the runtime flush stdio, so the
// printf output is not lost when stdout is redirected.
.global main

.section .rodata
format: .asciz "Result: %d\n"

.section .data
    .balign 4                       // keep the 32-bit word naturally aligned
value:  .word  42

.section .text
// int main(void);
// Out: w0 = 0 (process exit status)
main:
    // Frame record: printf is a real call, so x30 must be saved
    stp     x29, x30, [sp, #-16]!
    mov     x29, sp

    // printf(format, value) - PC-relative addressing works with and without PIE
    adrp    x0, format
    add     x0, x0, :lo12:format
    adrp    x1, value
    ldr     w1, [x1, :lo12:value]   // Load the value

    bl      printf

    // return 0
    mov     w0, #0
    ldp     x29, x30, [sp], #16
    ret

Build with C runtime:

as -o program.o program.s
gcc program.o -o program -no-pie
./program

Calling C++ Member Functions¶

C++ Class:

// calculator.hpp
// Minimal arithmetic class used to demonstrate calling C++ from assembly.
class Calculator {
public:
    long add(long a, long b);
    long multiply(long a, long b);
};

// calculator.cpp
long Calculator::add(long a, long b) {
    const long sum = a + b;
    return sum;
}

long Calculator::multiply(long a, long b) {
    const long product = a * b;
    return product;
}

// Flat C-linkage entry points for the assembly side: the object pointer comes
// first, followed by the member function's own arguments.
extern "C" {
    long calculator_add(void* obj, long a, long b) {
        return static_cast<Calculator*>(obj)->add(a, b);
    }

    long calculator_multiply(void* obj, long a, long b) {
        return static_cast<Calculator*>(obj)->multiply(a, b);
    }
}

Assembly Code:

.global use_calculator

.extern calculator_add
.extern calculator_multiply

// void use_calculator(void* calc);
// Calls calculator_add(calc, 10, 20) and calculator_multiply(calc, 5, 6)
// through the C wrappers; both results are discarded.
// In: x0 = calc
use_calculator:
    // 32-byte frame: frame record at [sp], x19 spill slot at [sp, #16]
    sub     sp, sp, #32
    stp     x29, x30, [sp]
    mov     x29, sp
    str     x19, [sp, #16]

    mov     x19, x0                 // object pointer must survive both calls

    // calculator_add(calc, 10, 20) - x0 still holds calc here
    mov     x2, #20                 // b
    mov     x1, #10                 // a
    bl      calculator_add
    // x0 = 30

    // calculator_multiply(calc, 5, 6)
    mov     x2, #6                  // b
    mov     x1, #5                  // a
    mov     x0, x19                 // object pointer
    bl      calculator_multiply
    // x0 = 30

    ldr     x19, [sp, #16]
    ldp     x29, x30, [sp]
    add     sp, sp, #32
    ret

Inline Assembly in C++¶

Basic Inline Assembly Syntax¶

#include <cstdio>   // printf

int main() {
    long a = 10, b = 20, result;

    // Basic inline assembly: %0 is the output, %1 and %2 are the inputs,
    // numbered in the order the operands are listed below.
    asm("add %0, %1, %2"
        : "=r" (result)      // Output: result in register
        : "r" (a), "r" (b)   // Inputs: a and b in registers
    );

    printf("Result: %ld\n", result);  // Output: 30
    return 0;
}

Extended Assembly Examples¶

Arithmetic Operations:

// Runs add, sub and mul through inline assembly using named operands and
// prints each result.
void inline_examples() {
    long lhs = 100;
    long rhs = 50;
    long out;

    // out = lhs + rhs
    asm("add %[dst], %[l], %[r]"
        : [dst] "=r" (out)
        : [l] "r" (lhs), [r] "r" (rhs));
    printf("Add: %ld\n", out);  // 150

    // out = lhs - rhs
    asm("sub %[dst], %[l], %[r]"
        : [dst] "=r" (out)
        : [l] "r" (lhs), [r] "r" (rhs));
    printf("Sub: %ld\n", out);  // 50

    // out = lhs * rhs
    asm("mul %[dst], %[l], %[r]"
        : [dst] "=r" (out)
        : [l] "r" (lhs), [r] "r" (rhs));
    printf("Mul: %ld\n", out);  // 5000
}

Clobbered Registers:

// Uses two hard-coded scratch registers inside the asm body and declares them
// in the clobber list so the compiler keeps no live values in them.
void clobber_example() {
    long total;

    asm volatile(
        "mov x12, #100\n"
        "mov x13, #200\n"
        "add %[sum], x12, x13\n"
        : [sum] "=r" (total)
        : /* no inputs */
        : "x12", "x13"  // scratch registers written by the template
    );

    printf("Result: %ld\n", total);  // 300
}

Memory Operations:

// Sums a 4-element array inside one asm statement and prints the total.
void memory_example() {
    long values[4] = {10, 20, 30, 40};
    long total = 0;

    asm volatile(
        "mov x12, #0\n"                          // running sum
        "mov x13, #0\n"                          // element index
        "2:\n"
        "ldr x14, [%[base], x13, lsl #3]\n"      // x14 = values[index] (8-byte elements)
        "add x12, x12, x14\n"
        "add x13, x13, #1\n"
        "cmp x13, #4\n"
        "b.lt 2b\n"                              // repeat while index < 4
        "mov %[out], x12\n"                      // publish the sum last, after all reads of base
        : [out] "=r" (total)
        : [base] "r" (values)
        : "x12", "x13", "x14", "memory"
    );

    printf("Sum: %ld\n", total);  // 100
}

Volatile Keyword:

// Demonstrates `asm volatile`: the qualifier tells the compiler to keep the
// statement rather than treat it as removable.
void volatile_example() {
    long value = 0;

    asm volatile(
        "mov %[dst], #42"
        : [dst] "=r" (value)
    );

    printf("Counter: %ld\n", value);  // Guaranteed to print 42
}

GPIO Hardware Access¶

Raspberry Pi GPIO Memory Map¶

// GPIO register base addresses (physical addresses, per SoC generation)
// Raspberry Pi 4:   0xFE200000
// Raspberry Pi 3:   0x3F200000
// Raspberry Pi 1/2: 0x20200000
// NOTE(review): the original text listed "Pi 4/5" here. The Pi 5 appears to
// expose its GPIO through the separate RP1 I/O controller, so the BCM2711
// address probably does not apply to it - confirm against Pi 5 documentation.

#define BCM2711_PERI_BASE    0xFE000000  // Pi 4 (BCM2711) peripheral window
#define GPIO_BASE            (BCM2711_PERI_BASE + 0x200000)

// GPIO Function Select Registers (0-5); only the first three are named here
#define GPFSEL0              (GPIO_BASE + 0x00)
#define GPFSEL1              (GPIO_BASE + 0x04)
#define GPFSEL2              (GPIO_BASE + 0x08)

// GPIO Pin Output Set Registers (0-1): writing a 1 bit drives that pin high
#define GPSET0               (GPIO_BASE + 0x1C)
#define GPSET1               (GPIO_BASE + 0x20)

// GPIO Pin Output Clear Registers (0-1): writing a 1 bit drives that pin low
#define GPCLR0               (GPIO_BASE + 0x28)
#define GPCLR1               (GPIO_BASE + 0x2C)

// GPIO Pin Level Registers (0-1): read to sample the current pin state
#define GPLEV0               (GPIO_BASE + 0x34)
#define GPLEV1               (GPIO_BASE + 0x38)

GPIO Assembly Functions¶

gpio_functions.s:

.global gpio_init
.global gpio_set_output
.global gpio_set_high
.global gpio_set_low
.global gpio_read

// Base address for GPIO (Raspberry Pi 4/5)
.equ GPIO_BASE, 0xFE200000

// Offsets
.equ GPFSEL0, 0x00
.equ GPSET0,  0x1C
.equ GPCLR0,  0x28
.equ GPLEV0,  0x34

.section .text

// void gpio_init(void);
// Placeholder for mapping the GPIO registers (via /dev/mem or /dev/gpiomem).
// It currently does nothing and returns immediately, so the other routines in
// this file still access GPIO_BASE as a raw address.
gpio_init:
    // In practice, you'd use mmap() from C
    // This is a placeholder
    ret

// void gpio_set_output(int pin);
// Configures a GPIO pin as an output by writing function code 001 into the
// pin's 3-bit field of its GPFSEL register (10 pins per register).
// In:       w0 = pin (0-57, the GPIO lines of the BCM2711)
// Clobbers: w1, w2, x3, w4, w5, flags
// Out-of-range pins are ignored so no register beyond GPFSEL5 is modified.
// NOTE: GPIO_BASE is dereferenced as-is, so this only works where that
// address is directly accessible (gpio_init does not map anything yet).
gpio_set_output:
    cmp     w0, #57                 // unsigned compare also rejects negative pins
    b.hi    .Lgpio_set_output_done

    // pin is an int, so only w0 is defined: do the arithmetic in 32 bits
    mov     w2, #10
    udiv    w1, w0, w2              // w1 = pin / 10 -> GPFSEL register index
    msub    w2, w1, w2, w0          // w2 = pin % 10 -> field index in that register
    add     w2, w2, w2, lsl #1      // w2 = field index * 3 = bit offset of the field

    ldr     x3, =GPIO_BASE
    add     x3, x3, #GPFSEL0        // x3 = address of GPFSEL0

    // Read-modify-write so the other nine pins in the register are untouched
    ldr     w4, [x3, w1, uxtw #2]

    mov     w5, #7                  // 0b111 field mask
    lsl     w5, w5, w2
    bic     w4, w4, w5              // clear the pin's function field

    mov     w5, #1                  // 0b001 = output
    lsl     w5, w5, w2
    orr     w4, w4, w5

    str     w4, [x3, w1, uxtw #2]

.Lgpio_set_output_done:
    ret

// void gpio_set_high(int pin);
// Drives a GPIO pin high by writing its bit to GPSET0 (pins 0-31) or
// GPSET1 (pins 32-57).
// In:       w0 = pin
// Clobbers: x1, w2, w3, flags
// Out-of-range pins are ignored.
// NOTE: GPIO_BASE is dereferenced as-is, so this only works where that
// address is directly accessible (gpio_init does not map anything yet).
gpio_set_high:
    cmp     w0, #57                 // unsigned compare also rejects negative pins
    b.hi    .Lgpio_set_high_done

    ldr     x1, =GPIO_BASE
    lsr     w2, w0, #5              // bank = pin / 32
    add     x1, x1, w2, uxtw #2     // bank registers are 4 bytes apart

    mov     w3, #1
    lsl     w3, w3, w0              // 32-bit shift uses pin % 32
    str     w3, [x1, #GPSET0]       // zero bits in a set register have no effect

.Lgpio_set_high_done:
    ret

// void gpio_set_low(int pin);
// Drives a GPIO pin low by writing its bit to GPCLR0 (pins 0-31) or
// GPCLR1 (pins 32-57).
// In:       w0 = pin
// Clobbers: x1, w2, w3, flags
// Out-of-range pins are ignored.
// NOTE: GPIO_BASE is dereferenced as-is, so this only works where that
// address is directly accessible (gpio_init does not map anything yet).
gpio_set_low:
    cmp     w0, #57                 // unsigned compare also rejects negative pins
    b.hi    .Lgpio_set_low_done

    ldr     x1, =GPIO_BASE
    lsr     w2, w0, #5              // bank = pin / 32
    add     x1, x1, w2, uxtw #2     // bank registers are 4 bytes apart

    mov     w3, #1
    lsl     w3, w3, w0              // 32-bit shift uses pin % 32
    str     w3, [x1, #GPCLR0]       // zero bits in a clear register have no effect

.Lgpio_set_low_done:
    ret

// int gpio_read(int pin);
// Samples a GPIO pin from GPLEV0 (pins 0-31) or GPLEV1 (pins 32-57).
// In:       w0 = pin
// Out:      w0 = 1 if the pin reads high, 0 if low or if pin is out of range
// Clobbers: x1, w2, flags
// NOTE: GPIO_BASE is dereferenced as-is, so this only works where that
// address is directly accessible (gpio_init does not map anything yet).
gpio_read:
    cmp     w0, #57                 // unsigned compare also rejects negative pins
    b.hi    .Lgpio_read_invalid

    ldr     x1, =GPIO_BASE
    lsr     w2, w0, #5              // bank = pin / 32
    add     x1, x1, w2, uxtw #2     // bank registers are 4 bytes apart
    ldr     w2, [x1, #GPLEV0]

    lsr     w2, w2, w0              // 32-bit shift uses pin % 32: pin bit -> bit 0
    and     w0, w2, #1
    ret

.Lgpio_read_invalid:
    mov     w0, #0
    ret

Complete LED Blink Example¶

led_blink.cpp:

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>

#define BCM2711_PERI_BASE 0xFE000000
#define GPIO_BASE (BCM2711_PERI_BASE + 0x200000)
#define PAGE_SIZE 4096
#define BLOCK_SIZE 4096

// Assembly functions (implemented in led_blink.s); C linkage keeps the
// symbol names unmangled. Both take the mapped register base as `gpio`.
extern "C" {
    // Configure `pin` as an output.
    void gpio_setup_output_asm(volatile unsigned int* gpio, int pin);
    // Drive `pin` high when `value` is non-zero, low when it is zero.
    void gpio_write_asm(volatile unsigned int* gpio, int pin, int value);
}

// Base of the mapped GPIO register block; stays nullptr until setup_gpio()
// succeeds.
volatile unsigned int *gpio = nullptr;

// Maps BLOCK_SIZE bytes of /dev/gpiomem and stores the mapping in the global
// `gpio`. On any failure an error is printed and `gpio` ends up nullptr.
void setup_gpio() {
    const int fd = open("/dev/gpiomem", O_RDWR | O_SYNC);
    if (fd < 0) {
        perror("Failed to open /dev/gpiomem");
        return;
    }

    void* const mapping =
        mmap(nullptr, BLOCK_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

    // The descriptor is closed straight after mmap(), whether or not it succeeded.
    close(fd);

    if (mapping == MAP_FAILED) {
        perror("mmap failed");
        gpio = nullptr;
        return;
    }

    gpio = (volatile unsigned int*)mapping;
}

// Blinks an LED on GPIO 17 forever: 500 ms on, 500 ms off.
// Returns 1 if the GPIO registers could not be mapped.
int main() {
    constexpr int kLedPin = 17;  // GPIO 17

    setup_gpio();
    if (gpio == nullptr) {
        return 1;
    }

    // Pin direction is configured by the assembly helper
    gpio_setup_output_asm(gpio, kLedPin);

    printf("Blinking LED on GPIO %d (Ctrl+C to stop)\n", kLedPin);

    for (;;) {
        // level 1 = LED on, level 0 = LED off; each phase lasts 500 ms
        for (int level = 1; level >= 0; --level) {
            gpio_write_asm(gpio, kLedPin, level);
            usleep(500000);
        }
    }

    return 0;
}

led_blink.s:

.global gpio_setup_output_asm
.global gpio_write_asm

// void gpio_setup_output_asm(volatile unsigned int* gpio, int pin);
// Sets the pin's 3-bit function field in its GPFSEL register to 001 (output).
// In:       x0 = gpio (mapped register base), w1 = pin
// Clobbers: x9-x14 (caller-saved scratch registers)
gpio_setup_output_asm:
    // Split the pin number: 10 pins per GPFSEL register
    mov     w9, #10
    udiv    w10, w1, w9             // w10 = pin / 10 -> register index
    msub    w11, w10, w9, w1        // w11 = pin % 10 -> field index
    add     w11, w11, w11, lsl #1   // w11 = field index * 3 = bit offset

    // Build both masks before touching the register
    mov     w12, #7                 // 0b111: clears the field
    lsl     w12, w12, w11
    mov     w13, #1                 // 0b001: output function
    lsl     w13, w13, w11

    // Read-modify-write the selected GPFSEL register
    // (writing w10 above zeroed the upper half of x10, so x10 is a valid index)
    ldr     w14, [x0, x10, lsl #2]
    bic     w14, w14, w12
    orr     w14, w14, w13
    str     w14, [x0, x10, lsl #2]
    ret

// void gpio_write_asm(volatile unsigned int* gpio, int pin, int value);
// Drives a pin high (value != 0) or low (value == 0) by writing the pin's bit
// to GPSET0 or GPCLR0.
// In:       x0 = gpio (mapped register base), w1 = pin (0-31), w2 = value
// Clobbers: w3
//
// The immediate in a load/store address is a BYTE offset, so the registers
// are addressed at 0x1C and 0x28. Their word indices (7 and 10) are only
// correct for C-style indexing of an unsigned int array.
.equ GPIO_WRITE_GPSET0, 0x1C        // byte offset of GPSET0
.equ GPIO_WRITE_GPCLR0, 0x28        // byte offset of GPCLR0

gpio_write_asm:
    mov     w3, #1
    lsl     w3, w3, w1              // w3 = 1 << pin
    cbz     w2, .Lgpio_write_clear

    str     w3, [x0, #GPIO_WRITE_GPSET0]    // set: pin goes high
    ret

.Lgpio_write_clear:
    str     w3, [x0, #GPIO_WRITE_GPCLR0]    // clear: pin goes low
    ret

Build and Run:

g++ -c led_blink.cpp -o led_blink_cpp.o
as -o led_blink_asm.o led_blink.s
g++ led_blink_cpp.o led_blink_asm.o -o led_blink
sudo ./led_blink

Performance Optimization Examples¶

Example 1: Optimized Memory Copy¶

C++ Version:

// Byte-by-byte copy of n bytes from src to dest, lowest address first.
void memcpy_cpp(void* dest, const void* src, size_t n) {
    char* out = static_cast<char*>(dest);
    const char* in = static_cast<const char*>(src);
    for (size_t remaining = n; remaining != 0; --remaining) {
        *out++ = *in++;
    }
}

Optimized Assembly:

// void memcpy_asm(void* dest, const void* src, size_t n);
// Copies n bytes from src to dest in three phases: 64-byte blocks, then
// 8-byte words, then single bytes.
// In:       x0 = dest, x1 = src, x2 = n
// Clobbers: x0-x10, flags (all caller-saved)
// n is a size_t, so every size comparison uses the unsigned conditions
// (lo / hs) rather than the signed lt / ge.
.global memcpy_asm

memcpy_asm:
    cbz     x2, .Lcopy_done         // Return if n == 0

    // Copy 64 bytes at a time if possible
    cmp     x2, #64
    b.lo    .Lcopy_small

.Lcopy_64:
    // Load the whole block before storing any of it
    ldp     x3, x4, [x1], #16
    ldp     x5, x6, [x1], #16
    ldp     x7, x8, [x1], #16
    ldp     x9, x10, [x1], #16

    stp     x3, x4, [x0], #16
    stp     x5, x6, [x0], #16
    stp     x7, x8, [x0], #16
    stp     x9, x10, [x0], #16

    sub     x2, x2, #64
    cmp     x2, #64
    b.hs    .Lcopy_64               // at least one more full block left

.Lcopy_small:
    cbz     x2, .Lcopy_done

    // Copy 8 bytes at a time
    cmp     x2, #8
    b.lo    .Lcopy_bytes

.Lcopy_8:
    ldr     x3, [x1], #8
    str     x3, [x0], #8
    sub     x2, x2, #8
    cmp     x2, #8
    b.hs    .Lcopy_8

.Lcopy_bytes:
    // Fewer than 8 bytes remain here
    cbz     x2, .Lcopy_done
    ldrb    w3, [x1], #1
    strb    w3, [x0], #1
    sub     x2, x2, #1
    b       .Lcopy_bytes

.Lcopy_done:
    ret

Example 2: SIMD Vector Addition¶

C++ Version:

// Element-wise sum: result[i] = a[i] + b[i] for the first n elements.
// Does nothing when n <= 0.
void add_arrays_cpp(float* result, const float* a, const float* b, int n) {
    for (int remaining = n; remaining > 0; --remaining) {
        *result++ = *a++ + *b++;
    }
}

NEON SIMD Assembly:

// void add_arrays_simd(float* result, const float* a, const float* b, int n);
// result[i] = a[i] + b[i] for the first n floats: four lanes per iteration
// with NEON, then a scalar loop for the 0-3 leftover elements.
// In:       x0 = result, x1 = a, x2 = b, w3 = n
// Clobbers: x0-x3, v0, v1, flags
// n <= 0 does nothing, matching the C++ loop. Without this check a negative
// n would fall into the scalar loop and run until the counter wrapped to 0.
.global add_arrays_simd

add_arrays_simd:
    cmp     w3, #0
    b.le    .Ladd_done              // signed: n is an int

    // Process 4 floats at a time using NEON
    cmp     w3, #4
    b.lt    .Ladd_remainder

.Ladd_simd_loop:
    ld1     {v0.4s}, [x1], #16      // Load 4 floats from a
    ld1     {v1.4s}, [x2], #16      // Load 4 floats from b
    fadd    v0.4s, v0.4s, v1.4s     // Add vectors
    st1     {v0.4s}, [x0], #16      // Store result

    sub     w3, w3, #4
    cmp     w3, #4
    b.ge    .Ladd_simd_loop

.Ladd_remainder:
    cbz     w3, .Ladd_done

    // 1-3 floats remain: process them one by one
.Ladd_scalar_loop:
    ldr     s0, [x1], #4
    ldr     s1, [x2], #4
    fadd    s0, s0, s1
    str     s0, [x0], #4
    subs    w3, w3, #1
    b.ne    .Ladd_scalar_loop

.Ladd_done:
    ret

Example 3: Dot Product Optimization¶

Unoptimized:

// Returns the sum of a[i] * b[i] over the first n elements, accumulated in
// index order; 0.0 when n <= 0.
double dot_product_cpp(const double* a, const double* b, int n) {
    double acc = 0.0;
    for (int remaining = n; remaining > 0; --remaining) {
        acc += *a++ * *b++;
    }
    return acc;
}

Optimized with Loop Unrolling:

// double dot_product_asm(const double* a, const double* b, int n);
// Returns the sum of a[i] * b[i] over the first n elements, four elements
// per iteration of the unrolled loop, then one at a time.
// In:       x0 = a, x1 = b, w2 = n
// Out:      d0 = dot product (0.0 when n <= 0)
// Clobbers: x0-x2, d1-d4, flags
// n <= 0 returns 0.0, matching the C++ loop. Without this check a negative n
// would fall into the remainder loop and run until the counter wrapped to 0.
.global dot_product_asm

dot_product_asm:
    fmov    d0, xzr                 // sum = 0.0
    cmp     w2, #0
    b.le    .Ldot_done              // signed: n is an int

    // Unroll loop by 4
    cmp     w2, #4
    b.lt    .Ldot_remainder

.Ldot_unrolled:
    ldp     d1, d2, [x0], #16       // Load a[i], a[i+1]
    ldp     d3, d4, [x1], #16       // Load b[i], b[i+1]
    fmul    d1, d1, d3              // a[i] * b[i], independent of the sum chain
    fmadd   d0, d2, d4, d0          // sum += a[i+1] * b[i+1]
    fadd    d0, d0, d1              // sum += a[i] * b[i]

    ldp     d1, d2, [x0], #16       // Load a[i+2], a[i+3]
    ldp     d3, d4, [x1], #16       // Load b[i+2], b[i+3]
    fmadd   d0, d1, d3, d0          // sum += a[i+2] * b[i+2]
    fmadd   d0, d2, d4, d0          // sum += a[i+3] * b[i+3]

    sub     w2, w2, #4
    cmp     w2, #4
    b.ge    .Ldot_unrolled

.Ldot_remainder:
    // 0-3 elements remain
    cbz     w2, .Ldot_done
    ldr     d1, [x0], #8
    ldr     d2, [x1], #8
    fmadd   d0, d1, d2, d0
    subs    w2, w2, #1
    b.ne    .Ldot_remainder

.Ldot_done:
    ret

Debugging Mixed C++ and Assembly¶

Using GDB¶

# Compile with debug symbols
g++ -g -c main.cpp -o main.o
as -g -o functions.o functions.s
g++ -g main.o functions.o -o program

# Debug
gdb ./program

# Useful commands:
(gdb) break main                # Break at C++ main
(gdb) break add_numbers         # Break at assembly function
(gdb) run
(gdb) stepi                     # Step one assembly instruction
(gdb) info registers            # Show all registers
(gdb) x/10i $pc                 # Disassemble next 10 instructions
(gdb) layout asm                # Show assembly in TUI mode
(gdb) layout split              # Show source and assembly

Inline Assembly Debugging¶

int debug_example() {
    long a = 10, b = 20, result;

    asm volatile(
        "nop\n"              // Breakpoint marker
        "add %[res], %[x], %[y]\n"
        "nop\n"
        : [res] "=r" (result)
        : [x] "r" (a), [y] "r" (b)
    );

    return result;
}

Best Practices¶

1. Use `extern "C"` for Assembly Functions¶

extern "C" {
    void my_asm_function();  // Prevents C++ name mangling
}

2. Document Register Usage¶

// Function: process_data
// Inputs:
//   x0: pointer to data
//   x1: data length
// Outputs:
//   x0: result
// Clobbers:
//   x2, x3, x4
process_data:
    // ...

3. Preserve Stack Alignment¶

// Always maintain 16-byte alignment
stp     x29, x30, [sp, #-16]!   // Good
str     x29, [sp, #-8]!          // Bad - misaligns stack!

4. Use Appropriate Optimization Flags¶

# For C++
g++ -O2 -c main.cpp

# Don't over-optimize assembly
as -o functions.o functions.s  # No -O flags needed

5. Benchmark Your Code¶

#include <chrono>

// Times a single call to asm_function() and prints the elapsed microseconds.
void benchmark() {
    using namespace std::chrono;

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by wall-clock adjustments while the function runs.
    const auto start = steady_clock::now();
    // Call your assembly function
    asm_function();
    const auto end = steady_clock::now();

    const auto elapsed = duration_cast<microseconds>(end - start);
    // The tick count's integer type is implementation-defined; cast it so the
    // format specifier matches on every platform.
    printf("Time: %lld μs\n", static_cast<long long>(elapsed.count()));
}

Summary¶

In this tutorial, we covered:

C++ Integration¶

✅ Calling assembly from C++ (extern "C")
✅ Calling C++ from assembly
✅ Passing structures and complex types
✅ C++ member function calls

Inline Assembly¶

✅ Basic syntax and constraints
✅ Input/output operands
✅ Clobbered registers
✅ Memory operations
✅ Volatile keyword usage

GPIO Control¶

✅ Memory-mapped I/O
✅ GPIO register layout
✅ Setting pins as output
✅ Reading and writing GPIO
✅ Complete LED blink example

Performance Optimization¶

✅ Optimized memory copy
✅ SIMD/NEON vector operations
✅ Loop unrolling
✅ Dot product optimization

Debugging¶

✅ GDB with mixed code
✅ Breakpoints in assembly
✅ Register inspection

Best Practices¶

✅ extern "C" linkage
✅ Documentation
✅ Stack alignment
✅ Optimization strategies
✅ Benchmarking

Conclusion¶

Congratulations! You've completed the Arm64 Assembly tutorial series. You now have the knowledge to:

Write efficient assembly code for Raspberry Pi
Interface seamlessly between C++ and assembly
Control hardware directly through GPIO
Optimize performance-critical code sections
Debug complex mixed-language programs

Where to Go Next¶

Practice: Rewrite portions of your existing projects in assembly
Experiment: Try controlling different peripherals (SPI, I2C, PWM)
Optimize: Profile your code and optimize hotspots
Learn More: Study the ARM Architecture Reference Manual
Contribute: Share your assembly libraries with the community

Remember that assembly should be used judiciously - only for performance-critical sections or hardware access where C++ isn't sufficient. Modern compilers are excellent at optimization, so measure before you optimize!

Happy coding!

Interfacing with C++ and GPIO Control¶

Introduction¶

Calling Assembly from C++¶

External Assembly Function¶

Passing Complex Types¶

Calling C++ from Assembly¶

Calling C Standard Library¶

Calling C++ Member Functions¶

Inline Assembly in C++¶

Basic Inline Assembly Syntax¶

Extended Assembly Examples¶

GPIO Hardware Access¶

Raspberry Pi GPIO Memory Map¶

GPIO Assembly Functions¶

Complete LED Blink Example¶

Performance Optimization Examples¶

Example 1: Optimized Memory Copy¶

Example 2: SIMD Vector Addition¶

Example 3: Dot Product Optimization¶

Debugging Mixed C++ and Assembly¶

Using GDB¶

Inline Assembly Debugging¶

Best Practices¶

1. Use extern "C" for Assembly Functions¶

2. Document Register Usage¶

3. Preserve Stack Alignment¶

4. Use Appropriate Optimization Flags¶

5. Benchmark Your Code¶

Summary¶

C++ Integration¶

Inline Assembly¶

GPIO Control¶

Performance Optimization¶

Debugging¶

Best Practices¶

Conclusion¶

Where to Go Next¶

1. Use `extern "C"` for Assembly Functions¶