diff --git a/src/port/versal/.gitignore b/src/port/versal/.gitignore new file mode 100644 index 00000000..8e5ab963 --- /dev/null +++ b/src/port/versal/.gitignore @@ -0,0 +1,6 @@ +*.o +*.d +*.elf +*.bin +BOOT.BIN +.layout-* diff --git a/src/port/versal/Makefile b/src/port/versal/Makefile new file mode 100644 index 00000000..4d5963a3 --- /dev/null +++ b/src/port/versal/Makefile @@ -0,0 +1,108 @@ +# Xilinx Versal Gen 1 (VMK180, Cortex-A72) wolfIP bare-metal port +# +# Build: make CROSS_COMPILE=aarch64-none-elf- +# +# Toolchain: ARM GNU aarch64-none-elf-gcc (tested with 14.3.rel1). +# +# UNTESTED ON HARDWARE -- structural scaffold mirroring src/port/zcu102/. + +# Remove a target whose recipe failed so it cannot look up to date. +.DELETE_ON_ERROR: + +CROSS_COMPILE ?= aarch64-none-elf- +CC := $(CROSS_COMPILE)gcc +OBJCOPY := $(CROSS_COMPILE)objcopy +SIZE := $(CROSS_COMPILE)size + +ROOT := ../../.. + +# Cortex-A72, AArch64, EL3 single-EL bare-metal. No SIMD/FP in the +# wolfIP/driver paths - keep -mgeneral-regs-only to catch any +# accidental FP use and make the ABI deterministic for cert. +CFLAGS := -mcpu=cortex-a72 -mgeneral-regs-only +CFLAGS += -Os -ffreestanding -fno-builtin -fno-common +CFLAGS += -fdata-sections -ffunction-sections +CFLAGS += -g -Wall -Wextra -Werror -Wno-unused-parameter +CFLAGS += -std=gnu99 +CFLAGS += -I. -I$(ROOT) -I$(ROOT)/src -I$(ROOT)/src/port +CFLAGS += -DVERSAL -DXILINX_AARCH64 +# Append extra defines for investigation builds, e.g.: +# make CFLAGS_EXTRA="-DDEBUG_GIC -DDEBUG_GEM -DDEBUG_PHY" +CFLAGS += $(CFLAGS_EXTRA) + +ASFLAGS := -mcpu=cortex-a72 + +# Emit a .d file per object so header edits (board.h, config.h, ...) +# trigger rebuilds; -MP keeps a deleted header from breaking the build. +DEPFLAGS := -MMD -MP + +# Layout selector. Default ocm keeps the OCM-only layout that the JTAG +# iteration scripts depend on (everything in OCM @ 0xFFFC0000). Pass +# LAYOUT=ddr to relink for DDR @ 0x10000000 -- which is what wolfBoot +# uses (WOLFBOOT_LOAD_ADDRESS in zynqmp.config also applies to Versal +# when adapted).
+LAYOUT ?= ocm +ifeq ($(LAYOUT),ddr) + LDSCRIPT := target_ddr.ld + CFLAGS += -DVERSAL_LAYOUT_DDR +else ifeq ($(LAYOUT),ocm) + LDSCRIPT := target.ld + CFLAGS += -DVERSAL_LAYOUT_OCM +else + $(error LAYOUT must be 'ocm' or 'ddr') +endif + +# C objects are compiled with -DVERSAL_LAYOUT_*, so objects built for one +# layout must not be relinked into the other. The stamp is named after +# LAYOUT: switching layouts creates a new stamp and recompiles them. +LAYOUT_STAMP := .layout-$(LAYOUT) + +LDFLAGS := -nostdlib -nostartfiles -T $(LDSCRIPT) -Wl,-gc-sections +# Replace newlib's aarch64 memset/memcpy (which use 'dc zva' and may +# hang on a similar Cortex-A72 setup; the safe pattern is to override +# them as we did on ZCU102). +LDFLAGS += -Wl,--wrap=memset -Wl,--wrap=memcpy + +LOCAL_C := main.c uart.c mmu.c gic.c gem.c phy_dp83867.c entropy.c +LOCAL_S := startup.S +LOCAL_OBJS := $(LOCAL_C:.c=.o) $(LOCAL_S:.S=.o) + +WOLFIP_OBJ := wolfip.o +OBJS := $(LOCAL_OBJS) $(WOLFIP_OBJ) + +all: app.elf + @echo "Built: app.elf" + @$(SIZE) app.elf + +app.elf: $(OBJS) $(LDSCRIPT) + $(CC) $(CFLAGS) $(OBJS) $(LDFLAGS) \ + -Wl,--start-group -lc -lgcc -Wl,--end-group -o $@ + +$(LAYOUT_STAMP): + rm -f .layout-* + touch $@ + +$(WOLFIP_OBJ): $(ROOT)/src/wolfip.c $(LAYOUT_STAMP) + $(CC) $(CFLAGS) $(DEPFLAGS) -Wno-zero-length-bounds -Wno-type-limits -c $< -o $@ + +%.o: %.c $(LAYOUT_STAMP) + $(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@ + +%.o: %.S + $(CC) $(ASFLAGS) $(DEPFLAGS) -c $< -o $@ + +clean: + rm -f $(OBJS) $(OBJS:.o=.d) app.elf BOOT.BIN .layout-* + +.PHONY: all clean help + +help: + @echo "Versal Gen 1 wolfIP build (scaffold, untested):" + @echo " make - build app.elf (OCM layout)" + @echo " make LAYOUT=ddr - DDR layout for wolfBoot" + @echo " make clean - remove artifacts" + @echo "" + @echo "Override CROSS_COMPILE if your toolchain prefix differs." + +# Pull in generated header dependencies; the files are absent on a first build. +-include $(OBJS:.o=.d) diff --git a/src/port/versal/README.md b/src/port/versal/README.md new file mode 100644 index 00000000..f282c1a8 --- /dev/null +++ b/src/port/versal/README.md @@ -0,0 +1,47 @@ +# wolfIP port: Xilinx Versal Gen 1 (VMK180) + +**STATUS: UNTESTED ON HARDWARE.** Structural scaffold mirroring `src/port/zcu102/`. The code compiles cleanly with `aarch64-none-elf-gcc` but has not been brought up on a real VMK180 board. Lab verification is a Phase 3 milestone once the bench is available.
+ +## What this port is + +Bare-metal wolfIP port for the AMD/Xilinx Versal ACAP Gen 1, demoed on the VMK180 dev board. Cortex-A72 APU 0 at EL3, GCC bare-metal, no Xilinx Standalone BSP, no FreeRTOS. Targets the same deterministic UDP/IPv4 profile as the ZCU102 port for DO-178C DAL-C qualification. + +## What differs from ZCU102 + +| Subsystem | ZCU102 | Versal Gen 1 | Where it lives | +|-----------|--------|--------------|----------------| +| APU core | Cortex-A53 | Cortex-A72 | `Makefile` (-mcpu) | +| Bootloader handoff | FSBL -> EL3 | PLM -> BL31 -> EL3 (or EL2) | `startup.S` | +| GIC | GIC-400 (GICv2) | GIC-600 (GICv3) | `gic.c` rewritten for GICv3 system regs + GICR | +| UART | Cadence | ARM PL011 | `uart.c` rewritten | +| GEM count | 4 (GEM0-3) | 2 (GEM0-1) | `board.h` | +| On-board RJ45 | GEM3 (INTID 95) | GEM0 (INTID 88) | `board.h` | +| GEM IP | Cadence GEM3 | Cadence GEM3 | `gem.c` unchanged (just base addr / INTID) | +| PHY | DP83867 RGMII | DP83867 RGMII (VMK180) | `phy_dp83867.c` unchanged | +| MMU | EL3 ARMv8 | EL3 ARMv8 | `mmu.c` unchanged | +| RNG | memuse entropy | memuse entropy | `entropy.c` unchanged | + +The reused 90% (`gem.c`, `phy_dp83867.c`, `mmu.c`, `entropy.c`, `main.c`, `target.ld`, `target_ddr.ld`) is identical to the ZCU102 port; only `board.h`, `uart.c`, `gic.c`, and the startup/Makefile breadcrumbs are Versal-specific. + +## Build + +``` +cd src/port/versal +make CROSS_COMPILE=aarch64-none-elf- # OCM layout (default) +make CROSS_COMPILE=aarch64-none-elf- LAYOUT=ddr # DDR layout for wolfBoot +``` + +Output: `app.elf`. Size info is printed at the end of the build. + +## Known unknowns (to validate on hardware) + +- `gic.c` `gic_init` order may need rework if BL31 owns the distributor on Versal -- the safer path is to skip distributor init entirely and only set up the redistributor + CPU interface. The current code re-initialises the distributor defensively; this may be redundant or actively wrong depending on BL31 settings. 
+- `CRL_APB_GEM0_REF_CTRL` offset in `board.h` is a placeholder (0x50). Cross-check against the Versal LPD clock register map before bring-up. +- The on-board PHY MDIO address on VMK180 needs confirmation; the ZCU102 used `0x0C`, VMK180 may be different. +- PL011 baud assumes `UARTCLK = 100 MHz`. Versal PLM typically configures this but the rate could differ; confirm from the LPD clock tree. +- `SCR_EL3` routing convention (set `IRQ`+`FIQ`+`EA` bits) is carried over from the ZCU102 fix. Cortex-A72 may not require it; harmless to leave for now. +- DDR DAP write reliability on Versal (the issue we hit on ZCU102 for JTAG iteration to DDR) may behave differently -- VMK180 uses LPDDR4 with PLM-controlled training. Expect SD/QSPI boot to be the easier first test path. + +## Files + +Identical layout to `src/port/zcu102/`. See that port's README for per-file responsibilities. The differences listed in the table above are the only substantive Versal-specific code. diff --git a/src/port/versal/board.h b/src/port/versal/board.h new file mode 100644 index 00000000..72af08df --- /dev/null +++ b/src/port/versal/board.h @@ -0,0 +1,108 @@ +/* board.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Xilinx Versal Gen 1 (VCxxxx / VMK180 board) PS register base + * addresses and GIC SPI IDs. Values are derived from the Versal ACAP + * Technical Reference Manual (AM011), the VMK180 board user guide, + * and the published `versal.dtsi` device tree. No Xilinx BSP header + * (xparameters.h) or xilstandalone code is referenced. + * + * UNTESTED ON HARDWARE -- code-only scaffold while the lab board is + * unavailable. The structure mirrors src/port/zcu102/. Key + * differences from ZynqMP are: + * - Cortex-A72 (not A53), with PLM handoff at EL2 + * - GICv3 distributor + redistributor (no GICv2 legacy GICC) + * - ARM PL011 UART (not Cadence) + * - 2 GEMs (GEM0/GEM1) instead of 4; on-board RJ45 is GEM0 on VMK180 + */ +#ifndef VERSAL_BOARD_H +#define VERSAL_BOARD_H + +#include + +/* --------------------------------------------------------------------- + * Memory map (Versal PS) + * ------------------------------------------------------------------- */ +#define DDR_BASE 0x00000000UL +#define DDR_SIZE 0x80000000UL /* 2 GB lower bank */ + +/* OCM on Versal lives at 0xFFFC0000 (256 KB). Same as ZynqMP. 
*/ +#define OCM_BASE 0xFFFC0000UL +#define OCM_SIZE 0x00040000UL + +/* --------------------------------------------------------------------- + * PS peripherals + * ------------------------------------------------------------------- */ +#define UART0_BASE 0xFF000000UL /* PL011 */ +#define UART1_BASE 0xFF010000UL /* PL011 */ + +#define GEM0_BASE 0xFF0C0000UL /* on-board GEM (VMK180) */ +#define GEM1_BASE 0xFF0D0000UL + +#define CRL_APB_BASE 0xFF5E0000UL /* LPD clock & reset */ +#define IOU_SLCR_BASE 0xFF180000UL + +/* GICv3: distributor + redistributor */ +#define GICD_BASE 0xF9000000UL +#define GICR_BASE 0xF9080000UL /* per-CPU redistributors */ + +/* --------------------------------------------------------------------- + * GIC SPI numbers as GIC INTIDs (ARM GIC numbering: SPI N -> INTID 32+N). + * Versal versal.dtsi: + * GEM0: GIC_SPI 56 -> INTID 88 + * GEM1: GIC_SPI 58 -> INTID 90 + * ------------------------------------------------------------------- */ +#define IRQ_GEM0 (32 + 56) /* GIC_SPI 56 -> INTID 88, + * on-board VMK180 RJ45 */ +#define IRQ_GEM1 (32 + 58) /* GIC_SPI 58 -> INTID 90 */ + +/* --------------------------------------------------------------------- + * CRL_APB clock and reset registers (LPD). Versal keeps the ZynqMP + * layout for the LPD clocks; the GEM clock control register names match. + * ------------------------------------------------------------------- */ +#define CRL_APB_GEM0_REF_CTRL (CRL_APB_BASE + 0x50) /* offset TBD */ +#define CRL_APB_RST_LPD_IOU0 (CRL_APB_BASE + 0x230) /* GEM bits 0-1 */ + +/* --------------------------------------------------------------------- + * PL011 UART0 - on-board USB-UART on VMK180 + * ------------------------------------------------------------------- */ +#define UART_BAUD 115200 + +/* MAC address for eth0. Locally-administered, even first octet. 
*/ +#ifndef WOLFIP_MAC_0 +#define WOLFIP_MAC_0 0x02 +#endif +#ifndef WOLFIP_MAC_1 +#define WOLFIP_MAC_1 0x00 +#endif +#ifndef WOLFIP_MAC_2 +#define WOLFIP_MAC_2 0x5A +#endif +#ifndef WOLFIP_MAC_3 +#define WOLFIP_MAC_3 0x11 +#endif +#ifndef WOLFIP_MAC_4 +#define WOLFIP_MAC_4 0x22 +#endif +#ifndef WOLFIP_MAC_5 +#define WOLFIP_MAC_5 0x33 +#endif + +#endif /* VERSAL_BOARD_H */ diff --git a/src/port/versal/config.h b/src/port/versal/config.h new file mode 100644 index 00000000..a23992cb --- /dev/null +++ b/src/port/versal/config.h @@ -0,0 +1,82 @@ +/* config.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * wolfIP configuration for Xilinx Versal Gen 1 (VMK180, Cortex-A72 APU 0, + * EL3 bare-metal). UDP-only profile aimed at deterministic DAL-C use. + */ +#ifndef WOLF_CONFIG_H +#define WOLF_CONFIG_H + +#ifndef CONFIG_IPFILTER +#define CONFIG_IPFILTER 0 +#endif + +#define ETHERNET +#define LINK_MTU 1536 + +/* UDP-only profile in intent: the application does not call + * wolfIP_sock_socket() with IPSTACK_SOCK_STREAM. MAX_TCPSOCKETS is set + * to a small non-zero value only because core wolfIP currently sizes + * its timer heap via MAX_TIMERS = MAX_TCPSOCKETS * 3, and DHCP / ARP + * aging need timers.
With MAX_TCPSOCKETS=0 the timer-heap insert path + * is permanently full and DHCP cannot schedule its retransmit timer. + * A core wolfIP follow-up should decouple MAX_TIMERS from + * MAX_TCPSOCKETS so DAL-C builds can truly opt TCP code out at + * compile time. */ +#define MAX_TCPSOCKETS 2 +#define MAX_UDPSOCKETS 4 +#define MAX_ICMPSOCKETS 1 +#define RXBUF_SIZE (LINK_MTU * 4) +#define TXBUF_SIZE (LINK_MTU * 4) + +#define MAX_NEIGHBORS 16 + +#ifndef WOLFIP_MAX_INTERFACES +#define WOLFIP_MAX_INTERFACES 1 +#endif + +#ifndef WOLFIP_ENABLE_FORWARDING +#define WOLFIP_ENABLE_FORWARDING 0 +#endif + +#ifndef WOLFIP_ENABLE_LOOPBACK +#define WOLFIP_ENABLE_LOOPBACK 0 +#endif + +#ifndef WOLFIP_ENABLE_DHCP +#define WOLFIP_ENABLE_DHCP 1 +#endif + +/* Static IP fallback (used if DHCP is disabled or times out). */ +#define WOLFIP_IP "192.168.1.100" +#define WOLFIP_NETMASK "255.255.255.0" +#define WOLFIP_GW "192.168.1.1" +#define WOLFIP_STATIC_DNS_IP "8.8.8.8" + +#if WOLFIP_ENABLE_DHCP +#define DHCP +#define DHCP_DISCOVER_RETRIES 2 +#define DHCP_REQUEST_RETRIES 2 +#endif + +/* Hardware debug: define for verbose GEM / MDIO / DHCP logging. */ +/* #define DEBUG_HW */ + +#endif /* WOLF_CONFIG_H */ diff --git a/src/port/versal/entropy.c b/src/port/versal/entropy.c new file mode 100644 index 00000000..b454af55 --- /dev/null +++ b/src/port/versal/entropy.c @@ -0,0 +1,116 @@ +/* entropy.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * MemUse-pattern entropy source, carried over from the wolfIP ZCU102 port. + * + * The ZCU102 port assumed no hardware TRNG was usable from EL3 bare-metal + * without PMU firmware and CSU helpers; TODO confirm the Versal situation. + * This source produces non-deterministic 32-bit words by sampling the + * AArch64 virtual count register (CNTVCT_EL0) before and after a + * memory-access loop over a 1024-byte state buffer (NOTE(review): much + * smaller than L1D). The cache-miss / line-fill / write-allocate timing + * variance is the entropy source - the same primitive wolfCrypt's + * wc_Entropy_Get() (HAVE_ENTROPY_MEMUSE in wolfssl/wolfcrypt/src/ + * wolfentropy.c) uses internally. + * + * This implementation skips wolfCrypt's SHA3-256 conditioning because + * the consumers in wolfIP (TCP ISN, DHCP/DNS transaction IDs, + * ephemeral source ports, IP fragment ID) need unpredictable bits, + * not uniformly-distributed cryptographic randomness. For crypto- + * grade seeding the port should be rebuilt with the full wolfCrypt + * wc_Entropy_Get() in place of zcu102_get_random32(). + * + * Algorithm per call: + * 1. t0 = CNTVCT_EL0 + * 2. Walk state[] performing read+xor+write; ~256 accesses span + * multiple 64-byte L1 cache lines (A72 L1D geometry: TODO confirm). + * 3. t1 = CNTVCT_EL0 + * 4. Fold (t1 - t0) into the rolling 64-bit accumulator and + * perturb state[] so the next call diverges. + * 5. Apply a non-cryptographic finaliser (xorshift) and return + * the low 32 bits. + * + * The state buffer is 1024 bytes (16 cache lines of 64 bytes). NOTE(review): + * that fits in L1, so a warm cache may produce few or no misses per call; + * measure the timing variance on hardware before relying on it.
+ */ +#include <stdint.h> + +#define ENTROPY_STATE_WORDS 128u /* 1024 bytes, 16 cache lines */ +#define ENTROPY_WALK_ITERS 256u + +static volatile uint64_t entropy_state[ENTROPY_STATE_WORDS]; +static volatile uint64_t entropy_acc; +static volatile uint32_t entropy_idx; + +static inline uint64_t cntvct_el0(void) +{ + uint64_t v; + __asm__ volatile ("mrs %0, cntvct_el0" : "=r"(v)); + return v; +} + +/* Return a 32-bit value with low predictability, suitable for + * protocol identifiers (DHCP xid, DNS id, TCP ISN, ephemeral port, + * IP fragment id). Not crypto-grade; see file header. */ +uint32_t zcu102_get_random32(void) +{ + uint64_t t0, t1, delta; + uint64_t acc; + uint32_t i; + uint32_t walk_idx; + + t0 = cntvct_el0(); + + /* Memory-access loop: stride through the state array. Using a + * data-dependent index (acc & mask) keeps the prefetcher from + * predicting cache lines, which is exactly the timing noise we + * want to harvest. */ + acc = entropy_acc; + walk_idx = entropy_idx; + for (i = 0; i < ENTROPY_WALK_ITERS; i++) { + uint32_t pos = (walk_idx + (uint32_t)(acc & 0x7Fu)) + & (ENTROPY_STATE_WORDS - 1u); + uint64_t v = entropy_state[pos]; + v ^= acc; + v = (v << 1) | (v >> 63); /* rotate left 1 */ + entropy_state[pos] = v; + acc += v; + walk_idx++; + } + + t1 = cntvct_el0(); + delta = t1 - t0; + + /* Fold the timing delta into the accumulator and the head of + * the state ring. */ + acc ^= delta; + acc ^= (delta << 17) | (delta >> 47); + entropy_state[walk_idx & (ENTROPY_STATE_WORDS - 1u)] ^= acc; + entropy_acc = acc; + entropy_idx = walk_idx; + + /* xorshift64 finaliser to whiten the output word. */ + acc ^= acc << 13; + acc ^= acc >> 7; + acc ^= acc << 17; + + return (uint32_t)acc; +} diff --git a/src/port/versal/gem.c b/src/port/versal/gem.c new file mode 100644 index 00000000..2435c15d --- /dev/null +++ b/src/port/versal/gem.c @@ -0,0 +1,762 @@ +/* gem.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack.
+ * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Cadence GEM driver for Versal GEM0 (on-board RJ45 on VMK180). + * + * - 32-bit DMA addressing (OCM low bank, well under 4 GB). + * - IRQ-driven RX via GIC-600 INTID 88 (GIC_SPI 56) -> gem_isr; + * polled TX. Register set is identical to ZynqMP's GEM3 -- only + * the base address (board.h GEM0_BASE) and INTID (board.h + * IRQ_GEM0) differ. The SCR_EL3.IRQ routing convention from the + * ZCU102 port is preserved (see startup.S) defensively, even + * though Versal's GICv3 + BL31 handoff path may not require it. + * - BDs and frame buffers live in the .dma_buffers section, which the + * linker places in OCM (Normal-WB executable per L2_PERIPH[511]). + * Cache coherency between CPU L1 D-cache and the MAC DMA path is + * maintained explicitly via cache_clean() / cache_inval() at every + * BD hand-off. + * + * Register set per Versal ACAP TRM (AM011) chapter "Gigabit Ethernet + * MAC" / Cadence GEM. + * + * UNTESTED ON HARDWARE -- structural scaffold. + */ +#include <stdint.h> +#include <string.h> +#include "config.h" +#include "../../../wolfip.h" +#include "board.h" +#include "uart.h" +#include "gic.h" +#include "gem.h" +#include "phy_dp83867.h" + +/* Cache maintenance helpers for GEM DMA coherency. Cortex-A72 cache + * line is 64 bytes.
With D-cache enabled and BD/buffers in normal + * cacheable memory, CPU writes may sit in L1 D-cache and not be + * visible to the MAC's DMA path. cache_clean() writes back dirty + * lines to memory before DMA reads; cache_inval() invalidates lines + * so subsequent CPU reads pull fresh DMA-written data. */ +#define CACHE_LINE 64u + +static inline void cache_clean(const void *p, uint32_t sz) +{ + uintptr_t start = (uintptr_t)p & ~(CACHE_LINE - 1u); + uintptr_t end = ((uintptr_t)p + sz + CACHE_LINE - 1u) & ~(CACHE_LINE - 1u); + uintptr_t a; + for (a = start; a < end; a += CACHE_LINE) + __asm__ volatile ("dc cvac, %0" :: "r"(a) : "memory"); + __asm__ volatile ("dsb sy" ::: "memory"); +} + +static inline void cache_inval(const void *p, uint32_t sz) +{ + uintptr_t start = (uintptr_t)p & ~(CACHE_LINE - 1u); + uintptr_t end = ((uintptr_t)p + sz + CACHE_LINE - 1u) & ~(CACHE_LINE - 1u); + uintptr_t a; + for (a = start; a < end; a += CACHE_LINE) + __asm__ volatile ("dc ivac, %0" :: "r"(a) : "memory"); + __asm__ volatile ("dsb sy" ::: "memory"); +} + +/* --------------------------------------------------------------------- + * Register offsets and bit masks (subset we use) + * ------------------------------------------------------------------- */ +#define GEM_NWCTRL (*(volatile uint32_t *)(GEM0_BASE + 0x000)) +#define GEM_NWCFG (*(volatile uint32_t *)(GEM0_BASE + 0x004)) +#define GEM_NWSR (*(volatile uint32_t *)(GEM0_BASE + 0x008)) +#define GEM_DMACR (*(volatile uint32_t *)(GEM0_BASE + 0x010)) +#define GEM_TSR (*(volatile uint32_t *)(GEM0_BASE + 0x014)) +#define GEM_RXQBASE (*(volatile uint32_t *)(GEM0_BASE + 0x018)) +#define GEM_TXQBASE (*(volatile uint32_t *)(GEM0_BASE + 0x01C)) +#define GEM_RSR (*(volatile uint32_t *)(GEM0_BASE + 0x020)) +#define GEM_ISR (*(volatile uint32_t *)(GEM0_BASE + 0x024)) +#define GEM_IER (*(volatile uint32_t *)(GEM0_BASE + 0x028)) +#define GEM_IDR (*(volatile uint32_t *)(GEM0_BASE + 0x02C)) +#define GEM_PHYMNTNC (*(volatile uint32_t *)(GEM0_BASE 
+ 0x034)) +#define GEM_HASHL (*(volatile uint32_t *)(GEM0_BASE + 0x080)) +#define GEM_HASHH (*(volatile uint32_t *)(GEM0_BASE + 0x084)) +#define GEM_LADDR1L (*(volatile uint32_t *)(GEM0_BASE + 0x088)) +#define GEM_LADDR1H (*(volatile uint32_t *)(GEM0_BASE + 0x08C)) +/* Priority queue base addresses (queues 1-3). Cadence GEM has 4 TX + * and 4 RX priority queues; if we don't point unused ones at a safe + * dummy BD, the MAC will eventually try to fetch from queue1+ at + * power-on-random addresses and hang (TSR.TXGO sticks with no octets + * transmitted). U-Boot's zynq_gem and Linux's macb both set these. */ +#define GEM_TXQ1BASE (*(volatile uint32_t *)(GEM0_BASE + 0x440)) +#define GEM_TXQ2BASE (*(volatile uint32_t *)(GEM0_BASE + 0x444)) +#define GEM_TXQ3BASE (*(volatile uint32_t *)(GEM0_BASE + 0x448)) +#define GEM_RXQ1BASE (*(volatile uint32_t *)(GEM0_BASE + 0x480)) +#define GEM_RXQ2BASE (*(volatile uint32_t *)(GEM0_BASE + 0x484)) +#define GEM_RXQ3BASE (*(volatile uint32_t *)(GEM0_BASE + 0x488)) +#define GEM_OCTTXL (*(volatile uint32_t *)(GEM0_BASE + 0x100)) +#define GEM_TXCNT (*(volatile uint32_t *)(GEM0_BASE + 0x108)) +#define GEM_OCTRXL (*(volatile uint32_t *)(GEM0_BASE + 0x150)) +#define GEM_RXCNT (*(volatile uint32_t *)(GEM0_BASE + 0x158)) +#define GEM_RXFCSCNT (*(volatile uint32_t *)(GEM0_BASE + 0x190)) +#define GEM_RXORCNT (*(volatile uint32_t *)(GEM0_BASE + 0x1A4)) + +#define NWCTRL_LOOPEN (1u << 1) +#define NWCTRL_RXEN (1u << 2) +#define NWCTRL_TXEN (1u << 3) +#define NWCTRL_MDEN (1u << 4) +#define NWCTRL_STATCLR (1u << 5) +#define NWCTRL_STARTTX (1u << 9) +#define NWCTRL_HALTTX (1u << 10) + +#define NWCFG_SPEED100 (1u << 0) +#define NWCFG_FDEN (1u << 1) +#define NWCFG_COPYALL (1u << 4) +#define NWCFG_BCASTDI (1u << 5) +#define NWCFG_MCASTHASHEN (1u << 6) +#define NWCFG_UCASTHASHEN (1u << 7) +#define NWCFG_1536RXEN (1u << 8) +#define NWCFG_1000 (1u << 10) +#define NWCFG_FCSREM (1u << 17) +#define NWCFG_MDCDIV_SHIFT 18u +#define NWCFG_MDCDIV_MASK (7u << 18) 
+#define NWCFG_DWIDTH_64 (1u << 21) /* Data bus width = 64 bit (AArch64) */ + +#define NWSR_PHY_IDLE (1u << 2) + +#define RSR_BUFFNA (1u << 0) +#define RSR_FRAMERX (1u << 1) +#define RSR_RXOVR (1u << 2) +#define RSR_HRESPNOK (1u << 3) + +#define IXR_MGMNT (1u << 0) +#define IXR_FRAMERX (1u << 1) +#define IXR_TXCOMPL (1u << 7) +#define IXR_TXEXH (1u << 6) +#define IXR_RXUSED (1u << 2) +#define IXR_RXOVR (1u << 10) +#define IXR_HRESPNOK (1u << 11) + +#define PHYMNTNC_CLAUSE22 0x40020000u +#define PHYMNTNC_OP_R (2u << 28) +#define PHYMNTNC_OP_W (1u << 28) + +#define RXBUF_OWN_SW (1u << 0) +#define RXBUF_WRAP (1u << 1) +#define RXBUF_ADDR_MASK 0xFFFFFFFCu +#define RXBUF_LEN_MASK 0x00001FFFu + +#define TXBUF_USED (1u << 31) +#define TXBUF_WRAP (1u << 30) +#define TXBUF_LAST (1u << 15) +#define TXBUF_LEN_MASK 0x00003FFFu + +/* --------------------------------------------------------------------- + * BD ring and frame buffer sizing + * ------------------------------------------------------------------- */ +/* Ring sizes deliberately small to fit text + DMA buffers + BSS in + * 256 KB OCM (we keep everything in OCM because DDR-via-JTAG isn't + * reliable without PMU FW running). For higher throughput, bump + * these once we move BSS back to DDR. */ +#define RX_RING_LEN 16 +#define TX_RING_LEN 8 +#define BUF_LEN 1536 /* multiple of 64, per DMACR.RXBS */ + +/* GEM BD: two 32-bit words. */ +struct gem_bd { + uint32_t addr; + uint32_t status; +}; + +/* All DMA-visible objects go in .dma_buffers (cacheable per file header; TODO confirm vs mmu.c).
*/ +static struct gem_bd rx_ring[RX_RING_LEN] + __attribute__((aligned(64), section(".dma_buffers"))); +static struct gem_bd tx_ring[TX_RING_LEN] + __attribute__((aligned(64), section(".dma_buffers"))); +static uint8_t rx_buf_pool[RX_RING_LEN][BUF_LEN] + __attribute__((aligned(64), section(".dma_buffers"))); +static uint8_t tx_buf_pool[TX_RING_LEN][BUF_LEN] + __attribute__((aligned(64), section(".dma_buffers"))); + +/* Dummy BD pair for disabling priority queues 1-3. The TX dummy has + * USED=1 so the MAC ignores it (refuses to transmit). The RX dummy + * has the SW-OWN/NEW bit set so MAC won't write into queue 1-3 RX. */ +static struct gem_bd dummy_tx_bd + __attribute__((aligned(8), section(".dma_buffers"))); +static struct gem_bd dummy_rx_bd + __attribute__((aligned(8), section(".dma_buffers"))); + +/* --------------------------------------------------------------------- + * Software RX queue: filled by ISR, drained by eth_poll() in the main + * loop. Single producer (ISR) / single consumer (main), so a lockless + * head/tail pair is safe when we use DSB to publish writes. + * + * Each slot stores a pointer to one of rx_buf_pool[i] plus length; + * the buffer's BD is recycled after the main loop hands the frame to + * wolfIP. 
+ * ------------------------------------------------------------------- */ +#define SWQ_DEPTH 16 + +struct swq_slot { + uint8_t *buf; + uint16_t len; + uint16_t ring_idx; /* into rx_ring[] - recycle after consume */ +}; + +static volatile struct swq_slot swq[SWQ_DEPTH]; +static volatile uint32_t swq_head; /* ISR writes */ +static volatile uint32_t swq_tail; /* main reads */ +static volatile uint32_t rx_drops; /* ISR-side counter */ +static volatile uint32_t s_irq_count; +static volatile uint32_t s_rx_frames; +static volatile uint32_t s_tx_sent; + +static uint32_t rx_next; /* next BD the SW will look at */ +static uint32_t tx_next; /* next BD the SW will try to TX */ + +static uint8_t phy_addr_used; + +/* --------------------------------------------------------------------- + * MDIO + * ------------------------------------------------------------------- */ +static int mdio_wait_idle(void) +{ + int spin; + for (spin = 0; spin < 100000; spin++) { + if (GEM_NWSR & NWSR_PHY_IDLE) + return 0; + } + return -1; +} + +int gem_mdio_read(uint8_t phy_addr, uint8_t reg, uint16_t *out) +{ + uint32_t v; + if (mdio_wait_idle() < 0) + return -1; + v = PHYMNTNC_CLAUSE22 | PHYMNTNC_OP_R + | (((uint32_t)phy_addr & 0x1Fu) << 23) + | (((uint32_t)reg & 0x1Fu) << 18); + GEM_PHYMNTNC = v; + if (mdio_wait_idle() < 0) + return -2; + *out = (uint16_t)(GEM_PHYMNTNC & 0xFFFFu); + return 0; +} + +int gem_mdio_write(uint8_t phy_addr, uint8_t reg, uint16_t value) +{ + uint32_t v; + if (mdio_wait_idle() < 0) + return -1; + v = PHYMNTNC_CLAUSE22 | PHYMNTNC_OP_W + | (((uint32_t)phy_addr & 0x1Fu) << 23) + | (((uint32_t)reg & 0x1Fu) << 18) + | (uint32_t)value; + GEM_PHYMNTNC = v; + if (mdio_wait_idle() < 0) + return -2; + return 0; +} + +/* --------------------------------------------------------------------- + * BD ring init + * ------------------------------------------------------------------- */ +static void rx_ring_init(void) +{ + uint32_t i; + for (i = 0; i < RX_RING_LEN; i++) { + uint32_t addr = 
(uint32_t)(uintptr_t)rx_buf_pool[i]; + addr &= RXBUF_ADDR_MASK; + if (i == RX_RING_LEN - 1) + addr |= RXBUF_WRAP; + rx_ring[i].addr = addr; /* OWN=0 -> hardware can use */ + rx_ring[i].status = 0; + } + rx_next = 0; +} + +static void tx_ring_init(void) +{ + uint32_t i; + /* Match u-boot zynq_gem pattern: all BDs start as dummies with + * USED|LAST|WRAP, addr=0. eth_send fills in addr + length + LAST + * (and clears USED) when actually transmitting. The WRAP bit on + * the last BD keeps the MAC walker in our ring. */ + for (i = 0; i < TX_RING_LEN; i++) { + tx_ring[i].addr = 0; + tx_ring[i].status = TXBUF_USED | TXBUF_LAST + | ((i == TX_RING_LEN - 1) ? TXBUF_WRAP : 0); + } + tx_next = 0; +} + +/* --------------------------------------------------------------------- + * RX ISR + * ------------------------------------------------------------------- */ +static void gem_isr(void) +{ + uint32_t isr; + + s_irq_count++; + isr = GEM_ISR; + GEM_ISR = isr; /* clear-on-write */ + + /* Invalidate the WHOLE RX ring at entry - MAC may have written + * to any BD, not just rx_next. Cheap (one cache line typically + * since the ring is small). */ + cache_inval(rx_ring, sizeof(rx_ring)); + + /* Walk RX BDs whose SW-OWN bit is set (frame ready for software). */ + while (rx_ring[rx_next].addr & RXBUF_OWN_SW) { + uint32_t next_head = swq_head; + uint32_t depth = next_head - swq_tail; + /* depth >= RX_RING_LEN: every RX BD is still held by an unconsumed + * swq slot, so this OWN bit is an already-queued frame, not a new + * one. Recycling it would give the MAC a buffer main still reads. */ + if (depth >= RX_RING_LEN) + break; + s_rx_frames++; + /* Also invalidate the buffer before we copy from it. */ + cache_inval(rx_buf_pool[rx_next], + rx_ring[rx_next].status & RXBUF_LEN_MASK); + uint32_t status = rx_ring[rx_next].status; + uint32_t slot = next_head % SWQ_DEPTH; + + if (depth >= SWQ_DEPTH) { + /* SW queue full - drop and recycle the BD.
Queued BDs are + * recycled by eth_poll() once main has consumed the slot. */ + uint32_t addr = (uint32_t)(uintptr_t)rx_buf_pool[rx_next]; + addr &= RXBUF_ADDR_MASK; + if (rx_next == RX_RING_LEN - 1) + addr |= RXBUF_WRAP; + rx_drops++; + rx_ring[rx_next].status = 0; + __asm__ volatile ("dsb sy" ::: "memory"); + rx_ring[rx_next].addr = addr; /* OWN=0 again */ + /* MAC reads BDs straight from memory; clean the line so it + * sees OWN=0, otherwise it skips past this BD and walks the + * ring leaving holes. */ + cache_clean(&rx_ring[rx_next], sizeof(rx_ring[rx_next])); + } else { + swq[slot].buf = rx_buf_pool[rx_next]; + swq[slot].len = (uint16_t)(status & RXBUF_LEN_MASK); + swq[slot].ring_idx = (uint16_t)rx_next; + __asm__ volatile ("dsb sy" ::: "memory"); + swq_head = next_head + 1; + } + rx_next = (rx_next + 1) % RX_RING_LEN; + } + + /* RXUSED recovery: clear BUFFNA. With cache_clean on the recycle + * path, this should be rare; when it happens, also kick the RX + * path so the MAC re-walks the ring. */ + if (isr & IXR_RXUSED) { + GEM_RSR = RSR_BUFFNA; + } + if (isr & IXR_RXOVR) { + GEM_RSR = RSR_RXOVR; + } +} + +/* --------------------------------------------------------------------- + * eth_poll / eth_send (called from wolfIP_poll and stack TX path) + * ------------------------------------------------------------------- */ +static int eth_poll(struct wolfIP_ll_dev *ll, void *buf, uint32_t len) +{ + uint32_t tail; + uint32_t slot; + uint32_t copy; + uint32_t addr; + uint16_t idx; + + (void)ll; + + /* RX frames are delivered into swq[] by gem_isr() running off the + * GIC-600 INTID 88 IRQ path (see startup.S SCR_EL3 setup and + * board.h IRQ_GEM0). eth_poll just drains the SW queue here.
 */
    tail = swq_tail;
    if (tail == swq_head)
        return 0; /* SW queue empty */

    slot = tail % SWQ_DEPTH;
    copy = swq[slot].len;
    if (copy > len)
        copy = len; /* frame longer than the caller's buffer: truncate */
    memcpy(buf, swq[slot].buf, copy);

    /* Recycle the BD back to hardware. */
    idx = swq[slot].ring_idx;
    addr = (uint32_t)(uintptr_t)rx_buf_pool[idx];
    addr &= RXBUF_ADDR_MASK;
    if (idx == RX_RING_LEN - 1)
        addr |= RXBUF_WRAP;
    rx_ring[idx].status = 0;
    __asm__ volatile ("dsb sy" ::: "memory");
    rx_ring[idx].addr = addr; /* OWN bit cleared = HW can write */
    /* MAC walks BDs from main memory (not coherent with CPU D-cache);
     * push the OWN=0 store out so the MAC will reuse this slot. */
    cache_clean(&rx_ring[idx], sizeof(rx_ring[idx]));

    /* Release the queue slot only after the descriptor is recycled. */
    __asm__ volatile ("dsb sy" ::: "memory");
    swq_tail = tail + 1;

    return (int)copy;
}

/* wolfIP send hook: transmit one frame through the next TX descriptor.
 *
 * Returns the number of bytes queued (frames shorter than 60 bytes are
 * zero-padded to 60), -1 if len exceeds BUF_LEN, or -2 if the
 * descriptor is still owned by the MAC after the bounded spin. */
static int eth_send(struct wolfIP_ll_dev *ll, void *buf, uint32_t len)
{
    uint32_t idx;
    uint32_t status;

    (void)ll;

    if (len > BUF_LEN)
        return -1;

    idx = tx_next;
    /* Wait briefly for the BD to be free (USED=1 means MAC done). The
     * USED bit is written back by MAC DMA - invalidate the cache line
     * so the CPU does not see the stale USED=0 we wrote when we last
     * armed this BD. */
    {
        int spin;
        for (spin = 0; spin < 100000; spin++) {
            cache_inval(&tx_ring[idx], sizeof(tx_ring[idx]));
            if (tx_ring[idx].status & TXBUF_USED)
                break;
        }
        if ((tx_ring[idx].status & TXBUF_USED) == 0)
            return -2; /* TX ring backed up - tell caller to retry */
    }

    memcpy(tx_buf_pool[idx], buf, len);

    /* Pad to minimum Ethernet frame (60 bytes; MAC adds 4-byte FCS). */
    if (len < 60u) {
        memset(tx_buf_pool[idx] + len, 0, 60u - len);
        len = 60u;
    }

    /* Flush the frame buffer from D-cache so MAC DMA reads see it. */
    cache_clean(tx_buf_pool[idx], len);

    /* Re-arm BD: set buffer address, then clear USED with length+LAST
     * (preserve WRAP if this is the last BD). Buffer addr written
     * before status so MAC walking the ring sees a valid pair. */
    tx_ring[idx].addr = (uint32_t)(uintptr_t)tx_buf_pool[idx];
    status = (len & TXBUF_LEN_MASK) | TXBUF_LAST;
    if (idx == TX_RING_LEN - 1)
        status |= TXBUF_WRAP;
    tx_ring[idx].status = status; /* USED=0 -> ready for MAC */

    /* Flush BD update so MAC sees USED=0. */
    cache_clean(&tx_ring[idx], sizeof(tx_ring[idx]));
    GEM_NWCTRL |= NWCTRL_STARTTX;

    s_tx_sent++;
    tx_next = (idx + 1) % TX_RING_LEN;
    return (int)len;
}

/* Diagnostic counters maintained by gem_isr() and eth_send(). */
uint32_t gem_irq_count(void) { return s_irq_count; }
uint32_t gem_rx_frames(void) { return s_rx_frames; }
uint32_t gem_tx_sent(void) { return s_tx_sent; }

/* Dump MAC registers, the first TX/RX descriptor, software counters,
 * hardware statistics and a summary of SW-owned RX descriptors to the
 * UART.
 *
 * NOTE(review): this reads GEM_ISR; if that register is clear-on-read
 * in this configuration the dump can consume pending status bits -
 * confirm against the TRM. */
void gem_dump_state(void)
{
    uint32_t i;
    /* Drop cached copies so the values printed come from memory. */
    cache_inval(rx_ring, sizeof(rx_ring));
    cache_inval(tx_ring, sizeof(tx_ring));
    uart_puts("GEM3 regs: NWCTRL="); uart_puthex(GEM_NWCTRL);
    uart_puts(" NWCFG="); uart_puthex(GEM_NWCFG);
    uart_puts(" NWSR="); uart_puthex(GEM_NWSR);
    uart_puts(" DMACR="); uart_puthex(GEM_DMACR);
    uart_puts("\n ISR="); uart_puthex(GEM_ISR);
    uart_puts(" RSR="); uart_puthex(GEM_RSR);
    uart_puts(" TSR="); uart_puthex(GEM_TSR);
    uart_puts(" IMR="); uart_puthex(*(volatile uint32_t *)(GEM0_BASE + 0x030));
    uart_puts("\n tx[0]="); uart_puthex(tx_ring[0].addr);
    uart_puts("/"); uart_puthex(tx_ring[0].status);
    uart_puts(" rx[0]="); uart_puthex(rx_ring[0].addr);
    uart_puts("/"); uart_puthex(rx_ring[0].status);
    uart_puts("\n irq="); uart_putdec(s_irq_count);
    uart_puts(" rx_frm="); uart_putdec(s_rx_frames);
    uart_puts(" tx_snt="); uart_putdec(s_tx_sent);
    uart_puts(" drops="); uart_putdec(rx_drops);
    uart_puts("\n HW counters: txoct="); uart_putdec(GEM_OCTTXL);
    uart_puts(" txcnt="); uart_putdec(GEM_TXCNT);
    uart_puts(" rxoct="); uart_putdec(GEM_OCTRXL);
    uart_puts(" rxcnt="); uart_putdec(GEM_RXCNT);
    uart_puts(" rxfcs="); uart_putdec(GEM_RXFCSCNT);
    uart_puts(" rxor="); uart_putdec(GEM_RXORCNT);
    {
        /* Count RX descriptors currently owned by software; 0xFFFF is
         * the "none found" marker for first_filled. */
        uint32_t filled = 0;
        uint32_t first_filled = 0xFFFF;
        for (i = 0; i < RX_RING_LEN; i++) {
            if (rx_ring[i].addr & RXBUF_OWN_SW) {
                filled++;
                if (first_filled == 0xFFFF) first_filled = i;
            }
        }
        uart_puts(" rx_filled="); uart_putdec(filled);
        uart_puts(" first="); uart_putdec(first_filled);
        uart_puts(" rx_next="); uart_putdec(rx_next);
    }
    uart_puts("\n");
}

/* ---------------------------------------------------------------------
 * Clock + reset for GEM3 via CRL_APB.
 *
 * NOTE(review): this section is worded for the ZCU102 (GEM3, FSBL,
 * ZynqMP CRL_APB layout) while the code below addresses
 * CRL_APB_GEM0_REF_CTRL - confirm register addresses, bit positions
 * and the reset bit against the Versal documentation.
 *
 * For the stock ZCU102 boot flow, FSBL has already configured GEM3:
 *   - CRL_APB.GEM3_REF_CTRL -> 125 MHz from IOPLL or RPLL
 *   - CRL_APB.RST_LPD_IOU0  -> GEM3 out of reset
 *   - IOU_SLCR MIO 64..77   -> GEM3 RGMII + MDIO pin muxing
 *
 * We pulse the GEM3 reset bit so the MAC starts from a known state
 * without touching the clock control (which would race with FSBL's
 * setup of PLLs).
 * NOTE(review): gem3_hw_reset() does call gem3_set_ref_clk(1000) after
 * the pulse, so the clock control register is in fact rewritten.
 * ------------------------------------------------------------------- */
#define CRL_RST_GEM3 (1u << 3)

/* Configure CRL_APB.GEM3_REF_CTRL for the negotiated link speed. The
 * MAC sources TX_CLK to the PHY at this rate (RGMII): 125 MHz for
 * 1 Gbps, 25 MHz for 100 Mbps, 2.5 MHz for 10 Mbps. PetaLinux/FSBL
 * may pre-program this for a different speed than we want; both
 * U-Boot and Linux re-program it whenever PHY link speed changes.
 *
 * IOPLL = 1500 MHz on ZCU102 (FSBL default).
 *   1500 / 12 / 1  = 125 MHz (1000)
 *   1500 / 12 / 5  = 25 MHz  (100)
 *   1500 / 12 / 50 = 2.5 MHz (10)
 *
 * Register layout (TRM): CLKACT bit26, CLKACT_RX bit25,
 * DIVISOR1 bits[21:16], DIVISOR0 bits[13:8], SRCSEL bits[2:0].
*/ +static void gem3_set_ref_clk(int speed_mbps) +{ + volatile uint32_t *gem3_ref = (volatile uint32_t *)CRL_APB_GEM0_REF_CTRL; + uint32_t div1; + uint32_t val; + + switch (speed_mbps) { + case 1000: div1 = 1; break; + case 100: div1 = 5; break; + case 10: div1 = 50; break; + default: div1 = 1; break; + } + val = (1u << 26) /* CLKACT */ + | (1u << 25) /* CLKACT_RX */ + | ((div1 & 0x3Fu) << 16) /* DIVISOR1 */ + | ((12u & 0x3Fu) << 8) /* DIVISOR0 */ + | (0u); /* SRCSEL = IOPLL */ + *gem3_ref = val; +} + +static void gem3_hw_reset(void) +{ + volatile uint32_t *rst = (volatile uint32_t *)CRL_APB_RST_LPD_IOU0; + volatile uint32_t *gem3ref = (volatile uint32_t *)CRL_APB_GEM0_REF_CTRL; + + uart_puts("GEM3 clk before: GEM3_REF_CTRL="); + uart_puthex(*gem3ref); + uart_puts(" RST_LPD_IOU0="); + uart_puthex(*rst); + uart_puts("\n"); + + *rst |= CRL_RST_GEM3; + { + volatile int d; + for (d = 0; d < 10000; d++) + ; + } + *rst &= ~CRL_RST_GEM3; + { + volatile int d; + for (d = 0; d < 100000; d++) /* ~10 ms post-reset settle */ + ; + } + + /* Force 125 MHz reference for the 1 Gbps case. zcu102_eth_init() + * downshifts this later if the PHY ends up at 100/10. */ + gem3_set_ref_clk(1000); + uart_puts("GEM3 clk after : GEM3_REF_CTRL="); + uart_puthex(*gem3ref); + uart_puts("\n"); +} + +/* --------------------------------------------------------------------- + * Public init + * ------------------------------------------------------------------- */ +int zcu102_eth_init(struct wolfIP_ll_dev *ll) +{ + uint8_t addr; + uint16_t id1; + int found_phy; + int speed; + int fd; + int link_up; + + gem3_hw_reset(); + + /* Disable everything before configuring. 
*/ + GEM_NWCTRL = 0; + GEM_IDR = 0xFFFFFFFFu; + (void)GEM_ISR; + GEM_ISR = 0xFFFFFFFFu; + GEM_TSR = 0xFFFFFFFFu; + GEM_RSR = RSR_BUFFNA | RSR_FRAMERX | RSR_RXOVR | RSR_HRESPNOK; + + /* Initial NWCFG: gigabit, full duplex, MDC=/96, 1536-byte frames, + * strip FCS from RX, accept broadcasts, multicast via hash, + * DWIDTH_64 because ZynqMP GEM hangs on a 64-bit AXI bus and + * needs this bit for TX to actually transmit (matches U-Boot + * ZYNQ_GEM_DBUS_WIDTH for CONFIG_ARM64). + * COPYALL temporarily on for first-bring-up so we can confirm + * the RX path is alive even if filtering is mis-set. */ + GEM_NWCFG = NWCFG_1000 + | NWCFG_FDEN + | NWCFG_FCSREM + | NWCFG_1536RXEN + | NWCFG_MCASTHASHEN + | NWCFG_COPYALL + | NWCFG_DWIDTH_64 + | (5u << NWCFG_MDCDIV_SHIFT); + + /* DMACR: AHB fixed burst 16 beats, RX buffer 1536/64=24, TX/RX + * packet buffer memory at max. Do NOT set bit 30 (DMA_ADDR_BUS_WIDTH + * 64-bit): that selects 16-byte BD format with addr_hi, which would + * break the 8-byte struct gem_bd layout (MAC would walk every other + * BD and write to bogus high addresses, dropping the frame after + * counting it - exactly the failure mode we hit). 64-bit AXI bus + * width is set in NWCFG bit 21 instead. */ + GEM_DMACR = (24u << 16) /* RX buffer size in 64-byte units */ + | (1u << 10) /* TX packet buffer memory size = max */ + | (3u << 8) /* RX packet buffer memory size = max */ + | 0x10u; /* burst length = 16 */ + + /* Set MAC address into SAB1/SAT1. SAB1L writes are latched on + * SAB1H write per TRM, so write the high half last. */ + GEM_LADDR1L = (uint32_t)WOLFIP_MAC_0 + | ((uint32_t)WOLFIP_MAC_1 << 8) + | ((uint32_t)WOLFIP_MAC_2 << 16) + | ((uint32_t)WOLFIP_MAC_3 << 24); + GEM_LADDR1H = (uint32_t)WOLFIP_MAC_4 + | ((uint32_t)WOLFIP_MAC_5 << 8); + + GEM_HASHL = 0; + GEM_HASHH = 0; + + /* Build BD rings. 
*/ + rx_ring_init(); + tx_ring_init(); + GEM_RXQBASE = (uint32_t)(uintptr_t)rx_ring; + GEM_TXQBASE = (uint32_t)(uintptr_t)tx_ring; + + /* Disable priority queues 1-3 with dummy BDs. Without this, the + * MAC may walk uninitialised q1/q2/q3 base pointers and hang + * (TSR.TXGO sticks but no octets transmitted). */ + dummy_tx_bd.addr = 0; + dummy_tx_bd.status = TXBUF_USED | TXBUF_WRAP | TXBUF_LAST; + dummy_rx_bd.addr = RXBUF_WRAP | RXBUF_OWN_SW; + dummy_rx_bd.status = 0; + GEM_TXQ1BASE = (uint32_t)(uintptr_t)&dummy_tx_bd; + GEM_TXQ2BASE = (uint32_t)(uintptr_t)&dummy_tx_bd; + GEM_TXQ3BASE = (uint32_t)(uintptr_t)&dummy_tx_bd; + GEM_RXQ1BASE = (uint32_t)(uintptr_t)&dummy_rx_bd; + GEM_RXQ2BASE = (uint32_t)(uintptr_t)&dummy_rx_bd; + GEM_RXQ3BASE = (uint32_t)(uintptr_t)&dummy_rx_bd; + cache_clean(&dummy_tx_bd, sizeof(dummy_tx_bd)); + cache_clean(&dummy_rx_bd, sizeof(dummy_rx_bd)); + + /* Clear any stale RX/TX packet classification screening. ZynqMP + * GEM has SCREENING_TYPE_1 (TID match) at 0x500+ and SCREENING_TYPE_2 + * (compare) at 0x540+. If non-zero, frames may be routed to non-Q0 + * queues. Default 0 = all to Q0. */ + { + uint32_t k; + for (k = 0; k < 16; k++) { + *(volatile uint32_t *)(GEM0_BASE + 0x500 + 4*k) = 0; + *(volatile uint32_t *)(GEM0_BASE + 0x540 + 4*k) = 0; + } + } + + /* Enable MDIO so we can talk to the PHY. */ + GEM_NWCTRL |= NWCTRL_MDEN; + + /* Probe MDIO addresses 0..31 for a responsive PHY. ZCU102 routes + * DP83867 to MDIO address 0x0C, but probing makes the driver + * resilient to board variants. 
*/ + found_phy = 0; + for (addr = 0; addr < 32; addr++) { + if (gem_mdio_read(addr, 0x02, &id1) == 0 && id1 != 0xFFFFu && id1 != 0) { + found_phy = 1; + break; + } + } + if (!found_phy) { + uart_puts("GEM3: no PHY responding on MDIO!\n"); + return -10; + } + phy_addr_used = addr; + uart_puts("GEM3: PHY at MDIO addr="); + uart_puthex(phy_addr_used); + uart_puts("\n"); + + if (dp83867_init(phy_addr_used, &speed, &fd) < 0) { + uart_puts("GEM3: PHY init failed\n"); + return -11; + } + + /* If PHY ended up at 10/100, downshift the MAC and re-program the + * GEM3 reference clock to match (125 MHz / 25 MHz / 2.5 MHz). */ + if (speed != 1000) { + uint32_t cfg = GEM_NWCFG; + cfg &= ~NWCFG_1000; + if (speed == 100) + cfg |= NWCFG_SPEED100; + else + cfg &= ~NWCFG_SPEED100; + if (!fd) + cfg &= ~NWCFG_FDEN; + GEM_NWCFG = cfg; + gem3_set_ref_clk(speed); + } + + /* Install RX ISR. */ + gic_register_handler(IRQ_GEM0, gem_isr); + gic_enable_spi(IRQ_GEM0, 0xA0); + + /* Enable RX/TX and arm RX-side interrupts. */ + GEM_IER = IXR_FRAMERX | IXR_RXUSED | IXR_RXOVR | IXR_HRESPNOK; + GEM_NWCTRL |= NWCTRL_RXEN | NWCTRL_TXEN; + + /* Populate wolfIP ll_dev. */ + ll->mac[0] = WOLFIP_MAC_0; + ll->mac[1] = WOLFIP_MAC_1; + ll->mac[2] = WOLFIP_MAC_2; + ll->mac[3] = WOLFIP_MAC_3; + ll->mac[4] = WOLFIP_MAC_4; + ll->mac[5] = WOLFIP_MAC_5; + memcpy(ll->ifname, "eth0", 5); + ll->non_ethernet = 0; + ll->mtu = LINK_MTU; + ll->poll = eth_poll; + ll->send = eth_send; + ll->priv = NULL; + + link_up = (dp83867_link_status(phy_addr_used) == 1) ? 1 : 0; + return (link_up << 8) | (int)phy_addr_used; +} diff --git a/src/port/versal/gem.h b/src/port/versal/gem.h new file mode 100644 index 00000000..48aa8b33 --- /dev/null +++ b/src/port/versal/gem.h @@ -0,0 +1,35 @@ +/* gem.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * Cadence GEM driver for Xilinx UltraScale+ MPSoC GEM3 (on-board RJ45 + * on ZCU102). 
Single-instance, RGMII, gigabit, polled TX, IRQ-driven + * RX. + */ +#ifndef ZCU102_GEM_H +#define ZCU102_GEM_H + +#include +#include "../../../wolfip.h" + +/* Initialize GEM3, MMIO clock + reset, PHY, and populate the wolfIP + * link-layer device. Returns: + * < 0 on error (negated TRM code) + * bits [7:0] PHY MDIO address used + * bit [8] link_up flag (1 = link is up at end of init) + */ +int zcu102_eth_init(struct wolfIP_ll_dev *ll); + +/* MDIO helpers exposed for the PHY driver (phy_dp83867.c). */ +int gem_mdio_read(uint8_t phy_addr, uint8_t reg, uint16_t *out); +int gem_mdio_write(uint8_t phy_addr, uint8_t reg, uint16_t value); + +/* Diagnostics: dump GEM registers and counters to UART. */ +void gem_dump_state(void); +uint32_t gem_irq_count(void); +uint32_t gem_rx_frames(void); +uint32_t gem_tx_sent(void); + +#endif /* ZCU102_GEM_H */ diff --git a/src/port/versal/gic.c b/src/port/versal/gic.c new file mode 100644 index 00000000..8cc42028 --- /dev/null +++ b/src/port/versal/gic.c @@ -0,0 +1,218 @@ +/* gic.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * GICv3 minimal driver for Cortex-A72 on Versal Gen 1. 
GICv3 differs + * from the ZCU102's GIC-400 (GICv2) in three structural ways: + * + * - The CPU interface is accessed via AArch64 system registers + * (ICC_*_EL1 / ICC_*_EL3) rather than memory-mapped GICC. + * - Each CPU has its own redistributor (GICR) memory map; SGI/PPI + * enable/priority/group regs live there instead of GICD. + * - Affinity routing is the default; legacy CPU-target byte fields + * do not exist for SPIs. + * + * BL31 (TF-A) on Versal normally initialises the GIC distributor and + * the per-CPU redistributor for us; we only re-prime per-INTID config + * for the SPIs we use and enable the CPU interface for our EL. + * + * UNTESTED ON HARDWARE -- structural scaffold mirroring the GICv2 + * driver under src/port/zcu102/gic.c. Bring-up steps and edge cases + * (interrupt routing model, security state, GIC600 vs GIC500 quirks) + * will need to be validated when the VMK180 lab board is available. + */ +#include +#include "board.h" +#include "gic.h" + +/* Distributor registers */ +#define GICD_CTLR (*(volatile uint32_t *)(GICD_BASE + 0x000)) +#define GICD_TYPER (*(volatile uint32_t *)(GICD_BASE + 0x004)) +#define GICD_IGROUPR(n) (*(volatile uint32_t *)(GICD_BASE + 0x080 + 4*(n))) +#define GICD_ISENABLER(n) (*(volatile uint32_t *)(GICD_BASE + 0x100 + 4*(n))) +#define GICD_ICENABLER(n) (*(volatile uint32_t *)(GICD_BASE + 0x180 + 4*(n))) +#define GICD_ISPENDR(n) (*(volatile uint32_t *)(GICD_BASE + 0x200 + 4*(n))) +#define GICD_ICPENDR(n) (*(volatile uint32_t *)(GICD_BASE + 0x280 + 4*(n))) +#define GICD_IPRIORITYR(n) (*(volatile uint32_t *)(GICD_BASE + 0x400 + 4*(n))) +#define GICD_ICFGR(n) (*(volatile uint32_t *)(GICD_BASE + 0xC00 + 4*(n))) +#define GICD_IROUTER(n) (*(volatile uint64_t *)(GICD_BASE + 0x6000 + 8*(n))) + +#define GICD_CTLR_ARE_S (1u << 4) +#define GICD_CTLR_ARE_NS (1u << 5) +#define GICD_CTLR_ENG0 (1u << 0) +#define GICD_CTLR_ENG1S (1u << 2) + +/* Redistributor for CPU 0 */ +#define GICR_CTLR (*(volatile uint32_t *)(GICR_BASE + 0x000)) 
+#define GICR_WAKER (*(volatile uint32_t *)(GICR_BASE + 0x014)) + +#define GICR_SGI_BASE (GICR_BASE + 0x10000) +#define GICR_IGROUPR0 (*(volatile uint32_t *)(GICR_SGI_BASE + 0x080)) +#define GICR_ISENABLER0 (*(volatile uint32_t *)(GICR_SGI_BASE + 0x100)) +#define GICR_ICENABLER0 (*(volatile uint32_t *)(GICR_SGI_BASE + 0x180)) +#define GICR_IPRIORITYR(n) (*(volatile uint32_t *)(GICR_SGI_BASE + 0x400 + 4*(n))) +#define GICR_ICFGR0 (*(volatile uint32_t *)(GICR_SGI_BASE + 0xC00)) +#define GICR_ICFGR1 (*(volatile uint32_t *)(GICR_SGI_BASE + 0xC04)) + +#define GICR_WAKER_PS (1u << 1) +#define GICR_WAKER_CA (1u << 2) + +#define GIC_NR_LINES 224 +static gic_handler_t handlers[GIC_NR_LINES]; +static volatile uint32_t g_irq_total; +static volatile uint32_t g_irq_last_intid; + +void gic_register_handler(uint32_t intid, gic_handler_t fn) +{ + if (intid < GIC_NR_LINES) + handlers[intid] = fn; +} + +/* ICC_*_EL1 / ICC_*_EL3 system register accessors. The encoded + * MSR/MRS forms below avoid relying on a particular assembler + * version supporting the symbolic names. 
*/ +static inline void icc_sre_el3_set(uint64_t v) +{ + __asm__ volatile ("msr S3_6_C12_C12_5, %0" :: "r"(v)); + __asm__ volatile ("isb" ::: "memory"); +} + +static inline void icc_pmr_el1_set(uint64_t v) +{ + __asm__ volatile ("msr S3_0_C4_C6_0, %0" :: "r"(v)); +} + +static inline void icc_igrpen1_el1_set(uint64_t v) +{ + __asm__ volatile ("msr S3_0_C12_C12_7, %0" :: "r"(v)); +} + +static inline void icc_igrpen0_el1_set(uint64_t v) +{ + __asm__ volatile ("msr S3_0_C12_C12_6, %0" :: "r"(v)); +} + +static inline uint64_t icc_iar1_el1_read(void) +{ + uint64_t v; + __asm__ volatile ("mrs %0, S3_0_C12_C12_0" : "=r"(v)); + return v; +} + +static inline void icc_eoir1_el1_write(uint64_t v) +{ + __asm__ volatile ("msr S3_0_C12_C12_1, %0" :: "r"(v)); +} + +static inline void icc_ctlr_el1_set(uint64_t v) +{ + __asm__ volatile ("msr S3_0_C12_C12_4, %0" :: "r"(v)); +} + +static void gicr_wakeup(void) +{ + uint32_t waker = GICR_WAKER; + waker &= ~GICR_WAKER_PS; + GICR_WAKER = waker; + while (GICR_WAKER & GICR_WAKER_CA) + ; +} + +void gic_init(void) +{ + uint32_t i; + + GICD_CTLR = GICD_CTLR_ARE_S | GICD_CTLR_ENG1S; + + for (i = 1; i < (GIC_NR_LINES / 32u); i++) { + GICD_IGROUPR(i) = 0xFFFFFFFFu; + GICD_ICENABLER(i) = 0xFFFFFFFFu; + } + for (i = 8u; i < (GIC_NR_LINES / 4u); i++) + GICD_IPRIORITYR(i) = 0xA0A0A0A0u; + for (i = 32; i < GIC_NR_LINES; i++) + GICD_IROUTER(i) = 0; + + gicr_wakeup(); + GICR_IGROUPR0 = 0xFFFFFFFFu; + GICR_ICENABLER0 = 0xFFFFFFFFu; + for (i = 0; i < 8; i++) + GICR_IPRIORITYR(i) = 0xA0A0A0A0u; + + icc_sre_el3_set(0xF); + icc_pmr_el1_set(0xF8); + icc_ctlr_el1_set(0); + icc_igrpen1_el1_set(1); + icc_igrpen0_el1_set(1); +} + +void gic_enable_spi(uint32_t intid, uint32_t priority) +{ + uint32_t reg, shift; + volatile uint8_t *prio_byte; + + prio_byte = (volatile uint8_t *)(GICD_BASE + 0x400); + prio_byte[intid] = (uint8_t)(priority & 0xF8u); + + GICD_IGROUPR(intid >> 5) |= (1u << (intid & 31u)); + GICD_IROUTER(intid) = 0; + + shift = (intid & 15u) * 2u; + reg 
= GICD_ICFGR(intid >> 4); + reg &= ~(3u << shift); + GICD_ICFGR(intid >> 4) = reg; + + GICD_ICPENDR(intid >> 5) = (1u << (intid & 31u)); + GICD_ISENABLER(intid >> 5) = (1u << (intid & 31u)); +} + +void irq_dispatch(void) +{ + uint64_t iar; + uint32_t intid; + + iar = icc_iar1_el1_read(); + intid = (uint32_t)(iar & 0xFFFFFFu); + g_irq_total++; + g_irq_last_intid = intid; + if (intid < GIC_NR_LINES && handlers[intid] != 0) + handlers[intid](); + icc_eoir1_el1_write(iar); +} + +uint32_t gic_total_irqs(void) { return g_irq_total; } +uint32_t gic_last_intid(void) { return g_irq_last_intid; } + +uint32_t gic_is_pending(uint32_t intid) +{ + return (GICD_ISPENDR(intid >> 5) >> (intid & 31u)) & 1u; +} + +void gic_disable_spi(uint32_t intid) +{ + GICD_ICENABLER(intid >> 5) = (1u << (intid & 31u)); +} + +void gic_self_test_sgi(uint32_t intid) +{ + /* GICv3 ICC_SGI1R_EL1: target self via target list 1 */ + uint64_t v = ((uint64_t)(intid & 0xF) << 24) | 1u; + __asm__ volatile ("msr S3_0_C12_C11_5, %0" :: "r"(v)); + __asm__ volatile ("isb" ::: "memory"); +} diff --git a/src/port/versal/gic.h b/src/port/versal/gic.h new file mode 100644 index 00000000..2a1eae9e --- /dev/null +++ b/src/port/versal/gic.h @@ -0,0 +1,49 @@ +/* gic.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + */ +#ifndef ZCU102_GIC_H +#define ZCU102_GIC_H + +#include + +typedef void (*gic_handler_t)(void); + +void gic_init(void); +void gic_register_handler(uint32_t intid, gic_handler_t fn); +void gic_enable_spi(uint32_t intid, uint32_t priority); +void gic_disable_spi(uint32_t intid); + +/* Returns 1 if interrupt is currently pending at the distributor, + * 0 otherwise. Diagnostic only. */ +uint32_t gic_is_pending(uint32_t intid); + +/* Fire a software-generated interrupt to self (CPU0) for testing. + * intid must be < 16. */ +void gic_self_test_sgi(uint32_t intid); + +/* Total IRQs taken (any intid) and the last intid we saw. 
 */
uint32_t gic_total_irqs(void);
uint32_t gic_last_intid(void);

/* Polled-mode IRQ dispatch: drains any pending IRQ from the GIC
 * by reading GICC_IAR, calling the registered handler, and EOI'ing.
 * Returns the number of interrupts dispatched in this call.
 *
 * Workaround: on this ZynqMP / Cortex-A53 / GIC-400 combination,
 * the GIC latches pending interrupts correctly but the CPU never
 * takes the IRQ exception (root cause not pinned). Calling this
 * function from the main loop is functionally equivalent.
 *
 * NOTE(review): the description above is inherited from the ZCU102
 * port. GICC_IAR is a memory-mapped GICv2 register, whereas gic.c in
 * this port acknowledges through the ICC_IAR1_EL1 system register.
 * gic.c in this port contains no definition of gic_poll_dispatch() and
 * main.c records that its call was removed, so a new caller would fail
 * to link - either implement it for GICv3 or drop this prototype. */
uint32_t gic_poll_dispatch(void);

/* Provided by startup.S, asm helpers. */
void irq_enable(void);
void irq_disable(void);

/* Called by the IRQ vector trampoline in startup.S. Acknowledges,
 * dispatches, and EOIs the current interrupt. */
void irq_dispatch(void);

#endif /* ZCU102_GIC_H */
diff --git a/src/port/versal/main.c b/src/port/versal/main.c
new file mode 100644
index 00000000..8984d98f
--- /dev/null
+++ b/src/port/versal/main.c
@@ -0,0 +1,324 @@
/* main.c
 *
 * Copyright (C) 2026 wolfSSL Inc.
 *
 * This file is part of wolfIP TCP/IP stack.
 *
 * wolfIP is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * wolfIP is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 *
 * wolfIP UDP echo + DHCP client demo for Xilinx Versal Gen 1 (VMK180
 * dev board, Cortex-A72 EL3 bare-metal, GEM0 -> on-board RJ45).
+ * + * UNTESTED ON HARDWARE -- structural scaffold mirroring the ZCU102 + * port. See src/port/versal/README.md. + */ +#include +#include +#include "config.h" +#include "../../../wolfip.h" +#include "board.h" +#include "uart.h" +#include "gic.h" +#include "gem.h" +#include "timer.h" + +#define ECHO_PORT 7 +#define RX_BUF_SIZE 1500 + +static struct wolfIP *IPStack; +static int udp_fd = -1; +static uint8_t udp_rx_buf[RX_BUF_SIZE]; + +/* Override newlib memset/memcpy with plain bytewise versions via + * linker --wrap. The aarch64 newlib memset uses 'dc zva' which hangs + * on this Cortex-A53 setup (DZE=1 in SCTLR_EL3 doesn't help; the + * instruction wedges before completing). The Makefile passes + * -Wl,--wrap=memset -Wl,--wrap=memcpy so all calls get redirected + * to these __wrap_ functions. */ +void *__wrap_memset(void *s, int c, unsigned long n) +{ + unsigned char *p = (unsigned char *)s; + while (n--) + *p++ = (unsigned char)c; + return s; +} + +void *__wrap_memcpy(void *dest, const void *src, unsigned long n) +{ + unsigned char *d = (unsigned char *)dest; + const unsigned char *s = (const unsigned char *)src; + while (n--) + *d++ = *s++; + return dest; +} + +/* Called from startup.S vector trampoline on synchronous/SError fault. 
*/ +void exception_report(uint64_t esr, uint64_t elr, uint64_t far, uint64_t spsr) +{ + uart_puts("\n\n*** EL3 SYNC EXCEPTION ***\n"); + uart_puts(" ESR_EL3 : "); uart_puthex((uint32_t)(esr >> 32)); + uart_puthex((uint32_t)esr); uart_puts("\n"); + uart_puts(" EC = "); uart_puthex((uint32_t)((esr >> 26) & 0x3F)); + uart_puts(" (0x21=instr abort, 0x25=data abort, 0x24=alignment)\n"); + uart_puts(" ELR_EL3 : "); uart_puthex((uint32_t)(elr >> 32)); + uart_puthex((uint32_t)elr); uart_puts("\n"); + uart_puts(" FAR_EL3 : "); uart_puthex((uint32_t)(far >> 32)); + uart_puthex((uint32_t)far); uart_puts("\n"); + uart_puts(" SPSR_EL3: "); uart_puthex((uint32_t)spsr); uart_puts("\n"); +} + +void exception_report_serror(uint64_t esr, uint64_t elr, uint64_t far, + uint64_t spsr, uint64_t kind) +{ + (void)kind; + uart_puts("\n\n*** EL3 SError / FIQ ***\n"); + uart_puts(" ESR_EL3 : "); uart_puthex((uint32_t)(esr >> 32)); + uart_puthex((uint32_t)esr); uart_puts("\n"); + uart_puts(" ELR_EL3 : "); uart_puthex((uint32_t)(elr >> 32)); + uart_puthex((uint32_t)elr); uart_puts("\n"); + uart_puts(" FAR_EL3 : "); uart_puthex((uint32_t)(far >> 32)); + uart_puthex((uint32_t)far); uart_puts("\n"); + uart_puts(" SPSR_EL3: "); uart_puthex((uint32_t)spsr); uart_puts("\n"); +} + +/* wolfIP needs a 32-bit random word for protocol identifiers (TCP ISN, + * DHCP xid, DNS id, ephemeral source port, IP fragment id). We delegate + * to the port-local memuse-pattern entropy source (entropy.c), which + * follows the algorithm of wolfCrypt's wc_Entropy_Get() but is + * self-contained for cert isolation. 
*/ +extern uint32_t zcu102_get_random32(void); + +uint32_t wolfIP_getrandom(void) +{ + return zcu102_get_random32(); +} + +static void udp_echo_cb(int fd, uint16_t event, void *arg) +{ + struct wolfIP *s = (struct wolfIP *)arg; + struct wolfIP_sockaddr_in peer; + uint32_t peer_len = sizeof(peer); + int n; + + if (!(event & CB_EVENT_READABLE)) + return; + + n = wolfIP_sock_recvfrom(s, fd, udp_rx_buf, sizeof(udp_rx_buf), 0, + (struct wolfIP_sockaddr *)&peer, &peer_len); + if (n > 0) { + (void)wolfIP_sock_sendto(s, fd, udp_rx_buf, (uint32_t)n, 0, + (struct wolfIP_sockaddr *)&peer, peer_len); + uart_puts("UDP echo: "); uart_putdec((uint32_t)n); + uart_puts(" bytes from "); uart_putip4(peer.sin_addr.s_addr); + uart_puts("\n"); + } +} + +int main(void) +{ + struct wolfIP_ll_dev *ll; + struct wolfIP_sockaddr_in addr; + uint64_t tick = 0; + int ret; + + uart_init(); + uart_puts("\n\n=== wolfIP Versal Gen 1 (VMK180, Cortex-A72 EL3) ===\n"); + uart_puts("MMU on, caches on. Bringing up GIC-400...\n"); + + gic_init(); + + uart_puts("Initializing wolfIP stack...\n"); + wolfIP_init_static(&IPStack); + + uart_puts("Bringing up GEM0 (RGMII, DP83867)...\n"); + ll = wolfIP_getdev(IPStack); + ret = zcu102_eth_init(ll); + if (ret < 0) { + uart_puts("ERROR: zcu102_eth_init failed: "); + uart_puthex((uint32_t)ret); + uart_puts("\n"); + while (1) + ; + } + uart_puts(" link "); uart_puts((ret & 0x100) ? "UP" : "DOWN"); + uart_puts(", PHY="); uart_puthex((uint32_t)(ret & 0xFF)); + uart_puts("\n"); + + /* Unmask IRQ at CPU now that GEM3 SPI is enabled at GICD. The CPU + * IRQ exception is currently not delivered on this A53/EL3 setup + * (open issue - see README); eth_poll() drives gem_isr() from the + * main loop instead. The SGI self-test instrumentation below is + * left in place behind DEBUG_GIC for further investigation. */ + irq_enable(); +#ifdef DEBUG_GIC + uart_puts("IRQ enabled. 
Self-test: firing SGI 0...\n"); + { + uint32_t before = gic_total_irqs(); + uint64_t daif, scr, vbar; + __asm__ volatile ("mrs %0, daif" : "=r"(daif)); + __asm__ volatile ("mrs %0, scr_el3" : "=r"(scr)); + __asm__ volatile ("mrs %0, vbar_el3" : "=r"(vbar)); + uart_puts(" pre: DAIF="); uart_puthex((uint32_t)daif); + uart_puts(" SCR_EL3="); uart_puthex((uint32_t)scr); + uart_puts(" VBAR_EL3="); uart_puthex((uint32_t)vbar); + uart_puts("\n"); + { + uint32_t vec_irq_curr_spx; + vec_irq_curr_spx = *(volatile uint32_t *)(vbar + 0x280); + uart_puts(" vec[Cur SPx IRQ] @ "); + uart_puthex((uint32_t)(vbar + 0x280)); + uart_puts(" = "); + uart_puthex(vec_irq_curr_spx); + uart_puts(" (B opcode: top byte 0x14 expected)\n"); + } + uart_puts(" GICD_CTLR="); uart_puthex(*(volatile uint32_t *)(GICD_BASE + 0x000)); + uart_puts(" GICD_ISENABLER(0)="); uart_puthex(*(volatile uint32_t *)(GICD_BASE + 0x100)); + uart_puts(" GICD_IGROUPR(0)="); uart_puthex(*(volatile uint32_t *)(GICD_BASE + 0x080)); + uart_puts("\n"); + uart_puts(" GICC_CTLR="); uart_puthex(*(volatile uint32_t *)(GICC_BASE + 0x000)); + uart_puts(" GICC_PMR="); uart_puthex(*(volatile uint32_t *)(GICC_BASE + 0x004)); + uart_puts("\n"); + gic_self_test_sgi(0); + delay_ms(10); + { + uint64_t isr, rpr; + __asm__ volatile ("mrs %0, isr_el1" : "=r"(isr)); + rpr = *(volatile uint32_t *)(GICC_BASE + 0x014); + uart_puts(" post-SGI: ISR_EL1="); + uart_puthex((uint32_t)isr); + uart_puts(" (bit7=I, bit6=F, bit8=A)\n"); + uart_puts(" GICC_RPR="); uart_puthex((uint32_t)rpr); + uart_puts(" (running priority; 0xFF=idle)\n"); + } + uart_puts(" SGI fired. 
gic_total_irqs: "); + uart_putdec(before); + uart_puts(" -> "); + uart_putdec(gic_total_irqs()); + uart_puts(" last_intid="); + uart_puthex(gic_last_intid()); + uart_puts("\n GICD_ISPENDR(0)="); uart_puthex(*(volatile uint32_t *)(GICD_BASE + 0x200)); + uart_puts(" GICC_HPPIR="); uart_puthex(*(volatile uint32_t *)(GICC_BASE + 0x018)); + uart_puts("\n"); + { + uint32_t iar = *(volatile uint32_t *)(GICC_BASE + 0x00C); + uart_puts(" polled GICC_IAR="); uart_puthex(iar); + uart_puts("\n"); + if ((iar & 0x3FF) != 0x3FF) { + *(volatile uint32_t *)(GICC_BASE + 0x010) = iar; + uart_puts(" EOI'd. polled GICC_HPPIR after="); + uart_puthex(*(volatile uint32_t *)(GICC_BASE + 0x018)); + uart_puts("\n"); + } + } + /* Extra system-register snapshot. FSBL/ATF sometimes leaves + * HCR_EL2 / MDCR_EL3 / OSLAR_EL1 with bits set that affect + * exception routing or debug halt; dump them so we can rule + * those out. NOTE: WFI wake test was tried here and hangs + * the CPU even though ISR_EL1.I=1 was observed earlier - the + * GIC appears to assert and deassert nIRQ within a few cycles + * rather than holding it level until ACK. That is consistent + * with edge-triggered SGI behavior but is not what the spec + * requires; it leaves no time for the exception logic to + * latch the event. 
*/ + { + uint64_t hcr, mdcr, sctlr, oslsr; + __asm__ volatile ("mrs %0, hcr_el2" : "=r"(hcr)); + __asm__ volatile ("mrs %0, mdcr_el3" : "=r"(mdcr)); + __asm__ volatile ("mrs %0, sctlr_el3" : "=r"(sctlr)); + __asm__ volatile ("mrs %0, oslsr_el1" : "=r"(oslsr)); + uart_puts(" HCR_EL2="); uart_puthex((uint32_t)hcr); + uart_puts(" MDCR_EL3="); uart_puthex((uint32_t)mdcr); + uart_puts("\n SCTLR_EL3="); uart_puthex((uint32_t)sctlr); + uart_puts(" OSLSR_EL1="); uart_puthex((uint32_t)oslsr); + uart_puts("\n"); + } + } +#endif +#ifdef DEBUG_GEM + uart_puts("Initial GEM state:\n"); + gem_dump_state(); +#endif + +#ifdef DHCP + if (dhcp_client_init(IPStack) >= 0) { + uint32_t dhcp_elapsed = 0; + const uint32_t dhcp_timeout = 15000; + uart_puts("Starting DHCP client...\n"); + while (!dhcp_bound(IPStack) && dhcp_client_is_running(IPStack) + && dhcp_elapsed < dhcp_timeout) { + (void)wolfIP_poll(IPStack, tick); + tick++; + delay_ms(1); + dhcp_elapsed++; + /* gic_poll_dispatch removed - eth_poll already polls + * GEM_ISR directly. Doubling up here just spins. */ +#ifdef DEBUG_GEM + if ((dhcp_elapsed % 1000) == 0) { + uart_puts(" ["); uart_putdec(dhcp_elapsed); + uart_puts(" ms] bound="); + uart_putdec(dhcp_bound(IPStack) ? 1u : 0u); + uart_puts(" running="); + uart_putdec(dhcp_client_is_running(IPStack) ? 
1u : 0u); + uart_puts("\n"); + gem_dump_state(); + } +#endif + } + if (dhcp_bound(IPStack)) { + ip4 ip = 0, nm = 0, gw = 0; + wolfIP_ipconfig_get(IPStack, &ip, &nm, &gw); + uart_puts("DHCP bound:\n IP: "); uart_putip4(ip); + uart_puts("\n Mask: "); uart_putip4(nm); + uart_puts("\n GW: "); uart_putip4(gw); + uart_puts("\n"); + } else { + ip4 ip = atoip4(WOLFIP_IP); + ip4 nm = atoip4(WOLFIP_NETMASK); + ip4 gw = atoip4(WOLFIP_GW); + uart_puts("DHCP timeout - using static IP\n"); + wolfIP_ipconfig_set(IPStack, ip, nm, gw); + } + } +#else + { + ip4 ip = atoip4(WOLFIP_IP); + ip4 nm = atoip4(WOLFIP_NETMASK); + ip4 gw = atoip4(WOLFIP_GW); + wolfIP_ipconfig_set(IPStack, ip, nm, gw); + uart_puts("Static IP: "); uart_putip4(ip); uart_puts("\n"); + } +#endif + + uart_puts("Opening UDP echo socket on port "); + uart_putdec(ECHO_PORT); uart_puts("\n"); + udp_fd = wolfIP_sock_socket(IPStack, AF_INET, IPSTACK_SOCK_DGRAM, 0); + wolfIP_register_callback(IPStack, udp_fd, udp_echo_cb, IPStack); + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_port = ee16(ECHO_PORT); + addr.sin_addr.s_addr = 0; + (void)wolfIP_sock_bind(IPStack, udp_fd, + (struct wolfIP_sockaddr *)&addr, sizeof(addr)); + + uart_puts("Ready. Try: nc -u 7\n\n"); + + for (;;) { + (void)wolfIP_poll(IPStack, tick++); + delay_ms(1); + } + + return 0; +} diff --git a/src/port/versal/mmu.c b/src/port/versal/mmu.c new file mode 100644 index 00000000..ec70dfd5 --- /dev/null +++ b/src/port/versal/mmu.c @@ -0,0 +1,227 @@ +/* mmu.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. 
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * Minimal MMU bring-up for the Cortex-A72 at EL3 with a 32-bit virtual
+ * address space (T0SZ=32, start level L1). Static tables map the full
+ * 4 GB VA range:
+ *
+ *   L1[0] -> L2_DDR    (0x00000000 .. 0x3FFFFFFF, 1 GB, 2 MB granular)
+ *   L1[1] -> 0x40000000 .. 0x7FFFFFFF Normal WB IS (1 GB block)
+ *   L1[2] -> invalid   (0x80000000 .. 0xBFFFFFFF)
+ *   L1[3] -> L2_PERIPH (0xC0000000 .. 0xFFFFFFFF, 1 GB, 2 MB granular)
+ *
+ * L2_DDR has a Normal-NC carve-out for any 2 MB block overlapping the
+ * linker's [_dma_buffers_start, _dma_buffers_end) range. In the current
+ * OCM-only layout the .dma_buffers section lives in OCM (mapped via
+ * L2_PERIPH[511] Normal-WB), so this carve-out is effectively dormant -
+ * GEM DMA coherency is handled with explicit DC CVAC / IVAC ops in
+ * gem.c. The carve-out remains in the tables so a future DDR-resident
+ * .dma_buffers can be mapped Normal-NC without an MMU change; builds
+ * with VERSAL_LAYOUT_DDR defined skip it and keep all of DDR Normal-WB.
+ *
+ * L2_PERIPH covers the PS peripheral aperture as Device-nGnRnE except
+ * entry 511 (0xFFE00000..0xFFFFFFFF) which is Normal-WB executable so
+ * code can be fetched from OCM (0xFFFC0000..0xFFFFFFFF) after the MMU
+ * is enabled.
+ * + * MAIR_EL3: + * ATTR0 = 0xFF (Normal Inner+Outer WB Cacheable, Read+Write alloc) + * ATTR1 = 0x00 (Device-nGnRnE - PS peripherals) + * ATTR2 = 0x44 (Normal Inner+Outer Non-Cacheable - reserved for a + * future DDR DMA carve-out) + * + * Block descriptor low attributes: + * bits[1:0] = 0b01 block + * bits[5:2] = AttrIndx + * bits[7:6] = AP = 0 (RW at EL3) + * bits[9:8] = SH = 0b11 inner-shareable (only meaningful for Normal) + * bit[10] = AF = 1 + * bit[54] = UXN/XN = 1 for Device, 0 for Normal RX + */ +#include +#include "mmu.h" + +extern uint8_t _dma_buffers_start[]; +extern uint8_t _dma_buffers_end[]; + +/* L1 has 4 entries (one per GB in our 4 GB VA). Section attribute keeps + * it in the dedicated .page_tables area so it lives at a known DDR + * address - the MMU walker still uses physical addresses to read it. */ +static volatile uint64_t L1[512] + __attribute__((aligned(4096), section(".page_tables"))); +static volatile uint64_t L2_DDR[512] + __attribute__((aligned(4096), section(".page_tables"))); +/* L2 for the 3-4 GB region. Most blocks are Device (PS peripherals) + * but the 2 MB block at 0xFFE00000 - 0xFFFFFFFF must be Normal+exec + * because OCM (0xFFFC0000-0xFFFFFFFF) lives there and our code runs + * from OCM. 
*/
+static volatile uint64_t L2_PERIPH[512]
+    __attribute__((aligned(4096), section(".page_tables")));
+
+/* Translation-table descriptor fields. */
+#define DESC_VALID      (1ULL << 0)
+#define DESC_TABLE      (1ULL << 1)
+#define DESC_BLOCK      (0ULL << 1)
+#define DESC_AF         (1ULL << 10)
+#define DESC_SH_INNER   (3ULL << 8)
+#define DESC_AP_RW_EL3  (0ULL << 6)
+#define DESC_XN         (1ULL << 54)
+#define DESC_ATTR(i)    (((uint64_t)(i) & 7ULL) << 2)
+
+#define ATTR_NORMAL     0   /* AttrIndx 0 = MAIR ATTR0 (Normal WB) */
+#define ATTR_DEVICE     1   /* AttrIndx 1 = MAIR ATTR1 (Device) */
+#define ATTR_NORMAL_NC  2   /* AttrIndx 2 = MAIR ATTR2 (Normal NC) */
+
+/* Block descriptor: Normal WB, inner-shareable, executable. */
+#define BLOCK_NORMAL(pa) \
+    (((uint64_t)(pa)) | DESC_BLOCK | DESC_VALID | DESC_AF | \
+     DESC_SH_INNER | DESC_AP_RW_EL3 | DESC_ATTR(ATTR_NORMAL))
+
+/* Block descriptor: Device memory, execute-never, no shareability bits. */
+#define BLOCK_DEVICE(pa) \
+    (((uint64_t)(pa)) | DESC_BLOCK | DESC_VALID | DESC_AF | \
+     DESC_AP_RW_EL3 | DESC_ATTR(ATTR_DEVICE) | DESC_XN)
+
+/* Block descriptor: Normal non-cacheable, inner-shareable, execute-never. */
+#define BLOCK_NORMAL_NC(pa) \
+    (((uint64_t)(pa)) | DESC_BLOCK | DESC_VALID | DESC_AF | \
+     DESC_SH_INNER | DESC_AP_RW_EL3 | DESC_ATTR(ATTR_NORMAL_NC) | DESC_XN)
+
+/* Table descriptor pointing at a next-level table. */
+#define TABLE_DESC(pa) \
+    (((uint64_t)(pa)) | DESC_TABLE | DESC_VALID)
+
+#define L2_BLOCK_SIZE   (2ULL * 1024 * 1024)    /* 2 MB */
+#define L1_BLOCK_SIZE   (1024ULL * 1024 * 1024) /* 1 GB */
+
+/* Fill L1, L2_DDR and L2_PERIPH with the static identity map described
+ * in the file header. Every one of the 512 entries in each table is
+ * written, so the tables need no prior zeroing. Called once from
+ * mmu_enable() before the MMU is switched on. */
+static void mmu_build_tables(void)
+{
+    uint64_t addr;
+    uint64_t dma_lo;
+    uint64_t dma_hi;
+    int i;
+
+    /* L2_DDR: 512 entries covering 0..1 GB at 2 MB each. */
+    dma_lo = (uint64_t)(uintptr_t)_dma_buffers_start;
+    dma_hi = (uint64_t)(uintptr_t)_dma_buffers_end;
+#ifdef VERSAL_LAYOUT_DDR
+    /* DDR layout: the app and its dma_buffers both live in DDR. We
+     * keep all of DDR as Normal-WB cacheable and rely on the explicit
+     * DC CVAC / IVAC ops in gem.c for coherency. Disabling the NC
+     * carve-out means the stack (which shares a 2 MB block with the
+     * dma_buffers in the DDR linker layout) stays cacheable. */
+    (void)dma_lo; (void)dma_hi;
+#endif
+
+    for (i = 0; i < 512; i++) {
+        addr = (uint64_t)i * L2_BLOCK_SIZE;
+#ifdef VERSAL_LAYOUT_DDR
+        L2_DDR[i] = BLOCK_NORMAL(addr);
+#else
+        /* Block [addr, addr + 2 MB) is cacheable only when it lies
+         * entirely below dma_lo or starts at/after dma_hi. */
+        if ((addr + L2_BLOCK_SIZE) <= dma_lo || addr >= dma_hi) {
+            L2_DDR[i] = BLOCK_NORMAL(addr);
+        } else {
+            /* Any 2 MB block that overlaps the DMA region is Normal-NC
+             * so newlib's memcpy/memset (LDP/STP, unaligned tails) does
+             * not alignment-fault when staging frames into tx_buf_pool. */
+            L2_DDR[i] = BLOCK_NORMAL_NC(addr);
+        }
+#endif /* VERSAL_LAYOUT_DDR */
+    }
+
+    /* L2_PERIPH: 3..4 GB range. All Device-nGnRnE except the last
+     * 2 MB block which contains OCM (0xFFFC0000..0xFFFFFFFF) and
+     * must be Normal+executable so we can fetch our code from OCM. */
+    for (i = 0; i < 511; i++) {
+        addr = 3ULL * L1_BLOCK_SIZE + (uint64_t)i * L2_BLOCK_SIZE;
+        L2_PERIPH[i] = BLOCK_DEVICE(addr);
+    }
+    /* Entry 511 covers 0xFFE00000..0xFFFFFFFF. OCM is at 0xFFFC0000+,
+     * within this 2 MB block. Map as Normal WB cacheable, executable. */
+    L2_PERIPH[511] = BLOCK_NORMAL(3ULL * L1_BLOCK_SIZE
+                                  + 511ULL * L2_BLOCK_SIZE);
+
+    /* L1 entries. */
+    L1[0] = TABLE_DESC((uintptr_t)L2_DDR);
+    L1[1] = BLOCK_NORMAL(L1_BLOCK_SIZE);          /* 1..2 GB DDR */
+    L1[2] = 0;                                    /* 2..3 GB unused */
+    L1[3] = TABLE_DESC((uintptr_t)L2_PERIPH);     /* 3..4 GB peri + OCM */
+
+    /* Remaining L1 slots are invalid descriptors. */
+    for (i = 4; i < 512; i++)
+        L1[i] = 0;
+}
+
+/* Build the static tables, program MAIR/TCR/TTBR0 at EL3 and switch on
+ * the MMU together with the I- and D-caches. Takes no arguments and
+ * returns nothing. */
+void mmu_enable(void)
+{
+    uint64_t mair;
+    uint64_t tcr;
+    uint64_t sctlr;
+
+    mmu_build_tables();
+
+    /* Make sure the table writes are visible to the table walker
+     * before we point TTBR at them. We are still running with the
+     * D-cache off here, so a DSB SY is sufficient.
*/
+    __asm__ volatile ("dsb sy" ::: "memory");
+
+    /* MAIR_EL3:
+     *   ATTR0 = 0xFF (Normal WB Inner+Outer Cacheable)
+     *   ATTR1 = 0x00 (Device-nGnRnE)
+     *   ATTR2 = 0x44 (Normal Inner+Outer Non-Cacheable, for DMA buffers) */
+    mair = (0xFFULL << 0) | (0x00ULL << 8) | (0x44ULL << 16);
+    __asm__ volatile ("msr mair_el3, %0" :: "r"(mair));
+
+    /* TCR_EL3: 32-bit VA (T0SZ=32, start level L1), 4 KB granule,
+     * IRGN0=WB-RA-WA, ORGN0=WB-RA-WA, SH0=Inner shareable, IPS=40 bit.
+     * EL3 TCR has T0SZ at bits [5:0], IRGN0[9:8], ORGN0[11:10],
+     * SH0[13:12], TG0[15:14], PS[18:16], TBI[20], RES1 at bit 23,31.
+     */
+    tcr = (uint64_t)32                  /* T0SZ = 32 -> 4 GB VA */
+        | ((uint64_t)1 << 8)            /* IRGN0 = WB RA-WA */
+        | ((uint64_t)1 << 10)           /* ORGN0 = WB RA-WA */
+        | ((uint64_t)3 << 12)           /* SH0 = Inner shareable */
+        | ((uint64_t)0 << 14)           /* TG0 = 4 KB */
+        | ((uint64_t)2 << 16)           /* PS = 40 bit PA */
+        | ((uint64_t)1 << 23)           /* RES1 */
+        | ((uint64_t)1 << 31);          /* RES1 */
+    __asm__ volatile ("msr tcr_el3, %0" :: "r"(tcr));
+
+    /* TTBR0_EL3 = &L1. */
+    __asm__ volatile ("msr ttbr0_el3, %0" :: "r"((uint64_t)(uintptr_t)L1));
+
+    /* Synchronise the MAIR/TCR/TTBR0 writes before the invalidates. */
+    __asm__ volatile ("isb" ::: "memory");
+
+    /* Invalidate TLBs and I-cache before turning the MMU on. */
+    __asm__ volatile ("tlbi alle3" ::: "memory");
+    __asm__ volatile ("ic iallu" ::: "memory");
+    __asm__ volatile ("dsb sy" ::: "memory");
+    __asm__ volatile ("isb" ::: "memory");
+
+    /* Enable MMU + I-cache + D-cache. Cache coherency with GEM DMA
+     * is handled with explicit DC CVAC / DC IVAC ops in eth_send and
+     * eth_poll (see gem.c cache_*() helpers).
+     *
+     * DZE bit 14 = enable DC ZVA at EL0/EL1 (and EL3 since we are
+     * here). Newlib aarch64 memset uses DC ZVA for fast bulk zero
+     * writes; without DZE=1 the instruction traps UNDEF and the
+     * exception loop wedges the CPU.
+     *
+     * NOTE(review): the Armv8-A ARM describes DZE as an SCTLR_EL1
+     * field gating EL0 use of DC ZVA, and bit 14 of SCTLR_EL3 appears
+     * to be RES0 - confirm against the register description before
+     * relying on this write. */
+    __asm__ volatile ("mrs %0, sctlr_el3" : "=r"(sctlr));
+    sctlr |= (1ULL << 0);   /* M */
+    sctlr |= (1ULL << 2);   /* C */
+    sctlr |= (1ULL << 12);  /* I */
+    sctlr |= (1ULL << 14);  /* DZE - allow DC ZVA */
+    sctlr &= ~(1ULL << 1);  /* A off */
+    __asm__ volatile ("msr sctlr_el3, %0" :: "r"(sctlr));
+    /* ISB so the following instructions run with the new SCTLR_EL3. */
+    __asm__ volatile ("isb" ::: "memory");
+}
diff --git a/src/port/versal/mmu.h b/src/port/versal/mmu.h
new file mode 100644
index 00000000..90ab7475
--- /dev/null
+++ b/src/port/versal/mmu.h
@@ -0,0 +1,12 @@
+/* mmu.h
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ */
+#ifndef ZCU102_MMU_H
+#define ZCU102_MMU_H
+
+/* Build the static EL3 translation tables and turn on the MMU plus
+ * I/D caches. Implemented in mmu.c. */
+void mmu_enable(void);
+
+#endif /* ZCU102_MMU_H */
diff --git a/src/port/versal/phy_dp83867.c b/src/port/versal/phy_dp83867.c
new file mode 100644
index 00000000..987c239c
--- /dev/null
+++ b/src/port/versal/phy_dp83867.c
@@ -0,0 +1,338 @@
+/* phy_dp83867.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * wolfIP is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * TI DP83867IR PHY init for the ZCU102 (PHY on RGMII to PS-GEM3).
+ * + * The DP83867 needs explicit RGMII TX and RX clock delay configuration + * (CFG4 / RGMIICTL extended registers) because the ZCU102 routes + * RGMII signals as a straight-through trace without external delay. + * Without this the link comes up at 1 Gbps but carries corrupt data + * (random RX frames, no TX). The Linux dp83867 driver and Xilinx + * device tree both apply a 2.0 ns TX + 2.0 ns RX skew - we match. + * + * Extended registers (>0x1F) are accessed via the IEEE-defined indirect + * pair (REGCR=0x0D, ADDAR=0x0E): + * 1. Write REGCR = 0x001F (address-of, devad 31). + * 2. Write ADDAR = . + * 3. Write REGCR = 0x401F (data, devad 31, no-increment). + * 4. Read/Write ADDAR = . + */ +#include +#include "gem.h" +#include "phy_dp83867.h" +#include "timer.h" +#include "uart.h" + +/* Standard IEEE PHY registers (clause 22) */ +#define PHY_BMCR 0x00 +#define PHY_BMSR 0x01 +#define PHY_ID1 0x02 +#define PHY_ID2 0x03 +#define PHY_ANAR 0x04 +#define PHY_GBCR 0x09 +#define PHY_GBSR 0x0A +#define PHY_REGCR 0x0D +#define PHY_ADDAR 0x0E + +#define BMCR_RESET (1u << 15) +#define BMCR_ANRESTART (1u << 9) +#define BMCR_ANEN (1u << 12) + +#define BMSR_ANCOMPLETE (1u << 5) +#define BMSR_LINK_UP (1u << 2) + +/* DP83867 extended registers (accessed via REGCR/ADDAR, devad 0x1F) */ +#define DP83867_CFG4 0x0031 /* Configuration 4 (RX_CTRL strap fix) */ +#define DP83867_RGMIICTL 0x0032 /* RGMII control */ +#define DP83867_STRAP_STS1 0x006E /* Strap status register (read-only) */ +#define DP83867_RGMIIDCTL 0x0086 /* RGMII delay control */ +#define DP83867_IO_MUX_CFG 0x0170 /* IO MUX config (impedance) */ + +/* Clause-22 register (direct access) */ +#define DP83867_PHYCR 0x10 /* PHY Control register */ +#define PHYCR_FIFO_DEPTH_MASK (3u << 14) +#define PHYCR_FIFO_DEPTH_8B (3u << 14) + +/* RGMIICTL bits */ +#define RGMIICTL_RX_DELAY_EN (1u << 0) +#define RGMIICTL_TX_DELAY_EN (1u << 1) + +/* RGMIIDCTL: TX delay in [3:0], RX delay in [7:4], each step ~0.25 ns. 
+ * 0x8 -> 2.0 ns (matches Linux/Xilinx default for ZCU102). */ +#define RGMIIDCTL_DELAY_2NS (0x8u | (0x8u << 4)) + +/* Speed read from PHY status register (DP83867 0x11) */ +#define DP83867_PHYSTS 0x0011 +#define PHYSTS_SPEED_MASK (3u << 14) +#define PHYSTS_SPEED_1000 (2u << 14) +#define PHYSTS_SPEED_100 (1u << 14) +#define PHYSTS_SPEED_10 (0u << 14) +#define PHYSTS_DUPLEX (1u << 13) + +static int phy_ext_write(uint8_t phy_addr, uint16_t ext_reg, uint16_t val) +{ + int rc; + rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x001Fu); + if (rc < 0) return rc; + rc = gem_mdio_write(phy_addr, PHY_ADDAR, ext_reg); + if (rc < 0) return rc; + rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x401Fu); + if (rc < 0) return rc; + return gem_mdio_write(phy_addr, PHY_ADDAR, val); +} + +static int phy_ext_read(uint8_t phy_addr, uint16_t ext_reg, uint16_t *out) +{ + int rc; + rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x001Fu); + if (rc < 0) return rc; + rc = gem_mdio_write(phy_addr, PHY_ADDAR, ext_reg); + if (rc < 0) return rc; + rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x401Fu); + if (rc < 0) return rc; + return gem_mdio_read(phy_addr, PHY_ADDAR, out); +} + +int dp83867_init(uint8_t phy_addr, int *speed_out, int *full_duplex_out) +{ + uint16_t id1 = 0; + uint16_t id2 = 0; + uint16_t bmcr; + uint16_t bmsr; + uint16_t physts; + int i; + + if (gem_mdio_read(phy_addr, PHY_ID1, &id1) < 0) + return -1; + if (gem_mdio_read(phy_addr, PHY_ID2, &id2) < 0) + return -2; + uart_puts("DP83867: ID1="); uart_puthex(id1); + uart_puts(" ID2="); uart_puthex(id2); + uart_puts("\n"); + /* DP83867 OUI = 0x2000A23x. ID1=0x2000, ID2 upper bits match. */ + if (id1 != 0x2000u || (id2 & 0xFFF0u) != 0xA230u) { + uart_puts(" warn: PHY ID does not match DP83867, continuing\n"); + } + + /* Soft reset. 
*/ + if (gem_mdio_write(phy_addr, PHY_BMCR, BMCR_RESET) < 0) + return -3; + for (i = 0; i < 1000; i++) { + delay_ms(1); + if (gem_mdio_read(phy_addr, PHY_BMCR, &bmcr) < 0) + return -4; + if ((bmcr & BMCR_RESET) == 0) + break; + } + if (i == 1000) + return -5; + + /* Order below mirrors the Linux/U-Boot dp83867_config sequence: + * 1. Strap fix (CFG4 bit 7) right after SW reset. + * 2. PHYCR FIFO depth RMW. + * 3. RGMIICTL RMW to enable both delays. + * 4. RGMIIDCTL set delay values. + * 5. Restart AN (caller does after we return). + */ + { + uint16_t strap = 0; + uint16_t cfg4_before = 0; + uint16_t cfg4_after = 0; + uint16_t iomux = 0; + uint16_t rgmiictl = 0; + uint16_t phycr_before = 0; + uint16_t phycr_after = 0; + + (void)phy_ext_read(phy_addr, DP83867_STRAP_STS1, &strap); + (void)phy_ext_read(phy_addr, DP83867_IO_MUX_CFG, &iomux); + (void)phy_ext_read(phy_addr, DP83867_CFG4, &cfg4_before); + + /* 1. RX_CTRL strap quirk for ZCU102. */ + cfg4_after = cfg4_before & ~(1u << 7); + if (phy_ext_write(phy_addr, DP83867_CFG4, cfg4_after) < 0) + return -6; + + /* 2. PHYCR FIFO depth = 8 bytes (RMW so we keep Auto-MDIX, + * power-down detect, etc., that the strap brought up). */ + (void)gem_mdio_read(phy_addr, DP83867_PHYCR, &phycr_before); + phycr_after = (phycr_before & ~PHYCR_FIFO_DEPTH_MASK) + | PHYCR_FIFO_DEPTH_8B; + if (gem_mdio_write(phy_addr, DP83867_PHYCR, phycr_after) < 0) + return -7; + + /* 3. RGMIICTL: enable TX and RX clock delays (RMW). */ + (void)phy_ext_read(phy_addr, DP83867_RGMIICTL, &rgmiictl); + rgmiictl |= RGMIICTL_RX_DELAY_EN | RGMIICTL_TX_DELAY_EN; + if (phy_ext_write(phy_addr, DP83867_RGMIICTL, rgmiictl) < 0) + return -8; + + /* 4. RGMIIDCTL: 2.0 ns each (matches Linux ti,*-internal-delay=8). */ + if (phy_ext_write(phy_addr, DP83867_RGMIIDCTL, + RGMIIDCTL_DELAY_2NS) < 0) + return -9; + +#ifdef DEBUG_PHY + /* Verbose pre-AN dump so we can diff against U-Boot's state. 
*/ + uart_puts("DP83867 pre-AN: STRAP_STS1="); uart_puthex(strap); + uart_puts(" IO_MUX_CFG="); uart_puthex(iomux); + uart_puts("\n CFG4: "); uart_puthex(cfg4_before); + uart_puts(" -> "); uart_puthex(cfg4_after); + uart_puts(" PHYCR: "); uart_puthex(phycr_before); + uart_puts(" -> "); uart_puthex(phycr_after); + uart_puts("\n RGMIICTL="); uart_puthex(rgmiictl); + uart_puts(" RGMIIDCTL="); uart_puthex(RGMIIDCTL_DELAY_2NS); + uart_puts("\n"); + + { + uint16_t v; + (void)phy_ext_read(phy_addr, DP83867_CFG4, &v); + uart_puts("DP83867 readback: CFG4="); uart_puthex(v); + (void)phy_ext_read(phy_addr, DP83867_RGMIICTL, &v); + uart_puts(" RGMIICTL="); uart_puthex(v); + (void)phy_ext_read(phy_addr, DP83867_RGMIIDCTL, &v); + uart_puts(" RGMIIDCTL="); uart_puthex(v); + (void)gem_mdio_read(phy_addr, DP83867_PHYCR, &v); + uart_puts(" PHYCR="); uart_puthex(v); + uart_puts("\n"); + } +#else + (void)strap; (void)iomux; + (void)cfg4_before; (void)cfg4_after; + (void)phycr_before; (void)phycr_after; + (void)rgmiictl; +#endif + } + + /* Advertise 10/100/1000 full + half duplex. */ + if (gem_mdio_write(phy_addr, PHY_ANAR, 0x01E1u) < 0) + return -8; + if (gem_mdio_write(phy_addr, PHY_GBCR, (1u << 9) | (1u << 8)) < 0) + return -9; + + /* Restart AN. */ + if (gem_mdio_write(phy_addr, PHY_BMCR, BMCR_ANEN | BMCR_ANRESTART) < 0) + return -10; + + /* Wait up to 5 s for AN complete, polling at 50 ms. AN typically + * needs 100-1500 ms depending on link partner. Report progress so + * a hung negotiation is visible on UART. 
*/ + uart_puts("DP83867: waiting for autoneg"); + for (i = 0; i < 100; i++) { + delay_ms(50); + if (gem_mdio_read(phy_addr, PHY_BMSR, &bmsr) < 0) + return -11; + if (bmsr & BMSR_ANCOMPLETE) { + uart_puts(" done ("); + uart_putdec((uint32_t)i * 50u); + uart_puts("ms)\n"); + break; + } + if ((i % 10) == 9) + uart_putc('.'); + } + if (!(bmsr & BMSR_ANCOMPLETE)) + uart_puts(" TIMEOUT\n"); + + /* Give the PHY a moment to latch the negotiated speed before we + * read PHYSTS - on DP83867 link-OK and PHYSTS update slightly + * after AN_COMPLETE asserts. */ + delay_ms(100); + + /* After AN_COMPLETE, the 1000BASE-T link still needs to finish + * master/slave training and have BOTH receivers report OK before + * BMSR.LINK_UP asserts. This can take several hundred ms more. + * Poll BMSR (double-read for latch) up to 5 s, dumping GBSR each + * iteration so we can see remote_rx_status flip. */ + { + int j; + uint16_t gbsr = 0; + uint16_t bmsr2 = 0; + uart_puts("DP83867: waiting for link"); + for (j = 0; j < 100; j++) { + delay_ms(50); + (void)gem_mdio_read(phy_addr, PHY_BMSR, &bmsr2); + (void)gem_mdio_read(phy_addr, PHY_BMSR, &bmsr2); + (void)gem_mdio_read(phy_addr, PHY_GBSR, &gbsr); + if (bmsr2 & BMSR_LINK_UP) { + uart_puts(" UP ("); + uart_putdec((uint32_t)j * 50u); + uart_puts("ms) GBSR="); + uart_puthex(gbsr); + uart_puts("\n"); + bmsr = bmsr2; + break; + } + if ((j % 10) == 9) { + uart_puts(" ["); + uart_putdec((uint32_t)(j + 1) * 50u); + uart_puts("ms GBSR="); + uart_puthex(gbsr); + uart_puts("]"); + } + } + if (!(bmsr2 & BMSR_LINK_UP)) + uart_puts(" TIMEOUT\n"); + } + + if (gem_mdio_read(phy_addr, DP83867_PHYSTS, &physts) < 0) + return -12; + +#ifdef DEBUG_PHY + { + uint16_t bmcr_now = 0; + uint16_t lpa = 0; + uint16_t gbsr = 0; + (void)gem_mdio_read(phy_addr, PHY_BMCR, &bmcr_now); + (void)gem_mdio_read(phy_addr, 0x05, &lpa); /* MII LPA */ + (void)gem_mdio_read(phy_addr, PHY_GBSR, &gbsr); + uart_puts("DP83867 regs: BMCR="); uart_puthex(bmcr_now); + uart_puts(" BMSR="); 
uart_puthex(bmsr); + uart_puts(" LPA="); uart_puthex(lpa); + uart_puts(" GBSR="); uart_puthex(gbsr); + uart_puts(" PHYSTS="); uart_puthex(physts); + uart_puts("\n"); + } +#endif + + if ((physts & PHYSTS_SPEED_MASK) == PHYSTS_SPEED_1000) + *speed_out = 1000; + else if ((physts & PHYSTS_SPEED_MASK) == PHYSTS_SPEED_100) + *speed_out = 100; + else + *speed_out = 10; + *full_duplex_out = (physts & PHYSTS_DUPLEX) ? 1 : 0; + + uart_puts("DP83867 link: "); + uart_putdec((uint32_t)*speed_out); + uart_puts(*full_duplex_out ? " Mbps FD\n" : " Mbps HD\n"); + + return (bmsr & BMSR_LINK_UP) ? 1 : 0; +} + +int dp83867_link_status(uint8_t phy_addr) +{ + uint16_t bmsr; + /* BMSR latches link down; read twice. */ + if (gem_mdio_read(phy_addr, PHY_BMSR, &bmsr) < 0) + return -1; + if (gem_mdio_read(phy_addr, PHY_BMSR, &bmsr) < 0) + return -1; + return (bmsr & BMSR_LINK_UP) ? 1 : 0; +} diff --git a/src/port/versal/phy_dp83867.h b/src/port/versal/phy_dp83867.h new file mode 100644 index 00000000..efbf45a3 --- /dev/null +++ b/src/port/versal/phy_dp83867.h @@ -0,0 +1,23 @@ +/* phy_dp83867.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * TI DP83867IR PHY driver: 10/100/1000 RGMII PHY used on the ZCU102 + * dev board. We only need configuration (reset, RGMII TX/RX skew, + * auto-negotiation) and link status; no advanced features. + */ +#ifndef ZCU102_PHY_DP83867_H +#define ZCU102_PHY_DP83867_H + +#include + +/* Returns 0 on success, < 0 on failure. On success *speed and *fd are + * the negotiated speed (10/100/1000) and full-duplex flag. */ +int dp83867_init(uint8_t phy_addr, int *speed_out, int *full_duplex_out); + +/* Returns 1 if link is up, 0 if down, < 0 on MDIO error. 
*/ +int dp83867_link_status(uint8_t phy_addr); + +#endif /* ZCU102_PHY_DP83867_H */ diff --git a/src/port/versal/startup.S b/src/port/versal/startup.S new file mode 100644 index 00000000..548858fc --- /dev/null +++ b/src/port/versal/startup.S @@ -0,0 +1,297 @@ +/* startup.S + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * AArch64 EL3 entry for Cortex-A72 on Xilinx Versal Gen 1 (VMK180). + * The Platform Management Loader (PLM) -> TF-A (BL31) chain is the + * standard handoff; this port assumes BL31 has dropped us at EL3 + * Secure with MMU/caches off (we force them off again to be safe) + * and then branches to _start at the image entry. + * + * Most of this file is identical to src/port/zcu102/startup.S. The + * Versal-specific differences are: + * - Cortex-A72 instead of A53 (set -mcpu=cortex-a72 in Makefile; + * the assembler itself uses generic v8.0 instructions which work + * unmodified) + * - PL011 UART early-poke addresses (DR is at offset 0, not 0x30) + * - GICv3 (handled in gic.c; no startup-side change needed beyond + * letting BL31 own the distributor init, which is the + * conservative default) + * + * UNTESTED ON HARDWARE. + */ + /* A loader (FSBL, wolfBoot, ...) 
that respects the ELF entry
+     * point in the program header branches to _start directly. A
+     * loader that simply branches to the first byte of the binary
+     * (wolfBoot's do_boot path does this -- it uses LOAD_ADDRESS,
+     * not the ELF entry) lands on this stub instead, which forwards
+     * to _start. Linker scripts place this section first in the
+     * output image so it always sits at offset 0 of the binary. */
+    .section .boot_entry, "ax"
+    .global _boot_entry
+_boot_entry:
+    b       _start
+
+    /* EL3 exception vector table: 16 entries, each padded to 128 bytes
+     * by the .align 7 in front of it. */
+    .section .vectors, "ax"
+    .align 11                   /* 2048 byte align required by VBAR */
+    .global _vectors
+_vectors:
+    /* Current EL with SP0 (not used; we always use SPx). */
+    .align 7
+    b       el3_sync_trampoline     /* sync */
+    .align 7
+    b       el3_irq_trampoline      /* IRQ */
+    .align 7
+    b       el3_irq_trampoline      /* FIQ - same handler (see note below) */
+    .align 7
+    b       el3_serror_trampoline   /* SError */
+    /* Current EL with SPx */
+    .align 7
+    b       el3_sync_trampoline     /* sync */
+    .align 7
+    b       el3_irq_trampoline      /* IRQ */
+    .align 7
+    b       el3_irq_trampoline      /* FIQ - same handler; GIC-400 in
+                                     * secure mode may deliver Group 0
+                                     * interrupts via nFIQ depending on
+                                     * GICC_CTLR.FIQEn. We route both to
+                                     * the same trampoline so the C
+                                     * dispatcher sees the INTID either
+                                     * way.
+                                     * NOTE(review): this file's header
+                                     * says Versal has a GICv3; the
+                                     * GIC-400 wording looks inherited
+                                     * from the ZCU102 port - confirm. */
+    .align 7
+    b       el3_serror_trampoline   /* SError */
+    /* Lower EL using AArch64 (unused, we stay at EL3) */
+    .align 7
+    b       _hang
+    .align 7
+    b       _hang
+    .align 7
+    b       _hang
+    .align 7
+    b       _hang
+    /* Lower EL using AArch32 (unused) */
+    .align 7
+    b       _hang
+    .align 7
+    b       _hang
+    .align 7
+    b       _hang
+    .align 7
+    b       _hang
+
+    .section .text, "ax"
+    .global _start
+    .type _start, %function
+_start:
+    /* VERY FIRST INSTRUCTIONS - prove we're running. Write '@' to
+     * UART0 (PL011) DR register at offset 0x000. No register-bank-
+     * dependent ops here. PL011 differs from the Cadence UART in
+     * the ZCU102 port -- DR is at offset 0, not 0x30. */
+    mov     x1, #0xFF000000
+    mov     w2, #'@'
+    str     w2, [x1, #0x00]
+    str     w2, [x1, #0x00]
+    str     w2, [x1, #0x00]
+
+    /* Make sure we are on APU-0. If PLM released us as APU-1 by
+     * accident, park it. Only MPIDR_EL1.Aff0 is examined. */
+    mrs     x0, mpidr_el1
+    and     x0, x0, #0xff           /* Aff0 */
+    cbnz    x0, _park_secondary
+
+    /* Disable MMU + caches in case FSBL left them on. (On Versal the
+     * previous stage is the PLM -> BL31 chain named in the file
+     * header; the FSBL wording comes from the ZCU102 port.) */
+    mrs     x0, sctlr_el3
+    bic     x0, x0, #(1 << 0)       /* M - MMU off */
+    bic     x0, x0, #(1 << 2)       /* C - D-cache off */
+    bic     x0, x0, #(1 << 12)      /* I - I-cache off */
+    msr     sctlr_el3, x0
+    isb
+
+    /* Allow FP/SIMD at EL3 (FSBL does this too, but be explicit). */
+    msr     cptr_el3, xzr
+
+    /* Force SPSel = 1 (use SP_ELx). The IRQ vector at offset 0x280
+     * (Current EL with SPx) is what we wired el3_irq_trampoline to.
+     * FSBL may have left SPSel at 0 (SP_EL0); fix it deterministically. */
+    msr     spsel, #1
+
+    /* Force SCR_EL3 to a known state. We run entirely at EL3 in
+     * Secure world. The wolfBoot AArch64 startup explicitly sets the
+     * IRQ + FIQ + EA routing bits even though the ARM ARM says they
+     * only affect lower-EL interrupts; reusing that convention here
+     * because empirically the A53 does not enter the IRQ exception
+     * unless these are set (ISR_EL1.I went high but no exception
+     * fired with these bits clear).
+     *   bit 0  NS  = 0 (stay Secure)
+     *   bit 1  IRQ = 1 (route IRQ to EL3)
+     *   bit 2  FIQ = 1 (route FIQ to EL3)
+     *   bit 3  EA  = 1 (route SError/abort to EL3)
+     *   bit 10 RW  = 0 (no lower EL64; we never drop to lower EL)
+     * NOTE(review): the observation above was made on the A53; it is
+     * unverified on the A72. Also, the Armv8-A ARM marks SCR_EL3
+     * bits [5:4] as RES1 and this value writes them as 0 - confirm
+     * that is acceptable. */
+    mov     x0, #((1 << 1) | (1 << 2) | (1 << 3))
+    msr     scr_el3, x0
+    isb
+
+    /* Vector base. */
+    adrp    x0, _vectors
+    add     x0, x0, :lo12:_vectors
+    msr     vbar_el3, x0
+
+    /* Stack pointer. After 'msr spsel, #1' this writes SP_EL3. */
+    ldr     x0, =_stack_top
+    mov     sp, x0
+
+    /* Very early UART poke - one char before any C code, so even if a
+     * later step hangs we know _start was reached. PL011 DR is at
+     * offset 0x000. Assumes PLM has already configured UART0 baud. */
+    mov     x1, #0xFF000000
+    mov     w2, #'!'
+    str     w2, [x1, #0x00]
+
+    /* Clear BSS. Stores 8 bytes per iteration from _sbss up to _ebss. */
+    ldr     x0, =_sbss
+    ldr     x1, =_ebss
+1:  cmp     x0, x1
+    b.ge    2f
+    str     xzr, [x0], #8
+    b       1b
+2:
+    /* Breadcrumb: BSS cleared. */
+    mov     x1, #0xFF000000
+    mov     w2, #'B'
+    str     w2, [x1, #0x00]
+
+    /* Bring up the MMU + caches. C function in mmu.c. */
+    bl      mmu_enable
+
+    /* Breadcrumb: MMU enabled. */
+    mov     x1, #0xFF000000
+    mov     w2, #'M'
+    str     w2, [x1, #0x00]
+
+    /* Branch to main. */
+    bl      main
+
+    /* main() should not return. If it does, hang. */
+    b       _hang
+
+    /* Parking loop for any core whose Aff0 is non-zero: sleep in WFE
+     * and never leave. */
+    .type _park_secondary, %function
+_park_secondary:
+    wfe
+    b       _park_secondary
+
+    /* Terminal spin loop used by unused vectors and fatal paths. */
+    .global _hang
+    .type _hang, %function
+_hang:
+    b       _hang
+
+/* ---------------------------------------------------------------------
+ * IRQ trampoline. EL3 IRQ vector -> save GP regs, call C handler, restore.
+ * Keeps the C handler clean and avoids __attribute__((interrupt)) tricks
+ * which are not reliable on aarch64.
+ * ------------------------------------------------------------------- */
+    .type el3_irq_trampoline, %function
+el3_irq_trampoline:
+    /* Save full integer register file (x0-x30) plus SPSR_EL3/ELR_EL3.
+     * Frame is 18 * 16 = 288 bytes (16-byte aligned). Callee-saved
+     * x19-x28 must be preserved too: irq_dispatch is an ordinary C
+     * function and may clobber them, while the interrupted code
+     * almost certainly relies on them. */
+    sub     sp, sp, #(18 * 16)
+    stp     x0,  x1,  [sp, #(0 * 16)]
+    stp     x2,  x3,  [sp, #(1 * 16)]
+    stp     x4,  x5,  [sp, #(2 * 16)]
+    stp     x6,  x7,  [sp, #(3 * 16)]
+    stp     x8,  x9,  [sp, #(4 * 16)]
+    stp     x10, x11, [sp, #(5 * 16)]
+    stp     x12, x13, [sp, #(6 * 16)]
+    stp     x14, x15, [sp, #(7 * 16)]
+    stp     x16, x17, [sp, #(8 * 16)]
+    stp     x18, x19, [sp, #(9 * 16)]
+    stp     x20, x21, [sp, #(10 * 16)]
+    stp     x22, x23, [sp, #(11 * 16)]
+    stp     x24, x25, [sp, #(12 * 16)]
+    stp     x26, x27, [sp, #(13 * 16)]
+    stp     x28, x29, [sp, #(14 * 16)]
+    str     x30,      [sp, #(15 * 16)]
+    /* Snapshot exception return state in case irq_dispatch (or any
+     * nested exception inside it) clobbers SPSR_EL3 / ELR_EL3.
*/
+    mrs     x0, spsr_el3
+    mrs     x1, elr_el3
+    stp     x0, x1,   [sp, #(16 * 16)]
+
+    bl      irq_dispatch
+
+    /* Restore the exception return state first (it uses x0/x1 as
+     * scratch), then the general-purpose registers. */
+    ldp     x0, x1,   [sp, #(16 * 16)]
+    msr     spsr_el3, x0
+    msr     elr_el3, x1
+    ldp     x0,  x1,  [sp, #(0 * 16)]
+    ldp     x2,  x3,  [sp, #(1 * 16)]
+    ldp     x4,  x5,  [sp, #(2 * 16)]
+    ldp     x6,  x7,  [sp, #(3 * 16)]
+    ldp     x8,  x9,  [sp, #(4 * 16)]
+    ldp     x10, x11, [sp, #(5 * 16)]
+    ldp     x12, x13, [sp, #(6 * 16)]
+    ldp     x14, x15, [sp, #(7 * 16)]
+    ldp     x16, x17, [sp, #(8 * 16)]
+    ldp     x18, x19, [sp, #(9 * 16)]
+    ldp     x20, x21, [sp, #(10 * 16)]
+    ldp     x22, x23, [sp, #(11 * 16)]
+    ldp     x24, x25, [sp, #(12 * 16)]
+    ldp     x26, x27, [sp, #(13 * 16)]
+    ldp     x28, x29, [sp, #(14 * 16)]
+    ldr     x30,      [sp, #(15 * 16)]
+    add     sp, sp, #(18 * 16)
+    eret
+
+    /* void irq_enable(void): clear PSTATE.I and PSTATE.F. */
+    .global irq_enable
+    .type irq_enable, %function
+irq_enable:
+    msr     daifclr, #3             /* unmask IRQ (bit 1) + FIQ (bit 0) */
+    ret
+
+    /* void irq_disable(void): set PSTATE.I and PSTATE.F. This mirrors
+     * irq_enable: the FIQ vectors branch to el3_irq_trampoline as
+     * well, so masking IRQ alone would still let irq_dispatch run
+     * inside a section the caller meant to protect. */
+    .global irq_disable
+    .type irq_disable, %function
+irq_disable:
+    msr     daifset, #3             /* mask IRQ (bit 1) + FIQ (bit 0) */
+    ret
+
+/* ---------------------------------------------------------------------
+ * Synchronous exception handler - print ESR_EL3 / ELR_EL3 / FAR_EL3
+ * then hang. Anything that previously fell to _hang silently (alignment
+ * fault, translation fault, undefined instruction) now produces a
+ * UART dump.
+ * ------------------------------------------------------------------- */
+    .type el3_sync_trampoline, %function
+el3_sync_trampoline:
+    /* Nothing is saved: control goes to _hang after the report, so the
+     * interrupted context is never resumed. x0-x3 carry ESR, ELR, FAR
+     * and SPSR - presumably the arguments of exception_report, whose
+     * prototype is not in this file. */
+    mrs     x0, esr_el3
+    mrs     x1, elr_el3
+    mrs     x2, far_el3
+    mrs     x3, spsr_el3
+    bl      exception_report
+    b       _hang
+
+    /* SError entry: same register snapshot as the synchronous path plus
+     * a flag in x4, then hang. */
+    .type el3_serror_trampoline, %function
+el3_serror_trampoline:
+    mrs     x0, esr_el3
+    mrs     x1, elr_el3
+    mrs     x2, far_el3
+    mrs     x3, spsr_el3
+    mov     x4, #1                  /* indicate SError to C */
+    bl      exception_report_serror
+    b       _hang
diff --git a/src/port/versal/target.ld b/src/port/versal/target.ld
new file mode 100644
index 00000000..e85cf909
--- /dev/null
+++ b/src/port/versal/target.ld
@@ -0,0 +1,132 @@
+/* ZCU102 (Xilinx UltraScale+ MPSoC, Cortex-A53) Linker Script
+ * NOTE(review): this copy lives in src/port/versal/; the ZCU102 / FSBL /
+ * PMU FW wording below looks inherited from the ZCU102 port - confirm
+ * it against the Versal boot flow.
+ *
+ * Memory map (current OCM-only layout):
+ *   OCM     : 256 KB @ 0xFFFC0000 (everything lives here)
+ *   DDR low : 2 GB @ 0x00000000 (initialized by FSBL, currently unused
+ *                                by this app; reserved for future
+ *                                heap or larger ring buffers)
+ *
+ * App layout in OCM:
+ *   0xFFFC0000  .boot_entry (branch stub to _start), followed by
+ *               .vectors at the next 2 KB boundary
+ *   ...         .text, .rodata, .data, .bss, .page_tables,
+ *               .dma_buffers (linker packs them in order)
+ *   0x100000000 _stack_top (top of OCM, stack grows down)
+ *
+ * Why OCM-only:
+ *   - JTAG iteration: psu_init alone (no PMU FW) doesn't reliably
+ *     bring up DDR for mwr-force loads. OCM is independent of the
+ *     DDR controller and always works.
+ *   - SD boot: bootgen will emit a warning about OCM overlap with
+ *     FSBL, but FSBL's jump-to-image happens after partition load,
+ *     so the overlay is safe.
+ *   - The 16-KB JTAG DAP alias bug at the low DDR window is avoided
+ *     entirely.
+ *
+ * .dma_buffers stays inside OCM (Normal-WB per L2_PERIPH[511]); GEM
+ * DMA coherency is handled with explicit DC CVAC / IVAC ops in gem.c
+ * rather than via an MMU attribute carve-out.
+ */
+
+OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64")
+OUTPUT_ARCH(aarch64)
+ENTRY(_start)
+
+/* Single-region layout: everything in OCM.
DDR and DMA region + * definitions are kept as placeholders for a future layout that + * spills .dma_buffers (and possibly .bss) into DDR once the JTAG + * iteration path supports it. They are not referenced by any SECTION + * in the current layout. */ +MEMORY +{ + OCM (rwx) : ORIGIN = 0xFFFC0000, LENGTH = 0x00040000 + DDR (rw) : ORIGIN = 0x00010000, LENGTH = 0x001F0000 + DMA (rw) : ORIGIN = 0x00200000, LENGTH = 0x00200000 +} + +/* Stack at top of OCM (we keep stack in OCM with the rest since + * DDR-via-JTAG is unreliable without PMU FW). The 24 KB free area + * above DMA buffers gives plenty of stack room for our app. */ +_stack_top = 0x100000000; +_dma_base = 0xFFFF1000; /* in OCM, see linker output */ +_dma_size = 0x00009000; /* ~36 KB sufficient for reduced rings */ + +PHDRS +{ + text PT_LOAD FLAGS(5); /* RX */ + data PT_LOAD FLAGS(6); /* RW */ +} + +SECTIONS +{ + .boot_entry : + { + KEEP(*(.boot_entry)) + } > OCM :text + + .vectors : + { + . = ALIGN(2048); + KEEP(*(.vectors)) + } > OCM :text + + .text : + { + . = ALIGN(8); + *(.text*) + *(.rodata*) + . = ALIGN(8); + } > OCM :text + + .data : + { + . = ALIGN(8); + _sdata = .; + *(.data*) + . = ALIGN(8); + _edata = .; + } > OCM :text + + /* BSS in OCM as well - DDR-via-JTAG isn't reliable without PMU + * FW, so we keep all writeable state in OCM (256 KB total). */ + .bss (NOLOAD) : + { + . = ALIGN(8); + _sbss = .; + *(.bss*) + *(COMMON) + . = ALIGN(8); + _ebss = .; + } > OCM :text + + /* Page tables in OCM so MMU walker isn't dependent on DDR being + * fully up (DDR-via-JTAG is unreliable without PMU FW; CPU + * fetch from OCM is bulletproof). 12 KB total (3x4KB tables). */ + .page_tables (NOLOAD) : + { + . = ALIGN(4096); + _page_tables_start = .; + *(.page_tables) + . = ALIGN(4096); + _page_tables_end = .; + } > OCM :text + + /* DMA buffers also in OCM - OCM is accessible to all AXI masters + * including the GEM DMA. With everything in OCM there's no DDR + * dependency for the basic bring-up. 
*/ + .dma_buffers (NOLOAD) : + { + . = ALIGN(64); + _dma_buffers_start = .; + *(.dma_buffers) + . = ALIGN(64); + _dma_buffers_end = .; + } > OCM :text + + /DISCARD/ : + { + *(.note.*) + *(.comment) + *(.ARM.attributes) + *(.eh_frame*) + } +} diff --git a/src/port/versal/target_ddr.ld b/src/port/versal/target_ddr.ld new file mode 100644 index 00000000..b2c82be0 --- /dev/null +++ b/src/port/versal/target_ddr.ld @@ -0,0 +1,126 @@ +/* ZCU102 (Xilinx UltraScale+ MPSoC, Cortex-A53) Linker Script - DDR layout + * + * Used when the app is loaded by wolfBoot (or any loader that places + * the signed image into DDR at a known LOAD_ADDRESS). FSBL + PMU FW + + * BL31 are all running by the time control reaches us, so the DDR + * controller is fully initialised and the DDR DAP 16-KB alias bug is + * a non-issue (the loader writes via the AXI master path). + * + * Memory map: + * DDR : 0x10000000 .. 0x10FFFFFF (16 MB; matches WOLFBOOT_LOAD_ADDRESS + * in wolfBoot's config/examples/zynqmp.config) + * OCM : 0xFFFC0000 .. 0xFFFFFFFF (256 KB, still mapped Normal-WB + * executable by L2_PERIPH[511]; unused + * for this layout but left in MEMORY + * so MMU page-table addresses inside + * mmu.c remain valid) + * + * App layout in DDR (16 MB region @ 0x10000000): + * .vectors / .text / .rodata / .data / .bss / .page_tables / .dma_buffers + * stack grows down from _stack_top at the top of the region + * + * Stack top is set near the end of the DDR region with plenty of head + * room (4 MB) below for .bss + page tables + DMA buffers. Increase + * the LENGTH below if a larger heap or more DMA buffers are needed. + */ + +OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") +OUTPUT_ARCH(aarch64) +ENTRY(_start) + +MEMORY +{ + DDR (rwx) : ORIGIN = 0x10000000, LENGTH = 0x01000000 /* 16 MB */ + OCM (rwx) : ORIGIN = 0xFFFC0000, LENGTH = 0x00040000 /* still mapped */ +} + +/* Stack near the top of the DDR region. 16 MB - 4 KB gives the stack + * a safe red zone. 
*/ +_stack_top = 0x10FFF000; +/* Dormant DMA carve-out markers (the cache-coherency path in gem.c is + * the active mechanism). Set _dma_size to 0 so mmu.c does not insert + * any Normal-NC blocks in the L2_DDR table for this layout. */ +_dma_base = 0x10E00000; +_dma_size = 0; + +PHDRS +{ + text PT_LOAD FLAGS(5); /* RX */ + data PT_LOAD FLAGS(6); /* RW */ +} + +SECTIONS +{ + /* First 4 bytes of the image must be a `b _start` so wolfBoot's + * do_boot() (which branches to LOAD_ADDRESS, not the ELF entry) + * lands on a valid instruction. */ + .boot_entry : + { + KEEP(*(.boot_entry)) + } > DDR :text + + .vectors : + { + . = ALIGN(2048); + KEEP(*(.vectors)) + } > DDR :text + + .text : + { + . = ALIGN(8); + *(.text*) + *(.rodata*) + . = ALIGN(8); + } > DDR :text + + .data : + { + . = ALIGN(8); + _sdata = .; + *(.data*) + . = ALIGN(8); + _edata = .; + } > DDR :text + + .bss (NOLOAD) : + { + . = ALIGN(8); + _sbss = .; + *(.bss*) + *(COMMON) + . = ALIGN(8); + _ebss = .; + } > DDR :text + + .page_tables (NOLOAD) : + { + . = ALIGN(4096); + _page_tables_start = .; + *(.page_tables) + . = ALIGN(4096); + _page_tables_end = .; + } > DDR :text + + /* DMA buffers stay in the DDR region as plain Normal-WB cacheable + * memory; gem.c maintains coherency with explicit DC CVAC / IVAC + * ops at every BD hand-off. The L2_DDR table has a Normal-NC + * carve-out for any 2 MB block overlapping this range but it is + * currently dormant (cache_clean / cache_inval suffices), so no + * special alignment beyond 64-byte cache lines is required. */ + .dma_buffers (NOLOAD) : + { + . = ALIGN(64); + _dma_buffers_start = .; + *(.dma_buffers) + . 
= ALIGN(64); + _dma_buffers_end = .; + } > DDR :text + + /DISCARD/ : + { + *(.note.*) + *(.comment) + *(.ARM.attributes) + *(.eh_frame*) + } +} diff --git a/src/port/versal/timer.h b/src/port/versal/timer.h new file mode 100644 index 00000000..17959e20 --- /dev/null +++ b/src/port/versal/timer.h @@ -0,0 +1,41 @@ +/* timer.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * AArch64 generic-timer based delay helpers. ZynqMP FSBL/ATF programs + * CNTFRQ_EL0 to 100 MHz; we fall back to that if the register reads 0. + * + * CNTPCT_EL0 is readable at every EL on Cortex-A53 without trap setup. + */ +#ifndef ZCU102_TIMER_H +#define ZCU102_TIMER_H + +#include + +static inline uint64_t timer_now(void) +{ + uint64_t v; + __asm__ volatile ("isb; mrs %0, cntpct_el0" : "=r"(v) :: "memory"); + return v; +} + +static inline uint32_t timer_freq(void) +{ + uint64_t v; + __asm__ volatile ("mrs %0, cntfrq_el0" : "=r"(v)); + return v ? (uint32_t)v : 100000000u; +} + +static inline void delay_us(uint32_t us) +{ + uint64_t start = timer_now(); + uint64_t target = ((uint64_t)us * (uint64_t)timer_freq()) / 1000000ULL; + while ((timer_now() - start) < target) { } +} + +static inline void delay_ms(uint32_t ms) +{ + delay_us(ms * 1000u); +} + +#endif /* ZCU102_TIMER_H */ diff --git a/src/port/versal/uart.c b/src/port/versal/uart.c new file mode 100644 index 00000000..df0497bc --- /dev/null +++ b/src/port/versal/uart.c @@ -0,0 +1,131 @@ +/* uart.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * ARM PL011 UART polled driver. Versal routes UART0 to the on-board + * USB-UART on VMK180. We assume the PLM has already pinned the UART + * pins via the LPD configuration object and enabled the reference + * clock (typically 100 MHz IOPLL-derived); this driver programs the + * baud divisors and enables TX/RX. + * + * Register reference: ARM PL011 (PrimeCell SP804 / SP805 family + * UART), DDI 0183. The Versal versal.dtsi maps PL011 base addresses + * to 0xFF000000 (UART0) and 0xFF010000 (UART1). + * + * UNTESTED ON HARDWARE -- structural scaffold. + */ +#include +#include "board.h" +#include "uart.h" + +/* PL011 registers, all 32-bit. */ +#define UART_DR (*(volatile uint32_t *)(UART0_BASE + 0x000)) /* data */ +#define UART_FR (*(volatile uint32_t *)(UART0_BASE + 0x018)) /* flag */ +#define UART_IBRD (*(volatile uint32_t *)(UART0_BASE + 0x024)) /* int baud */ +#define UART_FBRD (*(volatile uint32_t *)(UART0_BASE + 0x028)) /* frac baud */ +#define UART_LCR_H (*(volatile uint32_t *)(UART0_BASE + 0x02C)) /* line ctrl */ +#define UART_CR (*(volatile uint32_t *)(UART0_BASE + 0x030)) /* control */ +#define UART_IMSC (*(volatile uint32_t *)(UART0_BASE + 0x038)) /* irq mask */ +#define UART_ICR (*(volatile uint32_t *)(UART0_BASE + 0x044)) /* irq clr */ + +#define UART_FR_TXFF (1u << 5) +#define UART_FR_TXFE (1u << 7) +#define UART_FR_BUSY (1u << 3) + +#define UART_LCR_H_WLEN_8 (3u << 5) /* 8-bit word length */ +#define UART_LCR_H_FEN (1u << 4) /* FIFO enable */ + +#define UART_CR_UARTEN (1u << 0) +#define UART_CR_TXE (1u << 8) +#define UART_CR_RXE (1u << 9) + +/* Baud formulas (PL011): + * BAUDDIV = (UARTCLK * 4) / baud + * IBRD = BAUDDIV / 64 + * FBRD = BAUDDIV % 64 + * For UARTCLK = 100 MHz, 
baud = 115200: + * BAUDDIV = (100e6 * 4) / 115200 = 3472 + * IBRD = 3472 / 64 = 54 + * FBRD = 3472 % 64 = 16 + * Actual baud = (100e6 * 4) / ((54 * 64) + 16) = 100e6 / 868 = 115207 */ +#define UART_IBRD_115200 54 +#define UART_FBRD_115200 16 + +void uart_init(void) +{ + UART_CR = 0; /* disable while configuring */ + UART_ICR = 0x7FF; /* clear all interrupts */ + UART_IMSC = 0; /* mask all interrupts */ + UART_IBRD = UART_IBRD_115200; + UART_FBRD = UART_FBRD_115200; + UART_LCR_H = UART_LCR_H_WLEN_8 | UART_LCR_H_FEN; + UART_CR = UART_CR_UARTEN | UART_CR_TXE | UART_CR_RXE; +} + +void uart_putc(char c) +{ + while (UART_FR & UART_FR_TXFF) + ; + UART_DR = (uint32_t)(unsigned char)c; +} + +void uart_puts(const char *s) +{ + while (*s) { + if (*s == '\n') + uart_putc('\r'); + uart_putc(*s++); + } +} + +void uart_puthex(uint32_t val) +{ + static const char hex[] = "0123456789ABCDEF"; + int i; + uart_puts("0x"); + for (i = 28; i >= 0; i -= 4) + uart_putc(hex[(val >> i) & 0xF]); +} + +void uart_putdec(uint32_t val) +{ + char buf[11]; + int i = 0; + if (val == 0) { + uart_putc('0'); + return; + } + while (val > 0 && i < (int)sizeof(buf)) { + buf[i++] = '0' + (char)(val % 10); + val /= 10; + } + while (i > 0) + uart_putc(buf[--i]); +} + +void uart_putip4(ip4 ip) +{ + uart_putdec((ip >> 24) & 0xFF); + uart_putc('.'); + uart_putdec((ip >> 16) & 0xFF); + uart_putc('.'); + uart_putdec((ip >> 8) & 0xFF); + uart_putc('.'); + uart_putdec(ip & 0xFF); +} diff --git a/src/port/versal/uart.h b/src/port/versal/uart.h new file mode 100644 index 00000000..aa3df3ef --- /dev/null +++ b/src/port/versal/uart.h @@ -0,0 +1,20 @@ +/* uart.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. 
+ */
+#ifndef ZCU102_UART_H
+#define ZCU102_UART_H
+
+#include <stdint.h>
+#include "../../../wolfip.h" /* for ip4 */
+
+void uart_init(void);             /* program baud divisors, 8-bit + FIFO, enable TX/RX */
+void uart_putc(char c);           /* blocking: spins while the TX FIFO is full */
+void uart_puts(const char *s);    /* emits '\r' before every '\n' */
+void uart_puthex(uint32_t val);   /* "0x" + 8 upper-case hex digits */
+void uart_putdec(uint32_t val);   /* unsigned decimal, no padding */
+void uart_putip4(ip4 ip);         /* dotted quad, most significant byte first */
+
+#endif /* ZCU102_UART_H */
diff --git a/src/port/zcu102/.gitignore b/src/port/zcu102/.gitignore
new file mode 100644
index 00000000..8e5ab963
--- /dev/null
+++ b/src/port/zcu102/.gitignore
@@ -0,0 +1,6 @@
+*.o
+*.d
+*.elf
+*.bin
+BOOT.BIN
+.build_flags
diff --git a/src/port/zcu102/Makefile b/src/port/zcu102/Makefile
new file mode 100644
index 00000000..ef399081
--- /dev/null
+++ b/src/port/zcu102/Makefile
@@ -0,0 +1,146 @@
+# Xilinx ZCU102 (UltraScale+ MPSoC, Cortex-A53) wolfIP bare-metal port
+#
+# Build:    make CROSS_COMPILE=aarch64-none-elf-
+# Bootbin:  FSBL_ELF=/path/to/fsbl.elf make bootbin
+#
+# Toolchain: ARM GNU aarch64-none-elf-gcc (tested with 14.3.rel1).
+
+# Delete a target whose recipe failed so a half-written file can never
+# be mistaken for an up-to-date one.
+.DELETE_ON_ERROR:
+
+CROSS_COMPILE ?= aarch64-none-elf-
+CC      := $(CROSS_COMPILE)gcc
+OBJCOPY := $(CROSS_COMPILE)objcopy
+SIZE    := $(CROSS_COMPILE)size
+
+ROOT := ../../..
+
+# Cortex-A53, AArch64, EL3 single-EL bare-metal. No SIMD/FP in the
+# wolfIP/driver paths - keep -mgeneral-regs-only to catch any
+# accidental FP use and make the ABI deterministic for cert.
+CFLAGS := -mcpu=cortex-a53 -mgeneral-regs-only
+CFLAGS += -Os -ffreestanding -fno-builtin -fno-common
+CFLAGS += -fdata-sections -ffunction-sections
+CFLAGS += -g -Wall -Wextra -Werror -Wno-unused-parameter
+CFLAGS += -std=gnu99
+CFLAGS += -I. -I$(ROOT) -I$(ROOT)/src -I$(ROOT)/src/port
+CFLAGS += -DZCU102 -DXILINX_AARCH64
+# Append extra defines for investigation builds, e.g.:
+#   make CFLAGS_EXTRA="-DDEBUG_GIC -DDEBUG_GEM -DDEBUG_PHY"
+CFLAGS += $(CFLAGS_EXTRA)
+
+ASFLAGS := -mcpu=cortex-a53
+
+# Emit a .d dependency file next to every object so that editing a
+# header (board.h, config.h, wolfip.h, ...) rebuilds the objects that
+# include it. Kept out of CFLAGS so the link command does not see it.
+DEPFLAGS := -MMD -MP
+
+# Layout selector. Default ocm keeps the OCM-only layout that the JTAG
+# iteration scripts depend on (everything in OCM @ 0xFFFC0000). Pass
+# LAYOUT=ddr to relink for DDR @ 0x10000000 -- this is the layout
+# wolfBoot expects (WOLFBOOT_LOAD_ADDRESS in zynqmp.config).
+LAYOUT ?= ocm
+ifeq ($(LAYOUT),ddr)
+  LDSCRIPT := target_ddr.ld
+  CFLAGS   += -DZCU102_LAYOUT_DDR
+else ifeq ($(LAYOUT),ocm)
+  LDSCRIPT := target.ld
+  CFLAGS   += -DZCU102_LAYOUT_OCM
+else
+  $(error LAYOUT must be 'ocm' or 'ddr')
+endif
+
+LDFLAGS := -nostdlib -nostartfiles -T $(LDSCRIPT) -Wl,-gc-sections
+# Replace newlib's aarch64 memset/memcpy (which use 'dc zva' and hang
+# on this Cortex-A53 setup) with our bytewise versions in main.c.
+LDFLAGS += -Wl,--wrap=memset -Wl,--wrap=memcpy
+
+LOCAL_C    := main.c uart.c mmu.c gic.c gem.c phy_dp83867.c entropy.c
+LOCAL_S    := startup.S
+LOCAL_OBJS := $(LOCAL_C:.c=.o) $(LOCAL_S:.S=.o)
+
+# Compile wolfIP core into our directory (don't reuse the upstream .o,
+# which may have been built for a different ABI).
+WOLFIP_OBJ := wolfip.o
+OBJS       := $(LOCAL_OBJS) $(WOLFIP_OBJ)
+DEPS       := $(OBJS:.o=.d)
+
+# Stamp file holding the effective compiler and flag set. Its recipe
+# runs on every invocation but rewrites the file only when the content
+# differs, so `make` followed by `make LAYOUT=ddr` (or a different
+# CFLAGS_EXTRA / CROSS_COMPILE) recompiles and relinks instead of
+# silently reusing objects and an app.elf built for the other layout.
+# The value is passed through the environment to avoid shell quoting
+# problems with flags that contain quotes.
+FLAGS_STAMP := .build_flags
+BUILD_FLAGS := $(CC) | $(CFLAGS) | $(ASFLAGS) | $(LDFLAGS)
+export BUILD_FLAGS
+
+all: app.elf
+	@echo "Built: app.elf"
+	@$(SIZE) app.elf
+
+app.elf: $(OBJS) $(LDSCRIPT) $(FLAGS_STAMP)
+	$(CC) $(CFLAGS) $(OBJS) $(LDFLAGS) \
+	    -Wl,--start-group -lc -lgcc -Wl,--end-group -o $@
+
+# wolfIP core: -Wno-zero-length-bounds is needed because wolfIP sizes
+# its timer heap as MAX_TIMERS = MAX_TCPSOCKETS * 3. With our minimum
+# of MAX_TCPSOCKETS=2 (forced by DHCP/ARP timer scheduling, see
+# README), the heap is 6 entries which is fine. The warning fires
+# anyway on the zero-length-array code path that wolfIP includes for
+# the MAX_TCPSOCKETS=0 profile we'd actually like to use; the [0]
+# accesses are runtime-guarded by heap->size > 0 so this is a false
+# positive. Drop the suppression once core decouples the timer count
+# from MAX_TCPSOCKETS.
+$(WOLFIP_OBJ): $(ROOT)/src/wolfip.c
+	$(CC) $(CFLAGS) $(DEPFLAGS) -Wno-zero-length-bounds -Wno-type-limits -c $< -o $@
+
+%.o: %.c
+	$(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@
+
+%.o: %.S
+	$(CC) $(ASFLAGS) $(DEPFLAGS) -c $< -o $@
+
+# Every object also depends on the flag stamp. Listed after the rules
+# that carry recipes so $< in those recipes is still the source file.
+$(OBJS): $(FLAGS_STAMP)
+
+$(FLAGS_STAMP): FORCE
+	@printf '%s\n' "$$BUILD_FLAGS" | cmp -s - $@ 2>/dev/null \
+	    || printf '%s\n' "$$BUILD_FLAGS" > $@
+
+# Build a bootable BOOT.BIN. Requires FSBL_ELF env var pointing to a
+# pre-built ZCU102 FSBL (built in Vitis or PetaLinux). bootgen itself
+# is part of Vitis or available standalone. Both paths are quoted so
+# directories containing spaces survive the hand-off to the script.
+bootbin: app.elf
+	@if [ -z "$$FSBL_ELF" ]; then \
+	    echo "ERROR: FSBL_ELF must point to a prebuilt ZCU102 FSBL ELF."; \
+	    exit 1; \
+	fi
+	FSBL_ELF="$$FSBL_ELF" APP_ELF="$(CURDIR)/app.elf" bootgen/build_bootbin.sh
+
+clean:
+	rm -f $(OBJS) $(DEPS) $(FLAGS_STAMP) app.elf BOOT.BIN
+
+.PHONY: all clean bootbin help FORCE
+
+# Deliberately empty: a phony prerequisite that forces the stamp
+# recipe to run on every invocation.
+FORCE:
+
+help:
+	@echo "ZCU102 wolfIP build:"
+	@echo "  make                       - build app.elf (OCM layout)"
+	@echo "  make LAYOUT=ddr            - DDR layout for wolfBoot"
+	@echo "  FSBL_ELF=... make bootbin  - build BOOT.BIN"
+	@echo "  make clean                 - remove artifacts"
+	@echo ""
+	@echo "Override CROSS_COMPILE if your toolchain prefix differs."
+
+# Header dependencies written by DEPFLAGS; absent on a clean tree,
+# hence the leading '-'.
+-include $(DEPS)
diff --git a/src/port/zcu102/README.md b/src/port/zcu102/README.md
new file mode 100644
index 00000000..22f2831e
--- /dev/null
+++ b/src/port/zcu102/README.md
@@ -0,0 +1,199 @@
+# wolfIP port: Xilinx ZCU102 (UltraScale+ MPSoC)
+
+Bare-metal wolfIP port for the AMD/Xilinx Zynq UltraScale+ MPSoC, demoed
+on the ZCU102 dev board. Targets a single Cortex-A53 core (APU 0) at
+EL3, GCC bare-metal, no Xilinx Standalone BSP, no FreeRTOS, no wolfBoot.
+
+This first milestone is aimed at a deterministic UDP-only profile
+suitable for DO-178C DAL-C qualification. The application opens a
+UDP echo socket on port 7 and runs a DHCP client to acquire a lease.
+
+## What this port covers
+
+- PS-GEM3 (on-board RJ45) at 1 Gbps via the TI DP83867IR PHY (RGMII).
+- IRQ-driven RX via GIC-400 SPI 63 (`gem_isr()` runs from the EL3 IRQ
+  vector), polled TX. Requires `SCR_EL3.IRQ=1` to actually enter the
+  exception on this A53 even though we run at EL3 -- see the comment
+  in `startup.S`.
+- Clean-room Cadence GEM driver - no XEmacPs, no Xilinx Standalone BSP,
+  no `xparameters.h`. All register base addresses live in `board.h`.
+- MMU at EL3 with a static page table: DDR Normal WB, peripherals + Device-nGnRnE, and an OCM (0xFFFC0000+) Normal-WB executable block + where this app currently lives (text, data, BSS, page tables, and + the GEM BDs/frame buffers all in OCM). GEM DMA coherency is handled + with explicit DC CVAC / IVAC ops in `gem.c`. A Normal-NC DMA + carve-out is reserved in the L2_DDR table for a future layout that + spills `.dma_buffers` into DDR but is dormant today. +- PS-UART0 polled console (USB-UART on the ZCU102 board, channel 0). +- DHCP client and a UDP echo demo (port 7); ICMP echo reply works + through the wolfIP core. + +## What is explicitly NOT in this port yet + +- Software VLAN (Daniele has a separate wolfIP-core PR in flight). +- uC/OS-II socket port (planned follow-up; trivially adapts an existing + `bsd_socket.c`). +- Additional GEM instances (GEM0/1/2). Driver is single-instance. +- Versal Gen 1, Zynq-7000. +- wolfBoot integration. Stock Xilinx FSBL hands control directly to + `app.elf`. +- TLS / wolfSSL. + +## Hardware + +- AMD/Xilinx ZCU102 evaluation board (XCZU9EG-2FFVB1156). Rev 1.0 or + 1.1 are both fine. +- USB-UART via the on-board FTDI FT4232 (host sees four `/dev/ttyUSB*` + channels; UART0 is the standard one, typically `/dev/ttyUSB0` or the + channel labelled "MIO" depending on board / udev). +- Ethernet via the on-board RJ45 (PS-GEM3 -> DP83867 PHY @ MDIO 0x0C). + +## Build + +Toolchain: ARM GNU `aarch64-none-elf-gcc`. The default is on `$PATH`; +override with `CROSS_COMPILE=...-` if needed. + +``` +cd src/port/zcu102 +make CROSS_COMPILE=aarch64-none-elf- +``` + +Output: `app.elf`. Section sizes are printed at the end of the build. + +## Build BOOT.BIN + +You need a pre-built ZCU102 FSBL ELF. The simplest way to obtain one +is the Vitis "zynqmp_fsbl" template (single-click build), or PetaLinux +`petalinux-build -c bootloader`. We deliberately do NOT vendor FSBL +sources here; FSBL is a Xilinx-provided component and stock works. 
+ +Source Vitis first (so `bootgen` is on `$PATH`), then: + +``` +FSBL_ELF=/path/to/zynqmp_fsbl.elf make bootbin +``` + +Output: `BOOT.BIN` in the port directory. + +## Boot + +### SD card boot + +1. Format a microSD as FAT32. +2. Copy `BOOT.BIN` to the root of the SD card. +3. Set ZCU102 boot mode DIP SW6 to SD (positions 1-4 = ON, OFF, OFF, OFF). +4. Insert the card and power-cycle the board. + +### JTAG boot (Vitis xsct) + +``` +xsct +% connect +% targets -set -filter {name =~ "PSU"} +% rst -system +% loadhw -hw /path/to/your-design.xsa +% targets -set -filter {name =~ "Cortex-A53 #0"} +% dow /path/to/wolfip/src/port/zcu102/app.elf +% con +``` + +If you do not have an XSA from your own design, the stock ZCU102 base +design from Vitis is fine - we only depend on the PS configuration +(DDR controller, MIO pinmuxing, IOPLL clocks) which is identical +across base designs. + +### JTAG iteration (no SD swap) + +This port ships a self-contained xsdb loader under `jtag/` that +power-cycles the board (via remote Pi GPIO, optional), forces JTAG +boot mode, runs `psu_init`, loads `app.elf` into OCM, and releases +A53-0 at the OCM entry. The whole app + BSS + page tables + DMA +buffers fit in the 256 KB OCM, so DDR-via-JTAG flakiness is avoided. + +``` +./jtag/boot.sh # one-shot +./jtag/boot_iter.sh # build + power-cycle + load loop +``` + +See `jtag/boot.tcl` for the actual xsdb sequence. + +## Expected UART output + +``` +=== wolfIP ZCU102 (UltraScale+ A53-0 EL3) === +MMU on, caches on. Bringing up GIC-400... +Initializing wolfIP stack... +Bringing up GEM3 (RGMII, DP83867)... +GEM3: PHY at MDIO addr=0x0000000C +DP83867: ID1=0x00002000 ID2=0x0000A231 +DP83867 link: 1000 Mbps FD + link UP, PHY=0x0000000C +Starting DHCP client... +DHCP bound: + IP: 192.168.1.50 + Mask: 255.255.255.0 + GW: 192.168.1.1 +Opening UDP echo socket on port 7 +Ready. 
Try: nc -u 7 +``` + +## Verification + +From a host on the same subnet as the board: + +``` +$ ping -c 3 192.168.1.50 +$ echo "hello wolfip" | nc -u -w1 192.168.1.50 7 +hello wolfip +``` + +UART capture via the `uart-monitor` skill (add a board entry pointing +at `/dev/ttyUSB0` and 115200 8N1). + +## Files + +| File | Purpose | +|---------------------|---------| +| `Makefile` | Build app.elf and BOOT.BIN | +| `target.ld` | aarch64 EL3 linker script - separate RX/RW segments, 2 MB DMA region | +| `startup.S` | EL3 vectors, BSS clear, MMU/main bring-up, IRQ trampoline | +| `board.h` | PS register base addresses, GIC SPI IDs | +| `mmu.c` / `.h` | EL3 page tables (T0SZ=32, 1 GB L1 + 2 MB L2 for DDR + DMA carve-out) | +| `gic.c` / `.h` | GIC-400 (GICv2) minimal driver | +| `uart.c` / `.h` | PS-UART0 polled console | +| `gem.c` / `.h` | Cadence GEM driver (PS-GEM3): BDs, polled-RX/TX, MDIO, cache maintenance | +| `phy_dp83867.c` / `.h` | TI DP83867IR init + RGMII skew + AN + RX_CTRL strap quirk | +| `main.c` | wolfIP init, DHCP client, UDP echo on port 7, memset/memcpy wrappers | +| `config.h` | wolfIP build profile (UDP-only intent) | +| `bootgen/boot.bif` | bootgen template (substitutes `${FSBL_ELF}` and `${APP_ELF}`) | +| `bootgen/build_bootbin.sh` | renders the bif and invokes bootgen | +| `jtag/boot.sh` / `.tcl` | xsdb loader for OCM-only JTAG iteration | + +## Notes for cert / DAL-C + +- No Xilinx Standalone BSP linked in. `aarch64-none-elf-gcc` newlib + provides `memcpy`/`memset` only. +- No dynamic allocation. All buffers static in BSS or `.dma_buffers`. +- No floating point (`-mgeneral-regs-only`). +- The MAC address is hard-coded in `board.h`. Replace with a + per-board value (e.g., read from EEPROM or PS_VERSION fuses) for + production; we keep static for repeatability in the lab. +- The wolfIP core currently sizes its timer heap as + `MAX_TIMERS = MAX_TCPSOCKETS * 3`. 
This port sets `MAX_TCPSOCKETS=2` + in `config.h` so DHCP / ARP can schedule timers; the application + does not open any TCP sockets. A core wolfIP follow-up should + decouple the timer count from TCP so the TCP code can be fully + excluded from a DAL-C build. +- The wolfIP core triggers two false-positive GCC warnings + (`-Wzero-length-bounds`, `-Wtype-limits`) when `MAX_TCPSOCKETS` + reaches its lower bound. We suppress them on the wolfip.c compile + only; the diagnostics on this port's source remain at `-Wall -Wextra + -Werror`. +- newlib's aarch64 `memset`/`memcpy` use `dc zva`, which hangs on this + Cortex-A53 setup even with `SCTLR_EL3.DZE=1`. We override both with + bytewise versions in `main.c` via `-Wl,--wrap`. + +## Known issues + +- `MAX_TCPSOCKETS=2` is the minimum for the current wolfIP core - see + the timer-heap note above. diff --git a/src/port/zcu102/board.h b/src/port/zcu102/board.h new file mode 100644 index 00000000..7429103d --- /dev/null +++ b/src/port/zcu102/board.h @@ -0,0 +1,108 @@ +/* board.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Xilinx UltraScale+ MPSoC PS register base addresses, GIC SPI IDs, + * and clock parents for the ZCU102 board. 
All values are derived from + * the ZynqMP TRM (UG1085) and the ZCU102 board user guide (UG1182). + * No Xilinx BSP header (xparameters.h) is required. + */ +#ifndef ZCU102_BOARD_H +#define ZCU102_BOARD_H + +#include + +/* --------------------------------------------------------------------- + * Memory map (ZynqMP PS) + * ------------------------------------------------------------------- */ +#define DDR_BASE 0x00000000UL +#define DDR_SIZE 0x80000000UL /* 2 GB lower bank */ + +#define OCM_BASE 0xFFFC0000UL +#define OCM_SIZE 0x00040000UL /* 256 KB */ + +/* --------------------------------------------------------------------- + * PS peripherals + * ------------------------------------------------------------------- */ +#define UART0_BASE 0xFF000000UL +#define UART1_BASE 0xFF010000UL + +#define GEM0_BASE 0xFF0B0000UL +#define GEM1_BASE 0xFF0C0000UL +#define GEM2_BASE 0xFF0D0000UL +#define GEM3_BASE 0xFF0E0000UL + +#define CRL_APB_BASE 0xFF5E0000UL +#define IOU_SLCR_BASE 0xFF180000UL + +/* GIC-400 distributor and CPU interface (per ZynqMP TRM). */ +#define GICD_BASE 0xF9010000UL +#define GICC_BASE 0xF9020000UL + +/* --------------------------------------------------------------------- + * GIC SPI numbers as GIC INTIDs (ARM GIC numbering: SPI N -> INTID 32+N). + * The ZynqMP TRM Table 13-1 column "SPI ID" is the GIC_SPI offset (0..) + * used in Linux device trees; the actual GIC INTID is 32 + that offset. + * We use INTIDs directly throughout this driver, so add 32. 
+ * ------------------------------------------------------------------- */ +#define IRQ_GEM0 (32 + 57) /* GIC_SPI 57 -> INTID 89 */ +#define IRQ_GEM1 (32 + 59) /* GIC_SPI 59 -> INTID 91 */ +#define IRQ_GEM2 (32 + 61) /* GIC_SPI 61 -> INTID 93 */ +#define IRQ_GEM3 (32 + 63) /* GIC_SPI 63 -> INTID 95 + * on-board ZCU102 RJ45 */ + +/* --------------------------------------------------------------------- + * CRL_APB clock and reset registers + * ------------------------------------------------------------------- */ +#define CRL_APB_GEM3_REF_CTRL (CRL_APB_BASE + 0x5C) +#define CRL_APB_RST_LPD_IOU0 (CRL_APB_BASE + 0x230) /* GEM3 reset bit 3 */ + +/* --------------------------------------------------------------------- + * PS UART0 (Cadence) - on-board USB-UART on ZCU102 via U104 FT4232 + * ------------------------------------------------------------------- */ +#define UART_BAUD 115200 + +/* Default A53-0 EL3 stack location (set in startup.S, mirrored here for + * any C code that needs to know). */ +#define A53_STACK_TOP 0x00100000UL /* 1 MB - 8 B */ + +/* MAC address for eth0. Locally-administered, even first octet: + * 02:00:5A:11:22:33. Each byte is individually overridable via + * build-time -DWOLFIP_MAC_n=0xXX so callers can swap any subset + * (e.g. only the last three bytes from an EEPROM-derived value). */ +#ifndef WOLFIP_MAC_0 +#define WOLFIP_MAC_0 0x02 +#endif +#ifndef WOLFIP_MAC_1 +#define WOLFIP_MAC_1 0x00 +#endif +#ifndef WOLFIP_MAC_2 +#define WOLFIP_MAC_2 0x5A +#endif +#ifndef WOLFIP_MAC_3 +#define WOLFIP_MAC_3 0x11 +#endif +#ifndef WOLFIP_MAC_4 +#define WOLFIP_MAC_4 0x22 +#endif +#ifndef WOLFIP_MAC_5 +#define WOLFIP_MAC_5 0x33 +#endif + +#endif /* ZCU102_BOARD_H */ diff --git a/src/port/zcu102/bootgen/boot.bif b/src/port/zcu102/bootgen/boot.bif new file mode 100644 index 00000000..7d69f741 --- /dev/null +++ b/src/port/zcu102/bootgen/boot.bif @@ -0,0 +1,14 @@ +// ZCU102 BOOT.BIN definition for wolfIP bare-metal app. 
+// +// Variables expanded by build_bootbin.sh: +// ${FSBL_ELF} - path to a pre-built ZynqMP FSBL (A53-0, EL3, NS) +// ${APP_ELF} - path to the wolfIP app ELF (this directory's app.elf) +// +// bootgen consumes this file with: +// bootgen -arch zynqmp -image boot.bif -w on -o BOOT.BIN + +the_ROM_image: +{ + [bootloader, destination_cpu=a53-0] ${FSBL_ELF} + [destination_cpu=a53-0] ${APP_ELF} +} diff --git a/src/port/zcu102/bootgen/build_bootbin.sh b/src/port/zcu102/bootgen/build_bootbin.sh new file mode 100755 index 00000000..0069760f --- /dev/null +++ b/src/port/zcu102/bootgen/build_bootbin.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# +# Build BOOT.BIN for the wolfIP ZCU102 bare-metal app. +# +# Required env vars: +# FSBL_ELF - path to a prebuilt ZynqMP FSBL ELF (A53-0, EL3, NS). +# Build this once in Vitis (helloworld template -> zynqmp_fsbl) +# or in PetaLinux; we do not vendor FSBL sources here. +# APP_ELF - path to the wolfIP app ELF. The Makefile's "bootbin" +# target sets this for you to $PWD/app.elf. +# +# Optional: +# BOOTGEN - path to the bootgen binary (default: from $PATH). +# OUT_DIR - where to place BOOT.BIN (default: parent of this script). +# +set -euo pipefail + +if [[ -z "${FSBL_ELF:-}" ]]; then + echo "ERROR: FSBL_ELF env var must point to a ZynqMP FSBL ELF." >&2 + exit 1 +fi +if [[ -z "${APP_ELF:-}" ]]; then + echo "ERROR: APP_ELF env var must point to the wolfIP app ELF." >&2 + exit 1 +fi +if [[ ! -f "${FSBL_ELF}" ]]; then + echo "ERROR: FSBL_ELF '${FSBL_ELF}' not found." >&2 + exit 1 +fi +if [[ ! -f "${APP_ELF}" ]]; then + echo "ERROR: APP_ELF '${APP_ELF}' not found." >&2 + exit 1 +fi + +BOOTGEN="${BOOTGEN:-bootgen}" +if ! command -v "${BOOTGEN}" >/dev/null 2>&1; then + echo "ERROR: bootgen not found. Source Vitis (settings64.sh) first." 
>&2 + exit 1 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OUT_DIR="${OUT_DIR:-$(dirname "${SCRIPT_DIR}")}" +BIF_TEMPLATE="${SCRIPT_DIR}/boot.bif" +BIF_RENDERED="$(mktemp -t wolfip-zcu102-bif.XXXXXX)" +trap 'rm -f "${BIF_RENDERED}"' EXIT + +# Substitute ${FSBL_ELF} and ${APP_ELF} in the bif template. +sed \ + -e "s|\${FSBL_ELF}|${FSBL_ELF}|g" \ + -e "s|\${APP_ELF}|${APP_ELF}|g" \ + "${BIF_TEMPLATE}" > "${BIF_RENDERED}" + +cd "${OUT_DIR}" +"${BOOTGEN}" -arch zynqmp -image "${BIF_RENDERED}" -w on -o BOOT.BIN + +echo "BOOT.BIN written to: ${OUT_DIR}/BOOT.BIN" diff --git a/src/port/zcu102/config.h b/src/port/zcu102/config.h new file mode 100644 index 00000000..a23992cb --- /dev/null +++ b/src/port/zcu102/config.h @@ -0,0 +1,82 @@ +/* config.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * wolfIP configuration for Xilinx ZCU102 (UltraScale+ MPSoC, A53-0 EL3 + * bare-metal). UDP-only profile aimed at deterministic DAL-C use. + */ +#ifndef WOLF_CONFIG_H +#define WOLF_CONFIG_H + +#ifndef CONFIG_IPFILTER +#define CONFIG_IPFILTER 0 +#endif + +#define ETHERNET +#define LINK_MTU 1536 + +/* UDP-only profile in intent: the application does not call + * wolfIP_sock_socket() with IPSTACK_SOCK_STREAM. 
MAX_TCPSOCKETS is set + * to a small non-zero value only because core wolfIP currently sizes + * its timer heap via MAX_TIMERS = MAX_TCPSOCKETS * 3, and DHCP / ARP + * aging need timers. With MAX_TCPSOCKETS=0 the timer-heap insert path + * is permanently full and DHCP cannot schedule its retransmit timer. + * A core wolfIP follow-up should decouple MAX_TIMERS from + * MAX_TCPSOCKETS so DAL-C builds can truly opt TCP code out at + * compile time. */ +#define MAX_TCPSOCKETS 2 +#define MAX_UDPSOCKETS 4 +#define MAX_ICMPSOCKETS 1 +#define RXBUF_SIZE (LINK_MTU * 4) +#define TXBUF_SIZE (LINK_MTU * 4) + +#define MAX_NEIGHBORS 16 + +#ifndef WOLFIP_MAX_INTERFACES +#define WOLFIP_MAX_INTERFACES 1 +#endif + +#ifndef WOLFIP_ENABLE_FORWARDING +#define WOLFIP_ENABLE_FORWARDING 0 +#endif + +#ifndef WOLFIP_ENABLE_LOOPBACK +#define WOLFIP_ENABLE_LOOPBACK 0 +#endif + +#ifndef WOLFIP_ENABLE_DHCP +#define WOLFIP_ENABLE_DHCP 1 +#endif + +/* Static IP fallback (used if DHCP is disabled or times out). */ +#define WOLFIP_IP "192.168.1.100" +#define WOLFIP_NETMASK "255.255.255.0" +#define WOLFIP_GW "192.168.1.1" +#define WOLFIP_STATIC_DNS_IP "8.8.8.8" + +#if WOLFIP_ENABLE_DHCP +#define DHCP +#define DHCP_DISCOVER_RETRIES 2 +#define DHCP_REQUEST_RETRIES 2 +#endif + +/* Hardware debug: define for verbose GEM / MDIO / DHCP logging. */ +/* #define DEBUG_HW */ + +#endif /* WOLF_CONFIG_H */ diff --git a/src/port/zcu102/entropy.c b/src/port/zcu102/entropy.c new file mode 100644 index 00000000..b454af55 --- /dev/null +++ b/src/port/zcu102/entropy.c @@ -0,0 +1,116 @@ +/* entropy.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. 
 *
 * wolfIP is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 *
 * MemUse-pattern entropy source for the wolfIP ZCU102 port.
 *
 * The ZCU102's UltraScale+ PS does not ship a hardware TRNG that is
 * usable from EL3 bare-metal without the PMU firmware and CSU helpers.
 * This source produces non-deterministic 32-bit words by sampling the
 * Cortex-A53 virtual count register (CNTVCT_EL0) before and after a
 * memory-access loop that walks a 1 KiB state buffer (16 cache lines,
 * far smaller than the 32 KB L1 data cache). The cache-miss /
 * line-fill / write-allocate timing
 * variance is the entropy source - the same primitive wolfCrypt's
 * wc_Entropy_Get() (HAVE_ENTROPY_MEMUSE in wolfssl/wolfcrypt/src/
 * wolfentropy.c) uses internally.
 *
 * This implementation skips wolfCrypt's SHA3-256 conditioning because
 * the consumers in wolfIP (TCP ISN, DHCP/DNS transaction IDs,
 * ephemeral source ports, IP fragment ID) need unpredictable bits,
 * not uniformly-distributed cryptographic randomness. For crypto-
 * grade seeding the port should be rebuilt with the full wolfCrypt
 * wc_Entropy_Get() in place of zcu102_get_random32().
 *
 * Algorithm per call:
 *   1. t0 = CNTVCT_EL0
 *   2. Walk state[] performing read+xor+write; the ~256 accesses span
 *      multiple L1 cache lines on this 32 KB / 4-way A53 cache.
 *   3. t1 = CNTVCT_EL0
 *   4. Fold (t1 - t0) into the rolling 64-bit accumulator and
 *      perturb state[] so the next call diverges.
 *   5. Apply a non-cryptographic finaliser (xorshift) and return
 *      the low 32 bits.
+ * + * The state buffer is 1024 bytes (sized to span the A53's 64-byte + * line size 16 times, ensuring at least a handful of cache misses + * per call even on a warm cache). + */ +#include + +#define ENTROPY_STATE_WORDS 128u /* 1024 bytes, 16 cache lines */ +#define ENTROPY_WALK_ITERS 256u + +static volatile uint64_t entropy_state[ENTROPY_STATE_WORDS]; +static volatile uint64_t entropy_acc; +static volatile uint32_t entropy_idx; + +static inline uint64_t cntvct_el0(void) +{ + uint64_t v; + __asm__ volatile ("mrs %0, cntvct_el0" : "=r"(v)); + return v; +} + +/* Return a 32-bit value with low predictability, suitable for + * protocol identifiers (DHCP xid, DNS id, TCP ISN, ephemeral port, + * IP fragment id). Not crypto-grade; see file header. */ +uint32_t zcu102_get_random32(void) +{ + uint64_t t0, t1, delta; + uint64_t acc; + uint32_t i; + uint32_t walk_idx; + + t0 = cntvct_el0(); + + /* Memory-access loop: stride through the state array. Using a + * data-dependent index (acc & mask) keeps the prefetcher from + * predicting cache lines, which is exactly the timing noise we + * want to harvest. */ + acc = entropy_acc; + walk_idx = entropy_idx; + for (i = 0; i < ENTROPY_WALK_ITERS; i++) { + uint32_t pos = (walk_idx + (uint32_t)(acc & 0x7Fu)) + & (ENTROPY_STATE_WORDS - 1u); + uint64_t v = entropy_state[pos]; + v ^= acc; + v = (v << 1) | (v >> 63); /* rotate left 1 */ + entropy_state[pos] = v; + acc += v; + walk_idx++; + } + + t1 = cntvct_el0(); + delta = t1 - t0; + + /* Fold the timing delta into the accumulator and the head of + * the state ring. */ + acc ^= delta; + acc ^= (delta << 17) | (delta >> 47); + entropy_state[walk_idx & (ENTROPY_STATE_WORDS - 1u)] ^= acc; + entropy_acc = acc; + entropy_idx = walk_idx; + + /* xorshift64 finaliser to whiten the output word. 
*/ + acc ^= acc << 13; + acc ^= acc >> 7; + acc ^= acc << 17; + + return (uint32_t)acc; +} diff --git a/src/port/zcu102/flash_sd.sh b/src/port/zcu102/flash_sd.sh new file mode 100755 index 00000000..337088d8 --- /dev/null +++ b/src/port/zcu102/flash_sd.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +# +# flash_sd.sh - copy wolfIP ZCU102 BOOT.BIN to the SD card's boot partition. +# +# Usage: +# ./flash_sd.sh # uses /dev/sdb (default), src/port/zcu102/BOOT.BIN +# SD_DEV=/dev/sdc ./flash_sd.sh +# BOOTBIN=/path/to/BOOT.BIN ./flash_sd.sh +# +# Defensive: refuses to write to a device that is not flagged removable +# by the kernel, or any device larger than 128 GiB (so it cannot ever +# scribble on your system SSD by accident). +# +set -euo pipefail + +SD_DEV="${SD_DEV:-/dev/sdb}" +PART="${SD_DEV}1" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BOOTBIN="${BOOTBIN:-${SCRIPT_DIR}/BOOT.BIN}" + +red() { printf '\033[1;31m%s\033[0m\n' "$*" >&2; } +green() { printf '\033[1;32m%s\033[0m\n' "$*"; } +note() { printf ' %s\n' "$*"; } + +# --- Sanity checks ------------------------------------------------------- + +if [[ ! -b "${SD_DEV}" ]]; then + red "ERROR: ${SD_DEV} is not a block device." + exit 1 +fi +if [[ ! -b "${PART}" ]]; then + red "ERROR: boot partition ${PART} not found." + red " Did you insert the card and pick the right SD_DEV?" + exit 1 +fi + +RM=$(lsblk -dn -o RM "${SD_DEV}" | tr -d '[:space:]') +if [[ "${RM}" != "1" ]]; then + red "ERROR: ${SD_DEV} is not marked removable (RM=${RM})." + red " Refusing to write - this looks like a fixed disk." + exit 1 +fi + +SIZE_BYTES=$(lsblk -dn -o SIZE -b "${SD_DEV}" | tr -d '[:space:]') +SIZE_GIB=$(( SIZE_BYTES / 1024 / 1024 / 1024 )) +if (( SIZE_GIB > 128 )); then + red "ERROR: ${SD_DEV} is ${SIZE_GIB} GiB - too large for an SD card." + red " Refusing to write." + exit 1 +fi + +if [[ ! -f "${BOOTBIN}" ]]; then + red "ERROR: ${BOOTBIN} not found. Did you run 'make bootbin'?" 
#!/usr/bin/env bash
#
# flash_sd.sh - copy wolfIP ZCU102 BOOT.BIN to the SD card's boot partition.
#
# Usage:
#   ./flash_sd.sh                          # uses /dev/sdb (default), src/port/zcu102/BOOT.BIN
#   SD_DEV=/dev/sdc ./flash_sd.sh
#   SD_DEV=/dev/mmcblk0 ./flash_sd.sh      # boot partition resolves to /dev/mmcblk0p1
#   BOOTBIN=/path/to/BOOT.BIN ./flash_sd.sh
#
# Defensive: refuses to write to a device that is not flagged removable
# by the kernel, or any device larger than 128 GiB (so it cannot ever
# scribble on your system SSD by accident).
#
# If this script mounted the partition itself, it also unmounts it on any
# early exit (set -e failure, Ctrl-C), not just on success.
#
set -euo pipefail

SD_DEV="${SD_DEV:-/dev/sdb}"
# Linux names the first partition "<dev>1", except when the disk name
# itself ends in a digit (mmcblk0, nvme0n1, loop0): then it is "<dev>p1".
if [[ "${SD_DEV}" =~ [0-9]$ ]]; then
    PART="${SD_DEV}p1"
else
    PART="${SD_DEV}1"
fi
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BOOTBIN="${BOOTBIN:-${SCRIPT_DIR}/BOOT.BIN}"

red() { printf '\033[1;31m%s\033[0m\n' "$*" >&2; }
green() { printf '\033[1;32m%s\033[0m\n' "$*"; }
note() { printf ' %s\n' "$*"; }

# Set to 1 only while a mount performed by this script is still active.
WE_MOUNTED=0

# EXIT handler: best-effort unmount if we are bailing out with our own
# mount still in place. Errors are ignored so the original exit status
# is what the caller sees.
cleanup() {
    if (( WE_MOUNTED == 1 )); then
        udisksctl unmount -b "${PART}" >/dev/null 2>&1 || true
    fi
}
trap cleanup EXIT

# --- Sanity checks -------------------------------------------------------

if [[ ! -b "${SD_DEV}" ]]; then
    red "ERROR: ${SD_DEV} is not a block device."
    exit 1
fi
if [[ ! -b "${PART}" ]]; then
    red "ERROR: boot partition ${PART} not found."
    red " Did you insert the card and pick the right SD_DEV?"
    exit 1
fi

RM=$(lsblk -dn -o RM "${SD_DEV}" | tr -d '[:space:]')
if [[ "${RM}" != "1" ]]; then
    red "ERROR: ${SD_DEV} is not marked removable (RM=${RM})."
    red " Refusing to write - this looks like a fixed disk."
    exit 1
fi

SIZE_BYTES=$(lsblk -dn -o SIZE -b "${SD_DEV}" | tr -d '[:space:]')
SIZE_GIB=$(( SIZE_BYTES / 1024 / 1024 / 1024 ))
if (( SIZE_GIB > 128 )); then
    red "ERROR: ${SD_DEV} is ${SIZE_GIB} GiB - too large for an SD card."
    red " Refusing to write."
    exit 1
fi

if [[ ! -f "${BOOTBIN}" ]]; then
    red "ERROR: ${BOOTBIN} not found. Did you run 'make bootbin'?"
    exit 1
fi

note "SD device : ${SD_DEV} (${SIZE_GIB} GiB, removable)"
note "Boot partition: ${PART}"
note "Source : ${BOOTBIN}"
echo

# --- Mount (idempotent) --------------------------------------------------

MNT=$(lsblk -no MOUNTPOINT "${PART}")
if [[ -z "${MNT}" ]]; then
    note "Mounting ${PART} via udisksctl..."
    udisksctl mount -b "${PART}" >/dev/null
    # From here on the EXIT trap is responsible for undoing the mount.
    WE_MOUNTED=1
    MNT=$(lsblk -no MOUNTPOINT "${PART}")
fi
if [[ -z "${MNT}" ]]; then
    red "ERROR: ${PART} did not mount."
    exit 1
fi
note "Mountpoint : ${MNT}"

# Verify FAT - cheap heuristic: check filesystem type via lsblk.
# Warning only; the copy still proceeds.
FSTYPE=$(lsblk -no FSTYPE "${PART}")
if [[ "${FSTYPE}" != "vfat" && "${FSTYPE}" != "exfat" && "${FSTYPE}" != "msdos" ]]; then
    red "WARN: ${PART} filesystem is '${FSTYPE}', expected vfat for ZCU102 SD boot."
fi

# --- Backup and copy -----------------------------------------------------

if [[ -f "${MNT}/BOOT.BIN" ]]; then
    OLD_SZ=$(stat -c%s "${MNT}/BOOT.BIN")
    cp --preserve=timestamps "${MNT}/BOOT.BIN" "${MNT}/BOOT.BIN.bak"
    note "Backed up existing BOOT.BIN (${OLD_SZ} bytes) -> BOOT.BIN.bak"
fi

cp "${BOOTBIN}" "${MNT}/BOOT.BIN"
sync
NEW_SZ=$(stat -c%s "${MNT}/BOOT.BIN")
note "Wrote ${NEW_SZ} bytes to ${MNT}/BOOT.BIN"

# --- Unmount -------------------------------------------------------------

if (( WE_MOUNTED == 1 )); then
    note "Unmounting ${PART}..."
    udisksctl unmount -b "${PART}" >/dev/null
    WE_MOUNTED=0    # already unmounted; keep the EXIT trap from retrying
fi
sync

green "Done. Safe to remove the SD card and boot the board."
echo
note "Watch UART log: tail -f /tmp/uart-monitor/latest/ZYNQMP_ZCU102_UART0.log"
note "Or: uart-monitor tail ZYNQMP_ZCU102_UART0"
+ * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Cadence GEM driver for ZynqMP GEM3 (on-board RJ45 on ZCU102). + * + * - 32-bit DMA addressing (OCM low bank, well under 4 GB). + * - IRQ-driven RX (GIC-400 SPI 63 -> gem_isr) and polled TX. Note: + * the SCR_EL3.IRQ routing bit must be set on this A53 for the + * exception to actually be entered, despite the ARM ARM appearing + * to say SCR_EL3.IRQ only affects lower-EL routing. See + * startup.S for the explicit SCR_EL3 setup. + * - BDs and frame buffers live in the .dma_buffers section, which the + * linker places in OCM (Normal-WB executable per L2_PERIPH[511]). + * Cache coherency between CPU L1 D-cache and the MAC DMA path is + * maintained explicitly via cache_clean() / cache_inval() at every + * BD hand-off. + * + * Register set per ZynqMP TRM (UG1085) chapter 34 / Cadence GEM. + */ +#include +#include +#include "config.h" +#include "../../../wolfip.h" +#include "board.h" +#include "uart.h" +#include "gic.h" +#include "gem.h" +#include "phy_dp83867.h" + +/* Cache maintenance helpers for GEM DMA coherency. Cortex-A53 cache + * line is 64 bytes. With D-cache enabled and BD/buffers in normal + * cacheable memory, CPU writes may sit in L1 D-cache and not be + * visible to the MAC's DMA path. 
/* Cache maintenance helpers for GEM DMA coherency.
 *
 * cache_clean() writes dirty lines back to memory (DC CVAC) so a device
 * reading memory sees CPU stores; cache_inval() invalidates lines
 * (DC IVAC) so later CPU loads refetch what the device wrote. Both
 * operate on every CACHE_LINE-sized line overlapping [p, p + sz) and
 * finish with a DSB SY.
 *
 * The line mask is built at uintptr_t width. The earlier form,
 * `~(CACHE_LINE - 1u)`, is a 32-bit unsigned value; zero-extended to a
 * 64-bit uintptr_t it cleared address bits 63:32, so a buffer above
 * 4 GiB would have had the wrong lines maintained. */
#define CACHE_LINE 64u

/* Round addr down to the first byte of its cache line. */
static inline uintptr_t cache_line_floor(uintptr_t addr)
{
    return addr & ~((uintptr_t)CACHE_LINE - 1u);
}

/* Round addr up to the next cache-line boundary (unchanged if aligned). */
static inline uintptr_t cache_line_ceil(uintptr_t addr)
{
    return cache_line_floor(addr + ((uintptr_t)CACHE_LINE - 1u));
}

/* Clean (write back) every cache line overlapping [p, p + sz).
 * p  - start of the region; need not be line-aligned.
 * sz - length in bytes; 0 with an aligned p touches no line. */
static inline void cache_clean(const void *p, uint32_t sz)
{
    uintptr_t a = cache_line_floor((uintptr_t)p);
    uintptr_t end = cache_line_ceil((uintptr_t)p + sz);

    for (; a < end; a += CACHE_LINE)
        __asm__ volatile ("dc cvac, %0" :: "r"(a) : "memory");
    __asm__ volatile ("dsb sy" ::: "memory");
}

/* Invalidate every cache line overlapping [p, p + sz).
 * Whole lines are invalidated, so an unaligned p or sz also affects the
 * neighbouring bytes that share the first or last line. */
static inline void cache_inval(const void *p, uint32_t sz)
{
    uintptr_t a = cache_line_floor((uintptr_t)p);
    uintptr_t end = cache_line_ceil((uintptr_t)p + sz);

    for (; a < end; a += CACHE_LINE)
        __asm__ volatile ("dc ivac, %0" :: "r"(a) : "memory");
    __asm__ volatile ("dsb sy" ::: "memory");
}
GEM_LADDR1L (*(volatile uint32_t *)(GEM3_BASE + 0x088)) +#define GEM_LADDR1H (*(volatile uint32_t *)(GEM3_BASE + 0x08C)) +/* Priority queue base addresses (queues 1-3). Cadence GEM has 4 TX + * and 4 RX priority queues; if we don't point unused ones at a safe + * dummy BD, the MAC will eventually try to fetch from queue1+ at + * power-on-random addresses and hang (TSR.TXGO sticks with no octets + * transmitted). U-Boot's zynq_gem and Linux's macb both set these. */ +#define GEM_TXQ1BASE (*(volatile uint32_t *)(GEM3_BASE + 0x440)) +#define GEM_TXQ2BASE (*(volatile uint32_t *)(GEM3_BASE + 0x444)) +#define GEM_TXQ3BASE (*(volatile uint32_t *)(GEM3_BASE + 0x448)) +#define GEM_RXQ1BASE (*(volatile uint32_t *)(GEM3_BASE + 0x480)) +#define GEM_RXQ2BASE (*(volatile uint32_t *)(GEM3_BASE + 0x484)) +#define GEM_RXQ3BASE (*(volatile uint32_t *)(GEM3_BASE + 0x488)) +#define GEM_OCTTXL (*(volatile uint32_t *)(GEM3_BASE + 0x100)) +#define GEM_TXCNT (*(volatile uint32_t *)(GEM3_BASE + 0x108)) +#define GEM_OCTRXL (*(volatile uint32_t *)(GEM3_BASE + 0x150)) +#define GEM_RXCNT (*(volatile uint32_t *)(GEM3_BASE + 0x158)) +#define GEM_RXFCSCNT (*(volatile uint32_t *)(GEM3_BASE + 0x190)) +#define GEM_RXORCNT (*(volatile uint32_t *)(GEM3_BASE + 0x1A4)) + +#define NWCTRL_LOOPEN (1u << 1) +#define NWCTRL_RXEN (1u << 2) +#define NWCTRL_TXEN (1u << 3) +#define NWCTRL_MDEN (1u << 4) +#define NWCTRL_STATCLR (1u << 5) +#define NWCTRL_STARTTX (1u << 9) +#define NWCTRL_HALTTX (1u << 10) + +#define NWCFG_SPEED100 (1u << 0) +#define NWCFG_FDEN (1u << 1) +#define NWCFG_COPYALL (1u << 4) +#define NWCFG_BCASTDI (1u << 5) +#define NWCFG_MCASTHASHEN (1u << 6) +#define NWCFG_UCASTHASHEN (1u << 7) +#define NWCFG_1536RXEN (1u << 8) +#define NWCFG_1000 (1u << 10) +#define NWCFG_FCSREM (1u << 17) +#define NWCFG_MDCDIV_SHIFT 18u +#define NWCFG_MDCDIV_MASK (7u << 18) +#define NWCFG_DWIDTH_64 (1u << 21) /* Data bus width = 64 bit (AArch64) */ + +#define NWSR_PHY_IDLE (1u << 2) + +#define RSR_BUFFNA (1u << 0) 
/* ---------------------------------------------------------------------
 * BD ring and frame buffer sizing
 * ------------------------------------------------------------------- */
/* Ring sizes deliberately small to fit text + DMA buffers + BSS in
 * 256 KB OCM (we keep everything in OCM because DDR-via-JTAG isn't
 * reliable without PMU FW running). For higher throughput, bump
 * these once we move BSS back to DDR. */
#define RX_RING_LEN 16
#define TX_RING_LEN 8
#define BUF_LEN 1536 /* multiple of 64, per DMACR.RXBS */

/* GEM BD: two 32-bit words.
 * addr   - buffer address; this driver also keeps the RX ownership and
 *          wrap flags (RXBUF_OWN_SW, RXBUF_WRAP) in its low bits.
 * status - RX: received length in RXBUF_LEN_MASK; TX: length plus the
 *          TXBUF_USED / TXBUF_WRAP / TXBUF_LAST flags. */
struct gem_bd {
    uint32_t addr;
    uint32_t status;
};

/* All DMA-visible objects go in .dma_buffers and are aligned to the
 * 64-byte cache line.
 * NOTE(review): this comment used to say the section is mapped
 * Device-nGnRnE, while the file header says Normal-WB and the driver
 * does explicit cache_clean()/cache_inval() on these objects. The MMU
 * tables are not visible from here - confirm the actual attribute. */
static struct gem_bd rx_ring[RX_RING_LEN]
    __attribute__((aligned(64), section(".dma_buffers")));
static struct gem_bd tx_ring[TX_RING_LEN]
    __attribute__((aligned(64), section(".dma_buffers")));
static uint8_t rx_buf_pool[RX_RING_LEN][BUF_LEN]
    __attribute__((aligned(64), section(".dma_buffers")));
static uint8_t tx_buf_pool[TX_RING_LEN][BUF_LEN]
    __attribute__((aligned(64), section(".dma_buffers")));
The RX dummy + * has the SW-OWN/NEW bit set so MAC won't write into queue 1-3 RX. */ +static struct gem_bd dummy_tx_bd + __attribute__((aligned(8), section(".dma_buffers"))); +static struct gem_bd dummy_rx_bd + __attribute__((aligned(8), section(".dma_buffers"))); + +/* --------------------------------------------------------------------- + * Software RX queue: filled by ISR, drained by eth_poll() in the main + * loop. Single producer (ISR) / single consumer (main), so a lockless + * head/tail pair is safe when we use DSB to publish writes. + * + * Each slot stores a pointer to one of rx_buf_pool[i] plus length; + * the buffer's BD is recycled after the main loop hands the frame to + * wolfIP. + * ------------------------------------------------------------------- */ +#define SWQ_DEPTH 16 + +struct swq_slot { + uint8_t *buf; + uint16_t len; + uint16_t ring_idx; /* into rx_ring[] - recycle after consume */ +}; + +static volatile struct swq_slot swq[SWQ_DEPTH]; +static volatile uint32_t swq_head; /* ISR writes */ +static volatile uint32_t swq_tail; /* main reads */ +static volatile uint32_t rx_drops; /* ISR-side counter */ +static volatile uint32_t s_irq_count; +static volatile uint32_t s_rx_frames; +static volatile uint32_t s_tx_sent; + +static uint32_t rx_next; /* next BD the SW will look at */ +static uint32_t tx_next; /* next BD the SW will try to TX */ + +static uint8_t phy_addr_used; + +/* --------------------------------------------------------------------- + * MDIO + * ------------------------------------------------------------------- */ +static int mdio_wait_idle(void) +{ + int spin; + for (spin = 0; spin < 100000; spin++) { + if (GEM_NWSR & NWSR_PHY_IDLE) + return 0; + } + return -1; +} + +int gem_mdio_read(uint8_t phy_addr, uint8_t reg, uint16_t *out) +{ + uint32_t v; + if (mdio_wait_idle() < 0) + return -1; + v = PHYMNTNC_CLAUSE22 | PHYMNTNC_OP_R + | (((uint32_t)phy_addr & 0x1Fu) << 23) + | (((uint32_t)reg & 0x1Fu) << 18); + GEM_PHYMNTNC = v; + if 
(mdio_wait_idle() < 0) + return -2; + *out = (uint16_t)(GEM_PHYMNTNC & 0xFFFFu); + return 0; +} + +int gem_mdio_write(uint8_t phy_addr, uint8_t reg, uint16_t value) +{ + uint32_t v; + if (mdio_wait_idle() < 0) + return -1; + v = PHYMNTNC_CLAUSE22 | PHYMNTNC_OP_W + | (((uint32_t)phy_addr & 0x1Fu) << 23) + | (((uint32_t)reg & 0x1Fu) << 18) + | (uint32_t)value; + GEM_PHYMNTNC = v; + if (mdio_wait_idle() < 0) + return -2; + return 0; +} + +/* --------------------------------------------------------------------- + * BD ring init + * ------------------------------------------------------------------- */ +static void rx_ring_init(void) +{ + uint32_t i; + for (i = 0; i < RX_RING_LEN; i++) { + uint32_t addr = (uint32_t)(uintptr_t)rx_buf_pool[i]; + addr &= RXBUF_ADDR_MASK; + if (i == RX_RING_LEN - 1) + addr |= RXBUF_WRAP; + rx_ring[i].addr = addr; /* OWN=0 -> hardware can use */ + rx_ring[i].status = 0; + } + rx_next = 0; +} + +static void tx_ring_init(void) +{ + uint32_t i; + /* Match u-boot zynq_gem pattern: all BDs start as dummies with + * USED|LAST|WRAP, addr=0. eth_send fills in addr + length + LAST + * (and clears USED) when actually transmitting. The WRAP bit on + * the last BD keeps the MAC walker in our ring. */ + for (i = 0; i < TX_RING_LEN; i++) { + tx_ring[i].addr = 0; + tx_ring[i].status = TXBUF_USED | TXBUF_LAST + | ((i == TX_RING_LEN - 1) ? TXBUF_WRAP : 0); + } + tx_next = 0; +} + +/* --------------------------------------------------------------------- + * RX ISR + * ------------------------------------------------------------------- */ +static void gem_isr(void) +{ + uint32_t isr; + + s_irq_count++; + isr = GEM_ISR; + GEM_ISR = isr; /* clear-on-write */ + + /* Invalidate the WHOLE RX ring at entry - MAC may have written + * to any BD, not just rx_next. Cheap (one cache line typically + * since the ring is small). */ + cache_inval(rx_ring, sizeof(rx_ring)); + + /* Walk RX BDs whose SW-OWN bit is set (frame ready for software). 
/* ---------------------------------------------------------------------
 * RX ISR
 * ------------------------------------------------------------------- */
/* GEM interrupt handler (registered for IRQ_GEM3 by zcu102_eth_init).
 *
 * Acknowledges the pending interrupt bits, then walks the RX ring from
 * rx_next for as long as BDs are marked software-owned. Each completed
 * frame is either published to the software queue swq[] (its BD is then
 * recycled later by eth_poll) or, if the queue is full, counted in
 * rx_drops and its BD is handed straight back to the MAC. Finally the
 * RSR status bits for buffer-not-available / overrun are cleared. */
static void gem_isr(void)
{
    uint32_t isr;

    s_irq_count++;
    isr = GEM_ISR;
    GEM_ISR = isr; /* clear-on-write */

    /* Invalidate the WHOLE RX ring at entry - MAC may have written
     * to any BD, not just rx_next. Cheap: 16 BDs x 8 bytes = 128 bytes,
     * i.e. two 64-byte cache lines. */
    cache_inval(rx_ring, sizeof(rx_ring));

    /* Walk RX BDs whose SW-OWN bit is set (frame ready for software). */
    while (rx_ring[rx_next].addr & RXBUF_OWN_SW) {
        /* Counts every completed BD, including frames dropped below. */
        s_rx_frames++;
        /* Also invalidate the buffer before we copy from it. */
        cache_inval(rx_buf_pool[rx_next],
                    rx_ring[rx_next].status & RXBUF_LEN_MASK);
        uint32_t status = rx_ring[rx_next].status;
        uint32_t next_head = swq_head;
        uint32_t slot = next_head % SWQ_DEPTH;
        /* head and tail are free-running counters; their difference is
         * the number of slots currently occupied. */
        uint32_t depth = next_head - swq_tail;

        if (depth >= SWQ_DEPTH) {
            /* SW queue full - drop and recycle the BD. */
            rx_drops++;
        } else {
            swq[slot].buf = rx_buf_pool[rx_next];
            swq[slot].len = (uint16_t)(status & RXBUF_LEN_MASK);
            swq[slot].ring_idx = (uint16_t)rx_next;
            /* Make the slot contents visible before publishing it by
             * advancing swq_head. */
            __asm__ volatile ("dsb sy" ::: "memory");
            swq_head = next_head + 1;
        }

        /* If we have headroom in the SW queue we recycle the BD only
         * after main consumes the slot (see eth_poll); when dropping we
         * recycle here. */
        if (depth >= SWQ_DEPTH) {
            uint32_t addr = (uint32_t)(uintptr_t)rx_buf_pool[rx_next];
            addr &= RXBUF_ADDR_MASK;
            if (rx_next == RX_RING_LEN - 1)
                addr |= RXBUF_WRAP;
            /* Status is cleared first; the addr word (which carries the
             * ownership bit) is written last. */
            rx_ring[rx_next].status = 0;
            __asm__ volatile ("dsb sy" ::: "memory");
            rx_ring[rx_next].addr = addr; /* OWN=0 again */
            /* MAC reads BDs straight from memory; clean the line so it
             * sees OWN=0, otherwise it skips past this BD and walks the
             * ring leaving holes. */
            cache_clean(&rx_ring[rx_next], sizeof(rx_ring[rx_next]));
        }
        rx_next = (rx_next + 1) % RX_RING_LEN;
    }

    /* RXUSED recovery: clear RSR.BUFFNA. With cache_clean on the recycle
     * path, this should be rare.
     * NOTE(review): the earlier comment also promised to "kick the RX
     * path so the MAC re-walks the ring", but nothing here does that -
     * only the status bit is cleared. Confirm whether a kick is needed. */
    if (isr & IXR_RXUSED) {
        GEM_RSR = RSR_BUFFNA;
    }
    /* RX overrun: clear the sticky RSR bit. */
    if (isr & IXR_RXOVR) {
        GEM_RSR = RSR_RXOVR;
    }
}
*/ + if (isr & IXR_RXUSED) { + GEM_RSR = RSR_BUFFNA; + } + if (isr & IXR_RXOVR) { + GEM_RSR = RSR_RXOVR; + } +} + +/* --------------------------------------------------------------------- + * eth_poll / eth_send (called from wolfIP_poll and stack TX path) + * ------------------------------------------------------------------- */ +static int eth_poll(struct wolfIP_ll_dev *ll, void *buf, uint32_t len) +{ + uint32_t tail; + uint32_t slot; + uint32_t copy; + uint32_t addr; + uint16_t idx; + + (void)ll; + + /* RX frames are delivered into swq[] by gem_isr() running off the + * GIC-400 INTID 95 IRQ path (see startup.S SCR_EL3 setup and + * board.h IRQ_GEM3). eth_poll just drains the SW queue here. */ + tail = swq_tail; + if (tail == swq_head) + return 0; /* SW queue empty */ + + slot = tail % SWQ_DEPTH; + copy = swq[slot].len; + if (copy > len) + copy = len; + memcpy(buf, swq[slot].buf, copy); + + /* Recycle the BD back to hardware. */ + idx = swq[slot].ring_idx; + addr = (uint32_t)(uintptr_t)rx_buf_pool[idx]; + addr &= RXBUF_ADDR_MASK; + if (idx == RX_RING_LEN - 1) + addr |= RXBUF_WRAP; + rx_ring[idx].status = 0; + __asm__ volatile ("dsb sy" ::: "memory"); + rx_ring[idx].addr = addr; /* OWN bit cleared = HW can write */ + /* MAC walks BDs from main memory (not coherent with CPU D-cache); + * push the OWN=0 store out so the MAC will reuse this slot. */ + cache_clean(&rx_ring[idx], sizeof(rx_ring[idx])); + + __asm__ volatile ("dsb sy" ::: "memory"); + swq_tail = tail + 1; + + return (int)copy; +} + +static int eth_send(struct wolfIP_ll_dev *ll, void *buf, uint32_t len) +{ + uint32_t idx; + uint32_t status; + + (void)ll; + + if (len > BUF_LEN) + return -1; + + idx = tx_next; + /* Wait briefly for the BD to be free (USED=1 means MAC done). The + * USED bit is written back by MAC DMA - invalidate the cache line + * so the CPU does not see the stale USED=0 we wrote when we last + * armed this BD. 
/* wolfIP send callback: queue one frame on the next TX BD and start the
 * transmitter.
 *
 * ll  - unused.
 * buf - frame to send (copied into the driver's own TX buffer).
 * len - frame length in bytes, without FCS.
 *
 * Returns the number of bytes queued - note that frames shorter than 60
 * bytes are zero-padded, so the return value is then 60, not the
 * caller's len. Returns -1 if len exceeds BUF_LEN, or -2 if the BD is
 * still owned by the MAC after the bounded wait. Completion is not
 * waited for here; it is observed on the next use of the same BD. */
static int eth_send(struct wolfIP_ll_dev *ll, void *buf, uint32_t len)
{
    uint32_t idx;
    uint32_t status;

    (void)ll;

    if (len > BUF_LEN)
        return -1;

    idx = tx_next;
    /* Wait briefly for the BD to be free (USED=1 means MAC done). The
     * USED bit is written back by MAC DMA - invalidate the cache line
     * so the CPU does not see the stale USED=0 we wrote when we last
     * armed this BD. */
    {
        int spin;
        for (spin = 0; spin < 100000; spin++) {
            cache_inval(&tx_ring[idx], sizeof(tx_ring[idx]));
            if (tx_ring[idx].status & TXBUF_USED)
                break;
        }
        if ((tx_ring[idx].status & TXBUF_USED) == 0)
            return -2; /* TX ring backed up - tell caller to retry */
    }

    memcpy(tx_buf_pool[idx], buf, len);

    /* Pad to minimum Ethernet frame (60 bytes; MAC adds 4-byte FCS). */
    if (len < 60u) {
        memset(tx_buf_pool[idx] + len, 0, 60u - len);
        len = 60u;
    }

    /* Flush the frame buffer from D-cache so MAC DMA reads see it. */
    cache_clean(tx_buf_pool[idx], len);

    /* Re-arm BD: set buffer address, then clear USED with length+LAST
     * (preserve WRAP if this is the last BD). Buffer addr written
     * before status so MAC walking the ring sees a valid pair. */
    tx_ring[idx].addr = (uint32_t)(uintptr_t)tx_buf_pool[idx];
    status = (len & TXBUF_LEN_MASK) | TXBUF_LAST;
    if (idx == TX_RING_LEN - 1)
        status |= TXBUF_WRAP;
    tx_ring[idx].status = status; /* USED=0 -> ready for MAC */

    /* Flush BD update so MAC sees USED=0. cache_clean() ends with a
     * DSB, so the BD is in memory before the start bit is written. */
    cache_clean(&tx_ring[idx], sizeof(tx_ring[idx]));
    GEM_NWCTRL |= NWCTRL_STARTTX;

    s_tx_sent++;
    tx_next = (idx + 1) % TX_RING_LEN;
    return (int)len;
}
/* Diagnostics: print a snapshot of GEM3 registers, the first TX/RX BD,
 * the driver's software counters, the MAC's hardware statistics
 * counters, and a summary of RX ring occupancy to the UART.
 *
 * Both BD rings are invalidated first so the values printed are the
 * ones in memory rather than stale cached copies.
 * NOTE(review): this reads GEM_ISR and the statistics registers
 * directly; if any of them are clear-on-read on this MAC, calling this
 * function changes what is observed afterwards - confirm against the
 * TRM. */
void gem_dump_state(void)
{
    uint32_t i;
    cache_inval(rx_ring, sizeof(rx_ring));
    cache_inval(tx_ring, sizeof(tx_ring));
    /* Control / status registers. 0x030 is printed as "IMR"; it has no
     * named macro in this file. */
    uart_puts("GEM3 regs: NWCTRL="); uart_puthex(GEM_NWCTRL);
    uart_puts(" NWCFG="); uart_puthex(GEM_NWCFG);
    uart_puts(" NWSR="); uart_puthex(GEM_NWSR);
    uart_puts(" DMACR="); uart_puthex(GEM_DMACR);
    uart_puts("\n ISR="); uart_puthex(GEM_ISR);
    uart_puts(" RSR="); uart_puthex(GEM_RSR);
    uart_puts(" TSR="); uart_puthex(GEM_TSR);
    uart_puts(" IMR="); uart_puthex(*(volatile uint32_t *)(GEM3_BASE + 0x030));
    /* First BD of each ring, printed as addr/status. */
    uart_puts("\n tx[0]="); uart_puthex(tx_ring[0].addr);
    uart_puts("/"); uart_puthex(tx_ring[0].status);
    uart_puts(" rx[0]="); uart_puthex(rx_ring[0].addr);
    uart_puts("/"); uart_puthex(rx_ring[0].status);
    /* Driver-maintained counters. */
    uart_puts("\n irq="); uart_putdec(s_irq_count);
    uart_puts(" rx_frm="); uart_putdec(s_rx_frames);
    uart_puts(" tx_snt="); uart_putdec(s_tx_sent);
    uart_puts(" drops="); uart_putdec(rx_drops);
    /* MAC statistics registers. */
    uart_puts("\n HW counters: txoct="); uart_putdec(GEM_OCTTXL);
    uart_puts(" txcnt="); uart_putdec(GEM_TXCNT);
    uart_puts(" rxoct="); uart_putdec(GEM_OCTRXL);
    uart_puts(" rxcnt="); uart_putdec(GEM_RXCNT);
    uart_puts(" rxfcs="); uart_putdec(GEM_RXFCSCNT);
    uart_puts(" rxor="); uart_putdec(GEM_RXORCNT);
    {
        /* Count RX BDs currently marked software-owned and remember the
         * lowest such index; 0xFFFF (printed as 65535) means none. */
        uint32_t filled = 0;
        uint32_t first_filled = 0xFFFF;
        for (i = 0; i < RX_RING_LEN; i++) {
            if (rx_ring[i].addr & RXBUF_OWN_SW) {
                filled++;
                if (first_filled == 0xFFFF) first_filled = i;
            }
        }
        uart_puts(" rx_filled="); uart_putdec(filled);
        uart_puts(" first="); uart_putdec(first_filled);
        uart_puts(" rx_next="); uart_putdec(rx_next);
    }
    uart_puts("\n");
}
--------------------------------------------------------------------- + * Clock + reset for GEM3 via CRL_APB. + * + * For the stock ZCU102 boot flow, FSBL has already configured GEM3: + * - CRL_APB.GEM3_REF_CTRL -> 125 MHz from IOPLL or RPLL + * - CRL_APB.RST_LPD_IOU0 -> GEM3 out of reset + * - IOU_SLCR MIO 64..77 -> GEM3 RGMII + MDIO pin muxing + * + * We pulse the GEM3 reset bit so the MAC starts from a known state + * without touching the clock control (which would race with FSBL's + * setup of PLLs). + * ------------------------------------------------------------------- */ +#define CRL_RST_GEM3 (1u << 3) + +/* Configure CRL_APB.GEM3_REF_CTRL for the negotiated link speed. The + * MAC sources TX_CLK to the PHY at this rate (RGMII): 125 MHz for + * 1 Gbps, 25 MHz for 100 Mbps, 2.5 MHz for 10 Mbps. PetaLinux/FSBL + * may pre-program this for a different speed than we want; both + * U-Boot and Linux re-program it whenever PHY link speed changes. + * + * IOPLL = 1500 MHz on ZCU102 (FSBL default). + * 1500 / 12 / 1 = 125 MHz (1000) + * 1500 / 12 / 5 = 25 MHz (100) + * 1500 / 12 / 50 = 2.5 MHz (10) + * + * Register layout (TRM): CLKACT bit26, CLKACT_RX bit25, + * DIVISOR1 bits[21:16], DIVISOR0 bits[13:8], SRCSEL bits[2:0]. 
*/ +static void gem3_set_ref_clk(int speed_mbps) +{ + volatile uint32_t *gem3_ref = (volatile uint32_t *)CRL_APB_GEM3_REF_CTRL; + uint32_t div1; + uint32_t val; + + switch (speed_mbps) { + case 1000: div1 = 1; break; + case 100: div1 = 5; break; + case 10: div1 = 50; break; + default: div1 = 1; break; + } + val = (1u << 26) /* CLKACT */ + | (1u << 25) /* CLKACT_RX */ + | ((div1 & 0x3Fu) << 16) /* DIVISOR1 */ + | ((12u & 0x3Fu) << 8) /* DIVISOR0 */ + | (0u); /* SRCSEL = IOPLL */ + *gem3_ref = val; +} + +static void gem3_hw_reset(void) +{ + volatile uint32_t *rst = (volatile uint32_t *)CRL_APB_RST_LPD_IOU0; + volatile uint32_t *gem3ref = (volatile uint32_t *)CRL_APB_GEM3_REF_CTRL; + + uart_puts("GEM3 clk before: GEM3_REF_CTRL="); + uart_puthex(*gem3ref); + uart_puts(" RST_LPD_IOU0="); + uart_puthex(*rst); + uart_puts("\n"); + + *rst |= CRL_RST_GEM3; + { + volatile int d; + for (d = 0; d < 10000; d++) + ; + } + *rst &= ~CRL_RST_GEM3; + { + volatile int d; + for (d = 0; d < 100000; d++) /* ~10 ms post-reset settle */ + ; + } + + /* Force 125 MHz reference for the 1 Gbps case. zcu102_eth_init() + * downshifts this later if the PHY ends up at 100/10. */ + gem3_set_ref_clk(1000); + uart_puts("GEM3 clk after : GEM3_REF_CTRL="); + uart_puthex(*gem3ref); + uart_puts("\n"); +} + +/* --------------------------------------------------------------------- + * Public init + * ------------------------------------------------------------------- */ +int zcu102_eth_init(struct wolfIP_ll_dev *ll) +{ + uint8_t addr; + uint16_t id1; + int found_phy; + int speed; + int fd; + int link_up; + + gem3_hw_reset(); + + /* Disable everything before configuring. 
/* ---------------------------------------------------------------------
 * Public init
 * ------------------------------------------------------------------- */
/* Bring up GEM3 and the attached PHY and fill in the wolfIP link-layer
 * device.
 *
 * ll - device structure to populate (MAC address, name "eth0", MTU,
 *      poll/send callbacks).
 *
 * Returns:
 *   -10  no PHY answered on any MDIO address 0..31
 *   -11  dp83867_init() failed
 *   >=0  (link_up << 8) | phy_addr, where link_up is 1 if
 *        dp83867_link_status() reported 1 at the end of init.
 *
 * Sequence: reset the MAC, quiesce and clear status, program NWCFG /
 * DMACR / MAC address, build the BD rings and park priority queues 1-3
 * on dummy BDs, clear the screening registers, find and initialise the
 * PHY, adapt MAC speed to the negotiated link, hook the interrupt, and
 * finally enable RX/TX. */
int zcu102_eth_init(struct wolfIP_ll_dev *ll)
{
    uint8_t addr;
    uint16_t id1;
    int found_phy;
    int speed;
    int fd;
    int link_up;

    gem3_hw_reset();

    /* Disable everything before configuring: stop the MAC, mask all
     * interrupts, and clear pending interrupt / TX / RX status. */
    GEM_NWCTRL = 0;
    GEM_IDR = 0xFFFFFFFFu;
    (void)GEM_ISR;
    GEM_ISR = 0xFFFFFFFFu;
    GEM_TSR = 0xFFFFFFFFu;
    GEM_RSR = RSR_BUFFNA | RSR_FRAMERX | RSR_RXOVR | RSR_HRESPNOK;

    /* Initial NWCFG: gigabit, full duplex, MDC=/96, 1536-byte frames,
     * strip FCS from RX, accept broadcasts, multicast via hash,
     * DWIDTH_64 because the ZynqMP GEM sits on a 64-bit AXI bus and
     * needs this bit for TX to actually transmit (matches U-Boot
     * ZYNQ_GEM_DBUS_WIDTH for CONFIG_ARM64).
     * COPYALL temporarily on for first-bring-up so we can confirm
     * the RX path is alive even if filtering is mis-set.
     * NOTE(review): COPYALL is still set here, so the MAC address and
     * hash filters below do not restrict what is received - confirm
     * whether it should be dropped now. */
    GEM_NWCFG = NWCFG_1000
              | NWCFG_FDEN
              | NWCFG_FCSREM
              | NWCFG_1536RXEN
              | NWCFG_MCASTHASHEN
              | NWCFG_COPYALL
              | NWCFG_DWIDTH_64
              | (5u << NWCFG_MDCDIV_SHIFT);

    /* DMACR: AHB fixed burst 16 beats, RX buffer 1536/64=24, TX/RX
     * packet buffer memory at max. Do NOT set bit 30 (DMA_ADDR_BUS_WIDTH
     * 64-bit): that selects 16-byte BD format with addr_hi, which would
     * break the 8-byte struct gem_bd layout (MAC would walk every other
     * BD and write to bogus high addresses, dropping the frame after
     * counting it - exactly the failure mode we hit). 64-bit AXI bus
     * width is set in NWCFG bit 21 instead. */
    GEM_DMACR = (24u << 16) /* RX buffer size in 64-byte units */
              | (1u << 10) /* TX packet buffer memory size = max */
              | (3u << 8) /* RX packet buffer memory size = max */
              | 0x10u; /* burst length = 16 */

    /* Set MAC address into SAB1/SAT1. SAB1L writes are latched on
     * SAB1H write per TRM, so write the high half last. */
    GEM_LADDR1L = (uint32_t)WOLFIP_MAC_0
                | ((uint32_t)WOLFIP_MAC_1 << 8)
                | ((uint32_t)WOLFIP_MAC_2 << 16)
                | ((uint32_t)WOLFIP_MAC_3 << 24);
    GEM_LADDR1H = (uint32_t)WOLFIP_MAC_4
                | ((uint32_t)WOLFIP_MAC_5 << 8);

    /* Empty multicast hash table. */
    GEM_HASHL = 0;
    GEM_HASHH = 0;

    /* Build BD rings and give their addresses to queue 0. Any cache
     * maintenance for rx_ring/tx_ring is the ring-init helpers'
     * responsibility; only the dummy BDs are cleaned in this function. */
    rx_ring_init();
    tx_ring_init();
    GEM_RXQBASE = (uint32_t)(uintptr_t)rx_ring;
    GEM_TXQBASE = (uint32_t)(uintptr_t)tx_ring;

    /* Disable priority queues 1-3 with dummy BDs. Without this, the
     * MAC may walk uninitialised q1/q2/q3 base pointers and hang
     * (TSR.TXGO sticks but no octets transmitted). */
    dummy_tx_bd.addr = 0;
    dummy_tx_bd.status = TXBUF_USED | TXBUF_WRAP | TXBUF_LAST;
    dummy_rx_bd.addr = RXBUF_WRAP | RXBUF_OWN_SW;
    dummy_rx_bd.status = 0;
    GEM_TXQ1BASE = (uint32_t)(uintptr_t)&dummy_tx_bd;
    GEM_TXQ2BASE = (uint32_t)(uintptr_t)&dummy_tx_bd;
    GEM_TXQ3BASE = (uint32_t)(uintptr_t)&dummy_tx_bd;
    GEM_RXQ1BASE = (uint32_t)(uintptr_t)&dummy_rx_bd;
    GEM_RXQ2BASE = (uint32_t)(uintptr_t)&dummy_rx_bd;
    GEM_RXQ3BASE = (uint32_t)(uintptr_t)&dummy_rx_bd;
    /* The MAC is still disabled at this point, so cleaning after the
     * base registers are written is early enough. */
    cache_clean(&dummy_tx_bd, sizeof(dummy_tx_bd));
    cache_clean(&dummy_rx_bd, sizeof(dummy_rx_bd));

    /* Clear any stale RX/TX packet classification screening. ZynqMP
     * GEM has SCREENING_TYPE_1 (TID match) at 0x500+ and SCREENING_TYPE_2
     * (compare) at 0x540+. If non-zero, frames may be routed to non-Q0
     * queues. Default 0 = all to Q0. */
    {
        uint32_t k;
        for (k = 0; k < 16; k++) {
            *(volatile uint32_t *)(GEM3_BASE + 0x500 + 4*k) = 0;
            *(volatile uint32_t *)(GEM3_BASE + 0x540 + 4*k) = 0;
        }
    }

    /* Enable MDIO so we can talk to the PHY. */
    GEM_NWCTRL |= NWCTRL_MDEN;

    /* Probe MDIO addresses 0..31 for a responsive PHY. ZCU102 routes
     * DP83867 to MDIO address 0x0C, but probing makes the driver
     * resilient to board variants. A PHY counts as present when
     * register 2 reads back something other than 0x0000 or 0xFFFF;
     * the first such address wins. */
    found_phy = 0;
    for (addr = 0; addr < 32; addr++) {
        if (gem_mdio_read(addr, 0x02, &id1) == 0 && id1 != 0xFFFFu && id1 != 0) {
            found_phy = 1;
            break;
        }
    }
    if (!found_phy) {
        uart_puts("GEM3: no PHY responding on MDIO!\n");
        return -10;
    }
    phy_addr_used = addr;
    uart_puts("GEM3: PHY at MDIO addr=");
    uart_puthex(phy_addr_used);
    uart_puts("\n");

    if (dp83867_init(phy_addr_used, &speed, &fd) < 0) {
        uart_puts("GEM3: PHY init failed\n");
        return -11;
    }

    /* If PHY ended up at 10/100, downshift the MAC and re-program the
     * GEM3 reference clock to match (125 MHz / 25 MHz / 2.5 MHz).
     * Note: the duplex bit is only revisited inside this branch, so a
     * 1000 Mbps link always keeps FDEN set regardless of fd. */
    if (speed != 1000) {
        uint32_t cfg = GEM_NWCFG;
        cfg &= ~NWCFG_1000;
        if (speed == 100)
            cfg |= NWCFG_SPEED100;
        else
            cfg &= ~NWCFG_SPEED100;
        if (!fd)
            cfg &= ~NWCFG_FDEN;
        GEM_NWCFG = cfg;
        gem3_set_ref_clk(speed);
    }

    /* Install RX ISR (priority value 0xA0). */
    gic_register_handler(IRQ_GEM3, gem_isr);
    gic_enable_spi(IRQ_GEM3, 0xA0);

    /* Enable RX/TX and arm RX-side interrupts. TX completion is not
     * interrupt-driven; eth_send polls the BD instead. */
    GEM_IER = IXR_FRAMERX | IXR_RXUSED | IXR_RXOVR | IXR_HRESPNOK;
    GEM_NWCTRL |= NWCTRL_RXEN | NWCTRL_TXEN;

    /* Populate wolfIP ll_dev. */
    ll->mac[0] = WOLFIP_MAC_0;
    ll->mac[1] = WOLFIP_MAC_1;
    ll->mac[2] = WOLFIP_MAC_2;
    ll->mac[3] = WOLFIP_MAC_3;
    ll->mac[4] = WOLFIP_MAC_4;
    ll->mac[5] = WOLFIP_MAC_5;
    memcpy(ll->ifname, "eth0", 5);
    ll->non_ethernet = 0;
    ll->mtu = LINK_MTU;
    ll->poll = eth_poll;
    ll->send = eth_send;
    ll->priv = NULL;

    /* Pack the result: bit 8 = link state, bits 7:0 = PHY address. */
    link_up = (dp83867_link_status(phy_addr_used) == 1) ? 1 : 0;
    return (link_up << 8) | (int)phy_addr_used;
}
Single-instance, RGMII, gigabit, polled TX, IRQ-driven + * RX. + */ +#ifndef ZCU102_GEM_H +#define ZCU102_GEM_H + +#include +#include "../../../wolfip.h" + +/* Initialize GEM3, MMIO clock + reset, PHY, and populate the wolfIP + * link-layer device. Returns: + * < 0 on error (negated TRM code) + * bits [7:0] PHY MDIO address used + * bit [8] link_up flag (1 = link is up at end of init) + */ +int zcu102_eth_init(struct wolfIP_ll_dev *ll); + +/* MDIO helpers exposed for the PHY driver (phy_dp83867.c). */ +int gem_mdio_read(uint8_t phy_addr, uint8_t reg, uint16_t *out); +int gem_mdio_write(uint8_t phy_addr, uint8_t reg, uint16_t value); + +/* Diagnostics: dump GEM registers and counters to UART. */ +void gem_dump_state(void); +uint32_t gem_irq_count(void); +uint32_t gem_rx_frames(void); +uint32_t gem_tx_sent(void); + +#endif /* ZCU102_GEM_H */ diff --git a/src/port/zcu102/gic.c b/src/port/zcu102/gic.c new file mode 100644 index 00000000..de2582c5 --- /dev/null +++ b/src/port/zcu102/gic.c @@ -0,0 +1,202 @@ +/* gic.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * GIC-400 (ARM GICv2) minimal driver for Cortex-A53 EL3 on ZynqMP. 
+ * Configures all SPIs as Group 0 (IGROUPR bits cleared), level- + * triggered, targeted at CPU0, priority 0xA0. With GICC_CTLR.FIQEn=0 + * a pending Group 0 interrupt is delivered as IRQ, not FIQ. Only + * the SPIs explicitly enabled via gic_enable_spi() will fire. The + * IRQ vector in startup.S funnels into irq_dispatch() here, which + * reads IAR, routes to the registered C handler, and EOIs. + * + * No assumptions about a previous BSP - we initialize the distributor + * and CPU interface from scratch. + */ +#include +#include "board.h" +#include "gic.h" + +/* Distributor registers */ +#define GICD_CTLR (*(volatile uint32_t *)(GICD_BASE + 0x000)) +#define GICD_TYPER (*(volatile uint32_t *)(GICD_BASE + 0x004)) +#define GICD_IGROUPR(n) (*(volatile uint32_t *)(GICD_BASE + 0x080 + 4*(n))) +#define GICD_ISENABLER(n) (*(volatile uint32_t *)(GICD_BASE + 0x100 + 4*(n))) +#define GICD_ICENABLER(n) (*(volatile uint32_t *)(GICD_BASE + 0x180 + 4*(n))) +#define GICD_ISPENDR(n) (*(volatile uint32_t *)(GICD_BASE + 0x200 + 4*(n))) +#define GICD_ICPENDR(n) (*(volatile uint32_t *)(GICD_BASE + 0x280 + 4*(n))) +#define GICD_IPRIORITYR(n) (*(volatile uint32_t *)(GICD_BASE + 0x400 + 4*(n))) +#define GICD_ITARGETSR(n) (*(volatile uint32_t *)(GICD_BASE + 0x800 + 4*(n))) +#define GICD_ICFGR(n) (*(volatile uint32_t *)(GICD_BASE + 0xC00 + 4*(n))) +#define GICD_SGIR (*(volatile uint32_t *)(GICD_BASE + 0xF00)) + +/* CPU interface registers */ +#define GICC_CTLR (*(volatile uint32_t *)(GICC_BASE + 0x000)) +#define GICC_PMR (*(volatile uint32_t *)(GICC_BASE + 0x004)) +#define GICC_BPR (*(volatile uint32_t *)(GICC_BASE + 0x008)) +#define GICC_IAR (*(volatile uint32_t *)(GICC_BASE + 0x00C)) +#define GICC_EOIR (*(volatile uint32_t *)(GICC_BASE + 0x010)) + +#define GIC_NR_LINES 192 /* GIC-400 in ZynqMP supports up to 192 SPIs */ + +static gic_handler_t handlers[GIC_NR_LINES]; +static volatile uint32_t g_irq_total; +static volatile uint32_t g_irq_last_intid; + +void gic_register_handler(uint32_t 
intid, gic_handler_t fn) +{ + if (intid < GIC_NR_LINES) + handlers[intid] = fn; +} + +static void byte_write(volatile uint32_t *reg, uint32_t intid, uint8_t val) +{ + uint32_t shift; + uint32_t v; + shift = (intid & 3u) * 8u; + v = reg[intid >> 2]; + v &= ~(0xFFu << shift); + v |= ((uint32_t)val << shift); + reg[intid >> 2] = v; +} + +void gic_enable_spi(uint32_t intid, uint32_t priority) +{ + /* Set priority (lower number = higher prio). */ + byte_write((volatile uint32_t *)(GICD_BASE + 0x400), + intid, (uint8_t)(priority & 0xF8u)); + /* Target CPU0. */ + byte_write((volatile uint32_t *)(GICD_BASE + 0x800), + intid, 0x01u); + /* Group 0 (Secure) - we run at EL3 Secure, so Group 0 is the + * correct choice. GICC.FIQEn=0 makes Group 0 route to IRQ, which + * is what our vector table handles. */ + GICD_IGROUPR(intid >> 5) &= ~(1u << (intid & 31u)); + /* Level-triggered (ICFGR bits = 0b00 -> level, 0b10 -> edge). */ + { + uint32_t reg; + uint32_t shift; + shift = (intid & 15u) * 2u; + reg = GICD_ICFGR(intid >> 4); + reg &= ~(3u << shift); + GICD_ICFGR(intid >> 4) = reg; + } + /* Clear pending and enable. */ + GICD_ICPENDR(intid >> 5) = (1u << (intid & 31u)); + GICD_ISENABLER(intid >> 5) = (1u << (intid & 31u)); +} + +void gic_disable_spi(uint32_t intid) +{ + GICD_ICENABLER(intid >> 5) = (1u << (intid & 31u)); +} + +void gic_init(void) +{ + uint32_t i; + + /* Disable distributor while we reconfigure. */ + GICD_CTLR = 0; + + /* SGIs and PPIs (INTID 0..31): Group 0 Secure, but leave disabled + * for now - enabling them lit up some pending PPI from CSU/PMU + * that hung wolfIP_init when it occupied the CPU interface. */ + GICD_IGROUPR(0) = 0; + GICD_ICENABLER(0) = 0xFFFFFFFFu; + GICD_ICPENDR(0) = 0xFFFFFFFFu; + /* SPIs (INTID 32+): disable all, mark all as Group 0. */ + for (i = 1; i < (GIC_NR_LINES / 32u); i++) { + GICD_ICENABLER(i) = 0xFFFFFFFFu; + GICD_ICPENDR(i) = 0xFFFFFFFFu; + GICD_IGROUPR(i) = 0; + } + /* SGI/PPI priorities (lower 8 entries cover INTID 0..31). 
*/ + for (i = 0; i < 8u; i++) + GICD_IPRIORITYR(i) = 0xA0A0A0A0u; + for (i = 8u; i < (GIC_NR_LINES / 4u); i++) + GICD_IPRIORITYR(i) = 0xA0A0A0A0u; + for (i = 8u; i < (GIC_NR_LINES / 4u); i++) + GICD_ITARGETSR(i) = 0x01010101u; + for (i = 2u; i < (GIC_NR_LINES / 16u); i++) + GICD_ICFGR(i) = 0; + + /* Enable distributor: both groups (we are at EL3). */ + GICD_CTLR = 0x3u; + + /* CPU interface: priority mask wide open, both groups enabled, + * FIQEn=0 so Group 0 (Secure) interrupts route to nIRQ output + * (per GICv2 IHI 0048B 4.6.4: FIQEn=0 -> nIRQ, FIQEn=1 -> nFIQ). + * AckCtl=1 so Secure reads of GICC_IAR can ack Group 1 too. */ + GICC_PMR = 0xF8u; + GICC_BPR = 0; + GICC_CTLR = 0x07u; /* EnableGrp0 | EnableGrp1 | AckCtl, FIQEn=0 */ +} + +void irq_dispatch(void) +{ + uint32_t iar; + uint32_t intid; + + iar = GICC_IAR; + intid = iar & 0x3FFu; + g_irq_total++; + g_irq_last_intid = intid; + if (intid < GIC_NR_LINES && handlers[intid] != 0) + handlers[intid](); + /* Always EOI to allow next interrupt, even if no handler matched. */ + GICC_EOIR = iar; +} + +uint32_t gic_total_irqs(void) { return g_irq_total; } +uint32_t gic_last_intid(void) { return g_irq_last_intid; } + +uint32_t gic_poll_dispatch(void) +{ + uint32_t n = 0; + uint32_t iar; + uint32_t intid; + + /* Drain up to 8 interrupts per poll to avoid live-locking the + * main loop if a peripheral is hammering us. */ + while (n < 8) { + iar = GICC_IAR; + intid = iar & 0x3FFu; + if (intid >= 1020) /* 1023 spurious / no pending */ + break; + g_irq_total++; + g_irq_last_intid = intid; + if (intid < GIC_NR_LINES && handlers[intid] != 0) + handlers[intid](); + GICC_EOIR = iar; + n++; + } + return n; +} + +uint32_t gic_is_pending(uint32_t intid) +{ + return (GICD_ISPENDR(intid >> 5) >> (intid & 31u)) & 1u; +} + +void gic_self_test_sgi(uint32_t intid) +{ + /* GICD_SGIR: TargetListFilter (bits 25:24) = 10 (self), + * SGIINTID (bits 3:0) = intid. 
*/ + GICD_SGIR = (2u << 24) | (intid & 0xFu); +} diff --git a/src/port/zcu102/gic.h b/src/port/zcu102/gic.h new file mode 100644 index 00000000..2a1eae9e --- /dev/null +++ b/src/port/zcu102/gic.h @@ -0,0 +1,49 @@ +/* gic.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + */ +#ifndef ZCU102_GIC_H +#define ZCU102_GIC_H + +#include + +typedef void (*gic_handler_t)(void); + +void gic_init(void); +void gic_register_handler(uint32_t intid, gic_handler_t fn); +void gic_enable_spi(uint32_t intid, uint32_t priority); +void gic_disable_spi(uint32_t intid); + +/* Returns 1 if interrupt is currently pending at the distributor, + * 0 otherwise. Diagnostic only. */ +uint32_t gic_is_pending(uint32_t intid); + +/* Fire a software-generated interrupt to self (CPU0) for testing. + * intid must be < 16. */ +void gic_self_test_sgi(uint32_t intid); + +/* Total IRQs taken (any intid) and the last intid we saw. */ +uint32_t gic_total_irqs(void); +uint32_t gic_last_intid(void); + +/* Polled-mode IRQ dispatch: drains any pending IRQ from the GIC + * by reading GICC_IAR, calling the registered handler, and EOI'ing. + * Returns the number of interrupts dispatched in this call. + * + * Workaround: on this ZynqMP / Cortex-A53 / GIC-400 combination, + * the GIC latches pending interrupts correctly but the CPU never + * takes the IRQ exception (root cause not pinned). Calling this + * function from the main loop is functionally equivalent. */ +uint32_t gic_poll_dispatch(void); + +/* Provided by startup.S, asm helpers. */ +void irq_enable(void); +void irq_disable(void); + +/* Called by the IRQ vector trampoline in startup.S. Acknowledges, + * dispatches, and EOIs the current interrupt. 
# Boot the wolfIP ZCU102 app via JTAG (Platform Cable II / Digilent).
# Can be invoked from any working directory (paths are resolved relative
# to this script's location). Execute it rather than sourcing it: it
# uses `set -euo pipefail` and `exit`. Assumes a hw_server reachable on
# localhost (the default when Vitis is local).
command -v "${XSDB}" >/dev/null 2>&1 && [[ ! -x "${XSDB}" ]]; then + echo "ERROR: xsdb not found / not executable: ${XSDB}" >&2 + exit 1 +fi +if [[ ! -f "${PSU_INIT_TCL}" ]]; then + echo "ERROR: psu_init.tcl not found at ${PSU_INIT_TCL}" >&2 + exit 1 +fi +if ! command -v "${OBJCOPY}" >/dev/null 2>&1 && [[ ! -x "${OBJCOPY}" ]]; then + echo "ERROR: objcopy not found / not executable: ${OBJCOPY}" >&2 + exit 1 +fi +if [[ ! -f "${APP_ELF}" ]]; then + echo "ERROR: app.elf not found at ${APP_ELF}. Run 'make' first." >&2 + exit 1 +fi + +# Generate flat binary (PT_LOAD segments concatenated by physical addr). +echo "Generating app.bin from app.elf..." +"${OBJCOPY}" -O binary "${APP_ELF}" "${APP_BIN}" + +# If PMU FW is provided, generate its flat binary alongside. xsdb's +# `dow` fails on the PMU MicroBlaze target without a loaded XSA, so +# the TCL loads it via mwr-force from this .bin instead. PMU FW is +# MicroBlaze, so it needs a MicroBlaze-capable objcopy (override +# with MB_OBJCOPY=, defaults to the Vitis-shipped one if present). +if [[ -n "${PMUFW_ELF:-}" ]]; then + if [[ ! -f "${PMUFW_ELF}" ]]; then + echo "ERROR: PMUFW_ELF set but not found: ${PMUFW_ELF}" >&2 + exit 1 + fi + MB_OBJCOPY="${MB_OBJCOPY:-/opt/Xilinx/2025.2/gnu/microblaze/lin/bin/mb-objcopy}" + if ! command -v "${MB_OBJCOPY}" >/dev/null 2>&1 && [[ ! -x "${MB_OBJCOPY}" ]]; then + echo "ERROR: MicroBlaze objcopy not found at ${MB_OBJCOPY}" >&2 + echo " set MB_OBJCOPY=/path/to/mb-objcopy" >&2 + exit 1 + fi + PMUFW_BIN="${PMUFW_BIN:-${PMUFW_ELF%.elf}.bin}" + echo "Generating $(basename "${PMUFW_BIN}") from pmufw.elf (mb-objcopy)..." 
+ "${MB_OBJCOPY}" -O binary "${PMUFW_ELF}" "${PMUFW_BIN}" + export PMUFW_BIN +fi + +echo "JTAG boot ZCU102 wolfIP app" +echo " xsdb : ${XSDB}" +echo " psu_init.tcl : ${PSU_INIT_TCL}" +echo " app.elf : ${APP_ELF}" +echo " app.bin : ${APP_BIN} ($(stat -c%s "${APP_BIN}") bytes)" +echo + +export APP_ELF APP_BIN PSU_INIT_TCL APP_LOAD_ADDR PMUFW_ELF + +"${XSDB}" "${SCRIPT_DIR}/boot.tcl" + +echo +echo "App is running. Watch UART:" +echo " uart-monitor tail ZYNQMP_ZCU102_UART0" diff --git a/src/port/zcu102/jtag/boot.tcl b/src/port/zcu102/jtag/boot.tcl new file mode 100644 index 00000000..cdb16a68 --- /dev/null +++ b/src/port/zcu102/jtag/boot.tcl @@ -0,0 +1,220 @@ +# JTAG load of the wolfIP A53-0 bare-metal app on ZCU102. +# +# Pattern adapted from a known-working ZynqMP JTAG bare-metal loader +# (puf-provision/run.tcl). Key differences from earlier attempts that +# all failed silently: +# 1. Force JTAG bootmode via CSU register write (mwr 0xFF5E0200 0x0100). +# Without this, rst -system leaves the SoC in a state where dow +# eventually fails or the core won't resume. +# 2. Use psu_init.tcl directly (no FSBL stage). FSBL on this board has +# a JTAG-mode park (WFE deep-sleep) that 'con' cannot wake. +# 3. Use mwr -force per word to write the raw binary instead of dow. +# xsdb's dow path on DDR has a cache-flush dance that fails after +# psu_init runs. +# 4. Install a 'b .' bootloop at the A53 default RVBAR (0xFFFF0000) +# so rst -processor is safe and doesn't fly off into garbage. +# 5. After dow, target A53, rst -processor, stop, rwr pc, con. +# +# Env vars (set by jtag/boot.sh): +# APP_BIN path to the raw binary (objcopy -O binary app.elf app.bin) +# APP_ELF path to the ELF (for entry point reading) +# PSU_INIT_TCL path to psu_init.tcl + +set OCM_BASE 0xFFFC0000 +# DDR layout uses 0x10000000 (matches WOLFBOOT_LOAD_ADDRESS in +# wolfBoot's config/examples/zynqmp.config). The jtag/boot.sh script +# exports APP_LOAD_ADDR if set, otherwise defaults to the OCM base. 
# Write a raw binary image into target memory with `mwr -force`, one
# 32-bit little-endian word per call.
#
#   bin_file   path of the flat binary to load
#   base_addr  integer address the first byte of the image is written to
#
# The image is zero-padded up to a multiple of 4 bytes before writing.
# Returns the unpadded file length in bytes.
#
# NOTE: this proc selects the *PSU* target and leaves it selected when
# it returns; a caller that had another target active must re-select it.
proc load_binary {bin_file base_addr} {
    set fh [open $bin_file rb]
    set image [read $fh]
    close $fh
    set nbytes [string length $image]

    # Zero-fill the tail so the image is a whole number of words.
    set tail [expr {$nbytes % 4}]
    if {$tail != 0} {
        append image [string repeat "\x00" [expr {4 - $tail}]]
    }
    set nwords [expr {[string length $image] / 4}]

    puts " loading $nbytes bytes ($nwords words) to [format 0x%08X $base_addr]"

    targets -set -nocase -filter {name =~ "*PSU*"}
    set idx 0
    while {$idx < $nwords} {
        set byte_off [expr {$idx * 4}]
        # `iu` = one unsigned 32-bit little-endian integer at byte_off.
        binary scan $image @${byte_off}iu value
        set dst [format "0x%X" [expr {$base_addr + $byte_off}]]
        set val [format "0x%X" [expr {$value & 0xFFFFFFFF}]]
        mwr -force $dst $val
        # Progress line every 8192 words (not for the very first word).
        if {$idx > 0 && ($idx % 8192) == 0} {
            puts " [expr {$idx * 100 / $nwords}]%..."
        }
        incr idx
    }
    puts " 100% done"
    return $nbytes
}
# Optional PMU firmware load. Keyed on PMUFW_BIN (the flat binary),
# which jtag/boot.sh exports after converting PMUFW_ELF with a
# MicroBlaze objcopy; when it is absent this step is skipped.
if {[info exists env(PMUFW_BIN)]} {
    puts ""
    puts "Loading PMU FW: $env(PMUFW_BIN)"
    # xsdb's `dow` fails on PMU MicroBlaze without a loaded XSA
    # ("Invalid context"). Bypass it by writing the binary via
    # mwr-force to PMU IRAM at 0xFFDC0000 -- same technique we use
    # for the A53 app. The PMU's BootROM hands control to IRAM @
    # 0xFFDC0000 after we deassert PMU reset (psu_init touches PMU
    # via CRL_APB.RST_LPD_TOP which keeps PMU running).
    jtag targets
    targets -set -nocase -filter {name =~ "PMU"}
    stop
    after 200
    load_binary $env(PMUFW_BIN) 0xFFDC0000
    # load_binary switches the active target to *PSU* for its memory
    # writes. Re-select the PMU so that `con` resumes the MicroBlaze
    # halted by `stop` above instead of acting on the PSU target.
    targets -set -nocase -filter {name =~ "PMU"}
    con
    after 1500
    puts "PMU FW running."
}
# 4. Load the wolfIP app.
#
# Both layouts are loaded with mwr-force per-word (load_binary) at
# APP_LOAD_ADDR: the native xsdb `dow` path triggers a cache-flush
# dance that fails after psu_init. `dow` on the ELF would be much
# faster for the DDR layout (APP_LOAD_ADDR below 0xFF000000; anything
# >= 0xFF000000 is OCM/peripheral), but this script does not use it --
# the load below is unconditional. The only address-dependent step is
# the first-word read-back check that follows the load, which runs for
# DDR addresses only.
+if {$APP_LOAD_ADDR < 0xFF000000} { + set fp [open $env(APP_BIN) rb] + set head [read $fp 4] + close $fp + binary scan $head iu expect + set got [mrd -value -force [format 0x%X $APP_LOAD_ADDR]] + puts [format " verify: image\[0\]=0x%08X mem\[0\]=0x%08X %s" \ + $expect $got [expr {$expect == $got ? "OK" : "MISMATCH (known JTAG-DDR issue)"}]] +} + +# ---------------------------------------------------------------------- +# 5. Install RVBAR boot loop in OCM so rst -processor doesn't crash. +# ---------------------------------------------------------------------- +puts "" +puts "Installing RVBAR boot loop at 0xFFFF0000..." +targets -set -nocase -filter {name =~ "*PSU*"} +mwr -force 0xFFFF0000 0x14000000 ;# B . (branch to self, aarch64) +mwr -force 0xFFFF0004 0x14000000 + +# ---------------------------------------------------------------------- +# 6. A53 #0: reset, halt, set PC, continue. +# ---------------------------------------------------------------------- +puts "" +puts "Preparing A53 #0..." +targets -set -nocase -filter {name =~ "*A53*#0"} +rst -processor +after 200 +catch {stop} +after 200 +puts "PC after rst -processor (should be RVBAR 0xFFFF0000): [rrd pc]" + +set readelf [expr {[info exists env(READELF)] ? $env(READELF) : "aarch64-none-elf-readelf"}] +set entry [exec $readelf -h $env(APP_ELF) | grep "Entry point" | awk "{print \$NF}"] +puts "App ELF entry: $entry" +rwr pc $entry +puts "PC after rwr: [rrd pc]" + +puts "" +puts "con..." +con + +puts "Detaching, leaving app running." +disconnect +exit diff --git a/src/port/zcu102/jtag/boot_iter.sh b/src/port/zcu102/jtag/boot_iter.sh new file mode 100755 index 00000000..68d73194 --- /dev/null +++ b/src/port/zcu102/jtag/boot_iter.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# +# JTAG iteration helper: power-cycles the ZCU102, restarts hw_server, +# clears the UART log, JTAG-loads the app, and dumps the resulting +# UART output. Useful for headless iteration without physical access +# to the board. 
+# +# Everything that touches your specific bench is parameterised through +# env vars. Defaults are no-ops so you must set them per developer. +# +# Required env (in addition to whatever boot.sh requires): +# POWER_OFF_CMD - shell command to power the board OFF (e.g. +# "ssh pi@Pi4 'raspi-gpio set 20 op dl'") +# POWER_ON_CMD - shell command to power the board ON (e.g. +# "ssh pi@Pi4 'raspi-gpio set 20 op dh'") +# HW_SERVER - path to the Vitis hw_server binary +# (e.g. /opt/Xilinx/2025.2/Vitis/bin/hw_server) +# UART_LABEL - uart-monitor board label for the ZCU102 USB-UART +# (e.g. ZYNQMP_ZCU102_UART0) +# +# Optional env: +# OFF_DELAY - seconds to hold OFF before ON (default 4) +# BOOT_DELAY - seconds to wait after ON before JTAG (default 10) +# POST_DELAY - seconds to wait after boot.sh before dumping +# UART (default 5) +# UART_LOG - path to the live log file +# (default /tmp/uart-monitor/latest/$UART_LABEL.log) +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PORT_DIR="$(dirname "${SCRIPT_DIR}")" + +: "${POWER_OFF_CMD:?POWER_OFF_CMD is required (shell cmd that powers the board off)}" +: "${POWER_ON_CMD:?POWER_ON_CMD is required (shell cmd that powers the board on)}" +: "${HW_SERVER:?HW_SERVER is required (path to Vitis hw_server)}" +: "${UART_LABEL:?UART_LABEL is required (uart-monitor board label)}" +OFF_DELAY="${OFF_DELAY:-4}" +BOOT_DELAY="${BOOT_DELAY:-10}" +POST_DELAY="${POST_DELAY:-5}" +UART_LOG="${UART_LOG:-/tmp/uart-monitor/latest/${UART_LABEL}.log}" + +echo "=== Power cycle (POWER_OFF_CMD / POWER_ON_CMD) ===" +eval "${POWER_OFF_CMD}" +sleep "${OFF_DELAY}" +eval "${POWER_ON_CMD}" +echo "Powered on, waiting ${BOOT_DELAY}s for CSU bootROM..." 
+sleep "${BOOT_DELAY}" + +echo +echo "=== Restart hw_server (clears stale JTAG state) ===" +pkill -f hw_server || true +sleep 1 +"${HW_SERVER}" -d >/dev/null 2>&1 & +sleep 3 + +echo +echo "=== Clear UART log (${UART_LABEL}) ===" +uart-monitor clear "${UART_LABEL}" + +echo +echo "=== JTAG boot FSBL + app ===" +"${SCRIPT_DIR}/boot.sh" + +echo +echo "=== Waiting ${POST_DELAY}s for app to produce output ===" +sleep "${POST_DELAY}" + +echo +echo "=== UART output (${UART_LOG}) ===" +cat "${UART_LOG}" diff --git a/src/port/zcu102/main.c b/src/port/zcu102/main.c new file mode 100644 index 00000000..0efb7455 --- /dev/null +++ b/src/port/zcu102/main.c @@ -0,0 +1,321 @@ +/* main.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * wolfIP UDP echo + DHCP client demo for Xilinx ZCU102 (UltraScale+ + * MPSoC, A53-0 EL3 bare-metal, GEM3 -> on-board RJ45). 
/* Override newlib memset/memcpy with plain bytewise versions via
 * linker --wrap. The aarch64 newlib memset uses 'dc zva' which hangs
 * on this Cortex-A53 setup (DZE=1 in SCTLR_EL3 doesn't help; the
 * instruction wedges before completing). The Makefile passes
 * -Wl,--wrap=memset -Wl,--wrap=memcpy so all calls get redirected
 * to these __wrap_ functions.
 *
 * The loops go through volatile-qualified pointers on purpose: GCC's
 * loop-idiom recognition (-ftree-loop-distribute-patterns, normally
 * active from -O2 up) may otherwise replace a plain byte loop with a
 * call to memset/memcpy. Under --wrap that call resolves right back
 * to the function below, i.e. unbounded recursion. Volatile accesses
 * must be emitted one by one, which rules that transformation out
 * regardless of the optimisation level in use. */

/* Fill the first `n` bytes at `s` with (unsigned char)`c`; returns `s`.
 * `n` == 0 writes nothing. */
void *__wrap_memset(void *s, int c, unsigned long n)
{
    volatile unsigned char *p = (volatile unsigned char *)s;
    const unsigned char fill = (unsigned char)c;

    while (n--)
        *p++ = fill;
    return s;
}

/* Copy `n` bytes from `src` to `dest` in ascending address order;
 * returns `dest`. As with the standard memcpy, the regions are not
 * expected to overlap. */
void *__wrap_memcpy(void *dest, const void *src, unsigned long n)
{
    volatile unsigned char *d = (volatile unsigned char *)dest;
    const volatile unsigned char *s = (const volatile unsigned char *)src;

    while (n--)
        *d++ = *s++;
    return dest;
}
/* Print a 64-bit value as two 32-bit hex words (high word first)
 * followed by a newline. */
static void report_hex64(uint64_t value)
{
    uart_puthex((uint32_t)(value >> 32));
    uart_puthex((uint32_t)value);
    uart_puts("\n");
}

/* Dump the EL3 syndrome registers for a synchronous exception to the
 * UART: ESR (plus its decoded exception class, bits [31:26]), ELR, FAR
 * and the low 32 bits of SPSR. Only reports; what happens after the
 * return is up to the caller in startup.S.
 *
 * The EC legend follows the Armv8-A ESR_ELx encoding: 0x21 instruction
 * abort and 0x25 data abort (both taken without a change of exception
 * level), 0x22 PC alignment fault, 0x26 SP alignment fault. (0x24 is a
 * data abort from a lower exception level, not an alignment fault.) */
void exception_report(uint64_t esr, uint64_t elr, uint64_t far, uint64_t spsr)
{
    uart_puts("\n\n*** EL3 SYNC EXCEPTION ***\n");
    uart_puts(" ESR_EL3 : ");
    report_hex64(esr);
    uart_puts(" EC = ");
    uart_puthex((uint32_t)((esr >> 26) & 0x3F));
    uart_puts(" (0x21=instr abort, 0x25=data abort, 0x22=PC align, 0x26=SP align)\n");
    uart_puts(" ELR_EL3 : ");
    report_hex64(elr);
    uart_puts(" FAR_EL3 : ");
    report_hex64(far);
    uart_puts(" SPSR_EL3: ");
    uart_puthex((uint32_t)spsr);
    uart_puts("\n");
}

/* Same register dump for the SError / FIQ path. `kind` is accepted for
 * the caller's convenience but is not printed. */
void exception_report_serror(uint64_t esr, uint64_t elr, uint64_t far,
                             uint64_t spsr, uint64_t kind)
{
    (void)kind;
    uart_puts("\n\n*** EL3 SError / FIQ ***\n");
    uart_puts(" ESR_EL3 : ");
    report_hex64(esr);
    uart_puts(" ELR_EL3 : ");
    report_hex64(elr);
    uart_puts(" FAR_EL3 : ");
    report_hex64(far);
    uart_puts(" SPSR_EL3: ");
    uart_puthex((uint32_t)spsr);
    uart_puts("\n");
}
*/ +extern uint32_t zcu102_get_random32(void); + +uint32_t wolfIP_getrandom(void) +{ + return zcu102_get_random32(); +} + +static void udp_echo_cb(int fd, uint16_t event, void *arg) +{ + struct wolfIP *s = (struct wolfIP *)arg; + struct wolfIP_sockaddr_in peer; + uint32_t peer_len = sizeof(peer); + int n; + + if (!(event & CB_EVENT_READABLE)) + return; + + n = wolfIP_sock_recvfrom(s, fd, udp_rx_buf, sizeof(udp_rx_buf), 0, + (struct wolfIP_sockaddr *)&peer, &peer_len); + if (n > 0) { + (void)wolfIP_sock_sendto(s, fd, udp_rx_buf, (uint32_t)n, 0, + (struct wolfIP_sockaddr *)&peer, peer_len); + uart_puts("UDP echo: "); uart_putdec((uint32_t)n); + uart_puts(" bytes from "); uart_putip4(peer.sin_addr.s_addr); + uart_puts("\n"); + } +} + +int main(void) +{ + struct wolfIP_ll_dev *ll; + struct wolfIP_sockaddr_in addr; + uint64_t tick = 0; + int ret; + + uart_init(); + uart_puts("\n\n=== wolfIP ZCU102 (UltraScale+ A53-0 EL3) ===\n"); + uart_puts("MMU on, caches on. Bringing up GIC-400...\n"); + + gic_init(); + + uart_puts("Initializing wolfIP stack...\n"); + wolfIP_init_static(&IPStack); + + uart_puts("Bringing up GEM3 (RGMII, DP83867)...\n"); + ll = wolfIP_getdev(IPStack); + ret = zcu102_eth_init(ll); + if (ret < 0) { + uart_puts("ERROR: zcu102_eth_init failed: "); + uart_puthex((uint32_t)ret); + uart_puts("\n"); + while (1) + ; + } + uart_puts(" link "); uart_puts((ret & 0x100) ? "UP" : "DOWN"); + uart_puts(", PHY="); uart_puthex((uint32_t)(ret & 0xFF)); + uart_puts("\n"); + + /* Unmask IRQ at CPU now that GEM3 SPI is enabled at GICD. The CPU + * IRQ exception is currently not delivered on this A53/EL3 setup + * (open issue - see README); eth_poll() drives gem_isr() from the + * main loop instead. The SGI self-test instrumentation below is + * left in place behind DEBUG_GIC for further investigation. */ + irq_enable(); +#ifdef DEBUG_GIC + uart_puts("IRQ enabled. 
Self-test: firing SGI 0...\n"); + { + uint32_t before = gic_total_irqs(); + uint64_t daif, scr, vbar; + __asm__ volatile ("mrs %0, daif" : "=r"(daif)); + __asm__ volatile ("mrs %0, scr_el3" : "=r"(scr)); + __asm__ volatile ("mrs %0, vbar_el3" : "=r"(vbar)); + uart_puts(" pre: DAIF="); uart_puthex((uint32_t)daif); + uart_puts(" SCR_EL3="); uart_puthex((uint32_t)scr); + uart_puts(" VBAR_EL3="); uart_puthex((uint32_t)vbar); + uart_puts("\n"); + { + uint32_t vec_irq_curr_spx; + vec_irq_curr_spx = *(volatile uint32_t *)(vbar + 0x280); + uart_puts(" vec[Cur SPx IRQ] @ "); + uart_puthex((uint32_t)(vbar + 0x280)); + uart_puts(" = "); + uart_puthex(vec_irq_curr_spx); + uart_puts(" (B opcode: top byte 0x14 expected)\n"); + } + uart_puts(" GICD_CTLR="); uart_puthex(*(volatile uint32_t *)(GICD_BASE + 0x000)); + uart_puts(" GICD_ISENABLER(0)="); uart_puthex(*(volatile uint32_t *)(GICD_BASE + 0x100)); + uart_puts(" GICD_IGROUPR(0)="); uart_puthex(*(volatile uint32_t *)(GICD_BASE + 0x080)); + uart_puts("\n"); + uart_puts(" GICC_CTLR="); uart_puthex(*(volatile uint32_t *)(GICC_BASE + 0x000)); + uart_puts(" GICC_PMR="); uart_puthex(*(volatile uint32_t *)(GICC_BASE + 0x004)); + uart_puts("\n"); + gic_self_test_sgi(0); + delay_ms(10); + { + uint64_t isr, rpr; + __asm__ volatile ("mrs %0, isr_el1" : "=r"(isr)); + rpr = *(volatile uint32_t *)(GICC_BASE + 0x014); + uart_puts(" post-SGI: ISR_EL1="); + uart_puthex((uint32_t)isr); + uart_puts(" (bit7=I, bit6=F, bit8=A)\n"); + uart_puts(" GICC_RPR="); uart_puthex((uint32_t)rpr); + uart_puts(" (running priority; 0xFF=idle)\n"); + } + uart_puts(" SGI fired. 
gic_total_irqs: "); + uart_putdec(before); + uart_puts(" -> "); + uart_putdec(gic_total_irqs()); + uart_puts(" last_intid="); + uart_puthex(gic_last_intid()); + uart_puts("\n GICD_ISPENDR(0)="); uart_puthex(*(volatile uint32_t *)(GICD_BASE + 0x200)); + uart_puts(" GICC_HPPIR="); uart_puthex(*(volatile uint32_t *)(GICC_BASE + 0x018)); + uart_puts("\n"); + { + uint32_t iar = *(volatile uint32_t *)(GICC_BASE + 0x00C); + uart_puts(" polled GICC_IAR="); uart_puthex(iar); + uart_puts("\n"); + if ((iar & 0x3FF) != 0x3FF) { + *(volatile uint32_t *)(GICC_BASE + 0x010) = iar; + uart_puts(" EOI'd. polled GICC_HPPIR after="); + uart_puthex(*(volatile uint32_t *)(GICC_BASE + 0x018)); + uart_puts("\n"); + } + } + /* Extra system-register snapshot. FSBL/ATF sometimes leaves + * HCR_EL2 / MDCR_EL3 / OSLAR_EL1 with bits set that affect + * exception routing or debug halt; dump them so we can rule + * those out. NOTE: WFI wake test was tried here and hangs + * the CPU even though ISR_EL1.I=1 was observed earlier - the + * GIC appears to assert and deassert nIRQ within a few cycles + * rather than holding it level until ACK. That is consistent + * with edge-triggered SGI behavior but is not what the spec + * requires; it leaves no time for the exception logic to + * latch the event. 
*/ + { + uint64_t hcr, mdcr, sctlr, oslsr; + __asm__ volatile ("mrs %0, hcr_el2" : "=r"(hcr)); + __asm__ volatile ("mrs %0, mdcr_el3" : "=r"(mdcr)); + __asm__ volatile ("mrs %0, sctlr_el3" : "=r"(sctlr)); + __asm__ volatile ("mrs %0, oslsr_el1" : "=r"(oslsr)); + uart_puts(" HCR_EL2="); uart_puthex((uint32_t)hcr); + uart_puts(" MDCR_EL3="); uart_puthex((uint32_t)mdcr); + uart_puts("\n SCTLR_EL3="); uart_puthex((uint32_t)sctlr); + uart_puts(" OSLSR_EL1="); uart_puthex((uint32_t)oslsr); + uart_puts("\n"); + } + } +#endif +#ifdef DEBUG_GEM + uart_puts("Initial GEM state:\n"); + gem_dump_state(); +#endif + +#ifdef DHCP + if (dhcp_client_init(IPStack) >= 0) { + uint32_t dhcp_elapsed = 0; + const uint32_t dhcp_timeout = 15000; + uart_puts("Starting DHCP client...\n"); + while (!dhcp_bound(IPStack) && dhcp_client_is_running(IPStack) + && dhcp_elapsed < dhcp_timeout) { + (void)wolfIP_poll(IPStack, tick); + tick++; + delay_ms(1); + dhcp_elapsed++; + /* gic_poll_dispatch removed - eth_poll already polls + * GEM_ISR directly. Doubling up here just spins. */ +#ifdef DEBUG_GEM + if ((dhcp_elapsed % 1000) == 0) { + uart_puts(" ["); uart_putdec(dhcp_elapsed); + uart_puts(" ms] bound="); + uart_putdec(dhcp_bound(IPStack) ? 1u : 0u); + uart_puts(" running="); + uart_putdec(dhcp_client_is_running(IPStack) ? 
1u : 0u); + uart_puts("\n"); + gem_dump_state(); + } +#endif + } + if (dhcp_bound(IPStack)) { + ip4 ip = 0, nm = 0, gw = 0; + wolfIP_ipconfig_get(IPStack, &ip, &nm, &gw); + uart_puts("DHCP bound:\n IP: "); uart_putip4(ip); + uart_puts("\n Mask: "); uart_putip4(nm); + uart_puts("\n GW: "); uart_putip4(gw); + uart_puts("\n"); + } else { + ip4 ip = atoip4(WOLFIP_IP); + ip4 nm = atoip4(WOLFIP_NETMASK); + ip4 gw = atoip4(WOLFIP_GW); + uart_puts("DHCP timeout - using static IP\n"); + wolfIP_ipconfig_set(IPStack, ip, nm, gw); + } + } +#else + { + ip4 ip = atoip4(WOLFIP_IP); + ip4 nm = atoip4(WOLFIP_NETMASK); + ip4 gw = atoip4(WOLFIP_GW); + wolfIP_ipconfig_set(IPStack, ip, nm, gw); + uart_puts("Static IP: "); uart_putip4(ip); uart_puts("\n"); + } +#endif + + uart_puts("Opening UDP echo socket on port "); + uart_putdec(ECHO_PORT); uart_puts("\n"); + udp_fd = wolfIP_sock_socket(IPStack, AF_INET, IPSTACK_SOCK_DGRAM, 0); + wolfIP_register_callback(IPStack, udp_fd, udp_echo_cb, IPStack); + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_port = ee16(ECHO_PORT); + addr.sin_addr.s_addr = 0; + (void)wolfIP_sock_bind(IPStack, udp_fd, + (struct wolfIP_sockaddr *)&addr, sizeof(addr)); + + uart_puts("Ready. Try: nc -u 7\n\n"); + + for (;;) { + (void)wolfIP_poll(IPStack, tick++); + delay_ms(1); + } + + return 0; +} diff --git a/src/port/zcu102/mmu.c b/src/port/zcu102/mmu.c new file mode 100644 index 00000000..d8eaf7e2 --- /dev/null +++ b/src/port/zcu102/mmu.c @@ -0,0 +1,227 @@ +/* mmu.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. 
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * Minimal MMU bring-up for A53 at EL3 with a 32-bit virtual address
+ * space (T0SZ=32, start level L1). Static tables map the full 4 GB VA
+ * range:
+ *
+ *   L1[0] -> L2_DDR (0x00000000 .. 0x3FFFFFFF, 1 GB, 2 MB granular)
+ *   L1[1] -> 0x40000000 .. 0x7FFFFFFF Normal WB IS (1 GB block)
+ *   L1[2] -> invalid (0x80000000 .. 0xBFFFFFFF)
+ *   L1[3] -> L2_PERIPH (0xC0000000 .. 0xFFFFFFFF, 1 GB, 2 MB granular)
+ *
+ * L2_DDR has a Normal-NC carve-out for any 2 MB block overlapping the
+ * linker's [_dma_buffers_start, _dma_buffers_end) range. In the current
+ * OCM-only layout the .dma_buffers section lives in OCM (mapped via
+ * L2_PERIPH[511] Normal-WB), so this carve-out is effectively dormant -
+ * GEM DMA coherency is handled with explicit DC CVAC / IVAC ops in
+ * gem.c. The carve-out remains in the tables so a future DDR-resident
+ * layout works without an MMU change.
+ *
+ * L2_PERIPH covers the PS peripheral aperture as Device-nGnRnE except
+ * entry 511 (0xFFE00000..0xFFFFFFFF) which is Normal-WB executable so
+ * code can be fetched from OCM (0xFFFC0000..0xFFFFFFFF) after the MMU
+ * is enabled.
+ *
+ * MAIR_EL3:
+ *   ATTR0 = 0xFF (Normal Inner+Outer WB Cacheable, Read+Write alloc)
+ *   ATTR1 = 0x00 (Device-nGnRnE - PS peripherals)
+ *   ATTR2 = 0x44 (Normal Inner+Outer Non-Cacheable - reserved for a
+ *           future DDR DMA carve-out)
+ *
+ * Block descriptor low attributes:
+ *   bits[1:0] = 0b01 block
+ *   bits[4:2] = AttrIndx (3-bit MAIR index, see DESC_ATTR)
+ *   bits[7:6] = AP = 0 (RW at EL3)
+ *   bits[9:8] = SH = 0b11 inner-shareable (only meaningful for Normal)
+ *   bit[10] = AF = 1
+ *   bit[54] = UXN/XN = 1 for Device, 0 for Normal RX
+ */
+#include <stdint.h>
+#include "mmu.h"
+
+extern uint8_t _dma_buffers_start[];
+extern uint8_t _dma_buffers_end[];
+
+/* L1 uses 4 of its 512 slots (one per GB in our 4 GB VA); the rest are
+ * zeroed. Section attribute keeps it in the dedicated .page_tables area
+ * so it sits at a linker-chosen address that the table walker reads. */
+static volatile uint64_t L1[512]
+    __attribute__((aligned(4096), section(".page_tables")));
+static volatile uint64_t L2_DDR[512]
+    __attribute__((aligned(4096), section(".page_tables")));
+/* L2 for the 3-4 GB region. Most blocks are Device (PS peripherals)
+ * but the 2 MB block at 0xFFE00000 - 0xFFFFFFFF must be Normal+exec
+ * because OCM (0xFFFC0000-0xFFFFFFFF) lives there and our code runs
+ * from OCM. */
+static volatile uint64_t L2_PERIPH[512]
+    __attribute__((aligned(4096), section(".page_tables")));
+
+#define DESC_VALID (1ULL << 0)
+#define DESC_TABLE (1ULL << 1)
+#define DESC_BLOCK (0ULL << 1)
+#define DESC_AF (1ULL << 10)
+#define DESC_SH_INNER (3ULL << 8)
+#define DESC_AP_RW_EL3 (0ULL << 6)
+#define DESC_XN (1ULL << 54)
+#define DESC_ATTR(i) (((uint64_t)(i) & 7ULL) << 2)
+
+#define ATTR_NORMAL 0 /* AttrIndx 0 = MAIR ATTR0 (Normal WB) */
+#define ATTR_DEVICE 1 /* AttrIndx 1 = MAIR ATTR1 (Device) */
+#define ATTR_NORMAL_NC 2 /* AttrIndx 2 = MAIR ATTR2 (Normal NC) */
+
+#define BLOCK_NORMAL(pa) \
+    (((uint64_t)(pa)) | DESC_BLOCK | DESC_VALID | DESC_AF | \
+     DESC_SH_INNER | DESC_AP_RW_EL3 | DESC_ATTR(ATTR_NORMAL))
+
+#define BLOCK_DEVICE(pa) \
+    (((uint64_t)(pa)) | DESC_BLOCK | DESC_VALID | DESC_AF | \
+     DESC_AP_RW_EL3 | DESC_ATTR(ATTR_DEVICE) | DESC_XN)
+
+#define BLOCK_NORMAL_NC(pa) \
+    (((uint64_t)(pa)) | DESC_BLOCK | DESC_VALID | DESC_AF | \
+     DESC_SH_INNER | DESC_AP_RW_EL3 | DESC_ATTR(ATTR_NORMAL_NC) | DESC_XN)
+
+#define TABLE_DESC(pa) \
+    (((uint64_t)(pa)) | DESC_TABLE | DESC_VALID)
+
+#define L2_BLOCK_SIZE (2ULL * 1024 * 1024) /* 2 MB */
+#define L1_BLOCK_SIZE (1024ULL * 1024 * 1024) /* 1 GB */
+
+static void mmu_build_tables(void) /* fills L2_DDR, L2_PERIPH and L1; called from mmu_enable() */
+{
+    uint64_t addr;
+    uint64_t dma_lo;
+    uint64_t dma_hi;
+    int i;
+
+    /* L2_DDR: 512 entries covering 0..1 GB at 2 MB each. */
+    dma_lo = (uint64_t)(uintptr_t)_dma_buffers_start;
+    dma_hi = (uint64_t)(uintptr_t)_dma_buffers_end;
+#ifdef ZCU102_LAYOUT_DDR
+    /* DDR layout: the app and its dma_buffers both live in DDR. We
+     * keep all of DDR as Normal-WB cacheable and rely on the explicit
+     * DC CVAC / IVAC ops in gem.c for coherency. Disabling the NC
+     * carve-out means the stack (which shares a 2 MB block with the
+     * dma_buffers in the DDR linker layout) stays cacheable. */
+    (void)dma_lo; (void)dma_hi;
+#endif
+
+    for (i = 0; i < 512; i++) {
+        addr = (uint64_t)i * L2_BLOCK_SIZE;
+#ifdef ZCU102_LAYOUT_DDR
+        L2_DDR[i] = BLOCK_NORMAL(addr);
+#else
+        if ((addr + L2_BLOCK_SIZE) <= dma_lo || addr >= dma_hi) { /* block lies wholly outside the DMA range */
+            L2_DDR[i] = BLOCK_NORMAL(addr);
+        } else {
+            /* Any 2 MB block that overlaps the DMA region is Normal-NC
+             * so newlib's memcpy/memset (LDP/STP, unaligned tails) does
+             * not alignment-fault when staging frames into tx_buf_pool. */
+            L2_DDR[i] = BLOCK_NORMAL_NC(addr);
+        }
+#endif /* ZCU102_LAYOUT_DDR */
+    }
+
+    /* L2_PERIPH: 3..4 GB range. All Device-nGnRnE except the last
+     * 2 MB block which contains OCM (0xFFFC0000..0xFFFFFFFF) and
+     * must be Normal+executable so we can fetch our code from OCM. */
+    for (i = 0; i < 511; i++) {
+        addr = 3ULL * L1_BLOCK_SIZE + (uint64_t)i * L2_BLOCK_SIZE;
+        L2_PERIPH[i] = BLOCK_DEVICE(addr);
+    }
+    /* Entry 511 covers 0xFFE00000..0xFFFFFFFF. OCM is at 0xFFFC0000+,
+     * within this 2 MB block. Map as Normal WB cacheable, executable. */
+    L2_PERIPH[511] = BLOCK_NORMAL(3ULL * L1_BLOCK_SIZE
+                                  + 511ULL * L2_BLOCK_SIZE);
+
+    /* L1 entries. */
+    L1[0] = TABLE_DESC((uintptr_t)L2_DDR);
+    L1[1] = BLOCK_NORMAL(L1_BLOCK_SIZE); /* 1..2 GB DDR */
+    L1[2] = 0; /* 2..3 GB unused */
+    L1[3] = TABLE_DESC((uintptr_t)L2_PERIPH); /* 3..4 GB peri + OCM */
+
+    for (i = 4; i < 512; i++) /* slots beyond the 4 GB VA stay invalid */
+        L1[i] = 0;
+}
+
+void mmu_enable(void) /* builds the tables, programs MAIR/TCR/TTBR0_EL3, then sets SCTLR_EL3.M/C/I */
+{
+    uint64_t mair;
+    uint64_t tcr;
+    uint64_t sctlr;
+
+    mmu_build_tables();
+
+    /* Make sure the table writes are visible to the table walker
+     * before we point TTBR at them. We are still running with the
+     * D-cache off here, so a DSB SY is sufficient. */
+    __asm__ volatile ("dsb sy" ::: "memory");
+
+    /* MAIR_EL3:
+     * ATTR0 = 0xFF (Normal WB Inner+Outer Cacheable)
+     * ATTR1 = 0x00 (Device-nGnRnE)
+     * ATTR2 = 0x44 (Normal Inner+Outer Non-Cacheable, for DMA buffers) */
+    mair = (0xFFULL << 0) | (0x00ULL << 8) | (0x44ULL << 16);
+    __asm__ volatile ("msr mair_el3, %0" :: "r"(mair));
+
+    /* TCR_EL3: 32-bit VA (T0SZ=32, start level L1), 4 KB granule,
+     * IRGN0=WB-RA-WA, ORGN0=WB-RA-WA, SH0=Inner shareable, PS=40 bit.
+     * EL3 TCR has T0SZ at bits [5:0], IRGN0[9:8], ORGN0[11:10],
+     * SH0[13:12], TG0[15:14], PS[18:16], TBI[20], RES1 at bit 23,31.
+     */
+    tcr = (uint64_t)32 /* T0SZ = 32 -> 4 GB VA */
+        | ((uint64_t)1 << 8) /* IRGN0 = WB RA-WA */
+        | ((uint64_t)1 << 10) /* ORGN0 = WB RA-WA */
+        | ((uint64_t)3 << 12) /* SH0 = Inner shareable */
+        | ((uint64_t)0 << 14) /* TG0 = 4 KB */
+        | ((uint64_t)2 << 16) /* PS = 40 bit PA */
+        | ((uint64_t)1 << 23) /* RES1 */
+        | ((uint64_t)1 << 31); /* RES1 */
+    __asm__ volatile ("msr tcr_el3, %0" :: "r"(tcr));
+
+    /* TTBR0_EL3 = &L1. */
+    __asm__ volatile ("msr ttbr0_el3, %0" :: "r"((uint64_t)(uintptr_t)L1));
+
+    __asm__ volatile ("isb" ::: "memory");
+
+    /* Invalidate TLBs and I-cache before turning the MMU on. */
+    __asm__ volatile ("tlbi alle3" ::: "memory");
+    __asm__ volatile ("ic iallu" ::: "memory");
+    __asm__ volatile ("dsb sy" ::: "memory");
+    __asm__ volatile ("isb" ::: "memory");
+
+    /* Enable MMU + I-cache + D-cache. Cache coherency with GEM DMA
+     * is handled with explicit DC CVAC / DC IVAC ops in eth_send and
+     * eth_poll (see gem.c cache_*() helpers).
+     *
+     * DZE bit 14 = enable DC ZVA at EL0/EL1 (and EL3 since we are
+     * here). Newlib aarch64 memset uses DC ZVA for fast bulk zero
+     * writes; without DZE=1 the instruction traps UNDEF and the
+     * exception loop wedges the CPU. */
+    __asm__ volatile ("mrs %0, sctlr_el3" : "=r"(sctlr));
+    sctlr |= (1ULL << 0); /* M */
+    sctlr |= (1ULL << 2); /* C */
+    sctlr |= (1ULL << 12); /* I */
+    sctlr |= (1ULL << 14); /* DZE - NOTE(review): bit 14 looks RES0 in SCTLR_EL3; confirm */
+    sctlr &= ~(1ULL << 1); /* A off */
+    __asm__ volatile ("msr sctlr_el3, %0" :: "r"(sctlr));
+    __asm__ volatile ("isb" ::: "memory");
+}
diff --git a/src/port/zcu102/mmu.h b/src/port/zcu102/mmu.h
new file mode 100644
index 00000000..90ab7475
--- /dev/null
+++ b/src/port/zcu102/mmu.h
@@ -0,0 +1,12 @@
+/* mmu.h
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ */
+#ifndef ZCU102_MMU_H
+#define ZCU102_MMU_H
+
+void mmu_enable(void);
+
+#endif /* ZCU102_MMU_H */
diff --git a/src/port/zcu102/phy_dp83867.c b/src/port/zcu102/phy_dp83867.c
new file mode 100644
index 00000000..987c239c
--- /dev/null
+++ b/src/port/zcu102/phy_dp83867.c
@@ -0,0 +1,338 @@
+/* phy_dp83867.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * wolfIP is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * TI DP83867IR PHY init for the ZCU102 (PHY on RGMII to PS-GEM3).
+ *
+ * The DP83867 needs explicit RGMII TX and RX clock delay configuration
+ * (CFG4 / RGMIICTL extended registers) because the ZCU102 routes
+ * RGMII signals as a straight-through trace without external delay.
+ * Without this the link comes up at 1 Gbps but carries corrupt data
+ * (random RX frames, no TX). The Linux dp83867 driver and Xilinx
+ * device tree both apply a 2.0 ns TX + 2.0 ns RX skew - we match.
+ *
+ * Extended registers (>0x1F) are accessed via the IEEE-defined indirect
+ * pair (REGCR=0x0D, ADDAR=0x0E):
+ *   1. Write REGCR = 0x001F (address-of, devad 31).
+ *   2. Write ADDAR = <extended register address>.
+ *   3. Write REGCR = 0x401F (data, devad 31, no-increment).
+ *   4. Read/Write ADDAR = <register data>.
+ */
+#include <stdint.h>
+#include "gem.h"
+#include "phy_dp83867.h"
+#include "timer.h"
+#include "uart.h"
+
+/* Standard IEEE PHY registers (clause 22) */
+#define PHY_BMCR 0x00
+#define PHY_BMSR 0x01
+#define PHY_ID1 0x02
+#define PHY_ID2 0x03
+#define PHY_ANAR 0x04
+#define PHY_GBCR 0x09
+#define PHY_GBSR 0x0A
+#define PHY_REGCR 0x0D
+#define PHY_ADDAR 0x0E
+
+#define BMCR_RESET (1u << 15)
+#define BMCR_ANRESTART (1u << 9)
+#define BMCR_ANEN (1u << 12)
+
+#define BMSR_ANCOMPLETE (1u << 5)
+#define BMSR_LINK_UP (1u << 2)
+
+/* DP83867 extended registers (accessed via REGCR/ADDAR, devad 0x1F) */
+#define DP83867_CFG4 0x0031 /* Configuration 4 (RX_CTRL strap fix) */
+#define DP83867_RGMIICTL 0x0032 /* RGMII control */
+#define DP83867_STRAP_STS1 0x006E /* Strap status register (read-only) */
+#define DP83867_RGMIIDCTL 0x0086 /* RGMII delay control */
+#define DP83867_IO_MUX_CFG 0x0170 /* IO MUX config (impedance) */
+
+/* Clause-22 register (direct access) */
+#define DP83867_PHYCR 0x10 /* PHY Control register */
+#define PHYCR_FIFO_DEPTH_MASK (3u << 14)
+#define PHYCR_FIFO_DEPTH_8B (3u << 14)
+
+/* RGMIICTL bits */
+#define RGMIICTL_RX_DELAY_EN (1u << 0)
+#define RGMIICTL_TX_DELAY_EN (1u << 1)
+
+/* RGMIIDCTL: RX delay in [3:0], TX delay in [7:4], each step ~0.25 ns.
+ * NOTE(review): TI/Linux tables list code 0x8 as 2.25 ns (0x7 = 2.00 ns) - confirm. */
+#define RGMIIDCTL_DELAY_2NS (0x8u | (0x8u << 4))
+
+/* Speed read from PHY status register (DP83867 0x11) */
+#define DP83867_PHYSTS 0x0011
+#define PHYSTS_SPEED_MASK (3u << 14)
+#define PHYSTS_SPEED_1000 (2u << 14)
+#define PHYSTS_SPEED_100 (1u << 14)
+#define PHYSTS_SPEED_10 (0u << 14)
+#define PHYSTS_DUPLEX (1u << 13)
+
+static int phy_ext_write(uint8_t phy_addr, uint16_t ext_reg, uint16_t val) /* indirect write; first failing MDIO rc is returned */
+{
+    int rc;
+    rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x001Fu);
+    if (rc < 0) return rc;
+    rc = gem_mdio_write(phy_addr, PHY_ADDAR, ext_reg);
+    if (rc < 0) return rc;
+    rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x401Fu);
+    if (rc < 0) return rc;
+    return gem_mdio_write(phy_addr, PHY_ADDAR, val);
+}
+
+static int phy_ext_read(uint8_t phy_addr, uint16_t ext_reg, uint16_t *out) /* indirect read; *out is filled by the final MDIO read */
+{
+    int rc;
+    rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x001Fu);
+    if (rc < 0) return rc;
+    rc = gem_mdio_write(phy_addr, PHY_ADDAR, ext_reg);
+    if (rc < 0) return rc;
+    rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x401Fu);
+    if (rc < 0) return rc;
+    return gem_mdio_read(phy_addr, PHY_ADDAR, out);
+}
+
+int dp83867_init(uint8_t phy_addr, int *speed_out, int *full_duplex_out) /* 1 = link up, 0 = link down, < 0 = MDIO step that failed */
+{
+    uint16_t id1 = 0;
+    uint16_t id2 = 0;
+    uint16_t bmcr;
+    uint16_t bmsr;
+    uint16_t physts;
+    int i;
+
+    if (gem_mdio_read(phy_addr, PHY_ID1, &id1) < 0)
+        return -1;
+    if (gem_mdio_read(phy_addr, PHY_ID2, &id2) < 0)
+        return -2;
+    uart_puts("DP83867: ID1="); uart_puthex(id1);
+    uart_puts(" ID2="); uart_puthex(id2);
+    uart_puts("\n");
+    /* DP83867 OUI = 0x2000A23x. ID1=0x2000, ID2 upper bits match. */
+    if (id1 != 0x2000u || (id2 & 0xFFF0u) != 0xA230u) {
+        uart_puts(" warn: PHY ID does not match DP83867, continuing\n");
+    }
+
+    /* Soft reset. */
+    if (gem_mdio_write(phy_addr, PHY_BMCR, BMCR_RESET) < 0)
+        return -3;
+    for (i = 0; i < 1000; i++) { /* up to 1000 x 1 ms for BMCR.RESET to clear */
+        delay_ms(1);
+        if (gem_mdio_read(phy_addr, PHY_BMCR, &bmcr) < 0)
+            return -4;
+        if ((bmcr & BMCR_RESET) == 0)
+            break;
+    }
+    if (i == 1000)
+        return -5;
+
+    /* Order below mirrors the Linux/U-Boot dp83867_config sequence:
+     * 1. Strap fix (CFG4 bit 7) right after SW reset.
+     * 2. PHYCR FIFO depth RMW.
+     * 3. RGMIICTL RMW to enable both delays.
+     * 4. RGMIIDCTL set delay values.
+     * 5. Restart AN (done further down, after the advertisement writes).
+     */
+    {
+        uint16_t strap = 0;
+        uint16_t cfg4_before = 0;
+        uint16_t cfg4_after = 0;
+        uint16_t iomux = 0;
+        uint16_t rgmiictl = 0;
+        uint16_t phycr_before = 0;
+        uint16_t phycr_after = 0;
+
+        (void)phy_ext_read(phy_addr, DP83867_STRAP_STS1, &strap); /* diagnostic only */
+        (void)phy_ext_read(phy_addr, DP83867_IO_MUX_CFG, &iomux); /* diagnostic only */
+        if (phy_ext_read(phy_addr, DP83867_CFG4, &cfg4_before) < 0) return -6;
+
+        /* 1. RX_CTRL strap quirk for ZCU102. */
+        cfg4_after = cfg4_before & ~(1u << 7);
+        if (phy_ext_write(phy_addr, DP83867_CFG4, cfg4_after) < 0)
+            return -6;
+
+        /* 2. PHYCR FIFO depth = 8 bytes (RMW so we keep Auto-MDIX,
+         * power-down detect, etc., that the strap brought up). */
+        if (gem_mdio_read(phy_addr, DP83867_PHYCR, &phycr_before) < 0) return -7;
+        phycr_after = (phycr_before & ~PHYCR_FIFO_DEPTH_MASK)
+                    | PHYCR_FIFO_DEPTH_8B;
+        if (gem_mdio_write(phy_addr, DP83867_PHYCR, phycr_after) < 0)
+            return -7;
+
+        /* 3. RGMIICTL: enable TX and RX clock delays (RMW). */
+        if (phy_ext_read(phy_addr, DP83867_RGMIICTL, &rgmiictl) < 0) return -8;
+        rgmiictl |= RGMIICTL_RX_DELAY_EN | RGMIICTL_TX_DELAY_EN;
+        if (phy_ext_write(phy_addr, DP83867_RGMIICTL, rgmiictl) < 0)
+            return -8;
+
+        /* 4. RGMIIDCTL: delay code 0x8 for TX and RX (ti,*-internal-delay=8). */
+        if (phy_ext_write(phy_addr, DP83867_RGMIIDCTL,
+                          RGMIIDCTL_DELAY_2NS) < 0)
+            return -9;
+
+#ifdef DEBUG_PHY
+        /* Verbose pre-AN dump so we can diff against U-Boot's state. */
+        uart_puts("DP83867 pre-AN: STRAP_STS1="); uart_puthex(strap);
+        uart_puts(" IO_MUX_CFG="); uart_puthex(iomux);
+        uart_puts("\n CFG4: "); uart_puthex(cfg4_before);
+        uart_puts(" -> "); uart_puthex(cfg4_after);
+        uart_puts(" PHYCR: "); uart_puthex(phycr_before);
+        uart_puts(" -> "); uart_puthex(phycr_after);
+        uart_puts("\n RGMIICTL="); uart_puthex(rgmiictl);
+        uart_puts(" RGMIIDCTL="); uart_puthex(RGMIIDCTL_DELAY_2NS);
+        uart_puts("\n");
+
+        {
+            uint16_t v = 0; /* stays 0 if a best-effort readback fails */
+            (void)phy_ext_read(phy_addr, DP83867_CFG4, &v);
+            uart_puts("DP83867 readback: CFG4="); uart_puthex(v);
+            (void)phy_ext_read(phy_addr, DP83867_RGMIICTL, &v);
+            uart_puts(" RGMIICTL="); uart_puthex(v);
+            (void)phy_ext_read(phy_addr, DP83867_RGMIIDCTL, &v);
+            uart_puts(" RGMIIDCTL="); uart_puthex(v);
+            (void)gem_mdio_read(phy_addr, DP83867_PHYCR, &v);
+            uart_puts(" PHYCR="); uart_puthex(v);
+            uart_puts("\n");
+        }
+#else
+        (void)strap; (void)iomux;
+        (void)cfg4_before; (void)cfg4_after;
+        (void)phycr_before; (void)phycr_after;
+        (void)rgmiictl;
+#endif
+    }
+
+    /* Advertise 10/100/1000 full + half duplex. */
+    if (gem_mdio_write(phy_addr, PHY_ANAR, 0x01E1u) < 0)
+        return -10;
+    if (gem_mdio_write(phy_addr, PHY_GBCR, (1u << 9) | (1u << 8)) < 0)
+        return -11;
+
+    /* Restart AN. */
+    if (gem_mdio_write(phy_addr, PHY_BMCR, BMCR_ANEN | BMCR_ANRESTART) < 0)
+        return -12;
+
+    /* Wait up to 5 s for AN complete, polling at 50 ms. AN typically
+     * needs 100-1500 ms depending on link partner. Report progress so
+     * a hung negotiation is visible on UART. */
+    uart_puts("DP83867: waiting for autoneg");
+    for (i = 0; i < 100; i++) {
+        delay_ms(50);
+        if (gem_mdio_read(phy_addr, PHY_BMSR, &bmsr) < 0)
+            return -13;
+        if (bmsr & BMSR_ANCOMPLETE) {
+            uart_puts(" done (");
+            uart_putdec((uint32_t)i * 50u);
+            uart_puts("ms)\n");
+            break;
+        }
+        if ((i % 10) == 9) /* one dot per 500 ms */
+            uart_putc('.');
+    }
+    if (!(bmsr & BMSR_ANCOMPLETE))
+        uart_puts(" TIMEOUT\n");
+
+    /* Give the PHY a moment to latch the negotiated speed before we
+     * read PHYSTS - on DP83867 link-OK and PHYSTS update slightly
+     * after AN_COMPLETE asserts. */
+    delay_ms(100);
+
+    /* After AN_COMPLETE, the 1000BASE-T link still needs to finish
+     * master/slave training and have BOTH receivers report OK before
+     * BMSR.LINK_UP asserts. This can take several hundred ms more.
+     * Poll BMSR (double-read for latch) up to 5 s, dumping GBSR each
+     * iteration so we can see remote_rx_status flip. */
+    {
+        int j;
+        uint16_t gbsr = 0;
+        uint16_t bmsr2 = 0;
+        uart_puts("DP83867: waiting for link");
+        for (j = 0; j < 100; j++) {
+            delay_ms(50);
+            (void)gem_mdio_read(phy_addr, PHY_BMSR, &bmsr2);
+            (void)gem_mdio_read(phy_addr, PHY_BMSR, &bmsr2);
+            (void)gem_mdio_read(phy_addr, PHY_GBSR, &gbsr);
+            bmsr = bmsr2; /* the final return must reflect this poll, not the AN-loop read */
+            if (bmsr2 & BMSR_LINK_UP) {
+                uart_puts(" UP (");
+                uart_putdec((uint32_t)j * 50u);
+                uart_puts("ms) GBSR=");
+                uart_puthex(gbsr);
+                uart_puts("\n");
+                break;
+            }
+            if ((j % 10) == 9) {
+                uart_puts(" [");
+                uart_putdec((uint32_t)(j + 1) * 50u);
+                uart_puts("ms GBSR=");
+                uart_puthex(gbsr);
+                uart_puts("]");
+            }
+        }
+        if (!(bmsr2 & BMSR_LINK_UP))
+            uart_puts(" TIMEOUT\n");
+    }
+
+    if (gem_mdio_read(phy_addr, DP83867_PHYSTS, &physts) < 0)
+        return -14;
+
+#ifdef DEBUG_PHY
+    {
+        uint16_t bmcr_now = 0;
+        uint16_t lpa = 0;
+        uint16_t gbsr = 0;
+        (void)gem_mdio_read(phy_addr, PHY_BMCR, &bmcr_now);
+        (void)gem_mdio_read(phy_addr, 0x05, &lpa); /* MII LPA */
+        (void)gem_mdio_read(phy_addr, PHY_GBSR, &gbsr);
+        uart_puts("DP83867 regs: BMCR="); uart_puthex(bmcr_now);
+        uart_puts(" BMSR="); uart_puthex(bmsr);
+        uart_puts(" LPA="); uart_puthex(lpa);
+        uart_puts(" GBSR="); uart_puthex(gbsr);
+        uart_puts(" PHYSTS="); uart_puthex(physts);
+        uart_puts("\n");
+    }
+#endif
+
+    if ((physts & PHYSTS_SPEED_MASK) == PHYSTS_SPEED_1000)
+        *speed_out = 1000;
+    else if ((physts & PHYSTS_SPEED_MASK) == PHYSTS_SPEED_100)
+        *speed_out = 100;
+    else
+        *speed_out = 10;
+    *full_duplex_out = (physts & PHYSTS_DUPLEX) ? 1 : 0;
+
+    uart_puts("DP83867 link: ");
+    uart_putdec((uint32_t)*speed_out);
+    uart_puts(*full_duplex_out ? " Mbps FD\n" : " Mbps HD\n");
+
+    return (bmsr & BMSR_LINK_UP) ? 1 : 0;
+}
+
+int dp83867_link_status(uint8_t phy_addr)
+{
+    uint16_t bmsr;
+    /* BMSR latches link down; read twice. */
+    if (gem_mdio_read(phy_addr, PHY_BMSR, &bmsr) < 0)
+        return -1;
+    if (gem_mdio_read(phy_addr, PHY_BMSR, &bmsr) < 0)
+        return -1;
+    return (bmsr & BMSR_LINK_UP) ? 1 : 0;
+}
diff --git a/src/port/zcu102/phy_dp83867.h b/src/port/zcu102/phy_dp83867.h
new file mode 100644
index 00000000..efbf45a3
--- /dev/null
+++ b/src/port/zcu102/phy_dp83867.h
@@ -0,0 +1,23 @@
+/* phy_dp83867.h
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * TI DP83867IR PHY driver: 10/100/1000 RGMII PHY used on the ZCU102
+ * dev board. We only need configuration (reset, RGMII TX/RX skew,
+ * auto-negotiation) and link status; no advanced features.
+ */
+#ifndef ZCU102_PHY_DP83867_H
+#define ZCU102_PHY_DP83867_H
+
+#include <stdint.h>
+
+/* Returns 1 if the link came up, 0 if it is still down, < 0 on MDIO failure.
+ * When >= 0, *speed_out is 10/100/1000 and *full_duplex_out is 0 or 1. */
+int dp83867_init(uint8_t phy_addr, int *speed_out, int *full_duplex_out);
+
+/* Returns 1 if link is up, 0 if down, < 0 on MDIO error.
*/ +int dp83867_link_status(uint8_t phy_addr); + +#endif /* ZCU102_PHY_DP83867_H */ diff --git a/src/port/zcu102/startup.S b/src/port/zcu102/startup.S new file mode 100644 index 00000000..534751cb --- /dev/null +++ b/src/port/zcu102/startup.S @@ -0,0 +1,282 @@ +/* startup.S + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * AArch64 EL3 entry for Cortex-A53 on Xilinx ZCU102. Stock Xilinx FSBL + * boots us in EL3 NS with caches/MMU off (or sometimes on - we force + * them off here to be safe), then jumps to _start at the image entry. + */ + /* A loader (FSBL, wolfBoot, ...) that respects the ELF entry + * point in the program header branches to _start directly. A + * loader that simply branches to the first byte of the binary + * (wolfBoot's do_boot path does this -- it uses LOAD_ADDRESS, + * not the ELF entry) lands on this stub instead, which forwards + * to _start. Linker scripts place this section first in the + * output image so it always sits at offset 0 of the binary. */ + .section .boot_entry, "ax" + .global _boot_entry +_boot_entry: + b _start + + .section .vectors, "ax" + .align 11 /* 2048 byte align required by VBAR */ + .global _vectors +_vectors: + /* Current EL with SP0 (not used; we always use SPx). 
*/ + .align 7 + b el3_sync_trampoline /* sync */ + .align 7 + b el3_irq_trampoline /* IRQ */ + .align 7 + b el3_irq_trampoline /* FIQ - same handler (see note below) */ + .align 7 + b el3_serror_trampoline /* SError */ + /* Current EL with SPx */ + .align 7 + b el3_sync_trampoline /* sync */ + .align 7 + b el3_irq_trampoline /* IRQ */ + .align 7 + b el3_irq_trampoline /* FIQ - same handler; GIC-400 in + * secure mode may deliver Group 0 + * interrupts via nFIQ depending on + * GICC_CTLR.FIQEn. We route both to + * the same trampoline so the C + * dispatcher sees the INTID either + * way. */ + .align 7 + b el3_serror_trampoline /* SError */ + /* Lower EL using AArch64 (unused, we stay at EL3) */ + .align 7 + b _hang + .align 7 + b _hang + .align 7 + b _hang + .align 7 + b _hang + /* Lower EL using AArch32 (unused) */ + .align 7 + b _hang + .align 7 + b _hang + .align 7 + b _hang + .align 7 + b _hang + + .section .text, "ax" + .global _start + .type _start, %function +_start: + /* VERY FIRST INSTRUCTIONS - prove we're running. Write '@' to + * UART0 TX FIFO. No register-bank-dependent ops here. */ + mov x1, #0xFF000000 + mov w2, #'@' + str w2, [x1, #0x30] + str w2, [x1, #0x30] + str w2, [x1, #0x30] + + /* Make sure we are on A53-0. If FSBL released us as A53-1/2/3 by + * accident, park them. */ + mrs x0, mpidr_el1 + and x0, x0, #0xff /* Aff0 */ + cbnz x0, _park_secondary + + /* Disable MMU + caches in case FSBL left them on. */ + mrs x0, sctlr_el3 + bic x0, x0, #(1 << 0) /* M - MMU off */ + bic x0, x0, #(1 << 2) /* C - D-cache off */ + bic x0, x0, #(1 << 12) /* I - I-cache off */ + msr sctlr_el3, x0 + isb + + /* Allow FP/SIMD at EL3 (FSBL does this too, but be explicit). */ + msr cptr_el3, xzr + + /* Force SPSel = 1 (use SP_ELx). The IRQ vector at offset 0x280 + * (Current EL with SPx) is what we wired el3_irq_trampoline to. + * FSBL may have left SPSel at 0 (SP_EL0); fix it deterministically. */ + msr spsel, #1 + + /* Force SCR_EL3 to a known state. 
We run entirely at EL3 in + * Secure world. The wolfBoot AArch64 startup explicitly sets the + * IRQ + FIQ + EA routing bits even though the ARM ARM says they + * only affect lower-EL interrupts; reusing that convention here + * because empirically the A53 does not enter the IRQ exception + * unless these are set (ISR_EL1.I went high but no exception + * fired with these bits clear). + * bit 0 NS = 0 (stay Secure) + * bit 1 IRQ = 1 (route IRQ to EL3) + * bit 2 FIQ = 1 (route FIQ to EL3) + * bit 3 EA = 1 (route SError/abort to EL3) + * bit 10 RW = 0 (no lower EL64; we never drop to lower EL) */ + mov x0, #((1 << 1) | (1 << 2) | (1 << 3)) + msr scr_el3, x0 + isb + + /* Vector base. */ + adrp x0, _vectors + add x0, x0, :lo12:_vectors + msr vbar_el3, x0 + + /* Stack pointer. After 'msr spsel, #1' this writes SP_EL3. */ + ldr x0, =_stack_top + mov sp, x0 + + /* Very early UART poke - one char before any C code, so even if a + * later step hangs we know _start was reached. Writes '!' to + * UART0 TX FIFO (0xFF000030). Assumes FSBL/psu_init already + * configured UART0 baud. */ + mov x1, #0xFF000000 + mov w2, #'!' + str w2, [x1, #0x30] + + /* Clear BSS. */ + ldr x0, =_sbss + ldr x1, =_ebss +1: cmp x0, x1 + b.ge 2f + str xzr, [x0], #8 + b 1b +2: + /* Breadcrumb: BSS cleared. */ + mov x1, #0xFF000000 + mov w2, #'B' + str w2, [x1, #0x30] + + /* Bring up the MMU + caches. C function in mmu.c. */ + bl mmu_enable + + /* Breadcrumb: MMU enabled. */ + mov x1, #0xFF000000 + mov w2, #'M' + str w2, [x1, #0x30] + + /* Branch to main. */ + bl main + + /* main() should not return. If it does, hang. */ + b _hang + + .type _park_secondary, %function +_park_secondary: + wfe + b _park_secondary + + .global _hang + .type _hang, %function +_hang: + b _hang + +/* --------------------------------------------------------------------- + * IRQ trampoline. EL3 IRQ vector -> save GP regs, call C handler, restore. 
+ * Keeps the C handler clean and avoids __attribute__((interrupt)) tricks + * which are not reliable on aarch64. + * ------------------------------------------------------------------- */ + .type el3_irq_trampoline, %function +el3_irq_trampoline: + /* Save full integer register file (x0-x30) plus SPSR_EL3/ELR_EL3. + * Frame is 18 * 16 = 288 bytes (16-byte aligned). Callee-saved + * x19-x28 must be preserved too: irq_dispatch is an ordinary C + * function and may clobber them, while the interrupted code + * almost certainly relies on them. */ + sub sp, sp, #(18 * 16) + stp x0, x1, [sp, #(0 * 16)] + stp x2, x3, [sp, #(1 * 16)] + stp x4, x5, [sp, #(2 * 16)] + stp x6, x7, [sp, #(3 * 16)] + stp x8, x9, [sp, #(4 * 16)] + stp x10, x11, [sp, #(5 * 16)] + stp x12, x13, [sp, #(6 * 16)] + stp x14, x15, [sp, #(7 * 16)] + stp x16, x17, [sp, #(8 * 16)] + stp x18, x19, [sp, #(9 * 16)] + stp x20, x21, [sp, #(10 * 16)] + stp x22, x23, [sp, #(11 * 16)] + stp x24, x25, [sp, #(12 * 16)] + stp x26, x27, [sp, #(13 * 16)] + stp x28, x29, [sp, #(14 * 16)] + str x30, [sp, #(15 * 16)] + /* Snapshot exception return state in case irq_dispatch (or any + * nested exception inside it) clobbers SPSR_EL3 / ELR_EL3. 
*/ + mrs x0, spsr_el3 + mrs x1, elr_el3 + stp x0, x1, [sp, #(16 * 16)] + + bl irq_dispatch + + ldp x0, x1, [sp, #(16 * 16)] + msr spsr_el3, x0 + msr elr_el3, x1 + ldp x0, x1, [sp, #(0 * 16)] + ldp x2, x3, [sp, #(1 * 16)] + ldp x4, x5, [sp, #(2 * 16)] + ldp x6, x7, [sp, #(3 * 16)] + ldp x8, x9, [sp, #(4 * 16)] + ldp x10, x11, [sp, #(5 * 16)] + ldp x12, x13, [sp, #(6 * 16)] + ldp x14, x15, [sp, #(7 * 16)] + ldp x16, x17, [sp, #(8 * 16)] + ldp x18, x19, [sp, #(9 * 16)] + ldp x20, x21, [sp, #(10 * 16)] + ldp x22, x23, [sp, #(11 * 16)] + ldp x24, x25, [sp, #(12 * 16)] + ldp x26, x27, [sp, #(13 * 16)] + ldp x28, x29, [sp, #(14 * 16)] + ldr x30, [sp, #(15 * 16)] + add sp, sp, #(18 * 16) + eret + + .global irq_enable + .type irq_enable, %function +irq_enable: + msr daifclr, #3 /* unmask IRQ (bit 1) + FIQ (bit 0) */ + ret + + .global irq_disable + .type irq_disable, %function +irq_disable: + msr daifset, #3 /* mask IRQ (bit 1) + FIQ (bit 0): both vectors reach irq_dispatch, so mirror irq_enable */ + ret + +/* --------------------------------------------------------------------- + * Synchronous exception handler - print ESR_EL3 / ELR_EL3 / FAR_EL3 + * then hang. Anything that previously fell to _hang silently (alignment + * fault, translation fault, undefined instruction) now produces a + * UART dump. 
+ * ------------------------------------------------------------------- */ + .type el3_sync_trampoline, %function +el3_sync_trampoline: + mrs x0, esr_el3 + mrs x1, elr_el3 + mrs x2, far_el3 + mrs x3, spsr_el3 + bl exception_report + b _hang + + .type el3_serror_trampoline, %function +el3_serror_trampoline: + mrs x0, esr_el3 + mrs x1, elr_el3 + mrs x2, far_el3 + mrs x3, spsr_el3 + mov x4, #1 /* indicate SError to C */ + bl exception_report_serror + b _hang diff --git a/src/port/zcu102/target.ld b/src/port/zcu102/target.ld new file mode 100644 index 00000000..e85cf909 --- /dev/null +++ b/src/port/zcu102/target.ld @@ -0,0 +1,132 @@ +/* ZCU102 (Xilinx UltraScale+ MPSoC, Cortex-A53) Linker Script + * + * Memory map (current OCM-only layout): + * OCM : 256 KB @ 0xFFFC0000 (everything lives here) + * DDR low : 2 GB @ 0x00000000 (initialized by FSBL, currently unused + * by this app; reserved for future + * heap or larger ring buffers) + * + * App layout in OCM: + * 0xFFFC0000 .vectors (2 KB-aligned) + * ... .text, .rodata, .data, .bss, .page_tables, + * .dma_buffers (linker packs them in order) + * 0x100000000 _stack_top (top of OCM, stack grows down) + * + * Why OCM-only: + * - JTAG iteration: psu_init alone (no PMU FW) doesn't reliably + * bring up DDR for mwr-force loads. OCM is independent of the + * DDR controller and always works. + * - SD boot: bootgen will emit a warning about OCM overlap with + * FSBL, but FSBL's jump-to-image happens after partition load, + * so the overlay is safe. + * - The 16-KB JTAG DAP alias bug at the low DDR window is avoided + * entirely. + * + * .dma_buffers stays inside OCM (Normal-WB per L2_PERIPH[511]); GEM + * DMA coherency is handled with explicit DC CVAC / IVAC ops in gem.c + * rather than via an MMU attribute carve-out. + */ + +OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") +OUTPUT_ARCH(aarch64) +ENTRY(_start) + +/* Single-region layout: everything in OCM. 
DDR and DMA region + * definitions are kept as placeholders for a future layout that + * spills .dma_buffers (and possibly .bss) into DDR once the JTAG + * iteration path supports it. They are not referenced by any SECTION + * in the current layout. */ +MEMORY +{ + OCM (rwx) : ORIGIN = 0xFFFC0000, LENGTH = 0x00040000 + DDR (rw) : ORIGIN = 0x00010000, LENGTH = 0x001F0000 + DMA (rw) : ORIGIN = 0x00200000, LENGTH = 0x00200000 +} + +/* Stack at top of OCM (we keep stack in OCM with the rest since + * DDR-via-JTAG is unreliable without PMU FW). The 24 KB free area + * above DMA buffers gives plenty of stack room for our app. */ +_stack_top = 0x100000000; +_dma_base = 0xFFFF1000; /* in OCM, see linker output */ +_dma_size = 0x00009000; /* ~36 KB sufficient for reduced rings */ + +PHDRS +{ + text PT_LOAD FLAGS(5); /* RX */ + data PT_LOAD FLAGS(6); /* RW */ +} + +SECTIONS +{ + .boot_entry : + { + KEEP(*(.boot_entry)) + } > OCM :text + + .vectors : + { + . = ALIGN(2048); + KEEP(*(.vectors)) + } > OCM :text + + .text : + { + . = ALIGN(8); + *(.text*) + *(.rodata*) + . = ALIGN(8); + } > OCM :text + + .data : + { + . = ALIGN(8); + _sdata = .; + *(.data*) + . = ALIGN(8); + _edata = .; + } > OCM :text + + /* BSS in OCM as well - DDR-via-JTAG isn't reliable without PMU + * FW, so we keep all writeable state in OCM (256 KB total). */ + .bss (NOLOAD) : + { + . = ALIGN(8); + _sbss = .; + *(.bss*) + *(COMMON) + . = ALIGN(8); + _ebss = .; + } > OCM :text + + /* Page tables in OCM so MMU walker isn't dependent on DDR being + * fully up (DDR-via-JTAG is unreliable without PMU FW; CPU + * fetch from OCM is bulletproof). 12 KB total (3x4KB tables). */ + .page_tables (NOLOAD) : + { + . = ALIGN(4096); + _page_tables_start = .; + *(.page_tables) + . = ALIGN(4096); + _page_tables_end = .; + } > OCM :text + + /* DMA buffers also in OCM - OCM is accessible to all AXI masters + * including the GEM DMA. With everything in OCM there's no DDR + * dependency for the basic bring-up. 
*/ + .dma_buffers (NOLOAD) : + { + . = ALIGN(64); + _dma_buffers_start = .; + *(.dma_buffers) + . = ALIGN(64); + _dma_buffers_end = .; + } > OCM :text + + /DISCARD/ : + { + *(.note.*) + *(.comment) + *(.ARM.attributes) + *(.eh_frame*) + } +} diff --git a/src/port/zcu102/target_ddr.ld b/src/port/zcu102/target_ddr.ld new file mode 100644 index 00000000..b2c82be0 --- /dev/null +++ b/src/port/zcu102/target_ddr.ld @@ -0,0 +1,126 @@ +/* ZCU102 (Xilinx UltraScale+ MPSoC, Cortex-A53) Linker Script - DDR layout + * + * Used when the app is loaded by wolfBoot (or any loader that places + * the signed image into DDR at a known LOAD_ADDRESS). FSBL + PMU FW + + * BL31 are all running by the time control reaches us, so the DDR + * controller is fully initialised and the DDR DAP 16-KB alias bug is + * a non-issue (the loader writes via the AXI master path). + * + * Memory map: + * DDR : 0x10000000 .. 0x10FFFFFF (16 MB; matches WOLFBOOT_LOAD_ADDRESS + * in wolfBoot's config/examples/zynqmp.config) + * OCM : 0xFFFC0000 .. 0xFFFFFFFF (256 KB, still mapped Normal-WB + * executable by L2_PERIPH[511]; unused + * for this layout but left in MEMORY + * so MMU page-table addresses inside + * mmu.c remain valid) + * + * App layout in DDR (16 MB region @ 0x10000000): + * .vectors / .text / .rodata / .data / .bss / .page_tables / .dma_buffers + * stack grows down from _stack_top at the top of the region + * + * Stack top is set near the end of the DDR region with plenty of head + * room (4 MB) below for .bss + page tables + DMA buffers. Increase + * the LENGTH below if a larger heap or more DMA buffers are needed. + */ + +OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") +OUTPUT_ARCH(aarch64) +ENTRY(_start) + +MEMORY +{ + DDR (rwx) : ORIGIN = 0x10000000, LENGTH = 0x01000000 /* 16 MB */ + OCM (rwx) : ORIGIN = 0xFFFC0000, LENGTH = 0x00040000 /* still mapped */ +} + +/* Stack near the top of the DDR region. 16 MB - 4 KB gives the stack + * a safe red zone. 
*/ +_stack_top = 0x10FFF000; +/* Dormant DMA carve-out markers (the cache-coherency path in gem.c is + * the active mechanism). Set _dma_size to 0 so mmu.c does not insert + * any Normal-NC blocks in the L2_DDR table for this layout. */ +_dma_base = 0x10E00000; +_dma_size = 0; + +PHDRS +{ + text PT_LOAD FLAGS(5); /* RX */ + data PT_LOAD FLAGS(6); /* RW */ +} + +SECTIONS +{ + /* First 4 bytes of the image must be a `b _start` so wolfBoot's + * do_boot() (which branches to LOAD_ADDRESS, not the ELF entry) + * lands on a valid instruction. */ + .boot_entry : + { + KEEP(*(.boot_entry)) + } > DDR :text + + .vectors : + { + . = ALIGN(2048); + KEEP(*(.vectors)) + } > DDR :text + + .text : + { + . = ALIGN(8); + *(.text*) + *(.rodata*) + . = ALIGN(8); + } > DDR :text + + .data : + { + . = ALIGN(8); + _sdata = .; + *(.data*) + . = ALIGN(8); + _edata = .; + } > DDR :text + + .bss (NOLOAD) : + { + . = ALIGN(8); + _sbss = .; + *(.bss*) + *(COMMON) + . = ALIGN(8); + _ebss = .; + } > DDR :text + + .page_tables (NOLOAD) : + { + . = ALIGN(4096); + _page_tables_start = .; + *(.page_tables) + . = ALIGN(4096); + _page_tables_end = .; + } > DDR :text + + /* DMA buffers stay in the DDR region as plain Normal-WB cacheable + * memory; gem.c maintains coherency with explicit DC CVAC / IVAC + * ops at every BD hand-off. The L2_DDR table has a Normal-NC + * carve-out for any 2 MB block overlapping this range but it is + * currently dormant (cache_clean / cache_inval suffices), so no + * special alignment beyond 64-byte cache lines is required. */ + .dma_buffers (NOLOAD) : + { + . = ALIGN(64); + _dma_buffers_start = .; + *(.dma_buffers) + . 
= ALIGN(64); + _dma_buffers_end = .; + } > DDR :text + + /DISCARD/ : + { + *(.note.*) + *(.comment) + *(.ARM.attributes) + *(.eh_frame*) + } +} diff --git a/src/port/zcu102/timer.h b/src/port/zcu102/timer.h new file mode 100644 index 00000000..17959e20 --- /dev/null +++ b/src/port/zcu102/timer.h @@ -0,0 +1,41 @@ +/* timer.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * AArch64 generic-timer based delay helpers. ZynqMP FSBL/ATF programs + * CNTFRQ_EL0 to 100 MHz; we fall back to that if the register reads 0. + * + * CNTPCT_EL0 is readable at every EL on Cortex-A53 without trap setup. + */ +#ifndef ZCU102_TIMER_H +#define ZCU102_TIMER_H + +#include <stdint.h> + +static inline uint64_t timer_now(void) +{ + uint64_t v; + __asm__ volatile ("isb; mrs %0, cntpct_el0" : "=r"(v) :: "memory"); + return v; +} + +static inline uint32_t timer_freq(void) +{ + uint64_t v; + __asm__ volatile ("mrs %0, cntfrq_el0" : "=r"(v)); + return v ? (uint32_t)v : 100000000u; +} + +static inline void delay_us(uint32_t us) +{ + uint64_t start = timer_now(); + uint64_t target = ((uint64_t)us * (uint64_t)timer_freq()) / 1000000ULL; + while ((timer_now() - start) < target) { } +} + +static inline void delay_ms(uint32_t ms) +{ + while (ms-- > 0u) delay_us(1000u); /* one call per ms: ms * 1000u would wrap uint32_t above ~4.29e6 ms */ +} + +#endif /* ZCU102_TIMER_H */ diff --git a/src/port/zcu102/uart.c b/src/port/zcu102/uart.c new file mode 100644 index 00000000..21334161 --- /dev/null +++ b/src/port/zcu102/uart.c @@ -0,0 +1,133 @@ +/* uart.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Cadence (Xilinx PS) UART0 polled driver. ZCU102 routes UART0 to the + * on-board FTDI USB-UART (channel B / /dev/ttyUSB0 on the host). We + * assume FSBL has already pinned MIO 18/19 to UART0 and enabled its + * reference clock at 100 MHz (UART_REF_CLK divided to 6.25 MHz baudgen + * input by FSBL default); we just program the divisors for 115200 baud. + * + * Register reference: ZynqMP TRM (UG1085) chapter "UART Controller". + */ +#include +#include "board.h" +#include "uart.h" + +#define UART_CR (*(volatile uint32_t *)(UART0_BASE + 0x00)) +#define UART_MR (*(volatile uint32_t *)(UART0_BASE + 0x04)) +#define UART_BAUDGEN (*(volatile uint32_t *)(UART0_BASE + 0x18)) +#define UART_BAUD_DIV (*(volatile uint32_t *)(UART0_BASE + 0x34)) +#define UART_CHANNEL_STS (*(volatile uint32_t *)(UART0_BASE + 0x2C)) +#define UART_TX_RX_FIFO (*(volatile uint32_t *)(UART0_BASE + 0x30)) + +/* Control register bits */ +#define UART_CR_TXRES (1u << 1) /* TX software reset */ +#define UART_CR_RXRES (1u << 0) /* RX software reset */ +#define UART_CR_TXEN (1u << 4) +#define UART_CR_TXDIS (1u << 5) +#define UART_CR_RXEN (1u << 2) +#define UART_CR_RXDIS (1u << 3) +#define UART_CR_STPBRK (1u << 8) + +/* Mode register: 8N1, normal, no parity */ +#define UART_MR_8N1 ((0u << 8) | (4u << 3) | (0u << 1)) + +/* Channel status */ +#define UART_SR_TXFULL (1u << 4) +#define UART_SR_TXEMPTY (1u << 3) + +void uart_init(void) +{ + /* PetaLinux/Vitis FSBL's psu_init programs: + * IOPLL = 1500 MHz + * CRL_APB.UART0_REF_CTRL: SRCSEL=IOPLL, DIVISOR0=15, DIVISOR1=1 + * -> uart_ref_clk = 1500 / 15 / 1 = 100 MHz (sel_clk to baudgen) + * + * Cadence UART baud formula: + * baud = sel_clk / (CD * (BDIV + 1)) + * + * For 115200 with 
BDIV=6: + * CD = 100e6 / (115200 * 7) = 124 -> actual 115207, well under UART tol. + * + * If you change ref_clk (e.g. RPLL source, different divisors), recompute + * CD - this driver does not auto-detect from CRL_APB yet. */ + UART_CR = UART_CR_TXDIS | UART_CR_RXDIS; + UART_CR |= UART_CR_TXRES | UART_CR_RXRES; + while (UART_CR & (UART_CR_TXRES | UART_CR_RXRES)) + ; /* wait for reset to self-clear */ + + UART_MR = UART_MR_8N1; + UART_BAUDGEN = 124; + UART_BAUD_DIV = 6; + + UART_CR = UART_CR_TXEN | UART_CR_RXEN | UART_CR_STPBRK; +} + +void uart_putc(char c) +{ + while (UART_CHANNEL_STS & UART_SR_TXFULL) + ; + UART_TX_RX_FIFO = (uint32_t)(unsigned char)c; +} + +void uart_puts(const char *s) +{ + while (*s) { + if (*s == '\n') + uart_putc('\r'); + uart_putc(*s++); + } +} + +void uart_puthex(uint32_t val) +{ + static const char hex[] = "0123456789ABCDEF"; + int i; + uart_puts("0x"); + for (i = 28; i >= 0; i -= 4) + uart_putc(hex[(val >> i) & 0xF]); +} + +void uart_putdec(uint32_t val) +{ + char buf[11]; + int i = 0; + if (val == 0) { + uart_putc('0'); + return; + } + while (val > 0 && i < (int)sizeof(buf)) { + buf[i++] = '0' + (char)(val % 10); + val /= 10; + } + while (i > 0) + uart_putc(buf[--i]); +} + +void uart_putip4(ip4 ip) +{ + uart_putdec((ip >> 24) & 0xFF); + uart_putc('.'); + uart_putdec((ip >> 16) & 0xFF); + uart_putc('.'); + uart_putdec((ip >> 8) & 0xFF); + uart_putc('.'); + uart_putdec(ip & 0xFF); +} diff --git a/src/port/zcu102/uart.h b/src/port/zcu102/uart.h new file mode 100644 index 00000000..aa3df3ef --- /dev/null +++ b/src/port/zcu102/uart.h @@ -0,0 +1,20 @@ +/* uart.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. 
+ */ +#ifndef ZCU102_UART_H +#define ZCU102_UART_H + +#include <stdint.h> +#include "../../../wolfip.h" /* for ip4 */ + +void uart_init(void); +void uart_putc(char c); +void uart_puts(const char *s); +void uart_puthex(uint32_t val); +void uart_putdec(uint32_t val); +void uart_putip4(ip4 ip); + +#endif /* ZCU102_UART_H */ diff --git a/src/port/zynq7000/.gitignore b/src/port/zynq7000/.gitignore new file mode 100644 index 00000000..8e5ab963 --- /dev/null +++ b/src/port/zynq7000/.gitignore @@ -0,0 +1,4 @@ +*.o +*.elf +*.bin +BOOT.BIN diff --git a/src/port/zynq7000/Makefile b/src/port/zynq7000/Makefile new file mode 100644 index 00000000..d29b36cf --- /dev/null +++ b/src/port/zynq7000/Makefile @@ -0,0 +1,69 @@ +# Xilinx Zynq-7000 (Cortex-A9, ARMv7-A 32-bit) wolfIP bare-metal port +# +# Build: make CROSS_COMPILE=arm-none-eabi- +# +# Toolchain: ARM GNU arm-none-eabi-gcc (tested with 13.2). +# +# UNTESTED ON HARDWARE -- structural scaffold mirroring src/port/zcu102/. + +CROSS_COMPILE ?= arm-none-eabi- +CC := $(CROSS_COMPILE)gcc +OBJCOPY := $(CROSS_COMPILE)objcopy +SIZE := $(CROSS_COMPILE)size + +ROOT := ../../.. + +# Cortex-A9, ARMv7-A 32-bit, no NEON in cert paths. +CFLAGS := -mcpu=cortex-a9 -marm +CFLAGS += -Os -ffreestanding -fno-builtin -fno-common +CFLAGS += -fdata-sections -ffunction-sections +CFLAGS += -g -Wall -Wextra -Werror -Wno-unused-parameter +CFLAGS += -std=gnu99 +CFLAGS += -I. -I$(ROOT) -I$(ROOT)/src -I$(ROOT)/src/port +CFLAGS += -DZYNQ7000 -DXILINX_ARMV7 +CFLAGS += $(CFLAGS_EXTRA) + +ASFLAGS := -mcpu=cortex-a9 -marm + +LDSCRIPT := target.ld +LDFLAGS := -nostdlib -nostartfiles -T $(LDSCRIPT) -Wl,-gc-sections +# Override newlib's memset/memcpy with bytewise variants in main.c +# (the same "fast memset uses an instruction the bare-metal setup +# does not tolerate" pattern we hit on the AArch64 port). 
+LDFLAGS += -Wl,--wrap=memset -Wl,--wrap=memcpy + +LOCAL_C := main.c uart.c mmu.c gic.c gem.c phy_dp83867.c entropy.c +LOCAL_S := startup.S +LOCAL_OBJS := $(LOCAL_C:.c=.o) $(LOCAL_S:.S=.o) + +WOLFIP_OBJ := wolfip.o +OBJS := $(LOCAL_OBJS) $(WOLFIP_OBJ) + +all: app.elf + @echo "Built: app.elf" + @$(SIZE) app.elf + +app.elf: $(OBJS) $(LDSCRIPT) + $(CC) $(CFLAGS) $(OBJS) $(LDFLAGS) \ + -Wl,--start-group -lc -lgcc -Wl,--end-group -o $@ + +$(WOLFIP_OBJ): $(ROOT)/src/wolfip.c $(wildcard *.h $(ROOT)/wolfip.h) # rebuild when port or stack headers change + $(CC) $(CFLAGS) -Wno-zero-length-bounds -Wno-type-limits -c $< -o $@ + +%.o: %.c $(wildcard *.h) # any local header change (board.h, config.h, ...) invalidates the object + $(CC) $(CFLAGS) -c $< -o $@ + +%.o: %.S $(wildcard *.h) # same header tracking for assembly sources + $(CC) $(ASFLAGS) -c $< -o $@ + +clean: + rm -f $(OBJS) app.elf BOOT.BIN + +.PHONY: all clean help + +help: + @echo "Zynq-7000 wolfIP build (scaffold, untested):" + @echo " make - build app.elf" + @echo " make clean - remove artifacts" + @echo "" + @echo "Override CROSS_COMPILE if your toolchain prefix differs." diff --git a/src/port/zynq7000/README.md b/src/port/zynq7000/README.md new file mode 100644 index 00000000..5dc34805 --- /dev/null +++ b/src/port/zynq7000/README.md @@ -0,0 +1,56 @@ +# wolfIP port: Xilinx Zynq-7000 (Cortex-A9, ARMv7-A 32-bit) + +**STATUS: UNTESTED ON HARDWARE.** Structural scaffold mirroring `src/port/zcu102/`. The code compiles cleanly with `arm-none-eabi-gcc` but has not been brought up on a real Zynq-7000 board. + +## What this port is + +Bare-metal wolfIP port for the Xilinx Zynq-7000 family (Z-7020 etc., e.g. ZC702 / ZedBoard / MicroZed dev boards). Cortex-A9 in SVC mode, GCC bare-metal, no Xilinx Standalone BSP, no FreeRTOS. Targets the same deterministic UDP/IPv4 profile as the ZCU102 port. 
+ +## What differs from ZCU102 + +| Subsystem | ZCU102 (ZynqMP) | Zynq-7000 | Where it lives | +|-----------|-----------------|-----------|----------------| +| Architecture | ARMv8-A AArch64 | ARMv7-A 32-bit | toolchain prefix | +| CPU core | Cortex-A53 | Cortex-A9 | `Makefile` (-mcpu) | +| Bootloader handoff | FSBL -> EL3 | FSBL -> SVC | `startup.S` | +| Toolchain | `aarch64-none-elf-gcc` | `arm-none-eabi-gcc` | `Makefile` | +| Exception model | EL3 vectors | ARMv7 exception modes | `startup.S` rewritten | +| MMU | 4-level long descriptor | 1-level short descriptor | `mmu.c` rewritten | +| Cache ops | DC CVAC / DC IVAC | MCR p15 c7 (DCCMVAC/DCIMVAC) | `gem.c` | +| Generic timer | `mrs cntpct_el0` | `mrrc p15, 0, ..., c14` | `timer.h`, `entropy.c` | +| GIC | GIC-400 (GICv2) | GIC-390 (GICv2) | `gic.c` (same driver, different base) | +| GIC base addrs | `0xF901xxxx` | `0xF8F0xxxx` | `board.h` | +| UART | Cadence at 0xFF000000 | Cadence at 0xE0000000 | `board.h` (same driver) | +| Clock + reset | CRL_APB at 0xFF5E0000 | SLCR at 0xF8000000 | `board.h` (gem.c clock helper needs rewrite) | +| GEM count | 4 (GEM0-3) | 2 (GEM0-1) | `board.h` | +| On-board RJ45 | GEM3 (INTID 95) | GEM0 (INTID 54) | `board.h` | +| BD format | 8-byte (DMACR[30]=0) | 8-byte (no 64-bit option) | `gem.c` (unchanged) | + +## Build + +``` +cd src/port/zynq7000 +make CROSS_COMPILE=arm-none-eabi- +``` + +Output: `app.elf`. + +## Known unknowns (to validate on hardware) + +- `gem.c` still has `NWCFG_DWIDTH_64` available; it must not be set on Zynq-7000 (the A9 AXI master path is 32-bit; the older GEM revision does not implement that bit). Confirm `GEM_NWCFG` bit 21 stays clear during bring-up. +- `gem.c` clock + reset code references `SLCR_GEM0_CLK_CTRL` / `SLCR_GEM_RST_CTRL`. The actual sequence will need an unlock (`SLCR_UNLOCK = 0xDF0D`) wrapper that does not exist in the AArch64 port. +- DP83867 MDIO address on Zynq-7000 boards varies (ZedBoard uses a Marvell 88E1518; ZC702 / MicroZed differ). 
The shipped `phy_dp83867.c` only covers DP83867; confirm the actual on-board PHY before flashing. +- `entropy.c` uses ARMv7 `MRRC p15, 1, ..., c14` for `cntvct_el0` (virtual counter). Cortex-A9 implements the generic timer differently from later cores; if `CNTFRQ` reads 0 the fallback (333 MHz) may be way off, causing `delay_us` to misbehave. Check `CNTFRQ` first thing during bring-up. +- ARMv7 IRQ trampoline in `startup.S` uses `srsdb` + `rfeia` -- standard but assumes the IRQ-mode stack is reachable; an early IRQ before SVC stack init would fault. The current code disables IRQ until `irq_enable` is called after wolfIP/GEM init, which avoids the race. +- `mmu.c` uses 1 MB sections (16 KB L1 table). All of DDR (1 GB) is mapped Normal-WB cacheable; the OCM high mapping at 0xFFFC0000 is in section 0xFFF mapped Normal-WB. PS peripherals are Device. The DMA carve-out logic from the AArch64 port is dropped because cache_clean/cache_inval handles coherency; reintroduce if the GEM exhibits coherency issues. + +## What was reused unchanged from ZCU102 + +- `gem.c` core logic (BD ring, ISR, eth_send, eth_poll, MDIO) -- only the cache ops were rewritten for ARMv7 CP15. +- `phy_dp83867.c` -- the DP83867 driver is host-architecture-independent. +- `main.c` -- mostly identical; the AArch64-specific `exception_report` was dropped, the DEBUG_GIC self-test was `#if 0`-ed pending ARMv7 equivalents. +- `entropy.c` -- only the timer-read primitive was rewritten. + +## Files + +Same layout as `src/port/zcu102/`. See that port's README for per-file responsibilities. diff --git a/src/port/zynq7000/board.h b/src/port/zynq7000/board.h new file mode 100644 index 00000000..28c3b808 --- /dev/null +++ b/src/port/zynq7000/board.h @@ -0,0 +1,112 @@ +/* board.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. 
+ * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Xilinx Zynq-7000 (Cortex-A9, ARMv7-A 32-bit) PS register base + * addresses and GIC interrupt IDs. Derived from the Zynq-7000 TRM + * (UG585). No Xilinx Standalone BSP header is required. + * + * UNTESTED ON HARDWARE -- code-only scaffold while the lab board is + * unavailable. Mirrors src/port/zcu102/ structurally. Key differences: + * - Cortex-A9 (not A53), ARMv7-A 32-bit (not AArch64) + * - SLCR replaces ZynqMP's CRL_APB + * - GIC-390 (GICv2) inside the SCU at different base addresses + * - Cadence UART (same IP as ZynqMP; different base address) + * - Cadence GEM (older revision; 32-bit BD format default) + * - 2 GEMs (GEM0 / GEM1); on-board RJ45 is typically GEM0 + */ +#ifndef ZYNQ7000_BOARD_H +#define ZYNQ7000_BOARD_H + +#include <stdint.h> + +/* --------------------------------------------------------------------- + * Memory map (Zynq-7000 PS) + * ------------------------------------------------------------------- */ +#define DDR_BASE 0x00000000UL +#define DDR_SIZE 0x40000000UL /* 1 GB typical, e.g. ZC702 */ + +/* OCM is mappable to 0x00000000 (low) or 0xFFFC0000 (high). Most + * bare-metal apps use the high mapping; FSBL configures the OCM + * address filter via SLCR.OCM_CFG. We assume the high mapping. 
*/ +#define OCM_BASE 0xFFFC0000UL +#define OCM_SIZE 0x00040000UL /* 256 KB */ + +/* --------------------------------------------------------------------- + * PS peripherals + * ------------------------------------------------------------------- */ +#define UART0_BASE 0xE0000000UL /* Cadence */ +#define UART1_BASE 0xE0001000UL + +#define GEM0_BASE 0xE000B000UL /* on-board RJ45 typical */ +#define GEM1_BASE 0xE000C000UL + +#define SLCR_BASE 0xF8000000UL /* clock + reset */ + +/* GIC-390 (ARMv7 GICv2 compatible). Distributor + CPU IF are in the + * SCU (Snoop Control Unit) memory region on Zynq-7000. */ +#define GICD_BASE 0xF8F01000UL +#define GICC_BASE 0xF8F00100UL + +/* --------------------------------------------------------------------- + * GIC interrupt IDs (raw GIC INTIDs, not GIC_SPI offsets). + * Per Zynq-7000 TRM Table 7-3: + * GEM0: INTID 54 + * GEM1: INTID 77 + * ------------------------------------------------------------------- */ +#define IRQ_GEM0 54 +#define IRQ_GEM1 77 + +/* --------------------------------------------------------------------- + * SLCR clock and reset registers + * ------------------------------------------------------------------- */ +#define SLCR_LOCK (SLCR_BASE + 0x004) +#define SLCR_UNLOCK (SLCR_BASE + 0x008) +#define SLCR_GEM0_CLK_CTRL (SLCR_BASE + 0x140) +#define SLCR_GEM1_CLK_CTRL (SLCR_BASE + 0x144) +#define SLCR_GEM_RST_CTRL (SLCR_BASE + 0x214) + +#define SLCR_UNLOCK_KEY 0xDF0D /* per TRM */ + +/* --------------------------------------------------------------------- + * Cadence UART0 baud + * ------------------------------------------------------------------- */ +#define UART_BAUD 115200 + +/* MAC address for eth0. Locally-administered, even first octet. 
*/ +#ifndef WOLFIP_MAC_0 +#define WOLFIP_MAC_0 0x02 +#endif +#ifndef WOLFIP_MAC_1 +#define WOLFIP_MAC_1 0x00 +#endif +#ifndef WOLFIP_MAC_2 +#define WOLFIP_MAC_2 0x5A +#endif +#ifndef WOLFIP_MAC_3 +#define WOLFIP_MAC_3 0x11 +#endif +#ifndef WOLFIP_MAC_4 +#define WOLFIP_MAC_4 0x22 +#endif +#ifndef WOLFIP_MAC_5 +#define WOLFIP_MAC_5 0x33 +#endif + +#endif /* ZYNQ7000_BOARD_H */ diff --git a/src/port/zynq7000/config.h b/src/port/zynq7000/config.h new file mode 100644 index 00000000..a23992cb --- /dev/null +++ b/src/port/zynq7000/config.h @@ -0,0 +1,82 @@ +/* config.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * wolfIP configuration for Xilinx Zynq-7000 (Cortex-A9, SVC-mode + * bare-metal). UDP-only profile aimed at deterministic DAL-C use. + */ +#ifndef WOLF_CONFIG_H +#define WOLF_CONFIG_H + +#ifndef CONFIG_IPFILTER +#define CONFIG_IPFILTER 0 +#endif + +#define ETHERNET +#define LINK_MTU 1536 + +/* UDP-only profile in intent: the application does not call + * wolfIP_sock_socket() with IPSTACK_SOCK_STREAM. MAX_TCPSOCKETS is set + * to a small non-zero value only because core wolfIP currently sizes + * its timer heap via MAX_TIMERS = MAX_TCPSOCKETS * 3, and DHCP / ARP + * aging need timers. 
With MAX_TCPSOCKETS=0 the timer-heap insert path + * is permanently full and DHCP cannot schedule its retransmit timer. + * A core wolfIP follow-up should decouple MAX_TIMERS from + * MAX_TCPSOCKETS so DAL-C builds can truly opt TCP code out at + * compile time. */ +#define MAX_TCPSOCKETS 2 +#define MAX_UDPSOCKETS 4 +#define MAX_ICMPSOCKETS 1 +#define RXBUF_SIZE (LINK_MTU * 4) +#define TXBUF_SIZE (LINK_MTU * 4) + +#define MAX_NEIGHBORS 16 + +#ifndef WOLFIP_MAX_INTERFACES +#define WOLFIP_MAX_INTERFACES 1 +#endif + +#ifndef WOLFIP_ENABLE_FORWARDING +#define WOLFIP_ENABLE_FORWARDING 0 +#endif + +#ifndef WOLFIP_ENABLE_LOOPBACK +#define WOLFIP_ENABLE_LOOPBACK 0 +#endif + +#ifndef WOLFIP_ENABLE_DHCP +#define WOLFIP_ENABLE_DHCP 1 +#endif + +/* Static IP fallback (used if DHCP is disabled or times out). */ +#define WOLFIP_IP "192.168.1.100" +#define WOLFIP_NETMASK "255.255.255.0" +#define WOLFIP_GW "192.168.1.1" +#define WOLFIP_STATIC_DNS_IP "8.8.8.8" + +#if WOLFIP_ENABLE_DHCP +#define DHCP +#define DHCP_DISCOVER_RETRIES 2 +#define DHCP_REQUEST_RETRIES 2 +#endif + +/* Hardware debug: define for verbose GEM / MDIO / DHCP logging. */ +/* #define DEBUG_HW */ + +#endif /* WOLF_CONFIG_H */ diff --git a/src/port/zynq7000/entropy.c b/src/port/zynq7000/entropy.c new file mode 100644 index 00000000..b634a785 --- /dev/null +++ b/src/port/zynq7000/entropy.c @@ -0,0 +1,118 @@ +/* entropy.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * MemUse-pattern entropy source for the wolfIP Zynq-7000 port. + * + * This port does not use a hardware TRNG. Instead, this source + * produces non-deterministic 32-bit words by sampling the ARMv7 + * virtual count register (CNTVCT, read with MRRC p15, 1, c14) + * before and after a memory-access loop that walks a 1024-byte + * state buffer with a data-dependent index. + * The cache-miss / line-fill / write-allocate timing + * variance is the entropy source - the same primitive wolfCrypt's + * wc_Entropy_Get() (HAVE_ENTROPY_MEMUSE in wolfssl/wolfcrypt/src/ + * wolfentropy.c) uses internally. + * + * This implementation skips wolfCrypt's SHA3-256 conditioning because + * the consumers in wolfIP (TCP ISN, DHCP/DNS transaction IDs, + * ephemeral source ports, IP fragment ID) need unpredictable bits, + * not uniformly-distributed cryptographic randomness. For crypto- + * grade seeding the port should be rebuilt with the full wolfCrypt + * wc_Entropy_Get() in place of zynq7000_get_random32(). + * + * Algorithm per call: + * 1. t0 = CNTVCT + * 2. Walk state[] performing read+xor+write; the 256 accesses are + * spread over the buffer's cache lines by a data-dependent index. + * 3. t1 = CNTVCT + * 4. Fold (t1 - t0) into the rolling 64-bit accumulator and + * perturb state[] so the next call diverges. + * 5. Apply a non-cryptographic finaliser (xorshift) and return + * the low 32 bits. + * + * The state buffer is 1024 bytes. That sizing was chosen for 64-byte + * cache lines (16 lines); NOTE(review): Cortex-A9 L1 lines are 32 bytes, + * so confirm the walk still yields usable timing variance on this core. 
+ */ +#include + +#define ENTROPY_STATE_WORDS 128u /* 1024 bytes, 16 cache lines */ +#define ENTROPY_WALK_ITERS 256u + +static volatile uint64_t entropy_state[ENTROPY_STATE_WORDS]; +static volatile uint64_t entropy_acc; +static volatile uint32_t entropy_idx; + +static inline uint64_t cntvct_el0(void) +{ + /* ARMv7 generic timer 64-bit virtual count via + * MRRC p15, 1, Rlo, Rhi, c14. */ + uint32_t lo, hi; + __asm__ volatile ("mrrc p15, 1, %0, %1, c14" : "=r"(lo), "=r"(hi)); + return ((uint64_t)hi << 32) | lo; +} + +/* Return a 32-bit value with low predictability, suitable for + * protocol identifiers (DHCP xid, DNS id, TCP ISN, ephemeral port, + * IP fragment id). Not crypto-grade; see file header. */ +uint32_t zynq7000_get_random32(void) +{ + uint64_t t0, t1, delta; + uint64_t acc; + uint32_t i; + uint32_t walk_idx; + + t0 = cntvct_el0(); + + /* Memory-access loop: stride through the state array. Using a + * data-dependent index (acc & mask) keeps the prefetcher from + * predicting cache lines, which is exactly the timing noise we + * want to harvest. */ + acc = entropy_acc; + walk_idx = entropy_idx; + for (i = 0; i < ENTROPY_WALK_ITERS; i++) { + uint32_t pos = (walk_idx + (uint32_t)(acc & 0x7Fu)) + & (ENTROPY_STATE_WORDS - 1u); + uint64_t v = entropy_state[pos]; + v ^= acc; + v = (v << 1) | (v >> 63); /* rotate left 1 */ + entropy_state[pos] = v; + acc += v; + walk_idx++; + } + + t1 = cntvct_el0(); + delta = t1 - t0; + + /* Fold the timing delta into the accumulator and the head of + * the state ring. */ + acc ^= delta; + acc ^= (delta << 17) | (delta >> 47); + entropy_state[walk_idx & (ENTROPY_STATE_WORDS - 1u)] ^= acc; + entropy_acc = acc; + entropy_idx = walk_idx; + + /* xorshift64 finaliser to whiten the output word. 
*/ + acc ^= acc << 13; + acc ^= acc >> 7; + acc ^= acc << 17; + + return (uint32_t)acc; +} diff --git a/src/port/zynq7000/gem.c b/src/port/zynq7000/gem.c new file mode 100644 index 00000000..d8b0062c --- /dev/null +++ b/src/port/zynq7000/gem.c @@ -0,0 +1,761 @@ +/* gem.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Cadence GEM driver for Zynq-7000 GEM0 (on-board RJ45 typical). + * + * - 32-bit DMA addressing (OCM low bank or DDR, both < 4 GB). + * - IRQ-driven RX via GIC INTID 54 (board.h IRQ_GEM0) -> gem_isr; + * polled TX. The register layout matches ZynqMP's GEM3 except + * Zynq-7000 has an older GEM revision with 32-bit BDs by default + * (no DMACR DMA_ADDR_BUS_WIDTH_64 bit needed -- BDs stay 8 bytes). + * The NWCFG.DBUS_WIDTH_64 bit is unused on Zynq-7000 (the PS bus + * width is fixed at 32 bits for the A9 AXI master path); the + * constant is kept for source-compatibility but never set. + * + * Register set per Zynq-7000 TRM (UG585) chapter 16 / Cadence GEM. + * + * UNTESTED ON HARDWARE -- structural scaffold. 
+ */
+#include <stdint.h>
+#include <string.h>
+#include "config.h"
+#include "../../../wolfip.h"
+#include "board.h"
+#include "uart.h"
+#include "gic.h"
+#include "gem.h"
+#include "phy_dp83867.h"
+
+/* Cache maintenance helpers for GEM DMA coherency.  The Cortex-A9 L1
+ * data-cache line is 32 bytes, so the by-MVA loops below must step in
+ * 32-byte strides; a 64-byte stride would skip every second line.
+ * With D-cache enabled and BD/buffers in normal cacheable memory, CPU
+ * writes may sit in L1 D-cache and not be visible to the MAC's DMA
+ * path.  cache_clean() writes back dirty lines to memory before DMA
+ * reads; cache_inval() invalidates lines so subsequent CPU reads pull
+ * fresh DMA-written data.
+ * NOTE(review): these helpers only issue CP15 (inner cache) operations.
+ * If the Zynq-7000 outer L2 cache controller is enabled it needs its
+ * own maintenance - confirm the L2 state set up by startup.S/mmu.c. */
+#define CACHE_LINE 32u
+
+/* Clean (write back) every D-cache line overlapping [p, p + sz).
+ * The range is widened to whole cache lines. */
+static inline void cache_clean(const void *p, uint32_t sz)
+{
+    uintptr_t start = (uintptr_t)p & ~(CACHE_LINE - 1u);
+    uintptr_t end   = ((uintptr_t)p + sz + CACHE_LINE - 1u) & ~(CACHE_LINE - 1u);
+    uintptr_t a;
+    /* ARMv7 DCCMVAC (Clean Data cache by MVA to PoC):
+     *   MCR p15, 0, Rt, c7, c10, 1 */
+    for (a = start; a < end; a += CACHE_LINE)
+        __asm__ volatile ("mcr p15, 0, %0, c7, c10, 1" :: "r"(a) : "memory");
+    __asm__ volatile ("dsb" ::: "memory");
+}
+
+/* Invalidate every D-cache line overlapping [p, p + sz).
+ * The range is widened to whole cache lines, so any dirty CPU data
+ * sharing a line with the range is discarded as well. */
+static inline void cache_inval(const void *p, uint32_t sz)
+{
+    uintptr_t start = (uintptr_t)p & ~(CACHE_LINE - 1u);
+    uintptr_t end   = ((uintptr_t)p + sz + CACHE_LINE - 1u) & ~(CACHE_LINE - 1u);
+    uintptr_t a;
+    /* ARMv7 DCIMVAC (Invalidate Data cache by MVA to PoC):
+     *   MCR p15, 0, Rt, c7, c6, 1 */
+    for (a = start; a < end; a += CACHE_LINE)
+        __asm__ volatile ("mcr p15, 0, %0, c7, c6, 1" :: "r"(a) : "memory");
+    __asm__ volatile ("dsb" ::: "memory");
+}
+
+/* ---------------------------------------------------------------------
+ * Register offsets and bit masks (subset we use)
+ * ------------------------------------------------------------------- */
+#define GEM_NWCTRL (*(volatile uint32_t *)(GEM0_BASE + 0x000))
+#define GEM_NWCFG (*(volatile uint32_t *)(GEM0_BASE + 0x004))
+#define GEM_NWSR (*(volatile uint32_t *)(GEM0_BASE + 0x008))
+#define GEM_DMACR (*(volatile uint32_t *)(GEM0_BASE + 0x010))
+#define GEM_TSR 
(*(volatile uint32_t *)(GEM0_BASE + 0x014)) +#define GEM_RXQBASE (*(volatile uint32_t *)(GEM0_BASE + 0x018)) +#define GEM_TXQBASE (*(volatile uint32_t *)(GEM0_BASE + 0x01C)) +#define GEM_RSR (*(volatile uint32_t *)(GEM0_BASE + 0x020)) +#define GEM_ISR (*(volatile uint32_t *)(GEM0_BASE + 0x024)) +#define GEM_IER (*(volatile uint32_t *)(GEM0_BASE + 0x028)) +#define GEM_IDR (*(volatile uint32_t *)(GEM0_BASE + 0x02C)) +#define GEM_PHYMNTNC (*(volatile uint32_t *)(GEM0_BASE + 0x034)) +#define GEM_HASHL (*(volatile uint32_t *)(GEM0_BASE + 0x080)) +#define GEM_HASHH (*(volatile uint32_t *)(GEM0_BASE + 0x084)) +#define GEM_LADDR1L (*(volatile uint32_t *)(GEM0_BASE + 0x088)) +#define GEM_LADDR1H (*(volatile uint32_t *)(GEM0_BASE + 0x08C)) +/* Priority queue base addresses (queues 1-3). Cadence GEM has 4 TX + * and 4 RX priority queues; if we don't point unused ones at a safe + * dummy BD, the MAC will eventually try to fetch from queue1+ at + * power-on-random addresses and hang (TSR.TXGO sticks with no octets + * transmitted). U-Boot's zynq_gem and Linux's macb both set these. 
*/ +#define GEM_TXQ1BASE (*(volatile uint32_t *)(GEM0_BASE + 0x440)) +#define GEM_TXQ2BASE (*(volatile uint32_t *)(GEM0_BASE + 0x444)) +#define GEM_TXQ3BASE (*(volatile uint32_t *)(GEM0_BASE + 0x448)) +#define GEM_RXQ1BASE (*(volatile uint32_t *)(GEM0_BASE + 0x480)) +#define GEM_RXQ2BASE (*(volatile uint32_t *)(GEM0_BASE + 0x484)) +#define GEM_RXQ3BASE (*(volatile uint32_t *)(GEM0_BASE + 0x488)) +#define GEM_OCTTXL (*(volatile uint32_t *)(GEM0_BASE + 0x100)) +#define GEM_TXCNT (*(volatile uint32_t *)(GEM0_BASE + 0x108)) +#define GEM_OCTRXL (*(volatile uint32_t *)(GEM0_BASE + 0x150)) +#define GEM_RXCNT (*(volatile uint32_t *)(GEM0_BASE + 0x158)) +#define GEM_RXFCSCNT (*(volatile uint32_t *)(GEM0_BASE + 0x190)) +#define GEM_RXORCNT (*(volatile uint32_t *)(GEM0_BASE + 0x1A4)) + +#define NWCTRL_LOOPEN (1u << 1) +#define NWCTRL_RXEN (1u << 2) +#define NWCTRL_TXEN (1u << 3) +#define NWCTRL_MDEN (1u << 4) +#define NWCTRL_STATCLR (1u << 5) +#define NWCTRL_STARTTX (1u << 9) +#define NWCTRL_HALTTX (1u << 10) + +#define NWCFG_SPEED100 (1u << 0) +#define NWCFG_FDEN (1u << 1) +#define NWCFG_COPYALL (1u << 4) +#define NWCFG_BCASTDI (1u << 5) +#define NWCFG_MCASTHASHEN (1u << 6) +#define NWCFG_UCASTHASHEN (1u << 7) +#define NWCFG_1536RXEN (1u << 8) +#define NWCFG_1000 (1u << 10) +#define NWCFG_FCSREM (1u << 17) +#define NWCFG_MDCDIV_SHIFT 18u +#define NWCFG_MDCDIV_MASK (7u << 18) +#define NWCFG_DWIDTH_64 (1u << 21) /* Data bus width = 64 bit (AArch64) */ + +#define NWSR_PHY_IDLE (1u << 2) + +#define RSR_BUFFNA (1u << 0) +#define RSR_FRAMERX (1u << 1) +#define RSR_RXOVR (1u << 2) +#define RSR_HRESPNOK (1u << 3) + +#define IXR_MGMNT (1u << 0) +#define IXR_FRAMERX (1u << 1) +#define IXR_TXCOMPL (1u << 7) +#define IXR_TXEXH (1u << 6) +#define IXR_RXUSED (1u << 2) +#define IXR_RXOVR (1u << 10) +#define IXR_HRESPNOK (1u << 11) + +#define PHYMNTNC_CLAUSE22 0x40020000u +#define PHYMNTNC_OP_R (2u << 28) +#define PHYMNTNC_OP_W (1u << 28) + +#define RXBUF_OWN_SW (1u << 0) +#define 
RXBUF_WRAP (1u << 1) +#define RXBUF_ADDR_MASK 0xFFFFFFFCu +#define RXBUF_LEN_MASK 0x00001FFFu + +#define TXBUF_USED (1u << 31) +#define TXBUF_WRAP (1u << 30) +#define TXBUF_LAST (1u << 15) +#define TXBUF_LEN_MASK 0x00003FFFu + +/* --------------------------------------------------------------------- + * BD ring and frame buffer sizing + * ------------------------------------------------------------------- */ +/* Ring sizes deliberately small to fit text + DMA buffers + BSS in + * 256 KB OCM (we keep everything in OCM because DDR-via-JTAG isn't + * reliable without PMU FW running). For higher throughput, bump + * these once we move BSS back to DDR. */ +#define RX_RING_LEN 16 +#define TX_RING_LEN 8 +#define BUF_LEN 1536 /* multiple of 64, per DMACR.RXBS */ + +/* GEM BD: two 32-bit words. */ +struct gem_bd { + uint32_t addr; + uint32_t status; +}; + +/* All DMA-visible objects go in .dma_buffers (Device-nGnRnE per MMU). */ +static struct gem_bd rx_ring[RX_RING_LEN] + __attribute__((aligned(64), section(".dma_buffers"))); +static struct gem_bd tx_ring[TX_RING_LEN] + __attribute__((aligned(64), section(".dma_buffers"))); +static uint8_t rx_buf_pool[RX_RING_LEN][BUF_LEN] + __attribute__((aligned(64), section(".dma_buffers"))); +static uint8_t tx_buf_pool[TX_RING_LEN][BUF_LEN] + __attribute__((aligned(64), section(".dma_buffers"))); + +/* Dummy BD pair for disabling priority queues 1-3. The TX dummy has + * USED=1 so the MAC ignores it (refuses to transmit). The RX dummy + * has the SW-OWN/NEW bit set so MAC won't write into queue 1-3 RX. */ +static struct gem_bd dummy_tx_bd + __attribute__((aligned(8), section(".dma_buffers"))); +static struct gem_bd dummy_rx_bd + __attribute__((aligned(8), section(".dma_buffers"))); + +/* --------------------------------------------------------------------- + * Software RX queue: filled by ISR, drained by eth_poll() in the main + * loop. 
Single producer (ISR) / single consumer (main), so a lockless + * head/tail pair is safe when we use DSB to publish writes. + * + * Each slot stores a pointer to one of rx_buf_pool[i] plus length; + * the buffer's BD is recycled after the main loop hands the frame to + * wolfIP. + * ------------------------------------------------------------------- */ +#define SWQ_DEPTH 16 + +struct swq_slot { + uint8_t *buf; + uint16_t len; + uint16_t ring_idx; /* into rx_ring[] - recycle after consume */ +}; + +static volatile struct swq_slot swq[SWQ_DEPTH]; +static volatile uint32_t swq_head; /* ISR writes */ +static volatile uint32_t swq_tail; /* main reads */ +static volatile uint32_t rx_drops; /* ISR-side counter */ +static volatile uint32_t s_irq_count; +static volatile uint32_t s_rx_frames; +static volatile uint32_t s_tx_sent; + +static uint32_t rx_next; /* next BD the SW will look at */ +static uint32_t tx_next; /* next BD the SW will try to TX */ + +static uint8_t phy_addr_used; + +/* --------------------------------------------------------------------- + * MDIO + * ------------------------------------------------------------------- */ +static int mdio_wait_idle(void) +{ + int spin; + for (spin = 0; spin < 100000; spin++) { + if (GEM_NWSR & NWSR_PHY_IDLE) + return 0; + } + return -1; +} + +int gem_mdio_read(uint8_t phy_addr, uint8_t reg, uint16_t *out) +{ + uint32_t v; + if (mdio_wait_idle() < 0) + return -1; + v = PHYMNTNC_CLAUSE22 | PHYMNTNC_OP_R + | (((uint32_t)phy_addr & 0x1Fu) << 23) + | (((uint32_t)reg & 0x1Fu) << 18); + GEM_PHYMNTNC = v; + if (mdio_wait_idle() < 0) + return -2; + *out = (uint16_t)(GEM_PHYMNTNC & 0xFFFFu); + return 0; +} + +int gem_mdio_write(uint8_t phy_addr, uint8_t reg, uint16_t value) +{ + uint32_t v; + if (mdio_wait_idle() < 0) + return -1; + v = PHYMNTNC_CLAUSE22 | PHYMNTNC_OP_W + | (((uint32_t)phy_addr & 0x1Fu) << 23) + | (((uint32_t)reg & 0x1Fu) << 18) + | (uint32_t)value; + GEM_PHYMNTNC = v; + if (mdio_wait_idle() < 0) + return -2; + 
return 0; +} + +/* --------------------------------------------------------------------- + * BD ring init + * ------------------------------------------------------------------- */ +static void rx_ring_init(void) +{ + uint32_t i; + for (i = 0; i < RX_RING_LEN; i++) { + uint32_t addr = (uint32_t)(uintptr_t)rx_buf_pool[i]; + addr &= RXBUF_ADDR_MASK; + if (i == RX_RING_LEN - 1) + addr |= RXBUF_WRAP; + rx_ring[i].addr = addr; /* OWN=0 -> hardware can use */ + rx_ring[i].status = 0; + } + rx_next = 0; +} + +static void tx_ring_init(void) +{ + uint32_t i; + /* Match u-boot zynq_gem pattern: all BDs start as dummies with + * USED|LAST|WRAP, addr=0. eth_send fills in addr + length + LAST + * (and clears USED) when actually transmitting. The WRAP bit on + * the last BD keeps the MAC walker in our ring. */ + for (i = 0; i < TX_RING_LEN; i++) { + tx_ring[i].addr = 0; + tx_ring[i].status = TXBUF_USED | TXBUF_LAST + | ((i == TX_RING_LEN - 1) ? TXBUF_WRAP : 0); + } + tx_next = 0; +} + +/* --------------------------------------------------------------------- + * RX ISR + * ------------------------------------------------------------------- */ +static void gem_isr(void) +{ + uint32_t isr; + + s_irq_count++; + isr = GEM_ISR; + GEM_ISR = isr; /* clear-on-write */ + + /* Invalidate the WHOLE RX ring at entry - MAC may have written + * to any BD, not just rx_next. Cheap (one cache line typically + * since the ring is small). */ + cache_inval(rx_ring, sizeof(rx_ring)); + + /* Walk RX BDs whose SW-OWN bit is set (frame ready for software). */ + while (rx_ring[rx_next].addr & RXBUF_OWN_SW) { + s_rx_frames++; + /* Also invalidate the buffer before we copy from it. */ + cache_inval(rx_buf_pool[rx_next], + rx_ring[rx_next].status & RXBUF_LEN_MASK); + uint32_t status = rx_ring[rx_next].status; + uint32_t next_head = swq_head; + uint32_t slot = next_head % SWQ_DEPTH; + uint32_t depth = next_head - swq_tail; + + if (depth >= SWQ_DEPTH) { + /* SW queue full - drop and recycle the BD. 
*/ + rx_drops++; + } else { + swq[slot].buf = rx_buf_pool[rx_next]; + swq[slot].len = (uint16_t)(status & RXBUF_LEN_MASK); + swq[slot].ring_idx = (uint16_t)rx_next; + __asm__ volatile ("dsb" ::: "memory"); + swq_head = next_head + 1; + } + + /* If we have headroom in the SW queue we recycle the BD only + * after main consumes the slot (see eth_poll); when dropping we + * recycle here. */ + if (depth >= SWQ_DEPTH) { + uint32_t addr = (uint32_t)(uintptr_t)rx_buf_pool[rx_next]; + addr &= RXBUF_ADDR_MASK; + if (rx_next == RX_RING_LEN - 1) + addr |= RXBUF_WRAP; + rx_ring[rx_next].status = 0; + __asm__ volatile ("dsb" ::: "memory"); + rx_ring[rx_next].addr = addr; /* OWN=0 again */ + /* MAC reads BDs straight from memory; clean the line so it + * sees OWN=0, otherwise it skips past this BD and walks the + * ring leaving holes. */ + cache_clean(&rx_ring[rx_next], sizeof(rx_ring[rx_next])); + } + rx_next = (rx_next + 1) % RX_RING_LEN; + } + + /* RXUSED recovery: clear BUFFNA. With cache_clean on the recycle + * path, this should be rare; when it happens, also kick the RX + * path so the MAC re-walks the ring. */ + if (isr & IXR_RXUSED) { + GEM_RSR = RSR_BUFFNA; + } + if (isr & IXR_RXOVR) { + GEM_RSR = RSR_RXOVR; + } +} + +/* --------------------------------------------------------------------- + * eth_poll / eth_send (called from wolfIP_poll and stack TX path) + * ------------------------------------------------------------------- */ +static int eth_poll(struct wolfIP_ll_dev *ll, void *buf, uint32_t len) +{ + uint32_t tail; + uint32_t slot; + uint32_t copy; + uint32_t addr; + uint16_t idx; + + (void)ll; + + /* RX frames are delivered into swq[] by gem_isr() running off the + * GIC-400 INTID 95 IRQ path (see startup.S SCR_EL3 setup and + * board.h IRQ_GEM0). eth_poll just drains the SW queue here. 
*/ + tail = swq_tail; + if (tail == swq_head) + return 0; /* SW queue empty */ + + slot = tail % SWQ_DEPTH; + copy = swq[slot].len; + if (copy > len) + copy = len; + memcpy(buf, swq[slot].buf, copy); + + /* Recycle the BD back to hardware. */ + idx = swq[slot].ring_idx; + addr = (uint32_t)(uintptr_t)rx_buf_pool[idx]; + addr &= RXBUF_ADDR_MASK; + if (idx == RX_RING_LEN - 1) + addr |= RXBUF_WRAP; + rx_ring[idx].status = 0; + __asm__ volatile ("dsb" ::: "memory"); + rx_ring[idx].addr = addr; /* OWN bit cleared = HW can write */ + /* MAC walks BDs from main memory (not coherent with CPU D-cache); + * push the OWN=0 store out so the MAC will reuse this slot. */ + cache_clean(&rx_ring[idx], sizeof(rx_ring[idx])); + + __asm__ volatile ("dsb" ::: "memory"); + swq_tail = tail + 1; + + return (int)copy; +} + +static int eth_send(struct wolfIP_ll_dev *ll, void *buf, uint32_t len) +{ + uint32_t idx; + uint32_t status; + + (void)ll; + + if (len > BUF_LEN) + return -1; + + idx = tx_next; + /* Wait briefly for the BD to be free (USED=1 means MAC done). The + * USED bit is written back by MAC DMA - invalidate the cache line + * so the CPU does not see the stale USED=0 we wrote when we last + * armed this BD. */ + { + int spin; + for (spin = 0; spin < 100000; spin++) { + cache_inval(&tx_ring[idx], sizeof(tx_ring[idx])); + if (tx_ring[idx].status & TXBUF_USED) + break; + } + if ((tx_ring[idx].status & TXBUF_USED) == 0) + return -2; /* TX ring backed up - tell caller to retry */ + } + + memcpy(tx_buf_pool[idx], buf, len); + + /* Pad to minimum Ethernet frame (60 bytes; MAC adds 4-byte FCS). */ + if (len < 60u) { + memset(tx_buf_pool[idx] + len, 0, 60u - len); + len = 60u; + } + + /* Flush the frame buffer from D-cache so MAC DMA reads see it. */ + cache_clean(tx_buf_pool[idx], len); + + /* Re-arm BD: set buffer address, then clear USED with length+LAST + * (preserve WRAP if this is the last BD). Buffer addr written + * before status so MAC walking the ring sees a valid pair. 
*/ + tx_ring[idx].addr = (uint32_t)(uintptr_t)tx_buf_pool[idx]; + status = (len & TXBUF_LEN_MASK) | TXBUF_LAST; + if (idx == TX_RING_LEN - 1) + status |= TXBUF_WRAP; + tx_ring[idx].status = status; /* USED=0 -> ready for MAC */ + + /* Flush BD update so MAC sees USED=0. */ + cache_clean(&tx_ring[idx], sizeof(tx_ring[idx])); + GEM_NWCTRL |= NWCTRL_STARTTX; + + s_tx_sent++; + tx_next = (idx + 1) % TX_RING_LEN; + return (int)len; +} + +uint32_t gem_irq_count(void) { return s_irq_count; } +uint32_t gem_rx_frames(void) { return s_rx_frames; } +uint32_t gem_tx_sent(void) { return s_tx_sent; } + +void gem_dump_state(void) +{ + uint32_t i; + cache_inval(rx_ring, sizeof(rx_ring)); + cache_inval(tx_ring, sizeof(tx_ring)); + uart_puts("GEM3 regs: NWCTRL="); uart_puthex(GEM_NWCTRL); + uart_puts(" NWCFG="); uart_puthex(GEM_NWCFG); + uart_puts(" NWSR="); uart_puthex(GEM_NWSR); + uart_puts(" DMACR="); uart_puthex(GEM_DMACR); + uart_puts("\n ISR="); uart_puthex(GEM_ISR); + uart_puts(" RSR="); uart_puthex(GEM_RSR); + uart_puts(" TSR="); uart_puthex(GEM_TSR); + uart_puts(" IMR="); uart_puthex(*(volatile uint32_t *)(GEM0_BASE + 0x030)); + uart_puts("\n tx[0]="); uart_puthex(tx_ring[0].addr); + uart_puts("/"); uart_puthex(tx_ring[0].status); + uart_puts(" rx[0]="); uart_puthex(rx_ring[0].addr); + uart_puts("/"); uart_puthex(rx_ring[0].status); + uart_puts("\n irq="); uart_putdec(s_irq_count); + uart_puts(" rx_frm="); uart_putdec(s_rx_frames); + uart_puts(" tx_snt="); uart_putdec(s_tx_sent); + uart_puts(" drops="); uart_putdec(rx_drops); + uart_puts("\n HW counters: txoct="); uart_putdec(GEM_OCTTXL); + uart_puts(" txcnt="); uart_putdec(GEM_TXCNT); + uart_puts(" rxoct="); uart_putdec(GEM_OCTRXL); + uart_puts(" rxcnt="); uart_putdec(GEM_RXCNT); + uart_puts(" rxfcs="); uart_putdec(GEM_RXFCSCNT); + uart_puts(" rxor="); uart_putdec(GEM_RXORCNT); + { + uint32_t filled = 0; + uint32_t first_filled = 0xFFFF; + for (i = 0; i < RX_RING_LEN; i++) { + if (rx_ring[i].addr & RXBUF_OWN_SW) { + 
filled++; + if (first_filled == 0xFFFF) first_filled = i; + } + } + uart_puts(" rx_filled="); uart_putdec(filled); + uart_puts(" first="); uart_putdec(first_filled); + uart_puts(" rx_next="); uart_putdec(rx_next); + } + uart_puts("\n"); +} + +/* --------------------------------------------------------------------- + * Clock + reset for GEM3 via CRL_APB. + * + * For the stock ZCU102 boot flow, FSBL has already configured GEM3: + * - CRL_APB.GEM3_REF_CTRL -> 125 MHz from IOPLL or RPLL + * - CRL_APB.RST_LPD_IOU0 -> GEM3 out of reset + * - IOU_SLCR MIO 64..77 -> GEM3 RGMII + MDIO pin muxing + * + * We pulse the GEM3 reset bit so the MAC starts from a known state + * without touching the clock control (which would race with FSBL's + * setup of PLLs). + * + * NOTE(review): the register names and bit positions in this section + * are the ZynqMP CRL_APB ones inherited from the ZCU102 port, but the + * code below writes SLCR_GEM0_CLK_CTRL / SLCR_GEM_RST_CTRL. Confirm + * the reset bit, the clock-control bit layout and any SLCR + * write-protect unlock against the Zynq-7000 TRM (UG585) before + * hardware bring-up. + * ------------------------------------------------------------------- */ +#define CRL_RST_GEM3 (1u << 3) + +/* Configure CRL_APB.GEM3_REF_CTRL for the negotiated link speed. The + * MAC sources TX_CLK to the PHY at this rate (RGMII): 125 MHz for + * 1 Gbps, 25 MHz for 100 Mbps, 2.5 MHz for 10 Mbps. PetaLinux/FSBL + * may pre-program this for a different speed than we want; both + * U-Boot and Linux re-program it whenever PHY link speed changes. + * + * IOPLL = 1500 MHz on ZCU102 (FSBL default). + * 1500 / 12 / 1 = 125 MHz (1000) + * 1500 / 12 / 5 = 25 MHz (100) + * 1500 / 12 / 50 = 2.5 MHz (10) + * + * Register layout (TRM): CLKACT bit26, CLKACT_RX bit25, + * DIVISOR1 bits[21:16], DIVISOR0 bits[13:8], SRCSEL bits[2:0]. + * + * NOTE(review): the layout and PLL figures above are the ZCU102 + * values. UG585 appears to list SLCR.GEM0_CLK_CTRL as CLKACT bit0, + * SRCSEL bits[6:4], DIVISOR bits[13:8], DIVISOR1 bits[25:20] -- TODO + * confirm, and re-derive the divisors for this board's IO PLL rate. 
*/ +static void gem3_set_ref_clk(int speed_mbps) +{ + volatile uint32_t *gem3_ref = (volatile uint32_t *)SLCR_GEM0_CLK_CTRL; + uint32_t div1; + uint32_t val; + + switch (speed_mbps) { + case 1000: div1 = 1; break; + case 100: div1 = 5; break; + case 10: div1 = 50; break; + default: div1 = 1; break; + } + val = (1u << 26) /* CLKACT */ + | (1u << 25) /* CLKACT_RX */ + | ((div1 & 0x3Fu) << 16) /* DIVISOR1 */ + | ((12u & 0x3Fu) << 8) /* DIVISOR0 */ + | (0u); /* SRCSEL = IOPLL */ + *gem3_ref = val; +} + +static void gem3_hw_reset(void) +{ + volatile uint32_t *rst = (volatile uint32_t *)SLCR_GEM_RST_CTRL; + volatile uint32_t *gem3ref = (volatile uint32_t *)SLCR_GEM0_CLK_CTRL; + + uart_puts("GEM3 clk before: GEM3_REF_CTRL="); + uart_puthex(*gem3ref); + uart_puts(" RST_LPD_IOU0="); + uart_puthex(*rst); + uart_puts("\n"); + + *rst |= CRL_RST_GEM3; + { + volatile int d; + for (d = 0; d < 10000; d++) + ; + } + *rst &= ~CRL_RST_GEM3; + { + volatile int d; + for (d = 0; d < 100000; d++) /* ~10 ms post-reset settle */ + ; + } + + /* Force 125 MHz reference for the 1 Gbps case. zcu102_eth_init() + * downshifts this later if the PHY ends up at 100/10. */ + gem3_set_ref_clk(1000); + uart_puts("GEM3 clk after : GEM3_REF_CTRL="); + uart_puthex(*gem3ref); + uart_puts("\n"); +} + +/* --------------------------------------------------------------------- + * Public init + * ------------------------------------------------------------------- */ +int zcu102_eth_init(struct wolfIP_ll_dev *ll) +{ + uint8_t addr; + uint16_t id1; + int found_phy; + int speed; + int fd; + int link_up; + + gem3_hw_reset(); + + /* Disable everything before configuring. 
*/ + GEM_NWCTRL = 0; + GEM_IDR = 0xFFFFFFFFu; + (void)GEM_ISR; + GEM_ISR = 0xFFFFFFFFu; + GEM_TSR = 0xFFFFFFFFu; + GEM_RSR = RSR_BUFFNA | RSR_FRAMERX | RSR_RXOVR | RSR_HRESPNOK; + + /* Initial NWCFG: gigabit, full duplex, MDC=/96, 1536-byte frames, + * strip FCS from RX, accept broadcasts, multicast via hash, + * DWIDTH_64 because ZynqMP GEM hangs on a 64-bit AXI bus and + * needs this bit for TX to actually transmit (matches U-Boot + * ZYNQ_GEM_DBUS_WIDTH for CONFIG_ARM64). + * COPYALL temporarily on for first-bring-up so we can confirm + * the RX path is alive even if filtering is mis-set. */ + GEM_NWCFG = NWCFG_1000 + | NWCFG_FDEN + | NWCFG_FCSREM + | NWCFG_1536RXEN + | NWCFG_MCASTHASHEN + | NWCFG_COPYALL + | NWCFG_DWIDTH_64 + | (5u << NWCFG_MDCDIV_SHIFT); + + /* DMACR: AHB fixed burst 16 beats, RX buffer 1536/64=24, TX/RX + * packet buffer memory at max. Do NOT set bit 30 (DMA_ADDR_BUS_WIDTH + * 64-bit): that selects 16-byte BD format with addr_hi, which would + * break the 8-byte struct gem_bd layout (MAC would walk every other + * BD and write to bogus high addresses, dropping the frame after + * counting it - exactly the failure mode we hit). 64-bit AXI bus + * width is set in NWCFG bit 21 instead. */ + GEM_DMACR = (24u << 16) /* RX buffer size in 64-byte units */ + | (1u << 10) /* TX packet buffer memory size = max */ + | (3u << 8) /* RX packet buffer memory size = max */ + | 0x10u; /* burst length = 16 */ + + /* Set MAC address into SAB1/SAT1. SAB1L writes are latched on + * SAB1H write per TRM, so write the high half last. */ + GEM_LADDR1L = (uint32_t)WOLFIP_MAC_0 + | ((uint32_t)WOLFIP_MAC_1 << 8) + | ((uint32_t)WOLFIP_MAC_2 << 16) + | ((uint32_t)WOLFIP_MAC_3 << 24); + GEM_LADDR1H = (uint32_t)WOLFIP_MAC_4 + | ((uint32_t)WOLFIP_MAC_5 << 8); + + GEM_HASHL = 0; + GEM_HASHH = 0; + + /* Build BD rings. 
*/ + rx_ring_init(); + tx_ring_init(); + GEM_RXQBASE = (uint32_t)(uintptr_t)rx_ring; + GEM_TXQBASE = (uint32_t)(uintptr_t)tx_ring; + + /* Disable priority queues 1-3 with dummy BDs. Without this, the + * MAC may walk uninitialised q1/q2/q3 base pointers and hang + * (TSR.TXGO sticks but no octets transmitted). */ + dummy_tx_bd.addr = 0; + dummy_tx_bd.status = TXBUF_USED | TXBUF_WRAP | TXBUF_LAST; + dummy_rx_bd.addr = RXBUF_WRAP | RXBUF_OWN_SW; + dummy_rx_bd.status = 0; + GEM_TXQ1BASE = (uint32_t)(uintptr_t)&dummy_tx_bd; + GEM_TXQ2BASE = (uint32_t)(uintptr_t)&dummy_tx_bd; + GEM_TXQ3BASE = (uint32_t)(uintptr_t)&dummy_tx_bd; + GEM_RXQ1BASE = (uint32_t)(uintptr_t)&dummy_rx_bd; + GEM_RXQ2BASE = (uint32_t)(uintptr_t)&dummy_rx_bd; + GEM_RXQ3BASE = (uint32_t)(uintptr_t)&dummy_rx_bd; + cache_clean(&dummy_tx_bd, sizeof(dummy_tx_bd)); + cache_clean(&dummy_rx_bd, sizeof(dummy_rx_bd)); + + /* Clear any stale RX/TX packet classification screening. ZynqMP + * GEM has SCREENING_TYPE_1 (TID match) at 0x500+ and SCREENING_TYPE_2 + * (compare) at 0x540+. If non-zero, frames may be routed to non-Q0 + * queues. Default 0 = all to Q0. */ + { + uint32_t k; + for (k = 0; k < 16; k++) { + *(volatile uint32_t *)(GEM0_BASE + 0x500 + 4*k) = 0; + *(volatile uint32_t *)(GEM0_BASE + 0x540 + 4*k) = 0; + } + } + + /* Enable MDIO so we can talk to the PHY. */ + GEM_NWCTRL |= NWCTRL_MDEN; + + /* Probe MDIO addresses 0..31 for a responsive PHY. ZCU102 routes + * DP83867 to MDIO address 0x0C, but probing makes the driver + * resilient to board variants. 
*/ + found_phy = 0; + for (addr = 0; addr < 32; addr++) { + if (gem_mdio_read(addr, 0x02, &id1) == 0 && id1 != 0xFFFFu && id1 != 0) { + found_phy = 1; + break; + } + } + if (!found_phy) { + uart_puts("GEM3: no PHY responding on MDIO!\n"); + return -10; + } + phy_addr_used = addr; + uart_puts("GEM3: PHY at MDIO addr="); + uart_puthex(phy_addr_used); + uart_puts("\n"); + + if (dp83867_init(phy_addr_used, &speed, &fd) < 0) { + uart_puts("GEM3: PHY init failed\n"); + return -11; + } + + /* If PHY ended up at 10/100, downshift the MAC and re-program the + * GEM3 reference clock to match (125 MHz / 25 MHz / 2.5 MHz). */ + if (speed != 1000) { + uint32_t cfg = GEM_NWCFG; + cfg &= ~NWCFG_1000; + if (speed == 100) + cfg |= NWCFG_SPEED100; + else + cfg &= ~NWCFG_SPEED100; + if (!fd) + cfg &= ~NWCFG_FDEN; + GEM_NWCFG = cfg; + gem3_set_ref_clk(speed); + } + + /* Install RX ISR. */ + gic_register_handler(IRQ_GEM0, gem_isr); + gic_enable_spi(IRQ_GEM0, 0xA0); + + /* Enable RX/TX and arm RX-side interrupts. */ + GEM_IER = IXR_FRAMERX | IXR_RXUSED | IXR_RXOVR | IXR_HRESPNOK; + GEM_NWCTRL |= NWCTRL_RXEN | NWCTRL_TXEN; + + /* Populate wolfIP ll_dev. */ + ll->mac[0] = WOLFIP_MAC_0; + ll->mac[1] = WOLFIP_MAC_1; + ll->mac[2] = WOLFIP_MAC_2; + ll->mac[3] = WOLFIP_MAC_3; + ll->mac[4] = WOLFIP_MAC_4; + ll->mac[5] = WOLFIP_MAC_5; + memcpy(ll->ifname, "eth0", 5); + ll->non_ethernet = 0; + ll->mtu = LINK_MTU; + ll->poll = eth_poll; + ll->send = eth_send; + ll->priv = NULL; + + link_up = (dp83867_link_status(phy_addr_used) == 1) ? 1 : 0; + return (link_up << 8) | (int)phy_addr_used; +} diff --git a/src/port/zynq7000/gem.h b/src/port/zynq7000/gem.h new file mode 100644 index 00000000..48aa8b33 --- /dev/null +++ b/src/port/zynq7000/gem.h @@ -0,0 +1,35 @@ +/* gem.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * Cadence GEM driver for Xilinx UltraScale+ MPSoC GEM3 (on-board RJ45 + * on ZCU102). 
Single-instance, RGMII, gigabit, polled TX, IRQ-driven + * RX. + */ +#ifndef ZCU102_GEM_H +#define ZCU102_GEM_H + +#include +#include "../../../wolfip.h" + +/* Initialize GEM3, MMIO clock + reset, PHY, and populate the wolfIP + * link-layer device. Returns: + * < 0 on error (negated TRM code) + * bits [7:0] PHY MDIO address used + * bit [8] link_up flag (1 = link is up at end of init) + */ +int zcu102_eth_init(struct wolfIP_ll_dev *ll); + +/* MDIO helpers exposed for the PHY driver (phy_dp83867.c). */ +int gem_mdio_read(uint8_t phy_addr, uint8_t reg, uint16_t *out); +int gem_mdio_write(uint8_t phy_addr, uint8_t reg, uint16_t value); + +/* Diagnostics: dump GEM registers and counters to UART. */ +void gem_dump_state(void); +uint32_t gem_irq_count(void); +uint32_t gem_rx_frames(void); +uint32_t gem_tx_sent(void); + +#endif /* ZCU102_GEM_H */ diff --git a/src/port/zynq7000/gic.c b/src/port/zynq7000/gic.c new file mode 100644 index 00000000..3c309b93 --- /dev/null +++ b/src/port/zynq7000/gic.c @@ -0,0 +1,213 @@ +/* gic.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * GIC-390 (ARM GICv2) minimal driver for Cortex-A9 on Zynq-7000. 
+ * + * Functionally identical to src/port/zcu102/gic.c -- both are + * GICv2-compatible distributor+CPU IF designs. Only the base + * addresses (in board.h) differ. The CPU runs Secure by default on + * Zynq-7000 (no EL3 vs Secure-EL1 distinction); the ZCU102 startup + * code's SCR_EL3 routing fix is therefore not applicable, and the + * IRQ exception path comes through the ARMv7 vector at + * VBAR + 0x18 (see startup.S). The same "GIC INTID == TRM + * interrupt ID" convention applies (no GIC_SPI offset shift). + * + * UNTESTED ON HARDWARE -- structural scaffold. + * Configures all SPIs as Group 0 (IGROUPR bits cleared), level- + * triggered, targeted at CPU0, priority 0xA0. With GICC_CTLR.FIQEn=0 + * a pending Group 0 interrupt is delivered as IRQ, not FIQ. Only + * the SPIs explicitly enabled via gic_enable_spi() will fire. The + * IRQ vector in startup.S funnels into irq_dispatch() here, which + * reads IAR, routes to the registered C handler, and EOIs. + * + * No assumptions about a previous BSP - we initialize the distributor + * and CPU interface from scratch. 
+ */
+#include <stdint.h>
+#include "board.h"
+#include "gic.h"
+
+/* Distributor registers */
+#define GICD_CTLR          (*(volatile uint32_t *)(GICD_BASE + 0x000))
+#define GICD_TYPER         (*(volatile uint32_t *)(GICD_BASE + 0x004))
+#define GICD_IGROUPR(n)    (*(volatile uint32_t *)(GICD_BASE + 0x080 + 4*(n)))
+#define GICD_ISENABLER(n)  (*(volatile uint32_t *)(GICD_BASE + 0x100 + 4*(n)))
+#define GICD_ICENABLER(n)  (*(volatile uint32_t *)(GICD_BASE + 0x180 + 4*(n)))
+#define GICD_ISPENDR(n)    (*(volatile uint32_t *)(GICD_BASE + 0x200 + 4*(n)))
+#define GICD_ICPENDR(n)    (*(volatile uint32_t *)(GICD_BASE + 0x280 + 4*(n)))
+#define GICD_IPRIORITYR(n) (*(volatile uint32_t *)(GICD_BASE + 0x400 + 4*(n)))
+#define GICD_ITARGETSR(n)  (*(volatile uint32_t *)(GICD_BASE + 0x800 + 4*(n)))
+#define GICD_ICFGR(n)      (*(volatile uint32_t *)(GICD_BASE + 0xC00 + 4*(n)))
+#define GICD_SGIR          (*(volatile uint32_t *)(GICD_BASE + 0xF00))
+
+/* CPU interface registers */
+#define GICC_CTLR          (*(volatile uint32_t *)(GICC_BASE + 0x000))
+#define GICC_PMR           (*(volatile uint32_t *)(GICC_BASE + 0x004))
+#define GICC_BPR           (*(volatile uint32_t *)(GICC_BASE + 0x008))
+#define GICC_IAR           (*(volatile uint32_t *)(GICC_BASE + 0x00C))
+#define GICC_EOIR          (*(volatile uint32_t *)(GICC_BASE + 0x010))
+
+#define GIC_NR_LINES 192 /* inherited from zcu102 GIC-400; NOTE(review): verify for Zynq-7000 */
+
+static gic_handler_t handlers[GIC_NR_LINES];
+static volatile uint32_t g_irq_total;
+static volatile uint32_t g_irq_last_intid;
+
+void gic_register_handler(uint32_t intid, gic_handler_t fn)
+{
+    if (intid < GIC_NR_LINES)
+        handlers[intid] = fn;
+}
+
+static void byte_write(volatile uint32_t *reg, uint32_t intid, uint8_t val)
+{
+    uint32_t shift;
+    uint32_t v;
+    shift = (intid & 3u) * 8u;
+    v = reg[intid >> 2];
+    v &= ~(0xFFu << shift);
+    v |= ((uint32_t)val << shift);
+    reg[intid >> 2] = v;
+}
+
+void gic_enable_spi(uint32_t intid, uint32_t priority)
+{
+    /* Set priority (lower number = higher prio); low 3 bits are masked. */
+    byte_write((volatile uint32_t *)(GICD_BASE + 0x400),
+               intid, (uint8_t)(priority & 0xF8u));
+    /* Target CPU0. */
+    byte_write((volatile uint32_t *)(GICD_BASE + 0x800),
+               intid, 0x01u);
+    /* Group 0 (Secure) - this port runs Secure, so Group 0 is the
+     * correct choice. GICC.FIQEn=0 makes Group 0 route to IRQ, which
+     * is what our vector table handles. */
+    GICD_IGROUPR(intid >> 5) &= ~(1u << (intid & 31u));
+    /* Level-triggered (ICFGR bits = 0b00 -> level, 0b10 -> edge). */
+    {
+        uint32_t reg;
+        uint32_t shift;
+        shift = (intid & 15u) * 2u;
+        reg = GICD_ICFGR(intid >> 4);
+        reg &= ~(3u << shift);
+        GICD_ICFGR(intid >> 4) = reg;
+    }
+    /* Clear pending and enable. */
+    GICD_ICPENDR(intid >> 5) = (1u << (intid & 31u));
+    GICD_ISENABLER(intid >> 5) = (1u << (intid & 31u));
+}
+
+void gic_disable_spi(uint32_t intid)
+{
+    GICD_ICENABLER(intid >> 5) = (1u << (intid & 31u));
+}
+
+void gic_init(void)
+{
+    uint32_t i;
+
+    /* Disable distributor while we reconfigure. */
+    GICD_CTLR = 0;
+
+    /* SGIs and PPIs (INTID 0..31): Group 0 Secure, but leave disabled
+     * for now - enabling them lit up some pending PPI from CSU/PMU
+     * that hung wolfIP_init when it occupied the CPU interface. */
+    GICD_IGROUPR(0) = 0;
+    GICD_ICENABLER(0) = 0xFFFFFFFFu;
+    GICD_ICPENDR(0) = 0xFFFFFFFFu;
+    /* SPIs (INTID 32+): disable all, mark all as Group 0. */
+    for (i = 1; i < (GIC_NR_LINES / 32u); i++) {
+        GICD_ICENABLER(i) = 0xFFFFFFFFu;
+        GICD_ICPENDR(i) = 0xFFFFFFFFu;
+        GICD_IGROUPR(i) = 0;
+    }
+    /* SGI/PPI priorities (lower 8 entries cover INTID 0..31). */
+    for (i = 0; i < 8u; i++)
+        GICD_IPRIORITYR(i) = 0xA0A0A0A0u;
+    for (i = 8u; i < (GIC_NR_LINES / 4u); i++)
+        GICD_IPRIORITYR(i) = 0xA0A0A0A0u;
+    for (i = 8u; i < (GIC_NR_LINES / 4u); i++)
+        GICD_ITARGETSR(i) = 0x01010101u;
+    for (i = 2u; i < (GIC_NR_LINES / 16u); i++)
+        GICD_ICFGR(i) = 0;
+
+    /* Enable distributor: both groups (we run Secure). */
+    GICD_CTLR = 0x3u;
+
+    /* CPU interface: priority mask wide open, both groups enabled,
+     * FIQEn=0 so Group 0 (Secure) interrupts route to nIRQ output
+     * (per GICv2 IHI 0048B 4.6.4: FIQEn=0 -> nIRQ, FIQEn=1 -> nFIQ).
+     * AckCtl=1 so Secure reads of GICC_IAR can ack Group 1 too. */
+    GICC_PMR = 0xF8u;
+    GICC_BPR = 0;
+    GICC_CTLR = 0x07u; /* EnableGrp0 | EnableGrp1 | AckCtl, FIQEn=0 */
+}
+
+void irq_dispatch(void)
+{
+    uint32_t iar;
+    uint32_t intid;
+
+    iar = GICC_IAR;
+    intid = iar & 0x3FFu;
+    if (intid >= 1020u) return; /* special/spurious ID: no count, no EOI */
+    g_irq_total++;
+    g_irq_last_intid = intid;
+    if (intid < GIC_NR_LINES && handlers[intid] != 0)
+        handlers[intid]();
+    GICC_EOIR = iar; /* EOI every real ID, even when no handler matched */
+}
+
+uint32_t gic_total_irqs(void) { return g_irq_total; }
+uint32_t gic_last_intid(void) { return g_irq_last_intid; }
+
+uint32_t gic_poll_dispatch(void)
+{
+    uint32_t n = 0;
+    uint32_t iar;
+    uint32_t intid;
+
+    /* Drain up to 8 interrupts per poll to avoid live-locking the
+     * main loop if a peripheral is hammering us. */
+    while (n < 8) {
+        iar = GICC_IAR;
+        intid = iar & 0x3FFu;
+        if (intid >= 1020) /* 1023 spurious / no pending */
+            break;
+        g_irq_total++;
+        g_irq_last_intid = intid;
+        if (intid < GIC_NR_LINES && handlers[intid] != 0)
+            handlers[intid]();
+        GICC_EOIR = iar;
+        n++;
+    }
+    return n;
+}
+
+uint32_t gic_is_pending(uint32_t intid)
+{
+    return (GICD_ISPENDR(intid >> 5) >> (intid & 31u)) & 1u;
+}
+
+void gic_self_test_sgi(uint32_t intid)
+{
+    /* GICD_SGIR: TargetListFilter (bits 25:24) = 10 (self),
+     * SGIINTID (bits 3:0) = intid. */
+    GICD_SGIR = (2u << 24) | (intid & 0xFu);
+}
diff --git a/src/port/zynq7000/gic.h b/src/port/zynq7000/gic.h
new file mode 100644
index 00000000..2a1eae9e
--- /dev/null
+++ b/src/port/zynq7000/gic.h
@@ -0,0 +1,49 @@
+/* gic.h
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ */
+#ifndef ZYNQ7000_GIC_H
+#define ZYNQ7000_GIC_H
+
+#include <stdint.h>
+
+typedef void (*gic_handler_t)(void);
+
+void gic_init(void);
+void gic_register_handler(uint32_t intid, gic_handler_t fn);
+void gic_enable_spi(uint32_t intid, uint32_t priority);
+void gic_disable_spi(uint32_t intid);
+
+/* Returns 1 if interrupt is currently pending at the distributor,
+ * 0 otherwise. Diagnostic only. */
+uint32_t gic_is_pending(uint32_t intid);
+
+/* Fire a software-generated interrupt to self (CPU0) for testing.
+ * intid must be < 16. */
+void gic_self_test_sgi(uint32_t intid);
+
+/* Total IRQs taken (any intid) and the last intid we saw. */
+uint32_t gic_total_irqs(void);
+uint32_t gic_last_intid(void);
+
+/* Polled-mode IRQ dispatch: drains any pending IRQ from the GIC
+ * by reading GICC_IAR, calling the registered handler, and EOI'ing.
+ * Returns the number of interrupts dispatched in this call.
+ *
+ * Workaround described for a ZynqMP / Cortex-A53 / GIC-400 setup,
+ * where the GIC latched pending interrupts but the CPU never took
+ * the IRQ exception (root cause not pinned). NOTE(review): confirm
+ * whether Zynq-7000 needs it; usable from the main loop as fallback. */
+uint32_t gic_poll_dispatch(void);
+
+/* Provided by startup.S, asm helpers. */
+void irq_enable(void);
+void irq_disable(void);
+
+/* Called by the IRQ vector trampoline in startup.S. Acknowledges,
+ * dispatches, and EOIs the current interrupt. */
+void irq_dispatch(void);
+
+#endif /* ZYNQ7000_GIC_H */
diff --git a/src/port/zynq7000/main.c b/src/port/zynq7000/main.c
new file mode 100644
index 00000000..b2833046
--- /dev/null
+++ b/src/port/zynq7000/main.c
@@ -0,0 +1,298 @@
+/* main.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * wolfIP is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * wolfIP UDP echo + DHCP client demo for Xilinx Zynq-7000 (Cortex-A9,
+ * ARMv7-A 32-bit, GEM0 -> on-board RJ45).
+ *
+ * UNTESTED ON HARDWARE -- structural scaffold mirroring src/port/zcu102/.
+ */
+#include <stdint.h>
+#include <string.h>
+#include "config.h"
+#include "../../../wolfip.h"
+#include "board.h"
+#include "uart.h"
+#include "gic.h"
+#include "gem.h"
+#include "timer.h"
+
+#define ECHO_PORT 7
+#define RX_BUF_SIZE 1500
+
+static struct wolfIP *IPStack;
+static int udp_fd = -1;
+static uint8_t udp_rx_buf[RX_BUF_SIZE];
+
+/* Override newlib memset/memcpy with plain bytewise versions via
+ * linker --wrap. The same defensive pattern we used on the AArch64
+ * port; whether the ARMv7 newlib variant has an equivalent issue
+ * has not been verified, so we keep the override pending lab
+ * validation. The Makefile passes -Wl,--wrap=memset -Wl,--wrap=memcpy
+ * so all calls get redirected to these __wrap_ functions. */
+void *__wrap_memset(void *s, int c, unsigned long n)
+{
+    unsigned char *p = (unsigned char *)s;
+    while (n--)
+        *p++ = (unsigned char)c;
+    return s;
+}
+
+void *__wrap_memcpy(void *dest, const void *src, unsigned long n)
+{
+    unsigned char *d = (unsigned char *)dest;
+    const unsigned char *s = (const unsigned char *)src;
+    while (n--)
+        *d++ = *s++;
+    return dest;
+}
+
+/* ARMv7 exception handlers in startup.S simply hang on faults
+ * (undef, prefetch abort, data abort). No C-side reporter required
+ * for this scaffold; bring-up will add CP15 fault-status reads
+ * (DFSR / IFSR / DFAR / IFAR) when the lab board is available. */
+
+/* wolfIP needs a 32-bit random word for protocol identifiers (TCP ISN,
+ * DHCP xid, DNS id, ephemeral source port, IP fragment id). We delegate
+ * to the port-local memuse-pattern entropy source (entropy.c), which
+ * follows the algorithm of wolfCrypt's wc_Entropy_Get() but is
+ * self-contained for cert isolation. */
+extern uint32_t zynq7000_get_random32(void);
+
+uint32_t wolfIP_getrandom(void)
+{
+    return zynq7000_get_random32();
+}
+
+static void udp_echo_cb(int fd, uint16_t event, void *arg)
+{
+    struct wolfIP *s = (struct wolfIP *)arg;
+    struct wolfIP_sockaddr_in peer;
+    uint32_t peer_len = sizeof(peer);
+    int n;
+
+    if (!(event & CB_EVENT_READABLE))
+        return;
+
+    n = wolfIP_sock_recvfrom(s, fd, udp_rx_buf, sizeof(udp_rx_buf), 0,
+                             (struct wolfIP_sockaddr *)&peer, &peer_len);
+    if (n > 0) {
+        (void)wolfIP_sock_sendto(s, fd, udp_rx_buf, (uint32_t)n, 0,
+                                 (struct wolfIP_sockaddr *)&peer, peer_len);
+        uart_puts("UDP echo: "); uart_putdec((uint32_t)n);
+        uart_puts(" bytes from "); uart_putip4(peer.sin_addr.s_addr);
+        uart_puts("\n");
+    }
+}
+
+int main(void)
+{
+    struct wolfIP_ll_dev *ll;
+    struct wolfIP_sockaddr_in addr;
+    uint64_t tick = 0;
+    int ret;
+
+    uart_init();
+    uart_puts("\n\n=== wolfIP Zynq-7000 (Cortex-A9 SVC) ===\n");
+    uart_puts("MMU on, caches on. Bringing up GIC-390...\n");
+
+    gic_init();
+
+    uart_puts("Initializing wolfIP stack...\n");
+    wolfIP_init_static(&IPStack);
+
+    uart_puts("Bringing up GEM0 (RGMII, DP83867)...\n");
+    ll = wolfIP_getdev(IPStack);
+    ret = zcu102_eth_init(ll);
+    if (ret < 0) {
+        uart_puts("ERROR: zcu102_eth_init failed: ");
+        uart_puthex((uint32_t)ret);
+        uart_puts("\n");
+        while (1)
+            ;
+    }
+    uart_puts(" link "); uart_puts((ret & 0x100) ? "UP" : "DOWN");
+    uart_puts(", PHY="); uart_puthex((uint32_t)(ret & 0xFF));
+    uart_puts("\n");
+
+    /* Unmask IRQ + FIQ at CPU now that GEM0 SPI is enabled at GICD. */
+    irq_enable();
+#if 0 /* DEBUG_GIC self-test block uses AArch64 system regs;
+       * disabled on ARMv7 until the ARMv7-equivalent diagnostics
+       * (DAIF -> CPSR.I/F, ISR_EL1 -> CP15 c12 c1 0, etc.) are
+       * filled in during bring-up. */
+    uart_puts("IRQ enabled. Self-test: firing SGI 0...\n");
+    {
+        uint32_t before = gic_total_irqs();
+        uint64_t daif, scr, vbar;
+        __asm__ volatile ("mrs %0, daif" : "=r"(daif));
+        __asm__ volatile ("mrs %0, scr_el3" : "=r"(scr));
+        __asm__ volatile ("mrs %0, vbar_el3" : "=r"(vbar));
+        uart_puts(" pre: DAIF="); uart_puthex((uint32_t)daif);
+        uart_puts(" SCR_EL3="); uart_puthex((uint32_t)scr);
+        uart_puts(" VBAR_EL3="); uart_puthex((uint32_t)vbar);
+        uart_puts("\n");
+        {
+            uint32_t vec_irq_curr_spx;
+            vec_irq_curr_spx = *(volatile uint32_t *)(vbar + 0x280);
+            uart_puts(" vec[Cur SPx IRQ] @ ");
+            uart_puthex((uint32_t)(vbar + 0x280));
+            uart_puts(" = ");
+            uart_puthex(vec_irq_curr_spx);
+            uart_puts(" (B opcode: top byte 0x14 expected)\n");
+        }
+        uart_puts(" GICD_CTLR="); uart_puthex(*(volatile uint32_t *)(GICD_BASE + 0x000));
+        uart_puts(" GICD_ISENABLER(0)="); uart_puthex(*(volatile uint32_t *)(GICD_BASE + 0x100));
+        uart_puts(" GICD_IGROUPR(0)="); uart_puthex(*(volatile uint32_t *)(GICD_BASE + 0x080));
+        uart_puts("\n");
+        uart_puts(" GICC_CTLR="); uart_puthex(*(volatile uint32_t *)(GICC_BASE + 0x000));
+        uart_puts(" GICC_PMR="); uart_puthex(*(volatile uint32_t *)(GICC_BASE + 0x004));
+        uart_puts("\n");
+        gic_self_test_sgi(0);
+        delay_ms(10);
+        {
+            uint64_t isr, rpr;
+            __asm__ volatile ("mrs %0, isr_el1" : "=r"(isr));
+            rpr = *(volatile uint32_t *)(GICC_BASE + 0x014);
+            uart_puts(" post-SGI: ISR_EL1=");
+            uart_puthex((uint32_t)isr);
+            uart_puts(" (bit7=I, bit6=F, bit8=A)\n");
+            uart_puts(" GICC_RPR="); uart_puthex((uint32_t)rpr);
+            uart_puts(" (running priority; 0xFF=idle)\n");
+        }
+        uart_puts(" SGI fired. gic_total_irqs: ");
+        uart_putdec(before);
+        uart_puts(" -> ");
+        uart_putdec(gic_total_irqs());
+        uart_puts(" last_intid=");
+        uart_puthex(gic_last_intid());
+        uart_puts("\n GICD_ISPENDR(0)="); uart_puthex(*(volatile uint32_t *)(GICD_BASE + 0x200));
+        uart_puts(" GICC_HPPIR="); uart_puthex(*(volatile uint32_t *)(GICC_BASE + 0x018));
+        uart_puts("\n");
+        {
+            uint32_t iar = *(volatile uint32_t *)(GICC_BASE + 0x00C);
+            uart_puts(" polled GICC_IAR="); uart_puthex(iar);
+            uart_puts("\n");
+            if ((iar & 0x3FF) != 0x3FF) {
+                *(volatile uint32_t *)(GICC_BASE + 0x010) = iar;
+                uart_puts(" EOI'd. polled GICC_HPPIR after=");
+                uart_puthex(*(volatile uint32_t *)(GICC_BASE + 0x018));
+                uart_puts("\n");
+            }
+        }
+        /* Extra system-register snapshot. FSBL/ATF sometimes leaves
+         * HCR_EL2 / MDCR_EL3 / OSLAR_EL1 with bits set that affect
+         * exception routing or debug halt; dump them so we can rule
+         * those out. NOTE: WFI wake test was tried here and hangs
+         * the CPU even though ISR_EL1.I=1 was observed earlier - the
+         * GIC appears to assert and deassert nIRQ within a few cycles
+         * rather than holding it level until ACK. That is consistent
+         * with edge-triggered SGI behavior but is not what the spec
+         * requires; it leaves no time for the exception logic to
+         * latch the event. */
+        {
+            uint64_t hcr, mdcr, sctlr, oslsr;
+            __asm__ volatile ("mrs %0, hcr_el2" : "=r"(hcr));
+            __asm__ volatile ("mrs %0, mdcr_el3" : "=r"(mdcr));
+            __asm__ volatile ("mrs %0, sctlr_el3" : "=r"(sctlr));
+            __asm__ volatile ("mrs %0, oslsr_el1" : "=r"(oslsr));
+            uart_puts(" HCR_EL2="); uart_puthex((uint32_t)hcr);
+            uart_puts(" MDCR_EL3="); uart_puthex((uint32_t)mdcr);
+            uart_puts("\n SCTLR_EL3="); uart_puthex((uint32_t)sctlr);
+            uart_puts(" OSLSR_EL1="); uart_puthex((uint32_t)oslsr);
+            uart_puts("\n");
+        }
+    }
+#endif
+#ifdef DEBUG_GEM
+    uart_puts("Initial GEM state:\n");
+    gem_dump_state();
+#endif
+
+#ifdef DHCP
+    if (dhcp_client_init(IPStack) >= 0) {
+        uint32_t dhcp_elapsed = 0;
+        const uint32_t dhcp_timeout = 15000;
+        uart_puts("Starting DHCP client...\n");
+        while (!dhcp_bound(IPStack) && dhcp_client_is_running(IPStack)
+               && dhcp_elapsed < dhcp_timeout) {
+            (void)wolfIP_poll(IPStack, tick);
+            tick++;
+            delay_ms(1);
+            dhcp_elapsed++;
+            /* gic_poll_dispatch removed - eth_poll already polls
+             * GEM_ISR directly. Doubling up here just spins. */
+#ifdef DEBUG_GEM
+            if ((dhcp_elapsed % 1000) == 0) {
+                uart_puts(" ["); uart_putdec(dhcp_elapsed);
+                uart_puts(" ms] bound=");
+                uart_putdec(dhcp_bound(IPStack) ? 1u : 0u);
+                uart_puts(" running=");
+                uart_putdec(dhcp_client_is_running(IPStack) ? 1u : 0u);
+                uart_puts("\n");
+                gem_dump_state();
+            }
+#endif
+        }
+        if (dhcp_bound(IPStack)) {
+            ip4 ip = 0, nm = 0, gw = 0;
+            wolfIP_ipconfig_get(IPStack, &ip, &nm, &gw);
+            uart_puts("DHCP bound:\n IP: "); uart_putip4(ip);
+            uart_puts("\n Mask: "); uart_putip4(nm);
+            uart_puts("\n GW: "); uart_putip4(gw);
+            uart_puts("\n");
+        } else {
+            ip4 ip = atoip4(WOLFIP_IP);
+            ip4 nm = atoip4(WOLFIP_NETMASK);
+            ip4 gw = atoip4(WOLFIP_GW);
+            uart_puts("DHCP timeout - using static IP\n");
+            wolfIP_ipconfig_set(IPStack, ip, nm, gw);
+        }
+    }
+#else
+    {
+        ip4 ip = atoip4(WOLFIP_IP);
+        ip4 nm = atoip4(WOLFIP_NETMASK);
+        ip4 gw = atoip4(WOLFIP_GW);
+        wolfIP_ipconfig_set(IPStack, ip, nm, gw);
+        uart_puts("Static IP: "); uart_putip4(ip); uart_puts("\n");
+    }
+#endif
+
+    uart_puts("Opening UDP echo socket on port ");
+    uart_putdec(ECHO_PORT); uart_puts("\n");
+    udp_fd = wolfIP_sock_socket(IPStack, AF_INET, IPSTACK_SOCK_DGRAM, 0);
+    wolfIP_register_callback(IPStack, udp_fd, udp_echo_cb, IPStack);
+
+    memset(&addr, 0, sizeof(addr));
+    addr.sin_family = AF_INET;
+    addr.sin_port = ee16(ECHO_PORT);
+    addr.sin_addr.s_addr = 0;
+    (void)wolfIP_sock_bind(IPStack, udp_fd,
+                           (struct wolfIP_sockaddr *)&addr, sizeof(addr));
+
+    uart_puts("Ready. Try: nc -u <ip> 7\n\n");
+
+    for (;;) {
+        (void)wolfIP_poll(IPStack, tick++);
+        delay_ms(1);
+    }
+
+    return 0;
+}
diff --git a/src/port/zynq7000/mmu.c b/src/port/zynq7000/mmu.c
new file mode 100644
index 00000000..28ad1b5e
--- /dev/null
+++ b/src/port/zynq7000/mmu.c
@@ -0,0 +1,131 @@
+/* mmu.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * wolfIP is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * ARMv7-A short-format L1 page tables (16 KB-aligned, 16 KB total,
+ * 4096 1 MB section descriptors covering the whole 32-bit VA space).
+ * Sufficient for a flat-mapped bare-metal app on Zynq-7000 PS.
+ *
+ *   0x00000000 - 0x3FFFFFFF  DDR (1 GB) - Normal WB cacheable
+ *   0x40000000 - 0xDFFFFFFF  unmapped (PL interconnect / SMC ranges)
+ *   0xE0000000 - 0xFEFFFFFF  PS peripherals (UART, GEM, SLCR, GIC),
+ *                            Shareable Device, XN. 0xFF000000..0xFFEFFFFF
+ *                            is left unmapped. The last 1 MB section
+ *                            (0xFFF00000, holds OCM high) is Normal WB.
+ *
+ * Short descriptor section format (1 MB, supersection ignored):
+ *   bits [31:20] = section base
+ *   bit  [19]    = NS (we are Secure -> 0); bit [18] = 0 for a section
+ *   bit  [17]    = nG (0 = global)
+ *   bit  [16]    = S (shareable)
+ *   bits [14:12] = TEX
+ *   bits [11:10] = AP[1:0]
+ *   bit  [15]    = AP[2]
+ *   bit  [9]     = IMPDEF
+ *   bits [8:5]   = Domain
+ *   bit  [4]     = XN (execute never)
+ *   bit  [3]     = C
+ *   bit  [2]     = B
+ *   bits [1:0]   = 10 (section)
+ *
+ * TEX[2:0] + C + B encoding for Normal WB cacheable (TEX=001, C=1, B=1)
+ * and Shareable Device (TEX=000, C=0, B=1) per ARMv7-A short descriptor.
+ *
+ * UNTESTED ON HARDWARE.
+ */
+#include <stdint.h>
+#include "mmu.h"
+
+static volatile uint32_t L1[4096] __attribute__((aligned(16384),
+                                                  section(".page_tables")));
+
+#define SEC_NORMAL_WB(addr) \
+    (((addr) & 0xFFF00000u) | \
+     (1u << 12) | /* TEX[0] = 1 */ \
+     (1u << 10) | /* AP[0] = 1 (AP=0b01: PL1 read/write only) */ \
+     (1u << 3) | /* C */ \
+     (1u << 2) | /* B */ \
+     0x2u) /* section */
+
+#define SEC_DEVICE(addr) \
+    (((addr) & 0xFFF00000u) | \
+     (1u << 10) | /* AP[0] = 1 (PL1 read/write only) */ \
+     (1u << 4) | /* XN */ \
+     (1u << 2) | /* B (shareable device) */ \
+     0x2u)
+
+#define SEC_INVALID (0u)
+
+extern uint8_t _dma_buffers_start[]; /* NOTE(review): not referenced below */
+extern uint8_t _dma_buffers_end[];   /* NOTE(review): not referenced below */
+
+static void mmu_build_tables(void)
+{
+    uint32_t i;
+    uint32_t addr;
+
+    for (i = 0; i < 4096; i++)
+        L1[i] = SEC_INVALID;
+
+    /* DDR 0x00000000 - 0x3FFFFFFF (1 GB) as Normal WB. */
+    for (i = 0; i < 1024; i++) {
+        addr = i * 0x100000u;
+        L1[i] = SEC_NORMAL_WB(addr);
+    }
+
+    /* PS peripherals at 0xE0000000 - 0xFEFFFFFF (Device). */
+    for (i = 0xE00; i < 0xFF0; i++) {
+        addr = i * 0x100000u;
+        L1[i] = SEC_DEVICE(addr);
+    }
+
+    /* OCM high mapping 0xFFFC0000 - 0xFFFFFFFF (last 256 KB of 4 GB).
+     * The section at 0xFFF00000 (1 MB) covers it. Mark Normal-WB and
+     * executable so code can run from OCM after MMU enable. */
+    L1[0xFFF] = SEC_NORMAL_WB(0xFFF00000u);
+}
+
+void mmu_enable(void)
+{
+    uint32_t sctlr;
+
+    mmu_build_tables();
+
+    /* DACR: all 16 domains = Client (permissions checked); we use domain 0. */
+    __asm__ volatile ("mcr p15, 0, %0, c3, c0, 0" :: "r"(0x55555555u));
+
+    /* TTBR0 = L1 (low 32 bits of physical address). TTBR1 unused. */
+    __asm__ volatile ("mcr p15, 0, %0, c2, c0, 0" :: "r"((uint32_t)L1));
+    __asm__ volatile ("mcr p15, 0, %0, c2, c0, 2" :: "r"(0u)); /* TTBCR=0 */
+
+    /* Invalidate TLB + I-cache. */
+    __asm__ volatile ("mcr p15, 0, %0, c8, c7, 0" :: "r"(0u)); /* TLBIALL */
+    __asm__ volatile ("mcr p15, 0, %0, c7, c5, 0" :: "r"(0u)); /* ICIALLU */
+    __asm__ volatile ("dsb" ::: "memory");
+    __asm__ volatile ("isb" ::: "memory");
+
+    /* Enable MMU + I-cache + D-cache. NOTE(review): no D-cache invalidate first. */
+    __asm__ volatile ("mrc p15, 0, %0, c1, c0, 0" : "=r"(sctlr));
+    sctlr |= (1u << 0); /* M */
+    sctlr |= (1u << 2); /* C */
+    sctlr |= (1u << 12); /* I */
+    sctlr &= ~(1u << 1); /* A off */
+    __asm__ volatile ("mcr p15, 0, %0, c1, c0, 0" :: "r"(sctlr));
+    __asm__ volatile ("isb" ::: "memory");
+}
diff --git a/src/port/zynq7000/mmu.h b/src/port/zynq7000/mmu.h
new file mode 100644
index 00000000..9d30f6a4
--- /dev/null
+++ b/src/port/zynq7000/mmu.h
@@ -0,0 +1,12 @@
+/* mmu.h
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ */
+#ifndef ZYNQ7000_MMU_H
+#define ZYNQ7000_MMU_H
+
+void mmu_enable(void);
+
+#endif /* ZYNQ7000_MMU_H */
diff --git a/src/port/zynq7000/phy_dp83867.c b/src/port/zynq7000/phy_dp83867.c
new file mode 100644
index 00000000..987c239c
--- /dev/null
+++ b/src/port/zynq7000/phy_dp83867.c
@@ -0,0 +1,338 @@
+/* phy_dp83867.c
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * wolfIP is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * TI DP83867IR PHY init for the ZCU102 (PHY on RGMII to PS-GEM3).
+ *
+ * The DP83867 needs explicit RGMII TX and RX clock delay configuration
+ * (CFG4 / RGMIICTL extended registers) because the ZCU102 routes
+ * RGMII signals as a straight-through trace without external delay.
+ * Without this the link comes up at 1 Gbps but carries corrupt data
+ * (random RX frames, no TX). The Linux dp83867 driver and Xilinx
+ * device tree both apply a 2.0 ns TX + 2.0 ns RX skew - we match.
+ *
+ * Extended registers (>0x1F) are accessed via the IEEE-defined indirect
+ * pair (REGCR=0x0D, ADDAR=0x0E):
+ *   1. Write REGCR = 0x001F (address-of, devad 31).
+ *   2. Write ADDAR = <extended register address>.
+ *   3. Write REGCR = 0x401F (data, devad 31, no-increment).
+ *   4. Read/Write ADDAR = <value>.
+ */
+#include <stdint.h>
+#include "gem.h"
+#include "phy_dp83867.h"
+#include "timer.h"
+#include "uart.h"
+
+/* Standard IEEE PHY registers (clause 22) */
+#define PHY_BMCR 0x00
+#define PHY_BMSR 0x01
+#define PHY_ID1 0x02
+#define PHY_ID2 0x03
+#define PHY_ANAR 0x04
+#define PHY_GBCR 0x09
+#define PHY_GBSR 0x0A
+#define PHY_REGCR 0x0D
+#define PHY_ADDAR 0x0E
+
+#define BMCR_RESET (1u << 15)
+#define BMCR_ANRESTART (1u << 9)
+#define BMCR_ANEN (1u << 12)
+
+#define BMSR_ANCOMPLETE (1u << 5)
+#define BMSR_LINK_UP (1u << 2)
+
+/* DP83867 extended registers (accessed via REGCR/ADDAR, devad 0x1F) */
+#define DP83867_CFG4 0x0031 /* Configuration 4 (RX_CTRL strap fix) */
+#define DP83867_RGMIICTL 0x0032 /* RGMII control */
+#define DP83867_STRAP_STS1 0x006E /* Strap status register (read-only) */
+#define DP83867_RGMIIDCTL 0x0086 /* RGMII delay control */
+#define DP83867_IO_MUX_CFG 0x0170 /* IO MUX config (impedance) */
+
+/* Clause-22 register (direct access) */
+#define DP83867_PHYCR 0x10 /* PHY Control register */
+#define PHYCR_FIFO_DEPTH_MASK (3u << 14)
+#define PHYCR_FIFO_DEPTH_8B (3u << 14)
+
+/* RGMIICTL bits */
+#define RGMIICTL_RX_DELAY_EN (1u << 0)
+#define RGMIICTL_TX_DELAY_EN (1u << 1)
+
+/* RGMIIDCTL: RX delay in [3:0], TX delay in [7:4]; both set to code 0x8.
+ * NOTE(review): Linux names code 0x8 2.25 ns (0x7 = 2.00 ns) - confirm. */
+#define RGMIIDCTL_DELAY_2NS (0x8u | (0x8u << 4))
+
+/* Speed read from PHY status register (DP83867 0x11) */
+#define DP83867_PHYSTS 0x0011
+#define PHYSTS_SPEED_MASK (3u << 14)
+#define PHYSTS_SPEED_1000 (2u << 14)
+#define PHYSTS_SPEED_100 (1u << 14)
+#define PHYSTS_SPEED_10 (0u << 14)
+#define PHYSTS_DUPLEX (1u << 13)
+
+static int phy_ext_write(uint8_t phy_addr, uint16_t ext_reg, uint16_t val)
+{
+    int rc;
+    rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x001Fu);
+    if (rc < 0) return rc;
+    rc = gem_mdio_write(phy_addr, PHY_ADDAR, ext_reg);
+    if (rc < 0) return rc;
+    rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x401Fu);
+    if (rc < 0) return rc;
+    return gem_mdio_write(phy_addr, PHY_ADDAR, val);
+}
+
+static int phy_ext_read(uint8_t phy_addr, uint16_t ext_reg, uint16_t *out)
+{
+    int rc;
+    rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x001Fu);
+    if (rc < 0) return rc;
+    rc = gem_mdio_write(phy_addr, PHY_ADDAR, ext_reg);
+    if (rc < 0) return rc;
+    rc = gem_mdio_write(phy_addr, PHY_REGCR, 0x401Fu);
+    if (rc < 0) return rc;
+    return gem_mdio_read(phy_addr, PHY_ADDAR, out);
+}
+
+int dp83867_init(uint8_t phy_addr, int *speed_out, int *full_duplex_out)
+{
+    uint16_t id1 = 0;
+    uint16_t id2 = 0;
+    uint16_t bmcr;
+    uint16_t bmsr;
+    uint16_t physts;
+    int i;
+
+    if (gem_mdio_read(phy_addr, PHY_ID1, &id1) < 0)
+        return -1;
+    if (gem_mdio_read(phy_addr, PHY_ID2, &id2) < 0)
+        return -2;
+    uart_puts("DP83867: ID1="); uart_puthex(id1);
+    uart_puts(" ID2="); uart_puthex(id2);
+    uart_puts("\n");
+    /* DP83867 OUI = 0x2000A23x. ID1=0x2000, ID2 upper bits match. */
+    if (id1 != 0x2000u || (id2 & 0xFFF0u) != 0xA230u) {
+        uart_puts(" warn: PHY ID does not match DP83867, continuing\n");
+    }
+
+    /* Soft reset; BMCR_RESET self-clears, poll up to 1000 x 1 ms. */
+    if (gem_mdio_write(phy_addr, PHY_BMCR, BMCR_RESET) < 0)
+        return -3;
+    for (i = 0; i < 1000; i++) {
+        delay_ms(1);
+        if (gem_mdio_read(phy_addr, PHY_BMCR, &bmcr) < 0)
+            return -4;
+        if ((bmcr & BMCR_RESET) == 0)
+            break;
+    }
+    if (i == 1000)
+        return -5;
+
+    /* Order below mirrors the Linux/U-Boot dp83867_config sequence:
+     *  1. Strap fix (CFG4 bit 7) right after SW reset.
+     *  2. PHYCR FIFO depth RMW.
+     *  3. RGMIICTL RMW to enable both delays.
+     *  4. RGMIIDCTL set delay values.
+     *  5. Restart AN (done further down, after the advertisement setup).
+     */
+    {
+        uint16_t strap = 0;
+        uint16_t cfg4_before = 0;
+        uint16_t cfg4_after = 0;
+        uint16_t iomux = 0;
+        uint16_t rgmiictl = 0;
+        uint16_t phycr_before = 0;
+        uint16_t phycr_after = 0;
+
+        (void)phy_ext_read(phy_addr, DP83867_STRAP_STS1, &strap);
+        (void)phy_ext_read(phy_addr, DP83867_IO_MUX_CFG, &iomux);
+        (void)phy_ext_read(phy_addr, DP83867_CFG4, &cfg4_before);
+
+        /* 1. RX_CTRL strap quirk for ZCU102. */
+        cfg4_after = cfg4_before & ~(1u << 7);
+        if (phy_ext_write(phy_addr, DP83867_CFG4, cfg4_after) < 0)
+            return -6;
+
+        /* 2. PHYCR FIFO depth = 8 bytes (RMW so we keep Auto-MDIX,
+         * power-down detect, etc., that the strap brought up). */
+        (void)gem_mdio_read(phy_addr, DP83867_PHYCR, &phycr_before);
+        phycr_after = (phycr_before & ~PHYCR_FIFO_DEPTH_MASK)
+                    | PHYCR_FIFO_DEPTH_8B;
+        if (gem_mdio_write(phy_addr, DP83867_PHYCR, phycr_after) < 0)
+            return -7;
+
+        /* 3. RGMIICTL: enable TX and RX clock delays (RMW). */
+        (void)phy_ext_read(phy_addr, DP83867_RGMIICTL, &rgmiictl);
+        rgmiictl |= RGMIICTL_RX_DELAY_EN | RGMIICTL_TX_DELAY_EN;
+        if (phy_ext_write(phy_addr, DP83867_RGMIICTL, rgmiictl) < 0)
+            return -8;
+
+        /* 4. RGMIIDCTL: delay code 0x8 for both RX and TX clocks. */
+        if (phy_ext_write(phy_addr, DP83867_RGMIIDCTL,
+                          RGMIIDCTL_DELAY_2NS) < 0)
+            return -9;
+
+#ifdef DEBUG_PHY
+        /* Verbose pre-AN dump so we can diff against U-Boot's state. */
+        uart_puts("DP83867 pre-AN: STRAP_STS1="); uart_puthex(strap);
+        uart_puts(" IO_MUX_CFG="); uart_puthex(iomux);
+        uart_puts("\n CFG4: "); uart_puthex(cfg4_before);
+        uart_puts(" -> "); uart_puthex(cfg4_after);
+        uart_puts(" PHYCR: "); uart_puthex(phycr_before);
+        uart_puts(" -> "); uart_puthex(phycr_after);
+        uart_puts("\n RGMIICTL="); uart_puthex(rgmiictl);
+        uart_puts(" RGMIIDCTL="); uart_puthex(RGMIIDCTL_DELAY_2NS);
+        uart_puts("\n");
+
+        {
+            uint16_t v = 0; /* stays 0 if a best-effort readback fails */
+            (void)phy_ext_read(phy_addr, DP83867_CFG4, &v);
+            uart_puts("DP83867 readback: CFG4="); uart_puthex(v);
+            (void)phy_ext_read(phy_addr, DP83867_RGMIICTL, &v);
+            uart_puts(" RGMIICTL="); uart_puthex(v);
+            (void)phy_ext_read(phy_addr, DP83867_RGMIIDCTL, &v);
+            uart_puts(" RGMIIDCTL="); uart_puthex(v);
+            (void)gem_mdio_read(phy_addr, DP83867_PHYCR, &v);
+            uart_puts(" PHYCR="); uart_puthex(v);
+            uart_puts("\n");
+        }
+#else
+        (void)strap; (void)iomux;
+        (void)cfg4_before; (void)cfg4_after;
+        (void)phycr_before; (void)phycr_after;
+        (void)rgmiictl;
+#endif
+    }
+
+    /* Advertise 10/100/1000 full + half duplex. */
+    if (gem_mdio_write(phy_addr, PHY_ANAR, 0x01E1u) < 0)
+        return -10;
+    if (gem_mdio_write(phy_addr, PHY_GBCR, (1u << 9) | (1u << 8)) < 0)
+        return -11;
+
+    /* Restart AN. */
+    if (gem_mdio_write(phy_addr, PHY_BMCR, BMCR_ANEN | BMCR_ANRESTART) < 0)
+        return -12;
+
+    /* Wait up to 5 s for AN complete, polling at 50 ms. AN typically
+     * needs 100-1500 ms depending on link partner. Report progress so
+     * a hung negotiation is visible on UART. */
+    uart_puts("DP83867: waiting for autoneg");
+    for (i = 0; i < 100; i++) {
+        delay_ms(50);
+        if (gem_mdio_read(phy_addr, PHY_BMSR, &bmsr) < 0)
+            return -13;
+        if (bmsr & BMSR_ANCOMPLETE) {
+            uart_puts(" done (");
+            uart_putdec((uint32_t)i * 50u);
+            uart_puts("ms)\n");
+            break;
+        }
+        if ((i % 10) == 9)
+            uart_putc('.');
+    }
+    if (!(bmsr & BMSR_ANCOMPLETE))
+        uart_puts(" TIMEOUT\n");
+
+    /* Give the PHY a moment to latch the negotiated speed before we
+     * read PHYSTS - on DP83867 link-OK and PHYSTS update slightly
+     * after AN_COMPLETE asserts. */
+    delay_ms(100);
+
+    /* After AN_COMPLETE, the 1000BASE-T link still needs to finish
+     * master/slave training and have BOTH receivers report OK before
+     * BMSR.LINK_UP asserts. This can take several hundred ms more.
+     * Poll BMSR (double-read for latch) up to 5 s, dumping GBSR each
+     * iteration so we can see remote_rx_status flip. */
+    {
+        int j;
+        uint16_t gbsr = 0;
+        uint16_t bmsr2 = 0;
+        uart_puts("DP83867: waiting for link");
+        for (j = 0; j < 100; j++) {
+            delay_ms(50);
+            (void)gem_mdio_read(phy_addr, PHY_BMSR, &bmsr2);
+            (void)gem_mdio_read(phy_addr, PHY_BMSR, &bmsr2);
+            (void)gem_mdio_read(phy_addr, PHY_GBSR, &gbsr);
+            bmsr = bmsr2; /* return value must reflect the latest BMSR */
+            if (bmsr2 & BMSR_LINK_UP) {
+                uart_puts(" UP (");
+                uart_putdec((uint32_t)j * 50u);
+                uart_puts("ms) GBSR=");
+                uart_puthex(gbsr);
+                uart_puts("\n");
+                break;
+            }
+            if ((j % 10) == 9) {
+                uart_puts(" [");
+                uart_putdec((uint32_t)(j + 1) * 50u);
+                uart_puts("ms GBSR=");
+                uart_puthex(gbsr);
+                uart_puts("]");
+            }
+        }
+        if (!(bmsr2 & BMSR_LINK_UP))
+            uart_puts(" TIMEOUT\n");
+    }
+
+    if (gem_mdio_read(phy_addr, DP83867_PHYSTS, &physts) < 0)
+        return -14;
+
+#ifdef DEBUG_PHY
+    {
+        uint16_t bmcr_now = 0;
+        uint16_t lpa = 0;
+        uint16_t gbsr = 0;
+        (void)gem_mdio_read(phy_addr, PHY_BMCR, &bmcr_now);
+        (void)gem_mdio_read(phy_addr, 0x05, &lpa); /* MII LPA */
+        (void)gem_mdio_read(phy_addr, PHY_GBSR, &gbsr);
+        uart_puts("DP83867 regs: BMCR="); uart_puthex(bmcr_now);
+        uart_puts(" BMSR="); uart_puthex(bmsr);
+        uart_puts(" LPA="); uart_puthex(lpa);
+        uart_puts(" GBSR="); uart_puthex(gbsr);
+        uart_puts(" PHYSTS="); uart_puthex(physts);
+        uart_puts("\n");
+    }
+#endif
+
+    if ((physts & PHYSTS_SPEED_MASK) == PHYSTS_SPEED_1000)
+        *speed_out = 1000;
+    else if ((physts & PHYSTS_SPEED_MASK) == PHYSTS_SPEED_100)
+        *speed_out = 100;
+    else
+        *speed_out = 10;
+    *full_duplex_out = (physts & PHYSTS_DUPLEX) ? 1 : 0;
+
+    uart_puts("DP83867 link: ");
+    uart_putdec((uint32_t)*speed_out);
+    uart_puts(*full_duplex_out ? " Mbps FD\n" : " Mbps HD\n");
+
+    return (bmsr & BMSR_LINK_UP) ? 1 : 0;
+}
+
+int dp83867_link_status(uint8_t phy_addr)
+{
+    uint16_t bmsr;
+    /* BMSR latches link down; read twice. */
+    if (gem_mdio_read(phy_addr, PHY_BMSR, &bmsr) < 0)
+        return -1;
+    if (gem_mdio_read(phy_addr, PHY_BMSR, &bmsr) < 0)
+        return -1;
+    return (bmsr & BMSR_LINK_UP) ? 1 : 0;
+}
diff --git a/src/port/zynq7000/phy_dp83867.h b/src/port/zynq7000/phy_dp83867.h
new file mode 100644
index 00000000..efbf45a3
--- /dev/null
+++ b/src/port/zynq7000/phy_dp83867.h
@@ -0,0 +1,23 @@
+/* phy_dp83867.h
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * TI DP83867IR PHY driver: 10/100/1000 RGMII PHY used on the ZCU102
+ * dev board. We only need configuration (reset, RGMII TX/RX skew,
+ * auto-negotiation) and link status; no advanced features.
+ */
+#ifndef ZYNQ7000_PHY_DP83867_H
+#define ZYNQ7000_PHY_DP83867_H
+
+#include <stdint.h>
+
+/* Returns 1 (link up) or 0 (link down) once configured, < 0 on failure.
+ * When >= 0, *speed_out is 10/100/1000 and *full_duplex_out is 0 or 1. */
+int dp83867_init(uint8_t phy_addr, int *speed_out, int *full_duplex_out);
+
+/* Returns 1 if link is up, 0 if down, < 0 on MDIO error. */
+int dp83867_link_status(uint8_t phy_addr);
+
+#endif /* ZYNQ7000_PHY_DP83867_H */
diff --git a/src/port/zynq7000/startup.S b/src/port/zynq7000/startup.S
new file mode 100644
index 00000000..3e8d3f48
--- /dev/null
+++ b/src/port/zynq7000/startup.S
@@ -0,0 +1,155 @@
+/* startup.S
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * This file is part of wolfIP TCP/IP stack.
+ *
+ * wolfIP is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfIP is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ * ARMv7-A entry for Cortex-A9 on Xilinx Zynq-7000. Stock Xilinx FSBL
+ * configures DDR, clocks, MIO, then jumps to _start at the image
+ * entry. We bring up the SVC-mode stack, install the exception
+ * vector table, clear BSS, and call mmu_enable + main.
+ *
+ * UNTESTED ON HARDWARE -- structural scaffold; ARMv7 specifics
+ * (mode switching, exception vectors, MMU enable) are written to
+ * the spec but have not been run on a real board.
+ */
+    .arch armv7-a
+    .syntax unified
+
+    .section .vectors, "ax"
+    .global _vectors
+_vectors:
+    b       _start              /* 0x00: reset */
+    b       _undef_handler      /* 0x04: undefined instruction */
+    b       _svc_handler        /* 0x08: SVC */
+    b       _prefetch_abort     /* 0x0C: prefetch abort */
+    b       _data_abort         /* 0x10: data abort */
+    nop                         /* 0x14: reserved */
+    b       _irq_handler        /* 0x18: IRQ */
+    b       _fiq_handler        /* 0x1C: FIQ */
+
+    .section .text, "ax"
+    .global _start
+    .type _start, %function
+    .arm
+_start:
+    /* Disable IRQ + FIQ until we have a stack. */
+    cpsid   if
+
+    /* Switch to SVC mode (FSBL may have left us in another mode). */
+    cps     #0x13                   /* SVC mode */
+
+    /* Set the SVC-mode stack. */
+    ldr     sp, =_stack_top
+
+    /* Install vector base address (VBAR). VBAR is implemented on
+     * Cortex-A9 (CP15 c12 c0 0). */
+    ldr     r0, =_vectors
+    mcr     p15, 0, r0, c12, c0, 0  /* MCR VBAR, r0 */
+    isb
+
+    /* Disable MMU + caches in case FSBL left them on. SCTLR is CP15
+     * c1 c0 0. */
+    mrc     p15, 0, r0, c1, c0, 0
+    bic     r0, r0, #(1 << 0)       /* M -- MMU disable */
+    bic     r0, r0, #(1 << 2)       /* C -- D-cache disable */
+    bic     r0, r0, #(1 << 12)      /* I -- I-cache disable */
+    mcr     p15, 0, r0, c1, c0, 0
+    isb
+
+    /* Very-early UART poke - one char before any C code. Cadence
+     * UART0 TX FIFO is at 0xE0000030. */
+    ldr     r1, =0xE0000000
+    mov     r2, #'!'
+    str     r2, [r1, #0x30]
+
+    /* Clear BSS word by word; addresses are compared unsigned (bhs). */
+    ldr     r0, =_sbss
+    ldr     r1, =_ebss
+    mov     r2, #0
+1:  cmp     r0, r1
+    bhs     2f
+    str     r2, [r0], #4
+    b       1b
+2:
+
+    /* Bring up the MMU + caches (C function in mmu.c). */
+    bl      mmu_enable
+
+    /* Breadcrumb: MMU enabled. */
+    ldr     r1, =0xE0000000
+    mov     r2, #'M'
+    str     r2, [r1, #0x30]
+
+    /* Call main(). */
+    bl      main
+
+    /* main() should not return. */
+    b       _hang
+
+    .global _hang
+    .type _hang, %function
+_hang:
+    b       _hang
+
+/* ----------------------------------------------------------------------
+ * Exception handlers. The IRQ vector funnels into irq_dispatch (C),
+ * mirroring the AArch64 trampoline in src/port/zcu102/startup.S.
We
+ * save AAPCS caller-saved + lr to make irq_dispatch safe to call.
+ * -------------------------------------------------------------------- */
+    .type _irq_handler, %function
+_irq_handler:
+    /* Adjust lr_irq for return-from-exception (LR points past). */
+    sub     lr, lr, #4
+    srsdb   sp!, #0x13          /* save lr_irq, spsr_irq on the SVC stack */
+    cpsid   if, #0x13           /* switch to SVC mode for the handler */
+    push    {r0-r12, lr}
+    bl      irq_dispatch
+    pop     {r0-r12, lr}
+    rfeia   sp!                 /* pop pc + cpsr saved by srsdb (SVC stack) */
+
+    .type _fiq_handler, %function
+_fiq_handler:
+    b       _hang               /* FIQ unused */
+
+    .type _undef_handler, %function
+_undef_handler:
+    b       _hang
+
+    .type _svc_handler, %function
+_svc_handler:
+    b       _hang
+
+    .type _prefetch_abort, %function
+_prefetch_abort:
+    b       _hang
+
+    .type _data_abort, %function
+_data_abort:
+    b       _hang
+
+    .global irq_enable
+    .type irq_enable, %function
+irq_enable:
+    cpsie   if                  /* enable IRQ + FIQ */
+    bx      lr
+
+    .global irq_disable
+    .type irq_disable, %function
+irq_disable:
+    cpsid   if
+    bx      lr
diff --git a/src/port/zynq7000/target.ld b/src/port/zynq7000/target.ld
new file mode 100644
index 00000000..1278466f
--- /dev/null
+++ b/src/port/zynq7000/target.ld
@@ -0,0 +1,98 @@
+/* Zynq-7000 (Cortex-A9) Linker Script - OCM-only layout
+ *
+ * Memory map (Zynq-7000 PS):
+ *   DDR : 0x00000000 .. 0x3FFFFFFF (1 GB on ZC702 / Zynq-7020 typical)
+ *   OCM : 0xFFFC0000 .. 0xFFFFFFFF (256 KB, high-mapped by SLCR.OCM_CFG)
+ *
+ * We keep everything in OCM by default for JTAG-iteration symmetry
+ * with the ZCU102 port. The page tables alone take 16 KB (4096
+ * section descriptors x 4 bytes) so we have less spare OCM than the
+ * AArch64 builds; track sizes and spill to DDR when needed.
+ *
+ * UNTESTED ON HARDWARE.
+ */
+
+OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm")
+OUTPUT_ARCH(arm)
+ENTRY(_start)
+
+MEMORY
+{
+    OCM (rwx) : ORIGIN = 0xFFFC0000, LENGTH = 0x00040000
+    DDR (rw)  : ORIGIN = 0x00010000, LENGTH = 0x10000000
+}
+
+_stack_top = 0xFFFFF000;
+
+PHDRS
+{
+    text PT_LOAD FLAGS(5); /* RX: .boot_entry, .vectors, .text */
+    data PT_LOAD FLAGS(6); /* RW: .data, .bss, page tables, DMA buffers */
+}
+
+SECTIONS
+{
+    .boot_entry :
+    {
+        KEEP(*(.boot_entry))
+    } > OCM :text
+
+    .vectors :
+    {
+        . = ALIGN(32);  /* VBAR alignment requirement */
+        KEEP(*(.vectors))
+    } > OCM :text
+
+    .text :
+    {
+        . = ALIGN(4);
+        *(.text*)
+        *(.rodata*)
+        . = ALIGN(4);
+    } > OCM :text
+
+    .data :
+    {
+        . = ALIGN(4);
+        _sdata = .;
+        *(.data*)
+        . = ALIGN(4);
+        _edata = .;
+    } > OCM :data
+
+    .bss (NOLOAD) :
+    {
+        . = ALIGN(4);
+        _sbss = .;
+        *(.bss*)
+        *(COMMON)
+        . = ALIGN(4);
+        _ebss = .;
+    } > OCM :data
+
+    .page_tables (NOLOAD) :
+    {
+        . = ALIGN(16384);  /* TTBR0 wants 16 KB alignment */
+        _page_tables_start = .;
+        *(.page_tables)
+        . = ALIGN(16384);
+        _page_tables_end = .;
+    } > OCM :data
+
+    .dma_buffers (NOLOAD) :
+    {
+        . = ALIGN(64);
+        _dma_buffers_start = .;
+        *(.dma_buffers)
+        . = ALIGN(64);
+        _dma_buffers_end = .;
+    } > OCM :data
+
+    /DISCARD/ :
+    {
+        *(.note.*)
+        *(.comment)
+        *(.ARM.attributes)
+        *(.eh_frame*)
+    }
+}
diff --git a/src/port/zynq7000/timer.h b/src/port/zynq7000/timer.h
new file mode 100644
index 00000000..7d9edfbe
--- /dev/null
+++ b/src/port/zynq7000/timer.h
@@ -0,0 +1,46 @@
+/* timer.h
+ *
+ * Copyright (C) 2026 wolfSSL Inc.
+ *
+ * ARMv7 generic timer delay helpers. Zynq-7000 FSBL leaves CNTFRQ
+ * programmed (PS reference clock divided to typically 333.333 MHz);
+ * we fall back to that if the register reads 0. CNTPCT (physical
+ * count, 64-bit) is read via the CP15 MRRC two-register move.
+ *
+ * UNTESTED ON HARDWARE.
+ */ +#ifndef ZYNQ7000_TIMER_H +#define ZYNQ7000_TIMER_H + +#include + +static inline uint64_t timer_now(void) +{ + uint32_t lo, hi; + __asm__ volatile ("isb" ::: "memory"); + /* MRRC p15, 0, Rlo, Rhi, c14 -> CNTPCT 64-bit physical counter */ + __asm__ volatile ("mrrc p15, 0, %0, %1, c14" : "=r"(lo), "=r"(hi)); + return ((uint64_t)hi << 32) | lo; +} + +static inline uint32_t timer_freq(void) +{ + uint32_t v; + /* MRC p15, 0, R, c14, c0, 0 -> CNTFRQ */ + __asm__ volatile ("mrc p15, 0, %0, c14, c0, 0" : "=r"(v)); + return v ? v : 333333333u; +} + +static inline void delay_us(uint32_t us) +{ + uint64_t start = timer_now(); + uint64_t target = ((uint64_t)us * (uint64_t)timer_freq()) / 1000000ULL; + while ((timer_now() - start) < target) { } +} + +static inline void delay_ms(uint32_t ms) +{ + delay_us(ms * 1000u); +} + +#endif /* ZYNQ7000_TIMER_H */ diff --git a/src/port/zynq7000/uart.c b/src/port/zynq7000/uart.c new file mode 100644 index 00000000..21334161 --- /dev/null +++ b/src/port/zynq7000/uart.c @@ -0,0 +1,133 @@ +/* uart.c + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. + * + * wolfIP is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * wolfIP is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + * Cadence (Xilinx PS) UART0 polled driver. ZCU102 routes UART0 to the + * on-board FTDI USB-UART (channel B / /dev/ttyUSB0 on the host). 
We + * assume FSBL has already pinned MIO 18/19 to UART0 and enabled its + * reference clock at 100 MHz (UART_REF_CLK divided to 6.25 MHz baudgen + * input by FSBL default); we just program the divisors for 115200 baud. + * + * Register reference: ZynqMP TRM (UG1085) chapter "UART Controller". + */ +#include +#include "board.h" +#include "uart.h" + +#define UART_CR (*(volatile uint32_t *)(UART0_BASE + 0x00)) +#define UART_MR (*(volatile uint32_t *)(UART0_BASE + 0x04)) +#define UART_BAUDGEN (*(volatile uint32_t *)(UART0_BASE + 0x18)) +#define UART_BAUD_DIV (*(volatile uint32_t *)(UART0_BASE + 0x34)) +#define UART_CHANNEL_STS (*(volatile uint32_t *)(UART0_BASE + 0x2C)) +#define UART_TX_RX_FIFO (*(volatile uint32_t *)(UART0_BASE + 0x30)) + +/* Control register bits */ +#define UART_CR_TXRES (1u << 1) /* TX software reset */ +#define UART_CR_RXRES (1u << 0) /* RX software reset */ +#define UART_CR_TXEN (1u << 4) +#define UART_CR_TXDIS (1u << 5) +#define UART_CR_RXEN (1u << 2) +#define UART_CR_RXDIS (1u << 3) +#define UART_CR_STPBRK (1u << 8) + +/* Mode register: 8N1, normal, no parity */ +#define UART_MR_8N1 ((0u << 8) | (4u << 3) | (0u << 1)) + +/* Channel status */ +#define UART_SR_TXFULL (1u << 4) +#define UART_SR_TXEMPTY (1u << 3) + +void uart_init(void) +{ + /* PetaLinux/Vitis FSBL's psu_init programs: + * IOPLL = 1500 MHz + * CRL_APB.UART0_REF_CTRL: SRCSEL=IOPLL, DIVISOR0=15, DIVISOR1=1 + * -> uart_ref_clk = 1500 / 15 / 1 = 100 MHz (sel_clk to baudgen) + * + * Cadence UART baud formula: + * baud = sel_clk / (CD * (BDIV + 1)) + * + * For 115200 with BDIV=6: + * CD = 100e6 / (115200 * 7) = 124 -> actual 115207, well under UART tol. + * + * If you change ref_clk (e.g. RPLL source, different divisors), recompute + * CD - this driver does not auto-detect from CRL_APB yet. 
*/ + UART_CR = UART_CR_TXDIS | UART_CR_RXDIS; + UART_CR |= UART_CR_TXRES | UART_CR_RXRES; + while (UART_CR & (UART_CR_TXRES | UART_CR_RXRES)) + ; /* wait for reset to self-clear */ + + UART_MR = UART_MR_8N1; + UART_BAUDGEN = 124; + UART_BAUD_DIV = 6; + + UART_CR = UART_CR_TXEN | UART_CR_RXEN | UART_CR_STPBRK; +} + +void uart_putc(char c) +{ + while (UART_CHANNEL_STS & UART_SR_TXFULL) + ; + UART_TX_RX_FIFO = (uint32_t)(unsigned char)c; +} + +void uart_puts(const char *s) +{ + while (*s) { + if (*s == '\n') + uart_putc('\r'); + uart_putc(*s++); + } +} + +void uart_puthex(uint32_t val) +{ + static const char hex[] = "0123456789ABCDEF"; + int i; + uart_puts("0x"); + for (i = 28; i >= 0; i -= 4) + uart_putc(hex[(val >> i) & 0xF]); +} + +void uart_putdec(uint32_t val) +{ + char buf[11]; + int i = 0; + if (val == 0) { + uart_putc('0'); + return; + } + while (val > 0 && i < (int)sizeof(buf)) { + buf[i++] = '0' + (char)(val % 10); + val /= 10; + } + while (i > 0) + uart_putc(buf[--i]); +} + +void uart_putip4(ip4 ip) +{ + uart_putdec((ip >> 24) & 0xFF); + uart_putc('.'); + uart_putdec((ip >> 16) & 0xFF); + uart_putc('.'); + uart_putdec((ip >> 8) & 0xFF); + uart_putc('.'); + uart_putdec(ip & 0xFF); +} diff --git a/src/port/zynq7000/uart.h b/src/port/zynq7000/uart.h new file mode 100644 index 00000000..aa3df3ef --- /dev/null +++ b/src/port/zynq7000/uart.h @@ -0,0 +1,20 @@ +/* uart.h + * + * Copyright (C) 2026 wolfSSL Inc. + * + * This file is part of wolfIP TCP/IP stack. 
+ */ +#ifndef ZCU102_UART_H +#define ZCU102_UART_H + +#include +#include "../../../wolfip.h" /* for ip4 */ + +void uart_init(void); +void uart_putc(char c); +void uart_puts(const char *s); +void uart_puthex(uint32_t val); +void uart_putdec(uint32_t val); +void uart_putip4(ip4 ip); + +#endif /* ZCU102_UART_H */ diff --git a/tools/scripts/zcu102/README.md b/tools/scripts/zcu102/README.md new file mode 100644 index 00000000..50a192c0 --- /dev/null +++ b/tools/scripts/zcu102/README.md @@ -0,0 +1,62 @@ +# ZCU102 JTAG bare-metal loader + +`jtag_load.tcl` is a generic AArch64 bare-metal JTAG loader for the +Xilinx ZCU102 (ZynqMP Cortex-A53 EL3). It lets you iterate on +bare-metal firmware without swapping the SD card. + +The src/port/zcu102/ directory has a wolfIP-specific wrapper around +this same pattern at `src/port/zcu102/jtag/boot.tcl`; this directory +holds the standalone reference so the pattern can be cloned into +other wolfSSL projects (wolfBoot, wolfTPM, wolfHSM, etc.) targeting +the same SoC. + +## Usage + +```sh +source /opt/Xilinx/2025.2/Vitis/settings64.sh + +# Build and produce a flat binary for the loader. +aarch64-none-elf-objcopy -O binary myapp.elf myapp.bin + +APP_ELF=$PWD/myapp.elf \ +APP_BIN=$PWD/myapp.bin \ +FSBL_PSU_INIT_TCL=/path/to/petalinux/hw-description/psu_init.tcl \ +xsdb tools/scripts/zcu102/jtag_load.tcl +``` + +ZCU102 must be in JTAG boot mode (SW6 = all ON). The loader expects +hw_server already running on localhost (Vitis starts it by default). + +## What it does + +1. `rst -system`, then `mwr 0xFF5E0200 0x0100` to force CSU JTAG bootmode +2. `psu_init` + `psu_post_config` to bring DDR / clocks / MIO / UART up +3. Re-initialize UART0 baud (psu_init alone doesn't always finish this) +4. Load `APP_BIN` word-by-word via `mwr -force` to OCM (0xFFFC0000) +5. Install a `b .` bootloop at the default RVBAR (0xFFFF0000) +6. 
`rst -processor` + `stop` + `rwr pc <entry>` + `con`
+
+## Constraints
+
+- App `.text` + `.rodata` + `.data` must fit in OCM (256 KiB).
+- App `.bss`, page tables, DMA buffers go in DDR, **above 0x10000**
+  (the first 16 KiB of DDR has a JTAG-DAP alias bug; avoid).
+- MMU page tables must map the OCM 2 MiB block (entry 511 of an L2
+  covering 0xC0000000..0xFFFFFFFF) as Normal + executable. Otherwise
+  `mmu_enable` faults on the next instruction fetch.
+
+## Five traps this loader avoids
+
+The corresponding wolfIP-specific loader at `src/port/zcu102/jtag/boot.tcl`
+has inline comments at each step. The traps are:
+
+1. DDR DAP 16-KiB alias at low addresses (use OCM).
+2. MMU L1 needs OCM carved out as Normal+exec (not Device+XN).
+3. CSU JTAG bootmode bit must be written before psu_init.
+4. `dow` to DDR breaks after psu_init - use `mwr -force` per word.
+5. RVBAR bootloop at 0xFFFF0000 lets `rst -processor` be safe.
+
+## Related
+
+- `src/port/zcu102/jtag/boot.tcl` -- wolfIP-specific instance
+- `tools/scripts/zynq7000/jtag_load.tcl` in `wolfBoot` -- ARMv7 analog
diff --git a/tools/scripts/zcu102/jtag_load.tcl b/tools/scripts/zcu102/jtag_load.tcl
new file mode 100644
index 00000000..5ec29cdc
--- /dev/null
+++ b/tools/scripts/zcu102/jtag_load.tcl
@@ -0,0 +1,175 @@
+# jtag_load.tcl - generic AArch64 JTAG bare-metal loader for ZCU102.
+#
+# Source-agnostic: works for any AArch64 EL3 bare-metal ELF whose
+# loadable text + vectors fit in OCM (0xFFFC0000, 256 KiB).
+# BSS / page-tables / DMA buffers can live in DDR; they get zeroed
+# by the app's own startup code, so it doesn't matter that DDR has
+# a JTAG-DAP 16-KiB alias bug at low addresses.
+#
+# Usage:
+#   source /opt/Xilinx/2025.2/Vitis/settings64.sh   # for xsdb on PATH
+#   FSBL_PSU_INIT_TCL=/path/to/psu_init.tcl \
+#   APP_ELF=/path/to/app.elf \
+#   APP_BIN=/path/to/app.bin \
+#   xsdb tools/scripts/zcu102/jtag_load.tcl
+#
+# Set the ZCU102 SW6 boot-mode straps to ALL ON (JTAG mode 0000)
+# and power the board on before running.
+#
+# This pattern was distilled from a working Xilinx PUF-provision
+# JTAG loader. See src/port/zcu102/README.md and the comments in
+# src/port/zcu102/jtag/boot.tcl for the full set of traps this
+# loader is built to avoid.
+
+set OCM_BASE 0xFFFC0000
+set OCM_SIZE 0x00040000
+set RVBAR_ADDR 0xFFFF0000
+
+if {![info exists ::env(APP_ELF)] || ![info exists ::env(APP_BIN)] \
+    || ![info exists ::env(FSBL_PSU_INIT_TCL)]} {
+    puts "Usage: APP_ELF=... APP_BIN=... FSBL_PSU_INIT_TCL=... xsdb $argv0"
+    exit 1
+}
+foreach var {APP_ELF APP_BIN FSBL_PSU_INIT_TCL} {
+    if {![file exists $::env($var)]} {
+        puts "ERROR: $var = $::env($var) not found"
+        exit 1
+    }
+}
+
+# Fail fast if the image cannot fit in OCM; warn if it reaches the
+# RVBAR boot loop, which step 5 writes over the already-loaded image.
+set app_size [file size $::env(APP_BIN)]
+if {$app_size > $OCM_SIZE} {
+    puts "ERROR: APP_BIN is $app_size bytes; OCM holds $OCM_SIZE"
+    exit 1
+}
+if {$OCM_BASE + $app_size > $RVBAR_ADDR} {
+    puts "WARNING: APP_BIN overlaps the boot loop at $RVBAR_ADDR"
+}
+
+# ---------------------------------------------------------------------
+# Load a flat binary file to a target address via mwr -force, one 32-
+# bit word at a time. Slow but reliable - bypasses xsdb's cache
+# coherency logic which is broken on DDR after psu_init.
+# ---------------------------------------------------------------------
+proc load_binary {bin_file base_addr} {
+    set fp [open $bin_file rb]
+    set data [read $fp]
+    close $fp
+    set len [string length $data]
+
+    set pad [expr {(4 - ($len % 4)) % 4}]
+    if {$pad > 0} { append data [string repeat "\x00" $pad] }
+    set words [expr {[string length $data] / 4}]
+
+    puts "  loading $len bytes ($words words) to [format 0x%08X $base_addr]"
+
+    targets -set -nocase -filter {name =~ "*PSU*"}
+    for {set i 0} {$i < $words} {incr i} {
+        set off [expr {$i * 4}]
+        binary scan $data @${off}iu word
+        mwr -force [format "0x%X" [expr {$base_addr + $off}]] \
+                   [format "0x%X" [expr {$word & 0xFFFFFFFF}]]
+        if {($i % 8192) == 0 && $i > 0} {
+            puts "    [expr {$i * 100 / $words}]%..."
+        }
+    }
+    puts "  100% done"
+}
+
+# ---------------------------------------------------------------------
+# 1. Connect, system reset, force CSU JTAG bootmode.
+# ---------------------------------------------------------------------
+puts "Connecting..."
+connect
+puts "All targets:"
+targets
+
+targets -set -nocase -filter {name =~ "*PSU*"}
+puts "System reset..."
+rst -system
+after 500
+
+puts "Forcing JTAG boot mode (CSU 0xFF5E0200 <- 0x0100)..."
+mwr 0xFF5E0200 0x0100
+after 1000
+
+# ---------------------------------------------------------------------
+# 2. psu_init (DDR, clocks, MIO, UART, GEM).
+# ---------------------------------------------------------------------
+puts "Sourcing psu_init.tcl..."
+source $::env(FSBL_PSU_INIT_TCL)
+puts "psu_init..."
+psu_init
+after 1000
+puts "psu_post_config..."
+psu_post_config
+after 500
+
+# ---------------------------------------------------------------------
+# 3. UART0 baud init at 115200 8N1 (100 MHz ref / 124 / 7 = 115207).
+# ---------------------------------------------------------------------
+puts "UART0 baud init..."
+targets -set -nocase -filter {name =~ "*PSU*"}
+mwr 0xFF000000 0x03     ;# CR: TX_RST + RX_RST
+mwr 0xFF000004 0x20     ;# MR: 8N1
+mwr 0xFF000018 124      ;# BAUDGEN.CD = 124
+mwr 0xFF000034 6        ;# BAUDDIV.BDIV = 6
+mwr 0xFF000000 0x114    ;# CR: TXEN + RXEN + STPBRK
+after 100
+
+foreach c [split "=== JTAG ready, loading app ===\r\n" ""] {
+    scan $c %c v
+    mwr -force 0xFF000030 $v
+}
+after 200
+
+# ---------------------------------------------------------------------
+# 4. Load the app binary into OCM.
+# ---------------------------------------------------------------------
+puts ""
+puts "Loading: $::env(APP_BIN) at [format 0x%08X $OCM_BASE]"
+load_binary $::env(APP_BIN) $OCM_BASE
+
+# ---------------------------------------------------------------------
+# 5. Install b . boot loop at default RVBAR_EL3 (0xFFFF0000).
+# ---------------------------------------------------------------------
+puts ""
+puts "Installing RVBAR boot loop at 0xFFFF0000..."
+targets -set -nocase -filter {name =~ "*PSU*"}
+mwr -force 0xFFFF0000 0x14000000    ;# B . (aarch64 self-branch)
+mwr -force 0xFFFF0004 0x14000000
+
+# ---------------------------------------------------------------------
+# 6. A53 #0: reset, halt, set PC, continue.
+# ---------------------------------------------------------------------
+puts ""
+puts "Preparing A53 #0..."
+targets -set -nocase -filter {name =~ "*A53*#0"}
+rst -processor
+after 200
+catch {stop}
+after 200
+puts "PC after rst -processor (expect 0xFFFF0000): [rrd pc]"
+
+# Pull entry point from the ELF.
+set readelf "aarch64-none-elf-readelf"
+if {[info exists ::env(READELF)]} { set readelf $::env(READELF) }
+set elf_hdr [exec $readelf -h $::env(APP_ELF)]
+if {![regexp {Entry point address:\s+(0x[0-9a-fA-F]+)} $elf_hdr -> entry]} {
+    puts "ERROR: no entry point found in '$readelf -h' output"
+    exit 1
+}
+puts "App ELF entry: $entry"
+rwr pc $entry
+puts "PC after rwr: [rrd pc]"
+
+puts ""
+puts "Continuing app..."
+con
+
+after 500
+puts "Detached. App is running."
+disconnect
+exit