Device Tree 是一个硬件系统的描述,是对于 ACPI 的一个静态、轻量级的平替。它在定制主板/PCB的 ARM 和 RISC-V 系统上广泛使用;但在一些传统技术积累深厚的平台和兼容 UEFI 的系统上 ACPI 仍是更常见的选择。

当 Kernel 被 boot 时,它首先得通过 ACPI 或 Device Tree 获得设备信息,尤其是内存和 CPU 信息;有了这些信息内核才能初始化内存子系统、内核堆、任务队列等最最基础的内核模块。

Device Tree 和 ACPI 都有网页版的规范文件,光看篇幅就能看出 Device Tree 相当简单,而 ACPI 功能丰富。

Flattened Device Tree Blob (FDT/DTB) 则是 bootloader 传递给 kernel 的 Device Tree 二进制编码,这是本文的主角,由 spec 的第五章描述;但要能解析 FDT 还是得看完整个 spec。

基本结构

它通过类似 JSON 的树状结构描述设备树。不同于 JSON 的超多类型,Device Tree 只由两种元素构成:Node 和 Property。Node 是一个带名字的子元素列表,而 Property 是一个键值对,其中值由一串 u32 构成(实质上类似 void*,由驱动自行解析内容)。可以描述如下:

/// Raw payload of a property: a sequence of 32-bit cells whose meaning is left to the consumer.
type Value = Vec<u32>;

/// One element of a device tree, in the shape used when looking at the stored form.
enum Element {
    /// A named node; `value` holds its child elements (nested nodes and properties).
    Node { name: String, value: Vec<Self> },
    /// A named key/value pair.
    Property { name: String, value: Value },
}

由于 Node 类的递归属性(且根节点唯一),一个“根”节点元素可以形成一个树结构,即描述为以下结构:

/// Tree-shaped view of a device tree node, with children and properties keyed by name.
struct Node {
    /// Child nodes, looked up by node name.
    children: Map<String, Self>,
    /// Properties of this node, looked up by property name.
    properties: Map<String, Value>
}

树有一个根 node,其他每个 node 都有从 root 到它的唯一路径,这一路径由 / 和 node name 串联起来,唯一确定;因而一个 path 唯一确定一个 node。

当研究存储结构时用的是前一种结构,而当作设备树时通常用后一种结构;需要根据语境灵活理解。

spec 规定了 device tree 根节点必须包含一些 node,比如一个 /cpus node 和至少一个 /memoryXXX node。它还规定了一些标准的 property,spec 里有详细说明,在此不赘述。

DTS

Device Tree Source 是一个 Device Tree 的人可读表述。下面是 QEMU (virt-9.1, cortex-a53) 传递给 kernel 的(不那么标准的)Device Tree Source:

/dts-v1/;

/ {
    interrupt-parent = &intc;
    dma-coherent;
    model = "linux,dummy-virt";
    #size-cells = <0x2>;
    #address-cells = <0x2>;
    compatible = "linux,dummy-virt";
    psci {
        migrate = <0xc4000005>;
        cpu_on = <0xc4000003>;
        cpu_off = <0x84000002>;
        cpu_suspend = <0xc4000001>;
        method = "hvc";
        compatible = "arm,psci-1.0", "arm,psci-0.2", "arm,psci";
    };
    memory@40000000 {
        reg = <0x40000000 0x2000000>;
        device_type = "memory";
    };
    platform-bus@c000000 {
        interrupt-parent = &intc;
        ranges = <0x0, 0x0, 0xc000000, 0x2000000>;
        #address-cells = <0x1>;
        #size-cells = <0x1>;
        compatible = "qemu,platform", "simple-bus";
    };
    fw-cfg@9020000 {
        dma-coherent;
        reg = <0x9020000 0x18>;
        compatible = "qemu,fw-cfg-mmio";
    };
    gpio-keys {
        compatible = "gpio-keys";
        poweroff {
            gpios = <0x8004, 0x3, 0x0>;
            linux,code = <0x74>;
            label = "GPIO Key Poweroff";
        };
    };
    pl061@9030000 {
        phandle = <0x8004>;
        clock-names = "apb_pclk";
        clocks = <0x8000>;
        interrupts = <0x0, 0x7, 0x4>;
        gpio-controller;
        #gpio-cells = <0x2>;
        compatible = "arm,pl061", "arm,primecell";
        reg = <0x9030000 0x1000>;
    };
    pcie@10000000 {
        interrupt-map-mask = <0x1800, 0x0, 0x0, 0x7>;
        interrupt-map = <0x0, 0x0, 0x0, 0x1, 0x8002, 0x0, 0x0, 0x0, 0x3, 0x4, 0x0, 0x0, 0x0, 0x2, 0x8002, 0x0, 0x0, 0x0, 0x4, 0x4, 0x0, 0x0, 0x0, 0x3, 0x8002, 0x0, 0x0, 0x0, 0x5, 0x4, 0x0, 0x0, 0x0, 0x4, 0x8002, 0x0, 0x0, 0x0, 0x6, 0x4, 0x800, 0x0, 0x0, 0x1, 0x8002, 0x0, 0x0, 0x0, 0x4, 0x4, 0x800, 0x0, 0x0, 0x2, 0x8002, 0x0, 0x0, 0x0, 0x5, 0x4, 0x800, 0x0, 0x0, 0x3, 0x8002, 0x0, 0x0, 0x0, 0x6, 0x4, 0x800, 0x0, 0x0, 0x4, 0x8002, 0x0, 0x0, 0x0, 0x3, 0x4, 0x1000, 0x0, 0x0, 0x1, 0x8002, 0x0, 0x0, 0x0, 0x5, 0x4, 0x1000, 0x0, 0x0, 0x2, 0x8002, 0x0, 0x0, 0x0, 0x6, 0x4, 0x1000, 0x0, 0x0, 0x3, 0x8002, 0x0, 0x0, 0x0, 0x3, 0x4, 0x1000, 0x0, 0x0, 0x4, 0x8002, 0x0, 0x0, 0x0, 0x4, 0x4, 0x1800, 0x0, 0x0, 0x1, 0x8002, 0x0, 0x0, 0x0, 0x6, 0x4, 0x1800, 0x0, 0x0, 0x2, 0x8002, 0x0, 0x0, 0x0, 0x3, 0x4, 0x1800, 0x0, 0x0, 0x3, 0x8002, 0x0, 0x0, 0x0, 0x4, 0x4, 0x1800, 0x0, 0x0, 0x4, 0x8002, 0x0, 0x0, 0x0, 0x5, 0x4>;
        #interrupt-cells = <0x1>;
        ranges = <0x1000000, 0x0, 0x0, 0x0, 0x3eff0000, 0x0, 0x10000, 0x2000000, 0x0, 0x10000000, 0x0, 0x10000000, 0x0, 0x2eff0000, 0x3000000, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0>;
        reg = <0x4010000000 0x10000000>;
        msi-map = <0x0, 0x8003, 0x0, 0x10000>;
        dma-coherent;
        bus-range = <0x0, 0xff>;
        linux,pci-domain = <0x0>;
        #size-cells = <0x2>;
        #address-cells = <0x3>;
        device_type = "pci";
        compatible = "pci-host-ecam-generic";
    };
    pl031@9010000 {
        clock-names = "apb_pclk";
        clocks = <0x8000>;
        interrupts = <0x0, 0x2, 0x4>;
        reg = <0x9010000 0x1000>;
        compatible = "arm,pl031", "arm,primecell";
    };
    pl011@9000000 {
        clock-names = "uartclk", "apb_pclk";
        clocks = <0x8000, 0x8000>;
        interrupts = <0x0, 0x1, 0x4>;
        reg = <0x9000000 0x1000>;
        compatible = "arm,pl011", "arm,primecell";
    };
    pmu {
        interrupts = <0x1, 0x7, 0x104>;
        compatible = "arm,armv8-pmuv3";
    };
    intc@8000000 {
        phandle = <0x8002>;
        reg = <0x8000000 0x10000, 0x8000000 0x10000>;
        compatible = "arm,cortex-a15-gic";
        ranges;
        #size-cells = <0x2>;
        #address-cells = <0x2>;
        interrupt-controller;
        #interrupt-cells = <0x3>;
        v2m@8020000 {
            phandle = <0x8003>;
            reg = <0x8020000 0x1000>;
            msi-controller;
            compatible = "arm,gic-v2m-frame";
        };
    };
    flash@0 {
        bank-width = <0x4>;
        reg = <0x0 0x4000000, 0x0 0x4000000>;
        compatible = "cfi-flash";
    };
    cpus {
        #size-cells = <0x0>;
        #address-cells = <0x1>;
        cpu-map {
            socket0 {
                cluster0 {
                    core0 {
                        cpu = <0x8001>;
                    };
                };
            };
        };
        cpu@0 {
            phandle = <0x8001>;
            reg = <>;
            compatible = "arm,cortex-a53";
            device_type = "cpu";
        };
    };
    timer {
        interrupts = <0x1, 0xd, 0x104, 0x1, 0xe, 0x104, 0x1, 0xb, 0x104, 0x1, 0xa, 0x104>;
        always-on;
        compatible = "arm,armv8-timer", "arm,armv7-timer";
    };
    apb-pclk {
        phandle = <0x8000>;
        clock-output-names = "clk24mhz";
        clock-frequency = <0x16e3600>;
        #clock-cells = <0x0>;
        compatible = "fixed-clock";
    };
    aliases {
        serial0 = "/pl011@9000000";
    };
    chosen {
        linux,initrd-end = <0x41000085>;
        linux,initrd-start = <0x41000000>;
        bootargs = "root=/dev/sda1 console=ttyS0";
        stdout-path = "/pl011@9000000";
        rng-seed = <0x8b39719e, 0xc0c23b1f, 0xf9e059aa, 0x686413b4, 0x872ed06f, 0x63c7478, 0x13371237, 0x34369cc5>;
        kaslr-seed = <0x7ba71524, 0xd32d8eee>;
    };
};

可以看到,每个花括号括起来了一个 Element::Node,花括号内部有一行行的键值对,是 Element::Property;还有键-子节点,是 Element::Node。

但这些值好像并非都是 u32 list:有字符串,有 u64 list,甚至有空的。实际上编码后它们都是 u32 list,下面的 DTB 部分会介绍。

具体的 DTS 语法可以参考 spec 第六章,这里不赘述。

DTB Struct

终于来到本文的主角:(Flattened) Device Tree Blob (DTB/FDT)。它是 Device Tree 的二进制编码,由 spec 第五章定义。

本节先介绍 DTB Struct,即主要的树结构。这个结构里,一切都是 u32 对齐的。它的整体结构和 DTS 很类似,规定了以下标记:

/// Token that opens a node; it is followed by the node's name string.
const FDT_BEGIN_NODE: u32 = 0x00000001;
/// Token that closes the most recently opened node.
const FDT_END_NODE: u32 = 0x00000002;
/// Token that introduces a property: value length, name offset, then the value bytes.
const FDT_PROP: u32 = 0x00000003;

每个标记对应着一个元素的开始或结束,其中 FDT_BEGIN_NODE 和 FDT_END_NODE 类似于 DTS 的 { 和 },标记着一个 Element::Node 的起止;FDT_PROP 则标记着一个 Element::Property。

具体地,它们的 Layout 如下:

  • FDT_BEGIN_NODE: 紧跟着一个字符串 name,接着下一个元素
  • FDT_END_NODE:接着下一个元素
  • FDT_PROP:紧跟着 prop 区域长度 (u32) 和 name offset (u32) ,接着规定长度的区域,接着下一个元素

需要说明:

  • DTB 所有数字都是 Big Endian
  • 所有非对齐到 4B 的内容,例如字符串和 property 内容,后面都会填充 0 直到对齐
  • DTB 里所有字符串都是 null-terminated,而不是 Rust 风格的长度-内容
  • PROP 的 name offset 在后面会解释
  • 除了上述标记,还有 FDT_NOP (无意义元素)和 FDT_END(整个 device tree 终止符)

对应解析代码如下:

/// One item produced while walking the structure block.
pub enum FdtItem<'a> {
    /// Start of a node, carrying the node's name.
    Node (&'a str),
    /// End of the most recently started node.
    EndNode,
    /// A property: its name and its raw value bytes.
    Property {name: &'a str, value: &'a [u8]},
    /// A token that is not one of the recognised markers.
    Unknown,
}

/// Iterator over the items of a device tree structure block.
pub struct FdtIter<'a> {
    /// The blob being walked; used to look up property names by offset.
    header: &'a Fdt,
    /// Current read position, moved in `u32` steps.
    cursor: U32Ptr
}

impl<'a> Iterator for FdtIter<'a> {
    type Item = FdtItem<'a>;
    fn next(&mut self) -> Option<Self::Item> {
        use FdtItem::*;
        let cur = &mut self.cursor;
        while cur.peek() == FDT_NOP {
            cur.advance(1);
        }
        match cur.peek() {
            FDT_BEGIN_NODE => {
                cur.advance(1);
                let name = from_cstr(cur.0 as _);
                cur.advance((name.len()+4)/4);
                Some(Node(name))
            },
            FDT_END_NODE => { 
                cur.advance(1);
                Some(EndNode) 
            },
            FDT_PROP => {
                cur.advance(1);
                let len = cur.take() as usize;
                let nameoff = cur.take();
                let name = self.header.get_string(nameoff);
                let value = cur.split_off(len);
                Some(Property { name, value })
            },
            FDT_END => None,
            _ => Some(Unknown),
        }
    }
}

/// Helper cursor over a raw `u32` pointer, inspired by the `bytes` crate.
struct U32Ptr(*const u32);

impl U32Ptr {
    /// Reads one big-endian `u32` and moves the cursor forward by 4 bytes.
    pub fn take(&mut self) -> u32;
    /// Reads the `u32` at the cursor, like a dereference, without moving it.
    pub fn peek(&self) -> u32;
    /// Moves the cursor forward by `off` units of 4 bytes.
    pub fn advance(&mut self, off: usize);
    /// Splits off `len` bytes as a slice; the cursor is then realigned to 4 bytes automatically.
    pub fn split_off<'a>(&mut self, len: usize) -> &'a [u8];
}

/// Converts the 0-terminated string starting at `addr` into a Rust-style `str`.
fn from_cstr<'a>(addr: *const u8) -> &'a str;

到树结构的转换也很简单:

/// Owned, tree-shaped form of a device tree node built from the item stream.
struct FdtNode {
    /// Child nodes keyed by node name.
    children: BTreeMap<String, Self>,
    /// Property values (raw bytes) keyed by property name.
    properties: BTreeMap<String, Vec<u8>>
}
impl<'a> FromIterator<FdtItem<'a>> for FdtNode {
    /// Builds the root node, with all of its descendants, from a stream of items.
    ///
    /// Items after the root's closing `EndNode` are not consumed.
    ///
    /// # Panics
    ///
    /// Panics if the first item is not a `Node`, if the stream ends before the
    /// root node is closed, or if an `Unknown` item is encountered.
    fn from_iter<T: IntoIterator<Item = FdtItem<'a>>>(iter: T) -> Self {
        let blank = || FdtNode {
            children: BTreeMap::new(),
            properties: BTreeMap::new(),
        };

        let mut items = iter.into_iter();
        // The stream has to open with the root node itself.
        assert!(matches!(items.next(), Some(Node(_))));

        // Ancestors that are still open, innermost last, each paired with the
        // name of the child that is being filled in below it.
        let mut open = Vec::new();
        // The node currently receiving properties and children; starts as root.
        let mut current = blank();

        loop {
            match items.next().unwrap() {
                Node(name) => {
                    // Park the parent and start collecting into a fresh child.
                    open.push((name, current));
                    current = blank();
                }
                EndNode => {
                    let Some((name, mut parent)) = open.pop() else {
                        // Nothing left to return to: the root just closed.
                        break current;
                    };
                    parent.children.insert(name.to_owned(), current);
                    current = parent;
                }
                Property { name, value } => {
                    current.properties.insert(name.to_owned(), value.to_owned());
                }
                Unknown => unreachable!(),
            }
        }
    }
}

DTB

最后再来看其他的 DTB 部分。

DTB 整体是一个连续的内存区域,由一个 Header 和三个块组成:

  • 内存保留块
  • 树结构块(上一章节介绍过)
  • 字符串块

Header 在内存区域的最开头,保证 4B aligned。它的结构定义如下:

/// Header located at the very start of the DTB. All fields are stored big-endian.
#[repr(C)]
struct DtbHeader {
    magic: u32, // fixed value 0xD00DFEED
    totalsize: u32, // total size
    off_dt_struct: u32, // start offset of the structure block
    off_dt_strings: u32, // start offset of the strings block
    off_mem_rsvmap: u32, // start offset of the memory reservation block
    version: u32,
    last_comp_version: u32,
    boot_cpuid_phys: u32, // id of the CPU used for booting
    size_dt_strings: u32, // size of the strings block
    size_dt_struct: u32, // size of the structure block
}

不过这里的 u32 是 big endian 的,在读值时需要转换。

可以如下方式解析:

/// Handle to a flattened device tree, holding a pointer to each of its three blocks.
pub struct Fdt {
    /// Start of the structure block.
    struct_: *const u32,
    /// Start of the strings block.
    strings: *const u8,
    /// Start of the memory reservation block.
    reserve: *const u32,
}

impl Fdt {
    /// Creates a handle for the flattened device tree located at `base`.
    ///
    /// Returns `None` when the header does not start with the DTB magic
    /// number. The version fields are not checked.
    ///
    /// `base` must be the 8-byte-aligned address of a readable DTB header; it
    /// is dereferenced without any further validation.
    pub fn new(base: usize) -> Option<Self> {
        // SAFETY: relies on the precondition documented above, namely that
        // `base` is aligned and points at readable memory holding a header.
        let hdr = unsafe { &*(base as *const DtbHeader) };
        // Header fields are stored big-endian; convert them before use.
        if u32::from_be(hdr.magic) != DTB_MAGIC {
            return None;
        }
        // Block offsets in the header are relative to the start of the blob.
        let block = |off: u32| base + u32::from_be(off) as usize;
        Some(Self {
            struct_: block(hdr.off_dt_struct) as *const u32,
            strings: block(hdr.off_dt_strings) as *const u8,
            reserve: block(hdr.off_mem_rsvmap) as *const u32,
        })
    }

    /// Returns the string stored `offset` bytes into the strings block.
    pub fn get_string(&self, offset: u32) -> &str {
        // `from_cstr` takes a byte pointer, so offset the pointer itself
        // instead of handing over a bare integer address.
        from_cstr(self.strings.wrapping_add(offset as usize))
    }

    /// Returns an iterator positioned at the start of the structure block.
    pub fn root_node(&self) -> FdtIter<'_> {
        FdtIter { header: self, cursor: U32Ptr(self.struct_) }
    }
}

后记

本文仅记录了 DTB 格式的解析,对内容的解读和驱动/内核的处理方式没有太多涉及。

一些梆硬的参考资料: