diff --git a/Cargo.lock b/Cargo.lock
index 6e3fed7..9192bef 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -37,17 +37,6 @@ dependencies = [
  "subtle",
 ]
 
-[[package]]
-name = "ahash"
-version = "0.7.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9"
-dependencies = [
- "getrandom 0.2.17",
- "once_cell",
- "version_check",
-]
-
 [[package]]
 name = "aho-corasick"
 version = "1.1.4"
@@ -357,7 +346,7 @@ version = "0.70.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
 dependencies = [
- "bitflags 2.13.0",
+ "bitflags",
  "cexpr",
  "clang-sys",
  "itertools",
@@ -376,7 +365,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
 dependencies = [
  "annotate-snippets",
- "bitflags 2.13.0",
+ "bitflags",
  "cexpr",
  "clang-sys",
  "itertools",
@@ -403,12 +392,6 @@ version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
 
-[[package]]
-name = "bitflags"
-version = "1.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
-
 [[package]]
 name = "bitflags"
 version = "2.13.0"
@@ -650,6 +633,25 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "crossbeam-utils"
 version = "0.8.21"
@@ -838,13 +840,20 @@ version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
 
+[[package]]
+name = "fec-rs"
+version = "0.1.0"
+dependencies = [
+ "rayon",
+]
+
 [[package]]
 name = "ffmpeg-next"
 version = "7.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "da02698288e0275e442a47fc12ca26d50daf0d48b15398ba5906f20ac2e2a9f9"
 dependencies = [
- "bitflags 2.13.0",
+ "bitflags",
  "ffmpeg-sys-next",
  "libc",
 ]
@@ -1078,15 +1087,6 @@ dependencies = [
  "tracing",
 ]
 
-[[package]]
-name = "hashbrown"
-version = "0.12.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
-dependencies = [
- "ahash",
-]
-
 [[package]]
 name = "hashbrown"
 version = "0.15.5"
@@ -1239,15 +1239,6 @@ dependencies = [
  "generic-array",
 ]
 
-[[package]]
-name = "instant"
-version = "0.1.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
-dependencies = [
- "cfg-if",
-]
-
 [[package]]
 name = "is_terminal_polyfill"
 version = "1.70.2"
@@ -1381,7 +1372,7 @@ version = "0.9.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6b8cfa2a7656627b4c92c6b9ef929433acd673d5ab3708cda1b18478ac00df4"
 dependencies = [
- "bitflags 2.13.0",
+ "bitflags",
  "cc",
  "convert_case",
  "cookie-factory",
@@ -1431,15 +1422,6 @@ dependencies = [
  "lumen-core",
 ]
 
-[[package]]
-name = "lru"
-version = "0.7.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e999beba7b6e8345721bd280141ed958096a2e4abdf74f67ff4ce49b4b54e47a"
-dependencies = [
- "hashbrown 0.12.3",
-]
-
 [[package]]
 name = "lru-slab"
 version = "0.1.2"
@@ -1462,10 +1444,10 @@ dependencies = [
  "aes-gcm",
  "bytes",
  "cbindgen",
+ "fec-rs",
  "proptest",
  "quinn",
  "rand 0.9.4",
- "reed-solomon-erasure",
  "reed-solomon-simd",
  "thiserror 2.0.18",
  "tokio",
@@ -1593,7 +1575,7 @@ version = "0.30.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6"
 dependencies = [
- "bitflags 2.13.0",
+ "bitflags",
  "cfg-if",
  "cfg_aliases",
  "libc",
@@ -1757,17 +1739,6 @@ version = "2.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba"
 
-[[package]]
-name = "parking_lot"
-version = "0.11.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
-dependencies = [
- "instant",
- "lock_api",
- "parking_lot_core 0.8.6",
-]
-
 [[package]]
 name = "parking_lot"
 version = "0.12.5"
@@ -1775,21 +1746,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
 dependencies = [
  "lock_api",
- "parking_lot_core 0.9.12",
-]
-
-[[package]]
-name = "parking_lot_core"
-version = "0.8.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc"
-dependencies = [
- "cfg-if",
- "instant",
- "libc",
- "redox_syscall 0.2.16",
- "smallvec",
- "winapi",
+ "parking_lot_core",
 ]
 
 [[package]]
@@ -1800,7 +1757,7 @@ checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
 dependencies = [
  "cfg-if",
  "libc",
- "redox_syscall 0.5.18",
+ "redox_syscall",
  "smallvec",
  "windows-link",
 ]
@@ -1843,7 +1800,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9688b89abf11d756499f7c6190711d6dbe5a3acdb30c8fbf001d6596d06a8d44"
 dependencies = [
  "anyhow",
- "bitflags 2.13.0",
+ "bitflags",
  "libc",
  "libspa",
  "libspa-sys",
@@ -1954,7 +1911,7 @@ checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744"
 dependencies = [
  "bit-set",
  "bit-vec",
- "bitflags 2.13.0",
+ "bitflags",
  "num-traits",
  "rand 0.9.4",
  "rand_chacha 0.9.0",
@@ -2126,6 +2083,26 @@ dependencies = [
  "rand_core 0.9.5",
 ]
 
+[[package]]
+name = "rayon"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "rcgen"
 version = "0.13.2"
@@ -2145,35 +2122,13 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "08ad765b21a08b1a8e5cdce052719188a23772bcbefb3c439f0baaf62c56ceac"
 
-[[package]]
-name = "redox_syscall"
-version = "0.2.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
-dependencies = [
- "bitflags 1.3.2",
-]
-
 [[package]]
 name = "redox_syscall"
 version = "0.5.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
 dependencies = [
- "bitflags 2.13.0",
-]
-
-[[package]]
-name = "reed-solomon-erasure"
-version = "6.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7263373d500d4d4f505d43a2a662d475a894aa94503a1ee28e9188b5f3960d4f"
-dependencies = [
- "libm",
- "lru",
- "parking_lot 0.11.2",
- "smallvec",
- "spin",
+ "bitflags",
 ]
 
 [[package]]
@@ -2278,7 +2233,7 @@ version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
 dependencies = [
- "bitflags 2.13.0",
+ "bitflags",
  "errno",
  "libc",
  "linux-raw-sys",
@@ -2434,7 +2389,7 @@ version = "3.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
 dependencies = [
- "bitflags 2.13.0",
+ "bitflags",
  "core-foundation",
  "core-foundation-sys",
  "libc",
@@ -2829,7 +2784,7 @@ dependencies = [
  "bytes",
  "libc",
  "mio",
- "parking_lot 0.12.5",
+ "parking_lot",
  "pin-project-lite",
  "signal-hook-registry",
  "socket2",
@@ -3257,7 +3212,7 @@ version = "0.244.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
 dependencies = [
- "bitflags 2.13.0",
+ "bitflags",
  "hashbrown 0.15.5",
  "indexmap",
  "semver",
@@ -3282,7 +3237,7 @@ version = "0.31.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "645c7c96bb74690c3189b5c9cb4ca1627062bb23693a4fad9d8c3de958260144"
 dependencies = [
- "bitflags 2.13.0",
+ "bitflags",
  "rustix",
  "wayland-backend",
  "wayland-scanner",
@@ -3294,7 +3249,7 @@ version = "0.32.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "563a85523cade2429938e790815fd7319062103b9f4a2dc806e9b53b95982d8f"
 dependencies = [
- "bitflags 2.13.0",
+ "bitflags",
  "wayland-backend",
  "wayland-client",
  "wayland-scanner",
@@ -3306,7 +3261,7 @@ version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6e9567599ef23e09b8dad6e429e5738d4509dfc46b3b21f32841a304d16b29c8"
 dependencies = [
- "bitflags 2.13.0",
+ "bitflags",
  "wayland-backend",
  "wayland-client",
  "wayland-protocols",
@@ -3319,7 +3274,7 @@ version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eb04e52f7836d7c7976c78ca0250d61e33873c34156a2a1fc9474828ec268234"
 dependencies = [
- "bitflags 2.13.0",
+ "bitflags",
  "wayland-backend",
  "wayland-client",
  "wayland-protocols",
@@ -3365,22 +3320,6 @@ dependencies = [
  "rustls-pki-types",
 ]
 
-[[package]]
-name = "winapi"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
-dependencies = [
- "winapi-i686-pc-windows-gnu",
- "winapi-x86_64-pc-windows-gnu",
-]
-
-[[package]]
-name = "winapi-i686-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
-
 [[package]]
 name = "winapi-util"
 version = "0.1.11"
@@ -3390,12 +3329,6 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
-[[package]]
-name = "winapi-x86_64-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
-
 [[package]]
 name = "windows-link"
 version = "0.2.1"
@@ -3703,7 +3636,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
 dependencies = [
  "anyhow",
- "bitflags 2.13.0",
+ "bitflags",
  "indexmap",
  "log",
  "serde",
diff --git a/crates/lumen-core/Cargo.toml b/crates/lumen-core/Cargo.toml
index 9a61c4f..0d1bf85 100644
--- a/crates/lumen-core/Cargo.toml
+++ b/crates/lumen-core/Cargo.toml
@@ -23,7 +23,11 @@ quic = ["dep:quinn", "dep:tokio"]
 
 [dependencies]
 reed-solomon-simd = "3.1"      # GF(2^16) Leopard-RS, SIMD, O(n log n) — the wall-breaker (P2)
-reed-solomon-erasure = "6.0"   # GF(2^8) classic RS — GameStream/Moonlight compat (P1)
+# Vendored fork of fec-rs: GF(2^8) classic RS with the *Cauchy* generator matrix
+# (M[j][i] = inv[(m+i)^j]) — byte-identical to the `nanors` library Moonlight uses, so our
+# parity is decodable by a stock Moonlight client. (reed-solomon-erasure is Vandermonde and is
+# NOT interoperable.) See vendor/fec-rs/LICENSE (BSD-2-Clause).
+fec-rs = { path = "vendor/fec-rs" }
 aes-gcm = "0.10"               # AES-128-GCM session crypto, matches GameStream
 zerocopy = { version = "0.8", features = ["derive"] }
 bytes = "1"
diff --git a/crates/lumen-core/src/fec/gf8.rs b/crates/lumen-core/src/fec/gf8.rs
index 4443d0f..8792ea6 100644
--- a/crates/lumen-core/src/fec/gf8.rs
+++ b/crates/lumen-core/src/fec/gf8.rs
@@ -1,9 +1,12 @@
-//! GF(2⁸) classic Reed–Solomon backend (`reed-solomon-erasure`), equivalent to the
-//! `nanors` library Moonlight uses. Hard ceiling: data + recovery ≤ 255 shards/block.
+//! GF(2⁸) classic Reed–Solomon backend (vendored `fec-rs`). Uses the **Cauchy** generator
+//! matrix `M[j][i] = inv[(m+i)^j]` over GF(2⁸) (poly 0x1d) — byte-identical to the `nanors`
+//! library Moonlight uses, so the parity this produces is recoverable by a stock Moonlight
+//! client (unlike Vandermonde RS, whose parity is not interoperable). Hard ceiling: data +
+//! recovery ≤ 255 shards/block.
 
 use super::{validate_block_shape, validate_encode_shape, ErasureCoder, FecError};
 use crate::config::FecScheme;
-use reed_solomon_erasure::galois_8::ReedSolomon;
+use fec_rs::ReedSolomon;
 
 pub struct Gf8Coder;
 
@@ -21,7 +24,7 @@ impl ErasureCoder for Gf8Coder {
         let shard_len = data[0].len();
         let rs = ReedSolomon::new(k, recovery_count)
             .map_err(|_| FecError::Config("invalid GF(2^8) shard counts"))?;
-        // reed-solomon-erasure fills parity in place: shards = data || zeroed parity.
+        // fec-rs fills parity in place: shards = data || zeroed parity.
         let mut shards: Vec<Vec<u8>> = Vec::with_capacity(k + recovery_count);
         shards.extend_from_slice(data);
         shards.resize_with(k + recovery_count, || vec![0u8; shard_len]);
@@ -69,3 +72,69 @@ fn collect_originals(
     }
     Ok(out)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Locks byte-exact compatibility with Moonlight's `nanors` (Cauchy matrix
+    /// `M[j][i] = inv[(m+i)^j]`, GF(2⁸) poly 0x1d). If the backend ever switched matrices,
+    /// these vectors would break and our parity would no longer be Moonlight-decodable.
+    #[test]
+    fn nanors_exact_parity_vectors() {
+        let coder = Gf8Coder;
+        // The definitive nanors vector (k=4, m=2): single-byte shards [10,20,30,40] → [136, 0].
+        let data = vec![vec![10u8], vec![20], vec![30], vec![40]];
+        let parity = coder.encode(&data, 2).unwrap();
+        assert_eq!(parity, vec![vec![136u8], vec![0u8]]);
+
+        // Cross-check independently from the Cauchy parity rows (proves the matrix, not just a
+        // memorized output): parity[j] = XOR_i M[j][i] · data[i] over GF(2⁸).
+        let rows = [[142u8, 244, 71, 167], [244, 142, 167, 71]];
+        let din = [10u8, 20, 30, 40];
+        for (j, row) in rows.iter().enumerate() {
+            let expect = row
+                .iter()
+                .zip(din)
+                .fold(0u8, |acc, (&m, d)| acc ^ gf_mul(m, d));
+            assert_eq!(parity[j][0], expect, "parity row {j}");
+        }
+    }
+
+    /// Round-trip: erase `m` data shards and confirm reconstruction recovers the originals.
+    #[test]
+    fn recovers_erased_data_shards() {
+        let coder = Gf8Coder;
+        let data: Vec<Vec<u8>> = (0..6).map(|i| vec![i as u8; 8]).collect();
+        let parity = coder.encode(&data, 3).unwrap();
+        let mut received: Vec<Option<Vec<u8>>> = data
+            .iter()
+            .cloned()
+            .map(Some)
+            .chain(parity.into_iter().map(Some))
+            .collect();
+        // Erase 3 data shards (the FEC budget) + nothing else.
+        received[1] = None;
+        received[3] = None;
+        received[5] = None;
+        let recovered = coder.reconstruct(6, 3, &mut received).unwrap();
+        assert_eq!(recovered, data);
+    }
+
+    /// GF(2⁸) multiply, reduction poly 0x1d — independent of the backend.
+    fn gf_mul(mut a: u8, mut b: u8) -> u8 {
+        let mut p = 0u8;
+        for _ in 0..8 {
+            if b & 1 != 0 {
+                p ^= a;
+            }
+            let hi = a & 0x80;
+            a <<= 1;
+            if hi != 0 {
+                a ^= 0x1d;
+            }
+            b >>= 1;
+        }
+        p
+    }
+}
diff --git a/crates/lumen-core/vendor/fec-rs/Cargo.toml b/crates/lumen-core/vendor/fec-rs/Cargo.toml
new file mode 100644
index 0000000..1df6dfc
--- /dev/null
+++ b/crates/lumen-core/vendor/fec-rs/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "fec-rs"
+version = "0.1.0"
+edition = "2021"
+description = "A pure Rust Reed-Solomon erasure coding library with runtime SIMD acceleration"
+license = "BSD-2-Clause"
+repository = "https://github.com/hgaiser/fec-rs"
+keywords = ["reed-solomon", "erasure", "coding", "fec", "simd"]
+categories = ["algorithms", "encoding"]
+readme = "README.md"
+
+[dependencies]
+rayon = { version = "1", optional = true }
+
+[features]
+default = []
+parallel = ["rayon"]
diff --git a/crates/lumen-core/vendor/fec-rs/LICENSE b/crates/lumen-core/vendor/fec-rs/LICENSE
new file mode 100644
index 0000000..0300fa9
--- /dev/null
+++ b/crates/lumen-core/vendor/fec-rs/LICENSE
@@ -0,0 +1,24 @@
+BSD 2-Clause License
+
+Copyright (c) 2026, Hans Gaiser
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/crates/lumen-core/vendor/fec-rs/README.md b/crates/lumen-core/vendor/fec-rs/README.md
new file mode 100644
index 0000000..009eb18
--- /dev/null
+++ b/crates/lumen-core/vendor/fec-rs/README.md
@@ -0,0 +1,73 @@
+# fec-rs
+
+[![CI](https://github.com/hgaiser/fec-rs/workflows/CI/badge.svg)](https://github.com/hgaiser/fec-rs/actions)
+[![Crates.io](https://img.shields.io/crates/v/fec-rs.svg)](https://crates.io/crates/fec-rs)
+[![Documentation](https://docs.rs/fec-rs/badge.svg)](https://docs.rs/fec-rs)
+
+A pure Rust Reed-Solomon erasure coding library with runtime SIMD acceleration.
+
+## Features
+
+- **Pure Rust** — No C/C++ dependencies or FFI. Everything is implemented in safe Rust
+  (with targeted `unsafe` for SIMD intrinsics).
+- **Runtime SIMD detection** — Automatically uses the fastest available instruction set
+  via `std::is_x86_feature_detected!`. A single binary works on all x86_64 systems.
+- **GF(2^8)** — Operates over the Galois field GF(2^8) with generating polynomial 29 (0x1D),
+  compatible with the Moonlight streaming protocol.
+- **Shard-by-shard encoding** — Incremental encoding via `ShardByShard` for streaming use cases.
+- **Reconstruction** — Reconstruct missing data and/or parity shards from any sufficient subset.
+
+## SIMD Acceleration
+
+On x86_64, the library automatically detects CPU features at runtime and uses
+the best available instruction set:
+
+- **GFNI + AVX2** — Single-instruction GF multiply on 32 bytes (Intel Alder Lake+, AMD Zen 4+)
+- **AVX2** — VPSHUFB split-table nibble lookup on 32 bytes
+- **GFNI + SSE** — Single-instruction GF multiply on 16 bytes
+- **SSSE3** — VPSHUFB split-table nibble lookup on 16 bytes
+- **Scalar** — Lookup table fallback
+
+## Parallel Encoding
+
+Enable the `parallel` feature for optional rayon-based parallel encoding:
+
+```toml
+fec-rs = { version = "0.1", features = ["parallel"] }
+```
+
+When enabled, large encode workloads automatically distribute parity shard
+computation across threads. Small workloads use the sequential path to avoid
+overhead.
+
+## Usage
+
+```rust
+use fec_rs::ReedSolomon;
+
+let rs = ReedSolomon::new(4, 2).unwrap();
+
+let mut shards: Vec<Vec<u8>> = vec![
+    vec![0, 1, 2, 3],
+    vec![4, 5, 6, 7],
+    vec![8, 9, 10, 11],
+    vec![12, 13, 14, 15],
+    vec![0, 0, 0, 0], // parity shard 1
+    vec![0, 0, 0, 0], // parity shard 2
+];
+
+// Encode parity
+rs.encode(&mut shards).unwrap();
+
+// Verify
+assert!(rs.verify(&shards).unwrap());
+
+// Simulate loss of shard 0
+let mut recovery: Vec<Option<Vec<u8>>> = shards.into_iter().map(Some).collect();
+recovery[0] = None;
+
+// Reconstruct
+rs.reconstruct(&mut recovery).unwrap();
+```
+
+License: BSD-2-Clause
diff --git a/crates/lumen-core/vendor/fec-rs/build.rs b/crates/lumen-core/vendor/fec-rs/build.rs
new file mode 100644
index 0000000..d6d9a02
--- /dev/null
+++ b/crates/lumen-core/vendor/fec-rs/build.rs
@@ -0,0 +1,200 @@
+#![allow(clippy::needless_range_loop)]
+
+use std::env;
+use std::fs::File;
+use std::io::Write;
+use std::path::Path;
+
+const FIELD_SIZE: usize = 256;
+const GENERATING_POLYNOMIAL: usize = 29;
+
+fn gen_log_table(polynomial: usize) -> [u8; FIELD_SIZE] {
+    let mut result = [0u8; FIELD_SIZE];
+    let mut b: usize = 1;
+
+    for log in 0..FIELD_SIZE - 1 {
+        result[b] = log as u8;
+        b <<= 1;
+        if FIELD_SIZE <= b {
+            b = (b - FIELD_SIZE) ^ polynomial;
+        }
+    }
+
+    result
+}
+
+const EXP_TABLE_SIZE: usize = FIELD_SIZE * 2 - 2;
+
+fn gen_exp_table(log_table: &[u8; FIELD_SIZE]) -> [u8; EXP_TABLE_SIZE] {
+    let mut result = [0u8; EXP_TABLE_SIZE];
+
+    for i in 1..FIELD_SIZE {
+        let log = log_table[i] as usize;
+        result[log] = i as u8;
+        result[log + FIELD_SIZE - 1] = i as u8;
+    }
+
+    result
+}
+
+fn multiply(log_table: &[u8; FIELD_SIZE], exp_table: &[u8; EXP_TABLE_SIZE], a: u8, b: u8) -> u8 {
+    if a == 0 || b == 0 {
+        0
+    } else {
+        let log_a = log_table[a as usize];
+        let log_b = log_table[b as usize];
+        let log_result = log_a as usize + log_b as usize;
+        exp_table[log_result]
+    }
+}
+
+fn gen_mul_table(
+    log_table: &[u8; FIELD_SIZE],
+    exp_table: &[u8; EXP_TABLE_SIZE],
+) -> [[u8; FIELD_SIZE]; FIELD_SIZE] {
+    let mut result = [[0u8; FIELD_SIZE]; FIELD_SIZE];
+
+    for a in 0..FIELD_SIZE {
+        for b in 0..FIELD_SIZE {
+            result[a][b] = multiply(log_table, exp_table, a as u8, b as u8);
+        }
+    }
+
+    result
+}
+
+fn gen_mul_table_half(
+    log_table: &[u8; FIELD_SIZE],
+    exp_table: &[u8; EXP_TABLE_SIZE],
+) -> ([[u8; 16]; FIELD_SIZE], [[u8; 16]; FIELD_SIZE]) {
+    let mut low = [[0u8; 16]; FIELD_SIZE];
+    let mut high = [[0u8; 16]; FIELD_SIZE];
+
+    for a in 0..FIELD_SIZE {
+        for b in 0..FIELD_SIZE {
+            let mut result = 0;
+            if a != 0 && b != 0 {
+                let log_a = log_table[a];
+                let log_b = log_table[b];
+                result = exp_table[log_a as usize + log_b as usize];
+            }
+            if (b & 0x0F) == b {
+                low[a][b] = result;
+            }
+            if (b & 0xF0) == b {
+                high[a][b >> 4] = result;
+            }
+        }
+    }
+    (low, high)
+}
+
+/// Generate the GFNI affine matrix table.
+///
+/// For each constant `c` in GF(2^8), compute a u64-packed 8x8 binary matrix
+/// such that `vgf2p8affineqb(x, matrix, 0)` produces `c * x` in our GF(2^8).
+///
+/// vgf2p8affineqb semantics:
+///   result_bit[i] = popcount(x AND qword_byte[7-i]) mod 2
+/// where i goes from 0 (LSB) to 7 (MSB).
+///
+/// Matrix packing: qword byte[7] = row for output bit 7 (MSB),
+///                 qword byte[0] = row for output bit 0 (LSB).
+fn gen_gfni_table(
+    log_table: &[u8; FIELD_SIZE],
+    exp_table: &[u8; EXP_TABLE_SIZE],
+) -> [u64; FIELD_SIZE] {
+    let mut result = [0u64; FIELD_SIZE];
+
+    for c in 0..FIELD_SIZE {
+        // Build row bytes for each output bit.
+        // row_for_bit_i = mask where bit j is set iff input bit j contributes to output bit i.
+        // M[i][j] = bit_i(c * (1 << j))
+        let mut rows = [0u8; 8];
+        for j in 0..8u8 {
+            let basis = 1u8 << j; // input with only bit j set
+            let product = multiply(log_table, exp_table, c as u8, basis);
+            // product's bit i tells us M[i][j]
+            for i in 0..8u8 {
+                if (product >> i) & 1 == 1 {
+                    rows[i as usize] |= 1 << j;
+                }
+            }
+        }
+
+        // Pack into u64: byte[7-i] = rows[i]
+        // vgf2p8affineqb: result_bit[i] = popcount(x AND byte[7-i]) mod 2
+        // We want result_bit[i] = bit i of (c*x), so byte[7-i] = rows[i].
+        let mut matrix: u64 = 0;
+        for i in 0..8u32 {
+            matrix |= (rows[i as usize] as u64) << ((7 - i) * 8);
+        }
+        result[c] = matrix;
+    }
+
+    result
+}
+
+fn write_1d_table(f: &mut File, table: &[u8], name: &str) {
+    let len = table.len();
+    write!(f, "pub static {name}: [u8; {len}] = [").unwrap();
+    for v in table {
+        write!(f, "{v}, ").unwrap();
+    }
+    writeln!(f, "];").unwrap();
+}
+
+fn write_2d_table(f: &mut File, table: &[[u8; 16]; FIELD_SIZE], name: &str) {
+    let rows = table.len();
+    let cols = table[0].len();
+    write!(f, "pub static {name}: [[u8; {cols}]; {rows}] = [").unwrap();
+    for row in table {
+        write!(f, "[").unwrap();
+        for v in row {
+            write!(f, "{v}, ").unwrap();
+        }
+        writeln!(f, "],").unwrap();
+    }
+    writeln!(f, "];").unwrap();
+}
+
+fn write_mul_table(f: &mut File, table: &[[u8; FIELD_SIZE]; FIELD_SIZE]) {
+    let rows = table.len();
+    let cols = table[0].len();
+    write!(f, "pub static MUL_TABLE: [[u8; {cols}]; {rows}] = [").unwrap();
+    for row in table {
+        write!(f, "[").unwrap();
+        for v in row {
+            write!(f, "{v}, ").unwrap();
+        }
+        writeln!(f, "],").unwrap();
+    }
+    writeln!(f, "];").unwrap();
+}
+
+fn write_gfni_table(f: &mut File, table: &[u64; FIELD_SIZE]) {
+    write!(f, "pub static GFNI_TABLE: [u64; {}] = [", FIELD_SIZE).unwrap();
+    for v in table {
+        write!(f, "0x{v:016X}, ").unwrap();
+    }
+    writeln!(f, "];").unwrap();
+}
+
+fn main() {
+    let log_table = gen_log_table(GENERATING_POLYNOMIAL);
+    let exp_table = gen_exp_table(&log_table);
+    let mul_table = gen_mul_table(&log_table, &exp_table);
+    let (mul_table_low, mul_table_high) = gen_mul_table_half(&log_table, &exp_table);
+    let gfni_table = gen_gfni_table(&log_table, &exp_table);
+
+    let out_dir = env::var("OUT_DIR").unwrap();
+    let dest_path = Path::new(&out_dir).join("tables.rs");
+    let mut f = File::create(&dest_path).unwrap();
+
+    write_1d_table(&mut f, &log_table, "LOG_TABLE");
+    write_1d_table(&mut f, &exp_table, "EXP_TABLE");
+    write_mul_table(&mut f, &mul_table);
+    write_2d_table(&mut f, &mul_table_low, "MUL_TABLE_LOW");
+    write_2d_table(&mut f, &mul_table_high, "MUL_TABLE_HIGH");
+    write_gfni_table(&mut f, &gfni_table);
+}
diff --git a/crates/lumen-core/vendor/fec-rs/src/errors.rs b/crates/lumen-core/vendor/fec-rs/src/errors.rs
new file mode 100644
index 0000000..d1681af
--- /dev/null
+++ b/crates/lumen-core/vendor/fec-rs/src/errors.rs
@@ -0,0 +1,61 @@
+use core::fmt;
+
+#[derive(PartialEq, Debug, Clone, Copy)]
+pub enum Error {
+    TooFewShards,
+    TooManyShards,
+    TooFewDataShards,
+    TooManyDataShards,
+    TooFewParityShards,
+    TooManyParityShards,
+    TooFewBufferShards,
+    TooManyBufferShards,
+    IncorrectShardSize,
+    TooFewShardsPresent,
+    EmptyShard,
+    InvalidIndex,
+    InvalidParityMatrix,
+    SingularMatrix,
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Error::TooFewShards => write!(f, "Too few shards"),
+            Error::TooManyShards => write!(f, "Too many shards"),
+            Error::TooFewDataShards => write!(f, "Too few data shards"),
+            Error::TooManyDataShards => write!(f, "Too many data shards"),
+            Error::TooFewParityShards => write!(f, "Too few parity shards"),
+            Error::TooManyParityShards => write!(f, "Too many parity shards"),
+            Error::TooFewBufferShards => write!(f, "Too few buffer shards"),
+            Error::TooManyBufferShards => write!(f, "Too many buffer shards"),
+            Error::IncorrectShardSize => write!(f, "Incorrect shard size"),
+            Error::TooFewShardsPresent => write!(f, "Too few shards present for reconstruction"),
+            Error::EmptyShard => write!(f, "Empty shard"),
+            Error::InvalidIndex => write!(f, "Invalid index"),
+            Error::InvalidParityMatrix => write!(f, "Invalid parity matrix"),
+            Error::SingularMatrix => write!(f, "Singular matrix"),
+        }
+    }
+}
+
+impl std::error::Error for Error {}
+
+#[derive(PartialEq, Debug, Clone, Copy)]
+pub enum SBSError {
+    TooManyCalls,
+    LeftoverShards,
+    RSError(Error),
+}
+
+impl fmt::Display for SBSError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            SBSError::TooManyCalls => write!(f, "Too many calls"),
+            SBSError::LeftoverShards => write!(f, "Leftover shards"),
+            SBSError::RSError(e) => write!(f, "{e}"),
+        }
+    }
+}
+
+impl std::error::Error for SBSError {}
diff --git a/crates/lumen-core/vendor/fec-rs/src/galois.rs b/crates/lumen-core/vendor/fec-rs/src/galois.rs
new file mode 100644
index 0000000..1245494
--- /dev/null
+++ b/crates/lumen-core/vendor/fec-rs/src/galois.rs
@@ -0,0 +1,636 @@
+include!(concat!(env!("OUT_DIR"), "/tables.rs"));
+
+/// Add two GF(2^8) elements (XOR).
+#[inline(always)]
+pub fn add(a: u8, b: u8) -> u8 {
+    a ^ b
+}
+
+/// Multiply two GF(2^8) elements using lookup table.
+#[inline(always)]
+pub fn mul(a: u8, b: u8) -> u8 {
+    MUL_TABLE[a as usize][b as usize]
+}
+
+/// Divide a by b in GF(2^8). Panics if b is 0.
+#[inline(always)]
+pub fn div(a: u8, b: u8) -> u8 {
+    if a == 0 {
+        return 0;
+    }
+    assert!(b != 0, "Division by zero in GF(2^8)");
+    let log_a = LOG_TABLE[a as usize] as isize;
+    let log_b = LOG_TABLE[b as usize] as isize;
+    let mut log_result = log_a - log_b;
+    if log_result < 0 {
+        log_result += 255;
+    }
+    EXP_TABLE[log_result as usize]
+}
+
+/// Compute a^n in GF(2^8).
+#[inline(always)]
+pub fn exp(a: u8, n: usize) -> u8 {
+    if n == 0 {
+        return 1;
+    }
+    if a == 0 {
+        return 0;
+    }
+    let log_a = LOG_TABLE[a as usize] as usize;
+    let log_result = log_a * (n % 255) % 255;
+    EXP_TABLE[log_result]
+}
+
+/// Multiply each element of `input` by `c` and write to `out`.
+///
+/// Uses SIMD acceleration when available:
+/// - GFNI + AVX2 (best: single-instruction GF multiply on 32 bytes)
+/// - AVX2 VPSHUFB (split-table nibble lookup on 32 bytes)
+/// - GFNI + SSE (single-instruction GF multiply on 16 bytes)
+/// - SSSE3 VPSHUFB (split-table nibble lookup on 16 bytes)
+/// - Scalar fallback
+#[inline]
+pub fn mul_slice(c: u8, input: &[u8], out: &mut [u8]) {
+    assert_eq!(input.len(), out.len());
+    if input.is_empty() || c == 0 {
+        out.iter_mut().for_each(|o| *o = 0);
+        return;
+    }
+    if c == 1 {
+        out.copy_from_slice(input);
+        return;
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    {
+        if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx2") {
+            unsafe {
+                mul_slice_gfni_avx2(c, input, out);
+            }
+            return;
+        }
+        if is_x86_feature_detected!("avx2") {
+            unsafe {
+                mul_slice_avx2(c, input, out);
+            }
+            return;
+        }
+        if is_x86_feature_detected!("gfni") {
+            unsafe {
+                mul_slice_gfni_sse(c, input, out);
+            }
+            return;
+        }
+        if is_x86_feature_detected!("ssse3") {
+            unsafe {
+                mul_slice_ssse3(c, input, out);
+            }
+            return;
+        }
+    }
+
+    mul_slice_scalar(c, input, out);
+}
+
+/// Multiply each element of `input` by `c` and XOR into `out`.
+///
+/// Uses SIMD acceleration when available (same priority as `mul_slice`).
+#[inline]
+pub fn mul_slice_xor(c: u8, input: &[u8], out: &mut [u8]) {
+    assert_eq!(input.len(), out.len());
+    if input.is_empty() || c == 0 {
+        return;
+    }
+    if c == 1 {
+        for (o, i) in out.iter_mut().zip(input.iter()) {
+            *o ^= *i;
+        }
+        return;
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    {
+        if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx2") {
+            unsafe {
+                mul_slice_xor_gfni_avx2(c, input, out);
+            }
+            return;
+        }
+        if is_x86_feature_detected!("avx2") {
+            unsafe {
+                mul_slice_xor_avx2(c, input, out);
+            }
+            return;
+        }
+        if is_x86_feature_detected!("gfni") {
+            unsafe {
+                mul_slice_xor_gfni_sse(c, input, out);
+            }
+            return;
+        }
+        if is_x86_feature_detected!("ssse3") {
+            unsafe {
+                mul_slice_xor_ssse3(c, input, out);
+            }
+            return;
+        }
+    }
+
+    mul_slice_xor_scalar(c, input, out);
+}
+
+/// Function pointer types for bulk GF(2^8) operations.
+pub type MulSliceFn = fn(u8, &[u8], &mut [u8]);
+
+/// Pair of (mul_slice, mul_slice_xor) function pointers for the best available SIMD path.
+///
+/// Unlike `mul_slice`/`mul_slice_xor`, these skip runtime feature detection on every call.
+/// The caller checks once and stores the result.
+///
+/// Note: These raw dispatch functions do NOT handle the c==0 or c==1 special cases.
+/// The caller must handle those before calling through the function pointer.
+pub fn detect_mul_slice() -> (MulSliceFn, MulSliceFn) {
+    #[cfg(target_arch = "x86_64")]
+    {
+        if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx2") {
+            return (
+                wrap_mul_slice_gfni_avx2 as MulSliceFn,
+                wrap_mul_slice_xor_gfni_avx2 as MulSliceFn,
+            );
+        }
+        if is_x86_feature_detected!("avx2") {
+            return (
+                wrap_mul_slice_avx2 as MulSliceFn,
+                wrap_mul_slice_xor_avx2 as MulSliceFn,
+            );
+        }
+        if is_x86_feature_detected!("gfni") {
+            return (
+                wrap_mul_slice_gfni_sse as MulSliceFn,
+                wrap_mul_slice_xor_gfni_sse as MulSliceFn,
+            );
+        }
+        if is_x86_feature_detected!("ssse3") {
+            return (
+                wrap_mul_slice_ssse3 as MulSliceFn,
+                wrap_mul_slice_xor_ssse3 as MulSliceFn,
+            );
+        }
+    }
+    (
+        mul_slice_scalar as MulSliceFn,
+        mul_slice_xor_scalar as MulSliceFn,
+    )
+}
+
+// Safe wrappers for SIMD functions (used as function pointer targets)
+#[cfg(target_arch = "x86_64")]
+fn wrap_mul_slice_gfni_avx2(c: u8, input: &[u8], out: &mut [u8]) {
+    unsafe { mul_slice_gfni_avx2(c, input, out) }
+}
+#[cfg(target_arch = "x86_64")]
+fn wrap_mul_slice_xor_gfni_avx2(c: u8, input: &[u8], out: &mut [u8]) {
+    unsafe { mul_slice_xor_gfni_avx2(c, input, out) }
+}
+#[cfg(target_arch = "x86_64")]
+fn wrap_mul_slice_avx2(c: u8, input: &[u8], out: &mut [u8]) {
+    unsafe { mul_slice_avx2(c, input, out) }
+}
+#[cfg(target_arch = "x86_64")]
+fn wrap_mul_slice_xor_avx2(c: u8, input: &[u8], out: &mut [u8]) {
+    unsafe { mul_slice_xor_avx2(c, input, out) }
+}
+#[cfg(target_arch = "x86_64")]
+fn wrap_mul_slice_gfni_sse(c: u8, input: &[u8], out: &mut [u8]) {
+    unsafe { mul_slice_gfni_sse(c, input, out) }
+}
+#[cfg(target_arch = "x86_64")]
+fn wrap_mul_slice_xor_gfni_sse(c: u8, input: &[u8], out: &mut [u8]) {
+    unsafe { mul_slice_xor_gfni_sse(c, input, out) }
+}
+#[cfg(target_arch = "x86_64")]
+fn wrap_mul_slice_ssse3(c: u8, input: &[u8], out: &mut [u8]) {
+    unsafe { mul_slice_ssse3(c, input, out) }
+}
+#[cfg(target_arch = "x86_64")]
+fn wrap_mul_slice_xor_ssse3(c: u8, input: &[u8], out: &mut [u8]) {
+    unsafe { mul_slice_xor_ssse3(c, input, out) }
+}
+
+// ── Scalar fallback ──────────────────────────────────────────────────────
+
+fn mul_slice_scalar(c: u8, input: &[u8], out: &mut [u8]) {
+    let mt = &MUL_TABLE[c as usize];
+    for (o, &i) in out.iter_mut().zip(input.iter()) {
+        *o = mt[i as usize];
+    }
+}
+
+fn mul_slice_xor_scalar(c: u8, input: &[u8], out: &mut [u8]) {
+    let mt = &MUL_TABLE[c as usize];
+    for (o, &i) in out.iter_mut().zip(input.iter()) {
+        *o ^= mt[i as usize];
+    }
+}
+
+// ── x86_64 SIMD implementations ─────────────────────────────────────────
+
+// ── GFNI + AVX2 (best path: 32 bytes per vgf2p8affineqb) ──────────────
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "gfni,avx2")]
+unsafe fn mul_slice_gfni_avx2(c: u8, input: &[u8], out: &mut [u8]) {
+    use core::arch::x86_64::*;
+
+    let matrix = GFNI_TABLE[c as usize] as i64;
+    let mat_vec = _mm256_set1_epi64x(matrix);
+
+    let len = input.len();
+    let mut i = 0;
+
+    while i + 32 <= len {
+        let data = _mm256_loadu_si256(input.as_ptr().add(i) as *const _);
+        let result = _mm256_gf2p8affine_epi64_epi8(data, mat_vec, 0);
+        _mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut _, result);
+        i += 32;
+    }
+
+    let mt = &MUL_TABLE[c as usize];
+    while i < len {
+        *out.get_unchecked_mut(i) = mt[*input.get_unchecked(i) as usize];
+        i += 1;
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "gfni,avx2")]
+unsafe fn mul_slice_xor_gfni_avx2(c: u8, input: &[u8], out: &mut [u8]) {
+    use core::arch::x86_64::*;
+
+    let matrix = GFNI_TABLE[c as usize] as i64;
+    let mat_vec = _mm256_set1_epi64x(matrix);
+
+    let len = input.len();
+    let mut i = 0;
+
+    while i + 32 <= len {
+        let data = _mm256_loadu_si256(input.as_ptr().add(i) as *const _);
+        let existing = _mm256_loadu_si256(out.as_ptr().add(i) as *const _);
+        let mul_result = _mm256_gf2p8affine_epi64_epi8(data, mat_vec, 0);
+        let result = _mm256_xor_si256(mul_result, existing);
+        _mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut _, result);
+        i += 32;
+    }
+
+    let mt = &MUL_TABLE[c as usize];
+    while i < len {
+        *out.get_unchecked_mut(i) ^= mt[*input.get_unchecked(i) as usize];
+        i += 1;
+    }
+}
+
+// ── GFNI + SSE (16 bytes per vgf2p8affineqb) ──────────────────────────
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "gfni")]
+unsafe fn mul_slice_gfni_sse(c: u8, input: &[u8], out: &mut [u8]) {
+    use core::arch::x86_64::*;
+
+    let matrix = GFNI_TABLE[c as usize] as i64;
+    let mat_vec = _mm_set1_epi64x(matrix);
+
+    let len = input.len();
+    let mut i = 0;
+
+    while i + 16 <= len {
+        let data = _mm_loadu_si128(input.as_ptr().add(i) as *const _);
+        let result = _mm_gf2p8affine_epi64_epi8(data, mat_vec, 0);
+        _mm_storeu_si128(out.as_mut_ptr().add(i) as *mut _, result);
+        i += 16;
+    }
+
+    let mt = &MUL_TABLE[c as usize];
+    while i < len {
+        *out.get_unchecked_mut(i) = mt[*input.get_unchecked(i) as usize];
+        i += 1;
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "gfni")]
+unsafe fn mul_slice_xor_gfni_sse(c: u8, input: &[u8], out: &mut [u8]) {
+    use core::arch::x86_64::*;
+
+    let matrix = GFNI_TABLE[c as usize] as i64;
+    let mat_vec = _mm_set1_epi64x(matrix);
+
+    let len = input.len();
+    let mut i = 0;
+
+    while i + 16 <= len {
+        let data = _mm_loadu_si128(input.as_ptr().add(i) as *const _);
+        let existing = _mm_loadu_si128(out.as_ptr().add(i) as *const _);
+        let mul_result = _mm_gf2p8affine_epi64_epi8(data, mat_vec, 0);
+        let result = _mm_xor_si128(mul_result, existing);
+        _mm_storeu_si128(out.as_mut_ptr().add(i) as *mut _, result);
+        i += 16;
+    }
+
+    let mt = &MUL_TABLE[c as usize];
+    while i < len {
+        *out.get_unchecked_mut(i) ^= mt[*input.get_unchecked(i) as usize];
+        i += 1;
+    }
+}
+
+// ── AVX2 VPSHUFB (32 bytes, split-table nibble lookup) ─────────────────
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn mul_slice_avx2(c: u8, input: &[u8], out: &mut [u8]) {
+    use core::arch::x86_64::*;
+
+    let low = &MUL_TABLE_LOW[c as usize];
+    let high = &MUL_TABLE_HIGH[c as usize];
+
+    // Broadcast the 16-byte low/high tables to 256-bit registers by duplicating
+    let low_vec = _mm256_broadcastsi128_si256(_mm_loadu_si128(low.as_ptr() as *const _));
+    let high_vec = _mm256_broadcastsi128_si256(_mm_loadu_si128(high.as_ptr() as *const _));
+    let mask = _mm256_set1_epi8(0x0F);
+
+    let len = input.len();
+    let mut i = 0;
+
+    // Process 32 bytes at a time
+    while i + 32 <= len {
+        let data = _mm256_loadu_si256(input.as_ptr().add(i) as *const _);
+        let lo_nibble = _mm256_and_si256(data, mask);
+        let hi_nibble = _mm256_and_si256(_mm256_srli_epi64(data, 4), mask);
+        let lo_result = _mm256_shuffle_epi8(low_vec, lo_nibble);
+        let hi_result = _mm256_shuffle_epi8(high_vec, hi_nibble);
+        let result = _mm256_xor_si256(lo_result, hi_result);
+        _mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut _, result);
+        i += 32;
+    }
+
+    // Handle remaining bytes with scalar
+    let mt = &MUL_TABLE[c as usize];
+    while i < len {
+        *out.get_unchecked_mut(i) = mt[*input.get_unchecked(i) as usize];
+        i += 1;
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn mul_slice_xor_avx2(c: u8, input: &[u8], out: &mut [u8]) {
+    use core::arch::x86_64::*;
+
+    let low = &MUL_TABLE_LOW[c as usize];
+    let high = &MUL_TABLE_HIGH[c as usize];
+
+    let low_vec = _mm256_broadcastsi128_si256(_mm_loadu_si128(low.as_ptr() as *const _));
+    let high_vec = _mm256_broadcastsi128_si256(_mm_loadu_si128(high.as_ptr() as *const _));
+    let mask = _mm256_set1_epi8(0x0F);
+
+    let len = input.len();
+    let mut i = 0;
+
+    while i + 32 <= len {
+        let data = _mm256_loadu_si256(input.as_ptr().add(i) as *const _);
+        let existing = _mm256_loadu_si256(out.as_ptr().add(i) as *const _);
+        let lo_nibble = _mm256_and_si256(data, mask);
+        let hi_nibble = _mm256_and_si256(_mm256_srli_epi64(data, 4), mask);
+        let lo_result = _mm256_shuffle_epi8(low_vec, lo_nibble);
+        let hi_result = _mm256_shuffle_epi8(high_vec, hi_nibble);
+        let result = _mm256_xor_si256(_mm256_xor_si256(lo_result, hi_result), existing);
+        _mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut _, result);
+        i += 32;
+    }
+
+    let mt = &MUL_TABLE[c as usize];
+    while i < len {
+        *out.get_unchecked_mut(i) ^= mt[*input.get_unchecked(i) as usize];
+        i += 1;
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "ssse3")]
+unsafe fn mul_slice_ssse3(c: u8, input: &[u8], out: &mut [u8]) {
+    use core::arch::x86_64::*;
+
+    let low = &MUL_TABLE_LOW[c as usize];
+    let high = &MUL_TABLE_HIGH[c as usize];
+
+    let low_vec = _mm_loadu_si128(low.as_ptr() as *const _);
+    let high_vec = _mm_loadu_si128(high.as_ptr() as *const _);
+    let mask = _mm_set1_epi8(0x0F);
+
+    let len = input.len();
+    let mut i = 0;
+
+    while i + 16 <= len {
+        let data = _mm_loadu_si128(input.as_ptr().add(i) as *const _);
+        let lo_nibble = _mm_and_si128(data, mask);
+        let hi_nibble = _mm_and_si128(_mm_srli_epi64(data, 4), mask);
+        let lo_result = _mm_shuffle_epi8(low_vec, lo_nibble);
+        let hi_result = _mm_shuffle_epi8(high_vec, hi_nibble);
+        let result = _mm_xor_si128(lo_result, hi_result);
+        _mm_storeu_si128(out.as_mut_ptr().add(i) as *mut _, result);
+        i += 16;
+    }
+
+    let mt = &MUL_TABLE[c as usize];
+    while i < len {
+        *out.get_unchecked_mut(i) = mt[*input.get_unchecked(i) as usize];
+        i += 1;
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "ssse3")]
+unsafe fn mul_slice_xor_ssse3(c: u8, input: &[u8], out: &mut [u8]) {
+    use core::arch::x86_64::*;
+
+    let low = &MUL_TABLE_LOW[c as usize];
+    let high = &MUL_TABLE_HIGH[c as usize];
+
+    let low_vec = _mm_loadu_si128(low.as_ptr() as *const _);
+    let high_vec = _mm_loadu_si128(high.as_ptr() as *const _);
+    let mask = _mm_set1_epi8(0x0F);
+
+    let len = input.len();
+    let mut i = 0;
+
+    while i + 16 <= len {
+        let data = _mm_loadu_si128(input.as_ptr().add(i) as *const _);
+        let existing = _mm_loadu_si128(out.as_ptr().add(i) as *const _);
+        let lo_nibble = _mm_and_si128(data, mask);
+        let hi_nibble = _mm_and_si128(_mm_srli_epi64(data, 4), mask);
+        let lo_result = _mm_shuffle_epi8(low_vec, lo_nibble);
+        let hi_result = _mm_shuffle_epi8(high_vec, hi_nibble);
+        let result = _mm_xor_si128(_mm_xor_si128(lo_result, hi_result), existing);
+        _mm_storeu_si128(out.as_mut_ptr().add(i) as *mut _, result);
+        i += 16;
+    }
+
+    let mt = &MUL_TABLE[c as usize];
+    while i < len {
+        *out.get_unchecked_mut(i) ^= mt[*input.get_unchecked(i) as usize];
+        i += 1;
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_gfni_table() {
+        // Verify GFNI_TABLE by emulating vgf2p8affineqb in software:
+        //   result_bit[i] = popcount(x AND qword_byte[7-i]) mod 2
+        for c in 0u16..256 {
+            let matrix = GFNI_TABLE[c as usize];
+            for b in 0u16..256 {
+                let expected = MUL_TABLE[c as usize][b as usize];
+                let x = b as u8;
+                let mut result: u8 = 0;
+                for i in 0..8u32 {
+                    let row_byte = ((matrix >> ((7 - i) * 8)) & 0xFF) as u8;
+                    let dot = (row_byte & x).count_ones() % 2;
+                    result |= (dot as u8) << i;
+                }
+                assert_eq!(
+                    result, expected,
+                    "GFNI table mismatch: c={c}, b={b}, got={result}, expected={expected}"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_add() {
+        assert_eq!(add(0, 0), 0);
+        assert_eq!(add(1, 0), 1);
+        assert_eq!(add(0, 1), 1);
+        assert_eq!(add(1, 1), 0);
+        assert_eq!(add(0xFF, 0xFF), 0);
+        assert_eq!(add(0xAA, 0x55), 0xFF);
+    }
+
+    #[test]
+    fn test_mul() {
+        assert_eq!(mul(0, 0), 0);
+        assert_eq!(mul(1, 0), 0);
+        assert_eq!(mul(0, 1), 0);
+        assert_eq!(mul(1, 1), 1);
+        // a * 1 = a
+        for a in 0u8..=255 {
+            assert_eq!(mul(a, 1), a);
+            assert_eq!(mul(1, a), a);
+        }
+        // a * 0 = 0
+        for a in 0u8..=255 {
+            assert_eq!(mul(a, 0), 0);
+        }
+    }
+
+    #[test]
+    fn test_div() {
+        // a / 1 = a
+        for a in 0u8..=255 {
+            assert_eq!(div(a, 1), a);
+        }
+        // a / a = 1 (for a != 0)
+        for a in 1u8..=255 {
+            assert_eq!(div(a, a), 1);
+        }
+        // (a * b) / b = a
+        for a in 1u8..=255 {
+            for b in 1u8..=255 {
+                assert_eq!(div(mul(a, b), b), a);
+            }
+        }
+    }
+
+    #[test]
+    fn test_exp() {
+        assert_eq!(exp(0, 0), 1);
+        assert_eq!(exp(1, 0), 1);
+        assert_eq!(exp(5, 0), 1);
+        assert_eq!(exp(0, 1), 0);
+        assert_eq!(exp(0, 100), 0);
+        // a^1 = a
+        for a in 0u8..=255 {
+            assert_eq!(exp(a, 1), a);
+        }
+        // a^2 = a * a
+        for a in 0u8..=255 {
+            assert_eq!(exp(a, 2), mul(a, a));
+        }
+    }
+
+    #[test]
+    fn test_mul_slice_basic() {
+        let input = [1u8, 2, 3, 4, 5, 6, 7, 8];
+        let mut out = [0u8; 8];
+        mul_slice(3, &input, &mut out);
+        for i in 0..input.len() {
+            assert_eq!(out[i], mul(3, input[i]));
+        }
+    }
+
+    #[test]
+    fn test_mul_slice_xor_basic() {
+        let input = [1u8, 2, 3, 4, 5, 6, 7, 8];
+        let mut out = [10u8; 8];
+        let original = out;
+        mul_slice_xor(3, &input, &mut out);
+        for i in 0..input.len() {
+            assert_eq!(out[i], original[i] ^ mul(3, input[i]));
+        }
+    }
+
+    #[test]
+    fn test_mul_slice_large() {
+        // Test with a buffer large enough to exercise SIMD paths
+        let input: Vec<u8> = (0..256).map(|i| i as u8).collect();
+        let mut out = vec![0u8; 256];
+        let mut expected = vec![0u8; 256];
+
+        for c in [2u8, 7, 42, 128, 255] {
+            mul_slice_scalar(c, &input, &mut expected);
+            mul_slice(c, &input, &mut out);
+            assert_eq!(out, expected, "mul_slice mismatch for c={c}");
+        }
+    }
+
+    #[test]
+    fn test_mul_slice_xor_large() {
+        let input: Vec<u8> = (0..256).map(|i| i as u8).collect();
+
+        for c in [2u8, 7, 42, 128, 255] {
+            let mut out_expected = vec![0xABu8; 256];
+            let mut out_simd = out_expected.clone();
+            mul_slice_xor_scalar(c, &input, &mut out_expected);
+            mul_slice_xor(c, &input, &mut out_simd);
+            assert_eq!(out_simd, out_expected, "mul_slice_xor mismatch for c={c}");
+        }
+    }
+
+    #[test]
+    fn test_mul_slice_unaligned_sizes() {
+        // Test sizes that don't align to SIMD width
+        for size in [1, 7, 15, 16, 17, 31, 32, 33, 63, 64, 65, 100] {
+            let input: Vec<u8> = (0..size).map(|i| i as u8).collect();
+            let mut out = vec![0u8; size];
+            let mut expected = vec![0u8; size];
+
+            mul_slice_scalar(42, &input, &mut expected);
+            mul_slice(42, &input, &mut out);
+            assert_eq!(out, expected, "mul_slice mismatch for size={size}");
+        }
+    }
+}
diff --git a/crates/lumen-core/vendor/fec-rs/src/lib.rs b/crates/lumen-core/vendor/fec-rs/src/lib.rs
new file mode 100644
index 0000000..a0d4a5f
--- /dev/null
+++ b/crates/lumen-core/vendor/fec-rs/src/lib.rs
@@ -0,0 +1,73 @@
+//! A pure Rust Reed-Solomon erasure coding library with runtime SIMD acceleration.
+//!
+//! # Features
+//!
+//! - **Pure Rust** — No C/C++ dependencies or FFI. Everything is implemented in safe Rust
+//!   (with targeted `unsafe` for SIMD intrinsics).
+//! - **Runtime SIMD detection** — Automatically uses the fastest available instruction set
+//!   via `std::is_x86_feature_detected!`. A single binary works on all x86_64 systems.
+//! - **GF(2^8)** — Operates over the Galois field GF(2^8) with generating polynomial 29 (0x1D),
+//!   compatible with the Moonlight streaming protocol.
+//! - **Shard-by-shard encoding** — Incremental encoding via `ShardByShard` for streaming use cases.
+//! - **Reconstruction** — Reconstruct missing data and/or parity shards from any sufficient subset.
+//!
+//! # SIMD Acceleration
+//!
+//! On x86_64, the library automatically detects CPU features at runtime and uses
+//! the best available instruction set:
+//!
+//! - **GFNI + AVX2** — Single-instruction GF multiply on 32 bytes (Intel Alder Lake+, AMD Zen 4+)
+//! - **AVX2** — VPSHUFB split-table nibble lookup on 32 bytes
+//! - **GFNI + SSE** — Single-instruction GF multiply on 16 bytes
+//! - **SSSE3** — VPSHUFB split-table nibble lookup on 16 bytes
+//! - **Scalar** — Lookup table fallback
+//!
+//! # Parallel Encoding
+//!
+//! Enable the `parallel` feature for optional rayon-based parallel encoding:
+//!
+//! ```toml
+//! fec-rs = { version = "0.1", features = ["parallel"] }
+//! ```
+//!
+//! When enabled, large encode workloads automatically distribute parity shard
+//! computation across threads. Small workloads use the sequential path to avoid
+//! overhead.
+//!
+//! # Usage
+//!
+//! ```
+//! use fec_rs::ReedSolomon;
+//!
+//! let rs = ReedSolomon::new(4, 2).unwrap();
+//!
+//! let mut shards: Vec<Vec<u8>> = vec![
+//!     vec![0, 1, 2, 3],
+//!     vec![4, 5, 6, 7],
+//!     vec![8, 9, 10, 11],
+//!     vec![12, 13, 14, 15],
+//!     vec![0, 0, 0, 0], // parity shard 1
+//!     vec![0, 0, 0, 0], // parity shard 2
+//! ];
+//!
+//! // Encode parity
+//! rs.encode(&mut shards).unwrap();
+//!
+//! // Verify
+//! assert!(rs.verify(&shards).unwrap());
+//!
+//! // Simulate loss of shard 0
+//! let mut recovery: Vec<Option<Vec<u8>>> = shards.into_iter().map(Some).collect();
+//! recovery[0] = None;
+//!
+//! // Reconstruct
+//! rs.reconstruct(&mut recovery).unwrap();
+//! ```
+
+mod errors;
+pub mod galois;
+mod matrix;
+mod reed_solomon;
+
+pub use errors::{Error, SBSError};
+pub use reed_solomon::{ReconstructShard, ReedSolomon, ShardByShard};
diff --git a/crates/lumen-core/vendor/fec-rs/src/matrix.rs b/crates/lumen-core/vendor/fec-rs/src/matrix.rs
new file mode 100644
index 0000000..0bf1309
--- /dev/null
+++ b/crates/lumen-core/vendor/fec-rs/src/matrix.rs
@@ -0,0 +1,251 @@
+use crate::galois;
+
+#[derive(PartialEq, Debug, Clone)]
+pub struct Matrix {
+    pub row_count: usize,
+    pub col_count: usize,
+    pub data: Vec<u8>,
+}
+
+impl Matrix {
+    pub fn new(rows: usize, cols: usize) -> Self {
+        Self {
+            row_count: rows,
+            col_count: cols,
+            data: vec![0u8; rows * cols],
+        }
+    }
+
+    pub fn identity(size: usize) -> Self {
+        let mut m = Self::new(size, size);
+        for i in 0..size {
+            m.data[i * size + i] = 1;
+        }
+        m
+    }
+
+    pub fn vandermonde(rows: usize, cols: usize) -> Self {
+        let mut m = Self::new(rows, cols);
+        for r in 0..rows {
+            let r_a = r as u8;
+            for c in 0..cols {
+                m.data[r * cols + c] = galois::exp(r_a, c);
+            }
+        }
+        m
+    }
+
+    #[inline]
+    pub fn get(&self, r: usize, c: usize) -> u8 {
+        self.data[r * self.col_count + c]
+    }
+
+    #[inline]
+    pub fn set(&mut self, r: usize, c: usize, val: u8) {
+        self.data[r * self.col_count + c] = val;
+    }
+
+    pub fn get_row(&self, row: usize) -> &[u8] {
+        let start = row * self.col_count;
+        &self.data[start..start + self.col_count]
+    }
+
+    pub fn sub_matrix(&self, rmin: usize, cmin: usize, rmax: usize, cmax: usize) -> Self {
+        let new_rows = rmax - rmin;
+        let new_cols = cmax - cmin;
+        let mut m = Self::new(new_rows, new_cols);
+        for r in rmin..rmax {
+            for c in cmin..cmax {
+                m.data[(r - rmin) * new_cols + (c - cmin)] = self.get(r, c);
+            }
+        }
+        m
+    }
+
+    pub fn multiply(&self, rhs: &Matrix) -> Self {
+        assert_eq!(
+            self.col_count, rhs.row_count,
+            "Matrix dimensions incompatible for multiply"
+        );
+        let mut result = Self::new(self.row_count, rhs.col_count);
+        for r in 0..self.row_count {
+            for c in 0..rhs.col_count {
+                let mut val = 0u8;
+                for i in 0..self.col_count {
+                    val = galois::add(val, galois::mul(self.get(r, i), rhs.get(i, c)));
+                }
+                result.set(r, c, val);
+            }
+        }
+        result
+    }
+
+    pub fn augment(&self, rhs: &Matrix) -> Self {
+        assert_eq!(
+            self.row_count, rhs.row_count,
+            "Matrix row counts must match for augment"
+        );
+        let new_cols = self.col_count + rhs.col_count;
+        let mut m = Self::new(self.row_count, new_cols);
+        for r in 0..self.row_count {
+            for c in 0..self.col_count {
+                m.set(r, c, self.get(r, c));
+            }
+            for c in 0..rhs.col_count {
+                m.set(r, self.col_count + c, rhs.get(r, c));
+            }
+        }
+        m
+    }
+
+    fn swap_rows(&mut self, r1: usize, r2: usize) {
+        if r1 == r2 {
+            return;
+        }
+        let s1 = r1 * self.col_count;
+        let s2 = r2 * self.col_count;
+        for i in 0..self.col_count {
+            self.data.swap(s1 + i, s2 + i);
+        }
+    }
+
+    fn gaussian_elim(&mut self) -> Result<(), &'static str> {
+        for r in 0..self.row_count {
+            // Pivot search
+            if self.get(r, r) == 0 {
+                for r_below in r + 1..self.row_count {
+                    if self.get(r_below, r) != 0 {
+                        self.swap_rows(r, r_below);
+                        break;
+                    }
+                }
+            }
+            if self.get(r, r) == 0 {
+                return Err("Singular matrix");
+            }
+            // Scale to 1
+            if self.get(r, r) != 1 {
+                let scale = galois::div(1, self.get(r, r));
+                for c in 0..self.col_count {
+                    let val = galois::mul(scale, self.get(r, c));
+                    self.set(r, c, val);
+                }
+            }
+            // Eliminate below
+            for r_below in r + 1..self.row_count {
+                if self.get(r_below, r) != 0 {
+                    let scale = self.get(r_below, r);
+                    for c in 0..self.col_count {
+                        let val =
+                            galois::add(self.get(r_below, c), galois::mul(scale, self.get(r, c)));
+                        self.set(r_below, c, val);
+                    }
+                }
+            }
+        }
+
+        // Back substitution
+        for d in 0..self.row_count {
+            for r_above in 0..d {
+                if self.get(r_above, d) != 0 {
+                    let scale = self.get(r_above, d);
+                    for c in 0..self.col_count {
+                        let val =
+                            galois::add(self.get(r_above, c), galois::mul(scale, self.get(d, c)));
+                        self.set(r_above, c, val);
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
+    pub fn invert(&self) -> Result<Self, &'static str> {
+        assert!(
+            self.row_count == self.col_count,
+            "Cannot invert non-square matrix"
+        );
+        let mut work = self.augment(&Self::identity(self.row_count));
+        work.gaussian_elim()?;
+        Ok(work.sub_matrix(0, self.row_count, self.col_count, self.col_count * 2))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn mat(data: Vec<Vec<u8>>) -> Matrix {
+        let rows = data.len();
+        let cols = data[0].len();
+        let flat: Vec<u8> = data.into_iter().flatten().collect();
+        Matrix {
+            row_count: rows,
+            col_count: cols,
+            data: flat,
+        }
+    }
+
+    #[test]
+    fn test_identity() {
+        let m = Matrix::identity(3);
+        let expected = mat(vec![vec![1, 0, 0], vec![0, 1, 0], vec![0, 0, 1]]);
+        assert_eq!(m, expected);
+    }
+
+    #[test]
+    fn test_multiply() {
+        let m1 = mat(vec![vec![1, 2], vec![3, 4]]);
+        let m2 = mat(vec![vec![5, 6], vec![7, 8]]);
+        let result = m1.multiply(&m2);
+        let expected = mat(vec![vec![11, 22], vec![19, 42]]);
+        assert_eq!(result, expected);
+    }
+
+    #[test]
+    fn test_invert() {
+        let m = mat(vec![
+            vec![56, 23, 98],
+            vec![3, 100, 200],
+            vec![45, 201, 123],
+        ]);
+        let inv = m.invert().unwrap();
+        let expected = mat(vec![
+            vec![175, 133, 33],
+            vec![130, 13, 245],
+            vec![112, 35, 126],
+        ]);
+        assert_eq!(inv, expected);
+    }
+
+    #[test]
+    fn test_invert_identity() {
+        let m = Matrix::identity(4);
+        let inv = m.invert().unwrap();
+        assert_eq!(inv, m);
+    }
+
+    #[test]
+    fn test_multiply_identity() {
+        let m = mat(vec![
+            vec![56, 23, 98],
+            vec![3, 100, 200],
+            vec![45, 201, 123],
+        ]);
+        let id = Matrix::identity(3);
+        assert_eq!(m.multiply(&id), m);
+        assert_eq!(id.multiply(&m), m);
+    }
+
+    #[test]
+    fn test_invert_times_original_is_identity() {
+        let m = mat(vec![
+            vec![56, 23, 98],
+            vec![3, 100, 200],
+            vec![45, 201, 123],
+        ]);
+        let inv = m.invert().unwrap();
+        let product = m.multiply(&inv);
+        assert_eq!(product, Matrix::identity(3));
+    }
+}
diff --git a/crates/lumen-core/vendor/fec-rs/src/reed_solomon.rs b/crates/lumen-core/vendor/fec-rs/src/reed_solomon.rs
new file mode 100644
index 0000000..eccb188
--- /dev/null
+++ b/crates/lumen-core/vendor/fec-rs/src/reed_solomon.rs
@@ -0,0 +1,1263 @@
+use std::collections::HashMap;
+use std::sync::Mutex;
+
+use crate::errors::{Error, SBSError};
+use crate::galois::{self, EXP_TABLE, LOG_TABLE};
+use crate::matrix::Matrix;
+
+const DATA_DECODE_MATRIX_CACHE_CAPACITY: usize = 254;
+
+/// Reed-Solomon erasure code encoder/decoder.
+///
+/// Operates over GF(2^8) with generating polynomial 29, compatible
+/// with the Moonlight streaming protocol.
+///
+/// # Example
+///
+/// ```
+/// use fec_rs::ReedSolomon;
+///
+/// let rs = ReedSolomon::new(4, 2).unwrap();
+///
+/// let mut shards: Vec<Vec<u8>> = vec![
+///     vec![0, 1, 2, 3],
+///     vec![4, 5, 6, 7],
+///     vec![8, 9, 10, 11],
+///     vec![12, 13, 14, 15],
+///     vec![0, 0, 0, 0],
+///     vec![0, 0, 0, 0],
+/// ];
+///
+/// rs.encode(&mut shards).unwrap();
+/// assert!(rs.verify(&shards).unwrap());
+/// ```
+#[derive(Debug)]
+pub struct ReedSolomon {
+    data_shard_count: usize,
+    parity_shard_count: usize,
+    total_shard_count: usize,
+    matrix: Matrix,
+    mul_slice_fn: galois::MulSliceFn,
+    mul_slice_xor_fn: galois::MulSliceFn,
+    data_decode_matrix_cache: Mutex<HashMap<Vec<usize>, Matrix>>,
+}
+
+impl Clone for ReedSolomon {
+    fn clone(&self) -> Self {
+        ReedSolomon {
+            data_shard_count: self.data_shard_count,
+            parity_shard_count: self.parity_shard_count,
+            total_shard_count: self.total_shard_count,
+            matrix: self.matrix.clone(),
+            mul_slice_fn: self.mul_slice_fn,
+            mul_slice_xor_fn: self.mul_slice_xor_fn,
+            data_decode_matrix_cache: Mutex::new(HashMap::new()),
+        }
+    }
+}
+
+impl PartialEq for ReedSolomon {
+    fn eq(&self, rhs: &ReedSolomon) -> bool {
+        self.data_shard_count == rhs.data_shard_count
+            && self.parity_shard_count == rhs.parity_shard_count
+            && self.matrix == rhs.matrix
+    }
+}
+
+impl ReedSolomon {
+    fn build_matrix(data_shards: usize, total_shards: usize) -> Matrix {
+        let vandermonde = Matrix::vandermonde(total_shards, data_shards);
+        let top = vandermonde.sub_matrix(0, 0, data_shards, data_shards);
+        let mut result = vandermonde.multiply(&top.invert().unwrap());
+
+        let parity_shards = total_shards - data_shards;
+        let mut inverse = vec![0u8; 256];
+        inverse[0] = 0;
+        inverse[1] = 1;
+        for i in 2..256 {
+            inverse[i] = EXP_TABLE[(255 - LOG_TABLE[i]) as usize];
+        }
+
+        for j in 0..parity_shards {
+            for i in 0..data_shards {
+                result.data[(data_shards + j) * data_shards + i] = inverse[(parity_shards + i) ^ j];
+            }
+        }
+
+        result
+    }
+
+    /// Creates a new Reed-Solomon encoder/decoder.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `data_shards` or `parity_shards` is 0,
+    /// or if `data_shards + parity_shards > 256`.
+    pub fn new(data_shards: usize, parity_shards: usize) -> Result<Self, Error> {
+        if data_shards == 0 {
+            return Err(Error::TooFewDataShards);
+        }
+        if parity_shards == 0 {
+            return Err(Error::TooFewParityShards);
+        }
+        if data_shards + parity_shards > 256 {
+            return Err(Error::TooManyShards);
+        }
+
+        let total_shards = data_shards + parity_shards;
+        let matrix = Self::build_matrix(data_shards, total_shards);
+        let (mul_slice_fn, mul_slice_xor_fn) = galois::detect_mul_slice();
+
+        Ok(ReedSolomon {
+            data_shard_count: data_shards,
+            parity_shard_count: parity_shards,
+            total_shard_count: total_shards,
+            matrix,
+            mul_slice_fn,
+            mul_slice_xor_fn,
+            data_decode_matrix_cache: Mutex::new(HashMap::new()),
+        })
+    }
+
+    /// Directly set the parity rows of the encoding matrix.
+    ///
+    /// `parity` must have exactly `parity_shard_count * data_shard_count` elements.
+    /// This is used for compatibility with the Moonlight audio protocol,
+    /// where the parity matrix values are hardcoded from OpenFEC.
+    pub fn set_parity_matrix(&mut self, parity: &[u8]) -> Result<(), Error> {
+        let expected = self.parity_shard_count * self.data_shard_count;
+        if parity.len() != expected {
+            return Err(Error::InvalidParityMatrix);
+        }
+        let offset = self.data_shard_count * self.data_shard_count;
+        self.matrix.data[offset..offset + expected].copy_from_slice(parity);
+        self.data_decode_matrix_cache
+            .lock()
+            .unwrap_or_else(|poisoned| poisoned.into_inner())
+            .clear();
+        Ok(())
+    }
+
+    pub fn data_shard_count(&self) -> usize {
+        self.data_shard_count
+    }
+
+    pub fn parity_shard_count(&self) -> usize {
+        self.parity_shard_count
+    }
+
+    pub fn total_shard_count(&self) -> usize {
+        self.total_shard_count
+    }
+
+    #[inline]
+    fn get_parity_rows(&self) -> Vec<&[u8]> {
+        (self.data_shard_count..self.total_shard_count)
+            .map(|i| self.matrix.get_row(i))
+            .collect()
+    }
+
+    /// Multiply `input` by `c` in GF(2^8), write to `out`. Uses pre-detected SIMD path.
+    #[inline(always)]
+    fn mul_slice(&self, c: u8, input: &[u8], out: &mut [u8]) {
+        assert_eq!(input.len(), out.len());
+        if c == 0 {
+            out.iter_mut().for_each(|o| *o = 0);
+            return;
+        }
+        if c == 1 {
+            out.copy_from_slice(input);
+            return;
+        }
+        (self.mul_slice_fn)(c, input, out);
+    }
+
+    /// Multiply `input` by `c` in GF(2^8), XOR into `out`. Uses pre-detected SIMD path.
+    #[inline(always)]
+    fn mul_slice_xor(&self, c: u8, input: &[u8], out: &mut [u8]) {
+        assert_eq!(input.len(), out.len());
+        if c == 0 {
+            return;
+        }
+        if c == 1 {
+            for (o, i) in out.iter_mut().zip(input.iter()) {
+                *o ^= *i;
+            }
+            return;
+        }
+        (self.mul_slice_xor_fn)(c, input, out);
+    }
+
+    fn code_some_slices(&self, matrix_rows: &[&[u8]], inputs: &[&[u8]], outputs: &mut [&mut [u8]]) {
+        for (i_input, input) in inputs.iter().enumerate() {
+            self.code_single_slice(matrix_rows, i_input, input, outputs);
+        }
+    }
+
+    #[inline]
+    fn code_single_slice(
+        &self,
+        matrix_rows: &[&[u8]],
+        i_input: usize,
+        input: &[u8],
+        outputs: &mut [&mut [u8]],
+    ) {
+        for (i_row, output) in outputs.iter_mut().enumerate() {
+            let c = matrix_rows[i_row][i_input];
+            if i_input == 0 {
+                self.mul_slice(c, input, output);
+            } else {
+                self.mul_slice_xor(c, input, output);
+            }
+        }
+    }
+
+    /// Constructs the parity shards.
+    ///
+    /// The parity shard slots will be overwritten.
+    ///
+    /// With the `parallel` feature enabled, parity shards are computed in parallel
+    /// using rayon when the workload is large enough.
+    #[cfg(feature = "parallel")]
+    pub fn encode<T: AsRef<[u8]> + AsMut<[u8]> + Send>(
+        &self,
+        shards: &mut [T],
+    ) -> Result<(), Error> {
+        if shards.len() < self.total_shard_count {
+            return Err(Error::TooFewShards);
+        }
+        if shards.len() > self.total_shard_count {
+            return Err(Error::TooManyShards);
+        }
+        Self::check_slices_uniform(shards)?;
+
+        let (data, parity) = shards.split_at_mut(self.data_shard_count);
+        let parity_rows = self.get_parity_rows();
+
+        // Use rayon for large workloads where parallelization overhead is worthwhile.
+        // Threshold: parity_count * data_count * shard_size > ~1MB of work.
+        let shard_size = data[0].as_ref().len();
+        let work = self.parity_shard_count * self.data_shard_count * shard_size;
+        if work > 1_000_000 {
+            use rayon::prelude::*;
+
+            // Collect data shard references that we can share across threads.
+            let data_refs: Vec<&[u8]> = data.iter().map(|d| d.as_ref()).collect();
+            let mul_fn = self.mul_slice_fn;
+            let mul_xor_fn = self.mul_slice_xor_fn;
+
+            parity
+                .par_iter_mut()
+                .enumerate()
+                .for_each(|(i_row, p): (usize, &mut T)| {
+                    let output = p.as_mut();
+                    let row = parity_rows[i_row];
+                    for (i_input, &input) in data_refs.iter().enumerate() {
+                        let c = row[i_input];
+                        if c == 0 {
+                            if i_input == 0 {
+                                output.iter_mut().for_each(|o| *o = 0);
+                            }
+                        } else if c == 1 {
+                            if i_input == 0 {
+                                output.copy_from_slice(input);
+                            } else {
+                                for (o, i) in output.iter_mut().zip(input.iter()) {
+                                    *o ^= *i;
+                                }
+                            }
+                        } else if i_input == 0 {
+                            mul_fn(c, input, output);
+                        } else {
+                            mul_xor_fn(c, input, output);
+                        }
+                    }
+                });
+
+            return Ok(());
+        }
+
+        Self::encode_sequential(
+            data,
+            parity,
+            &parity_rows,
+            self.mul_slice_fn,
+            self.mul_slice_xor_fn,
+        );
+        Ok(())
+    }
+
+    #[cfg(not(feature = "parallel"))]
+    pub fn encode<T: AsRef<[u8]> + AsMut<[u8]>>(&self, shards: &mut [T]) -> Result<(), Error> {
+        if shards.len() < self.total_shard_count {
+            return Err(Error::TooFewShards);
+        }
+        if shards.len() > self.total_shard_count {
+            return Err(Error::TooManyShards);
+        }
+        Self::check_slices_uniform(shards)?;
+
+        let (data, parity) = shards.split_at_mut(self.data_shard_count);
+        let parity_rows = self.get_parity_rows();
+
+        Self::encode_sequential(
+            data,
+            parity,
+            &parity_rows,
+            self.mul_slice_fn,
+            self.mul_slice_xor_fn,
+        );
+        Ok(())
+    }
+
+    fn encode_sequential<T: AsRef<[u8]> + AsMut<[u8]>>(
+        data: &[T],
+        parity: &mut [T],
+        parity_rows: &[&[u8]],
+        mul_slice_fn: galois::MulSliceFn,
+        mul_slice_xor_fn: galois::MulSliceFn,
+    ) {
+        for i_input in 0..data.len() {
+            let input = data[i_input].as_ref();
+            for (i_row, p) in parity.iter_mut().enumerate() {
+                let c = parity_rows[i_row][i_input];
+                let output = p.as_mut();
+                if c == 0 {
+                    if i_input == 0 {
+                        output.iter_mut().for_each(|o| *o = 0);
+                    }
+                } else if c == 1 {
+                    if i_input == 0 {
+                        output.copy_from_slice(input);
+                    } else {
+                        for (o, i) in output.iter_mut().zip(input.iter()) {
+                            *o ^= *i;
+                        }
+                    }
+                } else if i_input == 0 {
+                    mul_slice_fn(c, input, output);
+                } else {
+                    mul_slice_xor_fn(c, input, output);
+                }
+            }
+        }
+    }
+
+    /// Constructs the parity shards using separate data and parity references.
+    pub fn encode_sep<T: AsRef<[u8]>, U: AsRef<[u8]> + AsMut<[u8]>>(
+        &self,
+        data: &[T],
+        parity: &mut [U],
+    ) -> Result<(), Error> {
+        if data.len() != self.data_shard_count {
+            return Err(if data.len() < self.data_shard_count {
+                Error::TooFewDataShards
+            } else {
+                Error::TooManyDataShards
+            });
+        }
+        if parity.len() != self.parity_shard_count {
+            return Err(if parity.len() < self.parity_shard_count {
+                Error::TooFewParityShards
+            } else {
+                Error::TooManyParityShards
+            });
+        }
+
+        let data_refs: Vec<&[u8]> = data.iter().map(|s| s.as_ref()).collect();
+        let mut parity_refs: Vec<&mut [u8]> = parity.iter_mut().map(|s| s.as_mut()).collect();
+
+        // Ensure all slices (data + parity) have the same non-zero length.
+        let shard_len = data_refs[0].len();
+        if shard_len == 0 {
+            return Err(Error::EmptyShard);
+        }
+        for d in &data_refs[1..] {
+            if d.len() != shard_len {
+                return Err(Error::IncorrectShardSize);
+            }
+        }
+        for p in parity_refs.iter() {
+            if p.len() != shard_len {
+                return Err(Error::IncorrectShardSize);
+            }
+        }
+
+        let parity_rows = self.get_parity_rows();
+        self.code_some_slices(&parity_rows, &data_refs, &mut parity_refs);
+
+        Ok(())
+    }
+
+    /// Constructs the parity shards incrementally using a single data shard.
+    ///
+    /// Must be called in order from `i_data = 0` to `data_shard_count - 1`.
+    /// When `i_data == 0`, parity shards are overwritten. Otherwise, results
+    /// are XOR-accumulated.
+    pub fn encode_single<T: AsRef<[u8]> + AsMut<[u8]>>(
+        &self,
+        i_data: usize,
+        shards: &mut [T],
+    ) -> Result<(), Error> {
+        if i_data >= self.data_shard_count {
+            return Err(Error::InvalidIndex);
+        }
+        if shards.len() != self.total_shard_count {
+            return Err(if shards.len() < self.total_shard_count {
+                Error::TooFewShards
+            } else {
+                Error::TooManyShards
+            });
+        }
+        Self::check_slices_uniform(shards)?;
+
+        let (data_part, parity_part) = shards.split_at_mut(self.data_shard_count);
+        let input = data_part[i_data].as_ref();
+        let mut parity_refs: Vec<&mut [u8]> = parity_part.iter_mut().map(|s| s.as_mut()).collect();
+
+        let parity_rows = self.get_parity_rows();
+        self.code_single_slice(&parity_rows, i_data, input, &mut parity_refs);
+
+        Ok(())
+    }
+
+    /// Constructs the parity shards incrementally using a single data shard (separated).
+    pub fn encode_single_sep<U: AsRef<[u8]> + AsMut<[u8]>>(
+        &self,
+        i_data: usize,
+        single_data: &[u8],
+        parity: &mut [U],
+    ) -> Result<(), Error> {
+        if i_data >= self.data_shard_count {
+            return Err(Error::InvalidIndex);
+        }
+        if parity.len() != self.parity_shard_count {
+            return Err(if parity.len() < self.parity_shard_count {
+                Error::TooFewParityShards
+            } else {
+                Error::TooManyParityShards
+            });
+        }
+        if single_data.is_empty() {
+            return Err(Error::EmptyShard);
+        }
+        for p in parity.iter() {
+            if p.as_ref().len() != single_data.len() {
+                return Err(Error::IncorrectShardSize);
+            }
+        }
+
+        let mut parity_refs: Vec<&mut [u8]> = parity.iter_mut().map(|s| s.as_mut()).collect();
+        let parity_rows = self.get_parity_rows();
+        self.code_single_slice(&parity_rows, i_data, single_data, &mut parity_refs);
+
+        Ok(())
+    }
+
+    /// Checks if the parity shards are correct.
+    pub fn verify<T: AsRef<[u8]>>(&self, shards: &[T]) -> Result<bool, Error> {
+        if shards.len() != self.total_shard_count {
+            return Err(if shards.len() < self.total_shard_count {
+                Error::TooFewShards
+            } else {
+                Error::TooManyShards
+            });
+        }
+        Self::check_slices_uniform(shards)?;
+
+        let slice_len = shards[0].as_ref().len();
+        let mut buffer: Vec<Vec<u8>> = (0..self.parity_shard_count)
+            .map(|_| vec![0u8; slice_len])
+            .collect();
+
+        let data = &shards[0..self.data_shard_count];
+        let to_check = &shards[self.data_shard_count..];
+
+        let data_refs: Vec<&[u8]> = data.iter().map(|s| s.as_ref()).collect();
+        let mut buf_refs: Vec<&mut [u8]> = buffer.iter_mut().map(|s| s.as_mut_slice()).collect();
+
+        let parity_rows = self.get_parity_rows();
+        self.code_some_slices(&parity_rows, &data_refs, &mut buf_refs);
+
+        for (computed, expected) in buffer.iter().zip(to_check.iter()) {
+            if computed.as_slice() != expected.as_ref() {
+                return Ok(false);
+            }
+        }
+
+        Ok(true)
+    }
+
+    /// Reconstructs all missing shards.
+    ///
+    /// Shards that are `None` will be reconstructed. The number of present
+    /// shards must be >= `data_shard_count`.
+    pub fn reconstruct<T: ReconstructShard>(&self, shards: &mut [T]) -> Result<(), Error> {
+        self.reconstruct_internal(shards, false)
+    }
+
+    /// Reconstructs only missing data shards.
+    pub fn reconstruct_data<T: ReconstructShard>(&self, shards: &mut [T]) -> Result<(), Error> {
+        self.reconstruct_internal(shards, true)
+    }
+
+    fn get_data_decode_matrix(
+        &self,
+        valid_indices: &[usize],
+        invalid_indices: &[usize],
+    ) -> Result<Matrix, Error> {
+        {
+            let cache = self
+                .data_decode_matrix_cache
+                .lock()
+                .unwrap_or_else(|poisoned| poisoned.into_inner());
+            if let Some(m) = cache.get(invalid_indices) {
+                return Ok(m.clone());
+            }
+        }
+
+        let mut sub_matrix = Matrix::new(self.data_shard_count, self.data_shard_count);
+        for (sub_row, &valid_index) in valid_indices.iter().enumerate() {
+            for c in 0..self.data_shard_count {
+                sub_matrix.set(sub_row, c, self.matrix.get(valid_index, c));
+            }
+        }
+
+        let data_decode_matrix = sub_matrix.invert().map_err(|_| Error::SingularMatrix)?;
+
+        {
+            let mut cache = self
+                .data_decode_matrix_cache
+                .lock()
+                .unwrap_or_else(|poisoned| poisoned.into_inner());
+            if cache.len() >= DATA_DECODE_MATRIX_CACHE_CAPACITY {
+                // Simple eviction: clear the cache when full
+                cache.clear();
+            }
+            cache.insert(invalid_indices.to_vec(), data_decode_matrix.clone());
+        }
+
+        Ok(data_decode_matrix)
+    }
+
+    fn reconstruct_internal<T: ReconstructShard>(
+        &self,
+        shards: &mut [T],
+        data_only: bool,
+    ) -> Result<(), Error> {
+        if shards.len() != self.total_shard_count {
+            return Err(if shards.len() < self.total_shard_count {
+                Error::TooFewShards
+            } else {
+                Error::TooManyShards
+            });
+        }
+
+        // Count present shards and find shard length
+        let mut number_present = 0usize;
+        let mut shard_len: Option<usize> = None;
+
+        for shard in shards.iter() {
+            if let Some(len) = shard.len() {
+                if len == 0 {
+                    return Err(Error::EmptyShard);
+                }
+                number_present += 1;
+                if let Some(old_len) = shard_len {
+                    if len != old_len {
+                        return Err(Error::IncorrectShardSize);
+                    }
+                }
+                shard_len = Some(len);
+            }
+        }
+
+        if number_present == self.total_shard_count {
+            return Ok(());
+        }
+
+        if number_present < self.data_shard_count {
+            return Err(Error::TooFewShardsPresent);
+        }
+
+        let shard_len = shard_len.expect("at least one shard present");
+
+        // Categorize shards
+        let mut valid_indices: Vec<usize> = Vec::with_capacity(self.data_shard_count);
+        let mut invalid_indices: Vec<usize> = Vec::with_capacity(self.parity_shard_count);
+
+        for (i, shard) in shards.iter().enumerate() {
+            if shard.len().is_some() {
+                if valid_indices.len() < self.data_shard_count {
+                    valid_indices.push(i);
+                }
+            } else {
+                invalid_indices.push(i);
+            }
+        }
+
+        // Initialize missing shards
+        for &i in &invalid_indices {
+            if i < self.data_shard_count || !data_only {
+                shards[i].initialize(shard_len);
+            }
+        }
+
+        let data_decode_matrix = self.get_data_decode_matrix(&valid_indices, &invalid_indices)?;
+
+        // Reconstruct missing data shards.
+        // We need to read from valid shards and write to invalid shards simultaneously.
+        // Since the index sets are disjoint, we use raw pointers to avoid borrow conflicts.
+        let missing_data_indices: Vec<usize> = invalid_indices
+            .iter()
+            .copied()
+            .filter(|&i| i < self.data_shard_count)
+            .collect();
+
+        if !missing_data_indices.is_empty() {
+            let matrix_rows: Vec<&[u8]> = missing_data_indices
+                .iter()
+                .map(|&i| data_decode_matrix.get_row(i))
+                .collect();
+
+            // For each data shard input, encode into each missing data output
+            for (i_input, &valid_idx) in valid_indices.iter().enumerate() {
+                // SAFETY: valid_idx and missing indices are disjoint sets,
+                // so we can safely read from valid_idx while writing to missing indices.
+                let input_ptr = shards[valid_idx].get().unwrap().as_ptr();
+                let input_slice = unsafe { std::slice::from_raw_parts(input_ptr, shard_len) };
+
+                for (i_out, &missing_idx) in missing_data_indices.iter().enumerate() {
+                    let c = matrix_rows[i_out][i_input];
+                    let output = shards[missing_idx].get_mut().unwrap();
+                    if i_input == 0 {
+                        self.mul_slice(c, input_slice, output);
+                    } else {
+                        self.mul_slice_xor(c, input_slice, output);
+                    }
+                }
+            }
+        }
+
+        if data_only {
+            return Ok(());
+        }
+
+        // Reconstruct missing parity shards from all data shards
+        let missing_parity_indices: Vec<usize> = invalid_indices
+            .iter()
+            .copied()
+            .filter(|&i| i >= self.data_shard_count)
+            .collect();
+
+        if !missing_parity_indices.is_empty() {
+            let parity_rows = self.get_parity_rows();
+            let matrix_rows: Vec<&[u8]> = missing_parity_indices
+                .iter()
+                .map(|&i| parity_rows[i - self.data_shard_count])
+                .collect();
+
+            for i_input in 0..self.data_shard_count {
+                // SAFETY: data shards (0..data_shard_count) are disjoint from parity shards.
+                let input_ptr = shards[i_input].get().unwrap().as_ptr();
+                let input_slice = unsafe { std::slice::from_raw_parts(input_ptr, shard_len) };
+
+                for (i_out, &missing_idx) in missing_parity_indices.iter().enumerate() {
+                    let c = matrix_rows[i_out][i_input];
+                    let output = shards[missing_idx].get_mut().unwrap();
+                    if i_input == 0 {
+                        self.mul_slice(c, input_slice, output);
+                    } else {
+                        self.mul_slice_xor(c, input_slice, output);
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    fn check_slices_uniform<T: AsRef<[u8]>>(slices: &[T]) -> Result<(), Error> {
+        if slices.is_empty() {
+            return Ok(());
+        }
+        let size = slices[0].as_ref().len();
+        if size == 0 {
+            return Err(Error::EmptyShard);
+        }
+        for slice in slices.iter().skip(1) {
+            if slice.as_ref().len() != size {
+                return Err(Error::IncorrectShardSize);
+            }
+        }
+        Ok(())
+    }
+}
+
+/// A trait for types that can hold optional shard data for reconstruction.
+///
+/// # Safety
+///
+/// Implementations must guarantee that distinct indices in a `&mut [T]` slice
+/// yield non-overlapping memory regions from `get()`/`get_mut()`. Specifically:
+/// - `get()` on element `i` must not alias `get_mut()` on element `j` when `i != j`.
+/// - The returned slices must remain valid and not be moved/reallocated while borrows are active.
+///
+/// This is required because `reconstruct_internal` uses raw pointers to read from
+/// some shard indices while writing to others simultaneously.
+pub unsafe trait ReconstructShard {
+    /// Returns the length of the shard data, or `None` if absent.
+    fn len(&self) -> Option<usize>;
+    /// Returns `true` if the shard data is absent.
+    fn is_empty(&self) -> bool {
+        self.len().is_none()
+    }
+    /// Get an immutable reference to the shard data.
+    fn get(&self) -> Option<&[u8]>;
+    /// Get a mutable reference to the shard data.
+    fn get_mut(&mut self) -> Option<&mut [u8]>;
+    /// Initialize the shard data to the given length (zeroed).
+    fn initialize(&mut self, len: usize);
+}
+
+// SAFETY: `Option<Vec<u8>>` stores each shard in its own heap allocation,
+// so distinct indices in a slice always yield non-overlapping memory.
+unsafe impl ReconstructShard for Option<Vec<u8>> {
+    fn len(&self) -> Option<usize> {
+        self.as_ref().map(|v| v.len())
+    }
+
+    fn get(&self) -> Option<&[u8]> {
+        self.as_ref().map(|v| v.as_slice())
+    }
+
+    fn get_mut(&mut self) -> Option<&mut [u8]> {
+        self.as_mut().map(|v| v.as_mut_slice())
+    }
+
+    fn initialize(&mut self, len: usize) {
+        if self.is_none() {
+            *self = Some(vec![0u8; len]);
+        }
+    }
+}
+
+/// Bookkeeper for shard-by-shard encoding.
+///
+/// Useful for streaming use cases where data shards arrive one at a time.
+///
+/// # Example
+///
+/// ```
+/// use fec_rs::{ReedSolomon, ShardByShard};
+///
+/// let rs = ReedSolomon::new(3, 2).unwrap();
+/// let mut sbs = ShardByShard::new(&rs);
+///
+/// let mut shards: Vec<Vec<u8>> = vec![
+///     vec![0, 1, 2, 3, 4],
+///     vec![5, 6, 7, 8, 9],
+///     vec![0, 0, 0, 0, 0], // placeholder
+///     vec![0, 0, 0, 0, 0], // parity 1
+///     vec![0, 0, 0, 0, 0], // parity 2
+/// ];
+///
+/// // Encode first two data shards
+/// sbs.encode(&mut shards).unwrap();
+/// sbs.encode(&mut shards).unwrap();
+///
+/// // Fill in third data shard
+/// shards[2] = vec![10, 11, 12, 13, 14];
+///
+/// // Encode third data shard
+/// sbs.encode(&mut shards).unwrap();
+///
+/// assert!(rs.verify(&shards).unwrap());
+/// ```
+pub struct ShardByShard<'a> {
+    codec: &'a ReedSolomon,
+    cur_input: usize,
+}
+
+impl<'a> ShardByShard<'a> {
+    pub fn new(codec: &'a ReedSolomon) -> Self {
+        ShardByShard {
+            codec,
+            cur_input: 0,
+        }
+    }
+
+    /// Checks if all data shards have been encoded.
+    pub fn parity_ready(&self) -> bool {
+        self.cur_input == self.codec.data_shard_count
+    }
+
+    /// Resets the bookkeeping state.
+    ///
+    /// Returns `SBSError::LeftoverShards` if shards were encoded but parity is not ready.
+    pub fn reset(&mut self) -> Result<(), SBSError> {
+        if self.cur_input > 0 && !self.parity_ready() {
+            return Err(SBSError::LeftoverShards);
+        }
+        self.cur_input = 0;
+        Ok(())
+    }
+
+    /// Resets unconditionally.
+    pub fn reset_force(&mut self) {
+        self.cur_input = 0;
+    }
+
+    /// Returns the current data shard index to be encoded.
+    pub fn cur_input_index(&self) -> usize {
+        self.cur_input
+    }
+
+    /// Encodes the current data shard into the parity shards.
+    pub fn encode<T: AsRef<[u8]> + AsMut<[u8]>>(
+        &mut self,
+        shards: &mut [T],
+    ) -> Result<(), SBSError> {
+        if self.parity_ready() {
+            return Err(SBSError::TooManyCalls);
+        }
+        self.codec
+            .encode_single(self.cur_input, shards)
+            .map_err(SBSError::RSError)?;
+        self.cur_input += 1;
+        Ok(())
+    }
+
+    /// Encodes the current data shard using separate data and parity references.
+    pub fn encode_sep<U: AsRef<[u8]> + AsMut<[u8]>>(
+        &mut self,
+        data: &[&[u8]],
+        parity: &mut [U],
+    ) -> Result<(), SBSError> {
+        if self.parity_ready() {
+            return Err(SBSError::TooManyCalls);
+        }
+        if data.len() != self.codec.data_shard_count {
+            return Err(SBSError::RSError(
+                if data.len() < self.codec.data_shard_count {
+                    Error::TooFewDataShards
+                } else {
+                    Error::TooManyDataShards
+                },
+            ));
+        }
+        self.codec
+            .encode_single_sep(self.cur_input, data[self.cur_input], parity)
+            .map_err(SBSError::RSError)?;
+        self.cur_input += 1;
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_new_basic() {
+        let rs = ReedSolomon::new(4, 2).unwrap();
+        assert_eq!(rs.data_shard_count(), 4);
+        assert_eq!(rs.parity_shard_count(), 2);
+        assert_eq!(rs.total_shard_count(), 6);
+    }
+
+    #[test]
+    fn test_new_errors() {
+        assert_eq!(ReedSolomon::new(0, 1), Err(Error::TooFewDataShards));
+        assert_eq!(ReedSolomon::new(1, 0), Err(Error::TooFewParityShards));
+        assert_eq!(ReedSolomon::new(128, 129), Err(Error::TooManyShards));
+    }
+
+    #[test]
+    fn test_set_parity_matrix_rejects_wrong_length() {
+        let mut rs = ReedSolomon::new(4, 2).unwrap();
+        assert_eq!(
+            rs.set_parity_matrix(&[1, 2, 3]),
+            Err(Error::InvalidParityMatrix)
+        );
+    }
+
+    #[test]
+    fn test_partial_eq_reflects_parity_matrix_changes() {
+        let mut lhs = ReedSolomon::new(4, 2).unwrap();
+        let rhs = ReedSolomon::new(4, 2).unwrap();
+
+        assert_eq!(lhs, rhs);
+
+        lhs.set_parity_matrix(&[0x77, 0x40, 0x38, 0x0e, 0xc7, 0xa7, 0x0d, 0x6c])
+            .unwrap();
+
+        assert_ne!(lhs, rhs);
+    }
+
+    #[test]
+    fn test_encode_verify() {
+        let rs = ReedSolomon::new(4, 2).unwrap();
+        let mut shards: Vec<Vec<u8>> = vec![
+            vec![0, 1, 2, 3],
+            vec![4, 5, 6, 7],
+            vec![8, 9, 10, 11],
+            vec![12, 13, 14, 15],
+            vec![0, 0, 0, 0],
+            vec![0, 0, 0, 0],
+        ];
+
+        rs.encode(&mut shards).unwrap();
+        assert!(rs.verify(&shards).unwrap());
+
+        // Corrupt parity
+        shards[4][0] ^= 0xFF;
+        assert!(!rs.verify(&shards).unwrap());
+    }
+
+    #[test]
+    fn test_reconstruct_missing_data() {
+        let rs = ReedSolomon::new(4, 2).unwrap();
+        let mut shards: Vec<Vec<u8>> = vec![
+            vec![0, 1, 2, 3],
+            vec![4, 5, 6, 7],
+            vec![8, 9, 10, 11],
+            vec![12, 13, 14, 15],
+            vec![0, 0, 0, 0],
+            vec![0, 0, 0, 0],
+        ];
+        rs.encode(&mut shards).unwrap();
+
+        let original = shards.clone();
+
+        // Lose shard 0
+        let mut recovery: Vec<Option<Vec<u8>>> = shards.into_iter().map(Some).collect();
+        recovery[0] = None;
+
+        rs.reconstruct(&mut recovery).unwrap();
+
+        for (i, shard) in recovery.iter().enumerate() {
+            assert_eq!(shard.as_ref().unwrap(), &original[i]);
+        }
+    }
+
+    #[test]
+    fn test_reconstruct_missing_parity() {
+        let rs = ReedSolomon::new(4, 2).unwrap();
+        let mut shards: Vec<Vec<u8>> = vec![
+            vec![0, 1, 2, 3],
+            vec![4, 5, 6, 7],
+            vec![8, 9, 10, 11],
+            vec![12, 13, 14, 15],
+            vec![0, 0, 0, 0],
+            vec![0, 0, 0, 0],
+        ];
+        rs.encode(&mut shards).unwrap();
+
+        let original = shards.clone();
+
+        // Lose both parity shards
+        let mut recovery: Vec<Option<Vec<u8>>> = shards.into_iter().map(Some).collect();
+        recovery[4] = None;
+        recovery[5] = None;
+
+        rs.reconstruct(&mut recovery).unwrap();
+
+        for (i, shard) in recovery.iter().enumerate() {
+            assert_eq!(shard.as_ref().unwrap(), &original[i]);
+        }
+    }
+
+    #[test]
+    fn test_reconstruct_mixed() {
+        let rs = ReedSolomon::new(4, 2).unwrap();
+        let mut shards: Vec<Vec<u8>> = vec![
+            vec![0, 1, 2, 3],
+            vec![4, 5, 6, 7],
+            vec![8, 9, 10, 11],
+            vec![12, 13, 14, 15],
+            vec![0, 0, 0, 0],
+            vec![0, 0, 0, 0],
+        ];
+        rs.encode(&mut shards).unwrap();
+
+        let original = shards.clone();
+
+        // Lose one data + one parity
+        let mut recovery: Vec<Option<Vec<u8>>> = shards.into_iter().map(Some).collect();
+        recovery[1] = None;
+        recovery[4] = None;
+
+        rs.reconstruct(&mut recovery).unwrap();
+
+        for (i, shard) in recovery.iter().enumerate() {
+            assert_eq!(shard.as_ref().unwrap(), &original[i]);
+        }
+    }
+
+    #[test]
+    fn test_reconstruct_too_few_shards() {
+        let rs = ReedSolomon::new(4, 2).unwrap();
+        let mut shards: Vec<Vec<u8>> = vec![
+            vec![0, 1, 2, 3],
+            vec![4, 5, 6, 7],
+            vec![8, 9, 10, 11],
+            vec![12, 13, 14, 15],
+            vec![0, 0, 0, 0],
+            vec![0, 0, 0, 0],
+        ];
+        rs.encode(&mut shards).unwrap();
+
+        let mut recovery: Vec<Option<Vec<u8>>> = shards.into_iter().map(Some).collect();
+        recovery[0] = None;
+        recovery[1] = None;
+        recovery[2] = None;
+
+        assert_eq!(
+            rs.reconstruct(&mut recovery),
+            Err(Error::TooFewShardsPresent)
+        );
+    }
+
+    #[test]
+    fn test_shard_by_shard() {
+        let rs = ReedSolomon::new(3, 2).unwrap();
+        let mut sbs = ShardByShard::new(&rs);
+
+        let mut shards: Vec<Vec<u8>> = vec![
+            vec![0, 1, 2, 3, 4],
+            vec![5, 6, 7, 8, 9],
+            vec![10, 11, 12, 13, 14],
+            vec![0, 0, 0, 0, 0],
+            vec![0, 0, 0, 0, 0],
+        ];
+
+        let mut shards_batch = shards.clone();
+        rs.encode(&mut shards_batch).unwrap();
+
+        sbs.encode(&mut shards).unwrap();
+        sbs.encode(&mut shards).unwrap();
+        sbs.encode(&mut shards).unwrap();
+
+        assert!(sbs.parity_ready());
+        assert_eq!(shards, shards_batch);
+    }
+
+    #[test]
+    fn test_encode_sep() {
+        let rs = ReedSolomon::new(4, 2).unwrap();
+        let data: Vec<Vec<u8>> = vec![
+            vec![0, 1, 2, 3],
+            vec![4, 5, 6, 7],
+            vec![8, 9, 10, 11],
+            vec![12, 13, 14, 15],
+        ];
+        let mut parity = vec![vec![0u8; 4]; 2];
+        rs.encode_sep(&data, &mut parity).unwrap();
+
+        // Verify against encode
+        let shards: Vec<Vec<u8>> = data.iter().cloned().chain(parity.iter().cloned()).collect();
+        assert!(rs.verify(&shards).unwrap());
+
+        // Also verify with direct encode
+        let mut shards2: Vec<Vec<u8>> = data.iter().cloned().chain(vec![vec![0u8; 4]; 2]).collect();
+        rs.encode(&mut shards2).unwrap();
+        assert_eq!(shards, shards2);
+    }
+
+    #[test]
+    fn test_encode_single_sep_matches_batch_encode() {
+        let rs = ReedSolomon::new(4, 2).unwrap();
+        let data: Vec<Vec<u8>> = vec![
+            vec![0, 1, 2, 3],
+            vec![4, 5, 6, 7],
+            vec![8, 9, 10, 11],
+            vec![12, 13, 14, 15],
+        ];
+        let mut parity = vec![vec![0u8; 4]; 2];
+
+        for (i, shard) in data.iter().enumerate() {
+            rs.encode_single_sep(i, shard, &mut parity).unwrap();
+        }
+
+        let mut expected: Vec<Vec<u8>> =
+            data.iter().cloned().chain(vec![vec![0u8; 4]; 2]).collect();
+        rs.encode(&mut expected).unwrap();
+
+        assert_eq!(parity[0], expected[4]);
+        assert_eq!(parity[1], expected[5]);
+    }
+
+    #[test]
+    fn test_shard_by_shard_encode_sep_matches_batch_encode() {
+        let rs = ReedSolomon::new(3, 2).unwrap();
+        let data: Vec<Vec<u8>> = vec![
+            vec![0, 1, 2, 3, 4],
+            vec![5, 6, 7, 8, 9],
+            vec![10, 11, 12, 13, 14],
+        ];
+        let data_refs: Vec<&[u8]> = data.iter().map(Vec::as_slice).collect();
+        let mut parity = vec![vec![0u8; 5]; 2];
+        let mut sbs = ShardByShard::new(&rs);
+
+        while !sbs.parity_ready() {
+            sbs.encode_sep(&data_refs, &mut parity).unwrap();
+        }
+
+        let mut expected: Vec<Vec<u8>> =
+            data.iter().cloned().chain(vec![vec![0u8; 5]; 2]).collect();
+        rs.encode(&mut expected).unwrap();
+
+        assert_eq!(parity[0], expected[3]);
+        assert_eq!(parity[1], expected[4]);
+    }
+
+    #[test]
+    fn test_reconstruct_returns_singular_matrix_for_invalid_parity_matrix() {
+        let mut rs = ReedSolomon::new(4, 2).unwrap();
+        rs.set_parity_matrix(&[1, 0, 0, 0, 1, 0, 0, 0]).unwrap();
+
+        let mut shards: Vec<Vec<u8>> = vec![
+            vec![0, 1, 2, 3],
+            vec![4, 5, 6, 7],
+            vec![8, 9, 10, 11],
+            vec![12, 13, 14, 15],
+            vec![0, 0, 0, 0],
+            vec![0, 0, 0, 0],
+        ];
+        rs.encode(&mut shards).unwrap();
+
+        let mut recovery: Vec<Option<Vec<u8>>> = shards.into_iter().map(Some).collect();
+        recovery[0] = None;
+        recovery[1] = None;
+
+        assert_eq!(rs.reconstruct(&mut recovery), Err(Error::SingularMatrix));
+    }
+
+    #[test]
+    fn test_set_parity_matrix_invalidates_decode_matrix_cache() {
+        let mut rs = ReedSolomon::new(4, 2).unwrap();
+        let mut shards: Vec<Vec<u8>> = vec![
+            vec![0, 1, 2, 3],
+            vec![4, 5, 6, 7],
+            vec![8, 9, 10, 11],
+            vec![12, 13, 14, 15],
+            vec![0, 0, 0, 0],
+            vec![0, 0, 0, 0],
+        ];
+        rs.encode(&mut shards).unwrap();
+
+        let mut initial_recovery: Vec<Option<Vec<u8>>> = shards.iter().cloned().map(Some).collect();
+        initial_recovery[0] = None;
+        initial_recovery[1] = None;
+        rs.reconstruct(&mut initial_recovery).unwrap();
+
+        rs.set_parity_matrix(&[1, 0, 0, 0, 1, 0, 0, 0]).unwrap();
+
+        let mut recovery_after_update: Vec<Option<Vec<u8>>> =
+            shards.into_iter().map(Some).collect();
+        recovery_after_update[0] = None;
+        recovery_after_update[1] = None;
+
+        assert_eq!(
+            rs.reconstruct(&mut recovery_after_update),
+            Err(Error::SingularMatrix)
+        );
+    }
+
+    #[cfg(feature = "parallel")]
+    #[test]
+    fn test_parallel_encode_matches_sequential_path() {
+        let rs = ReedSolomon::new(4, 2).unwrap();
+        let shard_len = 200_000;
+        let mut shards: Vec<Vec<u8>> = (0..6)
+            .map(|i| {
+                if i < 4 {
+                    (0..shard_len).map(|j| ((i * 17 + j) % 251) as u8).collect()
+                } else {
+                    vec![0u8; shard_len]
+                }
+            })
+            .collect();
+
+        let parity_rows = rs.get_parity_rows();
+        let mut expected = vec![vec![0u8; shard_len]; 2];
+
+        {
+            let (data, _) = shards.split_at_mut(4);
+            ReedSolomon::encode_sequential(
+                data,
+                &mut expected,
+                &parity_rows,
+                rs.mul_slice_fn,
+                rs.mul_slice_xor_fn,
+            );
+        }
+
+        rs.encode(&mut shards).unwrap();
+
+        assert_eq!(shards[4], expected[0]);
+        assert_eq!(shards[5], expected[1]);
+    }
+
+    #[test]
+    fn test_various_shard_counts() {
+        for data in [1, 2, 5, 10, 50, 127] {
+            for parity in [1, 2, 3, 5] {
+                if data + parity > 256 {
+                    continue;
+                }
+                let rs = ReedSolomon::new(data, parity).unwrap();
+                let mut shards: Vec<Vec<u8>> = (0..data + parity)
+                    .map(|i| {
+                        if i < data {
+                            vec![i as u8; 64]
+                        } else {
+                            vec![0u8; 64]
+                        }
+                    })
+                    .collect();
+
+                rs.encode(&mut shards).unwrap();
+                assert!(
+                    rs.verify(&shards).unwrap(),
+                    "Verify failed for data={data}, parity={parity}"
+                );
+
+                // Reconstruct with one missing data shard
+                let original = shards.clone();
+                let mut recovery: Vec<Option<Vec<u8>>> = shards.into_iter().map(Some).collect();
+                recovery[0] = None;
+                rs.reconstruct(&mut recovery).unwrap();
+                assert_eq!(
+                    recovery[0].as_ref().unwrap(),
+                    &original[0],
+                    "Reconstruct failed for data={data}, parity={parity}"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_reconstruct_data_only() {
+        let rs = ReedSolomon::new(4, 2).unwrap();
+        let mut shards: Vec<Vec<u8>> = vec![
+            vec![0, 1, 2, 3],
+            vec![4, 5, 6, 7],
+            vec![8, 9, 10, 11],
+            vec![12, 13, 14, 15],
+            vec![0, 0, 0, 0],
+            vec![0, 0, 0, 0],
+        ];
+        rs.encode(&mut shards).unwrap();
+
+        let original_data = shards[0].clone();
+
+        let mut recovery: Vec<Option<Vec<u8>>> = shards.into_iter().map(Some).collect();
+        recovery[0] = None;
+
+        rs.reconstruct_data(&mut recovery).unwrap();
+
+        assert_eq!(recovery[0].as_ref().unwrap(), &original_data);
+    }
+}
diff --git a/crates/lumen-host/src/gamestream/stream.rs b/crates/lumen-host/src/gamestream/stream.rs
index bbd1faf..f55f7f9 100644
--- a/crates/lumen-host/src/gamestream/stream.rs
+++ b/crates/lumen-host/src/gamestream/stream.rs
@@ -8,6 +8,7 @@ use super::VIDEO_PORT;
 use crate::capture::{self, Capturer, FastSyntheticCapturer};
 use crate::encode::{self, Codec};
 use anyhow::{Context, Result};
+use rand::Rng;
 use std::net::UdpSocket;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
@@ -82,7 +83,12 @@ fn run(cfg: StreamConfig, running: &AtomicBool) -> Result<()> {
         cfg.bitrate_kbps as u64 * 1000,
     )
     .context("open NVENC for stream")?;
-    let mut pk = VideoPacketizer::new(cfg.packet_size);
+    // FEC overhead percent (Sunshine default 20). Override with LUMEN_FEC_PCT (0 = data-only).
+    let fec_pct: u8 = std::env::var("LUMEN_FEC_PCT")
+        .ok()
+        .and_then(|v| v.parse().ok())
+        .unwrap_or(20);
+    let mut pk = VideoPacketizer::new(cfg.packet_size, fec_pct);
 
     // Pace at a steady rate (capped at 60fps), re-encoding the last captured frame when the
     // compositor produced no new one. wlroots only emits frames on damage, so a static or
@@ -94,6 +100,13 @@ fn run(cfg: StreamConfig, running: &AtomicBool) -> Result<()> {
     let mut fps_count: u32 = 0;
     let mut fps_t = Instant::now();
     let stream_start = Instant::now();
+    // Test knob: drop this % of outbound packets to exercise FEC recovery (0 = off).
+    let drop_pct: u32 = std::env::var("LUMEN_VIDEO_DROP")
+        .ok()
+        .and_then(|v| v.parse().ok())
+        .unwrap_or(0);
+    let mut rng = rand::thread_rng();
+    let mut dropped: u64 = 0;
 
     while running.load(Ordering::SeqCst) {
         let tick = Instant::now();
@@ -113,6 +126,11 @@ fn run(cfg: StreamConfig, running: &AtomicBool) -> Result<()> {
                 FrameType::P
             };
             for pkt in pk.packetize(&au.data, ft, ts) {
+                // Simulated network loss: build the packet (advances seq) but skip the send.
+                if drop_pct > 0 && rng.gen_range(0..100) < drop_pct {
+                    dropped += 1;
+                    continue;
+                }
                 if sock.send(&pkt).is_err() {
                     client_gone = true;
                     break;
@@ -130,7 +148,7 @@ fn run(cfg: StreamConfig, running: &AtomicBool) -> Result<()> {
 
         fps_count += 1;
         if fps_t.elapsed() >= Duration::from_secs(1) {
-            tracing::info!(fps = fps_count, sent_pkts, "video: streaming");
+            tracing::info!(fps = fps_count, sent_pkts, dropped, "video: streaming");
             fps_count = 0;
             fps_t = Instant::now();
         }
diff --git a/crates/lumen-host/src/gamestream/video.rs b/crates/lumen-host/src/gamestream/video.rs
index f43dc82..4441ebc 100644
--- a/crates/lumen-host/src/gamestream/video.rs
+++ b/crates/lumen-host/src/gamestream/video.rs
@@ -1,15 +1,21 @@
 //! GameStream video wire packetization: an encoded access unit → UDP datagrams a stock
-//! Moonlight client decodes. Each datagram is
+//! Moonlight client decodes (and recovers under loss). Each datagram is
 //!   `RTP_PACKET(12, big-endian) + reserved[4] + NV_VIDEO_PACKET(16, little-endian) + payload`
 //! and the frame's bitstream is prefixed with an 8-byte `video_short_frame_header_t`, then
-//! striped into ≤4 FEC blocks of ≤255 data shards. Byte-exact spec:
+//! striped into ≤4 FEC blocks of ≤255 shards. Byte-exact spec:
 //! `docs/research/gamestream-protocol-research.json` (video plane).
 //!
-//! P1.3 sends **data shards only** (`fecPercentage = 0`): on a clean LAN the client has
-//! every data shard and never runs Reed–Solomon recovery, so we get a decodable frame
-//! without matching Moonlight's `nanors` parity matrix (that interop work is P1.5). Plaintext
-//! only (encryption negotiated off for now). This lives in lumen-host for fast iteration;
-//! the wire codec moves into lumen-core (the P1 wire mode) once proven.
+//! FEC (P1.5): each block carries `m = ⌈k·pct/100⌉` Reed–Solomon parity shards generated by
+//! `lumen_core::fec::Gf8Coder` (the nanors-compatible Cauchy GF(2⁸) coder). Crucially, RS runs
+//! over the **whole `blocksize` shard** — Moonlight decodes over `packetSize + 16` bytes from
+//! the datagram start (`RtpVideoQueue.c`), and rejects a recovered shard whose reconstructed
+//! `flags` byte isn't valid — so the NV header fields RS must reproduce (streamPacketIndex,
+//! frameIndex, flags, multiFec*) are written into the data shards **before** encoding, and only
+//! the transport fields (RTP header/seq/timestamp + fecInfo) are stamped **after**, matching
+//! Sunshine `stream.cpp`. `pct = 0` falls back to data-shards-only. Plaintext (AES-GCM video
+//! encryption is negotiated off for now).
+
+use lumen_core::fec::{ErasureCoder, Gf8Coder};
 
 /// RTP `header` byte: version 2 (0x80) | extension (0x10) — Moonlight keys on the extension.
 const RTP_HEADER_BYTE: u8 = 0x80 | 0x10;
@@ -28,28 +34,32 @@ pub enum FrameType {
     P,
 }
 
-/// Splits encoded access units into GameStream video datagrams.
+/// Splits encoded access units into GameStream video datagrams (data + FEC parity shards).
 pub struct VideoPacketizer {
     /// Negotiated `packetSize` (ANNOUNCE `x-nv-video[0].packetSize`).
     packet_size: usize,
     /// Per-shard payload bytes = `blocksize - SHARD_HEADER`, `blocksize = packetSize + 16`.
     payload_per_shard: usize,
+    /// Requested FEC overhead percent (0 = data shards only). The wire carries the recomputed
+    /// per-block `(100·m)/k` so Moonlight derives the same parity count.
+    fec_percentage: usize,
     frame_index: u32,
     /// Monotonic per-stream packet counter (the RTP sequence / streamPacketIndex source).
     seq: u32,
 }
 
 impl VideoPacketizer {
-    pub fn new(packet_size: usize) -> Self {
+    pub fn new(packet_size: usize, fec_percentage: u8) -> Self {
         VideoPacketizer {
             packet_size,
             payload_per_shard: packet_size + 16 - SHARD_HEADER,
+            fec_percentage: fec_percentage as usize,
             frame_index: 0,
             seq: 0,
         }
     }
 
-    /// Packetize one encoded AU into wire datagrams (ready for UDP send).
+    /// Packetize one encoded AU into wire datagrams (data shards + Cauchy RS parity shards).
     pub fn packetize(
         &mut self,
         au: &[u8],
@@ -59,6 +69,8 @@ impl VideoPacketizer {
         let frame_index = self.frame_index;
         self.frame_index = self.frame_index.wrapping_add(1);
         let pps = self.payload_per_shard;
+        let blocksize = SHARD_HEADER + pps; // = packet_size + 16
+        let pct = self.fec_percentage;
 
         // frame payload = 8-byte short frame header + the AU bitstream.
         let total_len = 8 + au.len();
@@ -71,53 +83,120 @@ impl VideoPacketizer {
         fp.extend_from_slice(au);
 
         let total_data = total_len.div_ceil(pps).max(1);
-        let n_blocks = total_data
-            .div_ceil(MAX_DATA_SHARDS_PER_BLOCK)
-            .clamp(1, MAX_FEC_BLOCKS);
+        // With parity, cap per-block data so k + m ≤ 255 (the GF(2⁸) ceiling): parity for k
+        // data shards is ⌈k·pct/100⌉, so k ≤ 255·100/(100+pct).
+        let max_data = if pct > 0 {
+            (255 * 100) / (100 + pct)
+        } else {
+            MAX_DATA_SHARDS_PER_BLOCK
+        };
+        let n_blocks = total_data.div_ceil(max_data).clamp(1, MAX_FEC_BLOCKS);
         let per_block = total_data.div_ceil(n_blocks);
 
-        let mut packets = Vec::with_capacity(total_data);
+        let mut packets = Vec::with_capacity(total_data + total_data * pct / 100 + n_blocks);
         for b in 0..n_blocks {
             let first = b * per_block;
             let last = ((b + 1) * per_block).min(total_data);
             if first >= last {
                 break;
             }
-            let block_data_count = last - first;
-            for (fec_index, shard) in (first..last).enumerate() {
-                let start = shard * pps;
-                let end = (start + pps).min(fp.len());
-                let mut payload = vec![0u8; pps]; // last shard zero-padded
-                payload[..end - start].copy_from_slice(&fp[start..end]);
+            let k = last - first;
+            let block_seq_base = self.seq;
+            let multi_fec_blocks = ((b as u8) << 4) | (((n_blocks - 1) as u8) << 6);
 
+            // 1. Build this block's k data-shard datagrams (full `blocksize`), writing the NV
+            //    header fields RS must reproduce on recovery (streamPacketIndex, frameIndex,
+            //    flags, multiFec*). The RTP header + fecInfo are left zero (stamped post-RS).
+            let mut shards: Vec<Vec<u8>> = Vec::with_capacity(k);
+            for i in 0..k {
+                let global = first + i;
+                let seq = block_seq_base + i as u32;
+                let mut buf = vec![0u8; blocksize];
                 let mut flags = FLAG_PIC;
-                if shard == 0 {
+                if global == 0 {
                     flags |= FLAG_SOF;
                 }
-                if shard == total_data - 1 {
+                if global == total_data - 1 {
                     flags |= FLAG_EOF;
                 }
-                let multi_fec_blocks = ((b as u8) << 4) | (((n_blocks - 1) as u8) << 6);
-                // fecInfo: dataShards<<22 | fecIndex<<12 | fecPercentage<<4 (pct = 0).
-                let fec_info: u32 = ((block_data_count as u32) << 22) | ((fec_index as u32) << 12);
-                let seq = self.seq;
-                self.seq = self.seq.wrapping_add(1);
+                buf[16..20].copy_from_slice(&(seq << 8).to_le_bytes()); // streamPacketIndex
+                buf[20..24].copy_from_slice(&frame_index.to_le_bytes()); // frameIndex
+                buf[24] = flags;
+                buf[26] = MULTI_FEC_FLAGS;
+                buf[27] = multi_fec_blocks;
+                let ps = global * pps;
+                let pe = (ps + pps).min(fp.len());
+                buf[SHARD_HEADER..SHARD_HEADER + (pe - ps)].copy_from_slice(&fp[ps..pe]);
+                shards.push(buf);
+            }
 
-                packets.push(build_packet(
+            // 2. m = ⌈k·pct/100⌉ parity shards over the full datagrams. The wire percentage is
+            //    recomputed from m so the client derives the same parity count.
+            let m = if pct > 0 { (k * pct).div_ceil(100) } else { 0 };
+            let wire_pct = if m > 0 { (100 * m) / k } else { 0 };
+            let parity = if m > 0 {
+                Gf8Coder.encode(&shards, m).unwrap_or_default()
+            } else {
+                Vec::new()
+            };
+
+            // 3. Stamp transport headers (RTP + fecInfo) on every shard. We do NOT touch the
+            //    flags/streamPacketIndex bytes, so a recovered data shard's RS-reconstructed
+            //    NV header stays valid.
+            self.seq = block_seq_base + k as u32;
+            for (i, mut buf) in shards.into_iter().enumerate() {
+                let seq = block_seq_base + i as u32;
+                finalize(
+                    &mut buf,
                     seq,
                     timestamp_90k,
                     frame_index,
-                    flags,
                     multi_fec_blocks,
-                    fec_info,
-                    &payload,
-                ));
+                    fec_info(k, i, wire_pct),
+                );
+                packets.push(buf);
+            }
+            for (j, mut buf) in parity.into_iter().enumerate() {
+                let seq = self.seq;
+                self.seq = self.seq.wrapping_add(1);
+                finalize(
+                    &mut buf,
+                    seq,
+                    timestamp_90k,
+                    frame_index,
+                    multi_fec_blocks,
+                    fec_info(k, k + j, wire_pct),
+                );
+                packets.push(buf);
             }
         }
         packets
     }
 }
 
+/// `fecInfo` (u32, little-endian): `dataShards<<22 | fecIndex<<12 | fecPercentage<<4`.
+fn fec_info(k: usize, fec_index: usize, pct: usize) -> u32 {
+    ((k as u32) << 22) | ((fec_index as u32) << 12) | ((pct as u32) << 4)
+}
+
+/// Stamp the post-RS transport fields into a shard datagram (in place). Leaves the NV
+/// `flags`/`streamPacketIndex`/`multiFecFlags` bytes untouched (RS-covered).
+fn finalize(
+    buf: &mut [u8],
+    seq: u32,
+    ts_90k: u32,
+    frame_index: u32,
+    multi_fec_blocks: u8,
+    fec_info: u32,
+) {
+    buf[0] = RTP_HEADER_BYTE; // header (version 2 + extension)
+    buf[2..4].copy_from_slice(&(seq as u16).to_be_bytes()); // sequenceNumber (BE)
+    buf[4..8].copy_from_slice(&ts_90k.to_be_bytes()); // timestamp (90 kHz, BE)
+    buf[20..24].copy_from_slice(&frame_index.to_le_bytes()); // frameIndex (re-affirm for parity)
+    buf[27] = multi_fec_blocks; // re-affirm for parity
+    buf[28..32].copy_from_slice(&fec_info.to_le_bytes()); // fecInfo (LE)
+}
+
 /// 8-byte `video_short_frame_header_t` (little-endian), prefixed to the AU bitstream.
 fn short_frame_header(frame_type: FrameType, last_payload_len: u16) -> [u8; 8] {
     let mut h = [0u8; 8];
@@ -132,55 +211,21 @@ fn short_frame_header(frame_type: FrameType, last_payload_len: u16) -> [u8; 8] {
     h
 }
 
-/// Build one wire datagram: RTP(BE) + reserved + NV_VIDEO_PACKET(LE) + payload.
-fn build_packet(
-    seq: u32,
-    timestamp_90k: u32,
-    frame_index: u32,
-    flags: u8,
-    multi_fec_blocks: u8,
-    fec_info: u32,
-    payload: &[u8],
-) -> Vec<u8> {
-    let mut p = Vec::with_capacity(SHARD_HEADER + payload.len());
-    // --- RTP_PACKET (12 bytes, big-endian) ---
-    p.push(RTP_HEADER_BYTE); // header
-    p.push(0); // packetType (unused for video)
-    p.extend_from_slice(&(seq as u16).to_be_bytes()); // sequenceNumber
-    p.extend_from_slice(&timestamp_90k.to_be_bytes()); // timestamp (90 kHz)
-    p.extend_from_slice(&0u32.to_be_bytes()); // ssrc
-                                              // --- reserved[4] ---
-    p.extend_from_slice(&[0u8; 4]);
-    // --- NV_VIDEO_PACKET (16 bytes, little-endian) ---
-    p.extend_from_slice(&(seq << 8).to_le_bytes()); // streamPacketIndex (low byte 0)
-    p.extend_from_slice(&frame_index.to_le_bytes()); // frameIndex
-    p.push(flags);
-    p.push(0); // extraFlags
-    p.push(MULTI_FEC_FLAGS);
-    p.push(multi_fec_blocks);
-    p.extend_from_slice(&fec_info.to_le_bytes()); // fecInfo
-                                                  // --- payload ---
-    p.extend_from_slice(payload);
-    p
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
 
     #[test]
     fn single_block_layout() {
-        let mut pk = VideoPacketizer::new(1392); // payload_per_shard = 1392+16-32 = 1376
+        let mut pk = VideoPacketizer::new(1392, 0); // data-only; pps = 1392+16-32 = 1376
         assert_eq!(pk.payload_per_shard, 1376);
         let au = vec![0xABu8; 4000]; // 8+4000 = 4008 → ceil(4008/1376) = 3 data shards
         let pkts = pk.packetize(&au, FrameType::Idr, 90_000);
         assert_eq!(pkts.len(), 3);
-        // Every datagram is SHARD_HEADER + payload_per_shard.
         for p in &pkts {
             assert_eq!(p.len(), SHARD_HEADER + 1376);
             assert_eq!(p[0], 0x90); // RTP header byte
         }
-        // First packet: SOF set, fecIndex 0, frameIndex 0.
         let first = &pkts[0];
         assert_eq!(first[24] & FLAG_SOF, FLAG_SOF);
         assert_eq!(first[24] & FLAG_PIC, FLAG_PIC);
@@ -189,12 +234,10 @@ mod tests {
         let fec_info = u32::from_le_bytes(first[28..32].try_into().unwrap());
         assert_eq!(fec_info >> 22, 3); // dataShards = 3
         assert_eq!((fec_info >> 12) & 0x3ff, 0); // fecIndex 0
-                                                 // Last packet: EOF set, fecIndex 2.
         let last = &pkts[2];
         assert_eq!(last[24] & FLAG_EOF, FLAG_EOF);
         let fec_info_last = u32::from_le_bytes(last[28..32].try_into().unwrap());
         assert_eq!((fec_info_last >> 12) & 0x3ff, 2);
-        // RTP sequence numbers are 0,1,2.
         for (i, p) in pkts.iter().enumerate() {
             assert_eq!(u16::from_be_bytes(p[2..4].try_into().unwrap()), i as u16);
         }
@@ -202,15 +245,59 @@ mod tests {
 
     #[test]
     fn multi_block_split() {
-        let mut pk = VideoPacketizer::new(1392);
-        // Need > 255 data shards → multi-block. 255*1376 ≈ 351 KB; use 600 KB.
+        let mut pk = VideoPacketizer::new(1392, 0); // data-only
         let au = vec![0u8; 600_000];
         let pkts = pk.packetize(&au, FrameType::P, 0);
         let total = (8 + au.len()).div_ceil(1376);
         assert_eq!(pkts.len(), total);
-        // n_blocks = ceil(total/255), clamped to 4; check multiFecBlocks lastBlock nibble.
         let n_blocks = total.div_ceil(255).clamp(1, 4);
         let last_block = ((pkts.last().unwrap()[27]) >> 6) & 0x3;
         assert_eq!(last_block as usize, n_blocks - 1);
     }
+
+    #[test]
+    fn emits_parity_shards() {
+        let mut pk = VideoPacketizer::new(1392, 20); // pps = 1376, 20% FEC
+        let au = vec![0xABu8; 4000]; // 8+4000 = 4008 → 3 data shards (k=3)
+        let pkts = pk.packetize(&au, FrameType::Idr, 0);
+        // m = ceil(3*20/100) = 1 parity shard → 4 packets; wire_pct = 100*1/3 = 33.
+        assert_eq!(pkts.len(), 4);
+        for p in &pkts {
+            let fec_info = u32::from_le_bytes(p[28..32].try_into().unwrap());
+            assert_eq!(fec_info >> 22, 3); // dataShards = k = 3
+            assert_eq!((fec_info >> 4) & 0xff, 33); // wire fecPercentage
+        }
+        // The parity shard is last: fecIndex = k = 3.
+        let parity = &pkts[3];
+        let fec_info = u32::from_le_bytes(parity[28..32].try_into().unwrap());
+        assert_eq!((fec_info >> 12) & 0x3ff, 3);
+        // Data shards keep SOF (first) / EOF (last data shard) / PIC.
+        assert_eq!(pkts[0][24] & FLAG_SOF, FLAG_SOF);
+        assert_eq!(pkts[2][24] & FLAG_EOF, FLAG_EOF);
+        // RTP sequence numbers are contiguous across data + parity (0,1,2,3).
+        for (i, p) in pkts.iter().enumerate() {
+            assert_eq!(u16::from_be_bytes(p[2..4].try_into().unwrap()), i as u16);
+        }
+    }
+
+    /// End-to-end recovery: parity over the full datagram reconstructs a dropped data shard's
+    /// payload AND its NV `flags` byte (the byte Moonlight validates), proving the layout.
+    #[test]
+    fn parity_recovers_full_datagram_incl_flags() {
+        let mut pk = VideoPacketizer::new(1392, 50); // high pct → plenty of parity
+        let au = vec![0x5Au8; 4000]; // k = 3
+        let pkts = pk.packetize(&au, FrameType::Idr, 0);
+        let k = 3usize;
+        let m = pkts.len() - k;
+        assert!(m >= 1);
+        // Drop data shard 1; reconstruct from the rest via the same Cauchy coder.
+        let mut received: Vec<Option<Vec<u8>>> = pkts.iter().map(|p| Some(p.clone())).collect();
+        received[1] = None;
+        let recovered = Gf8Coder.reconstruct(k, m, &mut received).unwrap();
+        // The recovered shard equals the original data shard's RS-covered bytes: its flags
+        // byte (offset 24) is PIC (middle shard), proving the NV header recovers correctly.
+        assert_eq!(recovered[1][24], FLAG_PIC);
+        // ...and the payload region matches the original.
+        assert_eq!(recovered[1][SHARD_HEADER..], pkts[1][SHARD_HEADER..]);
+    }
 }