feat: M2 P1.5 (FEC) — nanors-exact Reed-Solomon recovery for the video stream
Moonlight now reconstructs lost video shards from our parity (verified live: under induced packet loss the picture recovers cleanly instead of failing with "network connection too bad"; 0% added loss in normal operation).

The decisive finding: Moonlight's nanors uses a CAUCHY generator matrix (M[j][i] = inv[(m+i)^j], GF(2^8) poly 0x1d), while reed-solomon-erasure is Vandermonde — so its parity was NOT Moonlight-decodable, despite the old gf8.rs comment claiming equivalence.

lumen-core:
- Swap the GF(2^8) backend from reed-solomon-erasure to a vendored fec-rs (vendor/fec-rs, BSD-2), which builds the byte-identical Cauchy matrix. Pure Rust, no FFI — keeps the "one core" hot path. This makes both lumen's own protocol and the GameStream parity nanors-compatible.
- Lock it with a regression test against real nanors vectors (k=4,m=2 [10,20,30,40] -> parity [136,0]) + an independent matrix-derived cross-check + an erase/recover round-trip. Existing FEC/loopback tests stay green, so lumen's own protocol is unaffected.

lumen-host video.rs:
- Generate m = ceil(k*pct/100) parity shards per FEC block via Gf8Coder; stamp fecInfo with the recomputed wire pct (100*m/k) so the client derives the same count; cap per-block data to 255*100/(100+pct) so k+m <= 255.
- CRITICAL byte-exactness: RS runs over the whole `blocksize` shard (Moonlight decodes packetSize+16 bytes from the datagram start and PACKET_RECOVERY_FAILUREs on a bad reconstructed `flags` byte). So the NV header fields RS must reproduce (streamPacketIndex/frameIndex/flags/multiFec*) are written into data shards BEFORE encode, and only the transport fields (RTP header/seq/timestamp + fecInfo) are stamped AFTER — leaving the flags byte RS-covered. Matches Sunshine stream.cpp. Unit-tested incl. flags recovery.
- fec_percentage wired from stream.rs (Sunshine default 20, LUMEN_FEC_PCT override; 0 = data-only). LUMEN_VIDEO_DROP injects loss to test recovery.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Generated
+66
-133
@@ -37,17 +37,6 @@ dependencies = [
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.7.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9"
|
||||
dependencies = [
|
||||
"getrandom 0.2.17",
|
||||
"once_cell",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.4"
|
||||
@@ -357,7 +346,7 @@ version = "0.70.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
|
||||
dependencies = [
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"cexpr",
|
||||
"clang-sys",
|
||||
"itertools",
|
||||
@@ -376,7 +365,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
|
||||
dependencies = [
|
||||
"annotate-snippets",
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"cexpr",
|
||||
"clang-sys",
|
||||
"itertools",
|
||||
@@ -403,12 +392,6 @@ version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.13.0"
|
||||
@@ -650,6 +633,25 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
||||
dependencies = [
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.21"
|
||||
@@ -838,13 +840,20 @@ version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
|
||||
|
||||
[[package]]
|
||||
name = "fec-rs"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"rayon",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ffmpeg-next"
|
||||
version = "7.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da02698288e0275e442a47fc12ca26d50daf0d48b15398ba5906f20ac2e2a9f9"
|
||||
dependencies = [
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"ffmpeg-sys-next",
|
||||
"libc",
|
||||
]
|
||||
@@ -1078,15 +1087,6 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.15.5"
|
||||
@@ -1239,15 +1239,6 @@ dependencies = [
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "instant"
|
||||
version = "0.1.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is_terminal_polyfill"
|
||||
version = "1.70.2"
|
||||
@@ -1381,7 +1372,7 @@ version = "0.9.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6b8cfa2a7656627b4c92c6b9ef929433acd673d5ab3708cda1b18478ac00df4"
|
||||
dependencies = [
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"cc",
|
||||
"convert_case",
|
||||
"cookie-factory",
|
||||
@@ -1431,15 +1422,6 @@ dependencies = [
|
||||
"lumen-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru"
|
||||
version = "0.7.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e999beba7b6e8345721bd280141ed958096a2e4abdf74f67ff4ce49b4b54e47a"
|
||||
dependencies = [
|
||||
"hashbrown 0.12.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru-slab"
|
||||
version = "0.1.2"
|
||||
@@ -1462,10 +1444,10 @@ dependencies = [
|
||||
"aes-gcm",
|
||||
"bytes",
|
||||
"cbindgen",
|
||||
"fec-rs",
|
||||
"proptest",
|
||||
"quinn",
|
||||
"rand 0.9.4",
|
||||
"reed-solomon-erasure",
|
||||
"reed-solomon-simd",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
@@ -1593,7 +1575,7 @@ version = "0.30.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6"
|
||||
dependencies = [
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"cfg-if",
|
||||
"cfg_aliases",
|
||||
"libc",
|
||||
@@ -1757,17 +1739,6 @@ version = "2.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba"
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
|
||||
dependencies = [
|
||||
"instant",
|
||||
"lock_api",
|
||||
"parking_lot_core 0.8.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.5"
|
||||
@@ -1775,21 +1746,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
"parking_lot_core 0.9.12",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot_core"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"instant",
|
||||
"libc",
|
||||
"redox_syscall 0.2.16",
|
||||
"smallvec",
|
||||
"winapi",
|
||||
"parking_lot_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1800,7 +1757,7 @@ checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"redox_syscall 0.5.18",
|
||||
"redox_syscall",
|
||||
"smallvec",
|
||||
"windows-link",
|
||||
]
|
||||
@@ -1843,7 +1800,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9688b89abf11d756499f7c6190711d6dbe5a3acdb30c8fbf001d6596d06a8d44"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"libc",
|
||||
"libspa",
|
||||
"libspa-sys",
|
||||
@@ -1954,7 +1911,7 @@ checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744"
|
||||
dependencies = [
|
||||
"bit-set",
|
||||
"bit-vec",
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"num-traits",
|
||||
"rand 0.9.4",
|
||||
"rand_chacha 0.9.0",
|
||||
@@ -2126,6 +2083,26 @@ dependencies = [
|
||||
"rand_core 0.9.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
|
||||
dependencies = [
|
||||
"either",
|
||||
"rayon-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
|
||||
dependencies = [
|
||||
"crossbeam-deque",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rcgen"
|
||||
version = "0.13.2"
|
||||
@@ -2145,35 +2122,13 @@ version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08ad765b21a08b1a8e5cdce052719188a23772bcbefb3c439f0baaf62c56ceac"
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.5.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
|
||||
dependencies = [
|
||||
"bitflags 2.13.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reed-solomon-erasure"
|
||||
version = "6.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7263373d500d4d4f505d43a2a662d475a894aa94503a1ee28e9188b5f3960d4f"
|
||||
dependencies = [
|
||||
"libm",
|
||||
"lru",
|
||||
"parking_lot 0.11.2",
|
||||
"smallvec",
|
||||
"spin",
|
||||
"bitflags",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2278,7 +2233,7 @@ version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
|
||||
dependencies = [
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
@@ -2434,7 +2389,7 @@ version = "3.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
|
||||
dependencies = [
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"core-foundation",
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
@@ -2829,7 +2784,7 @@ dependencies = [
|
||||
"bytes",
|
||||
"libc",
|
||||
"mio",
|
||||
"parking_lot 0.12.5",
|
||||
"parking_lot",
|
||||
"pin-project-lite",
|
||||
"signal-hook-registry",
|
||||
"socket2",
|
||||
@@ -3257,7 +3212,7 @@ version = "0.244.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
|
||||
dependencies = [
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"hashbrown 0.15.5",
|
||||
"indexmap",
|
||||
"semver",
|
||||
@@ -3282,7 +3237,7 @@ version = "0.31.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "645c7c96bb74690c3189b5c9cb4ca1627062bb23693a4fad9d8c3de958260144"
|
||||
dependencies = [
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"rustix",
|
||||
"wayland-backend",
|
||||
"wayland-scanner",
|
||||
@@ -3294,7 +3249,7 @@ version = "0.32.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "563a85523cade2429938e790815fd7319062103b9f4a2dc806e9b53b95982d8f"
|
||||
dependencies = [
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"wayland-backend",
|
||||
"wayland-client",
|
||||
"wayland-scanner",
|
||||
@@ -3306,7 +3261,7 @@ version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e9567599ef23e09b8dad6e429e5738d4509dfc46b3b21f32841a304d16b29c8"
|
||||
dependencies = [
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"wayland-backend",
|
||||
"wayland-client",
|
||||
"wayland-protocols",
|
||||
@@ -3319,7 +3274,7 @@ version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eb04e52f7836d7c7976c78ca0250d61e33873c34156a2a1fc9474828ec268234"
|
||||
dependencies = [
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"wayland-backend",
|
||||
"wayland-client",
|
||||
"wayland-protocols",
|
||||
@@ -3365,22 +3320,6 @@ dependencies = [
|
||||
"rustls-pki-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
||||
dependencies = [
|
||||
"winapi-i686-pc-windows-gnu",
|
||||
"winapi-x86_64-pc-windows-gnu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-i686-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-util"
|
||||
version = "0.1.11"
|
||||
@@ -3390,12 +3329,6 @@ dependencies = [
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
[[package]]
|
||||
name = "windows-link"
|
||||
version = "0.2.1"
|
||||
@@ -3703,7 +3636,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bitflags 2.13.0",
|
||||
"bitflags",
|
||||
"indexmap",
|
||||
"log",
|
||||
"serde",
|
||||
|
||||
@@ -23,7 +23,11 @@ quic = ["dep:quinn", "dep:tokio"]
|
||||
|
||||
[dependencies]
|
||||
reed-solomon-simd = "3.1" # GF(2^16) Leopard-RS, SIMD, O(n log n) — the wall-breaker (P2)
|
||||
reed-solomon-erasure = "6.0" # GF(2^8) classic RS — GameStream/Moonlight compat (P1)
|
||||
# Vendored fork of fec-rs: GF(2^8) classic RS with the *Cauchy* generator matrix
|
||||
# (M[j][i] = inv[(m+i)^j]) — byte-identical to the `nanors` library Moonlight uses, so our
|
||||
# parity is decodable by a stock Moonlight client. (reed-solomon-erasure is Vandermonde and is
|
||||
# NOT interoperable.) See vendor/fec-rs/LICENSE (BSD-2-Clause).
|
||||
fec-rs = { path = "vendor/fec-rs" }
|
||||
aes-gcm = "0.10" # AES-128-GCM session crypto, matches GameStream
|
||||
zerocopy = { version = "0.8", features = ["derive"] }
|
||||
bytes = "1"
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
//! GF(2⁸) classic Reed–Solomon backend (`reed-solomon-erasure`), equivalent to the
|
||||
//! `nanors` library Moonlight uses. Hard ceiling: data + recovery ≤ 255 shards/block.
|
||||
//! GF(2⁸) classic Reed–Solomon backend (vendored `fec-rs`). Uses the **Cauchy** generator
|
||||
//! matrix `M[j][i] = inv[(m+i)^j]` over GF(2⁸) (poly 0x1d) — byte-identical to the `nanors`
|
||||
//! library Moonlight uses, so the parity this produces is recoverable by a stock Moonlight
|
||||
//! client (unlike Vandermonde RS, whose parity is not interoperable). Hard ceiling: data +
|
||||
//! recovery ≤ 255 shards/block.
|
||||
|
||||
use super::{validate_block_shape, validate_encode_shape, ErasureCoder, FecError};
|
||||
use crate::config::FecScheme;
|
||||
use reed_solomon_erasure::galois_8::ReedSolomon;
|
||||
use fec_rs::ReedSolomon;
|
||||
|
||||
pub struct Gf8Coder;
|
||||
|
||||
@@ -21,7 +24,7 @@ impl ErasureCoder for Gf8Coder {
|
||||
let shard_len = data[0].len();
|
||||
let rs = ReedSolomon::new(k, recovery_count)
|
||||
.map_err(|_| FecError::Config("invalid GF(2^8) shard counts"))?;
|
||||
// reed-solomon-erasure fills parity in place: shards = data || zeroed parity.
|
||||
// fec-rs fills parity in place: shards = data || zeroed parity.
|
||||
let mut shards: Vec<Vec<u8>> = Vec::with_capacity(k + recovery_count);
|
||||
shards.extend_from_slice(data);
|
||||
shards.resize_with(k + recovery_count, || vec![0u8; shard_len]);
|
||||
@@ -69,3 +72,69 @@ fn collect_originals(
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Pins byte-for-byte agreement with Moonlight's `nanors` encoder (Cauchy matrix
/// `M[j][i] = inv[(m+i)^j]` over GF(2⁸), poly 0x1d). A backend that built a different
/// generator matrix would fail here, and its parity would not be Moonlight-decodable.
#[test]
fn nanors_exact_parity_vectors() {
    let gf8 = Gf8Coder;

    // Reference nanors vector (k=4, m=2): one-byte shards [10,20,30,40] encode to [136, 0].
    let din = [10u8, 20, 30, 40];
    let data: Vec<Vec<u8>> = din.iter().map(|&byte| vec![byte]).collect();
    let parity = gf8.encode(&data, 2).unwrap();
    assert_eq!(parity, vec![vec![136u8], vec![0u8]]);

    // Re-derive each parity byte from the Cauchy rows themselves, so the check proves the
    // matrix rather than a memorized output: parity[j] = XOR_i M[j][i] · data[i] over GF(2⁸).
    let rows = [[142u8, 244, 71, 167], [244, 142, 167, 71]];
    for (j, row) in rows.iter().enumerate() {
        let mut expect = 0u8;
        for (&coeff, &byte) in row.iter().zip(din.iter()) {
            expect ^= gf_mul(coeff, byte);
        }
        assert_eq!(parity[j][0], expect, "parity row {j}");
    }
}
|
||||
|
||||
/// Round-trip check: drop as many data shards as there are parity shards and confirm that
/// reconstruction returns the original data.
#[test]
fn recovers_erased_data_shards() {
    let gf8 = Gf8Coder;
    let data: Vec<Vec<u8>> = (0..6).map(|i| vec![i as u8; 8]).collect();
    let parity = gf8.encode(&data, 3).unwrap();

    // Received block layout: the 6 data shards followed by the 3 parity shards.
    let mut received: Vec<Option<Vec<u8>>> = Vec::with_capacity(data.len() + parity.len());
    received.extend(data.iter().cloned().map(Some));
    received.extend(parity.into_iter().map(Some));

    // Erase exactly 3 data shards (the whole FEC budget) and nothing else.
    for lost in [1, 3, 5] {
        received[lost] = None;
    }

    let recovered = gf8.reconstruct(6, 3, &mut received).unwrap();
    assert_eq!(recovered, data);
}
|
||||
|
||||
/// Bitwise GF(2⁸) multiplication with reduction polynomial 0x1d (x⁸ implied), written
/// without lookup tables so it stays independent of the backend under test.
fn gf_mul(a: u8, b: u8) -> u8 {
    let mut product = 0u8;
    // `term` holds a · x^bit, reduced modulo the field polynomial after each doubling.
    let mut term = a;
    for bit in 0..8 {
        if (b >> bit) & 1 == 1 {
            product ^= term;
        }
        let overflowed = term & 0x80 != 0;
        term <<= 1;
        if overflowed {
            term ^= 0x1d;
        }
    }
    product
}
|
||||
}
|
||||
|
||||
+17
@@ -0,0 +1,17 @@
|
||||
[package]
|
||||
name = "fec-rs"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
description = "A pure Rust Reed-Solomon erasure coding library with runtime SIMD acceleration"
|
||||
license = "BSD-2-Clause"
|
||||
repository = "https://github.com/hgaiser/fec-rs"
|
||||
keywords = ["reed-solomon", "erasure", "coding", "fec", "simd"]
|
||||
categories = ["algorithms", "encoding"]
|
||||
readme = "README.md"
|
||||
|
||||
[dependencies]
|
||||
rayon = { version = "1", optional = true }
|
||||
|
||||
[features]
|
||||
default = []
|
||||
parallel = ["rayon"]
|
||||
+24
@@ -0,0 +1,24 @@
|
||||
BSD 2-Clause License
|
||||
|
||||
Copyright (c) 2026, Hans Gaiser
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
+73
@@ -0,0 +1,73 @@
|
||||
# fec-rs
|
||||
|
||||
[CI status](https://github.com/hgaiser/fec-rs/actions)
|
||||
[crates.io](https://crates.io/crates/fec-rs)
|
||||
[Documentation (docs.rs)](https://docs.rs/fec-rs)
|
||||
|
||||
A pure Rust Reed-Solomon erasure coding library with runtime SIMD acceleration.
|
||||
|
||||
## Features
|
||||
|
||||
- **Pure Rust** — No C/C++ dependencies or FFI. Everything is implemented in safe Rust
|
||||
(with targeted `unsafe` for SIMD intrinsics).
|
||||
- **Runtime SIMD detection** — Automatically uses the fastest available instruction set
|
||||
via `std::is_x86_feature_detected!`. A single binary works on all x86_64 systems.
|
||||
- **GF(2^8)** — Operates over the Galois field GF(2^8) with generating polynomial 29 (0x1D),
|
||||
compatible with the Moonlight streaming protocol.
|
||||
- **Shard-by-shard encoding** — Incremental encoding via `ShardByShard` for streaming use cases.
|
||||
- **Reconstruction** — Reconstruct missing data and/or parity shards from any sufficient subset.
|
||||
|
||||
## SIMD Acceleration
|
||||
|
||||
On x86_64, the library automatically detects CPU features at runtime and uses
|
||||
the best available instruction set:
|
||||
|
||||
- **GFNI + AVX2** — Single-instruction GF multiply on 32 bytes (Intel Alder Lake+, AMD Zen 4+)
|
||||
- **AVX2** — VPSHUFB split-table nibble lookup on 32 bytes
|
||||
- **GFNI + SSE** — Single-instruction GF multiply on 16 bytes
|
||||
- **SSSE3** — PSHUFB split-table nibble lookup on 16 bytes
|
||||
- **Scalar** — Lookup table fallback
|
||||
|
||||
## Parallel Encoding
|
||||
|
||||
Enable the `parallel` feature for optional rayon-based parallel encoding:
|
||||
|
||||
```toml
|
||||
fec-rs = { version = "0.1", features = ["parallel"] }
|
||||
```
|
||||
|
||||
When enabled, large encode workloads automatically distribute parity shard
|
||||
computation across threads. Small workloads use the sequential path to avoid
|
||||
overhead.
|
||||
|
||||
## Usage
|
||||
|
||||
```rust
|
||||
use fec_rs::ReedSolomon;
|
||||
|
||||
let rs = ReedSolomon::new(4, 2).unwrap();
|
||||
|
||||
let mut shards: Vec<Vec<u8>> = vec![
|
||||
vec![0, 1, 2, 3],
|
||||
vec![4, 5, 6, 7],
|
||||
vec![8, 9, 10, 11],
|
||||
vec![12, 13, 14, 15],
|
||||
vec![0, 0, 0, 0], // parity shard 1
|
||||
vec![0, 0, 0, 0], // parity shard 2
|
||||
];
|
||||
|
||||
// Encode parity
|
||||
rs.encode(&mut shards).unwrap();
|
||||
|
||||
// Verify
|
||||
assert!(rs.verify(&shards).unwrap());
|
||||
|
||||
// Simulate loss of shard 0
|
||||
let mut recovery: Vec<Option<Vec<u8>>> = shards.into_iter().map(Some).collect();
|
||||
recovery[0] = None;
|
||||
|
||||
// Reconstruct
|
||||
rs.reconstruct(&mut recovery).unwrap();
|
||||
```
|
||||
|
||||
License: BSD-2-Clause
|
||||
+200
@@ -0,0 +1,200 @@
|
||||
#![allow(clippy::needless_range_loop)]
|
||||
|
||||
use std::env;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
const FIELD_SIZE: usize = 256;
|
||||
const GENERATING_POLYNOMIAL: usize = 29;
|
||||
|
||||
/// Builds the discrete-logarithm table of the field with generator 2.
///
/// `polynomial` holds the low bits of the reduction polynomial; the top term is implied by
/// the `FIELD_SIZE` overflow check. Entry `v` receives the exponent `e` for which the `e`-th
/// power of the generator equals `v`. Entry 0 is never written and stays 0.
fn gen_log_table(polynomial: usize) -> [u8; FIELD_SIZE] {
    let mut table = [0u8; FIELD_SIZE];
    let mut power: usize = 1;

    for exponent in 0..FIELD_SIZE - 1 {
        table[power] = exponent as u8;

        // Multiply by the generator, folding any overflow back in with the polynomial.
        let doubled = power << 1;
        power = if doubled >= FIELD_SIZE {
            (doubled - FIELD_SIZE) ^ polynomial
        } else {
            doubled
        };
    }

    table
}
|
||||
|
||||
const EXP_TABLE_SIZE: usize = FIELD_SIZE * 2 - 2;
|
||||
|
||||
/// Inverts `log_table` into an exponent (anti-log) table.
///
/// Each non-zero field element is stored twice, at its logarithm and again
/// `FIELD_SIZE - 1` slots later, so the table can be indexed with the sum of two
/// logarithms without a modulo reduction.
fn gen_exp_table(log_table: &[u8; FIELD_SIZE]) -> [u8; EXP_TABLE_SIZE] {
    let mut table = [0u8; EXP_TABLE_SIZE];

    // Element 0 has no logarithm, so start from 1.
    for (value, &exponent) in log_table.iter().enumerate().skip(1) {
        let exponent = usize::from(exponent);
        table[exponent] = value as u8;
        table[exponent + (FIELD_SIZE - 1)] = value as u8;
    }

    table
}
|
||||
|
||||
/// Multiplies two field elements using the log/exp tables.
///
/// Zero is handled up front because it has no logarithm; otherwise the product is the
/// anti-log of the sum of the two logarithms. The sum is used as a direct index into
/// `exp_table`, with no modulo reduction.
fn multiply(log_table: &[u8; FIELD_SIZE], exp_table: &[u8; EXP_TABLE_SIZE], a: u8, b: u8) -> u8 {
    if a == 0 || b == 0 {
        return 0;
    }

    let log_a = usize::from(log_table[usize::from(a)]);
    let log_b = usize::from(log_table[usize::from(b)]);
    exp_table[log_a + log_b]
}
|
||||
|
||||
fn gen_mul_table(
|
||||
log_table: &[u8; FIELD_SIZE],
|
||||
exp_table: &[u8; EXP_TABLE_SIZE],
|
||||
) -> [[u8; FIELD_SIZE]; FIELD_SIZE] {
|
||||
let mut result = [[0u8; FIELD_SIZE]; FIELD_SIZE];
|
||||
|
||||
for a in 0..FIELD_SIZE {
|
||||
for b in 0..FIELD_SIZE {
|
||||
result[a][b] = multiply(log_table, exp_table, a as u8, b as u8);
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
fn gen_mul_table_half(
|
||||
log_table: &[u8; FIELD_SIZE],
|
||||
exp_table: &[u8; EXP_TABLE_SIZE],
|
||||
) -> ([[u8; 16]; FIELD_SIZE], [[u8; 16]; FIELD_SIZE]) {
|
||||
let mut low = [[0u8; 16]; FIELD_SIZE];
|
||||
let mut high = [[0u8; 16]; FIELD_SIZE];
|
||||
|
||||
for a in 0..FIELD_SIZE {
|
||||
for b in 0..FIELD_SIZE {
|
||||
let mut result = 0;
|
||||
if a != 0 && b != 0 {
|
||||
let log_a = log_table[a];
|
||||
let log_b = log_table[b];
|
||||
result = exp_table[log_a as usize + log_b as usize];
|
||||
}
|
||||
if (b & 0x0F) == b {
|
||||
low[a][b] = result;
|
||||
}
|
||||
if (b & 0xF0) == b {
|
||||
high[a][b >> 4] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
(low, high)
|
||||
}
|
||||
|
||||
/// Generate the GFNI affine matrix table.
|
||||
///
|
||||
/// For each constant `c` in GF(2^8), compute a u64-packed 8x8 binary matrix
|
||||
/// such that `vgf2p8affineqb(x, matrix, 0)` produces `c * x` in our GF(2^8).
|
||||
///
|
||||
/// vgf2p8affineqb semantics:
|
||||
/// result_bit[i] = popcount(x AND qword_byte[7-i]) mod 2
|
||||
/// where i goes from 0 (LSB) to 7 (MSB).
|
||||
///
|
||||
/// Matrix packing: qword byte[7] = row for output bit 7 (MSB),
|
||||
/// qword byte[0] = row for output bit 0 (LSB).
|
||||
fn gen_gfni_table(
|
||||
log_table: &[u8; FIELD_SIZE],
|
||||
exp_table: &[u8; EXP_TABLE_SIZE],
|
||||
) -> [u64; FIELD_SIZE] {
|
||||
let mut result = [0u64; FIELD_SIZE];
|
||||
|
||||
for c in 0..FIELD_SIZE {
|
||||
// Build row bytes for each output bit.
|
||||
// row_for_bit_i = mask where bit j is set iff input bit j contributes to output bit i.
|
||||
// M[i][j] = bit_i(c * (1 << j))
|
||||
let mut rows = [0u8; 8];
|
||||
for j in 0..8u8 {
|
||||
let basis = 1u8 << j; // input with only bit j set
|
||||
let product = multiply(log_table, exp_table, c as u8, basis);
|
||||
// product's bit i tells us M[i][j]
|
||||
for i in 0..8u8 {
|
||||
if (product >> i) & 1 == 1 {
|
||||
rows[i as usize] |= 1 << j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pack into u64: byte[7-i] = rows[i]
|
||||
// vgf2p8affineqb: result_bit[i] = popcount(x AND byte[7-i]) mod 2
|
||||
// We want result_bit[i] = bit i of (c*x), so byte[7-i] = rows[i].
|
||||
let mut matrix: u64 = 0;
|
||||
for i in 0..8u32 {
|
||||
matrix |= (rows[i as usize] as u64) << ((7 - i) * 8);
|
||||
}
|
||||
result[c] = matrix;
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Emits `table` into `f` as a Rust item of the form
/// `pub static NAME: [u8; LEN] = [v0, v1, ...];` followed by a newline.
///
/// Output is staged through a `BufWriter` so the table costs a handful of writes
/// instead of one unbuffered write per element; the emitted bytes are unchanged.
///
/// # Panics
///
/// Panics if writing to or flushing `f` fails.
fn write_1d_table(f: &mut File, table: &[u8], name: &str) {
    let mut out = std::io::BufWriter::new(f);
    let len = table.len();

    write!(out, "pub static {name}: [u8; {len}] = [").unwrap();
    for v in table {
        write!(out, "{v}, ").unwrap();
    }
    writeln!(out, "];").unwrap();

    // Flush explicitly: `BufWriter`'s destructor would swallow a write error.
    out.flush().unwrap();
}
|
||||
|
||||
fn write_2d_table(f: &mut File, table: &[[u8; 16]; FIELD_SIZE], name: &str) {
|
||||
let rows = table.len();
|
||||
let cols = table[0].len();
|
||||
write!(f, "pub static {name}: [[u8; {cols}]; {rows}] = [").unwrap();
|
||||
for row in table {
|
||||
write!(f, "[").unwrap();
|
||||
for v in row {
|
||||
write!(f, "{v}, ").unwrap();
|
||||
}
|
||||
writeln!(f, "],").unwrap();
|
||||
}
|
||||
writeln!(f, "];").unwrap();
|
||||
}
|
||||
|
||||
fn write_mul_table(f: &mut File, table: &[[u8; FIELD_SIZE]; FIELD_SIZE]) {
|
||||
let rows = table.len();
|
||||
let cols = table[0].len();
|
||||
write!(f, "pub static MUL_TABLE: [[u8; {cols}]; {rows}] = [").unwrap();
|
||||
for row in table {
|
||||
write!(f, "[").unwrap();
|
||||
for v in row {
|
||||
write!(f, "{v}, ").unwrap();
|
||||
}
|
||||
writeln!(f, "],").unwrap();
|
||||
}
|
||||
writeln!(f, "];").unwrap();
|
||||
}
|
||||
|
||||
fn write_gfni_table(f: &mut File, table: &[u64; FIELD_SIZE]) {
|
||||
write!(f, "pub static GFNI_TABLE: [u64; {}] = [", FIELD_SIZE).unwrap();
|
||||
for v in table {
|
||||
write!(f, "0x{v:016X}, ").unwrap();
|
||||
}
|
||||
writeln!(f, "];").unwrap();
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let log_table = gen_log_table(GENERATING_POLYNOMIAL);
|
||||
let exp_table = gen_exp_table(&log_table);
|
||||
let mul_table = gen_mul_table(&log_table, &exp_table);
|
||||
let (mul_table_low, mul_table_high) = gen_mul_table_half(&log_table, &exp_table);
|
||||
let gfni_table = gen_gfni_table(&log_table, &exp_table);
|
||||
|
||||
let out_dir = env::var("OUT_DIR").unwrap();
|
||||
let dest_path = Path::new(&out_dir).join("tables.rs");
|
||||
let mut f = File::create(&dest_path).unwrap();
|
||||
|
||||
write_1d_table(&mut f, &log_table, "LOG_TABLE");
|
||||
write_1d_table(&mut f, &exp_table, "EXP_TABLE");
|
||||
write_mul_table(&mut f, &mul_table);
|
||||
write_2d_table(&mut f, &mul_table_low, "MUL_TABLE_LOW");
|
||||
write_2d_table(&mut f, &mul_table_high, "MUL_TABLE_HIGH");
|
||||
write_gfni_table(&mut f, &gfni_table);
|
||||
}
|
||||
+61
@@ -0,0 +1,61 @@
|
||||
use core::fmt;
|
||||
|
||||
/// Error kinds exported by this crate.
///
/// The type is a plain fieldless enum, so it is `Copy` and supports full
/// equality (`Eq`) and hashing in addition to `PartialEq`. Its `Display`
/// output is a short English phrase for each variant.
#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
pub enum Error {
    TooFewShards,
    TooManyShards,
    TooFewDataShards,
    TooManyDataShards,
    TooFewParityShards,
    TooManyParityShards,
    TooFewBufferShards,
    TooManyBufferShards,
    IncorrectShardSize,
    TooFewShardsPresent,
    EmptyShard,
    InvalidIndex,
    InvalidParityMatrix,
    SingularMatrix,
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
Error::TooFewShards => write!(f, "Too few shards"),
|
||||
Error::TooManyShards => write!(f, "Too many shards"),
|
||||
Error::TooFewDataShards => write!(f, "Too few data shards"),
|
||||
Error::TooManyDataShards => write!(f, "Too many data shards"),
|
||||
Error::TooFewParityShards => write!(f, "Too few parity shards"),
|
||||
Error::TooManyParityShards => write!(f, "Too many parity shards"),
|
||||
Error::TooFewBufferShards => write!(f, "Too few buffer shards"),
|
||||
Error::TooManyBufferShards => write!(f, "Too many buffer shards"),
|
||||
Error::IncorrectShardSize => write!(f, "Incorrect shard size"),
|
||||
Error::TooFewShardsPresent => write!(f, "Too few shards present for reconstruction"),
|
||||
Error::EmptyShard => write!(f, "Empty shard"),
|
||||
Error::InvalidIndex => write!(f, "Invalid index"),
|
||||
Error::InvalidParityMatrix => write!(f, "Invalid parity matrix"),
|
||||
Error::SingularMatrix => write!(f, "Singular matrix"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Marker impl: `Error` uses the trait's default methods, so it reports no `source`.
impl std::error::Error for Error {}
|
||||
|
||||
/// Second error type exported by this crate, alongside [`Error`].
///
/// NOTE(review): the name suggests this is the error type of the incremental
/// "shard-by-shard" encoder — confirm against `reed_solomon.rs`.
#[derive(PartialEq, Debug, Clone, Copy)]
pub enum SBSError {
    TooManyCalls,
    LeftoverShards,
    /// Wraps an [`Error`]; `Display` prints the wrapped error's message unchanged.
    RSError(Error),
}
|
||||
|
||||
impl fmt::Display for SBSError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
SBSError::TooManyCalls => write!(f, "Too many calls"),
|
||||
SBSError::LeftoverShards => write!(f, "Leftover shards"),
|
||||
SBSError::RSError(e) => write!(f, "{e}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Marker impl with default methods: no `source` is reported, and the wrapped
// `Error` of `RSError` is surfaced through `Display` instead.
impl std::error::Error for SBSError {}
|
||||
+636
@@ -0,0 +1,636 @@
|
||||
include!(concat!(env!("OUT_DIR"), "/tables.rs"));
|
||||
|
||||
/// Add two GF(2^8) elements.
///
/// Addition in a field of characteristic 2 is bitwise XOR, so this is also
/// subtraction. The function is `const` so it can be used in constant contexts.
#[inline(always)]
pub const fn add(a: u8, b: u8) -> u8 {
    a ^ b
}
|
||||
|
||||
/// Multiply two GF(2^8) elements using lookup table.
|
||||
#[inline(always)]
|
||||
pub fn mul(a: u8, b: u8) -> u8 {
|
||||
MUL_TABLE[a as usize][b as usize]
|
||||
}
|
||||
|
||||
/// Divide a by b in GF(2^8). Panics if b is 0.
|
||||
#[inline(always)]
|
||||
pub fn div(a: u8, b: u8) -> u8 {
|
||||
if a == 0 {
|
||||
return 0;
|
||||
}
|
||||
assert!(b != 0, "Division by zero in GF(2^8)");
|
||||
let log_a = LOG_TABLE[a as usize] as isize;
|
||||
let log_b = LOG_TABLE[b as usize] as isize;
|
||||
let mut log_result = log_a - log_b;
|
||||
if log_result < 0 {
|
||||
log_result += 255;
|
||||
}
|
||||
EXP_TABLE[log_result as usize]
|
||||
}
|
||||
|
||||
/// Compute a^n in GF(2^8).
|
||||
#[inline(always)]
|
||||
pub fn exp(a: u8, n: usize) -> u8 {
|
||||
if n == 0 {
|
||||
return 1;
|
||||
}
|
||||
if a == 0 {
|
||||
return 0;
|
||||
}
|
||||
let log_a = LOG_TABLE[a as usize] as usize;
|
||||
let log_result = log_a * (n % 255) % 255;
|
||||
EXP_TABLE[log_result]
|
||||
}
|
||||
|
||||
/// Multiply each element of `input` by `c` and write to `out`.
|
||||
///
|
||||
/// Uses SIMD acceleration when available:
|
||||
/// - GFNI + AVX2 (best: single-instruction GF multiply on 32 bytes)
|
||||
/// - AVX2 VPSHUFB (split-table nibble lookup on 32 bytes)
|
||||
/// - GFNI + SSE (single-instruction GF multiply on 16 bytes)
|
||||
/// - SSSE3 VPSHUFB (split-table nibble lookup on 16 bytes)
|
||||
/// - Scalar fallback
|
||||
#[inline]
|
||||
pub fn mul_slice(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
assert_eq!(input.len(), out.len());
|
||||
if input.is_empty() || c == 0 {
|
||||
out.iter_mut().for_each(|o| *o = 0);
|
||||
return;
|
||||
}
|
||||
if c == 1 {
|
||||
out.copy_from_slice(input);
|
||||
return;
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx2") {
|
||||
unsafe {
|
||||
mul_slice_gfni_avx2(c, input, out);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if is_x86_feature_detected!("avx2") {
|
||||
unsafe {
|
||||
mul_slice_avx2(c, input, out);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if is_x86_feature_detected!("gfni") {
|
||||
unsafe {
|
||||
mul_slice_gfni_sse(c, input, out);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if is_x86_feature_detected!("ssse3") {
|
||||
unsafe {
|
||||
mul_slice_ssse3(c, input, out);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
mul_slice_scalar(c, input, out);
|
||||
}
|
||||
|
||||
/// Multiply each element of `input` by `c` and XOR into `out`.
|
||||
///
|
||||
/// Uses SIMD acceleration when available (same priority as `mul_slice`).
|
||||
#[inline]
|
||||
pub fn mul_slice_xor(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
assert_eq!(input.len(), out.len());
|
||||
if input.is_empty() || c == 0 {
|
||||
return;
|
||||
}
|
||||
if c == 1 {
|
||||
for (o, i) in out.iter_mut().zip(input.iter()) {
|
||||
*o ^= *i;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx2") {
|
||||
unsafe {
|
||||
mul_slice_xor_gfni_avx2(c, input, out);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if is_x86_feature_detected!("avx2") {
|
||||
unsafe {
|
||||
mul_slice_xor_avx2(c, input, out);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if is_x86_feature_detected!("gfni") {
|
||||
unsafe {
|
||||
mul_slice_xor_gfni_sse(c, input, out);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if is_x86_feature_detected!("ssse3") {
|
||||
unsafe {
|
||||
mul_slice_xor_ssse3(c, input, out);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
mul_slice_xor_scalar(c, input, out);
|
||||
}
|
||||
|
||||
/// Function pointer type for bulk GF(2^8) operations.
///
/// Arguments are, in order: the constant multiplier, the input bytes and the
/// output bytes. Both `mul_slice`-style (overwrite) and `mul_slice_xor`-style
/// (accumulate) kernels share this signature.
pub type MulSliceFn = fn(u8, &[u8], &mut [u8]);
|
||||
|
||||
/// Pair of (mul_slice, mul_slice_xor) function pointers for the best available SIMD path.
|
||||
///
|
||||
/// Unlike `mul_slice`/`mul_slice_xor`, these skip runtime feature detection on every call.
|
||||
/// The caller checks once and stores the result.
|
||||
///
|
||||
/// Note: These raw dispatch functions do NOT handle the c==0 or c==1 special cases.
|
||||
/// The caller must handle those before calling through the function pointer.
|
||||
pub fn detect_mul_slice() -> (MulSliceFn, MulSliceFn) {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx2") {
|
||||
return (
|
||||
wrap_mul_slice_gfni_avx2 as MulSliceFn,
|
||||
wrap_mul_slice_xor_gfni_avx2 as MulSliceFn,
|
||||
);
|
||||
}
|
||||
if is_x86_feature_detected!("avx2") {
|
||||
return (
|
||||
wrap_mul_slice_avx2 as MulSliceFn,
|
||||
wrap_mul_slice_xor_avx2 as MulSliceFn,
|
||||
);
|
||||
}
|
||||
if is_x86_feature_detected!("gfni") {
|
||||
return (
|
||||
wrap_mul_slice_gfni_sse as MulSliceFn,
|
||||
wrap_mul_slice_xor_gfni_sse as MulSliceFn,
|
||||
);
|
||||
}
|
||||
if is_x86_feature_detected!("ssse3") {
|
||||
return (
|
||||
wrap_mul_slice_ssse3 as MulSliceFn,
|
||||
wrap_mul_slice_xor_ssse3 as MulSliceFn,
|
||||
);
|
||||
}
|
||||
}
|
||||
(
|
||||
mul_slice_scalar as MulSliceFn,
|
||||
mul_slice_xor_scalar as MulSliceFn,
|
||||
)
|
||||
}
|
||||
|
||||
// Safe wrappers for SIMD functions (used as function pointer targets)
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn wrap_mul_slice_gfni_avx2(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
unsafe { mul_slice_gfni_avx2(c, input, out) }
|
||||
}
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn wrap_mul_slice_xor_gfni_avx2(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
unsafe { mul_slice_xor_gfni_avx2(c, input, out) }
|
||||
}
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn wrap_mul_slice_avx2(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
unsafe { mul_slice_avx2(c, input, out) }
|
||||
}
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn wrap_mul_slice_xor_avx2(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
unsafe { mul_slice_xor_avx2(c, input, out) }
|
||||
}
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn wrap_mul_slice_gfni_sse(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
unsafe { mul_slice_gfni_sse(c, input, out) }
|
||||
}
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn wrap_mul_slice_xor_gfni_sse(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
unsafe { mul_slice_xor_gfni_sse(c, input, out) }
|
||||
}
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn wrap_mul_slice_ssse3(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
unsafe { mul_slice_ssse3(c, input, out) }
|
||||
}
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn wrap_mul_slice_xor_ssse3(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
unsafe { mul_slice_xor_ssse3(c, input, out) }
|
||||
}
|
||||
|
||||
// ── Scalar fallback ──────────────────────────────────────────────────────
|
||||
|
||||
fn mul_slice_scalar(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
let mt = &MUL_TABLE[c as usize];
|
||||
for (o, &i) in out.iter_mut().zip(input.iter()) {
|
||||
*o = mt[i as usize];
|
||||
}
|
||||
}
|
||||
|
||||
fn mul_slice_xor_scalar(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
let mt = &MUL_TABLE[c as usize];
|
||||
for (o, &i) in out.iter_mut().zip(input.iter()) {
|
||||
*o ^= mt[i as usize];
|
||||
}
|
||||
}
|
||||
|
||||
// ── x86_64 SIMD implementations ─────────────────────────────────────────
|
||||
|
||||
// ── GFNI + AVX2 (best path: 32 bytes per vgf2p8affineqb) ──────────────
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[target_feature(enable = "gfni,avx2")]
|
||||
unsafe fn mul_slice_gfni_avx2(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
use core::arch::x86_64::*;
|
||||
|
||||
let matrix = GFNI_TABLE[c as usize] as i64;
|
||||
let mat_vec = _mm256_set1_epi64x(matrix);
|
||||
|
||||
let len = input.len();
|
||||
let mut i = 0;
|
||||
|
||||
while i + 32 <= len {
|
||||
let data = _mm256_loadu_si256(input.as_ptr().add(i) as *const _);
|
||||
let result = _mm256_gf2p8affine_epi64_epi8(data, mat_vec, 0);
|
||||
_mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut _, result);
|
||||
i += 32;
|
||||
}
|
||||
|
||||
let mt = &MUL_TABLE[c as usize];
|
||||
while i < len {
|
||||
*out.get_unchecked_mut(i) = mt[*input.get_unchecked(i) as usize];
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[target_feature(enable = "gfni,avx2")]
|
||||
unsafe fn mul_slice_xor_gfni_avx2(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
use core::arch::x86_64::*;
|
||||
|
||||
let matrix = GFNI_TABLE[c as usize] as i64;
|
||||
let mat_vec = _mm256_set1_epi64x(matrix);
|
||||
|
||||
let len = input.len();
|
||||
let mut i = 0;
|
||||
|
||||
while i + 32 <= len {
|
||||
let data = _mm256_loadu_si256(input.as_ptr().add(i) as *const _);
|
||||
let existing = _mm256_loadu_si256(out.as_ptr().add(i) as *const _);
|
||||
let mul_result = _mm256_gf2p8affine_epi64_epi8(data, mat_vec, 0);
|
||||
let result = _mm256_xor_si256(mul_result, existing);
|
||||
_mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut _, result);
|
||||
i += 32;
|
||||
}
|
||||
|
||||
let mt = &MUL_TABLE[c as usize];
|
||||
while i < len {
|
||||
*out.get_unchecked_mut(i) ^= mt[*input.get_unchecked(i) as usize];
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// ── GFNI + SSE (16 bytes per vgf2p8affineqb) ──────────────────────────
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[target_feature(enable = "gfni")]
|
||||
unsafe fn mul_slice_gfni_sse(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
use core::arch::x86_64::*;
|
||||
|
||||
let matrix = GFNI_TABLE[c as usize] as i64;
|
||||
let mat_vec = _mm_set1_epi64x(matrix);
|
||||
|
||||
let len = input.len();
|
||||
let mut i = 0;
|
||||
|
||||
while i + 16 <= len {
|
||||
let data = _mm_loadu_si128(input.as_ptr().add(i) as *const _);
|
||||
let result = _mm_gf2p8affine_epi64_epi8(data, mat_vec, 0);
|
||||
_mm_storeu_si128(out.as_mut_ptr().add(i) as *mut _, result);
|
||||
i += 16;
|
||||
}
|
||||
|
||||
let mt = &MUL_TABLE[c as usize];
|
||||
while i < len {
|
||||
*out.get_unchecked_mut(i) = mt[*input.get_unchecked(i) as usize];
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[target_feature(enable = "gfni")]
|
||||
unsafe fn mul_slice_xor_gfni_sse(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
use core::arch::x86_64::*;
|
||||
|
||||
let matrix = GFNI_TABLE[c as usize] as i64;
|
||||
let mat_vec = _mm_set1_epi64x(matrix);
|
||||
|
||||
let len = input.len();
|
||||
let mut i = 0;
|
||||
|
||||
while i + 16 <= len {
|
||||
let data = _mm_loadu_si128(input.as_ptr().add(i) as *const _);
|
||||
let existing = _mm_loadu_si128(out.as_ptr().add(i) as *const _);
|
||||
let mul_result = _mm_gf2p8affine_epi64_epi8(data, mat_vec, 0);
|
||||
let result = _mm_xor_si128(mul_result, existing);
|
||||
_mm_storeu_si128(out.as_mut_ptr().add(i) as *mut _, result);
|
||||
i += 16;
|
||||
}
|
||||
|
||||
let mt = &MUL_TABLE[c as usize];
|
||||
while i < len {
|
||||
*out.get_unchecked_mut(i) ^= mt[*input.get_unchecked(i) as usize];
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// ── AVX2 VPSHUFB (32 bytes, split-table nibble lookup) ─────────────────
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[target_feature(enable = "avx2")]
|
||||
unsafe fn mul_slice_avx2(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
use core::arch::x86_64::*;
|
||||
|
||||
let low = &MUL_TABLE_LOW[c as usize];
|
||||
let high = &MUL_TABLE_HIGH[c as usize];
|
||||
|
||||
// Broadcast the 16-byte low/high tables to 256-bit registers by duplicating
|
||||
let low_vec = _mm256_broadcastsi128_si256(_mm_loadu_si128(low.as_ptr() as *const _));
|
||||
let high_vec = _mm256_broadcastsi128_si256(_mm_loadu_si128(high.as_ptr() as *const _));
|
||||
let mask = _mm256_set1_epi8(0x0F);
|
||||
|
||||
let len = input.len();
|
||||
let mut i = 0;
|
||||
|
||||
// Process 32 bytes at a time
|
||||
while i + 32 <= len {
|
||||
let data = _mm256_loadu_si256(input.as_ptr().add(i) as *const _);
|
||||
let lo_nibble = _mm256_and_si256(data, mask);
|
||||
let hi_nibble = _mm256_and_si256(_mm256_srli_epi64(data, 4), mask);
|
||||
let lo_result = _mm256_shuffle_epi8(low_vec, lo_nibble);
|
||||
let hi_result = _mm256_shuffle_epi8(high_vec, hi_nibble);
|
||||
let result = _mm256_xor_si256(lo_result, hi_result);
|
||||
_mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut _, result);
|
||||
i += 32;
|
||||
}
|
||||
|
||||
// Handle remaining bytes with scalar
|
||||
let mt = &MUL_TABLE[c as usize];
|
||||
while i < len {
|
||||
*out.get_unchecked_mut(i) = mt[*input.get_unchecked(i) as usize];
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[target_feature(enable = "avx2")]
|
||||
unsafe fn mul_slice_xor_avx2(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
use core::arch::x86_64::*;
|
||||
|
||||
let low = &MUL_TABLE_LOW[c as usize];
|
||||
let high = &MUL_TABLE_HIGH[c as usize];
|
||||
|
||||
let low_vec = _mm256_broadcastsi128_si256(_mm_loadu_si128(low.as_ptr() as *const _));
|
||||
let high_vec = _mm256_broadcastsi128_si256(_mm_loadu_si128(high.as_ptr() as *const _));
|
||||
let mask = _mm256_set1_epi8(0x0F);
|
||||
|
||||
let len = input.len();
|
||||
let mut i = 0;
|
||||
|
||||
while i + 32 <= len {
|
||||
let data = _mm256_loadu_si256(input.as_ptr().add(i) as *const _);
|
||||
let existing = _mm256_loadu_si256(out.as_ptr().add(i) as *const _);
|
||||
let lo_nibble = _mm256_and_si256(data, mask);
|
||||
let hi_nibble = _mm256_and_si256(_mm256_srli_epi64(data, 4), mask);
|
||||
let lo_result = _mm256_shuffle_epi8(low_vec, lo_nibble);
|
||||
let hi_result = _mm256_shuffle_epi8(high_vec, hi_nibble);
|
||||
let result = _mm256_xor_si256(_mm256_xor_si256(lo_result, hi_result), existing);
|
||||
_mm256_storeu_si256(out.as_mut_ptr().add(i) as *mut _, result);
|
||||
i += 32;
|
||||
}
|
||||
|
||||
let mt = &MUL_TABLE[c as usize];
|
||||
while i < len {
|
||||
*out.get_unchecked_mut(i) ^= mt[*input.get_unchecked(i) as usize];
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn mul_slice_ssse3(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
use core::arch::x86_64::*;
|
||||
|
||||
let low = &MUL_TABLE_LOW[c as usize];
|
||||
let high = &MUL_TABLE_HIGH[c as usize];
|
||||
|
||||
let low_vec = _mm_loadu_si128(low.as_ptr() as *const _);
|
||||
let high_vec = _mm_loadu_si128(high.as_ptr() as *const _);
|
||||
let mask = _mm_set1_epi8(0x0F);
|
||||
|
||||
let len = input.len();
|
||||
let mut i = 0;
|
||||
|
||||
while i + 16 <= len {
|
||||
let data = _mm_loadu_si128(input.as_ptr().add(i) as *const _);
|
||||
let lo_nibble = _mm_and_si128(data, mask);
|
||||
let hi_nibble = _mm_and_si128(_mm_srli_epi64(data, 4), mask);
|
||||
let lo_result = _mm_shuffle_epi8(low_vec, lo_nibble);
|
||||
let hi_result = _mm_shuffle_epi8(high_vec, hi_nibble);
|
||||
let result = _mm_xor_si128(lo_result, hi_result);
|
||||
_mm_storeu_si128(out.as_mut_ptr().add(i) as *mut _, result);
|
||||
i += 16;
|
||||
}
|
||||
|
||||
let mt = &MUL_TABLE[c as usize];
|
||||
while i < len {
|
||||
*out.get_unchecked_mut(i) = mt[*input.get_unchecked(i) as usize];
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn mul_slice_xor_ssse3(c: u8, input: &[u8], out: &mut [u8]) {
|
||||
use core::arch::x86_64::*;
|
||||
|
||||
let low = &MUL_TABLE_LOW[c as usize];
|
||||
let high = &MUL_TABLE_HIGH[c as usize];
|
||||
|
||||
let low_vec = _mm_loadu_si128(low.as_ptr() as *const _);
|
||||
let high_vec = _mm_loadu_si128(high.as_ptr() as *const _);
|
||||
let mask = _mm_set1_epi8(0x0F);
|
||||
|
||||
let len = input.len();
|
||||
let mut i = 0;
|
||||
|
||||
while i + 16 <= len {
|
||||
let data = _mm_loadu_si128(input.as_ptr().add(i) as *const _);
|
||||
let existing = _mm_loadu_si128(out.as_ptr().add(i) as *const _);
|
||||
let lo_nibble = _mm_and_si128(data, mask);
|
||||
let hi_nibble = _mm_and_si128(_mm_srli_epi64(data, 4), mask);
|
||||
let lo_result = _mm_shuffle_epi8(low_vec, lo_nibble);
|
||||
let hi_result = _mm_shuffle_epi8(high_vec, hi_nibble);
|
||||
let result = _mm_xor_si128(_mm_xor_si128(lo_result, hi_result), existing);
|
||||
_mm_storeu_si128(out.as_mut_ptr().add(i) as *mut _, result);
|
||||
i += 16;
|
||||
}
|
||||
|
||||
let mt = &MUL_TABLE[c as usize];
|
||||
while i < len {
|
||||
*out.get_unchecked_mut(i) ^= mt[*input.get_unchecked(i) as usize];
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Software model of the GFNI affine transform with a zero constant term:
    /// result bit `i` is the parity of `x AND byte[7 - i]` of the 64-bit matrix.
    fn affine_emulated(matrix: u64, x: u8) -> u8 {
        (0..8u32).fold(0u8, |acc, bit| {
            let row_byte = ((matrix >> ((7 - bit) * 8)) & 0xFF) as u8;
            let parity = (row_byte & x).count_ones() % 2;
            acc | ((parity as u8) << bit)
        })
    }

    #[test]
    fn test_gfni_table() {
        // Every GFNI matrix must reproduce the corresponding MUL_TABLE row.
        for c in 0usize..256 {
            let matrix = GFNI_TABLE[c];
            for b in 0usize..256 {
                let expected = MUL_TABLE[c][b];
                let result = affine_emulated(matrix, b as u8);
                assert_eq!(
                    result, expected,
                    "GFNI table mismatch: c={c}, b={b}, got={result}, expected={expected}"
                );
            }
        }
    }

    #[test]
    fn test_add() {
        let cases = [
            (0u8, 0u8, 0u8),
            (1, 0, 1),
            (0, 1, 1),
            (1, 1, 0),
            (0xFF, 0xFF, 0),
            (0xAA, 0x55, 0xFF),
        ];
        for (a, b, sum) in cases {
            assert_eq!(add(a, b), sum);
        }
    }

    #[test]
    fn test_mul() {
        assert_eq!(mul(0, 0), 0);
        assert_eq!(mul(1, 0), 0);
        assert_eq!(mul(0, 1), 0);
        assert_eq!(mul(1, 1), 1);
        for a in 0u8..=255 {
            // 1 is the multiplicative identity on both sides.
            assert_eq!(mul(a, 1), a);
            assert_eq!(mul(1, a), a);
        }
        for a in 0u8..=255 {
            // 0 annihilates.
            assert_eq!(mul(a, 0), 0);
        }
    }

    #[test]
    fn test_div() {
        // a / 1 = a
        for a in 0u8..=255 {
            assert_eq!(div(a, 1), a);
        }
        // a / a = 1 for non-zero a
        for a in 1u8..=255 {
            assert_eq!(div(a, a), 1);
        }
        // Division undoes multiplication.
        for a in 1u8..=255 {
            for b in 1u8..=255 {
                assert_eq!(div(mul(a, b), b), a);
            }
        }
    }

    #[test]
    fn test_exp() {
        assert_eq!(exp(0, 0), 1);
        assert_eq!(exp(1, 0), 1);
        assert_eq!(exp(5, 0), 1);
        assert_eq!(exp(0, 1), 0);
        assert_eq!(exp(0, 100), 0);
        for a in 0u8..=255 {
            // a^1 = a
            assert_eq!(exp(a, 1), a);
        }
        for a in 0u8..=255 {
            // a^2 = a * a
            assert_eq!(exp(a, 2), mul(a, a));
        }
    }

    #[test]
    fn test_mul_slice_basic() {
        let input = [1u8, 2, 3, 4, 5, 6, 7, 8];
        let mut out = [0u8; 8];
        mul_slice(3, &input, &mut out);
        for (&got, &src) in out.iter().zip(input.iter()) {
            assert_eq!(got, mul(3, src));
        }
    }

    #[test]
    fn test_mul_slice_xor_basic() {
        let input = [1u8, 2, 3, 4, 5, 6, 7, 8];
        let original = [10u8; 8];
        let mut out = original;
        mul_slice_xor(3, &input, &mut out);
        for ((&got, &before), &src) in out.iter().zip(original.iter()).zip(input.iter()) {
            assert_eq!(got, before ^ mul(3, src));
        }
    }

    #[test]
    fn test_mul_slice_large() {
        // 256 bytes is long enough to run the vectorised loops.
        let input: Vec<u8> = (0..256).map(|i| i as u8).collect();
        let mut out = vec![0u8; 256];
        let mut expected = vec![0u8; 256];

        for c in [2u8, 7, 42, 128, 255] {
            mul_slice_scalar(c, &input, &mut expected);
            mul_slice(c, &input, &mut out);
            assert_eq!(out, expected, "mul_slice mismatch for c={c}");
        }
    }

    #[test]
    fn test_mul_slice_xor_large() {
        let input: Vec<u8> = (0..256).map(|i| i as u8).collect();

        for c in [2u8, 7, 42, 128, 255] {
            let mut out_expected = vec![0xABu8; 256];
            let mut out_simd = out_expected.clone();
            mul_slice_xor_scalar(c, &input, &mut out_expected);
            mul_slice_xor(c, &input, &mut out_simd);
            assert_eq!(out_simd, out_expected, "mul_slice_xor mismatch for c={c}");
        }
    }

    #[test]
    fn test_mul_slice_unaligned_sizes() {
        // Lengths on both sides of the 16- and 32-byte vector widths.
        for size in [1, 7, 15, 16, 17, 31, 32, 33, 63, 64, 65, 100] {
            let input: Vec<u8> = (0..size).map(|i| i as u8).collect();
            let mut out = vec![0u8; size];
            let mut expected = vec![0u8; size];

            mul_slice_scalar(42, &input, &mut expected);
            mul_slice(42, &input, &mut out);
            assert_eq!(out, expected, "mul_slice mismatch for size={size}");
        }
    }
}
|
||||
+73
@@ -0,0 +1,73 @@
|
||||
//! A pure Rust Reed-Solomon erasure coding library with runtime SIMD acceleration.
|
||||
//!
|
||||
//! # Features
|
||||
//!
|
||||
//! - **Pure Rust** — No C/C++ dependencies or FFI. Everything is implemented in safe Rust
|
||||
//! (with targeted `unsafe` for SIMD intrinsics).
|
||||
//! - **Runtime SIMD detection** — Automatically uses the fastest available instruction set
|
||||
//! via `std::is_x86_feature_detected!`. A single binary works on all x86_64 systems.
|
||||
//! - **GF(2^8)** — Operates over the Galois field GF(2^8) with generating polynomial 29 (0x1D),
|
||||
//! compatible with the Moonlight streaming protocol.
|
||||
//! - **Shard-by-shard encoding** — Incremental encoding via `ShardByShard` for streaming use cases.
|
||||
//! - **Reconstruction** — Reconstruct missing data and/or parity shards from any sufficient subset.
|
||||
//!
|
||||
//! # SIMD Acceleration
|
||||
//!
|
||||
//! On x86_64, the library automatically detects CPU features at runtime and uses
|
||||
//! the best available instruction set:
|
||||
//!
|
||||
//! - **GFNI + AVX2** — Single-instruction GF multiply on 32 bytes (Intel Alder Lake+, AMD Zen 4+)
|
||||
//! - **AVX2** — VPSHUFB split-table nibble lookup on 32 bytes
|
||||
//! - **GFNI + SSE** — Single-instruction GF multiply on 16 bytes
|
||||
//! - **SSSE3** — PSHUFB split-table nibble lookup on 16 bytes
|
||||
//! - **Scalar** — Lookup table fallback
|
||||
//!
|
||||
//! # Parallel Encoding
|
||||
//!
|
||||
//! Enable the `parallel` feature for optional rayon-based parallel encoding:
|
||||
//!
|
||||
//! ```toml
|
||||
//! fec-rs = { version = "0.1", features = ["parallel"] }
|
||||
//! ```
|
||||
//!
|
||||
//! When enabled, large encode workloads automatically distribute parity shard
|
||||
//! computation across threads. Small workloads use the sequential path to avoid
|
||||
//! overhead.
|
||||
//!
|
||||
//! # Usage
|
||||
//!
|
||||
//! ```
|
||||
//! use fec_rs::ReedSolomon;
|
||||
//!
|
||||
//! let rs = ReedSolomon::new(4, 2).unwrap();
|
||||
//!
|
||||
//! let mut shards: Vec<Vec<u8>> = vec![
|
||||
//! vec![0, 1, 2, 3],
|
||||
//! vec![4, 5, 6, 7],
|
||||
//! vec![8, 9, 10, 11],
|
||||
//! vec![12, 13, 14, 15],
|
||||
//! vec![0, 0, 0, 0], // parity shard 1
|
||||
//! vec![0, 0, 0, 0], // parity shard 2
|
||||
//! ];
|
||||
//!
|
||||
//! // Encode parity
|
||||
//! rs.encode(&mut shards).unwrap();
|
||||
//!
|
||||
//! // Verify
|
||||
//! assert!(rs.verify(&shards).unwrap());
|
||||
//!
|
||||
//! // Simulate loss of shard 0
|
||||
//! let mut recovery: Vec<Option<Vec<u8>>> = shards.into_iter().map(Some).collect();
|
||||
//! recovery[0] = None;
|
||||
//!
|
||||
//! // Reconstruct
|
||||
//! rs.reconstruct(&mut recovery).unwrap();
|
||||
//! ```
|
||||
|
||||
mod errors;
|
||||
pub mod galois;
|
||||
mod matrix;
|
||||
mod reed_solomon;
|
||||
|
||||
pub use errors::{Error, SBSError};
|
||||
pub use reed_solomon::{ReconstructShard, ReedSolomon, ShardByShard};
|
||||
+251
@@ -0,0 +1,251 @@
|
||||
use crate::galois;
|
||||
|
||||
/// Dense matrix of GF(2^8) elements.
///
/// Elements are stored row-major in `data`: entry `(r, c)` lives at index
/// `r * col_count + c`. Equality is exact element-wise equality, so the type
/// implements `Eq` as well as `PartialEq`.
#[derive(PartialEq, Eq, Debug, Clone)]
pub struct Matrix {
    /// Number of rows.
    pub row_count: usize,
    /// Number of columns.
    pub col_count: usize,
    /// Row-major element storage.
    pub data: Vec<u8>,
}
|
||||
|
||||
impl Matrix {
|
||||
pub fn new(rows: usize, cols: usize) -> Self {
|
||||
Self {
|
||||
row_count: rows,
|
||||
col_count: cols,
|
||||
data: vec![0u8; rows * cols],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn identity(size: usize) -> Self {
|
||||
let mut m = Self::new(size, size);
|
||||
for i in 0..size {
|
||||
m.data[i * size + i] = 1;
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
pub fn vandermonde(rows: usize, cols: usize) -> Self {
|
||||
let mut m = Self::new(rows, cols);
|
||||
for r in 0..rows {
|
||||
let r_a = r as u8;
|
||||
for c in 0..cols {
|
||||
m.data[r * cols + c] = galois::exp(r_a, c);
|
||||
}
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn get(&self, r: usize, c: usize) -> u8 {
|
||||
self.data[r * self.col_count + c]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn set(&mut self, r: usize, c: usize, val: u8) {
|
||||
self.data[r * self.col_count + c] = val;
|
||||
}
|
||||
|
||||
pub fn get_row(&self, row: usize) -> &[u8] {
|
||||
let start = row * self.col_count;
|
||||
&self.data[start..start + self.col_count]
|
||||
}
|
||||
|
||||
pub fn sub_matrix(&self, rmin: usize, cmin: usize, rmax: usize, cmax: usize) -> Self {
|
||||
let new_rows = rmax - rmin;
|
||||
let new_cols = cmax - cmin;
|
||||
let mut m = Self::new(new_rows, new_cols);
|
||||
for r in rmin..rmax {
|
||||
for c in cmin..cmax {
|
||||
m.data[(r - rmin) * new_cols + (c - cmin)] = self.get(r, c);
|
||||
}
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
pub fn multiply(&self, rhs: &Matrix) -> Self {
|
||||
assert_eq!(
|
||||
self.col_count, rhs.row_count,
|
||||
"Matrix dimensions incompatible for multiply"
|
||||
);
|
||||
let mut result = Self::new(self.row_count, rhs.col_count);
|
||||
for r in 0..self.row_count {
|
||||
for c in 0..rhs.col_count {
|
||||
let mut val = 0u8;
|
||||
for i in 0..self.col_count {
|
||||
val = galois::add(val, galois::mul(self.get(r, i), rhs.get(i, c)));
|
||||
}
|
||||
result.set(r, c, val);
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn augment(&self, rhs: &Matrix) -> Self {
|
||||
assert_eq!(
|
||||
self.row_count, rhs.row_count,
|
||||
"Matrix row counts must match for augment"
|
||||
);
|
||||
let new_cols = self.col_count + rhs.col_count;
|
||||
let mut m = Self::new(self.row_count, new_cols);
|
||||
for r in 0..self.row_count {
|
||||
for c in 0..self.col_count {
|
||||
m.set(r, c, self.get(r, c));
|
||||
}
|
||||
for c in 0..rhs.col_count {
|
||||
m.set(r, self.col_count + c, rhs.get(r, c));
|
||||
}
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
fn swap_rows(&mut self, r1: usize, r2: usize) {
|
||||
if r1 == r2 {
|
||||
return;
|
||||
}
|
||||
let s1 = r1 * self.col_count;
|
||||
let s2 = r2 * self.col_count;
|
||||
for i in 0..self.col_count {
|
||||
self.data.swap(s1 + i, s2 + i);
|
||||
}
|
||||
}
|
||||
|
||||
fn gaussian_elim(&mut self) -> Result<(), &'static str> {
|
||||
for r in 0..self.row_count {
|
||||
// Pivot search
|
||||
if self.get(r, r) == 0 {
|
||||
for r_below in r + 1..self.row_count {
|
||||
if self.get(r_below, r) != 0 {
|
||||
self.swap_rows(r, r_below);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if self.get(r, r) == 0 {
|
||||
return Err("Singular matrix");
|
||||
}
|
||||
// Scale to 1
|
||||
if self.get(r, r) != 1 {
|
||||
let scale = galois::div(1, self.get(r, r));
|
||||
for c in 0..self.col_count {
|
||||
let val = galois::mul(scale, self.get(r, c));
|
||||
self.set(r, c, val);
|
||||
}
|
||||
}
|
||||
// Eliminate below
|
||||
for r_below in r + 1..self.row_count {
|
||||
if self.get(r_below, r) != 0 {
|
||||
let scale = self.get(r_below, r);
|
||||
for c in 0..self.col_count {
|
||||
let val =
|
||||
galois::add(self.get(r_below, c), galois::mul(scale, self.get(r, c)));
|
||||
self.set(r_below, c, val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Back substitution
|
||||
for d in 0..self.row_count {
|
||||
for r_above in 0..d {
|
||||
if self.get(r_above, d) != 0 {
|
||||
let scale = self.get(r_above, d);
|
||||
for c in 0..self.col_count {
|
||||
let val =
|
||||
galois::add(self.get(r_above, c), galois::mul(scale, self.get(d, c)));
|
||||
self.set(r_above, c, val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn invert(&self) -> Result<Self, &'static str> {
|
||||
assert!(
|
||||
self.row_count == self.col_count,
|
||||
"Cannot invert non-square matrix"
|
||||
);
|
||||
let mut work = self.augment(&Self::identity(self.row_count));
|
||||
work.gaussian_elim()?;
|
||||
Ok(work.sub_matrix(0, self.row_count, self.col_count, self.col_count * 2))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `Matrix` from nested rows; the column count is taken from the first row.
    fn mat(data: Vec<Vec<u8>>) -> Matrix {
        let row_count = data.len();
        let col_count = data[0].len();
        Matrix {
            row_count,
            col_count,
            data: data.into_iter().flatten().collect(),
        }
    }

    /// The 3x3 fixture shared by the inversion and multiplication tests.
    fn sample() -> Matrix {
        mat(vec![
            vec![56, 23, 98],
            vec![3, 100, 200],
            vec![45, 201, 123],
        ])
    }

    #[test]
    fn test_identity() {
        let expected = mat(vec![vec![1, 0, 0], vec![0, 1, 0], vec![0, 0, 1]]);
        assert_eq!(Matrix::identity(3), expected);
    }

    #[test]
    fn test_multiply() {
        let lhs = mat(vec![vec![1, 2], vec![3, 4]]);
        let rhs = mat(vec![vec![5, 6], vec![7, 8]]);
        let expected = mat(vec![vec![11, 22], vec![19, 42]]);
        assert_eq!(lhs.multiply(&rhs), expected);
    }

    #[test]
    fn test_invert() {
        let inv = sample().invert().unwrap();
        let expected = mat(vec![
            vec![175, 133, 33],
            vec![130, 13, 245],
            vec![112, 35, 126],
        ]);
        assert_eq!(inv, expected);
    }

    #[test]
    fn test_invert_identity() {
        let id = Matrix::identity(4);
        assert_eq!(id.invert().unwrap(), id);
    }

    #[test]
    fn test_multiply_identity() {
        let m = sample();
        let id = Matrix::identity(3);
        assert_eq!(m.multiply(&id), m);
        assert_eq!(id.multiply(&m), m);
    }

    #[test]
    fn test_invert_times_original_is_identity() {
        let m = sample();
        let inv = m.invert().unwrap();
        assert_eq!(m.multiply(&inv), Matrix::identity(3));
    }
}
|
||||
+1263
File diff suppressed because it is too large
Load Diff
@@ -8,6 +8,7 @@ use super::VIDEO_PORT;
|
||||
use crate::capture::{self, Capturer, FastSyntheticCapturer};
|
||||
use crate::encode::{self, Codec};
|
||||
use anyhow::{Context, Result};
|
||||
use rand::Rng;
|
||||
use std::net::UdpSocket;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
@@ -82,7 +83,12 @@ fn run(cfg: StreamConfig, running: &AtomicBool) -> Result<()> {
|
||||
cfg.bitrate_kbps as u64 * 1000,
|
||||
)
|
||||
.context("open NVENC for stream")?;
|
||||
let mut pk = VideoPacketizer::new(cfg.packet_size);
|
||||
// FEC overhead percent (Sunshine default 20). Override with LUMEN_FEC_PCT (0 = data-only).
|
||||
let fec_pct: u8 = std::env::var("LUMEN_FEC_PCT")
|
||||
.ok()
|
||||
.and_then(|v| v.parse().ok())
|
||||
.unwrap_or(20);
|
||||
let mut pk = VideoPacketizer::new(cfg.packet_size, fec_pct);
|
||||
|
||||
// Pace at a steady rate (capped at 60fps), re-encoding the last captured frame when the
|
||||
// compositor produced no new one. wlroots only emits frames on damage, so a static or
|
||||
@@ -94,6 +100,13 @@ fn run(cfg: StreamConfig, running: &AtomicBool) -> Result<()> {
|
||||
let mut fps_count: u32 = 0;
|
||||
let mut fps_t = Instant::now();
|
||||
let stream_start = Instant::now();
|
||||
// Test knob: drop this % of outbound packets to exercise FEC recovery (0 = off).
|
||||
let drop_pct: u32 = std::env::var("LUMEN_VIDEO_DROP")
|
||||
.ok()
|
||||
.and_then(|v| v.parse().ok())
|
||||
.unwrap_or(0);
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut dropped: u64 = 0;
|
||||
|
||||
while running.load(Ordering::SeqCst) {
|
||||
let tick = Instant::now();
|
||||
@@ -113,6 +126,11 @@ fn run(cfg: StreamConfig, running: &AtomicBool) -> Result<()> {
|
||||
FrameType::P
|
||||
};
|
||||
for pkt in pk.packetize(&au.data, ft, ts) {
|
||||
// Simulated network loss: build the packet (advances seq) but skip the send.
|
||||
if drop_pct > 0 && rng.gen_range(0..100) < drop_pct {
|
||||
dropped += 1;
|
||||
continue;
|
||||
}
|
||||
if sock.send(&pkt).is_err() {
|
||||
client_gone = true;
|
||||
break;
|
||||
@@ -130,7 +148,7 @@ fn run(cfg: StreamConfig, running: &AtomicBool) -> Result<()> {
|
||||
|
||||
fps_count += 1;
|
||||
if fps_t.elapsed() >= Duration::from_secs(1) {
|
||||
tracing::info!(fps = fps_count, sent_pkts, "video: streaming");
|
||||
tracing::info!(fps = fps_count, sent_pkts, dropped, "video: streaming");
|
||||
fps_count = 0;
|
||||
fps_t = Instant::now();
|
||||
}
|
||||
|
||||
@@ -1,15 +1,21 @@
|
||||
//! GameStream video wire packetization: an encoded access unit → UDP datagrams a stock
|
||||
//! Moonlight client decodes. Each datagram is
|
||||
//! Moonlight client decodes (and recovers under loss). Each datagram is
|
||||
//! `RTP_PACKET(12, big-endian) + reserved[4] + NV_VIDEO_PACKET(16, little-endian) + payload`
|
||||
//! and the frame's bitstream is prefixed with an 8-byte `video_short_frame_header_t`, then
|
||||
//! striped into ≤4 FEC blocks of ≤255 data shards. Byte-exact spec:
|
||||
//! striped into ≤4 FEC blocks of ≤255 shards. Byte-exact spec:
|
||||
//! `docs/research/gamestream-protocol-research.json` (video plane).
|
||||
//!
|
||||
//! P1.3 sends **data shards only** (`fecPercentage = 0`): on a clean LAN the client has
|
||||
//! every data shard and never runs Reed–Solomon recovery, so we get a decodable frame
|
||||
//! without matching Moonlight's `nanors` parity matrix (that interop work is P1.5). Plaintext
|
||||
//! only (encryption negotiated off for now). This lives in lumen-host for fast iteration;
|
||||
//! the wire codec moves into lumen-core (the P1 wire mode) once proven.
|
||||
//! FEC (P1.5): each block carries `m = ⌈k·pct/100⌉` Reed–Solomon parity shards generated by
|
||||
//! `lumen_core::fec::Gf8Coder` (the nanors-compatible Cauchy GF(2⁸) coder). Crucially, RS runs
|
||||
//! over the **whole `blocksize` shard** — Moonlight decodes over `packetSize + 16` bytes from
|
||||
//! the datagram start (`RtpVideoQueue.c`), and rejects a recovered shard whose reconstructed
|
||||
//! `flags` byte isn't valid — so the NV header fields RS must reproduce (streamPacketIndex,
|
||||
//! frameIndex, flags, multiFec*) are written into the data shards **before** encoding, and only
|
||||
//! the transport fields (RTP header/seq/timestamp + fecInfo) are stamped **after**, matching
|
||||
//! Sunshine `stream.cpp`. `pct = 0` falls back to data-shards-only. Plaintext (AES-GCM video
|
||||
//! encryption is negotiated off for now).
|
||||
|
||||
use lumen_core::fec::{ErasureCoder, Gf8Coder};
|
||||
|
||||
/// RTP `header` byte: version 2 (0x80) | extension (0x10) — Moonlight keys on the extension.
|
||||
const RTP_HEADER_BYTE: u8 = 0x80 | 0x10;
|
||||
@@ -28,28 +34,32 @@ pub enum FrameType {
|
||||
P,
|
||||
}
|
||||
|
||||
/// Splits encoded access units into GameStream video datagrams.
|
||||
/// Splits encoded access units into GameStream video datagrams (data + FEC parity shards).
|
||||
pub struct VideoPacketizer {
    /// The negotiated `packetSize` (ANNOUNCE `x-nv-video[0].packetSize`).
    packet_size: usize,
    /// Payload bytes carried by each shard: `blocksize - SHARD_HEADER`, where
    /// `blocksize = packetSize + 16`.
    payload_per_shard: usize,
    /// Requested FEC overhead in percent; 0 means data shards only. What goes on the wire
    /// is the per-block recomputed `(100·m)/k`, so Moonlight derives the same parity count.
    fec_percentage: usize,
    /// Index assigned to the next frame.
    frame_index: u32,
    /// Monotonic per-stream packet counter (source of the RTP sequence number and
    /// `streamPacketIndex`).
    seq: u32,
}
|
||||
|
||||
impl VideoPacketizer {
|
||||
pub fn new(packet_size: usize) -> Self {
|
||||
pub fn new(packet_size: usize, fec_percentage: u8) -> Self {
|
||||
VideoPacketizer {
|
||||
packet_size,
|
||||
payload_per_shard: packet_size + 16 - SHARD_HEADER,
|
||||
fec_percentage: fec_percentage as usize,
|
||||
frame_index: 0,
|
||||
seq: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Packetize one encoded AU into wire datagrams (ready for UDP send).
|
||||
/// Packetize one encoded AU into wire datagrams (data shards + Cauchy RS parity shards).
|
||||
pub fn packetize(
|
||||
&mut self,
|
||||
au: &[u8],
|
||||
@@ -59,6 +69,8 @@ impl VideoPacketizer {
|
||||
let frame_index = self.frame_index;
|
||||
self.frame_index = self.frame_index.wrapping_add(1);
|
||||
let pps = self.payload_per_shard;
|
||||
let blocksize = SHARD_HEADER + pps; // = packet_size + 16
|
||||
let pct = self.fec_percentage;
|
||||
|
||||
// frame payload = 8-byte short frame header + the AU bitstream.
|
||||
let total_len = 8 + au.len();
|
||||
@@ -71,53 +83,120 @@ impl VideoPacketizer {
|
||||
fp.extend_from_slice(au);
|
||||
|
||||
let total_data = total_len.div_ceil(pps).max(1);
|
||||
let n_blocks = total_data
|
||||
.div_ceil(MAX_DATA_SHARDS_PER_BLOCK)
|
||||
.clamp(1, MAX_FEC_BLOCKS);
|
||||
// With parity, cap per-block data so k + m ≤ 255 (the GF(2⁸) ceiling): parity for k
|
||||
// data shards is ⌈k·pct/100⌉, so k ≤ 255·100/(100+pct).
|
||||
let max_data = if pct > 0 {
|
||||
(255 * 100) / (100 + pct)
|
||||
} else {
|
||||
MAX_DATA_SHARDS_PER_BLOCK
|
||||
};
|
||||
let n_blocks = total_data.div_ceil(max_data).clamp(1, MAX_FEC_BLOCKS);
|
||||
let per_block = total_data.div_ceil(n_blocks);
|
||||
|
||||
let mut packets = Vec::with_capacity(total_data);
|
||||
let mut packets = Vec::with_capacity(total_data + total_data * pct / 100 + n_blocks);
|
||||
for b in 0..n_blocks {
|
||||
let first = b * per_block;
|
||||
let last = ((b + 1) * per_block).min(total_data);
|
||||
if first >= last {
|
||||
break;
|
||||
}
|
||||
let block_data_count = last - first;
|
||||
for (fec_index, shard) in (first..last).enumerate() {
|
||||
let start = shard * pps;
|
||||
let end = (start + pps).min(fp.len());
|
||||
let mut payload = vec![0u8; pps]; // last shard zero-padded
|
||||
payload[..end - start].copy_from_slice(&fp[start..end]);
|
||||
let k = last - first;
|
||||
let block_seq_base = self.seq;
|
||||
let multi_fec_blocks = ((b as u8) << 4) | (((n_blocks - 1) as u8) << 6);
|
||||
|
||||
// 1. Build this block's k data-shard datagrams (full `blocksize`), writing the NV
|
||||
// header fields RS must reproduce on recovery (streamPacketIndex, frameIndex,
|
||||
// flags, multiFec*). The RTP header + fecInfo are left zero (stamped post-RS).
|
||||
let mut shards: Vec<Vec<u8>> = Vec::with_capacity(k);
|
||||
for i in 0..k {
|
||||
let global = first + i;
|
||||
let seq = block_seq_base + i as u32;
|
||||
let mut buf = vec![0u8; blocksize];
|
||||
let mut flags = FLAG_PIC;
|
||||
if shard == 0 {
|
||||
if global == 0 {
|
||||
flags |= FLAG_SOF;
|
||||
}
|
||||
if shard == total_data - 1 {
|
||||
if global == total_data - 1 {
|
||||
flags |= FLAG_EOF;
|
||||
}
|
||||
let multi_fec_blocks = ((b as u8) << 4) | (((n_blocks - 1) as u8) << 6);
|
||||
// fecInfo: dataShards<<22 | fecIndex<<12 | fecPercentage<<4 (pct = 0).
|
||||
let fec_info: u32 = ((block_data_count as u32) << 22) | ((fec_index as u32) << 12);
|
||||
let seq = self.seq;
|
||||
self.seq = self.seq.wrapping_add(1);
|
||||
buf[16..20].copy_from_slice(&(seq << 8).to_le_bytes()); // streamPacketIndex
|
||||
buf[20..24].copy_from_slice(&frame_index.to_le_bytes()); // frameIndex
|
||||
buf[24] = flags;
|
||||
buf[26] = MULTI_FEC_FLAGS;
|
||||
buf[27] = multi_fec_blocks;
|
||||
let ps = global * pps;
|
||||
let pe = (ps + pps).min(fp.len());
|
||||
buf[SHARD_HEADER..SHARD_HEADER + (pe - ps)].copy_from_slice(&fp[ps..pe]);
|
||||
shards.push(buf);
|
||||
}
|
||||
|
||||
packets.push(build_packet(
|
||||
// 2. m = ⌈k·pct/100⌉ parity shards over the full datagrams. The wire percentage is
|
||||
// recomputed from m so the client derives the same parity count.
|
||||
let m = if pct > 0 { (k * pct).div_ceil(100) } else { 0 };
|
||||
let wire_pct = if m > 0 { (100 * m) / k } else { 0 };
|
||||
let parity = if m > 0 {
|
||||
Gf8Coder.encode(&shards, m).unwrap_or_default()
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
// 3. Stamp transport headers (RTP + fecInfo) on every shard. We do NOT touch the
|
||||
// flags/streamPacketIndex bytes, so a recovered data shard's RS-reconstructed
|
||||
// NV header stays valid.
|
||||
self.seq = block_seq_base + k as u32;
|
||||
for (i, mut buf) in shards.into_iter().enumerate() {
|
||||
let seq = block_seq_base + i as u32;
|
||||
finalize(
|
||||
&mut buf,
|
||||
seq,
|
||||
timestamp_90k,
|
||||
frame_index,
|
||||
flags,
|
||||
multi_fec_blocks,
|
||||
fec_info,
|
||||
&payload,
|
||||
));
|
||||
fec_info(k, i, wire_pct),
|
||||
);
|
||||
packets.push(buf);
|
||||
}
|
||||
for (j, mut buf) in parity.into_iter().enumerate() {
|
||||
let seq = self.seq;
|
||||
self.seq = self.seq.wrapping_add(1);
|
||||
finalize(
|
||||
&mut buf,
|
||||
seq,
|
||||
timestamp_90k,
|
||||
frame_index,
|
||||
multi_fec_blocks,
|
||||
fec_info(k, k + j, wire_pct),
|
||||
);
|
||||
packets.push(buf);
|
||||
}
|
||||
}
|
||||
packets
|
||||
}
|
||||
}
|
||||
|
||||
/// Packs the `fecInfo` word (written little-endian on the wire):
/// `dataShards<<22 | fecIndex<<12 | fecPercentage<<4`.
///
/// * `k` — number of data shards in the block (10-bit field, bits 22..32).
/// * `fec_index` — this shard's index within the block (10-bit field, bits 12..22).
/// * `pct` — wire FEC percentage (8-bit field, bits 4..12).
///
/// Each value is masked to its field width before shifting. Callers are expected to pass
/// in-range values; the masks only guarantee that an out-of-range one (for example a
/// recomputed percentage above 255) cannot spill into the neighbouring field and corrupt
/// the shard index or shard count.
fn fec_info(k: usize, fec_index: usize, pct: usize) -> u32 {
    const SHARD_FIELD_MASK: u32 = 0x3ff; // dataShards and fecIndex: 10 bits each
    const PCT_FIELD_MASK: u32 = 0xff; // fecPercentage: 8 bits

    let data_shards = (k as u32) & SHARD_FIELD_MASK;
    let index = (fec_index as u32) & SHARD_FIELD_MASK;
    let percentage = (pct as u32) & PCT_FIELD_MASK;

    (data_shards << 22) | (index << 12) | (percentage << 4)
}
|
||||
|
||||
/// Stamp the post-RS transport fields into a shard datagram (in place). Leaves the NV
|
||||
/// `flags`/`streamPacketIndex`/`multiFecFlags` bytes untouched (RS-covered).
|
||||
fn finalize(
|
||||
buf: &mut [u8],
|
||||
seq: u32,
|
||||
ts_90k: u32,
|
||||
frame_index: u32,
|
||||
multi_fec_blocks: u8,
|
||||
fec_info: u32,
|
||||
) {
|
||||
buf[0] = RTP_HEADER_BYTE; // header (version 2 + extension)
|
||||
buf[2..4].copy_from_slice(&(seq as u16).to_be_bytes()); // sequenceNumber (BE)
|
||||
buf[4..8].copy_from_slice(&ts_90k.to_be_bytes()); // timestamp (90 kHz, BE)
|
||||
buf[20..24].copy_from_slice(&frame_index.to_le_bytes()); // frameIndex (re-affirm for parity)
|
||||
buf[27] = multi_fec_blocks; // re-affirm for parity
|
||||
buf[28..32].copy_from_slice(&fec_info.to_le_bytes()); // fecInfo (LE)
|
||||
}
|
||||
|
||||
/// 8-byte `video_short_frame_header_t` (little-endian), prefixed to the AU bitstream.
|
||||
fn short_frame_header(frame_type: FrameType, last_payload_len: u16) -> [u8; 8] {
|
||||
let mut h = [0u8; 8];
|
||||
@@ -132,55 +211,21 @@ fn short_frame_header(frame_type: FrameType, last_payload_len: u16) -> [u8; 8] {
|
||||
h
|
||||
}
|
||||
|
||||
/// Build one wire datagram: RTP(BE) + reserved + NV_VIDEO_PACKET(LE) + payload.
|
||||
fn build_packet(
|
||||
seq: u32,
|
||||
timestamp_90k: u32,
|
||||
frame_index: u32,
|
||||
flags: u8,
|
||||
multi_fec_blocks: u8,
|
||||
fec_info: u32,
|
||||
payload: &[u8],
|
||||
) -> Vec<u8> {
|
||||
let mut p = Vec::with_capacity(SHARD_HEADER + payload.len());
|
||||
// --- RTP_PACKET (12 bytes, big-endian) ---
|
||||
p.push(RTP_HEADER_BYTE); // header
|
||||
p.push(0); // packetType (unused for video)
|
||||
p.extend_from_slice(&(seq as u16).to_be_bytes()); // sequenceNumber
|
||||
p.extend_from_slice(×tamp_90k.to_be_bytes()); // timestamp (90 kHz)
|
||||
p.extend_from_slice(&0u32.to_be_bytes()); // ssrc
|
||||
// --- reserved[4] ---
|
||||
p.extend_from_slice(&[0u8; 4]);
|
||||
// --- NV_VIDEO_PACKET (16 bytes, little-endian) ---
|
||||
p.extend_from_slice(&(seq << 8).to_le_bytes()); // streamPacketIndex (low byte 0)
|
||||
p.extend_from_slice(&frame_index.to_le_bytes()); // frameIndex
|
||||
p.push(flags);
|
||||
p.push(0); // extraFlags
|
||||
p.push(MULTI_FEC_FLAGS);
|
||||
p.push(multi_fec_blocks);
|
||||
p.extend_from_slice(&fec_info.to_le_bytes()); // fecInfo
|
||||
// --- payload ---
|
||||
p.extend_from_slice(payload);
|
||||
p
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn single_block_layout() {
|
||||
let mut pk = VideoPacketizer::new(1392); // payload_per_shard = 1392+16-32 = 1376
|
||||
let mut pk = VideoPacketizer::new(1392, 0); // data-only; pps = 1392+16-32 = 1376
|
||||
assert_eq!(pk.payload_per_shard, 1376);
|
||||
let au = vec![0xABu8; 4000]; // 8+4000 = 4008 → ceil(4008/1376) = 3 data shards
|
||||
let pkts = pk.packetize(&au, FrameType::Idr, 90_000);
|
||||
assert_eq!(pkts.len(), 3);
|
||||
// Every datagram is SHARD_HEADER + payload_per_shard.
|
||||
for p in &pkts {
|
||||
assert_eq!(p.len(), SHARD_HEADER + 1376);
|
||||
assert_eq!(p[0], 0x90); // RTP header byte
|
||||
}
|
||||
// First packet: SOF set, fecIndex 0, frameIndex 0.
|
||||
let first = &pkts[0];
|
||||
assert_eq!(first[24] & FLAG_SOF, FLAG_SOF);
|
||||
assert_eq!(first[24] & FLAG_PIC, FLAG_PIC);
|
||||
@@ -189,12 +234,10 @@ mod tests {
|
||||
let fec_info = u32::from_le_bytes(first[28..32].try_into().unwrap());
|
||||
assert_eq!(fec_info >> 22, 3); // dataShards = 3
|
||||
assert_eq!((fec_info >> 12) & 0x3ff, 0); // fecIndex 0
|
||||
// Last packet: EOF set, fecIndex 2.
|
||||
let last = &pkts[2];
|
||||
assert_eq!(last[24] & FLAG_EOF, FLAG_EOF);
|
||||
let fec_info_last = u32::from_le_bytes(last[28..32].try_into().unwrap());
|
||||
assert_eq!((fec_info_last >> 12) & 0x3ff, 2);
|
||||
// RTP sequence numbers are 0,1,2.
|
||||
for (i, p) in pkts.iter().enumerate() {
|
||||
assert_eq!(u16::from_be_bytes(p[2..4].try_into().unwrap()), i as u16);
|
||||
}
|
||||
@@ -202,15 +245,59 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn multi_block_split() {
|
||||
let mut pk = VideoPacketizer::new(1392);
|
||||
// Need > 255 data shards → multi-block. 255*1376 ≈ 351 KB; use 600 KB.
|
||||
let mut pk = VideoPacketizer::new(1392, 0); // data-only
|
||||
let au = vec![0u8; 600_000];
|
||||
let pkts = pk.packetize(&au, FrameType::P, 0);
|
||||
let total = (8 + au.len()).div_ceil(1376);
|
||||
assert_eq!(pkts.len(), total);
|
||||
// n_blocks = ceil(total/255), clamped to 4; check multiFecBlocks lastBlock nibble.
|
||||
let n_blocks = total.div_ceil(255).clamp(1, 4);
|
||||
let last_block = ((pkts.last().unwrap()[27]) >> 6) & 0x3;
|
||||
assert_eq!(last_block as usize, n_blocks - 1);
|
||||
}
|
||||
|
||||
#[test]
fn emits_parity_shards() {
    /// Reads the little-endian `fecInfo` word from a datagram.
    fn fec_info_of(datagram: &[u8]) -> u32 {
        u32::from_le_bytes(datagram[28..32].try_into().unwrap())
    }

    // packet_size 1392 → 1376 payload bytes per shard; 20% FEC requested.
    let mut packetizer = VideoPacketizer::new(1392, 20);
    // 8-byte frame header + 4000 bytes = 4008 → 3 data shards (k = 3).
    let au = vec![0xABu8; 4000];
    let pkts = packetizer.packetize(&au, FrameType::Idr, 0);

    // m = ceil(3*20/100) = 1 parity shard → 4 packets; wire pct = 100*1/3 = 33.
    assert_eq!(pkts.len(), 4);
    for p in &pkts {
        let info = fec_info_of(p);
        assert_eq!(info >> 22, 3); // dataShards = k = 3
        assert_eq!((info >> 4) & 0xff, 33); // wire fecPercentage
    }

    // The single parity shard comes last and carries fecIndex = k = 3.
    let parity_info = fec_info_of(&pkts[3]);
    assert_eq!((parity_info >> 12) & 0x3ff, 3);

    // Data shards keep SOF on the first shard and EOF on the last data shard.
    assert_eq!(pkts[0][24] & FLAG_SOF, FLAG_SOF);
    assert_eq!(pkts[2][24] & FLAG_EOF, FLAG_EOF);

    // RTP sequence numbers run contiguously across data + parity (0,1,2,3).
    let sequence_numbers: Vec<u16> = pkts
        .iter()
        .map(|p| u16::from_be_bytes(p[2..4].try_into().unwrap()))
        .collect();
    assert_eq!(sequence_numbers, vec![0u16, 1, 2, 3]);
}
|
||||
|
||||
/// Round-trip recovery check: with parity computed over the full datagram, reconstructing a
/// dropped data shard restores both its payload and its NV `flags` byte (the byte Moonlight
/// validates on a recovered shard).
#[test]
fn parity_recovers_full_datagram_incl_flags() {
    const K: usize = 3; // 8 + 4000 bytes at 1376 bytes per shard → 3 data shards
    const LOST: usize = 1; // the middle data shard is the one we drop

    // A high percentage guarantees at least one parity shard for so small a block.
    let mut packetizer = VideoPacketizer::new(1392, 50);
    let au = vec![0x5Au8; 4000];
    let pkts = packetizer.packetize(&au, FrameType::Idr, 0);

    let m = pkts.len() - K;
    assert!(m >= 1);

    // Mark one data shard as missing and rebuild it with the same coder.
    let mut received: Vec<Option<Vec<u8>>> = pkts.iter().cloned().map(Some).collect();
    received[LOST] = None;
    let recovered = Gf8Coder.reconstruct(K, m, &mut received).unwrap();

    // A middle shard carries neither SOF nor EOF, so its flags byte (offset 24) is PIC.
    assert_eq!(recovered[LOST][24], FLAG_PIC);
    // The payload region must equal what was originally sent.
    assert_eq!(recovered[LOST][SHARD_HEADER..], pkts[LOST][SHARD_HEADER..]);
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user