diff --git a/.github/workflows/code-quality.yml b/.github/workflows/code-quality.yml
index 15465a404c8a34f64831e57baed096d1384bd1fc..40a6d551c5f09c4a1b28641223e709b8a40ac409 100644
--- a/.github/workflows/code-quality.yml
+++ b/.github/workflows/code-quality.yml
@@ -24,17 +24,15 @@ jobs:
- name: Formatting check
run:
- cd faer-libs &&
cargo fmt --all -- --check &&
- cd ../faer-entity &&
+ cd ./faer-entity &&
cargo fmt --all -- --check
# want to get all quality issues
continue-on-error: true
- name: Linting check
run:
- cd faer-libs &&
cargo clippy --all-targets &&
- cd ../faer-entity &&
+ cd ./faer-entity &&
cargo clippy --all-targets
continue-on-error: true
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index f2b902ab87093e466f52e060d841baddeaa277f7..ab00d53d1d663539e4d7bfd46ed809a199c7b259 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -34,9 +34,8 @@ jobs:
- name: Verify 1.67.0
run:
- cd faer-libs &&
cargo check &&
- cd ../faer-entity &&
+ cd ./faer-entity &&
cargo check
testing:
@@ -70,7 +69,7 @@ jobs:
uses: taiki-e/install-action@cargo-llvm-cov
- name: Collect coverage data
- run: cd faer-libs && cargo llvm-cov nextest --lcov --output-path lcov.info --workspace
+ run: cargo llvm-cov nextest --lcov --output-path lcov.info --workspace
- name: Upload coverage data to codecov
uses: codecov/codecov-action@v3
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000000000000000000000000000000000000..169858d0611c418459196e751ddb79defda76bb6
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,58 @@
+[package]
+name = "faer"
+version = "0.18.0"
+edition = "2021"
+
+[dependencies]
+bytemuck = "1.14.3"
+coe-rs = "0.1.2"
+dbgf = "0.1.1"
+paste = "1.0.14"
+reborrow = "0.5.5"
+
+dyn-stack = "0.10.0"
+equator = "0.1.10"
+faer-entity = { version = "0.17.0", default-features = false, path = "./faer-entity" }
+
+gemm = { version = "0.17.1", default-features = false }
+num-complex = { version = "0.4.5", default-features = false }
+num-traits = { version = "0.2.18", default-features = false }
+
+matrixcompare-core = { version = "0.1.0", optional = true }
+matrixcompare = { version = "0.3", optional = true }
+
+rayon = { version = "1.8.1", optional = true }
+serde = { version = "1", optional = true, features = ["derive"] }
+log = { version = "0.4", optional = true, default-features = false }
+npyz = { version = "0.8", optional = true }
+
+[features]
+default = ["std", "rayon", "serde"]
+std = [
+ "faer-entity/std",
+ "gemm/std",
+ "matrixcompare-core",
+ "matrixcompare",
+ "num-traits/std",
+ "num-complex/std",
+]
+rayon = ["std", "gemm/rayon", "dep:rayon"]
+nightly = ["faer-entity/nightly", "gemm/nightly"]
+perf-warn = ["log"]
+serde = ["dep:serde"]
+npy = ["std", "dep:npyz"]
+
+[dev-dependencies]
+amd = "0.2.2"
+assert_approx_eq = "1.1.0"
+matrix-market-rs = "0.1.3"
+matrixcompare = "0.3.0"
+rand = "0.8.5"
+serde_test = "1.0.176"
+
+[profile.dev]
+opt-level = 3
+
+[package.metadata.docs.rs]
+all-features = true
+rustdoc-args = ["--cfg", "docsrs", "--html-in-header", "katex-header.html"]
diff --git a/book/Cargo.toml b/book/Cargo.toml
deleted file mode 100644
index 334a817812a6c01ec9196277c4472767e445da21..0000000000000000000000000000000000000000
--- a/book/Cargo.toml
+++ /dev/null
@@ -1,17 +0,0 @@
-[package]
-name = "faer-book"
-version = "0.0.0"
-edition = "2021"
-
-[dependencies]
-faer = "0.15.0"
-faer-core = "0.15.0"
-faer-cholesky = "0.15.0"
-faer-qr = "0.15.0"
-faer-lu = "0.15.0"
-faer-svd = "0.15.0"
-faer-evd = "0.15.0"
-
-[[bin]]
-name = "intro"
-path = "src/intro.rs"
diff --git a/book/brand-rust.svg b/book/brand-rust.svg
deleted file mode 100644
index dd830c261bf3675146ac5456a152fce57a1b941b..0000000000000000000000000000000000000000
--- a/book/brand-rust.svg
+++ /dev/null
@@ -1,57 +0,0 @@
-
\ No newline at end of file
diff --git a/book/dev_guide.typ b/book/dev_guide.typ
deleted file mode 100644
index 131a0968df162e2c209b255c9031c52c6646de3c..0000000000000000000000000000000000000000
--- a/book/dev_guide.typ
+++ /dev/null
@@ -1,1034 +0,0 @@
-#set text(font: "New Computer Modern")
-
-#show raw: set text(font: "New Computer Modern Mono", size: 1.2em)
-
-#show par: set block(spacing: 0.55em)
-
-#show heading: set block(above: 1.4em, below: 1em)
-
-#show link: underline
-
-#set page(numbering: "1")
-
-#set par(leading: 0.55em, justify: true)
-
-#set heading(numbering: "1.1")
-
-#show heading.where(level: 1): it => pagebreak(weak:true) + block({
- set text(font: "New Computer Modern", weight: "black")
- v(2cm)
- block(text(18pt)[Chapter #counter(heading).display()])
- v(1cm)
- block(text(22pt)[#it.body])
- v(1cm)
-})
-
-#import "@preview/codly:0.1.0"
-#import "@preview/tablex:0.0.6": tablex, rowspanx, colspanx, gridx, hlinex, vlinex
-#import "@preview/colorful-boxes:1.2.0": colorbox
-
-#let icon(codepoint) = {
- box(
- height: 0.8em,
- baseline: 0.05em,
- image(codepoint)
- )
- h(0.1em)
-}
-
-#show: codly.codly-init.with()
-
-#codly.codly(
- languages: (
- rust: (name: "Rust", icon: icon("brand-rust.svg"), color: rgb("#CE412B")),
- ),
- breakable: false,
- width-numbers: none,
-)
-
-#outline()
-
-== Introduction
-_`faer-rs`_ is a general-purpose linear algebra library for the Rust
-programming language, with a focus on correctness, portability, and
-performance.
-In this book, we'll be assuming version `0.16.0` of the library.
-
-_`faer`_ is designed around a high level API that sacrifices some amount of
-performance and customizability in exchange for ease of use, as well as a low
-level API that offers more control over memory allocations and multithreading
-capabilities. The two APIs share the same data structures and can be used
-together or separately, depending on the user's needs.
-
-This book assumes some level of familiarity with Rust, linear algebra and _`faer`_'s API.
-Users who are new to the library are encouraged to get started by taking a look
-at the user guide, the library's examples directory
-#footnote[`faer-rs/faer-libs/faer/examples`] and browsing the `docs.rs`
-documentation #footnote[https://docs.rs/faer/0.16.0/faer/index.html].
-
-We will go into detail over the various operations and matrix decompositions
-that are provided by the library, as well as their implementation details. We
-will also explain the architecture of _`faer`_'s data structures and how low
-level operations are handled using vectorized SIMD instructions.
-
-#pagebreak()
-
-= Data layout and the `Entity` trait
-
-In most linear algebra libraries, matrix data is stored contiguously in memory,
-regardless of the scalar type. This can be done in two ways, either a row-major
-layout or a column-major layout.
-
-Consider the matrix
-$ mat(
- a_11, a_12;
- a_21, a_22;
- a_31, a_32;
-) $
-Storing it in row-major layout would place the values in memory in the following order:
-$ (
- a_11, a_12,
- a_21, a_22,
- a_31, a_32
-), $
-while storing it in column-major order would place the values in memory in this order:
-$ (
- a_11, a_21, a_31,
- a_12, a_22, a_32
-). $
-
-_`faer`_, on the other hand, first splits each scalar into its atomic units,
-then stores each unit matrix separately in a contiguous fashion. The library
-does not mandate the usage of one layout or the other, but heavily prefers to receive
-data in column-major layout, with the notable exception of matrix multiplication which
-we try to optimize for both column-major and row-major layouts.
-
-The way in which a scalar can be split is chosen by the scalar type itself.
-For example, a complex floating point type may choose to either be stored as one unit
-or as a group of two units.
-
-Given the following complex matrix:
-$ mat(
- a_11 + i b_11, a_12 + i b_12;
- a_21 + i b_21, a_22 + i b_22;
- a_31 + i b_31, a_32 + i b_32;
-), $
-and assuming column-major layout, we can either choose the following storage scheme in which
-the full number is considered a single unit:
-$ (
- a_11, b_11, a_21, b_21, a_31, b_31,
- a_12, b_12, a_22, b_22, a_32, b_32
-), $
-
-or the following scheme in which the real and imaginary parts are considered two distinct units
-$ (
- a_11, a_21, a_31,
- a_12, a_22, a_32
-),\
-(
- b_11, b_21, b_31,
- b_12, b_22, b_32
-). $
-
-The former is commonly referred to as AoS layout (array of structures), while
-the latter is called SoA (structure of arrays). The choice of which one to use
-depends on the context. As a general rule, types that are natively vectorizable
-(have direct CPU support for arithmetic operations) prefer to be laid out in
-AoS layout. On the other hand, types that do not have native vectorization
-support but can still be vectorized by combining more primitive operations
-prefer to be laid out in SoA layout.
-
-Types that are not vectorizable may be in either one, but the AoS layout is
-typically easier to work with in that scenario.
-
-== `Entity` trait
-The `Entity` trait determines how a type prefers to be stored in memory,
-through its associated type `Group`.
-
-Given some type `E` that implements `Entity`, we can manipulate groups of
-arbitrary types in a generic way.
-
-For example, `faer_core::GroupFor` is an `E`-group of `E::Unit`, which can be
-thought of as a raw representation of `E`.
-
-Pre-existing data can be referred to using a reference to a slice or a raw
-pointer, for example `GroupFor`.
-
-The `Entity` trait requires associated functions to convert from one `E`-group type to another.
-For example, we can take a reference to each element in a group with
-`E::faer_as_ref`, or `E::faer_as_mut`.
-
-```rust
-use faer_core::{Entity, GroupFor};
-
-fn value_to_unit_references(value: E) {
- let units: GroupFor = value.into_units();
- let references: GroupFor = E::faer_as_ref(&units);
-}
-```
-
-We can map one group type to another using `E::faer_map`.
-```rust
-use faer_core::{Entity, GroupFor};
-
-fn slice_to_ptr(
- slice: GroupFor
-) -> GroupFor {
- E::faer_map(slice, |slice| slice.as_ptr())
-}
-```
-
-We can also zip and unzip groups of values with `E::faer_zip` and `E::faer_unzip`.
-```rust
-use faer_core::{Entity, GroupFor};
-
-unsafe fn ptr_to_slice<'a, E: Entity>(
- ptr: GroupFor,
- len: GroupFor
-) -> GroupFor {
- let zipped: GroupFor = E::faer_zip(ptr, len);
- E::faer_map(zipped, |(ptr, len)| std::slice::from_raw_parts(ptr, len))
-}
-
-unsafe fn split_at(
- slice: GroupFor,
- mid: usize
-) -> (GroupFor, GroupFor) {
- E::faer_unzip(E::faer_map(slice, |slice| slice.split_at(mid)))
-}
-```
-
-== Matrix layout
-Matrices in _`faer`_ fall into two broad categories with respect to layout. Owned
-matrices (`Mat`) which are always stored in column-major layout, and matrix views
-(`MatRef`/`MatMut`) which allow any strided layout.
-
-Note that even though matrix views allow for any row and column stride, they
-are still typically optimized for column major layout, since that happens to be
-the preferred layout for most matrix decompositions.
-
-Matrix views are roughly defined as:
-```rust
-struct MatRef<'a, E: Entity> {
- ptr: GroupFor,
- nrows: usize,
- ncols: usize,
- row_stride: isize,
- col_stride: isize,
- __marker: PhantomData<&'a E>,
-}
-
-struct MatMut<'a, E: Entity> {
- ptr: GroupFor,
- nrows: usize,
- ncols: usize,
- row_stride: isize,
- col_stride: isize,
- __marker: PhantomData<&'a mut E>,
-}
-```
-
-The actual implementation is slightly different in order to allow `MatRef` to
-have `Copy` semantics, as well as make use of the fact that `ptr` is never null
-to allow for niche optimizations (such as `Option>` having the
-same layout as `MatRef<'_, E>`).
-
-`ptr` is a group of non-null pointers to units, each pointing to a matrix with an
-underlying contiguous allocation. In other words, even though the data itself
-is strided, it has to have a contiguous underlying storage in order to allow
-for pointer arithmetic to be valid.
-
-`nrows`, `ncols`, `row_stride` and `col_stride` are the matrix dimensions and
-strides, which must be the same for every unit matrix in the group.
-
-Finally, `__marker` imbues `MatRef` and `MatMut` with the correct variance,
-This allows `MatRef<'short_lifetime, E>` to be a subtype of
-`MatRef<'long_lifetime, E>`, which allows for better ergonomics.
-
-In addition to `Copy` semantics for `MatRef`, both `MatRef` and `MatMut` naturally
-provide `Move` semantics, as do most Rust types. On top of that, they also provide
-`Reborrow` semantics, which currently need to be explicitly used, unlike native
-references which are implicitly reborrowed.
-
-#pagebreak()
-
-Reborrowing is the act of temporarily borrowing a matrix view as another matrix
-view with a shorter lifetime. For example, given a `MatMut<'a, E>`, we would like
-to pass it to functions taking `MatMut<'_, E>` by value without having to consume
-our object. Unlike `MatRef<'a, E>`, this is not done automatically as `MatMut`
-is not `Copy`. The solution is to mutably reborrow our `MatMut` object like this
-```rust
-fn function_taking_mat_ref(mat: MatRef<'_, E>) {}
-fn function_taking_mat_mut(mat: MatMut<'_, E>) {}
-
-fn mutable_reborrow_example(mut mat: MatMut<'_, E>) {
- use faer::prelude::*;
-
- function_taking_mat_mut(mat.rb_mut());
- function_taking_mat_mut(mat.rb_mut());
- function_taking_mat_ref(mat.rb());
- function_taking_mat_ref(mat.rb());
- function_taking_mat_mut(mat);
-
- // does not compile, since `mat` was moved in the previous call
- // function_taking_mat_mut(mat);
-}
-```
-
-Owned matrices on the other hand are roughly defined as:
-```rust
-struct Mat {
- ptr: GroupFor,
- nrows: usize,
- ncols: usize,
- row_capacity: usize,
- col_capacity: usize,
- __marker: PhantomData,
-}
-
-impl Drop for Mat {
- fn drop(&mut self) {
- // deallocate the storage
- }
-}
-```
-Unlike matrix views, we don't need to explicitly store the strides. We know that
-the row stride is equal to `1`, since the layout is column major, and the column
-stride is equal to `row_capacity`.
-
-We also have two new fields: `row_capacity` and `col_capacity`, which represent
-how much storage we have for resizing the matrix without having to reallocate.
-
-`Mat` can be converted to `MatRef` using `Mat::as_ref(&self)` or `MatMut` using
-`Mat::as_mut(&mut self)`.
-
-= Vector operations
-== Componentwise operations
-Componentwise operations are operations that take $n$ matrices with matching
-dimensions, producing an output of the same shape. Addition and subtraction
-are examples of commonly used componentwise operations.
-
-Componentwise operations can be expressed in _`faer`_ using the `zipped!`
-macro, followed by a call to `for_each` (for in-place iteration) or `map` (for
-producing an output value).
-
-```rust
-use faer_core::{zipped, unzipped};
-
-fn a_plus_3b(a: MatRef<'_, f64>, b: MatRef<'_, f64>) -> Mat {
- zipped!(a, b).map(|unzipped!(a, b)| {
- *a + 3.0 * *b
- })
-}
-
-fn swap_a_b(a: MatMut<'_, f64>, b: MatMut<'_, f64>) {
- zipped!(a, b).for_each(|unzipped!(mut a, mut b)| {
- (*a, *b) = (*b, *a);
- })
-}
-```
-
-`zipped!` function calls can be more efficient than naive nested loops. The
-reason for this is that `zipped!` analyzes the layout of the input matrices in
-order to determine the optimal iteration order. For example whether it should
-iterate over rows first, before columns. Or whether the iteration should happen
-in reverse order (starting from the last row/column) instead of the forward
-order.
-
-Currently, `zipped!` determines the iteration order based on the preferred
-iteration order of the first matrix, but this may change in a future release.
-
-== Vectorized operations
-SIMD (Single Instruction, Multiple Data) refers to the usage of CPU instructions
-that take vectors of inputs, packed together in CPU registers, and perform the
-same operation on all of them. As an example, classic addition takes two scalars
-as an input and produces one output, while SIMD addition could take two vectors,
-each containing 4 scalars, and adds them componentwise, producing an output vector
-of 4 scalars. Correct SIMD usage is a crucial part of any linear algebra
-library, given that most linear algebra operations lend themselves well to
-vectorization.
-
-== SIMD with _`pulp`_
-
-_`faer`_ provides a common interface for generic and composable SIMD, using the
-_`pulp`_ crate as a backend. _`pulp`_'s high level API abstracts away the differences
-between various instruction sets and provides a common API that's generic over
-them (but not the scalar type). This allows users to write a generic implementation
-that gets turned into several functions, one for each possible instruction set
-among a predetermined subset. Finally, the generic implementation can be used along
-with an `Arch` structure that determines the best implementation at runtime.
-
-Here's an example of how _`pulp`_ could be used to compute the expression $x^2 +
-2y - |z|$, and store it into an output vector.
-
-```rust
-use core::iter::zip;
-
-fn compute_expr(out: &mut[f64], x: &[f64], y: &[f64], z: &[f64]) {
- struct Impl<'a> {
- out: &'a mut [f64],
- x: &'a [f64],
- y: &'a [f64],
- z: &'a [f64],
- }
-
- impl pulp::WithSimd for Impl<'_> {
- type Output = ();
-
- #[inline(always)]
- fn with_simd(self, simd: S) {
- let Self { out, x, y, z } = self;
-
- let (out_head, out_tail) = S::f64s_as_mut_simd(out);
- let (x_head, x_tail) = S::f64s_as_simd(x);
- let (y_head, y_tail) = S::f64s_as_simd(y);
- let (z_head, z_tail) = S::f64s_as_simd(z);
-
- let two = simd.f64s_splat(2.0);
- for (out, (&x, (&y, &z))) in zip(
- out_head,
- zip(x_head, zip(y_head, z_head)),
- ) {
- *out = simd.f64s_add(
- x,
- simd.f64s_sub(simd.f64s_mul(two, y), simd.f64s_abs(z)),
- );
- }
-
- for (out, (&x, (&y, &z))) in zip(
- out_tail,
- zip(x_tail, zip(y_tail, z_tail)),
- ) {
- *out = x - 2.0 * y - z.abs();
- }
- }
- }
-
- pulp::Arch::new().dispatch(Impl { out, x, y, z });
-}
-```
-
-There's a lot of things going on at the same time in this code example. Let us
-go over them step by step.
-
-_`pulp`_'s generic SIMD implementation happens through the `WithSimd` trait,
-which takes `self` by value to pass in the function parameters. It additionally
-provides another parameter to `with_simd` describing the instruction set being
-used. `WithSimd::with_simd` *must* be marked with the `#[inline(always)]` attribute.
-Forgetting to do so could lead to a significant performance drop.
-
-Inside the body of the function, we split up each of `out`, `x`, `y` and
-`z` into two parts using `S::f64s_as[_mut]_simd`. The first part (`head`) is a
-slice of `S::f64s`, representing the vectorizable part of the original slice.
-The second part (`tail`) contains the remainder that doesn't fit into a vector
-register.
-
-Handling the head section is done using vectorized operation. Currently these
-need to take `simd` as a parameter, in order to guarantee its availability in a
-sound way. This is what allows the API to be safe. The tail section is handled
-using scalar operations.
-
-The final step is actually calling into our SIMD implementation. This is done
-by creating an instance of `pulp::Arch` that performs the runtime detection
-(and caches the result, so that future invocations are as fast as possible),
-then calling `Arch::dispatch` which takes a type that implements `WithSimd`,
-and chooses the best SIMD implementation for it.
-
-=== Memory alignment
-
-Instead of splitting the input and output slices into two sections
-(vectorizable head + non-vectorizable tail), an alternative approach would be
-to split them up into three sections instead (vectorizable head + vectorizable
-body + vectorizable tail). This can be accomplished using masked loads and
-stores, which can speed things up if the slices are _similarly aligned_.
-
-Similarly aligned slices are slices which have the same base address modulo
-the byte size of the CPU's vector registers. The simplest way to guarantee this
-is to allocate the slices in aligned memory (such that the base address is a
-multiple of the register size in bytes), in which case the slices are similarly
-aligned, and any subslices of them (with a shared offset and size) will also be
-similarly aligned. Aligned allocation is done automatically for matrices in _`faer`_,
-which helps uphold these guarantees for maximum performance.
-
-Here's an example of how one might write an implementation that makes use of
-memory alignment, using _`pulp`_.
-
-```rust
-use core::iter::zip;
-use pulp::{Read, Write};
-
-#[inline(always)]
-fn compute_expr_register(
- simd: S,
- mut out: impl Write