CalculusWithJuliaNotes.jl/quarto/staging/matrix-calculus-notes.html

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>

<meta charset="utf-8">
<meta name="generator" content="quarto-1.6.40">

<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">


<title>matrix-calculus-notes</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
  width: 0.8em;
  margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
  vertical-align: middle;
}
/* CSS for syntax highlighting */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
  { counter-reset: source-line 0; }
pre.numberSource code > span
  { position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
  { content: counter(source-line);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
  }
pre.numberSource { margin-left: 3em;  padding-left: 4px; }
div.sourceCode
  {   }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
</style>


<script src="matrix-calculus-notes_files/libs/clipboard/clipboard.min.js"></script>
<script src="matrix-calculus-notes_files/libs/quarto-html/quarto.js"></script>
<script src="matrix-calculus-notes_files/libs/quarto-html/popper.min.js"></script>
<script src="matrix-calculus-notes_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="matrix-calculus-notes_files/libs/quarto-html/anchor.min.js"></script>
<link href="matrix-calculus-notes_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="matrix-calculus-notes_files/libs/quarto-html/quarto-syntax-highlighting-549806ee2085284f45b00abea8c6df48.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="matrix-calculus-notes_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="matrix-calculus-notes_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="matrix-calculus-notes_files/libs/bootstrap/bootstrap-973236bd072d72a04ee9cd82dcc9cb29.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="light">

  <script src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6"></script>
  <script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js" type="text/javascript"></script>

<script type="text/javascript">
const typesetMath = (el) => {
  if (window.MathJax) {
    // MathJax Typeset
    window.MathJax.typeset([el]);
  } else if (window.katex) {
    // KaTeX Render
    var mathElements = el.getElementsByClassName("math");
    var macros = [];
    for (var i = 0; i < mathElements.length; i++) {
      var texText = mathElements[i].firstChild;
      if (mathElements[i].tagName == "SPAN") {
        window.katex.render(texText.data, mathElements[i], {
          displayMode: mathElements[i].classList.contains('display'),
          throwOnError: false,
          macros: macros,
          fleqn: false
        });
      }
    }
  }
}
window.Quarto = {
  typesetMath
};
</script>

</head>

<body class="fullcontent">

<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">

<main class="content" id="quarto-document-content">

<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Matrix Calculus</h1>
</div>


<div class="quarto-title-meta">


  </div>


</header>


<p>XXX Add in examples from paper XXX optimization? large number of parameters? ,…</p>
<div class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Based on Bright, Edelman, and Johnson’s notes
</div>
</div>
<div class="callout-body-container callout-body">
<p>This section samples material from the notes <a href="https://arxiv.org/abs/2501.14787">Matrix Calculus (for Machine Learning and Beyond)</a> by Paige Bright, Alan Edelman, and Steven G. Johnson. These notes cover material taught in a course at MIT. Support materials for their course in <code>Julia</code> are available at <a href="https://github.com/mitmath/matrixcalc/tree/main">https://github.com/mitmath/matrixcalc/tree/main</a>. For more details and examples, please refer to the source.</p>
</div>
</div>
<p>We have seen several “derivatives” of a function, based on the number of inputs and outputs. The first one was for functions <span class="math inline">\(f: R \rightarrow R\)</span>.</p>
<p>Then <span class="math inline">\(f\)</span> has a derivative at <span class="math inline">\(x\)</span> if this limit exists</p>
<p><span class="math display">\[
\lim_{h \rightarrow 0}\frac{f(x + h) - f(x)}{h}.
\]</span></p>
<p>The derivative of the function <span class="math inline">\(x\)</span> is this limit for a given <span class="math inline">\(x\)</span>. Common notation is:</p>
<p><span class="math display">\[
f'(x) = \frac{dy}{dx} = \lim_{h \rightarrow 0}\frac{f(x + h) - f(x)}{h}
\]</span></p>
<p>(when the limit exists).</p>
<p>This limit gets expressed in different ways:</p>
<ul>
<li><p>linearization write <span class="math inline">\(f(x+\Delta x) - f(x) \approx f'(x)\Delta x\)</span>, where <span class="math inline">\(\delta x\)</span> is a small displacement from <span class="math inline">\(x\)</span>. The reason there isn’t equality is the unwritten higher order terms that vanish in a limit.</p></li>
<li><p>Alternate limits. Another way of writing this is in terms of explicit smaller order terms:</p></li>
</ul>
<p><span class="math display">\[
(f(x+h) - f(x)) - f'(x)h = \mathscr{o}(h),
\]</span></p>
<p>which means if we divide both sides by <span class="math inline">\(h\)</span> and take the limit, we will get <span class="math inline">\(0\)</span> on the right and the relationship on the left.</p>
<ul>
<li>Differential notation simply writes this as <span class="math inline">\(dy = f(x)dx\)</span>. More verbosely, we might write</li>
</ul>
<p><span class="math display">\[
df = f(x+dx) - f(x) = f'(x) dx.
\]</span></p>
<p>Here <span class="math inline">\(dx\)</span> is a differential, made rigorous by a limit, which hides the higher order terms.</p>
<p>In these notes the limit has been defined, with suitable modification, for functions of vectors (multiple values) with scalar or vector outputs.</p>
<p>For example, when <span class="math inline">\(f: R \rightarrow R^m\)</span> was a vector-valued function the derivative was defined similarly through a limit of <span class="math inline">\((f(t + \Delta t) - f(t))/{\Delta t}\)</span>, where each component needed to have a limit. This can be rewritten through <span class="math inline">\(f(t + dt) - f(t) = f'(t) dt\)</span>, again using differentials to avoid the higher order terms.</p>
<p>When <span class="math inline">\(f: R^n \rightarrow R\)</span> is a scalar-valued function of a derivative, differentiability was defined by a gradient existing with <span class="math inline">\(f(c+h) - f(c) - \nabla{f}(c) \cdot h\)</span> being <span class="math inline">\(\mathscr{o}(\|h\|)\)</span>. In other words <span class="math inline">\(df = f(c + dh) - f(c) = \nabla{f}(c) \cdot dh\)</span>. The gradient has the same shape as <span class="math inline">\(c\)</span>, a column vector. If we take the row vector (e.g.&nbsp;<span class="math inline">\(f'(c) = \nabla{f}(c)^T\)</span>) then again we see <span class="math inline">\(df = f(c+dh) - f(c) = f'(c) dh\)</span>, where the last term uses matrix multiplication of a row vector times a column vector.</p>
<p>Finally, when <span class="math inline">\(f:R^n \rightarrow R^m\)</span>, the Jacobian was defined and characterized by <span class="math inline">\(\| f(x + dx) - f(x) - J_f(x)dx \|\)</span> being <span class="math inline">\(\mathscr{o}(\|dx\|)\)</span>. Again, we can express this through <span class="math inline">\(df = f(x + dx) - f(x) = f'(x)dx\)</span> where <span class="math inline">\(f'(x) = J_f(x)\)</span>.</p>
<p>In writing <span class="math inline">\(df = f(x + dx) - f(x) = f'(x) dx\)</span> generically, some underlying facts are left implicit: <span class="math inline">\(dx\)</span> has the same shape as <span class="math inline">\(x\)</span> (so can be added); <span class="math inline">\(f'(x) dx\)</span> may mean usual multiplication or matrix multiplication; and there is an underlying concept of distance and size that allows the above to be rigorous. This may be an abolute value or a norm.</p>
<p>Further, various differentiation rules apply such as the sum, product, and chain rule.</p>
<p>The <span class="citation" data-cites="BrightEdelmanJohnson">@BrightEdelmanJohnson</span> notes cover differentiation of functions in this uniform manner and then extend the form by treating derivatives as <em>linear operators</em>.</p>
<p>A <a href="https://en.wikipedia.org/wiki/Operator_(mathematics)">linear operator</a> is a mathematical object which satisfies</p>
<p><span class="math display">\[
f[\alpha v + \beta w] = \alpha f[v] + \beta f[w].
\]</span></p>
<p>where the <span class="math inline">\(\alpha\)</span> and <span class="math inline">\(\beta\)</span> are scalars, and <span class="math inline">\(v\)</span> and <span class="math inline">\(w\)</span> possibly not and come from a <em>vector space</em>. Regular multiplication and matrix multiplication are familiar linear operations, but there are many others.</p>
<p>The referenced notes identify <span class="math inline">\(f'(x) dx\)</span> with <span class="math inline">\(f'(x)[dx]\)</span>, the latter emphasizing <span class="math inline">\(f'(x)\)</span> acts on <span class="math inline">\(dx\)</span> and the notation is not commutative (e.g., it is not <span class="math inline">\(dx f'(x)\)</span>).</p>
<p>Linear operators are related to vector spaces.</p>
<p>A <a href="https://en.wikipedia.org/wiki/Vector_space">vector space</a> is a set of mathematical objects which can be added together and also multiplied by a scalar. Vectors of similar size, as previously discussed, are the typical example, with vector addition and scalar multiplication previously discussed topics. Matrices of similar size (and some subclasses) also form a vector space. Additionally, many other set of objects form vector spaces. An example might be polynomial functions of degree <span class="math inline">\(n\)</span> or less; continuous functions, or functions with a certain number of derivatives.</p>
<p>Take differentiable functions as an example, then the simplest derivative rules <span class="math inline">\([af(x) + bg(x)]' = a[f(x)]' + b[g(x)]'\)</span> show the linearity of the derivative in this setting. This linearity is different from how the derivative is a linear operator on <span class="math inline">\(dx\)</span>.</p>
<p>A vector space is described by a <em>basis</em> – a minimal set of vectors needed to describe the space, after consideration of linear combinations. For many vectors, this the set of special vectors with <span class="math inline">\(1\)</span> as one of the entries, and <span class="math inline">\(0\)</span> otherwise.</p>
<p>A key fact about a basis is every vector in the vector space can be expressed <em>uniquely</em> as a linear combination of the basis vectors.</p>
<p>Vectors and matrices have properties that are generalizations of the real numbers. As vectors and matrices form vector spaces, the concept of addition of vectors and matrices is defined, as is scalar multiplication. Additionally, we have seen:</p>
<ul>
<li><p>The dot product between two vectors of the same length is defined easily (<span class="math inline">\(v\cdot w = \Sigma_i v_i w_i\)</span>). It is coupled with the length as <span class="math inline">\(\|v\|^2 = v\cdot v\)</span>.</p></li>
<li><p>Matrix multiplication is defined for two properly sized matrices. If <span class="math inline">\(A\)</span> is <span class="math inline">\(m \times k\)</span> and <span class="math inline">\(B\)</span> is <span class="math inline">\(k \times n\)</span> then <span class="math inline">\(AB\)</span> is a <span class="math inline">\(m\times n\)</span> matrix with <span class="math inline">\((i,j)\)</span> term given by the dot product of the <span class="math inline">\(i\)</span>th row of <span class="math inline">\(A\)</span> (viewed as a vector) and the <span class="math inline">\(j\)</span>th column of <span class="math inline">\(B\)</span> (viewed as a vector). Matrix multiplication is associative but <em>not</em> commutative. (E.g. <span class="math inline">\((AB)C = A(BC)\)</span> but <span class="math inline">\(AB\)</span> and <span class="math inline">\(BA\)</span> need not be equal (or even defined, as the shapes may not match up).</p></li>
<li><p>A square matrix <span class="math inline">\(A\)</span> has an <em>inverse</em> <span class="math inline">\(A^{-1}\)</span> if <span class="math inline">\(AA^{-1} = A^{-1}A = I\)</span>, where <span class="math inline">\(I\)</span> is the identity matrix (a matrix which is zero except on its diagonal entries which are all <span class="math inline">\(1\)</span>). Square matrices may or may not have an inverse. When they don’t the matrix is called singular.</p></li>
<li><p>Viewing a vector as a matrix is possible. The association is typically through a <em>column</em> vector.</p></li>
<li><p>The transpose of a matrix comes by permuting the rows and columns. The transpose of a column vector is a row vector, so <span class="math inline">\(v\cdot w = v^T w\)</span>, where we use a superscript <span class="math inline">\(T\)</span> for the transpose. The transpose of a product, is the product of the transposes – reversed: <span class="math inline">\((AB)^T = B^T A^T\)</span>; the tranpose of a transpose is an identity operation: <span class="math inline">\((A^T)^T = A\)</span>; the inverse of a transpose is the tranpose of the inverse: <span class="math inline">\((A^{-1})^T = (A^T){-1}\)</span>.</p></li>
<li><p>Matrices for which <span class="math inline">\(A = A^T\)</span> are called symmetric.</p></li>
<li><p>A few of the operations on matrices are the transpose and the inverse. These return a matrix, when defined. There is also the determinant and the trace, which return a scalar from a matrix. The trace is just the sum of the diagonal; the determinant is more involved to compute, but was previously seen to have a relationship to the volume of a certain parallellpiped. There are a few other operations described in the following.</p></li>
</ul>
<p>.</p>
<section id="scalar-valued-functions-of-a-vector" class="level2">
<h2 class="anchored" data-anchor-id="scalar-valued-functions-of-a-vector">Scalar-valued functions of a vector</h2>
<p>Suppose <span class="math inline">\(f: R^n \rightarrow R\)</span>, a scalar-valued function of a vector. Then the directional derivative at <span class="math inline">\(x\)</span> in the direction <span class="math inline">\(v\)</span> was defined for a scalar <span class="math inline">\(\alpha\)</span> by:</p>
<p><span class="math display">\[
\frac{\partial}{\partial \alpha}f(x + \alpha v) \mid_{\alpha = 0} =
\lim_{\Delta\alpha \rightarrow 0} \frac{f(x + \Delta\alpha v) - f(x)}{\Delta\alpha}.
\]</span></p>
<p>This rate of change in the direction of <span class="math inline">\(v\)</span> can be expressed through the linear operator <span class="math inline">\(f'(x)\)</span> via</p>
<p><span class="math display">\[
f(x + d\alpha v) - f(x) = f'(x) [d\alpha v] = d\alpha f'(x)[v],
\]</span></p>
<p>using linearity to move the scalar part outside the <span class="math inline">\([]\)</span>. This connects the partial derivative at <span class="math inline">\(x\)</span> in the direction of <span class="math inline">\(v\)</span> with <span class="math inline">\(f'(x)\)</span>:</p>
<p><span class="math display">\[
\frac{\partial}{\partial \alpha}f(x + \alpha v) \mid_{\alpha = 0} =
f'(x)[v].
\]</span></p>
<p>Not only does this give a connection in notation with the derivative, it naturally illustrates how the derivative as a linear operator can act on non-infinitesimal values.</p>
<p>Previously, we wrote <span class="math inline">\(\nabla f \cdot v\)</span> for the directional derivative, where the gradient is a column vector. The above uses the identification <span class="math inline">\(f' = (\nabla f)^T\)</span>.</p>
<p>For <span class="math inline">\(f: R^n \rightarrow R\)</span> we have</p>
<p><span class="math display">\[
df = f(x + dx) - f(x) = f'(x) [dx]
\]</span></p>
<p>is a scalar, so if <span class="math inline">\(dx\)</span> is a column vector, <span class="math inline">\(f'(x)\)</span> is a row vector with the same number of components (just as <span class="math inline">\(\nabla f\)</span> is a column vector with the same number of components).</p>
<section id="examples" class="level5">
<h5 class="anchored" data-anchor-id="examples">Examples</h5>
<p><span class="citation" data-cites="BrightEdelmanJohnson">@BrightEdelmanJohnson</span> include this example to show that the computation of derivatives using components can be avoided. Consider <span class="math inline">\(f(x) = x^T A x\)</span> where <span class="math inline">\(x\)</span> is a vector in <span class="math inline">\(R^n\)</span> and <span class="math inline">\(A\)</span> is an <span class="math inline">\(n\times n\)</span> matrix. Then <span class="math inline">\(f: R^n \rightarrow R\)</span> and its derivative can be computed:</p>
<p><span class="math display">\[
\begin{align*}
df &amp;= f(x + dx) - f(x)\\
&amp;= (x + dx)^T A (x + dx) - x^TAx \\
&amp;= x^TAx + dx^TA x + x^TAx + dx^T A dx - x^TAx\\
&amp;= dx^TA x + x^TAdx \\
&amp;= (dx^TAx)^T + x^TAdx \\
&amp;= x^T A^T dx + x^T A dx\\
&amp;= x^T(A^T + A) dx
\end{align*}
\]</span></p>
<p>The term <span class="math inline">\(dx^t A dx\)</span> is dropped, as it is higher order (goes to zero faster), it containing two <span class="math inline">\(dx\)</span> terms. In the second to last step, an identity operation (taking the transpose of the scalar quantity) is taken to simplify the algebra. Finally, as <span class="math inline">\(df = f'(x)[dx]\)</span> the identity of <span class="math inline">\(f'(x) =  x^T(A^T+A)\)</span> is made, or taking transposes <span class="math inline">\(\nabla f = (A + A^T)x\)</span>.</p>
<p>Compare the elegance above, with the component version, even though simplified, it still requires a specification of the size to carry the following out:</p>
<div id="43173572" class="cell" data-execution_count="1">
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="im">using</span> <span class="bu">SymPy</span></span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="pp">@syms</span> x[<span class="fl">1</span><span class="op">:</span><span class="fl">3</span>]<span class="op">::</span><span class="dt">real </span>A[<span class="fl">1</span><span class="op">:</span><span class="fl">3</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">3</span>]<span class="op">::</span><span class="dt">real</span></span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a>u <span class="op">=</span> x<span class="op">'</span> <span class="op">*</span> A <span class="op">*</span> x</span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a>grad_u <span class="op">=</span> [<span class="fu">diff</span>(u, xi) for xi <span class="kw">in</span> x]</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-display cell-output-markdown" data-execution_count="36">
<p><span class="math inline">\(\left[\begin{smallmatrix}2 A₁_{₁} x₁ + A₁_{₂} x₂ + A₁_{₃} x₃ + A₂_{₁} x₂ + A₃_{₁} x₃\\A₁_{₂} x₁ + A₂_{₁} x₁ + 2 A₂_{₂} x₂ + A₂_{₃} x₃ + A₃_{₂} x₃\\A₁_{₃} x₁ + A₂_{₃} x₂ + A₃_{₁} x₁ + A₃_{₂} x₂ + 2 A₃_{₃} x₃\end{smallmatrix}\right]\)</span></p>
</div>
</div>
<p>Compare to the formula for the gradient just derived:</p>
<div id="46b8de64" class="cell" data-execution_count="2">
<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>grad_u_1 <span class="op">=</span> (A <span class="op">+</span> A<span class="op">'</span>)<span class="op">*</span>x</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-display cell-output-markdown" data-execution_count="37">
<p><span class="math inline">\(\left[\begin{smallmatrix}2 A₁_{₁} x₁ + x₂ \left(A₁_{₂} + A₂_{₁}\right) + x₃ \left(A₁_{₃} + A₃_{₁}\right)\\2 A₂_{₂} x₂ + x₁ \left(A₁_{₂} + A₂_{₁}\right) + x₃ \left(A₂_{₃} + A₃_{₂}\right)\\2 A₃_{₃} x₃ + x₁ \left(A₁_{₃} + A₃_{₁}\right) + x₂ \left(A₂_{₃} + A₃_{₂}\right)\end{smallmatrix}\right]\)</span></p>
</div>
</div>
<p>The two are, of course, equal</p>
<div id="96ea196c" class="cell" data-execution_count="3">
<div class="sourceCode cell-code" id="cb3"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">all</span>(a <span class="op">==</span> b <span class="cf">for</span> (a,b) <span class="op">∈</span> <span class="fu">zip</span>(grad_u, grad_u_1))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-display" data-execution_count="38">
<pre><code>true</code></pre>
</div>
</div>
<hr>
<p>For <span class="math inline">\(f: R^n \rightarrow R^m\)</span>, <span class="citation" data-cites="BrightEdelmanJohnson">@BrightEdelmanJohnson</span> give an example of computing the Jacobian without resorting to component wise computations. Let <span class="math inline">\(f(x) = Ax\)</span> with <span class="math inline">\(A\)</span> being a <span class="math inline">\(m \times n\)</span> matrix, it follows that</p>
<p><span class="math display">\[
\begin{align*}
df &amp;= f(x + dx) - f(x)\\
&amp;= A(x + dx) - Ax\\
&amp;= Adx\\
&amp;= f'(x) dx.
\end{align*}
\]</span></p>
<p>The Jacobian is the linear operator <span class="math inline">\(A\)</span> acting on <span class="math inline">\(dx\)</span>.</p>
</section>
</section>
<section id="sum-and-product-rules-for-the-derivative" class="level2">
<h2 class="anchored" data-anchor-id="sum-and-product-rules-for-the-derivative">Sum and product rules for the derivative</h2>
<p>Using the differential notation – which implicitly ignores higher order terms as they vanish in a limit – the sum and product rules can be derived.</p>
<p>For the sum rule, let <span class="math inline">\(f(x) = g(x) + h(x)\)</span>. Then</p>
<p><span class="math display">\[
\begin{align*}
df &amp;= f(x + dx) - f(x) \\
&amp;= f'(x) dx\\
&amp;= \left(g(x+dx) + h(x+dx)\right) - \left(g(x) + h(x)\right)\\
&amp;= \left(g(x + dx) - g(x)\right) + \left(h(x + dx) - h(x)\right)\\
&amp;= g'(x)dx + h'(x) dx\\
&amp;= \left(g'(x) + h'(x)\right) dx
\end{align*}
\]</span></p>
<p>Comparing we get <span class="math inline">\(f'(x) = g'(x) + h'(x)\)</span>.</p>
<p>The sum rule has the same derivation as was done with univariate, scalar functions. Similarly for the product rule.</p>
<p>The product rule has with <span class="math inline">\(f(x) = g(x)h(x)\)</span></p>
<p><span class="math display">\[
\begin{align*}
df &amp;= f(x + dx) - f(x) \\
&amp;= g(x+dx)h(x + dx) - g(x) h(x)\\
&amp;= \left(g(x) + g'(x)dx\right)\left(h(x) + h'(x) dx\right) - \left(g(x) h(x)\right) \\
&amp;= g(x)h(x) + g'(x) dx h(x) + g(x) h'(x) dx + g'(x)dx h'(x) dx - g(x) h(x)\\
&amp;= gh + dg h + gdh + dg dh - gh\\
    &amp;= dg h + gdh,
\end{align*}
\]</span></p>
<p><strong>after</strong> dropping the higher order term and cancelling <span class="math inline">\(gh\)</span> terms of opposite signs in the fourth row.</p>
<section id="examples-1" class="level5">
<h5 class="anchored" data-anchor-id="examples-1">Examples</h5>
<p>These two rules can be used to show the last two examples:</p>
<p>First, to differentiate <span class="math inline">\(f(x) = x^TAx\)</span>:</p>
<p><span class="math display">\[
\begin{align*}
df &amp;= dx^T (Ax) + x^T d(Ax) \\
&amp;= x^T A^T dx + x^T A dx \\
&amp;= x^T(A^T + A) dx
\end{align*}
\]</span></p>
<p>Again, taking the transpose of the scalar quantity <span class="math inline">\(x^TAdx\)</span> to simplify the expression.</p>
<p>When <span class="math inline">\(A^T = A\)</span> (<span class="math inline">\(A\)</span> is symmetric) this simplifies to a more familiar looking <span class="math inline">\(2x^TA\)</span>, but we see that this requires assumptions not needed in the scalar case.</p>
<p>Next, if <span class="math inline">\(f(x) = Ax\)</span> then</p>
<p><span class="math display">\[
df = (dA)x + A(dx) = 0x + A dx = A dx,
\]</span></p>
<p><span class="math inline">\(A\)</span> being a constant here.</p>
</section>
<section id="example" class="level5">
<h5 class="anchored" data-anchor-id="example">Example</h5>
<p><span class="citation" data-cites="BrightEdelmanJohnson">@BrightEdelmanJohnson</span> consider what in <code>Julia</code> is <code>.*</code>. That is the operation:</p>
<p><span class="math display">\[
v .* w =
\begin{bmatrix}
v_1w_1 \\
v_2w_2 \\
\vdots\\
v_nw_n
\end{bmatrix}
=
\begin{bmatrix}
v_1 &amp; 0 &amp; \cdots &amp; 0 \\
0 &amp; v_2 &amp; \cdots &amp; 0 \\
  &amp;     &amp; \vdots &amp;   \\
0 &amp; 0 &amp; \cdots   &amp; v_n
\end{bmatrix}
\begin{bmatrix}
w_1 \\
w_2 \\
\vdots\\
w_n
\end{bmatrix}
= \text{diag}(v) w.
\]</span></p>
<p>They compute the derivative of <span class="math inline">\(f(x) = A(x .* x)\)</span> for some fixed matrix <span class="math inline">\(A\)</span> of the proper size.</p>
<p>We can see that <span class="math inline">\(d (\text{diag}(v)w) = d(\text{diag}(v)) w + \text{diag}(v) dw = (dx) .* w + x .* dw\)</span>. So</p>
<p><span class="math inline">\(df = A(dx .* x + x .* dx) = 2A(x .* dx)\)</span>, as <span class="math inline">\(.*\)</span> is commutative by its definition. Writing this as <span class="math inline">\(df = 2A(x .* dx) = 2A(\text{diag}(x) dx) = (2A\text{diag}(x)) dx\)</span>, we identify <span class="math inline">\(f'(x) = 2A\text{diag}(x)\)</span>.</p>
<p>This operation is called the <a href="https://en.wikipedia.org/wiki/Hadamard_product_(matrices)">Hadamard product</a> and it extends to matrices and arrays.</p>
</section>
</section>
<section id="the-chain-rule" class="level2">
<h2 class="anchored" data-anchor-id="the-chain-rule">The chain rule</h2>
<p>Like the product rule, the chain rule is shown by <span class="citation" data-cites="BrightEdelmanJohnson">@BrightEdelmanJohnson</span> in this notation with <span class="math inline">\(f(x) = g(h(x))\)</span>:</p>
<p><span class="math display">\[
\begin{align*}
df &amp;= f(x + dx) - f(x)\\
&amp;= g(h(x + dx)) - g(h(x))\\
&amp;= g(h(x) + h'(x)[dx]) - g(h(x))\\
&amp;= g'(h(x)) [h'(x) [dx]]\\
&amp;= (g'(h(x)) h'(x)) [dx]
\end{align*}
\]</span></p>
<p>(The limit requires a bit more detail.)</p>
<p>The operator <span class="math inline">\(f'(x)= g'(h(x)) h'(x)\)</span> is a product of matrices.</p>
<section id="computational-differences-with-expressions-from-the-chain-rule" class="level3">
<h3 class="anchored" data-anchor-id="computational-differences-with-expressions-from-the-chain-rule">Computational differences with expressions from the chain rule</h3>
<p>Of note here is the application of the chain rule to three (or more compositions):</p>
<p>The derivative of <span class="math inline">\(f(x) = a(x) b(x) c(x)\)</span> can be expressed as</p>
<p><span class="math display">\[
f' = (a'b')c'  \text{ or } f' = a'(b'c')
\]</span></p>
<p>Multiplying left to right (the first) is called reverse mode; multiplying right to left (the second) is called forward mode. The distinction becomes important when considering the computational cost of the multiplications.</p>
<ul>
<li>If <span class="math inline">\(f: R^n \rightarrow R^m\)</span> has <span class="math inline">\(n\)</span> much bigger than <span class="math inline">\(1\)</span> and <span class="math inline">\(m=1\)</span>, then it is much faster to do left to right multiplication</li>
<li>if <span class="math inline">\(f:R^n \rightarrow R^m\)</span> has <span class="math inline">\(n=1\)</span> and <span class="math inline">\(m\)</span> much bigger than one, the it is faster to do right to left multiplication.</li>
</ul>
<p>The basic idea comes down to the shape of the matrices. When <span class="math inline">\(m=1\)</span>, the derviative is a product of matrices of size <span class="math inline">\(n\times j\)</span> <span class="math inline">\(j\times k\)</span> and <span class="math inline">\(k \times 1\)</span> yielding a matrix of size <span class="math inline">\(n \times 1\)</span> matching the function dimension. Matrix multiplication of an <span class="math inline">\(m \times q\)</span> times <span class="math inline">\(q \times n\)</span> takes an order of <span class="math inline">\(mqn\)</span> operations. The multiplication of left to right is then</p>
<p>The first operation takes <span class="math inline">\(njk\)</span> operation leaving an <span class="math inline">\(n\times k\)</span> matrix, the next multiplication then takes another <span class="math inline">\(nk1\)</span> operations or <span class="math inline">\(njk + nk\)</span> together. Whereas computing from the right to left is first <span class="math inline">\(jk1\)</span> operations leaving a <span class="math inline">\(j \times 1\)</span> matrix. The next operation would take another <span class="math inline">\(nk1\)</span> operations. In totalL</p>
<ul>
<li>left to right is <span class="math inline">\(njk + nk\)</span> = <span class="math inline">\(nk \cdot (1 + j)\)</span>.</li>
<li>right to left is <span class="math inline">\(jk + j = j\cdot (k+1)\)</span>.</li>
</ul>
<p>When <span class="math inline">\(j=k\)</span>, say, we can compare and see the second is a factor less in terms of operations. This can be quite significant in higher dimensions, whereas the dimensions of calculus (where <span class="math inline">\(n\)</span> and <span class="math inline">\(m\)</span> are <span class="math inline">\(3\)</span> or less) it is not an issue.</p>
<section id="example-1" class="level5">
<h5 class="anchored" data-anchor-id="example-1">Example</h5>
<p>Using the <code>BenchmarkTools</code> package, we can check the time to compute various products:</p>
<div id="6f49ad74" class="cell" data-execution_count="4">
<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="im">using</span> <span class="bu">BenchmarkTools</span></span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a>n,j,k,m <span class="op">=</span> <span class="fl">20</span>,<span class="fl">15</span>,<span class="fl">10</span>,<span class="fl">1</span></span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="pp">@btime</span> <span class="fu">A*</span>(B<span class="op">*</span>C) setup<span class="op">=</span>(A<span class="op">=</span><span class="fu">rand</span>(n,j);B<span class="op">=</span><span class="fu">rand</span>(j,k); C<span class="op">=</span><span class="fu">rand</span>(k,m));</span>
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="pp">@btime</span> (A<span class="op">*</span>B)<span class="op">*</span>C setup<span class="op">=</span>(A<span class="op">=</span><span class="fu">rand</span>(n,j);B<span class="op">=</span><span class="fu">rand</span>(j,k); C<span class="op">=</span><span class="fu">rand</span>(k,m));</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>  420.814 ns (4 allocations: 432 bytes)
  702.678 ns (4 allocations: 1.88 KiB)</code></pre>
</div>
</div>
<p>The latter computation is about 1.5 times slower.</p>
<p>Whereas the relationship is changed when the first matrix is skinny and the last is not:</p>
<div id="818fc075" class="cell" data-execution_count="5">
<div class="sourceCode cell-code" id="cb7"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="pp">@btime</span> <span class="fu">A*</span>(B<span class="op">*</span>C) setup<span class="op">=</span>(A<span class="op">=</span><span class="fu">rand</span>(m,k);B<span class="op">=</span><span class="fu">rand</span>(k,j); C<span class="op">=</span><span class="fu">rand</span>(j,n));</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="pp">@btime</span> (A<span class="op">*</span>B)<span class="op">*</span>C setup<span class="op">=</span>(A<span class="op">=</span><span class="fu">rand</span>(m,k);B<span class="op">=</span><span class="fu">rand</span>(k,j); C<span class="op">=</span><span class="fu">rand</span>(j,n));</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>  901.488 ns (4 allocations: 1.88 KiB)
  623.468 ns (4 allocations: 432 bytes)</code></pre>
</div>
</div>
</section>
<section id="example-2" class="level5">
<h5 class="anchored" data-anchor-id="example-2">Example</h5>
<p>In calculus, we have <span class="math inline">\(n\)</span> and <span class="math inline">\(m\)</span> are <span class="math inline">\(1\)</span>,<span class="math inline">\(2\)</span>,or <span class="math inline">\(3\)</span>. But that need not be the case, especially if differentiation is over a parameter space.</p>
<p>XXXX (Maybe the ariplain wing, but please, something origi</p>
</section>
</section>
</section>
<section id="derivatives-of-matrix-functions" class="level2">
<h2 class="anchored" data-anchor-id="derivatives-of-matrix-functions">Derivatives of matrix functions</h2>
<p>What is the the derivative of <span class="math inline">\(f(A) = A^2\)</span>?</p>
<p>The function <span class="math inline">\(f\)</span> takes a <span class="math inline">\(n\times n\)</span> matrix and returns a matrix of the same size. This innocuous question isn’t directly handled by the Jacobian, which is defined for vector valued function <span class="math inline">\(f:R^n \rightarrow R^m\)</span>.</p>
<p>This derivative can be derived directly from the <em>product rule</em>:</p>
<p><span class="math display">\[
\begin{align*}
f(A) &amp;= [AA]'\\
&amp;= A dA + dA A
\end{align*}
\]</span></p>
<p>That is <span class="math inline">\(f'(A)\)</span> is the operator <span class="math inline">\(f'(A)[\delta A] = A \delta A + \delta A A\)</span> and not <span class="math inline">\(2A\delta A\)</span>, as <span class="math inline">\(A\)</span> may not commute with <span class="math inline">\(\delta A\)</span>.</p>
<section id="vectorization-of-a-matrix" class="level3">
<h3 class="anchored" data-anchor-id="vectorization-of-a-matrix">Vectorization of a matrix</h3>
<p>Alternatively, we can identify <span class="math inline">\(A\)</span> through its components, as a vector in <span class="math inline">\(R^{n^2}\)</span> and then leverage the Jacobian.</p>
<p>One such identification is vectorization – consecutively stacking the column vectors into a vector. In <code>Julia</code> the <code>vec</code> function does this operation:</p>
<div id="ca0ee93f" class="cell" data-execution_count="6">
<div class="sourceCode cell-code" id="cb9"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="pp">@syms</span> A[<span class="fl">1</span><span class="op">:</span><span class="fl">2</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">2</span>]</span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="fu">vec</span>(A)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-display cell-output-markdown" data-execution_count="41">
<p><span class="math inline">\(\left[\begin{smallmatrix}A₁_{₁}\\A₂_{₁}\\A₁_{₂}\\A₂_{₂}\end{smallmatrix}\right]\)</span></p>
</div>
</div>
<p>The stacking by column follows how <code>Julia</code> stores matrices and how <code>Julia</code> references a matrices entries by linear index:</p>
<div id="f39f1774" class="cell" data-execution_count="7">
<div class="sourceCode cell-code" id="cb10"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="fu">vec</span>(A) <span class="op">==</span> [A[i] for i <span class="kw">in</span> <span class="fu">eachindex</span>(A)]</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-display" data-execution_count="42">
<pre><code>true</code></pre>
</div>
</div>
<p>With this vectorization operation, <span class="math inline">\(f\)</span> may be viewed as <span class="math inline">\(\tilde{f}:R^{n^2} \rightarrow R^{n^2}\)</span> through:</p>
<p><span class="math display">\[
\tilde{f}(\text{vec}(A)) = \text{vec}(f(A))
\]</span></p>
<p>We use <code>SymPy</code> to compute the Jacobian of this vector valued function.</p>
<div id="6d85eb25" class="cell" data-execution_count="8">
<div class="sourceCode cell-code" id="cb12"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="pp">@syms</span> A[<span class="fl">1</span><span class="op">:</span><span class="fl">3</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">3</span>]<span class="op">::</span><span class="dt">real</span></span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="fu">f</span>(x) <span class="op">=</span> x<span class="op">^</span><span class="fl">2</span></span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a>J <span class="op">=</span> <span class="fu">vec</span>(<span class="fu">f</span>(A)).<span class="fu">jacobian</span>(<span class="fu">vec</span>(A)) <span class="co"># jacobian of f̃</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-display cell-output-markdown" data-execution_count="43">
<p><span class="math inline">\(\left[\begin{smallmatrix}2 A₁_{₁} &amp; A₁_{₂} &amp; A₁_{₃} &amp; A₂_{₁} &amp; 0 &amp; 0 &amp; A₃_{₁} &amp; 0 &amp; 0\\A₂_{₁} &amp; A₁_{₁} + A₂_{₂} &amp; A₂_{₃} &amp; 0 &amp; A₂_{₁} &amp; 0 &amp; 0 &amp; A₃_{₁} &amp; 0\\A₃_{₁} &amp; A₃_{₂} &amp; A₁_{₁} + A₃_{₃} &amp; 0 &amp; 0 &amp; A₂_{₁} &amp; 0 &amp; 0 &amp; A₃_{₁}\\A₁_{₂} &amp; 0 &amp; 0 &amp; A₁_{₁} + A₂_{₂} &amp; A₁_{₂} &amp; A₁_{₃} &amp; A₃_{₂} &amp; 0 &amp; 0\\0 &amp; A₁_{₂} &amp; 0 &amp; A₂_{₁} &amp; 2 A₂_{₂} &amp; A₂_{₃} &amp; 0 &amp; A₃_{₂} &amp; 0\\0 &amp; 0 &amp; A₁_{₂} &amp; A₃_{₁} &amp; A₃_{₂} &amp; A₂_{₂} + A₃_{₃} &amp; 0 &amp; 0 &amp; A₃_{₂}\\A₁_{₃} &amp; 0 &amp; 0 &amp; A₂_{₃} &amp; 0 &amp; 0 &amp; A₁_{₁} + A₃_{₃} &amp; A₁_{₂} &amp; A₁_{₃}\\0 &amp; A₁_{₃} &amp; 0 &amp; 0 &amp; A₂_{₃} &amp; 0 &amp; A₂_{₁} &amp; A₂_{₂} + A₃_{₃} &amp; A₂_{₃}\\0 &amp; 0 &amp; A₁_{₃} &amp; 0 &amp; 0 &amp; A₂_{₃} &amp; A₃_{₁} &amp; A₃_{₂} &amp; 2 A₃_{₃}\end{smallmatrix}\right]\)</span></p>
</div>
</div>
<p>We do this via linear algebra first, then see a more elegant manner following the notes.</p>
<p>A basic course in linear algebra shows that any linear operator on a finite vector space can be represented as a matrix. The basic idea is to represent what the operator does to each <em>basis</em> element and put these values as columns of the matrix.</p>
<p>In this <span class="math inline">\(3 \times 3\)</span> case, the linear operator works on an object with <span class="math inline">\(9\)</span> slots and returns an object with <span class="math inline">\(9\)</span> slots, so the matrix will be <span class="math inline">\(9 \times 9\)</span>.</p>
<p>The basis elements are simply the matrices with a <span class="math inline">\(1\)</span> in spot <span class="math inline">\((i,j)\)</span> and zero elsewhere. Here we generate them through a function:</p>
<div id="ff65c6df" class="cell" data-execution_count="9">
<div class="sourceCode cell-code" id="cb13"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="fu">basis</span>(i,j,A) <span class="op">=</span> (b<span class="op">=</span><span class="fu">zeros</span>(<span class="dt">Int</span>, <span class="fu">size</span>(A)<span class="op">...</span>); b[i,j] <span class="op">=</span> <span class="fl">1</span>; b)</span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a>JJ <span class="op">=</span> [<span class="fu">vec</span>(<span class="fu">basis</span>(i,j,A)<span class="op">*</span>A <span class="op">+</span> <span class="fu">A*basis</span>(i,j,A)) for  j <span class="kw">in</span> <span class="fl">1</span><span class="op">:</span><span class="fl">3</span> for i <span class="kw">in</span> <span class="fl">1</span><span class="op">:</span><span class="fl">3</span>]</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-display" data-execution_count="44">
<pre><code>9-element Vector{Vector{Sym{PyCall.PyObject}}}:
 [2*A₁_₁, A₂_₁, A₃_₁, A₁_₂, 0, 0, A₁_₃, 0, 0]
 [A₁_₂, A₁_₁ + A₂_₂, A₃_₂, 0, A₁_₂, 0, 0, A₁_₃, 0]
 [A₁_₃, A₂_₃, A₁_₁ + A₃_₃, 0, 0, A₁_₂, 0, 0, A₁_₃]
 [A₂_₁, 0, 0, A₁_₁ + A₂_₂, A₂_₁, A₃_₁, A₂_₃, 0, 0]
 [0, A₂_₁, 0, A₁_₂, 2*A₂_₂, A₃_₂, 0, A₂_₃, 0]
 [0, 0, A₂_₁, A₁_₃, A₂_₃, A₂_₂ + A₃_₃, 0, 0, A₂_₃]
 [A₃_₁, 0, 0, A₃_₂, 0, 0, A₁_₁ + A₃_₃, A₂_₁, A₃_₁]
 [0, A₃_₁, 0, 0, A₃_₂, 0, A₁_₂, A₂_₂ + A₃_₃, A₃_₂]
 [0, 0, A₃_₁, 0, 0, A₃_₂, A₁_₃, A₂_₃, 2*A₃_₃]</code></pre>
</div>
</div>
<p>The elements of <code>JJ</code> show the representation of each of the <span class="math inline">\(9\)</span> basis elements under the linear transformation.</p>
<p>To construct the matrix representing the linear operator, we need to concatenate these horizontally as column vectors</p>
<div id="1b5dd766" class="cell" data-execution_count="10">
<div class="sourceCode cell-code" id="cb15"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a>JJ <span class="op">=</span> <span class="fu">hcat</span>(JJ<span class="op">...</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-display cell-output-markdown" data-execution_count="45">
<p><span class="math inline">\(\left[\begin{smallmatrix}2 A₁_{₁} &amp; A₁_{₂} &amp; A₁_{₃} &amp; A₂_{₁} &amp; 0 &amp; 0 &amp; A₃_{₁} &amp; 0 &amp; 0\\A₂_{₁} &amp; A₁_{₁} + A₂_{₂} &amp; A₂_{₃} &amp; 0 &amp; A₂_{₁} &amp; 0 &amp; 0 &amp; A₃_{₁} &amp; 0\\A₃_{₁} &amp; A₃_{₂} &amp; A₁_{₁} + A₃_{₃} &amp; 0 &amp; 0 &amp; A₂_{₁} &amp; 0 &amp; 0 &amp; A₃_{₁}\\A₁_{₂} &amp; 0 &amp; 0 &amp; A₁_{₁} + A₂_{₂} &amp; A₁_{₂} &amp; A₁_{₃} &amp; A₃_{₂} &amp; 0 &amp; 0\\0 &amp; A₁_{₂} &amp; 0 &amp; A₂_{₁} &amp; 2 A₂_{₂} &amp; A₂_{₃} &amp; 0 &amp; A₃_{₂} &amp; 0\\0 &amp; 0 &amp; A₁_{₂} &amp; A₃_{₁} &amp; A₃_{₂} &amp; A₂_{₂} + A₃_{₃} &amp; 0 &amp; 0 &amp; A₃_{₂}\\A₁_{₃} &amp; 0 &amp; 0 &amp; A₂_{₃} &amp; 0 &amp; 0 &amp; A₁_{₁} + A₃_{₃} &amp; A₁_{₂} &amp; A₁_{₃}\\0 &amp; A₁_{₃} &amp; 0 &amp; 0 &amp; A₂_{₃} &amp; 0 &amp; A₂_{₁} &amp; A₂_{₂} + A₃_{₃} &amp; A₂_{₃}\\0 &amp; 0 &amp; A₁_{₃} &amp; 0 &amp; 0 &amp; A₂_{₃} &amp; A₃_{₁} &amp; A₃_{₂} &amp; 2 A₃_{₃}\end{smallmatrix}\right]\)</span></p>
</div>
</div>
<p>The matrix <span class="math inline">\(JJ\)</span> is identical to <span class="math inline">\(J\)</span>, above:</p>
<div id="ca842927" class="cell" data-execution_count="11">
<div class="sourceCode cell-code" id="cb16"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a><span class="fu">all</span>(j <span class="op">==</span> jj <span class="cf">for</span> (j, jj) <span class="kw">in</span> <span class="fu">zip</span>(J, JJ))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-display" data-execution_count="46">
<pre><code>true</code></pre>
</div>
</div>
</section>
<section id="kronecker-products" class="level3">
<h3 class="anchored" data-anchor-id="kronecker-products">Kronecker products</h3>
<p>But how can we see the Jacobian, <span class="math inline">\(J\)</span>, from the linear operator <span class="math inline">\(f'(A)[\delta A] = \delta A A + A \delta A\)</span>?</p>
<p>To make this less magical, a related operation to <code>vec</code> is defined.</p>
<p>The <span class="math inline">\(\text{vec}\)</span> function takes a matrix and stacks its columns.</p>
<p>The <span class="math inline">\(\text{vec}\)</span> function can turn a matrix into a vector, so it can be used for finding the Jacobian, as above. However the shape of the matrix is lost, as are the fundamental matrix operations, like multiplication.</p>
<p>The <a href="https://en.wikipedia.org/wiki/Kronecker_product">Kronecker product</a> replicates values making a bigger matrix. That is, if <span class="math inline">\(A\)</span> and <span class="math inline">\(B\)</span> are matrices, the Kronecker product replaces each value in <span class="math inline">\(A\)</span> with that value times <span class="math inline">\(B\)</span>, making a bigger matrix, as each entry in <span class="math inline">\(A\)</span> is replaced by an entry with size <span class="math inline">\(B\)</span>.</p>
<p>Formally,</p>
<p><span class="math display">\[
A \otimes B =
\begin{bmatrix}
a_{11}B &amp; a_{12}B &amp; \cdots &amp; a_{1n}B \\
a_{21}B &amp; a_{22}B &amp; \cdots &amp; a_{2n}B \\
        &amp;\vdots   &amp;        &amp;         \\
a_{m1}B &amp; a_{m2}B &amp; \cdots &amp; a_{mn}B
\end{bmatrix}
\]</span></p>
<p>The function <code>kron</code> forms this product:</p>
<div id="c562b2e6" class="cell" data-execution_count="12">
<div class="sourceCode cell-code" id="cb18"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="pp">@syms</span> A[<span class="fl">1</span><span class="op">:</span><span class="fl">2</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">3</span>] B[<span class="fl">1</span><span class="op">:</span><span class="fl">3</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">4</span>]</span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a><span class="fu">kron</span>(A, B) <span class="co"># same as hcat((vcat((A[i,j]*B for i in 1:2)...) for j in 1:3)...)</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-display cell-output-markdown" data-execution_count="47">
<p><span class="math inline">\(\left[\begin{smallmatrix}A₁_{₁} B₁_{₁} &amp; A₁_{₁} B₁_{₂} &amp; A₁_{₁} B₁_{₃} &amp; A₁_{₁} B₁_{₄} &amp; A₁_{₂} B₁_{₁} &amp; A₁_{₂} B₁_{₂} &amp; A₁_{₂} B₁_{₃} &amp; A₁_{₂} B₁_{₄} &amp; A₁_{₃} B₁_{₁} &amp; A₁_{₃} B₁_{₂} &amp; A₁_{₃} B₁_{₃} &amp; A₁_{₃} B₁_{₄}\\A₁_{₁} B₂_{₁} &amp; A₁_{₁} B₂_{₂} &amp; A₁_{₁} B₂_{₃} &amp; A₁_{₁} B₂_{₄} &amp; A₁_{₂} B₂_{₁} &amp; A₁_{₂} B₂_{₂} &amp; A₁_{₂} B₂_{₃} &amp; A₁_{₂} B₂_{₄} &amp; A₁_{₃} B₂_{₁} &amp; A₁_{₃} B₂_{₂} &amp; A₁_{₃} B₂_{₃} &amp; A₁_{₃} B₂_{₄}\\A₁_{₁} B₃_{₁} &amp; A₁_{₁} B₃_{₂} &amp; A₁_{₁} B₃_{₃} &amp; A₁_{₁} B₃_{₄} &amp; A₁_{₂} B₃_{₁} &amp; A₁_{₂} B₃_{₂} &amp; A₁_{₂} B₃_{₃} &amp; A₁_{₂} B₃_{₄} &amp; A₁_{₃} B₃_{₁} &amp; A₁_{₃} B₃_{₂} &amp; A₁_{₃} B₃_{₃} &amp; A₁_{₃} B₃_{₄}\\A₂_{₁} B₁_{₁} &amp; A₂_{₁} B₁_{₂} &amp; A₂_{₁} B₁_{₃} &amp; A₂_{₁} B₁_{₄} &amp; A₂_{₂} B₁_{₁} &amp; A₂_{₂} B₁_{₂} &amp; A₂_{₂} B₁_{₃} &amp; A₂_{₂} B₁_{₄} &amp; A₂_{₃} B₁_{₁} &amp; A₂_{₃} B₁_{₂} &amp; A₂_{₃} B₁_{₃} &amp; A₂_{₃} B₁_{₄}\\A₂_{₁} B₂_{₁} &amp; A₂_{₁} B₂_{₂} &amp; A₂_{₁} B₂_{₃} &amp; A₂_{₁} B₂_{₄} &amp; A₂_{₂} B₂_{₁} &amp; A₂_{₂} B₂_{₂} &amp; A₂_{₂} B₂_{₃} &amp; A₂_{₂} B₂_{₄} &amp; A₂_{₃} B₂_{₁} &amp; A₂_{₃} B₂_{₂} &amp; A₂_{₃} B₂_{₃} &amp; A₂_{₃} B₂_{₄}\\A₂_{₁} B₃_{₁} &amp; A₂_{₁} B₃_{₂} &amp; A₂_{₁} B₃_{₃} &amp; A₂_{₁} B₃_{₄} &amp; A₂_{₂} B₃_{₁} &amp; A₂_{₂} B₃_{₂} &amp; A₂_{₂} B₃_{₃} &amp; A₂_{₂} B₃_{₄} &amp; A₂_{₃} B₃_{₁} &amp; A₂_{₃} B₃_{₂} &amp; A₂_{₃} B₃_{₃} &amp; A₂_{₃} B₃_{₄}\end{smallmatrix}\right]\)</span></p>
</div>
</div>
<p>The <span class="math inline">\(m\times n\)</span> matrix <span class="math inline">\(A\)</span> and <span class="math inline">\(j \times k\)</span> matrix <span class="math inline">\(B\)</span> has a Kronecker product with size <span class="math inline">\(mj \times nk\)</span>.</p>
<p>The Kronecker product has a certain algebra, including:</p>
<ul>
<li>transposes: <span class="math inline">\((A \otimes B)^T) = A^T \otimes B^T\)</span></li>
<li>multiplication: <span class="math inline">\((A\otimes B)(C \otimes D) = (AC) \otimes (BD)\)</span></li>
<li>inverses: <span class="math inline">\((A \otimes B)^{-1} = (A^{-1}) \otimes (B^{-1})\)</span></li>
<li>orthogonal: <span class="math inline">\((A\otimes B)^T = (A\otimes B)\)</span> if both <span class="math inline">\(A\)</span> and <span class="math inline">\(B\)</span> has the same property</li>
<li>determinants: <span class="math inline">\(\det(A\otimes B) = \det(A)^m \det(B)^n\)</span>, where <span class="math inline">\(A\)</span> is <span class="math inline">\(n\times n\)</span>, <span class="math inline">\(B\)</span> is <span class="math inline">\(m \times m\)</span>.</li>
<li>trace (sum of diagonal): <span class="math inline">\(\text{tr}(A \otimes B) = \text{tr}(A)\text{tr}(B)\)</span>.</li>
</ul>
<p>The main equation coupling <code>vec</code> and <code>kron</code> is the fact that if <span class="math inline">\(A\)</span>, <span class="math inline">\(B\)</span>, and <span class="math inline">\(C\)</span> have appropriate sizes, then:</p>
<p><span class="math display">\[
(A \otimes B) \text{vec}(C) = \text{vec}(B C A^T).
\]</span></p>
<p>Appropriate sizes for <span class="math inline">\(A\)</span>, <span class="math inline">\(B\)</span>, and <span class="math inline">\(C\)</span> are determined by the various products in <span class="math inline">\(BCA^T\)</span>.</p>
<p>If <span class="math inline">\(A\)</span> is <span class="math inline">\(m \times n\)</span> and <span class="math inline">\(B\)</span> is <span class="math inline">\(r \times s\)</span>, then since <span class="math inline">\(BC\)</span> is defined, <span class="math inline">\(C\)</span> has <span class="math inline">\(s\)</span> rows, and since <span class="math inline">\(CA^T\)</span> is defined, <span class="math inline">\(C\)</span> must have <span class="math inline">\(n\)</span> columns, as <span class="math inline">\(A^T\)</span> is <span class="math inline">\(n \times m\)</span>, so <span class="math inline">\(C\)</span> must be <span class="math inline">\(s\times n\)</span>. Checking this is correct on the other side, <span class="math inline">\(A \times B\)</span> would be size <span class="math inline">\(mr \times ns\)</span> and <span class="math inline">\(\vec{C}\)</span> would be size <span class="math inline">\(sn\)</span>, so that product works, size wise.</p>
<p>The referred to notes have an explanation for this formula, but we confirm with an example with <span class="math inline">\(m=n-2\)</span>, <span class="math inline">\(r=s=3\)</span>:</p>
<div id="d9cdbb04" class="cell" data-execution_count="13">
<div class="sourceCode cell-code" id="cb19"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="pp">@syms</span> A[<span class="fl">1</span><span class="op">:</span><span class="fl">2</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">2</span>]<span class="op">::</span><span class="dt">real </span>B[<span class="fl">1</span><span class="op">:</span><span class="fl">3</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">3</span>]<span class="op">::</span><span class="dt">real </span>C[<span class="fl">1</span><span class="op">:</span><span class="fl">3</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">2</span>]<span class="op">::</span><span class="dt">real</span></span>
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a>L, R <span class="op">=</span> <span class="fu">kron</span>(A,B)<span class="fu">*vec</span>(C),  <span class="fu">vec</span>(B<span class="op">*</span>C<span class="op">*</span>A<span class="op">'</span>)</span>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a><span class="fu">all</span>(l <span class="op">==</span> r <span class="cf">for</span> (l, r) <span class="op">∈</span> <span class="fu">zip</span>(L, R))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-display" data-execution_count="48">
<pre><code>true</code></pre>
</div>
</div>
<hr>
<p>Now to use this relationship to recognize <span class="math inline">\(df = A dA + dA A\)</span> with the Jacobian computed from <span class="math inline">\(\text{vec}{f(a)}\)</span>.</p>
<p>We have <span class="math inline">\(\text{vec}(A dA + dA A) = \text{vec}(A dA) + \text{vec}(dA A)\)</span>, by obvious linearity of <span class="math inline">\(\text{vec}\)</span>. Now inserting an identity matrix, <span class="math inline">\(I\)</span>, which is symmteric, we have:</p>
<p><span class="math display">\[
\text{vec}(A dA) = \text{vec}(A dA I^T) = (I \otimes A) \text{vec}(dA),
\]</span></p>
<p>and</p>
<p><span class="math display">\[
\text{vec}(dA A) = \text{vec}(I dA (A^T)^T) = (A^T \otimes I) \text{vec}(dA)
\]</span></p>
<p>This leaves</p>
<p><span class="math display">\[
\text{vec}(A dA + dA A) =
\left((I \otimes A) + (A^T \otimes I)\right) \text{vec}(dA)
\]</span></p>
<p>We should then get the Jacobian we computed from the following:</p>
<div id="67dde440" class="cell" data-execution_count="14">
<div class="sourceCode cell-code" id="cb21"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a><span class="pp">@syms</span> A[<span class="fl">1</span><span class="op">:</span><span class="fl">3</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">3</span>]<span class="op">::</span><span class="dt">real</span></span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a><span class="im">using</span> <span class="bu">LinearAlgebra</span>: I</span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a>J <span class="op">=</span> <span class="fu">vec</span>(A<span class="op">^</span><span class="fl">2</span>).<span class="fu">jacobian</span>(<span class="fu">vec</span>(A))</span>
<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a>JJ <span class="op">=</span> <span class="fu">kron</span>(<span class="fu">I</span>(<span class="fl">3</span>), A) <span class="op">+</span> <span class="fu">kron</span>(A<span class="op">'</span>, <span class="fu">I</span>(<span class="fl">3</span>))</span>
<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a><span class="fu">all</span>(j <span class="op">==</span> jj <span class="cf">for</span> (j,jj) <span class="kw">in</span> <span class="fu">zip</span>(J,JJ))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-display" data-execution_count="49">
<pre><code>true</code></pre>
</div>
</div>
<p>This technique can also be used with other powers, say <span class="math inline">\(f(A) = A^3\)</span>, where the resulting <span class="math inline">\(df = A^2 dA + A dA A + dA A^2\)</span> is one answer that can be compared to a Jacobian through</p>
<p><span class="math display">\[
\begin{align*}
df &amp;= \text{vec}(A^2 dA I^T) + \text{vec}(A dA A) + \text{vec}(I dA A^2)\\
&amp;= (I \otimes A^2)\text{vec}(dA) + (A^T \otimes A) \text{vec}(dA) + ((A^T)^2 \otimes I) \text{vec}(dA)
\end{align*}
\]</span></p>
<p>The above shows how to relate the derivative of a matrix function to the Jacobian of a vectorized function, but only for illustration. It is decidely not necessary to express the derivative of <span class="math inline">\(f\)</span> in terms of the derivative of its vectorized counterpart.</p>
<section id="example-derivative-of-the-inverse" class="level5">
<h5 class="anchored" data-anchor-id="example-derivative-of-the-inverse">Example: derivative of the inverse</h5>
<p>What is the derivative of <span class="math inline">\(f(A) = A^{-1}\)</span>. When <span class="math inline">\(A\)</span> is a scalar, we related it to the reciprocal of the derivative of <span class="math inline">\(f\)</span> at some other point. The same technique is available. Starting with <span class="math inline">\(I = AA^{-1}\)</span> and noting <span class="math inline">\(dI\)</span> is <span class="math inline">\(0\)</span> we have</p>
<p><span class="math display">\[
\begin{align*}
0 &amp;= d(AA^{-1})\\
&amp;= dAA^{-1} + A d(A^{-1})
\end{align*}
\]</span></p>
<p>So, <span class="math inline">\(d(A^{-1}) = -A^{-1} dA A^{-1}\)</span>.</p>
<p>This could be re-expressed as a linear operator through</p>
<p><span class="math display">\[
\text{vec}(dA^{-1}) =
\left((A^{-1})^T \otimes A^{-1}\right) \text{vec}(dA)
= \left((A^T)^{-1} \otimes A^{-1}\right) \text{vec}(dA).
\]</span></p>
</section>
<section id="example-derivative-of-the-determinant" class="level5">
<h5 class="anchored" data-anchor-id="example-derivative-of-the-determinant">Example: derivative of the determinant</h5>
<p>Let <span class="math inline">\(f(A) = \text{det}(A)\)</span>. What is the derivative?</p>
<p>First, the determinant of a square, <span class="math inline">\(n\times n\)</span>, matrix <span class="math inline">\(A\)</span> is a scalar summary of <span class="math inline">\(A\)</span> with different means to compute it, but one recursive one in particular is helpful here:</p>
<p><span class="math display">\[
\text{det}(A) =  a_{1j}C_{1j} + a_{2j}C_{2j} + \cdots a_{nj}C_{nj}
\]</span></p>
<p>for any <span class="math inline">\(j\)</span>. The <em>cofactor</em> <span class="math inline">\(C_{ij}\)</span> is the determinant of the <span class="math inline">\((n-1)\times(n-1)\)</span> matrix with the <span class="math inline">\(i\)</span>th row and <span class="math inline">\(j\)</span>th column deleted times <span class="math inline">\((-1)^{i+j}\)</span>.</p>
<p>To find the <em>gradient</em> of <span class="math inline">\(f\)</span>, we differentiate by each of the <span class="math inline">\(A_{ij}\)</span> variables, and so</p>
<p><span class="math display">\[
\frac{\partial\text{det}(A)}{\partial A_{ij}} =
\frac{\partial (a_{1j}C_{1j} + a_{2j}C_{2j} + \cdots a_{nj}C_{nj})}{\partial A_{ij}} =
C_{ij},
\]</span></p>
<p>as each cofactor in the expansion has no dependence on <span class="math inline">\(A_{ij}\)</span> as the cofactor removes the <span class="math inline">\(i\)</span>th row and <span class="math inline">\(j\)</span>th column.</p>
<p>So the gradient is the matrix of cofactors.</p>
<p><span class="citation" data-cites="BrightEdelmanJohnson">@BrightEdelmanJohnson</span> also give a different proof, starting with this observation</p>
<p><span class="math display">\[
\text{det}(I + dA) - \text{det}(I) = \text{tr}(dA)
\]</span></p>
<p>Assuming that, then by the fact <span class="math inline">\(\text{det}(AB) = \text{det}(A)\text{det}(B)\)</span>:</p>
<p><span class="math display">\[
\begin{align*}
\text{det}(A + A(A^{-1}dA)) - \text{det}(A) &amp;= \text{det}(A)\cdot(\text{det}(I+ A^{-1}dA) - \text{det}(I)) \\
&amp;= \text{det}(A) \text{tr}(A^{-1}dA)\\
&amp;= \text{tr}(\text{det}(A)A^{-1}dA)\\
\end{align*}
\]</span></p>
<p>This agrees through a formula to compute the inverse of a matrix through its cofactor matrix divided by its determinant.</p>
<p>That the trace gets involved, can be seen from this computation, which shows the only first-order terms are from the diagonal sum:</p>
<div id="f7627a0b" class="cell" data-execution_count="15">
<div class="sourceCode cell-code" id="cb23"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a><span class="im">using</span> <span class="bu">LinearAlgebra</span></span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a><span class="pp">@syms</span> dA[<span class="fl">1</span><span class="op">:</span><span class="fl">2</span>, <span class="fl">1</span><span class="op">:</span><span class="fl">2</span>]</span>
<span id="cb23-3"><a href="#cb23-3" aria-hidden="true" tabindex="-1"></a><span class="fu">det</span>(I <span class="op">+</span> dA) <span class="op">-</span> <span class="fu">det</span>(I)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-display cell-output-markdown" data-execution_count="50">
<p><span class="math inline">\(dA₁_{₁} dA₂_{₂} + dA₁_{₁} - dA₁_{₂} dA₂_{₁} + dA₂_{₂}\)</span></p>
</div>
</div>
</section>
</section>
</section>
<section id="the-adjoint-method" class="level2">
<h2 class="anchored" data-anchor-id="the-adjoint-method">The adjoint method</h2>
<p>The chain rule brings about a series of products. The adjoint method illustrated below, shows how to approach the computation of the series in a direction that minimizes the computational cost, illustrating why reverse mode is preferred to forward mode when a scalar function of several variables is considered.</p>
<p><span class="citation" data-cites="BrightEdelmanJohnson">@BrightEdelmanJohnson</span> consider the derivative of</p>
<p><span class="math display">\[
g(p) = f(A(p)^{-1} b)
\]</span></p>
<p>This might arise from applying a scalar-valued <span class="math inline">\(f\)</span> to the solution of <span class="math inline">\(Ax = b\)</span>, where <span class="math inline">\(A\)</span> is parameterized by <span class="math inline">\(p\)</span>.</p>
<p>The chain rule gives the following computation to find the derivative (or gradient):</p>
<p><span class="math display">\[
\begin{align*}
dg
&amp;= f'(x)[dx]\\
&amp;= f'(x) [d(A(p)^{1} b)]\\
&amp;= f'(x)[-A(p)^{-1} dA A(p)^{-1} b + 0]\\
&amp;= -f'(x) A(p)^{-1} dA A(p)^{-1} b.
\end{align*}
\]</span></p>
<p>By writing <span class="math inline">\(dA = A'(p)[dp]\)</span> and setting <span class="math inline">\(v^T = f'(x)A(p)^{-1}\)</span> this becomes</p>
<p><span class="math display">\[
dg = -v^T dA A(p)^{-1} b = -v^T dA x
\]</span></p>
<p>This product of three terms can be computed in two directions:</p>
<p>From left to right:</p>
<p>First <span class="math inline">\(v\)</span> is found by solving <span class="math inline">\(v^T = f'(x) A^{-1}\)</span> through the solving of <span class="math inline">\(v = (A^{-1})^T (f'(x))^T = (A^T)^{-1} \nabla(f)\)</span> or by solving <span class="math inline">\(A^T v = \nabla f\)</span>. This is called the <em>adjoint</em> equation.</p>
<p>The partial derivatives in <span class="math inline">\(g\)</span> is related to each partial derivative of <span class="math inline">\(dA\)</span> through:</p>
<p><span class="math display">\[
\frac{\partial g}{\partial p_k} = -v^T\frac{\partial A}{\partial p_k} x,
\]</span></p>
<p>as the scalar factor commutes through. With <span class="math inline">\(v\)</span> and <span class="math inline">\(x\)</span> solved for (via the adjoint equation and from solving <span class="math inline">\(Ax=b\)</span>) the partials in <span class="math inline">\(p_k\)</span> are computed with dot products. There are just two costly operations.</p>
<p>From right to left:</p>
<p>The value of <span class="math inline">\(x\)</span> can be solved for, as above, but computing the value of</p>
<p><span class="math display">\[
\frac{\partial g}{\partial p_k} =
-f'(x) \left(A^{-1} \frac{\partial A}{\partial p_k} x \right)
\]</span></p>
<p>requires a costly solve for each <span class="math inline">\(p_k\)</span>, and <span class="math inline">\(p\)</span> may have many components. As mentioned above, the reverse mode offers advantages when there are many input parameters (<span class="math inline">\(p\)</span>) and a single output parameter.</p>
<section id="example-3" class="level5">
<h5 class="anchored" data-anchor-id="example-3">Example</h5>
<p>Suppose <span class="math inline">\(x(p)\)</span> solves some system of equations <span class="math inline">\(h(x(p),p) = 0\)</span> in <span class="math inline">\(R^n\)</span> (<span class="math inline">\(n\)</span> possibly just <span class="math inline">\(1\)</span>) and <span class="math inline">\(g(p) = f(x(p))\)</span> is some non-linear transformation of <span class="math inline">\(x\)</span>. What is the derivative of <span class="math inline">\(g\)</span> in <span class="math inline">\(p\)</span>?</p>
<p>Suppose the <em>implicit function theorem</em> applies to <span class="math inline">\(h(x,p) = 0\)</span>, that is – <em>locally</em> – there is an implicitly defined function <span class="math inline">\(x(p)\)</span> with a derivative. Moreover by differentiating both sides it can be identified:</p>
<p><span class="math display">\[
0 = \frac{\partial h}{\partial p} dp + \frac{\partial h}{\partial x} dx
\]</span></p>
<p>which can be solved for <span class="math inline">\(dx\)</span> to give</p>
<p><span class="math display">\[
dx = -\left(\frac{\partial h}{\partial x}\right)^{-1} \frac{\partial h}{\partial p} dp.
\]</span></p>
<p>The chain rule then gives</p>
<p><span class="math display">\[
dg = f'(x) dx = -f'(x) \left(\frac{\partial h}{\partial x}\right)^{-1} \frac{\partial h}{\partial p} dp.
\]</span></p>
<p>This can be computed in two directions:</p>
<p>From left to right:</p>
<p>Call <span class="math inline">\(A =\left(\frac{\partial h}{\partial x}\right)^{-1}\)</span>. Then define <span class="math inline">\(v\)</span> indirectly through <span class="math inline">\(v^T = f'(x) A^{-1}\)</span>. With this: <span class="math inline">\(v = (A^{-1})^T (f'(x))^T = (A^T)^{-1} \nabla{f}\)</span> which is found by solving <span class="math inline">\(A^Tv = \nabla{f}\)</span>. Again, this is the <em>adjoint</em> equation.</p>
<p>The value of <span class="math inline">\(dA\)</span> is related to each partial derivative for which</p>
<p><span class="math display">\[
\frac{\partial g}{\partial p_k} = -v^T\frac{\partial A}{\partial p_k} x,
\]</span></p>
<p>as the scalar factor commutes through. With <span class="math inline">\(v\)</span> and <span class="math inline">\(x\)</span> solved for (via the adjoint equation and from solving <span class="math inline">\(Ax=b\)</span>) the partials in <span class="math inline">\(p_k\)</span> are computed with dot products.</p>
<p>However, from right to left, the value of <span class="math inline">\(x\)</span> can be solved for, but computing the value of</p>
<p><span class="math display">\[
\frac{\partial g}{\partial p_k} =
-f'(x)
\left(A^{-1} \frac{\partial A}{\partial p_k} x \right)
\]</span></p>
<p>requires a costly solve for each <span class="math inline">\(p_k\)</span>, and <span class="math inline">\(p\)</span> may have many components. The reverse mode offers advantages when there are many input parameters (<span class="math inline">\(p\)</span>) and a single output parameter.</p>
</section>
<section id="example-4" class="level5">
<h5 class="anchored" data-anchor-id="example-4">Example</h5>
<p>Suppose <span class="math inline">\(x(p)\)</span> solves some system of equations <span class="math inline">\(h(x(p),p) = 0\)</span> in <span class="math inline">\(R^n\)</span> (<span class="math inline">\(n\)</span> possibly just 1$) and <span class="math inline">\(g(p) = f(x(p))\)</span> is some non-linear transformation of <span class="math inline">\(x\)</span>. What is the derivative of <span class="math inline">\(g\)</span> in <span class="math inline">\(p\)</span>?</p>
<p>Suppose the <em>implicit function theorem</em> applies to <span class="math inline">\(h(x,p) = 0\)</span>, that is <em>locally</em> the response <span class="math inline">\(x(p)\)</span> has a derivative, and moreover by the chain rule</p>
<p><span class="math display">\[
0 = \frac{\partial h}{\partial p} dp + \frac{\partial h}{\partial x} dx.
\]</span></p>
<p>Solving the above for <span class="math inline">\(dx\)</span> gives:</p>
<p><span class="math display">\[
dx = -\left(\frac{\partial h}{\partial x}\right)^{-1} \frac{\partial h}{\partial p} dp.
\]</span></p>
<p>The chain rule applied to <span class="math inline">\(g(p) = f(x(p))\)</span> then yields</p>
<p><span class="math display">\[
dg = f'(x) dx = - f'(x) \left(\frac{\partial h}{\partial x}\right)^{-1} \frac{\partial h}{\partial p} dp.
\]</span></p>
<p>Setting</p>
<p><span class="math display">\[
v^T = -f'(x) \left(\frac{\partial h}{\partial x}\right)^{-1}
\]</span></p>
<p>then <span class="math inline">\(v\)</span> can be solved from taking adjoints (as before). Let <span class="math inline">\(A = \partial h/\partial x\)</span>, the <span class="math inline">\(v^T = -f'(x) A^{-1}\)</span> or <span class="math inline">\(v = -(A^{-1})^T (f'(x))^t= -(A^T)^{-1} \nabla f\)</span>. As before it would take two solves to get both <span class="math inline">\(g\)</span> and its gradient.</p>
</section>
</section>
<section id="second-derivatives-hessian" class="level2">
<h2 class="anchored" data-anchor-id="second-derivatives-hessian">Second derivatives, Hessian</h2>
<p><span class="citation" data-cites="CarlssonNikitinTroedssonWendt">@CarlssonNikitinTroedssonWendt</span></p>
<p>We reference a theorem presented by <a href="https://arxiv.org/pdf/2502.03070v1">Carlsson, Nikitin, Troedsson, and Wendt</a> for exposition with some modification</p>
<div class="callout callout-style-simple callout-note no-icon">
<div class="callout-body d-flex">
<div class="callout-icon-container">
<i class="callout-icon no-icon"></i>
</div>
<div class="callout-body-container">
<p>Theorem 1. Let <span class="math inline">\(f:X \rightarrow Y\)</span>, where <span class="math inline">\(X,Y\)</span> are finite dimensional <em>inner product</em> spaces with elements in <span class="math inline">\(R\)</span>. Suppose <span class="math inline">\(f\)</span> is smooth (a certain number of derivatives). Then for each <span class="math inline">\(x\)</span> in <span class="math inline">\(X\)</span> there exists a unique linear operator, <span class="math inline">\(f'(x)\)</span>, and a unique <em>bilinear</em> <em>symmetric</em> operator <span class="math inline">\(f'': X \oplus X \rightarrow Y\)</span> such that</p>
<p><span class="math display">\[
f(x + \delta x) = f(x) + f'(x)[\delta x] + \frac{1}{2}f''(x)[\delta x, \delta x] + \mathscr(||\delta x ||^2).
\]</span></p>
</div>
</div>
</div>
<p>New terms include <em>bilinear</em>, <em>symmetric</em>, and <em>inner product</em>. An operator (<span class="math inline">\(X\oplus X \rightarrow Y\)</span>) is bilinear if it is a linear operator in each of its two arguments. Such an operator is <em>symmetric</em> if interchanging its two arguments makes no difference in its output. Finally, an <em>inner product</em> space is one with a generalization of the dot product. An inner product takes two vectors <span class="math inline">\(x\)</span> and <span class="math inline">\(y\)</span> and returns a scalar; it is denoted <span class="math inline">\(\langle x,y\rangle\)</span>; and has properties of symmetry, linearity, and non-negativity (<span class="math inline">\(\langle x,x\rangle \geq 0\)</span>, and equal <span class="math inline">\(0\)</span> only if <span class="math inline">\(x\)</span> is the zero vector.) Inner products can be used to form a norm (or length) for a vector through <span class="math inline">\(||x||^2 = \langle x,x\rangle\)</span>.</p>
<p>We reference this, as the values denoted <span class="math inline">\(f'\)</span> and <span class="math inline">\(f''\)</span> are <em>unique</em>. So if we identify them one way, we have identified them.</p>
<p>Specializing to <span class="math inline">\(X=R^n\)</span> and <span class="math inline">\(Y=R^1\)</span>, we have, <span class="math inline">\(f'=\nabla f^T\)</span> and <span class="math inline">\(f''\)</span> is the Hessian.</p>
<p>Take <span class="math inline">\(n=2\)</span>. Previously we wrote a formula for Taylor’s theorem for <span class="math inline">\(f:R^n \rightarrow R\)</span> that with <span class="math inline">\(n=2\)</span> has with <span class="math inline">\(x=\langle x_1,x_2\rangle\)</span>:</p>
<p><span class="math display">\[
\begin{align*}
f(x + dx) &amp;= f(x) +
\frac{\partial f}{\partial x_1} dx_1 + \frac{\partial f}{\partial x_2} dx_2\\
&amp;+  \frac{1}{2}\left(
\frac{\partial^2 f}{\partial x_1^2}dx_1^2 +
\frac{\partial^2 f}{\partial x_1 \partial x_2}dx_1dx_2 +
\frac{\partial^2 f}{\partial x_2^2}dx_2^2
\right) + \mathscr{o}(dx).
\end{align*}
\]</span></p>
<p>We can see <span class="math inline">\(\nabla{f} \cdot dx = f'(x) dx\)</span> to tidy up part of the first line, and more over the second line can be seen to be a matrix product:</p>
<p><span class="math display">\[
[dx_1 dx_2]
\begin{bmatrix}
\frac{\partial^2 f}{\partial x_1^2} &amp;
\frac{\partial^2 f}{\partial x_1 \partial x_2}\\
\frac{\partial^2 f}{\partial x_2 \partial x_1} &amp;
\frac{\partial^2 f}{\partial x_2^2}
\end{bmatrix}
\begin{bmatrix}
dx_1\\
dx_2
\end{bmatrix}
= dx^T H dx,
\]</span></p>
<p><span class="math inline">\(H\)</span> being the <em>Hessian</em> with entries <span class="math inline">\(H_{ij} = \frac{\partial f}{\partial x_i \partial x_j}\)</span>.</p>
<p>This formula – <span class="math inline">\(f(x+dx)-f(x) \approx f'(x)dx + dx^T H dx\)</span> – is valid for any <span class="math inline">\(n\)</span>, showing <span class="math inline">\(n=2\)</span> was just for ease of notation when expressing in the coordinates and not as matrices.</p>
<p>By uniqueness, we have under these assumptions that the Hessian is <em>symmetric</em> and the expression <span class="math inline">\(dx^T H dx\)</span> is a <em>bilinear</em> form, which we can identify as <span class="math inline">\(f''(x)[dx,dx]\)</span>.</p>
<p>That the Hessian is symmetric could also be derived under these assumptions by directly computing that the mixed partials can have their order exchanged. But in this framework, as explained by <span class="citation" data-cites="BrightEdelmanJohnson">@BrightEdelmanJohnson</span> it is a result of the underlying vector space having an addition that is commutative (e.g.&nbsp;<span class="math inline">\(u+v = v+u\)</span>).</p>
<p>The mapping <span class="math inline">\((u,v) \rightarrow u^T A v\)</span> for a matrix <span class="math inline">\(A\)</span> is bilinear. For a fixed <span class="math inline">\(u\)</span>, it is linear as it can be viewed as <span class="math inline">\((u^TA)[v]\)</span> and matrix multiplication is linear. Similarly for a fixed <span class="math inline">\(v\)</span>.</p>
<p><span class="citation" data-cites="BrightEdelmanJohnson">@BrightEdelmanJohnson</span> extend this characterization to a broader setting. The second derivative can be viewed as expressing first-order change in <span class="math inline">\(f'(x)\)</span>, a linear operator. The value <span class="math inline">\(df'\)</span> has the same shape as <span class="math inline">\(f'\)</span>, which is a linear operator, so <span class="math inline">\(df'\)</span> acts on vectors, say <span class="math inline">\(dx\)</span>, then:</p>
<p><span class="math display">\[
df'[dx] = f''(x)[dx'][dx] = f''(x)[dx', dx]
\]</span></p>
<p>The prime in <span class="math inline">\(dx'\)</span> is just notation, not a derivative operation for <span class="math inline">\(dx\)</span>.</p>
<p>With this view, we can see that <span class="math inline">\(f''(x)\)</span> has two vectors it acts on. By definition it is linear in <span class="math inline">\(dx\)</span>. However, as <span class="math inline">\(f'(x)\)</span> is a linear operator and the sum and product rules apply to derivatives, this operator is linear in <span class="math inline">\(dx'\)</span> as well. So <span class="math inline">\(f''(x)\)</span> is bilinear and as mentioned earlier symmetric.</p>
<section id="polarization" class="level3">
<h3 class="anchored" data-anchor-id="polarization">Polarization</h3>
<p><span class="citation" data-cites="BrightEdelmanJohnson">@BrightEdelmanJohnson</span> interpret <span class="math inline">\(f''\)</span> by looking at the image under <span class="math inline">\(f\)</span> of <span class="math inline">\(x + dx + dx'\)</span>. If <span class="math inline">\(x\)</span> is a vector, then this has a geometrical picture, from vector addtion, relating <span class="math inline">\(x + dx\)</span>, <span class="math inline">\(x+dx'\)</span>, and <span class="math inline">\(x + dx + dx'\)</span>.</p>
<p>The image for <span class="math inline">\(x +dx\)</span> is to second order <span class="math inline">\(f(x) + f'(x)[dx] + (1/2)f''(x)[dx, dx]\)</span>, similarly <span class="math inline">\(x + dx'\)</span> is to second order <span class="math inline">\(f(x) + f'(x)[dx'] + (1/2)f''(x)[dx', dx']\)</span>. The key formula for <span class="math inline">\(f''(x)\)</span> is</p>
<p><span class="math display">\[
\begin{align*}
f(x + dx + dx') &amp;= f(x) + f'(x)[dx + dx'] + \frac{1}{2}f''(x)[dx, dx']\\
&amp;= f(x) + f'(x)[dx] + (1/2)f''(x)[dx, dx]
&amp;+ f(x) + f'(x)[dx] + (1/2)f''(x)[dx', dx']
&amp;+ f''(x)[dx, dx']
\end{align}
\]</span></p>
<p>This gives a means to compute <span class="math inline">\(f''\)</span> in terms of <span class="math inline">\(f''\)</span> acting on diagonal terms, where the two vectors are equal:</p>
<p><span class="math display">\[
f''(x)[dx, dx'] = \frac{1}{2} f''(x)[dx+dx',dx+dx'] - f''(x)[dx,dx] - f''(x)[dx',dx']
\]</span></p>
</section>
<section id="xxx-does-this-fit-in" class="level3">
<h3 class="anchored" data-anchor-id="xxx-does-this-fit-in">XXX does this fit in?</h3>
<p>However, as a description of second-order change in <span class="math inline">\(f\)</span>, we recover the initial terms in the Taylor series</p>
<p><span class="math display">\[
f(x + \delta x) = f(x) + f'(x)\delta x + (1/2) f''(x)[\delta x, \delta x] + \mathscr{o}(||\delta x||^2).
\]</span></p>
</section>
<section id="examples-2" class="level3">
<h3 class="anchored" data-anchor-id="examples-2">Examples</h3>
<section id="example-second-derivative-of-xtax" class="level5">
<h5 class="anchored" data-anchor-id="example-second-derivative-of-xtax">Example: second derivative of <span class="math inline">\(x^TAx\)</span></h5>
<p>Consider an expression from earlier <span class="math inline">\(f(x) = x^T A x\)</span> for some constant <span class="math inline">\(A\)</span>. Then <span class="math inline">\(f''\)</span> is found by noting that <span class="math inline">\(f' = (\nabla f)^T = x^T(A + A^T)\)</span>, or <span class="math inline">\(\nabla f = (A^T + A)x\)</span> and <span class="math inline">\(f'' = H = A^T + A\)</span> is the Jacobian of the gradient.</p>
<p>By rearranging terms, it can be shown that <span class="math inline">\(f(x) = 1/2 x^THx = 1/2 f''[x,x]\)</span>.</p>
</section>
<section id="example-second-derivative-of-textdeta" class="level5">
<h5 class="anchored" data-anchor-id="example-second-derivative-of-textdeta">Example: second derivative of <span class="math inline">\(\text{det}(A)\)</span></h5>
<p>Consider <span class="math inline">\(f(A) = \text{det}(A)\)</span>. We saw previously that:</p>
<p><span class="math display">\[
\begin{align*}
\text{tr}(A + B) &amp;= \text{tr}(A) + \text{tr}(B)\\
\text{det}(A + dA') &amp;= \text{det}(A) + \text{det}(A)\text{tr}(A^{-1}dA')\\
(A + dA') &amp;= A^{-1} - A^{-1} dA' A^{-1}
\end{align*}
\]</span></p>
<p>These are all used to simplify:</p>
<p><span class="math display">\[
\begin{align*}
\text{det}(A+dA')&amp;\text{tr}((A + dA')^{-1} dA) - \text{det}(A) \text{tr}(A^{-1}dA) \\
&amp;= \left(
\text{det}(A) + \text{det}(A)\text{tr}(A^{-1}dA')
\right)
\text{tr}((A^{-1} - A^{-1}dA' A^{-1})dA) - \text{det}(A) \text{tr}(A^{-1}dA) \\
&amp;=
\text{det}(A) \text{tr}(A^{-1}dA)\\
&amp;+ \text{det}(A)\text{tr}(A^{-1}dA')\text{tr}(A^{-1}dA) \\
&amp;- \text{det}(A)\text{tr}(A^{-1}dA' A^{-1}dA)\\
&amp;- \text{det}(A)\text{tr}(A^{-1}dA')\text{tr}(A^{-1}dA' A^{-1}dA)\\
&amp;- \text{det}(A) \text{tr}(A^{-1}dA) \\
&amp;= \text{det}(A)\text{tr}(A^{-1}dA')\text{tr}(A^{-1}dA) - \text{det}(A)\text{tr}(A^{-1}dA' A^{-1}dA)\\
&amp;+ \text{third order term}
\end{align*}
\]</span></p>
<p>So, after dropping the third-order term, we see: <span class="math display">\[
\begin{align*}
f''(A)&amp;[dA,dA'] \\
&amp;= \text{det}(A)\text{tr}(A^{-1}dA')\text{tr}(A^{-1}dA)
- \text{det}(A)\text{tr}(A^{-1}dA' A^{-1}dA).
\end{align*}
\]</span></p>
</section>
</section>
</section>

</main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
  const toggleBodyColorMode = (bsSheetEl) => {
    const mode = bsSheetEl.getAttribute("data-mode");
    const bodyEl = window.document.querySelector("body");
    if (mode === "dark") {
      bodyEl.classList.add("quarto-dark");
      bodyEl.classList.remove("quarto-light");
    } else {
      bodyEl.classList.add("quarto-light");
      bodyEl.classList.remove("quarto-dark");
    }
  }
  const toggleBodyColorPrimary = () => {
    const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
    if (bsSheetEl) {
      toggleBodyColorMode(bsSheetEl);
    }
  }
  toggleBodyColorPrimary();
  const icon = "";
  const anchorJS = new window.AnchorJS();
  anchorJS.options = {
    placement: 'right',
    icon: icon
  };
  anchorJS.add('.anchored');
  const isCodeAnnotation = (el) => {
    for (const clz of el.classList) {
      if (clz.startsWith('code-annotation-')) {
        return true;
      }
    }
    return false;
  }
  const onCopySuccess = function(e) {
    // button target
    const button = e.trigger;
    // don't keep focus
    button.blur();
    // flash "checked"
    button.classList.add('code-copy-button-checked');
    var currentTitle = button.getAttribute("title");
    button.setAttribute("title", "Copied!");
    let tooltip;
    if (window.bootstrap) {
      button.setAttribute("data-bs-toggle", "tooltip");
      button.setAttribute("data-bs-placement", "left");
      button.setAttribute("data-bs-title", "Copied!");
      tooltip = new bootstrap.Tooltip(button,
        { trigger: "manual",
          customClass: "code-copy-button-tooltip",
          offset: [0, -8]});
      tooltip.show();
    }
    setTimeout(function() {
      if (tooltip) {
        tooltip.hide();
        button.removeAttribute("data-bs-title");
        button.removeAttribute("data-bs-toggle");
        button.removeAttribute("data-bs-placement");
      }
      button.setAttribute("title", currentTitle);
      button.classList.remove('code-copy-button-checked');
    }, 1000);
    // clear code selection
    e.clearSelection();
  }
  const getTextToCopy = function(trigger) {
      const codeEl = trigger.previousElementSibling.cloneNode(true);
      for (const childEl of codeEl.children) {
        if (isCodeAnnotation(childEl)) {
          childEl.remove();
        }
      }
      return codeEl.innerText;
  }
  const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
    text: getTextToCopy
  });
  clipboard.on('success', onCopySuccess);
  if (window.document.getElementById('quarto-embedded-source-code-modal')) {
    const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
      text: getTextToCopy,
      container: window.document.getElementById('quarto-embedded-source-code-modal')
    });
    clipboardModal.on('success', onCopySuccess);
  }
    var localhostRegex = new RegExp(/^(?:http|https):\/\/localhost\:?[0-9]*\//);
    var mailtoRegex = new RegExp(/^mailto:/);
      var filterRegex = new RegExp('/' + window.location.host + '/');
    var isInternal = (href) => {
        return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
    }
    // Inspect non-navigation links and adorn them if external
 	var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
    for (var i=0; i<links.length; i++) {
      const link = links[i];
      if (!isInternal(link.href)) {
        // undo the damage that might have been done by quarto-nav.js in the case of
        // links that we want to consider external
        if (link.dataset.originalHref !== undefined) {
          link.href = link.dataset.originalHref;
        }
      }
    }
  function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
    const config = {
      allowHTML: true,
      maxWidth: 500,
      delay: 100,
      arrow: false,
      appendTo: function(el) {
          return el.parentElement;
      },
      interactive: true,
      interactiveBorder: 10,
      theme: 'quarto',
      placement: 'bottom-start',
    };
    if (contentFn) {
      config.content = contentFn;
    }
    if (onTriggerFn) {
      config.onTrigger = onTriggerFn;
    }
    if (onUntriggerFn) {
      config.onUntrigger = onUntriggerFn;
    }
    window.tippy(el, config);
  }
  const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
  for (var i=0; i<noterefs.length; i++) {
    const ref = noterefs[i];
    tippyHover(ref, function() {
      // use id or data attribute instead here
      let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
      try { href = new URL(href).hash; } catch {}
      const id = href.replace(/^#\/?/, "");
      const note = window.document.getElementById(id);
      if (note) {
        return note.innerHTML;
      } else {
        return "";
      }
    });
  }
  const xrefs = window.document.querySelectorAll('a.quarto-xref');
  const processXRef = (id, note) => {
    // Strip column container classes
    const stripColumnClz = (el) => {
      el.classList.remove("page-full", "page-columns");
      if (el.children) {
        for (const child of el.children) {
          stripColumnClz(child);
        }
      }
    }
    stripColumnClz(note)
    if (id === null || id.startsWith('sec-')) {
      // Special case sections, only their first couple elements
      const container = document.createElement("div");
      if (note.children && note.children.length > 2) {
        container.appendChild(note.children[0].cloneNode(true));
        for (let i = 1; i < note.children.length; i++) {
          const child = note.children[i];
          if (child.tagName === "P" && child.innerText === "") {
            continue;
          } else {
            container.appendChild(child.cloneNode(true));
            break;
          }
        }
        if (window.Quarto?.typesetMath) {
          window.Quarto.typesetMath(container);
        }
        return container.innerHTML
      } else {
        if (window.Quarto?.typesetMath) {
          window.Quarto.typesetMath(note);
        }
        return note.innerHTML;
      }
    } else {
      // Remove any anchor links if they are present
      const anchorLink = note.querySelector('a.anchorjs-link');
      if (anchorLink) {
        anchorLink.remove();
      }
      if (window.Quarto?.typesetMath) {
        window.Quarto.typesetMath(note);
      }
      if (note.classList.contains("callout")) {
        return note.outerHTML;
      } else {
        return note.innerHTML;
      }
    }
  }
  for (var i=0; i<xrefs.length; i++) {
    const xref = xrefs[i];
    tippyHover(xref, undefined, function(instance) {
      instance.disable();
      let url = xref.getAttribute('href');
      let hash = undefined;
      if (url.startsWith('#')) {
        hash = url;
      } else {
        try { hash = new URL(url).hash; } catch {}
      }
      if (hash) {
        const id = hash.replace(/^#\/?/, "");
        const note = window.document.getElementById(id);
        if (note !== null) {
          try {
            const html = processXRef(id, note.cloneNode(true));
            instance.setContent(html);
          } finally {
            instance.enable();
            instance.show();
          }
        } else {
          // See if we can fetch this
          fetch(url.split('#')[0])
          .then(res => res.text())
          .then(html => {
            const parser = new DOMParser();
            const htmlDoc = parser.parseFromString(html, "text/html");
            const note = htmlDoc.getElementById(id);
            if (note !== null) {
              const html = processXRef(id, note);
              instance.setContent(html);
            }
          }).finally(() => {
            instance.enable();
            instance.show();
          });
        }
      } else {
        // See if we can fetch a full url (with no hash to target)
        // This is a special case and we should probably do some content thinning / targeting
        fetch(url)
        .then(res => res.text())
        .then(html => {
          const parser = new DOMParser();
          const htmlDoc = parser.parseFromString(html, "text/html");
          const note = htmlDoc.querySelector('main.content');
          if (note !== null) {
            // This should only happen for chapter cross references
            // (since there is no id in the URL)
            // remove the first header
            if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
              note.children[0].remove();
            }
            const html = processXRef(null, note);
            instance.setContent(html);
          }
        }).finally(() => {
          instance.enable();
          instance.show();
        });
      }
    }, function(instance) {
    });
  }
      let selectedAnnoteEl;
      const selectorForAnnotation = ( cell, annotation) => {
        let cellAttr = 'data-code-cell="' + cell + '"';
        let lineAttr = 'data-code-annotation="' +  annotation + '"';
        const selector = 'span[' + cellAttr + '][' + lineAttr + ']';
        return selector;
      }
      const selectCodeLines = (annoteEl) => {
        const doc = window.document;
        const targetCell = annoteEl.getAttribute("data-target-cell");
        const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
        const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
        const lines = annoteSpan.getAttribute("data-code-lines").split(",");
        const lineIds = lines.map((line) => {
          return targetCell + "-" + line;
        })
        let top = null;
        let height = null;
        let parent = null;
        if (lineIds.length > 0) {
            //compute the position of the single el (top and bottom and make a div)
            const el = window.document.getElementById(lineIds[0]);
            top = el.offsetTop;
            height = el.offsetHeight;
            parent = el.parentElement.parentElement;
          if (lineIds.length > 1) {
            const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
            const bottom = lastEl.offsetTop + lastEl.offsetHeight;
            height = bottom - top;
          }
          if (top !== null && height !== null && parent !== null) {
            // cook up a div (if necessary) and position it
            let div = window.document.getElementById("code-annotation-line-highlight");
            if (div === null) {
              div = window.document.createElement("div");
              div.setAttribute("id", "code-annotation-line-highlight");
              div.style.position = 'absolute';
              parent.appendChild(div);
            }
            div.style.top = top - 2 + "px";
            div.style.height = height + 4 + "px";
            div.style.left = 0;
            let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
            if (gutterDiv === null) {
              gutterDiv = window.document.createElement("div");
              gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
              gutterDiv.style.position = 'absolute';
              const codeCell = window.document.getElementById(targetCell);
              const gutter = codeCell.querySelector('.code-annotation-gutter');
              gutter.appendChild(gutterDiv);
            }
            gutterDiv.style.top = top - 2 + "px";
            gutterDiv.style.height = height + 4 + "px";
          }
          selectedAnnoteEl = annoteEl;
        }
      };
      const unselectCodeLines = () => {
        const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
        elementsIds.forEach((elId) => {
          const div = window.document.getElementById(elId);
          if (div) {
            div.remove();
          }
        });
        selectedAnnoteEl = undefined;
      };
        // Handle positioning of the toggle
    window.addEventListener(
      "resize",
      throttle(() => {
        elRect = undefined;
        if (selectedAnnoteEl) {
          selectCodeLines(selectedAnnoteEl);
        }
      }, 10)
    );
    function throttle(fn, ms) {
    let throttle = false;
    let timer;
      return (...args) => {
        if(!throttle) { // first call gets through
            fn.apply(this, args);
            throttle = true;
        } else { // all the others get throttled
            if(timer) clearTimeout(timer); // cancel #2
            timer = setTimeout(() => {
              fn.apply(this, args);
              timer = throttle = false;
            }, ms);
        }
      };
    }
      // Attach click handler to the DT
      const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
      for (const annoteDlNode of annoteDls) {
        annoteDlNode.addEventListener('click', (event) => {
          const clickedEl = event.target;
          if (clickedEl !== selectedAnnoteEl) {
            unselectCodeLines();
            const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
            if (activeEl) {
              activeEl.classList.remove('code-annotation-active');
            }
            selectCodeLines(clickedEl);
            clickedEl.classList.add('code-annotation-active');
          } else {
            // Unselect the line
            unselectCodeLines();
            clickedEl.classList.remove('code-annotation-active');
          }
        });
      }
  const findCites = (el) => {
    const parentEl = el.parentElement;
    if (parentEl) {
      const cites = parentEl.dataset.cites;
      if (cites) {
        return {
          el,
          cites: cites.split(' ')
        };
      } else {
        return findCites(el.parentElement)
      }
    } else {
      return undefined;
    }
  };
  var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
  for (var i=0; i<bibliorefs.length; i++) {
    const ref = bibliorefs[i];
    const citeInfo = findCites(ref);
    if (citeInfo) {
      tippyHover(citeInfo.el, function() {
        var popup = window.document.createElement('div');
        citeInfo.cites.forEach(function(cite) {
          var citeDiv = window.document.createElement('div');
          citeDiv.classList.add('hanging-indent');
          citeDiv.classList.add('csl-entry');
          var biblioDiv = window.document.getElementById('ref-' + cite);
          if (biblioDiv) {
            citeDiv.innerHTML = biblioDiv.innerHTML;
          }
          popup.appendChild(citeDiv);
        });
        return popup.innerHTML;
      });
    }
  }
});
</script>
</div> <!-- /content -->


</body></html>