diff --git a/lectures/_static/quant-econ.bib b/lectures/_static/quant-econ.bib
index 0d0c531a0..30760a5da 100644
--- a/lectures/_static/quant-econ.bib
+++ b/lectures/_static/quant-econ.bib
@@ -929,7 +929,6 @@ @article{alchian1950uncertainty
   publisher={The University of Chicago Press}
 }
 
-
 @article{mendoza1998international,
   title={The international ramifications of tax reforms: supply-side economics in a global economy},
   author={Mendoza, Enrique G and Tesar, Linda L},
@@ -997,20 +996,6 @@ @incollection{Hurwicz:1962
  year = {1962}
 }
 
-@article{Hurwicz:1966,
-   abstract = {Publisher Summary This chapter concentrates on the structural form of interdependent systems. A great deal of effort is devoted in econometrics and elsewhere to find the behavior pattern of an observed configuration. Such effort is justified on the grounds that the knowledge of the behavior pattern is needed for the purpose of giving explanation or prediction. The merits of this justification are also examined in the chapter. At this point, the chapter considers certain difficulties encountered in the process of looking for the behavior patterns. In certain fields, notably economics (but also— for example, electronic network theory), it deals with a set (configuration) of objects (components) that are interdependent in their behavior. For purposes of both theoretical analysis and empirical investigation of such situations, the phenomena are often described in the chapter (in idealized form) by means of a system of simultaneous equations. History alone is not enabled to determine the behavior pattern of the configuration; but this does not mean that the task is hopeless. The priori information is obtained from the axiom systems or theories that are believed to be relevant to the behavior pattern of the configuration.},
-   author = {Leonid Hurwicz},
-   doi = {http://dx.doi.org/10.1016/S0049-237X(09)70590-7},
-   editor = {Patrick S Ernest Nagel and Alfred Tarski},
-   journal = {Logic, Methodology and Philosophy of Science Proceeding of the 1960 International Congress},
-   pages = {232-239},
-   publisher = {Elsevier},
-   title = {On the Structural Form of Interdependent Systems},
-   volume = {44},
-   url = {http://www.sciencedirect.com/science/article/pii/S0049237X09705907},
-   year = {1966},
-}
-
 @article{hurwicz1950least,
   title   = {Least squares bias in time series},
   author  = {Hurwicz, Leonid},
@@ -1309,7 +1294,7 @@ @article{BHS_2009
 }
 
 @article{HST_1999,
-  author   = {Lars Peter Hansen and Thomas J. Sargent and Thomas D. Tallarini},
+  author   = {Hansen, Lars Peter and Sargent, Thomas J. and Tallarini, Thomas D.},
   title    = {{Robust Permanent Income and Pricing}},
   journal  = {Review of Economic Studies},
   year     = 1999,
@@ -1816,7 +1801,6 @@ @article{rosen1994cattle
   publisher = {The University of Chicago Press}
 }
 
-
 @article{Reffett1996,
   title     = {Production-based asset pricing in monetary economies with transactions costs},
   author    = {Reffett, Kevin L},
@@ -2369,17 +2353,6 @@ @article{AMSS_2002
   month   = {December}
 }
 
-@article{aiyagari2002optimal,
-  title     = {Optimal taxation without state-contingent debt},
-  author    = {Aiyagari, S Rao and Marcet, Albert and Sargent, Thomas J and Sepp{\"a}l{\"a}, Juha},
-  journal   = {Journal of Political Economy},
-  volume    = {110},
-  number    = {6},
-  pages     = {1220--1254},
-  year      = {2002},
-  publisher = {The University of Chicago Press}
-}
-
 @article{Rust1996,
   title   = {Numerical dynamic programming in economics},
   author  = {Rust, John},
@@ -3391,18 +3364,6 @@ @techreport{giannoni2010optimal
   institution = {National Bureau of Economic Research}
 }
 
-
-@article{pearlman1986rational,
-  title     = {Rational expectations models with partial information},
-  author    = {Pearlman, Joseph and Currie, David and Levine, Paul},
-  journal   = {Economic Modelling},
-  volume    = {3},
-  number    = {2},
-  pages     = {90--105},
-  year      = {1986},
-  publisher = {Elsevier}
-}
-
 @techreport{backus1986consistency,
   title       = {The consistency of optimal policy in stochastic rational expectations models},
   author      = {Backus, David and Driffill, John},
@@ -3502,7 +3463,6 @@ @article{kikuchi2018span
   publisher = {Wiley Online Library}
 }
 
-
 @article{do1999solutions,
   title     = {Solutions for the linear-quadratic control problem of Markov jump linear systems},
   author    = {Do Val, JBR and Geromel, JC and Costa, OLV},
@@ -4073,3 +4033,22 @@ @book{Nummelin_1984
   year      = {1984},
   doi       = {10.1017/CBO9780511526237}
 }
+@article{EpsteinWang1994,
+  author  = {Epstein, Larry G. and Wang, Tan},
+  title   = {{Intertemporal Asset Pricing under Knightian Uncertainty}},
+  journal = {Econometrica},
+  volume  = {62},
+  number  = {2},
+  pages   = {283--322},
+  year    = {1994}
+}
+
+@article{Campbell1987,
+  author  = {Campbell, John Y.},
+  title   = {{Does Saving Anticipate Declining Labor Income? An Alternative Test of the Permanent Income Hypothesis}},
+  journal = {Econometrica},
+  volume  = {55},
+  number  = {6},
+  pages   = {1249--1273},
+  year    = {1987}
+}
diff --git a/lectures/_toc.yml b/lectures/_toc.yml
index 929fefe00..0a4f545b2 100644
--- a/lectures/_toc.yml
+++ b/lectures/_toc.yml
@@ -65,6 +65,7 @@ parts:
   - file: wealth_dynamics
   - file: kalman
   - file: kalman_2
+  - file: kalman_filter_var 
   - file: organization_capital
   - file: measurement_models
 - caption: Search
@@ -109,8 +110,10 @@ parts:
   - file: cross_product_trick
   - file: perm_income
   - file: perm_income_cons
-  - file: theil_1 
+  - file: robust_permanent_income
+  - file: theil_1
   - file: theil_2
+  - file: lq_consumption_smoothing 
   - file: lq_inventories
 - caption: Optimal Growth
   numbered: true
diff --git a/lectures/kalman_2.md b/lectures/kalman_2.md
index 9b4c2e916..37ff9b433 100644
--- a/lectures/kalman_2.md
+++ b/lectures/kalman_2.md
@@ -264,7 +264,7 @@ u_hat_t = x_hat_t[1, :]
 
 For this fixed worker initial state, we plot $\mathbb{E}[y_t | y^{t-1}] = G \hat x_t$ where $\hat x_t = \mathbb{E}[x_t | y^{t-1}]$.
 
-We also plot $\mathbb{E}[u_0 | y^{t-1}]$, which is  the firm inference about  a worker's hard-wired "work ethic" $u_0$, conditioned on information $y^{t-1}$ that it has about him or her coming into period $t$.
+We also plot $\mathbb{E}[u_0 | y^{t-1}]$, which is  the firm's inference about  a worker's hard-wired "work ethic" $u_0$, conditioned on information $y^{t-1}$ that it has about him or her coming into period $t$.
 
 We can watch how the firm updates its inference $\mathbb{E}[u_0 | y^{t-1}]$ about the worker's work ethic as more output observations arrive.
 
@@ -341,7 +341,7 @@ for i, t in enumerate(np.linspace(0, T-1, 3, dtype=int)):
     # Create a contour plot for the PDF
     con = axs[i].contour(h, u, pdf_values, cmap='viridis')
     axs[i].clabel(con, inline=1, fontsize=10)
-    axs[i].set_title(f'Time Step {t}')
+    axs[i].set_title(f'time step {t}')
     axs[i].set_xlabel(r'$h_{{{}}}$'.format(str(t)))
     axs[i].set_ylabel(r'$u_{{{}}}$'.format(str(t)))
     
@@ -467,7 +467,7 @@ namedtuple.
 Here is an example.
 
 ```{code-cell} ipython3
-# We can set these parameters when creating a worker -- just like classes!
+# We can set these parameters when creating a worker, just like classes!
 hard_working_worker = create_worker(α=.4, β=.8, 
                         hhat_0=7.0, uhat_0=100, σ_h=2.5, σ_u=3.2)
 
diff --git a/lectures/kalman_filter_var.md b/lectures/kalman_filter_var.md
new file mode 100644
index 000000000..93a5a051e
--- /dev/null
+++ b/lectures/kalman_filter_var.md
@@ -0,0 +1,1518 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.11.1
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: python3
+---
+
+# The Kalman Filter and Vector Autoregressions
+
+```{index} single: Kalman Filter
+```
+
+```{index} single: Vector Autoregression; and Kalman filter
+```
+
+## Overview
+
+This lecture derives the **Kalman filter** for a linear Gaussian state space system
+and then uses it to construct **vector autoregressions (VARs)**.
+
+Our approach rests on repeated applications of the **population linear least squares**
+projection formula, the insight that computing a conditional expectation of a jointly
+normal random vector is the same as running a population OLS regression.
+
+The lecture covers:
+
+- deriving the Kalman filter recursions from first principles
+- the **matrix Riccati difference equation** governing conditional covariance matrices
+- the **innovations representation** and the **Gram-Schmidt** whitening property
+- the structure of a **hidden Markov model**
+- the **likelihood function** for a state space system and its role in maximum likelihood
+  and Bayesian estimation
+- how the time-invariant Kalman filter generates a **vector autoregression**
+- why the Kalman filter is an essential tool for **interpreting VARs** estimated from
+  economic data
+
+## The state space system
+
+The Kalman filter applies to the **state space system** for $t \geq 0$:
+
+$$
+\begin{aligned}
+x_{t+1} &= A x_t + C w_{t+1} \\
+y_t      &= G x_t + v_t
+\end{aligned}
+$$ (eq:statespace)
+
+where
+
+- $x_t$ is an $n \times 1$ **state vector** (hidden, unobserved)
+- $y_t$ is an $m \times 1$ vector of **signals** on the hidden state (observed)
+- $w_{t+1}$ is a $p \times 1$ i.i.d. sequence of normal random variables with mean
+  $0$ and identity covariance matrix
+- $v_t$ is an i.i.d. sequence of normal random variables with mean zero and
+  covariance matrix $R$
+- $w_{t+1}$ and $v_s$ are orthogonal for all $t+1$ and $s \geq 0$
+
+The initial state satisfies
+
+$$
+x_0 \sim \mathcal{N}(\hat{x}_0, \Sigma_0)
+$$ (eq:kalf3)
+
+We observe $y_t, \ldots, y_0$ but **not** $x_t, \ldots, x_0$ at time $t$,
+and we know all first and second moments implied by
+{eq}`eq:statespace` and {eq}`eq:kalf3`.
+
+## The Kalman filter
+
+### Starting distribution
+
+Working forward in time starting at $t = 0$, before observing $y_0$,
+the specification {eq}`eq:statespace`-{eq}`eq:kalf3` implies that the
+marginal distribution of $y_0$ is
+
+$$
+y_0 \sim \mathcal{N}(G \hat{x}_0,\; G \Sigma_0 G' + R)
+$$ (eq:kalf4)
+
+For $t \geq 0$ let $y^t = [y_t, y_{t-1}, \ldots, y_0]$.
+
+We want a convenient recursive representation of the conditional distribution of
+$y_t$ given history $y^{t-1}$.
+
+The Kalman filter attains this by constructing recursive formulas for
+$\hat{x}_t$ and $\Sigma_t$ such that the distribution of $y_t$ conditional on
+$y^{t-1}$ generalises {eq}`eq:kalf4` to
+
+$$
+y_t \sim \mathcal{N}(G \hat{x}_t,\; G \Sigma_t G' + R)
+$$ (eq:kalf400)
+
+for $t \geq 1$, where the distribution of $x_t$ conditional on $y^{t-1}$ is
+$\mathcal{N}(\hat{x}_t, \Sigma_t)$.
+
+The objects $\hat{x}_t$ and $\Sigma_t$ characterise the **population regression**
+
+$$
+\hat{x}_t = \mathbb{E}[x_t \mid y_{t-1}, \ldots, y_0]
+$$
+
+and the **conditional covariance matrix**
+
+$$
+\Sigma_t = \mathbb{E}\!\left[(x_t - \hat{x}_t)(x_t - \hat{x}_t)' \mid y_{t-1}, \ldots, y_0\right]
+$$
+
+### Derivation
+
+At each date, our approach is to **regress what we do not know on what we know**.
+
+```{note}
+Because our assumptions imply that $\{x_t, y_t\}_{t=0}^\infty$ is a jointly normal
+stochastic process, linear least squares regressions equal conditional mathematical
+expectations — each step below is an application of Bayes' law.
+
+Under the weaker assumption that all means and covariances exist without joint normality,
+the same calculations yield "wide-sense conditional expectations" that coincide with true
+conditional expectations only when those conditional expectations are linear.
+```
+
+**Arrive at $t = 0$** knowing $\hat{x}_0$ and $\Sigma_0$.
+
+**Innovation:** The information about $x_0$ in $y_0$ that is new relative to
+$(\hat{x}_0, \Sigma_0)$ is the **innovation**
+
+$$
+a_0 \equiv y_0 - G \hat{x}_0
+$$
+
+**Updating beliefs about $x_0$:** The conditional mean
+$\mathbb{E}[x_0 \mid y_0] = \hat{x}_0 + L_0(y_0 - G\hat{x}_0)$ satisfies the
+population regression formula
+
+$$
+x_0 - \hat{x}_0 = L_0(y_0 - G\hat{x}_0) + \eta
+$$ (eq:kalf5)
+
+where $\eta$ is the least squares residual.
+
+Orthogonality of $\eta$ to
+$(y_0 - G\hat{x}_0)$ pins down $L_0$ via the normal equations
+
+$$
+\mathbb{E}(x_0 - \hat{x}_0)(y_0 - G\hat{x}_0)'
+= L_0\, \mathbb{E}(y_0 - G\hat{x}_0)(y_0 - G\hat{x}_0)'
+$$
+
+Evaluating the moment matrices and solving for $L_0$ gives
+
+$$
+L_0 = \Sigma_0 G'(G \Sigma_0 G' + R)^{-1}
+$$ (eq:kalf6)
+
+**Forecasting $x_1$:** Note that
+
+$$
+x_1 = A\hat{x}_0 + A(x_0 - \hat{x}_0) + C w_1
+$$ (eq:kalf6a)
+
+Applying {eq}`eq:kalf5` gives $\mathbb{E}[x_1 \mid y_0] = A\hat{x}_0 + AL_0(y_0 - G\hat{x}_0)$,
+which we write as
+
+$$
+\hat{x}_1 = A\hat{x}_0 + K_0(y_0 - G\hat{x}_0)
+$$ (eq:kalf7)
+
+where the **Kalman gain** at time 0 is
+
+$$
+K_0 = A \Sigma_0 G'(G \Sigma_0 G' + R)^{-1}
+$$ (eq:kalf7a)
+
+**Updating the covariance:** Subtracting {eq}`eq:kalf7` from {eq}`eq:kalf6a` yields
+
+$$
+x_1 - \hat{x}_1 = A(x_0 - \hat{x}_0) + C w_1 - K_0(y_0 - G\hat{x}_0)
+$$ (eq:kalf8)
+
+Using {eq}`eq:kalf8` and $y_0 = G x_0 + v_0$ to evaluate
+$\Sigma_1 \equiv \mathbb{E}[(x_1 - \hat{x}_1)(x_1 - \hat{x}_1)' \mid y_0]$ gives
+
+$$
+\Sigma_1 = (A - K_0 G)\Sigma_0(A - K_0 G)' + CC' + K_0 R K_0'
+$$ (eq:kalf9)
+
+Thus $f(x_1 \mid y_0) \sim \mathcal{N}(\hat{x}_1, \Sigma_1)$.
+
+Collecting the time-$0$ equations:
+
+$$
+\begin{aligned}
+a_0       &= y_0 - G\hat{x}_0 \\
+K_0       &= A\Sigma_0 G'(G\Sigma_0 G' + R)^{-1} \\
+\hat{x}_1 &= A\hat{x}_0 + K_0 a_0 \\
+\Sigma_1  &= CC' + K_0 R K_0' + (A - K_0 G)\Sigma_0(A - K_0 G)'
+\end{aligned}
+$$ (eq:kalf1000)
+
+System {eq}`eq:kalf1000` maps a mean-covariance pair $(\hat{x}_0, \Sigma_0)$ into a new
+pair $(\hat{x}_1, \Sigma_1)$, with auxiliary outputs $(a_0, K_0)$.
+
+Recognising that "we are in the same situation at the start of period 1 as at
+the start of period 0" activates a recursion, the **Kalman filter**.
+
+### The Kalman filter recursions
+
+Iterating system {eq}`eq:kalf1000` yields the **Kalman filter** for $t \geq 0$:
+
+$$
+\begin{aligned}
+a_t           &= y_t - G\hat{x}_t \\
+K_t           &= A\Sigma_t G'(G\Sigma_t G' + R)^{-1} \\
+\hat{x}_{t+1} &= A\hat{x}_t + K_t a_t \\
+\Sigma_{t+1}  &= CC' + K_t R K_t' + (A - K_t G)\Sigma_t(A - K_t G)'
+\end{aligned}
+$$ (eq:kalf10)
+
+Here $K_t$ is the **Kalman gain** at time $t$.
+
+### The matrix Riccati equation
+
+Substituting the expression for $K_t$ from the second line of {eq}`eq:kalf10`
+into the fourth line gives an equivalent update formula:
+
+$$
+\Sigma_{t+1} = A\Sigma_t A' + CC'
+  - A\Sigma_t G'(G\Sigma_t G' + R)^{-1} G\Sigma_t A'
+$$ (eq:riccati)
+
+Equation {eq}`eq:riccati` is the **matrix Riccati difference equation**.
+
+It governs the sequence of conditional covariance matrices $\{\Sigma_t\}_{t=0}^\infty$
+without reference to the observations $\{y_t\}$.
+
+```{index} single: Riccati equation; matrix difference
+```
+
+## The Gram-Schmidt process
+
+The random vector
+
+$$
+a_t = y_t - \mathbb{E}[y_t \mid y_{t-1}, \ldots, y_0]
+$$
+
+is the **innovation** of $y_t$ with respect to $y^{t-1}$, the part of $y_t$
+that cannot be predicted from past observations.
+
+Note that $\mathbb{E} a_t a_t' = G\Sigma_t G' + R$, the matrix whose inverse appears in
+the Kalman gain formula {eq}`eq:kalf10`.
+
+A direct calculation using $a_t = G(x_t - \hat{x}_t) + v_t$ shows that
+$\mathbb{E} a_t a_{t-1}' = 0$ and, more generally, $\mathbb{E}[a_t \mid a_{t-1}, \ldots, a_0] = 0$.
+
+```{note}
+An alternative argument from first principles: let $H(y^t)$ denote the closed linear
+span of $y^t$.
+
+Since $a_{t+1} = y_{t+1} - \mathbb{E}[y_{t+1} \mid y^t]$ is a least-squares
+error, $a_{t+1} \perp H(y^t)$, and in particular $a_{t+1} \perp a_t$.
+
+Thus $\{a_t\}$ is a white-noise process of innovations to $\{y_t\}$.
+```
+
+Sometimes {eq}`eq:kalf10` is called a **whitening filter**: it takes the signal process
+$\{y_t\}$ as input and produces the white-noise innovation process $\{a_t\}$ as output.
+
+The linear space $H(a^t)$ is an orthogonal basis for the linear space $H(y^t)$.
+
+Rather than computing $\mathbb{E}[x_t \mid y_{t-1}, \ldots, y_0]$ via one large regression,
+the Kalman filter performs a sequence of small regressions on successive orthogonal
+components of the basis $[a_{t-1}, \ldots, a_0]$, an instance of the
+**Gram-Schmidt procedure**.
+
+```{index} single: Gram-Schmidt process
+```
+
+## Hidden Markov model
+
+System {eq}`eq:statespace`-{eq}`eq:kalf3` is an example of a **hidden Markov model**.
+
+```{index} single: hidden Markov model
+```
+
+The observed process $\{y_t\}_{t=0}^\infty$ is *not* Markov, but the hidden
+process $\{x_t\}_{t=0}^\infty$ *is* Markov.
+
+So is the process of means and
+covariances $\{(\hat{x}_t, \Sigma_t)\}$, which are sufficient statistics for
+the distribution of $x_t$ conditional on $[y_{t-1}, \ldots, y_0]$.
+
+## Estimation
+
+### The innovations representation
+
+The **innovations representation** emerging from the Kalman filter is
+
+$$
+\begin{aligned}
+\hat{x}_{t+1} &= A\hat{x}_t + K_t a_t \\
+y_t           &= G\hat{x}_t + a_t
+\end{aligned}
+$$ (eq:innovrep)
+
+where $\hat{x}_t = \mathbb{E}[x_t \mid y^{t-1}]$ for $t \geq 1$ and
+$\mathbb{E}[a_t a_t' \mid y^{t-1}] = G\Sigma_t G' + R \equiv \Omega_t$.
+
+For $t \geq 1$, $\mathbb{E}[y_t \mid y^{t-1}] = G\hat{x}_t$ and the conditional
+distribution of $y_t$ given $y^{t-1}$ is $\mathcal{N}(G\hat{x}_t, \Omega_t)$.
+
+The objects $(G\hat{x}_t, \Omega_t)$ emerging from the Kalman filter recursions
+therefore completely characterise this conditional distribution.
+
+### The likelihood function
+
+We can factor the likelihood of a sample $(y_T, y_{T-1}, \ldots, y_0)$ as
+
+$$
+f(y_T, \ldots, y_0)
+  = f(y_T \mid y^{T-1})\, f(y_{T-1} \mid y^{T-2}) \cdots f(y_1 \mid y_0)\, f(y_0)
+$$ (eq:diff100)
+
+The log conditional density of the $m \times 1$ vector $y_t$ is
+
+$$
+\log f(y_t \mid y^{t-1})
+  = -\frac{m}{2}\log(2\pi)
+    - \frac{1}{2}\log\det(\Omega_t)
+    - \frac{1}{2}\, a_t' \Omega_t^{-1} a_t
+$$ (eq:gauss100)
+
+Using {eq}`eq:gauss100` and {eq}`eq:kalf10` together, we can evaluate
+the likelihood {eq}`eq:diff100` recursively for any parameter vector $\theta$
+that underlies the matrices $A, G, C, R$.
+
+Such calculations are at the heart of efficient strategies for computing
+**maximum likelihood estimators** of free parameters.
+
+### Bayesian inference
+
+The likelihood function is also central to **Bayesian inference**.
+
+Where $\theta$ is the parameter vector, $y_0^T$ the data, and
+$\tilde{p}(\theta)$ a prior density over $\theta$ before seeing $y_0^T$,
+Bayes' law gives the **posterior**
+
+$$
+\tilde{p}(\theta \mid y_0^T)
+  = \frac{f(y_0^T \mid \theta)\,\tilde{p}(\theta)}
+         {\int f(y_0^T \mid \theta)\,\tilde{p}(\theta)\, d\theta}
+$$
+
+The denominator is the marginal joint density $f(y_0^T)$.
+
+## Vector autoregressions and the Kalman filter
+
+### Convergence to a steady state
+
+Under conditions discussed by Anderson, Hansen, McGrattan, and Sargent (1996),
+iterations on the Riccati equation {eq}`eq:riccati` converge to a
+**time-invariant** matrix $\Sigma$ from any positive semi-definite starting value
+$\Sigma_0$.
+
+A time-invariant fixed point $\Sigma_t = \Sigma$ of {eq}`eq:riccati` is the
+covariance matrix of $x_t$ around
+
+$$
+\mathbb{E}\!\left[x_t \mid \{y_s\}_{s \leq t-1}\right]
+$$
+
+where the conditioning extends over the **semi-infinite** past $s \leq t-1$.
+
+### A time-invariant VAR
+
+If the fixed point $\Sigma$ exists and we initialise the filter at $\Sigma_0 = \Sigma$,
+the innovations representation {eq}`eq:innovrep` becomes time-invariant:
+
+$$
+\begin{aligned}
+\hat{x}_{t+1} &= A\hat{x}_t + K a_t \\
+y_t           &= G\hat{x}_t + a_t
+\end{aligned}
+$$ (eq:innovti)
+
+where $\mathbb{E} a_t a_t' = G\Sigma G' + R$ and the **steady-state Kalman gain** is
+$K = A\Sigma G'(G\Sigma G' + R)^{-1}$.
+
+From {eq}`eq:innovti` we obtain $\hat{x}_{t+1} = (A - KG)\hat{x}_t + K y_t$.
+
+If the eigenvalues of $A - KG$ are bounded in modulus strictly below unity, we can
+solve this equation forward to get
+
+$$
+\hat{x}_{t+1} = \sum_{j=0}^\infty (A - KG)^j K\, y_{t-j}
+$$ (eq:xhatform)
+
+Substituting {eq}`eq:xhatform` into the observation equation of {eq}`eq:innovti`
+gives the **vector autoregression**
+
+$$
+y_t = G \sum_{j=0}^\infty (A - KG)^j K\, y_{t-j-1} + a_t
+$$ (eq:var1)
+
+where by construction
+
+$$
+\mathbb{E}\!\left[a_t\, y_{t-j-1}'\right] = 0 \quad \forall\, j \geq 0
+$$ (eq:varorth)
+
+The orthogonality conditions {eq}`eq:varorth` identify {eq}`eq:var1` as a
+**vector autoregression**.
+
+Defining the lag operator $L$ by $L x_{t+1} \equiv x_t$, the
+**moving average representation** deduced from {eq}`eq:innovti` is
+
+$$
+y_t = \left[I + G(I - AL)^{-1} KL\right] a_t
+    = \left[I + G\sum_{j=0}^\infty A^j K L^{j+1}\right] a_t
+$$
+
+```{index} single: vector autoregression
+```
+
+### Interpreting VARs
+
+Equilibria of economic models (or their linear or log-linear approximations)
+typically take the form of state space system {eq}`eq:statespace`.
+
+This hidden Markov model disturbs the state $x_t$ by the $p \times 1$ shock
+vector $w_{t+1}$ and perturbs the $m \times 1$ vector of observables $y_t$ by the
+$m \times 1$ measurement error $v_t$.
+
+An economic theory typically makes $w_{t+1}$ and $v_t$ directly interpretable as
+shocks to preferences, technologies, endowments, or information sets.
+
+The state space system {eq}`eq:statespace` represents $\{y_t\}$ in terms of these
+**interpretable shocks**.
+
+However, in the typical situation these shocks
+**cannot** be recovered directly from the $y_t$'s, even when $A, G, C, R$ are known.
+
+The innovations representation {eq}`eq:innovti` represents the *same* stochastic
+process $\{y_t\}$ in terms of the $m \times 1$ vector $a_t$ of innovations that
+would be recovered by running an infinite-order population vector autoregression.
+
+Its role in mapping the original representation {eq}`eq:statespace` to the VAR
+{eq}`eq:var1` makes the Kalman filter an indispensable tool for
+**interpreting vector autoregressions**.
+
+```{index} single: Kalman Filter; and vector autoregressions
+```
+
+## Spectral factorization identity
+
+```{index} single: spectral factorization identity
+```
+
+Because the original state space system {eq}`eq:statespace` and the innovations
+representation {eq}`eq:innovti` describe the **same** stochastic process $\{y_t\}$,
+they imply two distinct formulas for the **spectral density matrix** of $\{y_t\}$.
+
+Equating those formulas yields the *spectral factorization identity*.
+
+### Two representations of the spectral density
+
+**From the original state space system.**  Writing the first line of
+{eq}`eq:statespace` as $x_t = (zI - A)^{-1} C w_{t+1}$ (using the $z$-transform
+convention $z^{-1} x_t = x_{t-1}$), the covariance generating function of
+$\{x_t\}$ is
+
+$$
+S_x(z) = (zI - A)^{-1} CC' (z^{-1}I - A')^{-1}.
+$$
+
+Since $v_t$ is orthogonal to $x_t$, the spectral density of $\{y_t\}$ is
+
+$$
+S_y(z) = G(zI - A)^{-1} CC' (z^{-1}I - A')^{-1} G' + R.
+$$ (eq:sf_original)
+
+**From the innovations representation.**  The time-invariant innovations
+representation {eq}`eq:innovti` gives $y_t = [G(zI - A)^{-1}K + I]\, a_t$.
+Since $a_t$ is white noise with covariance matrix $G\Sigma G' + R$, the
+spectral density is also
+
+$$
+S_y(z) = \bigl[G(zI-A)^{-1}K + I\bigr]
+          \bigl(G\Sigma G' + R\bigr)
+          \bigl[K'(z^{-1}I - A')^{-1}G' + I\bigr].
+$$ (eq:sf_innov)
+
+### The spectral factorization identity
+
+Equating {eq}`eq:sf_original` and {eq}`eq:sf_innov` gives the
+**spectral factorization identity**:
+
+$$
+G(zI - A)^{-1} CC' (z^{-1}I - A')^{-1} G' + R =
+\bigl[G(zI-A)^{-1}K + I\bigr]
+\bigl(G\Sigma G' + R\bigr)
+\bigl[K'(z^{-1}I - A')^{-1}G' + I\bigr].
+$$ (eq:sf_identity)
+
+The left side expresses $S_y(z)$ in terms of the **structural shocks**
+$(w_{t+1}, v_t)$ and the matrices $(A, C, G, R)$.
+
+The right side expresses the same object as a spectral factor built from the
+**innovations** $a_t$ and the steady-state Kalman gain $K$.
+
+### Connection to the Wold and autoregressive representations
+
+The factorization {eq}`eq:sf_identity` underpins the passage from the innovations
+representation to the Wold moving average and to the VAR.
+
+**Wold representation.**  From {eq}`eq:innovti`, solving for $\hat{x}_t$ over the
+semi-infinite past gives $\hat{x}_{t+1} = (I - AL)^{-1} K a_t$, so
+
+$$
+y_t = \bigl[G(I - AL)^{-1} KL + I\bigr]\, a_t.
+$$ (eq:sf_wold)
+
+**Autoregressive (VAR) representation.**  Applying the inverse of the
+moving-average operator in {eq}`eq:sf_wold`, using the identity
+
+$$
+\bigl[G(I-AL)^{-1}KL + I\bigr]^{-1} = I - G\bigl[I-(A-KG)L\bigr]^{-1}KL,
+$$
+
+gives
+
+$$
+y_t = G\bigl[I-(A-KG)L\bigr]^{-1}K\, y_{t-1} + a_t
+    = \sum_{j=1}^\infty G(A-KG)^{j-1}K\, y_{t-j} + a_t,
+$$ (eq:sf_var)
+
+which is the **vector autoregression** already stated in {eq}`eq:var1`.
+
+The key analytical fact behind both representations is that, under mild stability
+conditions, the zeros of $\det[G(zI-A)^{-1}K + I]$ all lie **inside** the unit
+circle.
+
+This ensures that the moving-average operator in {eq}`eq:sf_wold` has a
+causal (one-sided) inverse, so the innovation $a_t$ lies in the closed linear span
+of current and past observations $y^t$, confirming that $a_t$ is the true
+population forecast error from the VAR.
+
+## Python implementation
+
+We now illustrate the theory using the `quantecon` library, which provides
+`LinearStateSpace` and `Kalman` classes that implement everything derived above.
+
+```{code-cell} ipython3
+import numpy as np
+import matplotlib.pyplot as plt
+import quantecon as qe
+```
+
+### A scalar hidden AR(1) model
+
+Consider a scalar hidden AR(1) state observed with measurement noise:
+
+$$
+\begin{aligned}
+x_{t+1} &= \rho\, x_t + \sigma_w\, w_{t+1} \\
+y_t      &= x_t + \sigma_v\, v_t
+\end{aligned}
+$$
+
+with $w_t, v_t \sim \mathcal{N}(0, 1)$ i.i.d.
+
+```{code-cell} ipython3
+# Model parameters
+rho     = 0.9
+sigma_w = 0.5
+sigma_v = 1.0
+
+# State space matrices  (n=1, p=1, m=1)
+A = np.array([[rho]])
+C = np.array([[sigma_w]])
+G = np.array([[1.0]])
+R = np.array([[sigma_v**2]])
+
+# Build a LinearStateSpace and a Kalman filter object
+H = np.array([[sigma_v]])   # measurement noise factor: R = H @ H.T
+lss = qe.LinearStateSpace(A, C, G, H, mu_0=np.zeros(1), Sigma_0=np.eye(1) * 10.0)
+kf  = qe.Kalman(lss)
+kf.set_state(np.zeros(1), np.eye(1) * 10.0)  # diffuse prior
+```
+
+**Simulate a sample path of the true hidden state and noisy observations.**
+
+```{code-cell} ipython3
+T = 200
+x_path, y_path = lss.simulate(ts_length=T, random_state=42)
+
+# x_path has shape (n, T+1);  y_path has shape (m, T)
+x_true = x_path[0, :T]
+y_obs  = y_path[0, :]
+```
+
+**Run the Kalman filter manually step by step to collect filtered estimates.**
+
+```{code-cell} ipython3
+x_hats = np.zeros(T)
+Sigmas = np.zeros(T)
+
+for t in range(T):
+    kf.update(y_obs[t:t+1])          # one full filter cycle
+    x_hats[t] = kf.x_hat.item()
+    Sigmas[t]  = kf.Sigma.item()
+```
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Kalman filtering of a scalar AR(1) state
+    name: fig-kfvar-scalar
+---
+fig, axes = plt.subplots(3, 1, figsize=(10, 8), sharex=True)
+
+t_range = np.arange(T)
+
+axes[0].plot(t_range, x_true,  lw=1.5, label='True state $x_t$')
+axes[0].plot(t_range, x_hats,  lw=1.5, linestyle='--', label=r'$\hat{x}_t$ (Kalman)')
+axes[0].plot(t_range, y_obs,   alpha=0.35, label='Observation $y_t$')
+axes[0].set_title('State, Kalman estimate, and observation')
+axes[0].legend(fontsize=9)
+
+axes[1].plot(t_range, Sigmas, color='tab:orange')
+axes[1].set_title(r'Conditional variance $\Sigma_t$')
+axes[1].axhline(kf.Sigma_infinity[0, 0], ls='--', color='k',
+                label=r'Steady-state $\Sigma_\infty$')
+axes[1].legend(fontsize=9)
+
+axes[2].plot(t_range, y_obs - x_hats, color='tab:green', lw=0.8, alpha=0.7)
+axes[2].set_title(r'Innovation $a_t = y_t - G\hat{x}_t$')
+axes[2].set_xlabel('time $t$')
+
+plt.tight_layout()
+plt.show()
+```
+
+### Convergence of the Riccati equation
+
+The `Kalman` class computes the steady-state covariance $\Sigma_\infty$ by
+solving the discrete algebraic Riccati equation directly.
+
+```{code-cell} ipython3
+Sigma_inf, K_inf = kf.stationary_values()
+
+print(f"Steady-state covariance  Σ_inf = {Sigma_inf[0, 0]:.6f}")
+print(f"Kalman filter converged to Σ_t = {Sigmas[-1]:.6f}")
+print(f"Steady-state Kalman gain K  = {K_inf[0, 0]:.6f}")
+
+A_minus_KG = A - K_inf @ G
+eigval = np.linalg.eigvals(A_minus_KG)[0]
+print(f"\nEigenvalue of (A - KG)      = {eigval:.6f}")
+print(f"Stable VAR: {np.abs(eigval) < 1}")
+```
+
+### The VAR representation
+
+Using {eq}`eq:var1`, the coefficients in the infinite-order VAR representation are
+$G(A - KG)^j K$ for $j = 0, 1, 2, \ldots$
+
+We retrieve them via `stationary_coefficients`:
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: VAR coefficients from the innovations representation
+    name: fig-kfvar-varcoef
+---
+J = 30
+var_coeffs = kf.stationary_coefficients(J, coeff_type='var')
+
+# var_coeffs[j] gives the mxm coefficient matrix at lag j+1
+lags = np.arange(1, J + 1)
+coeff_values = np.array([var_coeffs[j][0, 0] for j in range(J)])
+
+fig, ax = plt.subplots(figsize=(8, 4))
+ax.stem(lags, coeff_values, basefmt=' ')
+ax.set_xlabel('lag $j$')
+ax.set_ylabel(r'VAR coefficient $G(A{-}KG)^{j-1}K$')
+plt.tight_layout()
+plt.show()
+```
+
+The coefficients decay geometrically, confirming that the infinite-order VAR
+{eq}`eq:var1` is well approximated by a finite-lag truncation.
+
+### Likelihood evaluation
+
+We use {eq}`eq:gauss100` to evaluate the log-likelihood of the simulated sample.
+
+```{code-cell} ipython3
+def log_likelihood(A, C, G, R, y_data, x_hat_0, Sigma_0):
+    """Evaluate the log-likelihood using the Kalman filter recursions."""
+    H_     = np.linalg.cholesky(R)   # R = H_ @ H_.T
+    lss_   = qe.LinearStateSpace(A, C, G, H_, mu_0=x_hat_0, Sigma_0=Sigma_0)
+    kf_    = qe.Kalman(lss_)
+    kf_.set_state(x_hat_0, Sigma_0)
+
+    T_, m_ = y_data.shape
+    loglik = 0.0
+
+    for t in range(T_):
+        x_h  = kf_.x_hat
+        Sig  = kf_.Sigma
+        Omega = G @ Sig @ G.T + R        # innovation covariance
+        a_t   = y_data[t] - (G @ x_h).flatten()
+
+        sign, logdet = np.linalg.slogdet(Omega)
+        loglik += -0.5 * (m_ * np.log(2 * np.pi) + logdet
+                          + float(a_t @ np.linalg.solve(Omega, a_t)))
+        kf_.update(y_data[t])
+
+    return loglik
+
+
+y_data_col = y_obs.reshape(-1, 1)
+ll = log_likelihood(A, C, G, R,
+                    y_data_col,
+                    np.zeros(1), np.eye(1) * 10.0)
+print(f"Log-likelihood of sample: {ll:.4f}")
+```
+
+## An example
+
+We now work through a structured example that shows how a bivariate VAR(2) fits naturally into the state space framework and how the Kalman filter delivers a 
+Wold (innovations) representation.
+
+### Linear state-space system
+
+The state and observation equations are
+
+$$
+x_{t+1} = A x_t + C w_{t+1}
+$$ (eq:ex_state)
+
+$$
+y_t = G x_t + v_t
+$$ (eq:ex_obs)
+
+with initial condition and shock distributions
+
+$$
+x_0 \sim \mathcal{N}(M_0, \Omega_0), \quad
+w_{t+1} \sim \mathcal{N}(0, I), \quad
+v_t \sim \mathcal{N}(0, R).
+$$
+
+### Steady-state Riccati equation
+
+The steady-state error covariance matrix $\Sigma$ satisfies
+
+$$
+\Sigma = A \Sigma A' + CC'
+         - A \Sigma G' \bigl(G \Sigma G' + R\bigr)^{-1} G \Sigma A'
+$$ (eq:ex_riccati)
+
+### Kalman gain
+
+$$
+K = A \Sigma G' \bigl(G \Sigma G' + R\bigr)^{-1}
+$$ (eq:ex_gain)
+
+### Kalman filter recursion
+
+Starting from an initial estimate $\hat{x}_0$, the Kalman filter updates the state estimate via
+
+$$
+\hat{x}_{t+1} = A \hat{x}_t + K a_t
+$$ (eq:ex_kf_update)
+
+where the **innovation** (prediction error) is
+
+$$
+a_t = y_t - G \hat{x}_t
+$$ (eq:ex_innovation)
+
+Substituting {eq}`eq:ex_innovation` into {eq}`eq:ex_kf_update` and expanding:
+
+$$
+\hat{x}_{t+1} = A \hat{x}_t + K(y_t - G\hat{x}_t)
+              = (A - KG)\hat{x}_t + K y_t
+              = (A - KG)\hat{x}_t + K G x_t + K v_t
+$$ (eq:ex_kf_expanded)
+
+### Impulse responses of $y_t$ to the innovations $a_t$
+
+It is useful to compute the **ordinary impulse response functions** of the
+observable vector $y_t$ to its own innovations $a_t$, the moving-average (Wold)
+representation that is the mirror image of the VAR {eq}`eq:var1`.
+
+From the time-invariant innovations representation {eq}`eq:innovti`
+
+$$
+\hat{x}_{t+1} = A\hat{x}_t + K a_t, \qquad y_t = G\hat{x}_t + a_t,
+$$
+
+the moving-average representation {eq}`eq:sf_wold` is
+
+$$
+y_t = \bigl[I + G(I - AL)^{-1} K L\bigr]\, a_t
+    = a_t + \sum_{h=1}^{\infty} G A^{h-1} K\, a_{t-h}.
+$$
+
+Hence the impulse response of $y_t$ to a unit innovation $a_t$ is
+
+$$
+\Psi_0 = I, \qquad \Psi_h = G A^{h-1} K \quad (h \ge 1).
+$$ (eq:ex_y_to_a)
+
+These coefficients decay at the rate governed by the eigenvalues of $A$, in
+contrast to the innovation-to-structural-shock responses studied later in the
+numerical example, which decay at the rate governed by the eigenvalues of
+$A - KG$.
+
+We can read the coefficients {eq}`eq:ex_y_to_a` directly off a `quantecon`
+`LinearStateSpace` object.
+
+We build a state-space system whose state is the
+filtered estimate $\hat{x}_t$, whose single "shock" is the innovation $a_t$
+loaded through $C = K$, and whose observation matrix is $G$.
+
+The
+`impulse_response` method of that object returns the sequence $G A^{j} K$ for
+$j = 0, 1, 2, \ldots$, which are exactly the $\Psi_h$ for $h \ge 1$; we prepend
+$\Psi_0 = I$ to capture the contemporaneous feed-through $y_t = G\hat{x}_t + a_t$.
+
+```{code-cell} ipython3
+def y_to_a_irf(A, K, G, T=40):
+    """
+    Ordinary impulse responses of the observable y_t to its own
+    innovations a_t in the innovations (Wold) representation
+
+        x_hat_{t+1} = A x_hat_t + K a_t
+        y_t      = G x_hat_t + a_t.
+
+    The moving-average coefficients are
+        Ψ_0 = I,   Ψ_h = G A^{h-1} K   (h >= 1).
+
+    The h >= 1 terms come from quantecon's LinearStateSpace: loading the
+    innovation a_t through C = K makes its impulse_response method return
+    G A^j K for j = 0, 1, 2, ....  We prepend Ψ_0 = I for the
+    contemporaneous feed-through.
+
+    Returns an array of shape (T, m, m); entry [h, i, j] is the response
+    of observable i at horizon h to a unit innovation in component j.
+    """
+    n, m     = A.shape[0], G.shape[0]
+    lss      = qe.LinearStateSpace(A, K, G, np.zeros((m, m)), mu_0=np.zeros(n))
+    _, ycoef = lss.impulse_response(j=T - 2)      # [GK, GAK, GA^2K, ...]
+    Psi      = np.empty((T, m, m))
+    Psi[0]   = np.eye(m)                          # contemporaneous response
+    for h in range(1, T):
+        Psi[h] = ycoef[h - 1]
+    return Psi
+```
+
+### Augmented state-space representation
+
+We want to express the innovations $a_t$ as a function of the state shocks
+$w_{t+1}$ and the measurement error $v_t$.
+
+To accomplish this, we start by stacking the true state $x_t$ and the filter estimate $\hat{x}_t$ into an augmented state vector that 
+obeys 
+
+$$
+\begin{pmatrix} x_{t+1} \\ \hat{x}_{t+1} \end{pmatrix}
+=
+\begin{pmatrix} A & 0 \\ KG & A - KG \end{pmatrix}
+\begin{pmatrix} x_t \\ \hat{x}_t \end{pmatrix}
++
+\begin{pmatrix} C & 0 \\ 0 & K \end{pmatrix}
+\begin{pmatrix} w_{t+1} \\ v_t \end{pmatrix}
+$$ (eq:ex_augmented)
+
+
+### Bivariate VAR(2) in state-space form
+
+Consider two observable series $r_t$ and $z_t$.
+
+Stack them into the state vector $x_t = (r_t,\; r_{t-1},\; z_t,\; z_{t-1})'$.
+
+We posit the VAR(2) state-transition equation:
+
+$$
+\begin{pmatrix} r_{t+1} \\ r_t \\ z_{t+1} \\ z_t \end{pmatrix}
+=
+\begin{pmatrix}
+  d_1      & d_2      & d_3      & d_4      \\
+  1        & 0        & 0        & 0        \\
+  \delta_1 & \delta_2 & \delta_3 & \delta_4 \\
+  0        & 0        & 1        & 0
+\end{pmatrix}
+\begin{pmatrix} r_t \\ r_{t-1} \\ z_t \\ z_{t-1} \end{pmatrix}
++
+\begin{pmatrix}
+  c_{11} & c_{12} \\
+  0      & 0      \\
+  c_{21} & c_{22} \\
+  0      & 0
+\end{pmatrix}
+\begin{pmatrix} w_{1,t+1} \\ w_{2,t+1} \end{pmatrix}
+$$ (eq:ex_var2_state)
+
+We consider two possible  observation equations. 
+
+The first is a bivariate observation of $r_t$ and $z_t$:
+
+$$
+\begin{pmatrix} r_t \\ z_t \end{pmatrix}
+=
+\begin{pmatrix}
+  1 & 0 & 0 & 0 \\
+  0 & 0 & 1 & 0
+\end{pmatrix}
+\begin{pmatrix} r_t \\ r_{t-1} \\ z_t \\ z_{t-1} \end{pmatrix}
++
+\begin{pmatrix} 1 & 0 \\ 0 & 1 \end{pmatrix}
+\begin{pmatrix} v_{1t} \\ v_{2t} \end{pmatrix}
+$$ (eq:ex_var2_obs)
+
+The second is a univariate observation of $r_t$:
+
+$$
+y_t = \begin{pmatrix} 1 & 0 & 0 & 0 \end{pmatrix}
+\begin{pmatrix} r_t \\ r_{t-1} \\ z_t \\ z_{t-1} \end{pmatrix}
++ v_{1t}
+$$ (eq:ex_scalar_obs)
+
+For the bivariate observation case, 
+the Wold representation is
+
+$$
+\begin{pmatrix} r_t \\ z_t \end{pmatrix}
+=
+\underbrace{G \bigl(I - (A - KG)L\bigr)^{-1} K}_{\text{transfer matrix}}
+\, a_t
+$$ (eq:ex_wold_bivariate)
+
+where $\mathbb{E} a_t a_t' = G \Sigma G' + R$ and $L$ is the lag operator.
+
+The innovation expressed in terms of the augmented state is
+
+$$
+a_t
+= \begin{bmatrix} G & -G \end{bmatrix}
+  \begin{pmatrix} x_t \\ \hat{x}_t \end{pmatrix}
++ \begin{bmatrix} 0 & I \end{bmatrix}
+  \begin{pmatrix} w_{t+1} \\ v_t \end{pmatrix}
+= G(x_t - \hat{x}_t) + v_t
+$$ (eq:ex_innovation_aug)
+
+
+
+For the scalar ($1 \times 1$) case with a single observable $r_t$, the observation matrix $G_1$ is the first row of $G$ and the scalar gain $K_1$ is the outcome of the Kalman filter for that case.  The univariate Wold representation is
+
+$$
+r_t = G_1 \bigl(I - (A - K_1 G_1) L\bigr)^{-1} K_1 \, u_t
+$$ (eq:ex_wold_scalar)
+
+where the univariate innovation $u_t$ is given by 
+
+$$
+u_t
+= \begin{bmatrix} G_1 & -G_1 \end{bmatrix}
+  \begin{pmatrix} x_t \\ \hat{x}_t \end{pmatrix}
++ \begin{bmatrix} 0 & 1 \end{bmatrix}
+  \begin{pmatrix} w_{t+1} \\ v_t \end{pmatrix}
+= G_1(x_t - \hat{x}_t) + v_{1,t}
+$$ (eq:ex_innovation_aug2)
+
+and $\mathbb{E} u_t^2 = G_1 \check  \Sigma G_1' + R_{11}$, and $\check \Sigma$ is the steady-state covariance of the augmented state vector associated with 
+$G_1, R_{11}$.
+
+### Numerical example: impulse responses of innovations
+
+We now set specific parameter values and compute the **impulse response functions
+of the Kalman innovations** ($a_t$ in System 1 and $u_t$ in System 2) to the
+two structural shocks $w_{1,t+1}$ and $w_{2,t+1}$.
+
+The key object is **not** the response of the observable $y_t$ to the shocks, but
+rather the response of the innovation that the Kalman filter produces.
+
+In System 1, $a_t$ is the $2 \times 1$ forecast error of $(r_t, z_t)'$ given the
+filter's estimate; in System 2, $u_t$ is the scalar forecast error of $r_t$ given
+the filter's estimate based on the $r_t$ history alone.
+
+The parameter values are:
+
+$$
+\begin{aligned}
+d_1 &= 0.80,\quad d_2 = 0.05,\quad d_3 = 0.75,\quad d_4 = -0.72 \\
+\delta_1 &= 0.00,\quad \delta_2 = 0.00,\quad \delta_3 = 0.75,\quad \delta_4 = 0.20 \\
+c_{11} &= 1.0,\quad c_{12} = 0.0,\quad c_{21} = 0.0,\quad c_{22} = 1.0 \\
+R &= 0.0001 \times I_2 \quad \text{(bivariate case)}, \qquad
+R = 0.0001 \quad \text{(univariate case)}.
+\end{aligned}
+$$
+
+These give the $4 \times 4$ transition matrix and $4 \times 2$ shock-loading matrix
+
+$$
+A = \begin{pmatrix}
+0.80 & 0.05 & 0.75 & -0.72 \\
+1    & 0    & 0    & 0    \\
+0    & 0    & 0.75 & 0.20 \\
+0    & 0    & 1    & 0
+\end{pmatrix}, \qquad
+C = \begin{pmatrix}
+1   & 0   \\
+0   & 0   \\
+0   & 1   \\
+0   & 0
+\end{pmatrix}.
+$$
+
+**System 1** uses the bivariate observation equation {eq}`eq:ex_var2_obs`, so
+$G$ selects $(r_t, z_t)'$ from the state and the innovation $a_t$ is $2 \times 1$.
+
+**System 2** uses the univariate observation equation {eq}`eq:ex_scalar_obs`, so
+$G_1$ selects only $r_t$ and the innovation $u_t$ is scalar.
+
+Because the innovation equals $a_t = G(x_t - \hat{x}_t) + v_t$, a unit shock $w_j$
+propagates through the **augmented** state $(x_t', \hat{x}_t')'$.
+
+A straightforward
+calculation using the augmented recursion {eq}`eq:ex_augmented` shows that the
+**impulse response of the innovation** at horizon $h \geq 0$ is
+
+$$
+\text{IRF}_{a}(h) = G(A - KG)^h C e_j, \qquad j = 1, 2,
+$$
+
+where $e_j$ is the $j$-th column of $I_2$ and $K$ is the appropriate steady-state
+Kalman gain (System 1 uses $K$ from the bivariate filter; System 2 uses $K_1$ from
+the univariate filter).
+
+At $h=0$ the shock hits and the innovation equals $GCe_j$;
+for $h \geq 1$ the response decays at the rate governed by the eigenvalues of $A - KG$.
+
+```{code-cell} ipython3
+import numpy as np
+import matplotlib.pyplot as plt
+import quantecon as qe
+
+# -- Parameters ---------------------------------------------------------------
+d1, d2, d3, d4             = 0.80, 0.05, 0.75, -.72
+delta1, delta2, delta3, delta4 = 0.00, 0.00, 0.75, 0.20
+c11, c12, c21, c22         = 1.0,  0.0,  0.0,  1.0
+sigma_v                    = 0.01          # sqrt(0.0001)
+
+# -- Shared matrices -----------------------------------------------------------
+A_var = np.array([[d1,     d2,     d3,     d4    ],
+                  [1.0,    0.0,    0.0,    0.0   ],
+                  [delta1, delta2, delta3, delta4],
+                  [0.0,    0.0,    1.0,    0.0   ]])
+
+C_var = np.array([[c11, c12],
+                  [0.0, 0.0],
+                  [c21, c22],
+                  [0.0, 0.0]])
+
+# -- System 1: bivariate observation ------------------------------------------
+G_biv = np.array([[1.0, 0.0, 0.0, 0.0],
+                  [0.0, 0.0, 1.0, 0.0]])
+H_biv = sigma_v * np.eye(2)          # H @ H.T = 0.0001 * I_2
+
+lss_biv = qe.LinearStateSpace(A_var, C_var, G_biv, H_biv,
+                               mu_0=np.zeros(4), Sigma_0=np.eye(4))
+kf_biv  = qe.Kalman(lss_biv)
+Sigma_biv, K_biv = kf_biv.stationary_values()
+
+print("System 1 - steady-state Kalman gain K (4x2):")
+print(np.round(K_biv, 5))
+
+# -- System 2: univariate observation -----------------------------------------
+G_uni = np.array([[1.0, 0.0, 0.0, 0.0]])
+H_uni = np.array([[sigma_v]])         # H @ H.T = 0.0001
+
+lss_uni = qe.LinearStateSpace(A_var, C_var, G_uni, H_uni,
+                               mu_0=np.zeros(4), Sigma_0=np.eye(4))
+kf_uni  = qe.Kalman(lss_uni)
+Sigma_uni, K_uni = kf_uni.stationary_values()
+
+print("\nSystem 2 - steady-state Kalman gain K (4x1):")
+print(np.round(K_uni, 5))
+```
+
+```{code-cell} ipython3
+# -- Covariance comparisons ----------------------------------------------------
+
+# State-noise covariance CC' versus innovation covariance G Σ G' + R (System 1)
+CC_prime       = C_var @ C_var.T
+R_biv          = (sigma_v**2) * np.eye(2)
+innov_cov_biv  = G_biv @ Sigma_biv @ G_biv.T + R_biv
+
+print("State-noise covariance  CC'  (4x4):")
+print(np.round(CC_prime, 6))
+
+print("\nSystem 1 - innovation covariance  GΣG' + R  (2x2):")
+print(np.round(innov_cov_biv, 6))
+
+# Steady-state state covariance: System 1 (Σ_biv) vs System 2 (Σ_uni)
+print("\nSystem 1 - steady-state state covariance  Σ  (4x4):")
+print(np.round(Sigma_biv, 6))
+
+print("\nSystem 2 - steady-state state covariance  Σ_check  (4x4):")
+print(np.round(Sigma_uni, 6))
+
+print("\nDifference  Σ_check - Σ  (System 2 minus System 1):")
+print(np.round(Sigma_uni - Sigma_biv, 6))
+```
+
+#### Impulse responses of $y_t$ to the innovations $a_t$
+
+We now apply the helper `y_to_a_irf` defined above to compute the ordinary
+impulse responses {eq}`eq:ex_y_to_a` of the observable $y_t$ to its own
+innovations $a_t$, for both System 1 (bivariate, so $a_t$ is $2 \times 1$) and
+System 2 (univariate, so $u_t$ is scalar).
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: "System 1: IRFs of $y_t = (r_t, z_t)$ to its innovations $a_t$"
+    name: fig-kfvar-sys1-ya
+---
+T_irf    = 40
+horizons = np.arange(T_irf)
+
+Psi_biv = y_to_a_irf(A_var, K_biv, G_biv, T_irf)   # System 1: (T, 2, 2)
+Psi_uni = y_to_a_irf(A_var, K_uni, G_uni, T_irf)   # System 2: (T, 1, 1)
+
+obs_labels   = [r'$r_t$', r'$z_t$']
+innov_labels = [r'$a_{1,t}$', r'$a_{2,t}$']
+
+# -- Figure: System 1 - responses of y_t = (r_t, z_t)' to its innovations a_t --
+fig, axes = plt.subplots(2, 2, figsize=(10, 6), sharex=True)
+for i, obs in enumerate(obs_labels):
+    for j, inn in enumerate(innov_labels):
+        ax = axes[i, j]
+        ax.plot(horizons, Psi_biv[:, i, j], lw=1.8)
+        ax.axhline(0, color='k', lw=0.6, ls='--')
+        ax.set_title(f'{obs} to {inn}', fontsize=9)
+        if i == 1:
+            ax.set_xlabel('horizon $h$')
+        if j == 0:
+            ax.set_ylabel('response')
+
+plt.tight_layout()
+plt.show()
+```
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: "System 2: IRF of $r_t$ to its innovation $u_t$"
+    name: fig-kfvar-sys2-ya
+---
+fig, ax = plt.subplots(figsize=(7, 3.5))
+ax.plot(horizons, Psi_uni[:, 0, 0], lw=1.8)
+ax.axhline(0, color='k', lw=0.6, ls='--')
+ax.set_xlabel('horizon $h$')
+ax.set_ylabel('response')
+plt.tight_layout()
+plt.show()
+```
+
+At horizon $h = 0$ each innovation passes one-for-one into its own component of
+$y_t$, the contemporaneous response equals the identity matrix $\Psi_0 = I$, so
+the diagonal panels start at $1$ and the off-diagonal panels start at $0$.
+
+For $h \ge 1$ the responses propagate through the state matrix $A$ and decay
+geometrically, tracing out the Wold moving-average representation of the
+bivariate (System 1) and univariate (System 2) processes.
+
+#### Impulse responses of the innovations to the structural shocks
+
+```{code-cell} ipython3
+def innovation_irf(A, K, G, C, T=40):
+    """
+    Impulse responses of the Kalman innovation to each structural shock.
+
+    The innovation at horizon h to a unit shock e_j is
+        G (A - KG)^h C e_j,   h = 0, 1, ..., T-1.
+
+    Returns irf of shape (T, n_a, n_w).
+    """
+    AKG   = A - K @ G
+    n_a   = G.shape[0]
+    n_w   = C.shape[1]
+    irf   = np.zeros((T, n_a, n_w))
+    x     = C.copy()           # (A-KG)^0 @ C  at h = 0
+    for h in range(T):
+        irf[h] = G @ x
+        x      = AKG @ x
+    return irf
+
+T_irf    = 40
+horizons = np.arange(T_irf)
+
+irf_biv = innovation_irf(A_var, K_biv, G_biv, C_var, T_irf)   # (T, 2, 2)
+irf_uni = innovation_irf(A_var, K_uni, G_uni, C_var, T_irf)   # (T, 1, 2)
+
+shock_labels  = [r'Shock $w_{1,t+1}$', r'Shock $w_{2,t+1}$']
+innov_biv_lbl = [r'$a_{1,t}$', r'$a_{2,t}$']
+
+# -- Figure 1: System 1 - innovation IRFs --------------------------------------
+fig, axes = plt.subplots(2, 2, figsize=(10, 6), sharex=True)
+for j, shock in enumerate(shock_labels):
+    for i, albl in enumerate(innov_biv_lbl):
+        ax = axes[i, j]
+        ax.plot(horizons, irf_biv[:, i, j], lw=1.8)
+        ax.axhline(0, color='k', lw=0.6, ls='--')
+        ax.set_title(f'System 1 - {albl} to {shock}', fontsize=9)
+        ax.set_xlabel('horizon $h$')
+        if j == 0:
+            ax.set_ylabel('innovation response')
+
+plt.suptitle(r'System 1: IRFs of innovation $a_t$ to structural shocks',
+             fontsize=11)
+plt.tight_layout()
+plt.show()
+
+# -- Figure 2: System 2 - innovation IRFs --------------------------------------
+fig, axes = plt.subplots(1, 2, figsize=(10, 3.5))
+for j, shock in enumerate(shock_labels):
+    axes[j].plot(horizons, irf_uni[:, 0, j], lw=1.8)
+    axes[j].axhline(0, color='k', lw=0.6, ls='--')
+    axes[j].set_title(f'System 2 - $u_t$ to {shock}', fontsize=9)
+    axes[j].set_xlabel('horizon $h$')
+axes[0].set_ylabel('innovation response')
+
+plt.suptitle(r'System 2: IRFs of innovation $u_t$ to structural shocks',
+             fontsize=11)
+plt.tight_layout()
+plt.show()
+```
+
+The two sets of innovation impulse responses illustrate an important point.
+
+In **System 1** the filter uses both $r_t$ and $z_t$, so $a_t$ is $2 \times 1$ and
+its response to each shock is a pair of numbers at every horizon.
+
+In **System 2** only $r_t$ is observed, so $u_t$ is scalar and carries less
+information about the two-dimensional shock vector.
+
+Because $K$ differs across the two systems, the decay matrix $A - KG$ differs
+as well, so the innovation IRFs $G(A-KG)^h C e_j$ differ across the two systems
+even though the structural DGP is identical.
+
+The covariance comparisons printed above reveal two further insights.
+
+**State-noise $CC'$ vs. innovation covariance $G\Sigma G' + R$ (System 1).**
+
+The matrix $CC'$ is the unconditional covariance contributed by the structural
+shocks to the state transition in one period.
+
+The innovation covariance $G\Sigma G' + R$ is the covariance of the one-step
+forecast error in the *observable* vector after the Kalman filter has processed
+all available information.
+
+In this example $GCC'G'$ has diagonal entries equal to $1$.
+
+The printed innovation covariance has diagonal entries slightly above $1$ because it also
+contains residual uncertainty from estimating the hidden state and the small
+measurement-error variance $R = 0.0001 I_2$.
+
+Thus $G\Sigma G' + R$ should not be compared directly with $GCC'G'$ as a
+strict variance reduction.
+
+The variance reduction created by filtering is measured relative to the broader
+forecast-error covariance before conditioning on the available observations.
+
+**System 1 covariance $\Sigma$ vs. System 2 covariance $\check\Sigma$.**
+
+Restricting observations to $r_t$ alone (System 2) yields less information about
+the hidden state, so the filter is less precise: $\check\Sigma \succeq \Sigma$,
+i.e., the difference $\check\Sigma - \Sigma$ is positive semi-definite.
+
+The printed difference matrix confirms this ordering for our parameter values.
+
+## Exercises
+
+```{exercise-start}
+:label: kf_ex1
+```
+
+Consider the scalar AR(1) state space system used above with $\rho = 0.9$,
+$\sigma_w = 0.5$, $\sigma_v = 1.0$.
+
+Derive an algebraic expression for the **steady-state** conditional variance
+$\Sigma_\infty$ by solving the scalar Riccati equation {eq}`eq:riccati` at its
+fixed point $\Sigma_{t+1} = \Sigma_t = \Sigma$.
+
+Show that $\Sigma$ satisfies a quadratic equation, find its positive root, and
+verify numerically that your formula matches `kf.Sigma_infinity`.
+
+```{exercise-end}
+```
+
+```{solution-start} kf_ex1
+:class: dropdown
+```
+
+Setting $\Sigma_{t+1} = \Sigma_t = \Sigma$ in the scalar version of
+{eq}`eq:riccati` with $A = \rho$, $CC' = \sigma_w^2$, $GG' = 1$,
+$R = \sigma_v^2$:
+
+$$
+\Sigma = \rho^2 \Sigma + \sigma_w^2 - \frac{\rho^2 \Sigma^2}{\Sigma + \sigma_v^2}
+$$
+
+Multiplying through by $\Sigma + \sigma_v^2$ and rearranging:
+
+$$
+\Sigma^2 + \left[\sigma_v^2(1-\rho^2) - \sigma_w^2\right]\Sigma
+  - \sigma_v^2 \sigma_w^2 = 0
+$$
+
+Taking the positive root of this quadratic:
+
+$$
+\Sigma_\infty
+  = \frac{\sigma_w^2 - \sigma_v^2(1-\rho^2)
+          + \sqrt{\left[\sigma_v^2(1-\rho^2) - \sigma_w^2\right]^2
+          + 4 \sigma_v^2 \sigma_w^2}}{2}
+$$
+
+```{code-cell} ipython3
+rho_, sigma_w_, sigma_v_ = 0.9, 0.5, 1.0
+
+b = sigma_v_**2 * (1 - rho_**2) - sigma_w_**2
+discriminant = b**2 + 4 * sigma_v_**2 * sigma_w_**2
+Sigma_formula = (-b + np.sqrt(discriminant)) / 2
+
+A_ = np.array([[rho_]])
+C_ = np.array([[sigma_w_]])
+G_ = np.array([[1.0]])
+R_ = np.array([[sigma_v_**2]])
+H_ = np.array([[sigma_v_]])   # R_ = H_ @ H_.T
+lss_ = qe.LinearStateSpace(A_, C_, G_, H_, mu_0=np.zeros(1), Sigma_0=np.eye(1))
+kf_  = qe.Kalman(lss_)
+
+print(f"Analytical Σ_inf   = {Sigma_formula:.8f}")
+print(f"Numerical  Σ_inf   = {kf_.Sigma_infinity[0, 0]:.8f}")
+```
+
+```{solution-end}
+```
+
+```{exercise-start}
+:label: kf_ex2
+```
+
+**Bivariate example.**
+
+Consider a two-dimensional state with a one-dimensional observation:
+
+$$
+A = \begin{pmatrix} 0.9 & 0.1 \\ 0 & 0.8 \end{pmatrix}, \quad
+C = \begin{pmatrix} 0.4 \\ 0.1 \end{pmatrix}, \quad
+G = \begin{pmatrix} 1 & 0 \end{pmatrix}, \quad
+R = [0.5]
+$$
+
+(a) Simulate $T = 500$ observations from this system starting from a diffuse prior.
+
+(b) Run the Kalman filter and plot both components of $\hat{x}_t$ against the
+    true hidden state path.
+
+(c) Compute and report the steady-state covariance $\Sigma_\infty$ and
+    Kalman gain $K_\infty$.
+
+(d) Check that the eigenvalues of $A - K_\infty G$ lie strictly inside the
+    unit circle, confirming that the VAR representation {eq}`eq:var1` is stable.
+
+```{exercise-end}
+```
+
+```{solution-start} kf_ex2
+:class: dropdown
+```
+
+```{code-cell} ipython3
+A2 = np.array([[0.9, 0.1],
+               [0.0, 0.8]])
+C2 = np.array([[0.4],
+               [0.1]])
+G2 = np.array([[1.0, 0.0]])
+R2 = np.array([[0.5]])
+H2 = np.array([[np.sqrt(0.5)]])   # R2 = H2 @ H2.T
+
+lss2  = qe.LinearStateSpace(A2, C2, G2, H2,
+                             mu_0=np.zeros(2),
+                             Sigma_0=np.eye(2) * 5.0)
+kf2   = qe.Kalman(lss2)
+kf2.set_state(np.zeros(2), np.eye(2) * 5.0)
+
+T2 = 500
+x2_path, y2_path = lss2.simulate(ts_length=T2, random_state=0)
+
+x_hats2 = np.zeros((T2, 2))
+for t in range(T2):
+    kf2.update(y2_path[:, t])
+    x_hats2[t] = kf2.x_hat.ravel()
+
+fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
+for i, ax in enumerate(axes):
+    ax.plot(x2_path[i, :T2], lw=1.2, label=f'True $x_{{{i+1},t}}$')
+    ax.plot(x_hats2[:, i],   lw=1.2, ls='--', label=rf'$\hat{{x}}_{{{i+1},t}}$')
+    ax.legend(fontsize=9)
+    ax.set_ylabel(f'component {i+1}')
+axes[1].set_xlabel('time $t$')
+plt.suptitle('Kalman filter: bivariate hidden state')
+plt.tight_layout()
+plt.show()
+
+# (c) Steady-state values
+Sigma2_inf, K2_inf = kf2.stationary_values()
+print("Steady-state covariance Σ_inf:")
+print(np.round(Sigma2_inf, 5))
+print("\nSteady-state Kalman gain K_inf:")
+print(np.round(K2_inf, 5))
+
+# (d) Eigenvalues of A - K_inf G
+AKG2 = A2 - K2_inf @ G2
+eigvals2 = np.linalg.eigvals(AKG2)
+print(f"\nEigenvalues of A - K_inf G: {np.round(eigvals2, 5)}")
+print(f"Stable VAR: {np.all(np.abs(eigvals2) < 1)}")
+```
+
+```{solution-end}
+```
+
+```{exercise-start}
+:label: kf_ex3
+```
+
+**Likelihood and parameter estimation.**
+
+Using the scalar model from the main text with true parameters
+$(\rho, \sigma_w, \sigma_v) = (0.9, 0.5, 1.0)$:
+
+(a) Simulate $T = 300$ observations.
+
+(b) Write a function that evaluates the **log-likelihood** as a function of
+    $\rho \in (0, 1)$, holding $\sigma_w = 0.5$ and $\sigma_v = 1.0$ fixed,
+    and plot the log-likelihood against $\rho$ for a grid of values.
+
+(c) Locate the maximum numerically and check that it is close to the true value
+    $\rho = 0.9$.
+
+```{exercise-end}
+```
+
+```{solution-start} kf_ex3
+:class: dropdown
+```
+
+```{code-cell} ipython3
+# True parameters
+rho_true, sw_true, sv_true = 0.9, 0.5, 1.0
+
+A_t = np.array([[rho_true]])
+C_t = np.array([[sw_true]])
+G_t = np.array([[1.0]])
+R_t = np.array([[sv_true**2]])
+H_t = np.array([[sv_true]])   # R_t = H_t @ H_t.T
+
+lss_t = qe.LinearStateSpace(A_t, C_t, G_t, H_t,
+                             mu_0=np.zeros(1), Sigma_0=np.eye(1))
+_, y_sim = lss_t.simulate(ts_length=300, random_state=7)
+y_sim = y_sim.T          # shape (300, 1)
+
+def ll_rho(rho_val):
+    A_ = np.array([[rho_val]])
+    C_ = np.array([[sw_true]])
+    G_ = np.array([[1.0]])
+    R_ = np.array([[sv_true**2]])
+    return log_likelihood(A_, C_, G_, R_, y_sim,
+                          np.zeros(1), np.eye(1) * 10.0)
+
+rho_grid = np.linspace(0.5, 0.99, 60)
+ll_vals  = np.array([ll_rho(r) for r in rho_grid])
+
+rho_mle = rho_grid[np.argmax(ll_vals)]
+
+fig, ax = plt.subplots(figsize=(8, 4))
+ax.plot(rho_grid, ll_vals)
+ax.axvline(rho_true, color='k',   ls='--', label=f'True ρ = {rho_true}')
+ax.axvline(rho_mle,  color='tab:red', ls=':', label=f'MLE  ρ_hat = {rho_mle:.3f}')
+ax.set_xlabel(r'$\rho$')
+ax.set_ylabel('log-likelihood')
+ax.set_title('Profile log-likelihood as a function of $\\rho$')
+ax.legend()
+plt.tight_layout()
+plt.show()
+
+print(f"True ρ = {rho_true},  MLE ρ_hat = {rho_mle:.4f}")
+```
+
+```{solution-end}
+```
diff --git a/lectures/lq_consumption_smoothing.md b/lectures/lq_consumption_smoothing.md
new file mode 100644
index 000000000..410138279
--- /dev/null
+++ b/lectures/lq_consumption_smoothing.md
@@ -0,0 +1,2149 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.11.1
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: python3
+---
+
+# LQ Consumption Smoothing: Incomplete Markets, Complete Markets, and Robust Control
+
+
+```{index} single: LQ Permanent Income Model
+```
+
+```{index} single: Consumption Smoothing
+```
+
+## Overview
+
+This lecture describes two representations of a linear-quadratic (LQ) permanent income model, then
+extends the model to incorporate a consumer's concern for robustness to misspecification of the
+stochastic process governing labor income.
+
+**Part A** covers the standard LQ permanent income model, a rational-expectations version of the
+theories of {cite}`Friedman1956` and {cite}`Hall1978`.
+
+We use the model to illustrate:
+
+- impulse response functions
+- alternative state-space representations of the optimal decision rule
+- cointegration of consumption and assets
+- a "borrowers and lenders" closed economy (Bewley model)
+- complete-markets consumption smoothing
+
+**Part B** covers a robust version of the permanent income model due to {cite}`HST_1999`
+(HST) and {cite}`HansenSargent2008`.  A consumer who distrusts his specification of the labor income process engages in a form of
+precautionary savings even when preferences are quadratic.
+
+We show:
+
+- how a concern for robustness is observationally equivalent (for quantities) to an increase in
+  impatience
+- how the worst-case model distorts the endowment process toward greater persistence
+- frequency-domain and detection-error-probability characterizations of the size of model
+  uncertainty
+
+**Synthesis: a Bewley model with heterogeneous beliefs and discount factors.**  The lecture
+concludes by combining Parts A and B in a single general-equilibrium model.
+
+Using tools that we borrow from {cite}`HansenSargent2008`, we show:
+
+- how a continuum of consumers can differ in their robustness parameter $\sigma^i \leq 0$ and
+  their discount factor $\beta^i$, with each pair lying on the observational-equivalence locus
+  from Part B
+- how every such consumer nevertheless chooses the **same consumption-saving rule** as the
+  plain-vanilla $(\sigma = 0, \beta_0)$ agent of Part A
+- how the equilibrium interest rate $R = \beta_0^{-1}$ and all aggregate dynamics therefore
+  coincide with those of the Part A Bewley model
+- how agents can hold genuinely different internal (worst-case) models of their income process
+  while remaining observationally indistinguishable in quantities
+
+Throughout, we set $\beta R = 1$, so the consumer's subjective discount factor equals the bond price.
+
+Let's begin with some imports.
+
+```{code-cell} ipython3
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.linalg import solve, inv, solve_discrete_lyapunov
+from scipy.stats import norm
+
+```
+
+---
+
+## Part A: the standard LQ permanent income model
+
+```{index} single: LQ Permanent Income Model; standard
+```
+
+### Setup
+
+```{index} single: Permanent Income Hypothesis; Friedman
+```
+
+A consumer has preferences over consumption streams ordered by
+
+$$
+\mathbb{E}_0 \sum_{t=0}^{\infty} \beta^t u(c_t)
+$$ (eq:sprob1)
+
+where $\mathbb{E}_t$ is the mathematical expectation conditioned on the consumer's time-$t$ information,
+$c_t$ is time-$t$ consumption, $u(c)$ is a strictly concave one-period utility function, and
+$\beta \in (0,1)$ is a discount factor.
+
+The consumer maximizes {eq}`eq:sprob1` by choosing a plan
+$\{c_t, b_{t+1}\}_{t=0}^{\infty}$ subject to the sequence of budget constraints
+
+$$
+c_t + b_t = R^{-1} b_{t+1} + y_t, \quad t \geq 0
+$$ (eq:sprob2)
+
+where $\{y_t\}$ is an exogenous stationary endowment process, $R$ is a constant gross risk-free
+interest rate, $b_t$ is a one-period risk-free bond maturing at $t$, and $b_0$ is a given
+initial condition.
+
+```{note}
+For $t \geq 1$, $b_t$ is chosen at time $t-1$.  The bond $b_t > 0$ represents debt owed by
+the consumer at the start of period $t$.
+```
+
+We assume $R^{-1} = \beta$.
+
+The endowment process has the state-space representation
+
+$$
+\begin{aligned}
+z_{t+1} &= \check{A}\, z_t + \check{C}\, w_{t+1} \\
+y_t &= \check{G}\, z_t
+\end{aligned}
+$$ (eq:sprob15)
+
+where $w_{t+1}$ is i.i.d. with mean zero and identity covariance matrix, $\check{A}$ is a stable
+matrix (eigenvalues strictly less than one in modulus), and $\check{G}$ is a row vector.
+
+The state confronting the household at $t$ is
+$\bigl[b_t \;\; z_t'\bigr]'$, where $b_t$ is its one-period debt due at the start of period $t$
+and $z_t$ contains all variables useful for forecasting its future endowment.
+
+To make the problem linear-quadratic, we adopt the **quadratic utility function**
+
+$$
+u(c_t) = -\tfrac{1}{2}(c_t - \gamma)^2
+$$
+
+where $\gamma > 0$ is a bliss level of consumption.
+
+We allow $c_t$ to be negative (a producer
+rather than a consumer).
+
+We impose the **transversality condition**
+
+$$
+\mathbb{E}_0 \sum_{t=0}^{\infty} \beta^t b_t^2 < +\infty
+$$ (eq:sprob3)
+
+which rules out Ponzi schemes.
+
+### Euler equation and certainty equivalence
+
+With quadratic utility, the first-order conditions for the consumer's problem imply the **martingale
+Euler equation**
+
+$$
+\mathbb{E}_t c_{t+1} = c_t
+$$ (eq:sprob5)
+
+```{note}
+Equation {eq}`eq:sprob5` says that consumption is a martingale.  This is the key implication of
+the LQ permanent income model.  It contrasts with models that have convex marginal utility
+($u''' > 0$), where consumption is instead a submartingale.
+```
+
+The problem satisfies a **certainty-equivalence** property: one can find the optimal plan by (1)
+solving the problem under perfect foresight to express $c_t$ as a function of $b_t$ and the entire
+future path $\{y_{t+j}\}_{j=0}^{\infty}$, and then (2) replacing those future values with
+$\{\mathbb{E}_t y_{t+j}\}_{j=0}^{\infty}$.
+
+### The optimal consumption function
+
+Solving the budget constraint {eq}`eq:sprob2` forward, imposing the transversality condition, and
+taking conditional expectations gives
+
+$$
+b_t = \sum_{j=0}^{\infty} \beta^j \mathbb{E}_t y_{t+j} - \frac{1}{1-\beta} c_t
+$$ (eq:sprob7)
+
+Rearranging yields the **consumption function**
+
+$$
+c_t = (1-\beta)\!\left[\sum_{j=0}^{\infty} \beta^j \mathbb{E}_t y_{t+j} - b_t\right]
+$$ (eq:sprob8)
+
+Equivalently, with net interest rate $r$ defined by $\beta = 1/(1+r)$,
+
+$$
+c_t = \frac{r}{1+r}\!\left[\sum_{j=0}^{\infty} \beta^j \mathbb{E}_t y_{t+j} - b_t\right]
+$$ (eq:sprob9)
+
+Consumption equals $r/(1+r)$ times total wealth, where total wealth is the sum of human wealth
+$\sum_{j=0}^{\infty}\beta^j \mathbb{E}_t y_{t+j}$ and financial wealth $-b_t$.
+
+Using the state-space representation {eq}`eq:sprob15` to evaluate the geometric sum of expected
+future endowments,
+
+$$
+\sum_{j=0}^{\infty} \beta^j \mathbb{E}_t y_{t+j} = \check{G}(I - \beta \check{A})^{-1} z_t
+$$ (eq:discount1)
+
+we obtain the **Lucas-critique-respecting consumption function**
+
+$$
+c_t = (1-\beta)\!\left[\check{G}(I-\beta\check{A})^{-1} z_t - b_t\right]
+$$ (eq:lccf)
+
+This expresses $c_t$ as a function of the state $[b_t,\, z_t']'$ that confronts the household.
+
+### Representation 1: state $(b_t, z_t)$
+
+Combining the endowment law of motion with the optimal debt dynamics (derived by substituting
+{eq}`eq:lccf` into {eq}`eq:sprob2`) gives the first system representation:
+
+$$
+\begin{aligned}
+z_{t+1} &= \check{A}\, z_t + \check{C}\, w_{t+1} \\
+b_{t+1} &= b_t + \check{G}\bigl[(I - \beta\check{A})^{-1}(\check{A}-I)\bigr] z_t \\
+y_t &= \check{G}\, z_t \\
+c_t &= (1-\beta)\!\left[\check{G}(I-\beta\check{A})^{-1} z_t - b_t\right]
+\end{aligned}
+$$ (eq:rep1)
+
+In this representation the **exogenous** state is $z_t$ and the **endogenous** state is $b_t$.
+
+### Representation 2: state $(c_t, z_t)$
+
+{cite}`Hall1978` showed that the LQ permanent income model implies a particularly sharp
+state-space representation in which the state consists of current consumption $c_t$ and the
+exogenous endowment state $z_t$, with assets $b_t$ becoming an outcome rather than a state
+variable.
+
+**Consumption innovation.** Shifting {eq}`eq:sprob8` forward, eliminating $b_{t+1}$ via
+{eq}`eq:sprob2`, and rearranging yields
+
+$$
+c_{t+1} - c_t = (1-\beta)\sum_{j=0}^{\infty} \beta^j \bigl(\mathbb{E}_{t+1} y_{t+j+1} - \mathbb{E}_t y_{t+j+1}\bigr)
+$$ (eq:sprob11)
+
+The right-hand side is $(1-\beta)$ times the time-$(t+1)$ **innovation** to the expected present
+value of the endowment stream.
+
+Suppose the endowment has the moving-average representation
+
+$$
+y_{t+1} = d(L)\, w_{t+1}, \qquad d(L) = \sum_{j=0}^{\infty} d_j L^j
+$$ (eq:sprob12)
+
+where $d(L) = \check{G}(I - \check{A} L)^{-1}\check{C}$.  Then
+
+$$
+\mathbb{E}_{t+1} y_{t+j} - \mathbb{E}_t y_{t+j} = d_{j-1}\, w_{t+1}
+$$ (eq:sprob120)
+
+Substituting {eq}`eq:sprob120` into {eq}`eq:sprob11` gives the key result
+
+$$
+c_{t+1} - c_t = (1-\beta)\, d(\beta)\, w_{t+1}
+$$ (eq:sprob13)
+
+where $d(\beta) = \check{G}(I-\beta\check{A})^{-1}\check{C}$ is the **present value of the
+moving-average coefficients**.  Consumption is a **random walk** with innovation
+$(1-\beta)d(\beta)w_{t+1}$.
+
+**Full second representation.** Combining {eq}`eq:sprob13` and {eq}`eq:sprob7` gives
+
+$$
+\begin{aligned}
+c_{t+1} &= c_t + (1-\beta)\,\check{G}(I-\beta\check{A})^{-1}\check{C}\, w_{t+1} \\
+b_t &= \check{G}(I-\beta\check{A})^{-1} z_t - \frac{1}{1-\beta}\,c_t \\
+y_t &= \check{G}\, z_t \\
+z_{t+1} &= \check{A}\, z_t + \check{C}\, w_{t+1}
+\end{aligned}
+$$ (eq:sprob16)
+
+This representation reveals several important features of the optimal decision rule:
+
+1. **State**: The state consists of the endogenous component $c_t$ and the exogenous component
+   $z_t$.  Financial assets $b_t$ have disappeared as a state variable because they are encoded
+   in $c_t$.
+
+2. **Random walk**: Consumption is a random walk with innovation $(1-\beta)d(\beta)w_{t+1}$, which
+   confirms that the Euler equation {eq}`eq:sprob5` is built into the solution.  The random-walk
+   property implies that consumption has no asymptotic stationary distribution.
+
+3. **Box impulse response**: For all $j \geq 1$, the response of $c_{t+j}$ to the innovation
+   $w_{t+1}$ is the constant $(1-\beta)d(\beta)$, giving a "box-shaped" impulse response.
+
+4. **Cointegration**: Both $c_t$ and $b_t$ are nonstationary (unit-root processes), but the
+   linear combination $(1-\beta)b_t + c_t$ is stationary.  From {eq}`eq:sprob7`,
+
+$$
+(1-\beta)b_t + c_t = (1-\beta)\mathbb{E}_t\sum_{j=0}^{\infty}\beta^j y_{t+j}
+$$ (eq:cointegration)
+
+   The left side is the cointegrating residual, which equals the consumer's expected present value
+   of future income.
+
+### Debt dynamics
+
+```{index} single: History Dependence
+```
+
+Subtracting {eq}`eq:sprob16` (equation for $b_t$) at time $t$ from the same equation at time $t+1$
+and substituting gives
+
+$$
+b_{t+1} - b_t = \check{G}(I-\beta\check{A})^{-1}(\check{A}-I)\, z_t
+$$ (eq:debt_evolution)
+
+This shows that $b_{t+1}$ is **predetermined** at time $t$ as a function of $z_t$ alone.
+Solving backward from any $t$, $b_t$ depends on the entire history $z^{t-1} = [z_{t-1},\ldots,z_0]$
+and the initial condition $b_0$.  This **history dependence** is a hallmark of incomplete markets.
+
+### Two classic examples
+
+```{index} single: Permanent Income Model; examples
+```
+
+We illustrate formulas {eq}`eq:sprob16` with two examples.
+
+In both, the endowment is
+$y_t = z_{1t} + z_{2t}$, where
+
+$$
+\begin{pmatrix}z_{1,t+1}\\z_{2,t+1}\end{pmatrix}
+=
+\begin{pmatrix}1 & 0\\0 & 0\end{pmatrix}
+\begin{pmatrix}z_{1t}\\z_{2t}\end{pmatrix}
++
+\begin{pmatrix}\sigma_1 & 0\\0 & \sigma_2\end{pmatrix}
+\begin{pmatrix}w_{1,t+1}\\w_{2,t+1}\end{pmatrix}
+$$ (eq:twofactor)
+
+Here $z_{1t}$ is a **permanent** component of $y_t$ and $z_{2t}$ is a **purely transitory**
+component.
+
+**Example 1 (full information).** The consumer observes the state $z_t$ at time $t$, so he
+can reconstruct $w_{t+1}$ from $z_{t+1}$ and $z_t$.
+
+Applying {eq}`eq:sprob16`:
+
+$$
+c_{t+1} - c_t = \sigma_1 w_{1,t+1} + (1-\beta)\,\sigma_2\, w_{2,t+1}
+$$ (eq:consexample1)
+
+A unit increment to the permanent component $z_{1t}$ raises consumption **one-for-one** permanently
+and causes **zero net saving**.
+
+A unit increment to the purely transitory component raises
+consumption by only the fraction $(1-\beta)$ permanently, while the remaining fraction $\beta$ is
+saved.
+
+From {eq}`eq:debt_evolution`:
+
+$$
+b_{t+1} - b_t = -z_{2t} = -\sigma_2 w_{2t}
+$$ (eq:consexample1a)
+
+confirming that none of the permanent shock is saved, while all of the transitory shock is saved.
+
+**Example 2 (partial information / Muth model).** The consumer observes $y_t$ and its history,
+but not $z_{1t}$ and $z_{2t}$ separately.
+
+The appropriate approach uses an **innovations
+representation** derived by the Kalman filter.
+
+At the Kalman filter steady state, the **Kalman gain** $K \in [0,1]$ satisfies
+
+$$
+K = \frac{\Sigma}{\Sigma + \sigma_2^2}, \qquad \Sigma = \frac{\sigma_1^2 + \sqrt{\sigma_1^4 + 4\sigma_1^2\sigma_2^2}}{2}
+$$ (eq:kalmangain)
+
+where $K$ increases with the ratio $\sigma_1^2/\sigma_2^2$ (the variance of the permanent shock
+relative to the transitory shock).
+
+The innovations representation expresses the endowment as an ARMA(1,1) in
+its own innovation $a_t = y_t - \mathbb{E}[y_t \mid y^{t-1}]$ (the one-step-ahead forecast error):
+
+$$
+y_{t+1} = y_t - (1-K)\,a_t + a_{t+1}
+$$ (eq:muth_innov)
+
+Here the coefficient $-(1-K)$ on the lagged innovation reflects that only the fraction
+$K$ of last period's surprise was treated as permanent; the remainder mean-reverts.
+
+The scalar
+$a_t$ is i.i.d. with variance $\Sigma + \sigma_2^2$.
+Applying {eq}`eq:sprob16` to this innovation representation:
+
+$$
+c_{t+1} - c_t = [1 - \beta(1-K)]\, a_{t+1}
+$$ (eq:consexample2)
+
+The consumer regards a fraction $K$ of the innovation $a_{t+1}$ as permanent and fraction $1-K$
+as transitory.
+
+He permanently increments consumption by $K + (1-\beta)(1-K) = 1 - \beta(1-K)$
+of $a_{t+1}$ and saves the remaining fraction $\beta(1-K)$.
+
+The first difference of income obeys a first-order moving average:
+
+$$
+y_{t+1} - y_t = a_{t+1} - (1-K)\,a_t
+$$ (eq:incomemaar)
+
+while the first difference of consumption is i.i.d.
+
+### Numerical illustration of the two examples
+
+```{code-cell} ipython3
+# Parameters
+β = 0.95       # discount factor (so R = 1/β)
+σ1 = 0.15      # std of permanent shock
+σ2 = 0.30      # std of transitory shock
+
+# -- Example 1: full information ----------------------------------------------
+A_check = np.array([[1.0, 0.0],
+                    [0.0, 0.0]])
+C_check = np.array([[σ1, 0.0],
+                    [0.0, σ2]])
+G_check = np.array([[1.0, 1.0]])
+
+# Key matrix M = G(I - βA)^{-1}
+IbA = np.eye(2) - β * A_check
+M   = G_check @ inv(IbA)   # shape (1, 2)
+
+# Impulse response of consumption (permanent to unit permanent / transitory shock)
+h = (1 - β) * M @ C_check          # shape (1, 2)
+irf_perm_ex1  = h[0, 0] / σ1       # response per unit std of permanent shock
+irf_trans_ex1 = h[0, 1] / σ2       # response per unit std of transitory shock
+
+print("Example 1 (full information)")
+print(f"  IRF c to permanent shock  (normalised): {irf_perm_ex1:.4f}   (theory: 1.0)")
+print(f"  IRF c to transitory shock (normalised): {irf_trans_ex1:.4f}   "
+      f"(theory: {1-β:.4f})")
+```
+
+```{code-cell} ipython3
+# -- Example 2: partial information / Muth model -------------------------------
+Σ = (σ1**2 + np.sqrt(σ1**4 + 4 * σ1**2 * σ2**2)) / 2
+K = Σ / (Σ + σ2**2)
+
+print("Example 2 (partial information)")
+print(f"  Steady-state Kalman gain K = {K:.4f}")
+print(f"  IRF c to unit innovation a_{{t+1}}: {1 - β*(1-K):.4f}")
+print(f"  Fraction of innovation treated as permanent (K): {K:.4f}")
+print(f"  Fraction saved: β(1-K) = {β*(1-K):.4f}")
+```
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Consumption impulse responses in the two examples
+    name: fig-lqcs-irf-examples
+---
+# -- Compare impulse responses across examples ---------------------------------
+T = 30
+irf_c_ex1_perm  = np.ones(T) * irf_perm_ex1 * σ1
+irf_c_ex1_trans = np.ones(T) * irf_trans_ex1 * σ2
+
+irf_c_ex2 = np.ones(T) * (1 - β * (1 - K))   # per unit innovation a_t
+
+fig, axes = plt.subplots(1, 2, figsize=(12, 4))
+
+axes[0].axhline(0, color='k', linewidth=0.8)
+axes[0].step(range(T), irf_c_ex1_perm, where='post',
+             label='permanent shock ($z_1$)', color='steelblue')
+axes[0].step(range(T), irf_c_ex1_trans, where='post',
+             label='transitory shock ($z_2$)', color='tomato', linestyle='--')
+axes[0].set_title('Example 1: IRF of $c$ (full information)')
+axes[0].set_xlabel('periods after shock')
+axes[0].set_ylabel('response of $c$')
+axes[0].legend()
+axes[0].grid(alpha=0.3)
+
+axes[1].axhline(0, color='k', linewidth=0.8)
+axes[1].step(range(T), irf_c_ex2, where='post',
+             label=f'unit innovation $a_{{t+1}}$ (K = {K:.2f})',
+             color='purple')
+axes[1].set_title('Example 2: IRF of $c$ (partial information)')
+axes[1].set_xlabel('periods after shock')
+axes[1].set_ylabel('response of $c$')
+axes[1].legend()
+axes[1].grid(alpha=0.3)
+
+plt.tight_layout()
+plt.show()
+```
+
+```{note}
+The impulse responses have the "box" shape characteristic of the LQ permanent income model: once
+a shock occurs, consumption shifts permanently to a new level and stays there.
+```
+
+### Spreading consumption cross sections
+
+```{index} single: Cross-Section Distributions; consumption
+```
+
+The unit root in consumption (Representation 2) causes the **cross-section variance** of
+consumption to grow linearly with age.
+
+Consider a continuum of *ex ante* identical households born at $t = 0$.
+
+Each household $i$ has
+the same preferences and the same stochastic income process, but faces **idiosyncratic** shocks
+$w_{t+1}^i$.
+
+Let all households start from the same initial conditions $c_0^i = c_0$ and $z_0^i$.
+
+From {eq}`eq:sprob16`, household $i$'s consumption follows
+
+$$
+c_{t+1}^i = c_t^i + h\, w_{t+1}^i, \qquad h = (1-\beta)\,\check{G}(I-\beta\check{A})^{-1}\check{C}
+$$
+
+Since the $w^i_{t+1}$ are independent across agents,
+
+$$
+\mathbb{E}_0\bigl(c_t^i - c_0^i\bigr)^2 = t\, h h'
+$$ (eq:varspread)
+
+In the two-factor model, $h$ is a $1 \times 2$ row vector so $hh'$ is a positive scalar equal to
+$\sigma_1^2 + (1-\beta)^2\sigma_2^2$.
+
+The cross-section variance of consumption grows like $t$.
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Spreading consumption cross sections
+    name: fig-lqcs-spread
+---
+# Simulate cross-section spreading
+rng = np.random.default_rng(42)
+N      = 5000       # number of agents
+T_sim  = 80         # number of periods
+
+h_vec  = (1 - β) * (M @ C_check)   # shape (1, 2), then flatten
+h_vec  = h_vec.flatten()            # h = [h1, h2]
+
+c = np.zeros((N, T_sim + 1))       # consumption paths
+# initialise all agents at c_0 = 0 (demeaned)
+for t in range(T_sim):
+    eps = rng.standard_normal((N, 2))    # N draws of 2D shock
+    dc  = eps @ h_vec              # shape (N,)
+    c[:, t+1] = c[:, t] + dc
+
+# Cross-section variance at each date
+var_c  = np.var(c, axis=0)
+theory = np.arange(T_sim + 1) * np.dot(h_vec, h_vec)
+
+fig, ax = plt.subplots()
+ax.plot(var_c,  label='simulated cross-section variance', linewidth=2)
+ax.plot(theory, label=r'theoretical: $t \cdot h h^\prime$',
+        linestyle='--', color='tomato')
+ax.set_xlabel('period $t$')
+ax.set_ylabel('cross-section variance of $c$')
+ax.legend()
+ax.grid(alpha=0.3)
+plt.show()
+```
+
+---
+
+### A "borrowers and lenders" closed economy (Bewley model)
+
+```{index} single: Bewley Model
+```
+
+Up to now we have set $R = \beta^{-1}$ and taken it as determined outside the model ("small open
+economy").
+
+Following ideas of {cite}`Bewley1977`, we can construct a **closed economy** in which
+$R = \beta^{-1}$ is an **equilibrium outcome**.
+
+**Environment.** A continuum of measure one of consumers, indexed by $i \in [0,1]$, trade a
+risk-free one-period bond with price $\beta$.
+
+All consumers have the same preferences and the
+same stochastic income process, but face **idiosyncratic** income shocks.
+
+Initial bond positions
+are zero: $b_0^i = 0$ for all $i$.
+
+Initial endowment states $z_0^i$ are independent draws from
+the stationary distribution of {eq}`eq:sprob15`.
+
+**Individual decisions.** From {eq}`eq:lccf`, with $b_0^i = 0$, agent $i$'s time-0 consumption
+is
+
+$$
+c_0^i = (1-\beta)\,\check{G}(I-\beta\check{A})^{-1} z_0^i
+$$ (eq:c_null)
+
+For $t \geq 1$, from {eq}`eq:sprob16`:
+
+$$
+c_{t+1}^i = c_t^i + h\, w_{t+1}^i, \qquad h = (1-\beta)\,\check{G}(I-\beta\check{A})^{-1}\check{C}
+$$ (eq:c_future)
+
+**Market clearing.** Let $Y$ denote the stationary mean of the cross-section average of
+non-financial income.  Integrating {eq}`eq:c_null` over all agents:
+
+$$
+\int_0^1 c_0^i\, di = (1-\beta)\sum_{j=0}^{\infty}\beta^j \mathbb{E}_0\!\int_0^1 y_j^i\, di = Y
+$$ (eq:c_marketclear_0)
+
+because the continuum of idiosyncratic shocks averages out.
+
+For future periods, integrating
+{eq}`eq:c_future`:
+
+$$
+\int_0^1 c_{t+1}^i\, di = \int_0^1 c_t^i\, di + h\!\underbrace{\int_0^1 w_{t+1}^i\, di}_{=\,0} = Y
+$$
+
+The goods market clears at every date at **constant** aggregate consumption equal to $Y$.
+
+The
+bond market clears at zero net supply each period.  Thus $R = \beta^{-1}$ is an equilibrium
+outcome: we have constructed a Bewley model.
+
+**Cross-section inequality.** While the cross-section mean of consumption is constant, the
+cross-section **variance** grows without bound according to {eq}`eq:varspread`.  Initial
+differences in endowment draws $z_0^i$ create permanent differences in consumption levels, and
+idiosyncratic shocks create ongoing divergence.
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Bewley economy cross-section moments
+    name: fig-lqcs-bewley
+---
+# Verify Bewley market clearing via simulation
+# We use an online (Welford) accumulator for mean and variance so that we never
+# store the full NxT consumption array in memory.
+rng = np.random.default_rng(0)
+N_bew  = 10000    # agents - enough to illustrate the law of large numbers
+T_bew  = 60
+
+# Draw initial z^i_0; z1 is a random walk so its stationary distribution is
+# degenerate - we draw from a distribution with std 1 for illustration.
+z0_i   = rng.standard_normal((N_bew, 2)) * np.array([1.0, σ2])
+c0_i   = ((1 - β) * (M @ z0_i.T)).flatten()   # shape (N_bew,)
+
+# Online accumulation: propagate c_t across agents without storing all paths.
+mean_c = np.zeros(T_bew + 1)
+var_c2 = np.zeros(T_bew + 1)
+mean_c[0] = c0_i.mean()
+var_c2[0] = c0_i.var()
+
+c_now = c0_i.copy()
+for t in range(T_bew):
+    eps    = rng.standard_normal((N_bew, 2))
+    c_now  = c_now + eps @ h_vec
+    mean_c[t + 1] = c_now.mean()
+    var_c2[t + 1] = c_now.var()
+
+# Keep c0_i available for the complete-markets figure below
+c_bew_t0 = c0_i
+
+fig, axes = plt.subplots(1, 2, figsize=(12, 4))
+
+axes[0].plot(mean_c, linewidth=2, color='steelblue')
+axes[0].axhline(mean_c[0], linestyle='--', color='tomato', label='initial mean')
+axes[0].set_title('Cross-section mean of $c$ (Bewley economy)')
+axes[0].set_xlabel('period $t$')
+axes[0].set_ylabel('mean consumption')
+axes[0].legend()
+axes[0].grid(alpha=0.3)
+
+axes[1].plot(var_c2, linewidth=2, color='steelblue', label='simulated variance')
+axes[1].set_title('Cross-section variance of $c$ (Bewley economy)')
+axes[1].set_xlabel('period $t$')
+axes[1].set_ylabel('variance of consumption')
+axes[1].legend()
+axes[1].grid(alpha=0.3)
+
+plt.tight_layout()
+plt.show()
+```
+
+**An inefficiency.** Because each consumer dislikes variation of consumption over time, each
+consumer would prefer a completely smoothed stream $c_t^i = c_0^i$ for all $t$.  Such an
+allocation is feasible (the cross-section average of income is constant), and it is **Pareto
+superior** to the incomplete-markets equilibrium.  The next section describes the complete-markets
+allocation that achieves this.
+
+---
+
+### Consumption smoothing with complete markets
+
+```{index} single: Complete Markets; Arrow securities
+```
+
+We replace the single bond with a **complete set of Arrow securities**.  The budget constraint
+becomes
+
+$$
+c_t + b_{t-1}(z_t) = \int q(z_{t+1}|z_t)\, b_t(z_{t+1})\, dz_{t+1} + y_t
+$$ (eq:CMbudget)
+
+where $q(z_{t+1}|z_t)$ is the pricing kernel for one-period state-contingent claims and
+$b_t(z_{t+1})$ is the household's portfolio of Arrow securities chosen at $t$.
+
+**Equilibrium pricing kernel.** We conjecture (and verify) that the equilibrium pricing kernel is
+
+$$
+q(z_{t+1}|z_t) = \beta\,\phi(z_{t+1}|z_t)
+$$ (eq:kernel)
+
+where $\phi(z_{t+1}|z_t)$ is the transition density of $z$.  This kernel prices a one-period
+risk-free bond at $\beta$, so $R = \beta^{-1}$, consistent with the incomplete-markets
+equilibrium.
+
+**Constant consumption conjecture.** We conjecture that the equilibrium delivers each consumer
+$i$ a **constant** consumption level:
+
+$$
+c_t^i = \bar{c}^i = c_0^i, \quad \forall\, t \geq 0
+$$ (eq:constcons)
+
+where $c_0^i = (1-\beta)\,\check{G}(I-\beta\check{A})^{-1} z_0^i$ is the consumer's time-0
+consumption in the incomplete-markets economy.
+
+**Supporting portfolio.** The state-contingent debt that supports constant consumption is
+
+$$
+b_{t-1}(z_t) = \check{G}(I-\beta\check{A})^{-1} z_t - \frac{1}{1-\beta}\,\bar{c}^i \;\equiv\; b(z_t, \bar{c}^i)
+$$ (eq:cmdebt)
+
+Note that indebtedness depends only on the current Markov state $z_t$, **not** on the history of
+earlier states.  This absence of history dependence reflects the **complete risk sharing** attained
+under complete markets.
+
+Substituting the pricing kernel {eq}`eq:kernel` and the portfolio conjecture {eq}`eq:cmdebt` into
+the budget constraint {eq}`eq:CMbudget` and using the law of iterated expectations confirms that
+the budget constraint simplifies to $c_t = \bar{c}^i$ in every state and period.
+
+**Cross-section implications.** Under complete markets:
+
+- The cross-section distribution of consumption is **time-invariant**.
+- Consumer $i$'s rank in the consumption distribution is fixed forever.
+- A lucky initial draw $z_0^i$ manifests itself as perpetually high consumption $\bar{c}^i$ and
+  lower indebtedness $b(z_t^i, \bar{c}^i)$ across all future states.
+
+This contrasts sharply with the incomplete-markets Bewley economy, where the cross-section variance
+of consumption grows without bound.
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Complete vs incomplete markets consumption distributions
+    name: fig-lqcs-markets
+---
+# Illustrate: complete vs incomplete markets consumption distributions over time
+rng = np.random.default_rng(1)
+N_cm = 5000
+T_cm = 50
+
+# Initial consumption draws (same as Bewley economy)
+c0_cm = c_bew_t0[:N_cm]   # initial consumption draws from the Bewley cell above
+
+# Incomplete markets: consumption evolves (random walk)
+c_inc = np.zeros((N_cm, T_cm + 1))
+c_inc[:, 0] = c0_cm
+for t in range(T_cm):
+    eps = rng.standard_normal((N_cm, 2))
+    c_inc[:, t+1] = c_inc[:, t] + eps @ h_vec
+
+# Complete markets: consumption stays constant
+c_comp = np.tile(c0_cm[:, np.newaxis], T_cm + 1)
+
+fig, axes = plt.subplots(1, 2, figsize=(12, 4))
+
+for t_plot, color in zip([0, 10, 30, 50], ['steelblue', 'orange', 'tomato', 'purple']):
+    axes[0].hist(c_inc[:, t_plot], bins=60, alpha=0.4,
+                 label=f't = {t_plot}', color=color, density=True)
+axes[0].set_title('Incomplete markets:\ncross-section distribution of $c$')
+axes[0].set_xlabel('$c$')
+axes[0].legend(fontsize=9)
+axes[0].grid(alpha=0.3)
+
+for t_plot, color in zip([0, 10, 30, 50], ['steelblue', 'orange', 'tomato', 'purple']):
+    axes[1].hist(c_comp[:, t_plot], bins=60, alpha=0.4,
+                 label=f't = {t_plot}', color=color, density=True)
+axes[1].set_title('Complete markets:\ncross-section distribution of $c$')
+axes[1].set_xlabel('$c$')
+axes[1].legend(fontsize=9)
+axes[1].grid(alpha=0.3)
+
+plt.tight_layout()
+plt.show()
+```
+
+```{note}
+Under **complete markets** the histogram stays the same across all $t$ (distributions overlap
+perfectly), while under **incomplete markets** the distribution spreads out over time.
+```
+
+---
+
+**Connecting Part A to Part B.** Part A derived optimal consumption rules for a consumer who
+fully trusts his stochastic income model.  Part B relaxes that assumption: the consumer fears that
+the model is misspecified and therefore seeks decision rules that are robust to plausible
+alternatives.  We will see that the optimal robust rule takes the same form as the Part A rule,
+but under a distorted model of the income process that looks more persistent than the approximating
+one.
+
+---
+
+## Part B: a robust permanent income model
+
+```{index} single: Robust Control; permanent income
+```
+
+```{index} single: Precautionary Savings; robustness
+```
+
+### Introduction
+
+Part B studies a consumer who **distrusts** his specification of the stochastic process governing
+his labor income.  The model is due to Hansen, Sargent, and Tallarini (1999) (HST) {cite}`HST_1999`, who estimated
+it on US quarterly consumption and investment data.
+
+A consumer who fears model misspecification engages in a form of **precautionary savings** that is
+distinct from the usual precautionary motive (which requires a convex marginal utility).
+
+Here, the
+precautionary motive arises because the consumer wants to protect against misspecification of the
+**conditional means** of income shocks, and it operates even with quadratic preferences.
+
+HST showed an important **observational equivalence** result: for quantities $(c_t, i_t)$ alone,
+a concern for robustness is indistinguishable from an increase in impatience (a decrease in
+$\beta$).
+
+We develop this result carefully below.
+
+```{index} single: Observational Equivalence; robustness and discounting
+```
+
+**Outline of Part B:**
+
+1. State the HST model (planner's Bellman equation with robustness).
+2. Solve the $\sigma = 0$ benchmark.
+3. Prove observational equivalence (Theorem 1 and Theorem 2).
+4. Interpret the result using distorted expectations from a Stackelberg multiplier game.
+5. Illustrate frequency-domain and detection-error-probability aspects.
+6. Evaluate the robustness of alternative decision rules.
+
+---
+
+### The HST model
+
+```{index} single: Hansen Sargent Tallarini; model
+```
+
+HST's model features a planner with preferences over consumption streams $\{c_t\}$, mediated
+through **service streams** $\{s_t\}$.  Let $b$ be a preference shifter (utility bliss point).
+
+The **Bellman equation for the robust planner** is
+
+$$
+-x' P x - p =
+\sup_c \inf_w \Bigl\{-(s-b)^2 + \beta\bigl(\theta w^{*\prime} w^* - \mathbb{E}\,x^{*\prime} P x^* - p\bigr)\Bigr\}
+$$ (eq:income1)
+
+subject to the household technology, capital accumulation, endowment dynamics, and the state law:
+
+$$
+\begin{aligned}
+s &= (1+\lambda)c - \lambda h \\
+h^* &= \delta_h h + (1-\delta_h) c \\
+k^* &= \delta_k k + i \\
+c + i &= \gamma k + d \\
+\begin{pmatrix}d\\b\end{pmatrix} &= U z \\
+z^* &= A_{22} z + C_2(\epsilon^* + w^*)
+\end{aligned}
+$$ (eq:income1a)
+
+Here $^*$ denotes the next-period value; $c$ is consumption; $s$ is the scalar service measure;
+$h$ is a habit stock; $k$ is the capital stock; $i$ is investment; $d$ is an endowment/technology
+shock; $b$ is a **preference shock** (bliss-point shifter, distinct from the bond/debt variable
+$b_t$ used in Part A); $\epsilon^* \sim \mathcal{N}(0,I)$ is the baseline shock; and
+$w^*$ is a **distortion** to the conditional mean of $\epsilon^*$ chosen by an evil agent.
+
+The penalty parameter $\theta > 0$ governs the consumer's concern about robustness.  We use the
+transformation
+
+$$
+\sigma = -\theta^{-1} \leq 0
+$$
+
+so $\sigma = 0$ corresponds to no robustness concern and $\sigma < 0$ to an increasing concern.
+
+**Household technology.** When $\lambda > 0$ and $\delta_h \in (0,1)$, the technology
+{eq}`eq:income1a` accommodates **habit persistence** (positive $\lambda$) or durability.  The stock
+$h_t$ is a geometric weighted average of current and past consumption.
+
+**Production technology.** Equation $c_t + k_t = Rk_{t-1} + d_t$ with
+$R = \delta_k + \gamma$ combines capital accumulation with a linear production technology.  $R$ is
+the physical gross return on capital.
+
+**State vector.** Let $x_t' = [h_{t-1},\, k_{t-1},\, z_t']$.  There is a set of state transition
+equations:
+
+$$
+x_{t+1} = A\, x_t + B\, u_t + C(\epsilon_{t+1} + w_{t+1})
+$$ (eq:law0)
+
+where $u_t = c_t$ and $w_{t+1}$ is the distortion to the conditional mean of $\epsilon_{t+1}$.
+
+**HST's calibration.** HST estimated the model on U.S. quarterly data (1970Q1-1996Q3) using
+nondurables plus services for consumption and durable consumption plus gross private investment for
+investment.  Key estimates are summarised in the following table (reported in Appendix A of HST):
+
+| Parameter | Habit | No Habit |
+|-----------|-------|----------|
+| Risk-free rate | 0.025 | 0.025 |
+| $\beta$ | 0.997 | 0.997 |
+| $\delta_h$ | 0.682 | — |
+| $\lambda$ | 2.443 | 0 |
+| $\alpha_1$ | 0.813 | 0.900 |
+| $\alpha_2$ | 0.189 | 0.241 |
+| $\phi_1$ | 0.998 | 0.995 |
+| $\phi_2$ | 0.704 | 0.450 |
+| $2 \times \log L$ | 779.05 | 762.55 |
+
+HST imposed $\beta R = 1$ and $\delta_k = 0.975$, so $\gamma$ is pinned down once $\beta$ is
+estimated.  An annual real interest rate of 2.5% corresponds to $\beta = 0.997$.
+
+---
+
+### Solution when $\sigma = 0$
+
+When $\sigma = 0$ the objective reduces to
+
+$$
+\mathbb{E}_0\sum_{t=0}^{\infty}\beta^t\bigl\{-(s_t - b_t)^2\bigr\}
+$$ (eq:income5)
+
+Formulating a Lagrangian and deriving first-order conditions yields:
+
+$$
+\begin{aligned}
+\mu_{st} &= b_t - s_t \\
+\mu_{ct} &= (1+\lambda)\mu_{st} + (1-\delta_h)\mu_{ht} \\
+\mu_{ht} &= \beta \mathbb{E}_t[\delta_h \mu_{h,t+1} - \lambda \mu_{s,t+1}] \\
+\mu_{ct} &= \beta R\, \mathbb{E}_t\mu_{c,t+1}
+\end{aligned}
+$$ (eq:foc)
+
+Here $\mu_{st}$ is the **marginal valuation of consumption services**, which summarises the
+endogenous state variables $h_{t-1}$ and $k_{t-1}$.  Equation {eq}`eq:foc` (last line) implies
+$\mathbb{E}_t\mu_{c,t+1} = (\beta R)^{-1}\mu_{ct}$, so $\mu_{st}$ satisfies a **martingale representation**
+when $\beta R = 1$:
+
+$$
+\mu_{st} = \mu_{s,t-1} + \nu'\epsilon_t
+$$ (eq:martingale)
+
+for some vector $\nu$.  Solving forward and substituting gives
+
+$$
+\mu_{st} = \Psi_1 k_{t-1} + \Psi_2 h_{t-1} + \Psi_3\sum_{j=0}^{\infty} R^{-j} \mathbb{E}_t b_{t+j}
+            + \Psi_4\sum_{j=0}^{\infty} R^{-j} \mathbb{E}_t d_{t+j}
+$$ (eq:income10)
+
+where
+
+$$
+\Psi_1 = -(1+\lambda)R(1-R^{-2}\beta^{-1})\!\left[\frac{1-R^{-1}\tilde\delta_h}{1-R^{-1}\tilde\delta_h+\lambda(1-\tilde\delta_h)}\right], \quad
+\Psi_4 = R^{-1}\Psi_1
+$$ (eq:income100a)
+
+and $\tilde\delta_h = (\delta_h + \lambda)/(1+\lambda)$.
+
+In the widely-studied special case $\lambda = \delta_h = 0$, so $s_t = c_t$ and
+$\mu_{st} = b_t - c_t$, the marginal propensity to consume out of **non-human wealth** $Rk_{t-1}$
+equals that out of **human wealth** $\sum_{j=0}^{\infty}R^{-j}\mathbb{E}_t d_{t+j}$, a well-known feature of
+the LQ model.
+
+**Representing $\nu$ in matrix form.** The formula for $\mu_{st}$ can be written as
+$\mu_{st} = M_s x_t$ where $x_t$ follows {eq}`eq:law0`.  It follows that
+
+$$
+\nu' = M_s C, \qquad \alpha = \sqrt{\nu'\nu} = \sqrt{M_s CC' M_s'}
+$$ (eq:hsoffset2)
+
+The scalar $\alpha$ plays a central role in the observational equivalence result below.
+
+---
+
+### Observational equivalence
+
+#### Theorem 1: HST observational equivalence
+
+```{index} single: Observational Equivalence; Theorem 1
+```
+
+The key result of HST is:
+
+**Theorem 1 (Observational Equivalence, I).** *Fix all parameters except $(\sigma, \beta)$.
+Suppose $\beta R = 1$ when $\sigma = 0$.  There exists $\underline\sigma < 0$ such that for any
+$\sigma \in (\underline\sigma, 0)$, the optimal consumption-investment plan for $(0,\beta)$ is also
+chosen by a robust decision maker with parameters $(\sigma, \hat\beta(\sigma))$, where*
+
+$$
+\hat\beta(\sigma) = \frac{1}{R} + \frac{\sigma\alpha^2}{R-1}
+$$ (eq:obseq)
+
+*and $\hat\beta(\sigma) < \beta$.*
+
+**Intuition.** Since $R > 1$ and $\alpha^2 > 0$, a more negative $\sigma$ (stronger robustness
+concern) lowers $\hat\beta$.  The observational equivalence proposition asserts that two offsetting
+forces cancel:
+
+- A concern about robustness ($\sigma < 0$) makes the consumer save **more**, because he fears
+  that future income is lower than the approximating model predicts.
+- Decreasing $\beta$ to $\hat\beta(\sigma)$ makes the consumer **less patient**, inducing less
+  saving.
+
+When these two forces are balanced according to {eq}`eq:obseq`, the consumption and investment
+quantities are identical across $(\sigma, \hat\beta(\sigma))$ pairs.
+
+#### Proof sketch
+
+When $\beta R = 1$ and $\sigma = 0$, the marginal utility $\mu_{st}$ obeys the martingale
+
+$$
+\mu_{st} = \mu_{s,t-1} + \alpha\,\tilde\epsilon_t
+$$ (eq:reversee1)
+
+where $\tilde\epsilon_t$ is scalar i.i.d.\ with mean zero and unit variance.  Activating robustness
+($\sigma < 0$) means the **evil agent** solves
+
+$$
+\tilde w_t = K(\sigma,\hat\beta)\,\mu_{s,t-1}
+$$
+
+making the worst-case model for $\mu_{st}$:
+
+$$
+\mu_{st} = (1 + \alpha\,K(\sigma,\hat\beta))\,\mu_{s,t-1} + \alpha\,\tilde\epsilon_t
+$$ (eq:reversee3)
+
+For the allocation to remain the same, we require the robust Euler equation
+$\hat\beta R\,\hat{\mathbb{E}}_t\mu_{s,t+1} = \mu_{st}$ to hold under the worst-case model, which gives
+
+$$
+(\hat\beta R)^{-1} = 1 + \alpha\, K(\sigma,\hat\beta)
+$$ (eq:eulerdist)
+
+The evil agent's Bellman equation (a pure forecasting problem) yields
+
+$$
+\hat\zeta(\hat\beta) \equiv 1 + \alpha K(\sigma,\hat\beta) = \frac{1}{1 - \sigma\alpha^2 P(\hat\beta)}
+$$ (eq:distort2)
+
+where $P(\hat\beta)$ solves the scalar Bellman equation:
+
+$$
+-P(\hat\beta) = \frac{\hat\beta - 1 + \sigma\alpha^2 + \sqrt{(\hat\beta-1+\sigma\alpha^2)^2 + 4\sigma\alpha^2}}{-2\sigma\alpha^2}
+$$ (eq:distortcons)
+
+Solving {eq}`eq:eulerdist`-{eq}`eq:distortcons` for $\hat\beta$ gives exactly {eq}`eq:obseq`.
+$\square$
+
+---
+
+### Observational equivalence: numerical illustration
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Observational equivalence locus
+    name: fig-lqcs-oe-locus
+---
+# Observational equivalence locus: β_hat(σ) = 1/R + σα^2 / (R-1)
+# HST benchmark values
+β0    = 0.997          # benchmark discount factor
+R0    = 1.0 / β0       # gross interest rate
+α2_vals = [0.005, 0.015, 0.030]   # three values of α^2 for illustration
+
+σ_grid = np.linspace(-0.002, 0.0, 400)
+
+fig, ax = plt.subplots()
+for α2 in α2_vals:
+    β_hat = 1/R0 + σ_grid * α2 / (R0 - 1)
+    ax.plot(σ_grid, β_hat, label=rf'$\alpha^2 = {α2}$')
+
+ax.axhline(β0, linestyle=':', color='grey', linewidth=1, label=r'$\beta_0 = 0.997$')
+ax.axvline(0,  linestyle=':', color='grey', linewidth=1)
+ax.set_xlabel(r'$\sigma = -\theta^{-1}$ (robustness parameter)')
+ax.set_ylabel(r'$\hat\beta(\sigma)$')
+ax.legend()
+ax.grid(alpha=0.3)
+plt.show()
+```
+
+```{note}
+Each point on a locus is observationally equivalent (given only data on $(c_t, i_t)$) to the
+benchmark $(\sigma=0, \beta_0)$.  A larger $|\sigma|$ requires a smaller $\hat\beta$ to offset the
+precautionary motive.  A larger $\alpha^2$ (more volatile endowment process) amplifies the offset
+required.
+```
+
+---
+
+### Precautionary savings interpretation
+
+```{index} single: Precautionary Savings; robustness vs convex marginal utility
+```
+
+The consumer's concern about model misspecification activates a particular kind of precautionary
+savings motive that underlies the observational equivalence proposition.
+
+**How it works.** A concern about robustness inspires the consumer to save **more** (precautionary
+motive).  Decreasing $\beta$ induces the consumer to save **less** (impatience).  The
+observational equivalence proposition asserts that these two forces can be made to offset each
+other exactly.
+
+**The special case $\lambda = \delta_h = 0$.** Here $s_t = c_t$ and the consumption rule is
+
+$$
+c_t = (1 - R^{-2}\beta^{-1})\!\left[Rk_{t-1} + \mathbb{E}_t\sum_{j=0}^{\infty}R^{-j}d_{t+j}\right]
+      + \left(\frac{(R\beta)^{-1}-1}{R-1}\right)\!b
+$$ (eq:consfunction)
+
+The **marginal propensity to consume** out of non-human wealth $Rk_{t-1}$ **equals** that out of
+human wealth $\mathbb{E}_t\sum R^{-j}d_{t+j}$.  This equal-propensity property is a hallmark of the LQ
+model and **persists** when a concern for robustness is present (in contrast to the usual
+precautionary savings models with convex marginal utility, where the two propensities diverge).
+
+**Theorem 1 offset.** Theorem 1 says that with $\sigma < 0$, the observationally equivalent
+$\hat\beta$ satisfies $\hat\beta < \beta$.  If the starting point has $\beta R = 1$, then
+$\hat\beta R < 1$.  For a non-robust consumer with discount factor $\hat\beta$ at the same
+interest rate, the Euler equation implies $\mathbb{E}_t c_{t+1} < c_t$: expected consumption
+declines over time.
+
+This downward drift is the impatience offset in Theorem 1.  It cancels the robust consumer's
+precautionary-savings motive, leaving the consumption and investment quantities unchanged.
+The upward-drift comparison appears below in Theorem 2, which asks the reverse observational
+equivalence question.
+
+**Comparison with classical precautionary savings.** The classical precautionary motive (see Leland 1968 and Miller 1974) arises because:
+
+$$
+u'''(c) > 0 \;\Rightarrow\; \mathbb{E}_t u'(c_{t+1}) > u'(\mathbb{E}_t c_{t+1}) \;\Rightarrow\; \mathbb{E}_t c_{t+1} > c_t
+$$
+
+This channel requires **convexity of marginal utility** and is absent with quadratic preferences.
+In contrast, the robustness-based precautionary motive operates through distortions of **conditional
+means** of shocks, it shifts the first moments of the shock distribution and is active even with
+quadratic preferences.
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Expected consumption profile under Theorem 1 discount-factor offsets
+    name: fig-lqcs-drift
+---
+# Illustrate the Theorem 1 offset: lower β_hat at fixed R
+# In the special case λ = δ_h = 0:
+# With β0 R = 1, Theorem 1 gives β_hat < β0 and hence β_hat R < 1
+# Euler equation: E_t c_{t+1} = (β_hat R) c_t  -> downward drift
+
+β_vals  = [0.990, 0.993, 0.995]     # σ < 0 <-> β_hat < β0 in Theorem 1
+R_fixed = 1 / 0.997
+T_plot  = 40
+
+fig, ax = plt.subplots()
+t_grid = np.arange(T_plot + 1)
+c0     = 1.0
+for β_hat_i in β_vals:
+    drift = (β_hat_i * R_fixed) ** t_grid
+    label = rf'$\hat\beta = {β_hat_i}$, $\hat\beta R = {β_hat_i*R_fixed:.4f}$'
+    ax.plot(t_grid, c0 * drift, label=label)
+
+ax.axhline(c0, linestyle=':', color='grey', linewidth=1,
+           label=r'$\beta_0 R = 1$ (benchmark)')
+ax.set_xlabel('period $t$')
+ax.set_ylabel(r'$E_0 c_t / c_0$')
+ax.legend(fontsize=9)
+ax.grid(alpha=0.3)
+plt.show()
+```
+
+---
+
+### Observational equivalence and distorted expectations
+
+```{index} single: Distorted Expectations; Stackelberg multiplier game
+```
+
+The observational equivalence result can be interpreted using a **Stackelberg multiplier game**.
+After the minimising evil agent has committed to a distortion process $\{w_{t+1}\}$, the maximising
+consumer faces the following worst-case law of motion for the state $X_t$:
+
+$$
+\begin{aligned}
+X_{t+1} &= \bigl(A - BF(\sigma,\hat\beta) + CK(\sigma,\hat\beta)\bigr) X_t + C\tilde\epsilon_{t+1} \\
+\begin{pmatrix}b_t\\d_t\end{pmatrix} &= S X_t
+\end{aligned}
+$$ (eq:sys2)
+
+The consumer forms expectations of future income using the **distorted transition matrix**
+$A - BF + CK$ rather than the approximating transition matrix $A - BF$.
+
+The distorted expectations operator $\hat{\mathbb{E}}_t$ satisfies
+
+$$
+\hat{\mathbb{E}}_t X_{t+j} = (A - BF(\sigma,\hat\beta) + CK(\sigma,\hat\beta))^j X_t
+$$
+
+Observational equivalence requires that the modified human-wealth formula
+
+$$
+\hat\Psi_4 \sum_{j=0}^{\infty} R^{-j}\hat{\mathbb{E}}_t d_{t+j}
+$$
+
+equals its benchmark counterpart $\Psi_4 \sum_{j=0}^{\infty} R^{-j} \mathbb{E}_t d_{t+j}$.  This is
+achieved by a mutual adjustment of the coefficients $\hat\Psi_j$ (via $\hat\beta$) and the
+distorted expectation operator $\hat{\mathbb{E}}_t$ (via $\sigma$).
+
+**Distorted endowment process.** The worst-case eigenvalue of $A - BF + CK$ exceeds that of
+$A - BF$ in modulus: the worst-case distortions make the income process **more persistent** than
+under the approximating model.  This is the heart of the precautionary motive, the evil agent
+makes future income look more risky by introducing low-frequency persistence.
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Worst-case model makes endowment more persistent
+    name: fig-lqcs-persistence
+---
+# Illustrate how worst-case model makes endowment process more persistent
+# We work with a simplified scalar AR(1) version for illustration:
+#   d_{t+1} = ρ d_t + σ_d ε_{t+1}      (approximating model)
+# The worst-case distortion increases the effective AR coefficient
+
+ρ_approx = 0.80   # AR coefficient under approximating model
+σ_d      = 0.10   # endowment shock std
+
+# Under worst-case model: effective AR coefficient = ρ + α*K(σ, β_hat)
+# Here we show how the worst-case AR coefficient varies with |σ|
+σ_rob_vals = [0.0, -0.0001, -0.0003, -0.0005]
+T_irf      = 30
+
+fig, ax = plt.subplots()
+for σ_rob in σ_rob_vals:
+    # α K ~= |σ| * P * α^2 / (1 - |σ| * α^2 * P) - simplified scalar version
+    # For illustration, use a linear approximation: Δρ ~= -σ * α^2 * const
+    α2_scal  = 0.01
+    Δρ       = -σ_rob * α2_scal * 2.0    # heuristic scaling for illustration
+    ρ_wc     = ρ_approx + Δρ
+    irf      = ρ_wc ** np.arange(T_irf)
+    if σ_rob == 0.0:
+        ax.plot(irf, linewidth=2.5, linestyle='-',
+                label=rf'approx. model ($\sigma = 0$, $\rho = {ρ_approx}$)')
+    else:
+        ax.plot(irf, linestyle='--',
+                label=rf'worst-case ($\sigma = {σ_rob}$, $\rho_{{wc}} \approx {ρ_wc:.4f}$)')
+
+ax.set_xlabel('periods after shock')
+ax.set_ylabel('impulse response of $d_t$')
+ax.legend(fontsize=9)
+ax.grid(alpha=0.3)
+plt.show()
+```
+
+---
+
+### Frequency domain interpretation
+
+```{index} single: Frequency Domain; permanent income model
+```
+
+The LQ permanent income framework has a natural frequency-domain interpretation:
+
+- The consumer's concave utility makes him dislike **high-frequency** fluctuations in consumption.
+  He smooths out high-frequency income fluctuations by adjusting savings.
+- High-frequency fluctuations are easier to smooth, the consumer is automatically robust to
+  misspecification of high-frequency features of the income process.
+- **Low-frequency** (very persistent) fluctuations are harder to smooth and cause the consumer the
+  most trouble.
+
+In the frequency-domain notation of HST, the transfer function from shocks $\epsilon_t$ to the
+target $s_t - b_t$ is $G(\zeta)$, and the frequency decomposition of the $H_2$ criterion is
+
+$$
+H_2 = -\frac{1}{2\pi}\int_{-\pi}^{\pi} \operatorname{trace}\!\bigl[G(\sqrt\beta\, e^{i\omega})'\,G(\sqrt\beta\, e^{i\omega})\bigr]\, d\omega
+$$
+
+The integrand $G'G$ is **largest at low frequencies** $\omega \approx 0$, the consumer's welfare is
+most sensitive to low-frequency income variability.
+
+Recognising this, the evil agent concentrates the worst-case distortions at low frequencies.  The
+distortion process has spectral density $W(\zeta)' W(\zeta)$ that is concentrated near $\omega = 0$.
+The variance of the worst-case shocks grows as $|\sigma|$ increases.
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Worst-case distortions concentrate at low frequencies
+    name: fig-lqcs-frequency
+---
+# Illustrate frequency concentration of worst-case shocks
+# We plot a stylised version of trace[W(ζ)'W(ζ)] vs frequency
+
+ω_grid = np.linspace(-np.pi, np.pi, 500)
+
+# Approximating model: flat spectrum (i.i.d. shocks)
+spectrum_approx = np.ones_like(ω_grid)
+
+# Worst-case shocks: power concentrated at low frequencies
+# Model as a low-pass spectrum W(ω) ~ 1/(ω^2 + κ^2)
+def low_pass(ω, κ, scale):
+    return scale / (ω**2 + κ**2)
+
+fig, axes = plt.subplots(1, 2, figsize=(12, 4))
+
+# Left panel: criterion G'G (low-frequency dominance)
+G_sq = 1.0 / (np.abs(np.exp(1j * ω_grid) - 0.95)**2 + 0.01)
+G_sq = G_sq / G_sq.max()
+axes[0].plot(ω_grid, G_sq, color='steelblue', linewidth=2)
+axes[0].set_title(r"Stylised $G(\zeta)'G(\zeta)$ vs frequency $\omega$")
+axes[0].set_xlabel(r'frequency $\omega$')
+axes[0].set_ylabel(r'$G(\zeta)^\prime G(\zeta)$')
+axes[0].set_xlim(-np.pi, np.pi)
+axes[0].grid(alpha=0.3)
+axes[0].text(0, 0.5, 'Low-frequency\ndominance', ha='center', fontsize=10, color='steelblue')
+
+# Right panel: worst-case shock spectra for two values of |σ|
+for σ_abs, kap, scale in [(0.0001, 0.02, 0.8), (0.00005, 0.02, 0.4)]:
+    W_sq = low_pass(ω_grid, kap, scale) + 0.01
+    W_sq = W_sq / W_sq.max() * σ_abs / 0.0001
+    axes[1].plot(ω_grid, W_sq,
+                 label=rf'$|\sigma| = {σ_abs}$')
+axes[1].set_title(r"Stylised $\operatorname{tr}[W(\zeta)'W(\zeta)]$ vs frequency")
+axes[1].set_xlabel(r'frequency $\omega$')
+axes[1].set_ylabel(r'$\operatorname{tr}[W^\prime W]$')
+axes[1].set_xlim(-np.pi, np.pi)
+axes[1].legend()
+axes[1].grid(alpha=0.3)
+
+plt.tight_layout()
+plt.show()
+```
+
+---
+
+### Detection error probabilities
+
+```{index} single: Detection Error Probabilities
+```
+
+A natural way to discipline the choice of $\sigma$ (or $\theta$) is to ask: **how difficult would
+it be to statistically distinguish the approximating model from the worst-case model?**
+
+For a sample of length $T$, one can use a **log-likelihood ratio test** to compare the two
+hypotheses.  The **detection error probability** (DEP) is the probability of making the wrong
+decision using the log-likelihood ratio statistic when one does not know which model generated the
+data.  Specifically:
+
+$$
+\text{DEP}(\sigma) = \frac{1}{2}\bigl[\mathbb{P}(\text{prefer approx.} \mid \text{worst-case is true})
+                                    + \mathbb{P}(\text{prefer worst-case} \mid \text{approx. is true})\bigr]
+$$
+
+When $\sigma = 0$ the two models are identical and DEP $= 0.5$.  As $|\sigma|$ increases the
+models diverge and the DEP falls toward zero.
+
+The **relative entropy** between the worst-case and approximating models provides an analytical
+approximation.  For the scalar version of the problem, the Kullback-Leibler divergence between the
+two Gaussian models over $T$ periods is approximately:
+
+$$
+\Delta(\sigma) \approx \frac{T}{2}\,\alpha^2\, K(\sigma,\hat\beta)^2 \cdot \text{tr}[\text{state covariance}]
+$$
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Detection error probabilities
+    name: fig-lqcs-dep
+---
+# Illustrate detection error probabilities (scalar stylised version)
+# We compute relative entropy approximation and convert to DEP
+
+T_sample = 107        # HST sample length (1970Q1 - 1996Q3)
+α2_val   = 0.01       # representative α^2 value
+R_dep    = 1.0 / 0.997
+
+# For each σ compute: K(σ,β_hat) from worst-case multiplier, then Δ, then DEP
+def compute_dep(σ_val, α2, R, T):
+    """
+    Compute detection error probability for a given σ.
+    Uses a simplified scalar approximation.
+    """
+    if σ_val == 0.0:
+        return 0.5
+    β_hat = 1.0 / R + σ_val * α2 / (R - 1)
+    if β_hat <= 0:
+        return 0.0
+
+    # Scalar Bellman equation for P(β_hat)
+    disc  = (β_hat - 1 + σ_val * α2)**2 + 4 * σ_val * α2
+    if disc < 0:
+        return np.nan
+    P_val = (β_hat - 1 + σ_val * α2 + np.sqrt(disc)) / (-2 * σ_val * α2)
+
+    # Worst-case gain K
+    K_val  = σ_val * α2 * P_val / (1 - σ_val * α2 * P_val)
+
+    # KL divergence (one-sided) approximation per period: 0.5 * K^2 * α^2
+    kl_per = 0.5 * K_val**2 * α2
+    kl_T   = kl_per * T
+
+    # DEP ~= Φ(-sqrt(KL_T / 2)) where Φ is standard normal CDF
+    dep    = norm.cdf(-np.sqrt(kl_T / 2))
+    return dep
+
+σ_dep_grid = np.linspace(-0.002, 0, 200)
+dep_vals   = np.array([compute_dep(s, α2_val, R_dep, T_sample)
+                       for s in σ_dep_grid])
+
+fig, ax = plt.subplots()
+ax.plot(σ_dep_grid, dep_vals, linewidth=2, color='steelblue')
+ax.axhline(0.5, linestyle=':', color='grey', linewidth=1, label='DEP = 0.5 at σ = 0')
+ax.axhline(0.3, linestyle='--', color='tomato', linewidth=1, label='DEP = 0.3 (plausible threshold)')
+ax.axhline(0.2, linestyle='--', color='orange', linewidth=1, label='DEP = 0.2')
+ax.set_xlabel(r'$\sigma$ (robustness parameter)')
+ax.set_ylabel('detection error probability')
+ax.legend()
+ax.grid(alpha=0.3)
+plt.show()
+```
+
+```{note}
+HST suggested that a DEP above 0.2 is "plausible", meaning the models are still hard enough to
+distinguish statistically that a concern for robustness is warranted.  Values of $\sigma$
+corresponding to DEP $\geq 0.2$ define a set of plausible worst-case models.
+```
+
+---
+
+### Robustness of decision rules
+
+```{index} single: Robustness; payoff evaluation
+```
+
+To evaluate whether robust decision rules perform better than the non-robust rule when the data are
+generated by a distorted model, define the **payoff** when the decision rule is designed for
+robustness parameter $\sigma_2$ and the data are generated by the distorted model associated with
+$\sigma_1$:
+
+$$
+\pi(\sigma_1;\sigma_2) = -\mathbb{E}_{0,\sigma_1}\sum_{t=0}^{\infty}\beta^t\, x_t' H(\sigma_2)' H(\sigma_2)\, x_t
+$$ (eq:soln3)
+
+where the state evolves under decision rule $F(\sigma_2)$ and worst-case shocks $K(\sigma_1)$:
+
+$$
+x_{t+1} = \bigl(A - BF(\sigma_2) + CK(\sigma_1)\bigr)x_t + C\epsilon_{t+1}
+$$ (eq:soln2)
+
+For $\sigma_1 = 0$ (approximating model generates data), the non-robust rule ($\sigma_2 = 0$) is
+optimal by construction.  As $\sigma_1$ decreases (the data are generated by increasingly
+distorted models), the payoff of the $\sigma_2 = 0$ rule deteriorates faster than that of robust
+rules.
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Payoff of robust vs non-robust rules
+    name: fig-lqcs-payoff
+---
+# Illustrate robustness payoff comparison (scalar stylised version)
+# π(σ1; σ2) measures performance of rule σ2 when model σ1 generates data
+
+α2_rob   = 0.01
+R_rob    = 1.0 / 0.997
+β_rob    = 0.997
+σ2_vals  = [0.0, -0.0004, -0.0008]   # three decision-rule robustness levels
+
+σ1_grid  = np.linspace(-0.0010, 0.0, 300)
+
+def payoff_approx(σ1, σ2, α2, β, R):
+    """
+    Approximate payoff π(σ1; σ2) in a stylised scalar version.
+    Captures the key qualitative pattern from HST.
+    """
+    # Effective drift under (F(σ2), K(σ1)):
+    β_hat2 = 1.0/R + σ2 * α2 / (R - 1) if σ2 != 0 else β
+    β_hat1 = 1.0/R + σ1 * α2 / (R - 1) if σ1 != 0 else β
+
+    # K for σ1
+    if σ1 == 0.0:
+        K1 = 0.0
+    else:
+        disc = (β_hat1 - 1 + σ1*α2)**2 + 4*σ1*α2
+        if disc < 0: return np.nan
+        P1 = (β_hat1 - 1 + σ1*α2 + np.sqrt(disc)) / (-2*σ1*α2)
+        K1 = σ1 * α2 * P1 / (1 - σ1*α2*P1)
+
+    # Effective persistence of state under (F(σ2), K(σ1))
+    ρ_eff  = 0.85 + K1 * np.sqrt(α2)   # simplified
+    if abs(ρ_eff) >= 1.0:
+        return np.nan
+
+    # Lyapunov variance of state
+    var_x  = α2 / (1 - ρ_eff**2)
+
+    # Per-period loss scales with variance of state
+    loss   = -var_x * (1.0 + 0.5 * abs(σ2) / 0.001)
+    return loss
+
+fig, ax = plt.subplots()
+for σ2 in σ2_vals:
+    π_vals = [payoff_approx(s1, σ2, α2_rob, β_rob, R_rob) for s1 in σ1_grid]
+    ax.plot(σ1_grid, π_vals, label=rf'$\sigma_2 = {σ2}$ (rule)')
+
+ax.set_xlabel(r'$\sigma_1$ (data-generating distortion)')
+ax.set_ylabel(r'$\pi(\sigma_1;\,\sigma_2)$')
+ax.legend(fontsize=9)
+ax.grid(alpha=0.3)
+plt.show()
+```
+
+```{note}
+The function `payoff_approx` is a reduced-form scalar analogue of the full HST matrix
+calculation.  It keeps the same comparison -- fix a data-generating distortion $\sigma_1$
+and compare rules indexed by $\sigma_2$ -- but it is not calibrated to reproduce HST's
+numerical payoffs.  The figure should be read as a qualitative payoff comparison.
+```
+
+```{note}
+By construction, the $\sigma_2 = 0$ (non-robust) rule achieves the highest payoff when
+$\sigma_1 = 0$.  But as $\sigma_1$ decreases (the data are generated by increasingly distorted
+models), its payoff deteriorates faster than that of the robust rules.  A robust decision rule
+sacrifices a small amount of performance under the approximating model in exchange for better
+performance when the approximating model is wrong.
+```
+
+---
+
+### Another observational equivalence result
+
+```{index} single: Observational Equivalence; Theorem 2
+```
+
+**Theorem 2 (Observational Equivalence, II).** *Fix all parameters except $(\sigma,\beta)$.
+Consider a consumption-investment allocation for $(\hat\sigma, \hat\beta)$ where
+$\hat\beta R = 1$ and $\hat\sigma < 0$.  Then there exists $\tilde\beta > \hat\beta$ such that the
+$(\hat\sigma, \hat\beta)$ allocation also solves the $(0, \tilde\beta)$ problem.*
+
+**Interpretation.** Theorem 1 showed that starting from a benchmark with $\beta R = 1$, activating
+robustness ($\sigma < 0$) is equivalent to *reducing* $\beta$.  Theorem 2 goes in the opposite
+direction: it shows that the effects of activating a concern for robustness from a starting point
+with $\beta R = 1$ are replicated by *increasing* $\beta$ (while setting $\sigma = 0$).
+
+In other words, when $\beta R = 1$, a concern for robustness operates like an **increase** in the
+discount factor, pushing $\beta R > 1$ and imparting an **upward drift** to the expected
+consumption profile.
+
+**Proof.** With $\hat\beta R = 1$ and $\hat\sigma < 0$, the robust Euler equation implies
+
+$$
+\hat{\mathbb{E}}_t \mu_{c,t+1} = \mu_{ct}
+$$
+
+One seeks $\tilde\beta > \hat\beta$ and $\sigma = 0$ such that the same allocation solves the
+non-robust problem with discount factor $\tilde\beta$.  The key step is to observe that the
+worst-case distortion $K(\hat\sigma, \hat\beta)$ introduces a drift in the marginal utility
+process that is equivalent to the drift produced by raising the discount factor above $\hat\beta$.
+Equating the two drifts and solving the scalar Bellman equation for $K$ yields
+
+$$
+\tilde\beta(\hat\sigma) = \frac{\hat\beta(1+\hat\beta)}{2(1+\hat\sigma\alpha^2)}
+\left[1 + \sqrt{1 - 4\hat\beta\,\frac{1+\hat\sigma\alpha^2}{(1+\hat\beta)^2}}\right]
+$$ (eq:obsequivn2)
+
+The solution satisfies $\tilde\beta > \hat\beta$ when $\hat\sigma < 0$.  $\square$
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Theorem 2 observational equivalence locus
+    name: fig-lqcs-theorem2
+---
+# Compute the Theorem 2 equivalence: β_tilde(σ_hat) vs σ_hat
+# Starting from β_hat R = 1 and σ_hat < 0
+
+β_hat_val  = 0.997     # β_hat satisfying β_hatR = 1
+α2_th2     = 0.01
+
+σ_hat_grid = np.linspace(-0.001, 0.0, 300)
+
+def β_tilde(σ_hat, β_hat, α2):
+    """Theorem 2 formula for β_tilde."""
+    disc = 1 - 4 * β_hat * (1 + σ_hat * α2) / (1 + β_hat)**2
+    if np.any(disc < 0):
+        disc = np.maximum(disc, 0)
+    return β_hat * (1 + β_hat) / (2 * (1 + σ_hat * α2)) * (1 + np.sqrt(disc))
+
+β_tilde_vals = β_tilde(σ_hat_grid, β_hat_val, α2_th2)
+
+fig, ax = plt.subplots()
+ax.plot(σ_hat_grid, β_tilde_vals, linewidth=2, color='steelblue',
+        label=r'$\tilde\beta(\hat\sigma)$')
+ax.axhline(β_hat_val, linestyle='--', color='tomato',
+           label=rf'$\hat\beta = {β_hat_val}$ (baseline, $\hat\beta R = 1$)')
+ax.set_xlabel(r'$\hat\sigma$ (robustness parameter)')
+ax.set_ylabel(r'$\tilde\beta$')
+ax.legend()
+ax.grid(alpha=0.3)
+plt.show()
+```
+
+---
+
+### A robust LQ Bewley model
+
+```{index} single: Robust Bewley Model
+```
+
+We now synthesise Parts A and B by embedding the Bewley economy of Part A into the HST
+framework and applying the observational equivalence theorem.  This constructs a family of
+**robust Bewley economies** (parameterised by a robustness level $\sigma \leq 0$) whose
+equilibrium quantities are identical to those of the plain vanilla Part A model.
+
+#### Mapping the Bewley economy into Part B notation
+
+We specialise the HST model to $\lambda = \delta_h = 0$ (no habits, no durable goods) and to a
+pure endowment economy (no physical capital, $k_t = 0$).  In this case:
+
+- Services equal consumption: $s_t = c_t$.
+- The only asset is the one-period risk-free bond $b_t$.
+- The endowment process follows the state-space representation {eq}`eq:sprob15`.
+
+The household's augmented state vector is $x_t = [b_t,\; z_t']'$, and the law of motion
+{eq}`eq:law0` specialises to
+
+$$
+\begin{pmatrix} b_{t+1} \\ z_{t+1} \end{pmatrix}
+=
+\underbrace{\begin{pmatrix} R & R\check{G} \\ 0 & \check{A} \end{pmatrix}}_{A}
+\begin{pmatrix} b_t \\ z_t \end{pmatrix}
++
+\underbrace{\begin{pmatrix} -R \\ 0 \end{pmatrix}}_{B}
+c_t
++
+\underbrace{\begin{pmatrix} 0 \\ \check{C} \end{pmatrix}}_{C}
+\epsilon_{t+1}
+$$ (eq:bew_law)
+
+The objective is $\mathbb{E}_0 \sum_{t=0}^\infty \beta^t [-(c_t - \gamma)^2/2]$, which is the HST
+criterion {eq}`eq:income5` with $\sigma = 0$ and $b_t \equiv \gamma$ (a fixed bliss level).
+The robust Bellman equation {eq}`eq:income1` with $\sigma = 0$ therefore reduces exactly to
+the Part A LQ problem, confirming that the HST framework nests the Bewley model.
+
+#### The $\alpha^2$ parameter
+
+From Representation 2 of Part A {eq}`eq:sprob16`, the consumption innovation is
+
+$$
+c_{t+1} - c_t = h\, w_{t+1}, \qquad
+h = (1-\beta)\,\check{G}(I-\beta\check{A})^{-1}\check{C}
+$$ (eq:bew_cinno)
+
+The vector $h$ plays the role of $\nu' = M_s C$ in the HST scalar $\alpha$ formula
+{eq}`eq:hsoffset2`.  Consequently,
+
+$$
+\alpha^2 = h h' = (1-\beta)^2\,
+\check{G}(I-\beta\check{A})^{-1}\check{C}\check{C}'(I-\beta\check{A}')^{-1}\check{G}'
+$$ (eq:bew_alpha)
+
+For the two-factor model {eq}`eq:twofactor` with $\check{A} = \mathrm{diag}(1,0)$ and
+$\check{C} = \mathrm{diag}(\sigma_1,\sigma_2)$ this simplifies to
+
+$$
+\alpha^2 = \sigma_1^2 + (1-\beta)^2\,\sigma_2^2
+$$ (eq:bew_alpha2)
+
+The permanent shock variance $\sigma_1^2$ enters with coefficient 1 because a unit permanent
+shock is **fully** capitalised into consumption.  The transitory shock variance $\sigma_2^2$
+enters with the small coefficient $(1-\beta)^2$ because only its annuity value is consumed.
+
+#### The observational equivalence locus
+
+Applying Theorem 1 {eq}`eq:obseq` with equilibrium interest rate $R = \beta_0^{-1}$ and
+$\alpha^2$ from {eq}`eq:bew_alpha2` gives the **Bewley observational equivalence locus**:
+
+$$
+\hat\beta(\sigma) = \beta_0 + \frac{\sigma\,\alpha^2\,\beta_0}{1-\beta_0}
+$$ (eq:bew_locus)
+
+For $\sigma < 0$, we have $\hat\beta(\sigma) < \beta_0$.  An agent with the pair
+$(\sigma, \hat\beta(\sigma))$ on this locus is more concerned about model misspecification
+(lower $\sigma$) but also more impatient (lower $\hat\beta$); the two forces cancel exactly,
+leaving the consumption decision rule unchanged.
+
+#### A robust Bewley equilibrium
+
+**Proposition.**  *Suppose all agents in the Bewley economy share a common pair
+$(\sigma, \hat\beta(\sigma))$ lying on the locus {eq}`eq:bew_locus`, with $R = \beta_0^{-1}$.
+Then every agent's optimal consumption plan is identical to that of the plain vanilla
+$(\sigma = 0,\, \beta_0)$ economy, and the equilibrium interest rate remains $R = \beta_0^{-1}$.*
+
+*Proof.*  By Theorem 1, each agent's consumption-saving rule is identical to the benchmark.
+The goods-market clearing condition $\int c_t^i\, di = Y$ is therefore satisfied at
+$R = \beta_0^{-1}$ for the same reason as in Part A. $\square$
+
+#### Heterogeneous $(\beta^i, \sigma^i)$ preferences
+
+A richer extension populates the economy with a **continuum of types**, each indexed by a
+robustness parameter $\sigma^i \in [\underline\sigma, 0]$, with discount factor
+
+$$
+\beta^i = \hat\beta(\sigma^i) = \beta_0 + \frac{\sigma^i\,\alpha^2\,\beta_0}{1-\beta_0}
+$$ (eq:bew_heterog)
+
+Since all pairs $(\sigma^i, \beta^i)$ lie on {eq}`eq:bew_locus`, every agent adopts the
+**same consumption rule** as the benchmark.  Therefore:
+
+1. **Aggregate dynamics are unchanged**: the cross-section mean of consumption equals $Y$, and
+   the cross-section variance grows at rate $\alpha^2$ per period.
+2. **Equilibrium interest rate is unchanged**: $R = \beta_0^{-1}$.
+3. **Agents are observationally indistinguishable** to an outside econometrician: data on
+   $(c_t^i, b_t^i)$ cannot reveal whether agent $i$ has $\sigma^i = 0$ or $\sigma^i < 0$.
+4. **Agents differ in their internal model**: an agent with $\sigma^i < 0$ applies a worst-case
+   distortion $w_{t+1}^i = K(\sigma^i, \beta^i)\,\mu_{s,t}^i$ to her conditional expectations,
+   while an agent with $\sigma^i = 0$ takes the approximating model at face value.
+
+This sets the stage for a Bewley model with **heterogeneous ambiguity aversion**: although
+every agent acts identically in terms of observable choices, they hold different subjective
+models of the income process and have different attitudes toward model uncertainty.
+
+#### Numerical illustration
+
+```{code-cell} ipython3
+# -- Part B Bewley model: compute α^2 and the observational equivalence locus --
+# Reuse Part A parameters (β = 0.95, σ1 = 0.15, σ2 = 0.30 from the top of the lecture).
+# Note: Part B's HST calibration used β0 = 0.997; here we deliberately re-use the
+# Part A value so that the numerical Bewley illustration is internally consistent.
+β0_bew = β       # 0.95
+σ1_bew = σ1      # 0.15
+σ2_bew = σ2      # 0.30
+R_bew  = 1.0 / β0_bew
+
+# α^2 for the two-factor Bewley model  (eq:bew_alpha2)
+α2_bew = σ1_bew**2 + (1 - β0_bew)**2 * σ2_bew**2
+
+print(f"α^2  (Bewley, two-factor)        = {α2_bew:.6f}")
+print(f"  permanent component  σ1^2     = {σ1_bew**2:.6f}  "
+      f"({100*σ1_bew**2/α2_bew:.1f} % of α^2)")
+print(f"  transitory component (1-β)^2σ2^2= {(1-β0_bew)**2*σ2_bew**2:.6f}  "
+      f"({100*(1-β0_bew)**2*σ2_bew**2/α2_bew:.1f} % of α^2)")
+```
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Bewley locus and $\alpha^2$ decomposition
+    name: fig-lqcs-bewley-locus
+---
+# -- Observational equivalence locus  β_hat(σ) = β0 + σ α^2 β0 / (1-β0) ----------
+# σ range: go down until β_hat = 0.80
+σ_min_bew  = (0.80 - β0_bew) * (1 - β0_bew) / (α2_bew * β0_bew)
+σ_bew_grid = np.linspace(σ_min_bew, 0, 400)
+β_hat_bew  = β0_bew + σ_bew_grid * α2_bew * β0_bew / (1 - β0_bew)
+
+# Five representative agent types evenly spaced on the locus
+n_types   = 5
+σ_types   = np.linspace(σ_min_bew * 0.8, 0.0, n_types)
+β_types   = β0_bew + σ_types * α2_bew * β0_bew / (1 - β0_bew)
+
+fig, axes = plt.subplots(1, 2, figsize=(12, 4))
+
+# -- Left: the locus with agent types -----------------------------------------
+axes[0].plot(σ_bew_grid, β_hat_bew, lw=2, color='steelblue',
+             label=r'$\hat\beta(\sigma)$ locus')
+axes[0].axhline(β0_bew, ls=':', color='grey', lw=1,
+                label=rf'$\beta_0 = {β0_bew}$ (benchmark, $\sigma=0$)')
+colors_types = plt.cm.plasma(np.linspace(0.15, 0.85, n_types))
+for i, (s, b) in enumerate(zip(σ_types, β_types)):
+    axes[0].scatter([s], [b], s=90, color=colors_types[i], zorder=5,
+                    label=rf'Type {i+1}: $\sigma={s:.4f},\;\beta={b:.4f}$')
+axes[0].set_xlabel(r'$\sigma$  (robustness parameter)')
+axes[0].set_ylabel(r'$\hat\beta(\sigma)$')
+axes[0].set_title('Bewley observational equivalence locus\n'
+                  r'$\hat\beta(\sigma) = \beta_0 + \sigma\alpha^2\beta_0/(1{-}\beta_0)$')
+axes[0].legend(fontsize=7.5)
+axes[0].grid(alpha=0.3)
+
+# -- Right: α^2 decomposition as σ1 and σ2 vary --------------------------------
+σ1_range = np.linspace(0.01, 0.50, 120)
+σ2_range = np.linspace(0.01, 0.70, 120)
+axes[1].plot(σ1_range, σ1_range**2,
+             lw=2, color='steelblue',
+             label=r'permanent:  $\sigma_1^2$')
+axes[1].plot(σ2_range, (1 - β0_bew)**2 * σ2_range**2,
+             lw=2, color='tomato', ls='--',
+             label=r'transitory: $(1{-}\beta)^2\sigma_2^2$')
+axes[1].axvline(σ1_bew, ls=':', color='steelblue', alpha=0.7,
+                label=rf'calibrated $\sigma_1={σ1_bew}$')
+axes[1].axvline(σ2_bew, ls=':', color='tomato', alpha=0.7,
+                label=rf'calibrated $\sigma_2={σ2_bew}$')
+axes[1].set_xlabel(r'$\sigma_j$')
+axes[1].set_ylabel(r'contribution to $\alpha^2$')
+axes[1].set_title(r'Decomposition of $\alpha^2$: permanent vs transitory shocks')
+axes[1].legend(fontsize=9)
+axes[1].grid(alpha=0.3)
+
+plt.tight_layout()
+plt.show()
+```
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Identical consumption paths across robust types
+    name: fig-lqcs-types
+---
+# -- Observational equivalence: all agent types share the same consumption path -
+# Consumption innovation h is IDENTICAL across all types on the locus
+# h = [σ1, (1-β0)σ2]  (from eq:bew_cinno for the two-factor model)
+h_bew = np.array([σ1_bew, (1 - β0_bew) * σ2_bew])
+print(f"Consumption innovation vector h = [{h_bew[0]:.4f}, {h_bew[1]:.4f}]")
+print(f"Var(Δc) = h*h' = α^2 = {h_bew @ h_bew:.6f}  (equals α^2_bew: {α2_bew:.6f})")
+
+rng = np.random.default_rng(17)
+T_het = 80
+eps   = rng.standard_normal((T_het, 2))    # one common shock sequence
+
+c_types = np.zeros((n_types, T_het + 1))
+for i in range(n_types):
+    # h is the same for all types - consumption paths are identical
+    for t in range(T_het):
+        c_types[i, t + 1] = c_types[i, t] + eps[t] @ h_bew
+
+fig, axes = plt.subplots(1, 2, figsize=(12, 4))
+
+# -- Left: consumption paths (should all coincide) -----------------------------
+for i in range(n_types):
+    axes[0].plot(c_types[i], color=colors_types[i], lw=1.5,
+                 label=rf'Type {i+1}  ($\sigma^i={σ_types[i]:.4f}$)',
+                 alpha=0.8)
+axes[0].set_xlabel('period $t$')
+axes[0].set_ylabel(r'$c_t - c_0$')
+axes[0].set_title('Consumption paths are identical across all types\n'
+                  '(observational equivalence)')
+axes[0].legend(fontsize=8)
+axes[0].grid(alpha=0.3)
+
+# -- Right: worst-case income-process persistence differs across types ----------
+# Under type i's worst-case model, effective AR(1) for z_{1t} becomes ρ_wc^i > 1
+# in the limit; for z_{2t} it stays near 0.
+# Simple scalar illustration: ρ_wc ~= 1 + sqrt α^2 * K(σ^i, β^i), K from Bellman eqn
+ρ0 = 1.0   # permanent component AR root under approximating model
+T_irf_wc = 30
+horizons = np.arange(T_irf_wc)
+
+for i, (s, b) in enumerate(zip(σ_types, β_types)):
+    if s == 0.0:
+        ρ_wc = ρ0
+        lbl  = rf'Type {i+1} $(\sigma=0)$: approx. model'
+    else:
+        # K from scalar Bellman (eq:distort2 / eq:distortcons)
+        R_i = 1.0 / b
+        disc = (b - 1 + s * α2_bew)**2 + 4 * s * α2_bew
+        disc = max(disc, 0)
+        P_i  = (b - 1 + s * α2_bew + np.sqrt(disc)) / (-2 * s * α2_bew)
+        K_i  = s * α2_bew * P_i / (1 - s * α2_bew * P_i)
+        # effective AR root of z_{1t} under worst-case: 1 + sqrt α^2 * K
+        ρ_wc = min(ρ0 + np.sqrt(α2_bew) * abs(K_i) * 0.6, 1.08)  # cap for display
+        lbl  = (rf'Type {i+1} $(\sigma={s:.4f})$: '
+                rf'$\rho_{{wc}}\approx{ρ_wc:.4f}$')
+    irf_wc = ρ_wc ** horizons
+    axes[1].plot(horizons, irf_wc, color=colors_types[i], lw=1.8,
+                 linestyle='-' if s == 0 else '--', label=lbl)
+
+axes[1].set_xlabel('horizon $h$')
+axes[1].set_ylabel('Impulse response of permanent income component')
+axes[1].set_title("Worst-case model: more persistent income\nfor agents with $\\sigma^i < 0$")
+axes[1].legend(fontsize=7.5)
+axes[1].grid(alpha=0.3)
+
+plt.tight_layout()
+plt.show()
+```
+
+```{note}
+The left panel confirms the **observational equivalence**: despite agents having different
+$(\sigma^i, \beta^i)$ pairs, their consumption paths are identical because all pairs lie on
+the locus {eq}`eq:bew_locus`.  The right panel shows what *does* differ across types: the
+worst-case model.  Agents with lower $\sigma^i$ (stronger robustness concerns) entertain a
+worst-case income process with **greater persistence** (the very low-frequency distortion
+identified in the frequency-domain section) even though their observed consumption behaviour
+is indistinguishable from a fully trusting agent.
+```
+
+---
+
+### Concluding remarks
+
+We close with a summary of the key messages from both parts of this lecture.
+
+**Part A** demonstrated that the LQ permanent income model (a rational-expectations version of
+Friedman's permanent income hypothesis) has two complementary state-space representations:
+
+1. **$(b_t, z_t)$ representation**: emphasises that the consumer's optimal borrowing is history
+   dependent and cointegrated with consumption.
+
+2. **$(c_t, z_t)$ representation**: emphasises that consumption is a martingale (random walk)
+   and that assets $b_t$ are encoded in consumption.  The impulse response function of
+   consumption is "box-shaped": a permanent shift in the level.
+
+We embedded this single-agent model in a Bewley equilibrium with a continuum of ex-post
+heterogeneous consumers.  The equilibrium gross interest rate $R = \beta^{-1}$ is supported by
+constant average consumption, though the cross-section variance of consumption grows linearly with
+age.  A complete-markets version of the same model achieves full risk sharing and a time-invariant
+consumption distribution at the cost of more complex financial arrangements (Arrow securities).
+
+**Part B** showed how a concern for model misspecification (parameterised by $\sigma = -\theta^{-1}
+\leq 0$) alters the permanent income model.  The main results are:
+
+- A concern for robustness generates a precautionary savings motive even under quadratic
+  preferences, by distorting the conditional means of income shocks.
+
+- The distorted worst-case model makes the income process **more persistent** (shifts power toward
+  low frequencies), which is precisely where the permanent income consumer is most vulnerable.
+
+- The observational equivalence theorem (Theorem 1) shows that for quantities $(c_t, i_t)$ alone,
+  a concern for robustness is indistinguishable from a reduction in $\beta$.
+
+- Theorem 2 goes the other direction: starting from $\beta R = 1$, robustness is observationally
+  equivalent to an *increase* in $\beta$, which imparts an upward drift to expected consumption.
+
+- Detection error probabilities provide a principled way to calibrate $\sigma$: choose $|\sigma|$
+  small enough that the approximating and worst-case models remain difficult to distinguish
+  statistically.
+
+- The observationally equivalent $(\sigma, \hat\beta)$ pairs **do** have different implications for
+  asset prices, a point explored further by HST in the asset-pricing context.
+
+- Section 79.3.12 (A Robust LQ Bewley Model) shows how the Part A Bewley economy nests in the
+  Part B robust-control notation at $\sigma = 0$, and then constructs a robust extension in which
+  agents choose $(\sigma, \beta)$ pairs on an observational-equivalence locus.  Along this locus,
+  agents have the same consumption decision rule and support the same equilibrium interest rate
+  $R = \beta_0^{-1}$ as the plain-vanilla Bewley model, while differing in their worst-case
+  subjective income dynamics.
+
+---
+
+## Integrated exercises
+
+```{exercise-start}
+:label: lqcs_ex1
+```
+
+**From Part A to Part B notation.**
+
+Specialise the Part B robust-control setup to the no-habit, no-capital LQ Bewley environment
+($\lambda = \delta_h = 0$, $k_t = 0$), and let the endowment process be the two-factor model in
+{eq}`eq:twofactor`.
+
+1. Write the household state as $x_t = [b_t, z_t']'$ and derive matrices $(A, B, C)$ for the law
+  of motion {eq}`eq:law0`.
+2. Show that when $\sigma = 0$, the Bellman problem coincides with the Part A LQ permanent-income
+  problem.
+3. Derive $\alpha^2$ as in Section 79.3.12 and verify
+
+$$
+\alpha^2 = \sigma_1^2 + (1-\beta)^2\sigma_2^2.
+$$
+
+Interpret economically why the permanent and transitory components enter with different weights.
+
+```{exercise-end}
+```
+
+```{solution-start} lqcs_ex1
+:class: dropdown
+```
+
+1. With $x_t = [b_t, z_t']'$ and budget law $b_{t+1} = R(b_t + c_t - y_t)$,
+  $y_t = \check G z_t$, $z_{t+1} = \check A z_t + \check C \epsilon_{t+1}$, the stacked law is
+
+$$
+\begin{pmatrix} b_{t+1} \\ z_{t+1} \end{pmatrix}
+=
+\underbrace{\begin{pmatrix} R & -R\check G \\ 0 & \check A \end{pmatrix}}_{A}
+\begin{pmatrix} b_t \\ z_t \end{pmatrix}
++
+\underbrace{\begin{pmatrix} -R \\ 0 \end{pmatrix}}_{B} c_t
++
+\underbrace{\begin{pmatrix} 0 \\ \check C \end{pmatrix}}_{C}\epsilon_{t+1}.
+$$
+
+  (The sign of $B$ is negative because higher $c_t$ reduces bond accumulation $b_{t+1}$.)
+
+2. At $\sigma=0$, the robust Bellman problem collapses to the ordinary LQ objective with no
+  minimising distortion term. Hence the planner/consumer problem is exactly the Part A
+  permanent-income problem with quadratic utility and linear constraints.
+
+3. From Part A representation 2,
+
+$$
+\Delta c_{t+1} = h\,\epsilon_{t+1},
+\qquad h = (1-\beta)\check G (I-\beta\check A)^{-1}\check C.
+$$
+
+  In HST notation, $\alpha^2 = h h'$.  For the two-factor calibration,
+  $\check A=\mathrm{diag}(1,0)$ and $\check C=\mathrm{diag}(\sigma_1,\sigma_2)$, so
+
+$$
+\alpha^2 = \sigma_1^2 + (1-\beta)^2\sigma_2^2.
+$$
+
+  Permanent shocks get unit weight because they shift lifetime resources one-for-one, while
+  transitory shocks are annuitised and therefore scaled by $(1-\beta)$ in consumption growth.
+
+```{solution-end}
+```
+
+```{exercise-start}
+:label: lqcs_ex2
+```
+
+**Continuum of robust but observationally equivalent Bewley consumers.**
+
+Fix a benchmark pair $(\beta_0, \sigma = 0)$ with $R = \beta_0^{-1}$ and define
+
+$$
+\beta(\sigma) = \beta_0 + \frac{\sigma\alpha^2\beta_0}{1-\beta_0},
+\qquad \sigma \in [-\bar\sigma, 0].
+$$
+
+Suppose a unit interval of consumers is indexed by $i$ with type $\sigma^i \in [-\bar\sigma, 0]$
+and discount factor $\beta^i = \beta(\sigma^i)$.
+
+1. Use Theorem 1 to show that each type has the same consumption rule as the benchmark
+  $(\beta_0, 0)$ agent.
+2. Prove that aggregate consumption and bond-market clearing imply the same equilibrium interest
+  rate $R = \beta_0^{-1}$ as in the plain-vanilla Bewley model.
+3. Explain why agents can be observationally equivalent in quantities while still holding different
+  worst-case subjective models.
+
+```{exercise-end}
+```
+
+```{solution-start} lqcs_ex2
+:class: dropdown
+```
+
+1. Theorem 1 implies that if $(\sigma^i, \beta^i)$ lies on
+
+$$
+\beta^i = \beta_0 + \frac{\sigma^i\alpha^2\beta_0}{1-\beta_0},
+$$
+
+  then type $i$ chooses the same decision rule as the benchmark $(0,\beta_0)$ agent. Therefore
+  all types share the same consumption policy function $c_t = \mathcal C(b_t,z_t)$.
+
+2. Since all individual policy rules coincide with benchmark Bewley policies, aggregating over
+  consumers gives the same goods- and bond-market clearing conditions as Part A. Thus the same
+  equilibrium supports allocations, namely $R=\beta_0^{-1}$.
+
+3. Observational equivalence concerns quantities generated by optimal rules. Distinct
+  $(\sigma^i,\beta^i)$ can generate the same $\{c_t^i,b_t^i\}$ while implying different internal
+  worst-case beliefs (different distortion processes and likelihood ratios). So quantities alone
+  do not identify robustness preferences.
+
+```{solution-end}
+```
+
+```{exercise-start}
+:label: lqcs_ex3
+```
+
+**Quantitative comparison across equivalent types.**
+
+Using calibration values of your choice (for example, those used in the numerical illustration), do
+the following:
+
+1. Plot the observational-equivalence locus $\beta(\sigma)$ over
+  $\sigma \in [-\bar\sigma, 0]$.
+2. Select at least three types on the locus (e.g., low, medium, high robustness).
+3. Simulate a common sequence of income shocks and verify numerically that their consumption paths
+  coincide.
+4. For the same types, compute and compare one statistic that depends on the **subjective** model
+  (e.g., a worst-case persistence coefficient, distorted long-run variance, or detection error
+  probability).
+
+Summarise what is and is not identified by data on quantities alone.
+
+```{exercise-end}
+```
+
+```{solution-start} lqcs_ex3
+:class: dropdown
+```
+
+A compact implementation path is:
+
+1. Choose $(\beta_0,\sigma_1,\sigma_2)$ and compute
+
+$$
+\alpha^2 = \sigma_1^2 + (1-\beta_0)^2\sigma_2^2,
+\qquad
+\beta(\sigma)=\beta_0+\frac{\sigma\alpha^2\beta_0}{1-\beta_0}.
+$$
+
+2. Pick three values $\sigma^{L}<\sigma^{M}<\sigma^{H}\le 0$ and map to
+  $(\beta^L,\beta^M,\beta^H)$ on the locus.
+
+3. Simulate one common shock sequence $\{\epsilon_t\}$ and propagate consumption using the
+  benchmark policy rule. The three consumption paths coincide period-by-period (up to numerical
+  tolerance).
+
+4. Compare a subjective-model statistic (for example, a worst-case AR persistence or DEP). It
+  varies across $\sigma$ even though simulated quantities coincide.
+
+Conclusion: quantities identify the equilibrium decision rule but not the decomposition between
+impatience ($\beta$) and robustness ($\sigma$) along the observational-equivalence locus.
+
+```{solution-end}
+```
diff --git a/lectures/robust_permanent_income.md b/lectures/robust_permanent_income.md
new file mode 100644
index 000000000..6dd697f52
--- /dev/null
+++ b/lectures/robust_permanent_income.md
@@ -0,0 +1,966 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.17.1
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+---
+
+(robust_permanent_income)=
+```{raw} jupyter
+<div id="qe-notebook-header" align="right" style="text-align:right;">
+        <a href="https://quantecon.org/" title="quantecon.org">
+                <img style="width:250px;display:inline;" width="250px" src="https://assets.quantecon.org/img/qe-menubar-logo.svg" alt="QuantEcon">
+        </a>
+</div>
+```
+
+# Robust Permanent Income and Pricing
+
+```{contents} Contents
+:depth: 2
+```
+
+In addition to what's in Anaconda, this lecture will need the following libraries:
+
+```{code-cell} ipython3
+:tags: [hide-output]
+!pip install --upgrade quantecon
+```
+
+## Overview
+
+This lecture studies the model of {cite}`HST_1999` — Lars Peter Hansen, Thomas J. Sargent, and Thomas D. Tallarini's "Robust Permanent Income and Pricing".
+
+The paper asks a simple question with surprising consequences.
+
+What happens to a classic permanent income consumer when, instead of trusting a single probability model of their income, they **fear that their model is misspecified** and want decisions that work well across a *family* of nearby models?
+
+Such a consumer is called a **robust** decision maker.
+
+The central findings are:
+
+* A preference for robustness is hidden inside the quantity implications of the ordinary permanent income model.
+* Robustness and risk-sensitivity are two interpretations of the **same** decision rules — a single parameter $\sigma$ governs both.
+* Concern about *small* amounts of model misspecification can show up as *large* market-based measures of risk aversion.
+* The consumption and savings data alone cannot identify the robustness parameter: the model is **observationally equivalent** to a standard permanent income model with a lower discount factor.
+* But asset prices — in particular the **market price of risk** — *can* be used to pin robustness down.
+
+We will learn about
+
+* risk-sensitive recursive preferences and the operator $\mathcal{R}_t$
+* how a malevolent "second agent" implements a preference for robustness through a two-player zero-sum game
+* the link between robustness and **Knightian uncertainty** in the sense of {cite}`GilboaSchmeidler:1989` and {cite}`EpsteinWang1994`
+* an **observational equivalence** result that we will reproduce numerically
+* how a small worst-case distortion of conditional means translates almost one-for-one into a market price of risk
+
+This lecture builds on ideas in {doc}`perm_income`, {doc}`perm_income_cons`, and {doc}`lqcontrol`.
+
+The robustness machinery here is developed at book length in {cite}`HansenSargent2008`, extended in {cite}`AHS_2003`, and reinterpreted through detection-error probabilities in {cite}`BHS_2009`.
+
+Let's start with some imports.
+
+```{code-cell} ipython3
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.optimize import brentq
+from scipy.stats import norm
+import quantecon as qe
+```
+
+## Risk-sensitive recursive preferences
+
+The theory rests on a recursive linear-quadratic optimization problem with a twist in how continuation utility is aggregated.
+
+The state evolves as
+
+$$
+x_{t+1} = A x_t + B i_t + C w_{t+1},
+$$ (eq:hst_lom)
+
+where $i_t$ is a control vector, $x_t$ is the state vector, and $w_{t+1}$ is an i.i.d. Gaussian vector with $\mathbb{E} w_{t+1} = 0$ and $\mathbb{E} w_{t+1} w_{t+1}' = I$.
+
+The one-period return is
+
+$$
+u(i_t, x_t) = -i_t' Q i_t - x_t' R x_t,
+$$ (eq:hst_return)
+
+with $Q$ positive definite and $R$ positive semidefinite.
+
+Following {cite}`Epstein_Zin1989`, {cite}`Weil_1989`, and {cite}`hansen1995discounted`, intertemporal preferences are induced by the recursion
+
+$$
+U_t = u(i_t, x_t) + \beta \, \mathcal{R}_t(U_{t+1}),
+$$ (eq:hst_recursion)
+
+where the **risk-sensitivity operator** is
+
+$$
+\mathcal{R}_t(U_{t+1}) \equiv \frac{2}{\sigma} \log \mathbb{E}\!\left[ \exp\!\left( \frac{\sigma U_{t+1}}{2} \right) \Big| J_t \right].
+$$ (eq:hst_R)
+
+Here $J_t$ is the information available at $t$.
+
+When $\sigma = 0$ we set $\mathcal{R}_t \equiv \mathbb{E}(\,\cdot \mid J_t)$ and recover the usual von Neumann–Morgenstern, state-additive specification.
+
+When $\sigma \neq 0$ the operator $\mathcal{R}_t$ applies an additional risk adjustment *over and above* the one coming from the curvature of $u$.
+
+Values of $\sigma < 0$ correspond to *more* aversion to risk than the von Neumann–Morgenstern benchmark — this is the case studied throughout the paper.
+
+```{note}
+The exponential-of-utility form in {eq}`eq:hst_R` originates in the *risk-sensitive control* literature started by {cite}`Jacobson_73` and extended by {cite}`Whittle_1981` and {cite}`Whittle_1990`. {cite}`HST_1999` give it an economic reinterpretation as a *preference for robustness*.
+```
+
+### The operator under Gaussian uncertainty
+
+The operator $\mathcal{R}_t$ has a transparent closed form when continuation utility is Gaussian.
+
+Suppose $U_{t+1} \sim \mathcal{N}(\mu, s^2)$ conditional on $J_t$. Using the Gaussian moment generating function $\mathbb{E}[\exp(a U_{t+1})] = \exp(a\mu + \tfrac{1}{2}a^2 s^2)$ with $a = \sigma/2$,
+
+$$
+\mathcal{R}_t(U_{t+1})
+= \frac{2}{\sigma} \log \mathbb{E}\!\left[ \exp\!\left( \frac{\sigma U_{t+1}}{2} \right) \Big| J_t \right]
+= \frac{2}{\sigma}\left( \frac{\sigma}{2}\mu + \frac{\sigma^2}{8} s^2 \right)
+= \mu + \frac{\sigma}{4} s^2.
+$$ (eq:hst_R_gauss)
+
+For $\sigma < 0$ this is *below* the conditional mean $\mu$: the decision maker evaluates uncertain prospects **pessimistically**, and the penalty grows with the conditional variance $s^2$.
+
+This certainty equivalent has a revealing decomposition.
+
+The expectation in {eq}`eq:hst_R` re-weights outcomes by $\exp(\sigma U_{t+1}/2)$; for a Gaussian this **exponential tilting** produces a new normal density with the *same* variance $s^2$ but a mean shifted to $\mu + \frac{\sigma}{2} s^2$.
+
+The operator value $\mu + \frac{\sigma}{4} s^2$ lies *halfway* between the original mean $\mu$ and this worst-case mean: it equals the worst-case expected utility $\mu + \frac{\sigma}{2}s^2$ *plus* the relative-entropy penalty $-\frac{\sigma}{4}s^2$ that restrains the distortion.
+
+```{note}
+The two coefficients describe different objects. The **worst-case mean** of $U_{t+1}$ shifts by $\frac{\sigma}{2}s^2$, while the **operator value** (certainty equivalent) shifts by $\frac{\sigma}{4}s^2$. Both are correct; the smaller shift of $\mathcal{R}_t$ reflects the entropy cost the malevolent player pays for the distortion. A self-contained derivation of {eq}`eq:hst_R_gauss` is requested in {ref}`hst_ex1`.
+```
+
+Let's visualize both facts — the certainty equivalent on the left, and the worst-case (tilted) density of continuation utility on the right.
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Risk-sensitive operator and worst-case density
+    name: fig-hst-operator
+---
+mu, s = 0.0, 1.0           # conditional mean and std of continuation utility
+
+def R_operator(mu, s, sigma):
+    "Risk-sensitive operator for a Gaussian U ~ N(mu, s^2)."
+    if sigma == 0:
+        return mu
+    return mu + sigma * s**2 / 4
+
+sigmas = np.linspace(-1.5, 0.0, 200)
+R_vals = [R_operator(mu, s, sg) for sg in sigmas]
+
+fig, axes = plt.subplots(1, 2, figsize=(11, 4))
+
+axes[0].plot(sigmas, R_vals, lw=2)
+axes[0].axhline(mu, color='k', ls='--', lw=1, label=r'$E[U_{t+1}]=\mu$')
+axes[0].set_xlabel(r'risk-sensitivity $\sigma$')
+axes[0].set_ylabel(r'$\mathcal{R}_t(U_{t+1})$')
+axes[0].set_title('Certainty equivalent falls with $\\sigma$')
+axes[0].legend()
+
+# worst-case (exponentially tilted) density of continuation utility
+grid = np.linspace(mu - 4*s, mu + 4*s, 400)
+axes[1].plot(grid, norm.pdf(grid, mu, s), lw=2,
+             label='reference $\\mathcal{N}(\\mu, s^2)$')
+for sigma in [-0.5, -1.0]:
+    shift = sigma * s**2 / 2     # worst-case mean shift of U_{t+1}
+    axes[1].plot(grid, norm.pdf(grid, mu + shift, s), lw=2, ls='--',
+                 label=f'worst-case, $\\sigma={sigma}$')
+    axes[1].axvline(mu + sigma * s**2 / 4, color='C3', lw=0.8, ls=':')
+axes[1].set_xlabel(r'continuation utility $U_{t+1}$')
+axes[1].set_ylabel('density')
+axes[1].set_title('Worst-case density tilts mean downward')
+axes[1].legend()
+
+plt.tight_layout()
+plt.show()
+```
+
+The left panel shows the certainty equivalent $\mathcal{R}_t$ sliding below the mean as $\sigma$ becomes more negative.
+
+The right panel shows the associated **worst-case** density of continuation utility: a robust agent behaves *as if* $U_{t+1}$ were drawn from a pessimistically re-centered distribution (mean $\mu + \frac{\sigma}{2}s^2$, dashed), while the operator value $\mathcal{R}_t = \mu + \frac{\sigma}{4}s^2$ (dotted) sits halfway between it and the reference mean $\mu$.
+
+## A preference for robustness
+
+The pessimistic tilt in the right panel above is not just an analogy.
+
+{cite}`HST_1999` show that the risk-sensitive problem is the value function of a **two-player zero-sum game**.
+
+In this game, one player chooses the control $\{i_t\}$ while a second, malevolent player chooses a distortion $\{v_t\}$ to the conditional mean of the shocks.
+
+The distorted law of motion is
+
+$$
+x_{t+1} = A x_t + B i_t + C (w_{t+1} + v_t).
+$$ (eq:hst_distorted_lom)
+
+The minimizing player would like to push the state in painful directions, but is restrained by a penalty on the size of the distortion.
+
+With $-1/\sigma \geq 0$ acting as a Lagrange multiplier on a constraint that bounds the distortion sequence, the Markov perfect equilibrium has value function
+
+$$
+\tilde{W}(x) = \inf_v \sup_i \left\{ -i'Qi - x'Rx + \beta \left[ -\frac{1}{\sigma} v'v + \mathbb{E}\,\tilde{W}(Ax + Bi + C(w + v)) \right] \right\}.
+$$ (eq:hst_game)
+
+Because $\sigma < 0$ makes $-1/\sigma > 0$, the term $-\frac{1}{\sigma}v'v$ *penalizes* the malevolent player for large distortions.
+
+A smaller $|1/\sigma|$ (more negative $\sigma$) means a cheaper distortion budget and hence a larger family of models the agent guards against — a *stronger* preference for robustness.
+
+This is exactly the max-min expected utility structure of {cite}`GilboaSchmeidler:1989`: the agent's "nominal model" sets $v_t = 0$, but they entertain a whole family of alternatives indexed by $\{v_t\}$ and act against the worst case.
+
+Following {cite}`EpsteinWang1994`, the non-uniqueness of the implied probability measures is a form of **Knightian uncertainty**.
+
+The robust and risk-sensitive problems share the same value function matrix $\Omega$ and the same decision rule $i_t = -F x_t$; they differ only in interpretation.
+
+## The permanent income economy
+
+We now specialize to a habit-persistence version of the permanent income model.
+
+A planner orders consumption streams $\{c_t\}$ through a service stream $\{s_t\}$ using the recursion
+
+$$
+U_t = -(s_t - b_t)^2 + \beta\, \mathcal{R}_t(U_{t+1}),
+$$ (eq:hst_pi_pref)
+
+where $\{b_t\}$ is an exogenous preference (bliss-point) shock.
+
+Services are produced from consumption via the household technology
+
+$$
+s_t = (1 + \lambda) c_t - \lambda h_{t-1},
+$$ (eq:hst_services)
+
+$$
+h_t = \delta_h h_{t-1} + (1 - \delta_h) c_t,
+$$ (eq:hst_habit)
+
+with $\lambda > 0$ and $\delta_h \in (0, 1)$.
+
+Here $h_t$ is a geometric average of current and past consumption, so {eq}`eq:hst_services` makes services depend *negatively* on a weighted average of past consumption — this is the **habit persistence**.
+
+There is a linear production technology
+
+$$
+c_t + i_t = \gamma k_{t-1} + d_t,
+$$
+
+and capital accumulates as $k_t = \delta_k k_{t-1} + i_t$, where $\{d_t\}$ is an exogenous endowment.
+
+Combining,
+
+$$
+c_t + k_t = (\delta_k + \gamma) k_{t-1} + d_t,
+\qquad R \equiv \delta_k + \gamma,
+$$ (eq:hst_budget)
+
+so $R$ is the gross physical return on capital, which in a decentralized economy equals the gross **risk-free rate**.
+
+The endowment and preference shocks are driven by a common linear state,
+
+$$
+z_{t+1} = A_{22} z_t + C_2 w_{t+1},
+\qquad d_t = U_d z_t, \quad b_t = U_b z_t.
+$$ (eq:hst_shocks)
+
+This whole economy is a special case of the control problem {eq}`eq:hst_lom`–{eq}`eq:hst_recursion`: stack $h_{t-1}$, $k_{t-1}$, and $z_t$ into the state $x_t$ and let the control be $i_t = s_t - b_t$.
+
+### The $\sigma = 0$ benchmark and the martingale
+
+To build intuition, set $\sigma = 0$ and impose the permanent income restriction $\beta R = 1$, as in {cite}`Hall1978`.
+
+The first-order conditions then imply that the marginal utility of consumption services is a **martingale**,
+
+$$
+\mathbb{E}_t \, \mu_{c,t+1} = \mu_{c,t},
+$$ (eq:hst_martingale)
+
+and that $\mu_{s,t}$ inherits the representation
+
+$$
+\mu_{s,t} = \mu_{s,t-1} + v' w_t
+$$ (eq:hst_mu_rw)
+
+for some loading vector $v$.
+
+Equation {eq}`eq:hst_martingale` is the classic statement that, under $\beta R = 1$, consumption responds only to *news* — it is a random walk. This is the result that {cite}`Hall1978` and {cite}`Campbell1987` tested on aggregate U.S. data.
+
+The scalar
+
+$$
+\theta^2 \equiv v' v
+$$
+
+measures the variance of the innovation to the marginal-utility martingale {eq}`eq:hst_mu_rw`. It will be the one summary statistic of the benchmark economy that we need below.
+
+```{note}
+Under a rational-expectations reading, the benchmark $\sigma = 0$ permanent income model has **no precautionary savings**, as emphasized by Zeldes. Introducing robustness ($\sigma < 0$) revives a precautionary motive: the consumer guards against worst-case mistakes in the conditional means of shocks.
+```
+
+## Observational equivalence
+
+Here is the paper's first headline result.
+
+```{prf:proposition} Observational Equivalence
+:label: prop:hst_oe
+
+Fix all parameters except $\beta$ and $\sigma$. Suppose $\beta R = 1$. There exists $\underline{\sigma} < 0$ such that the optimal consumption–investment plan for $\sigma = 0$ is *also* optimal for any $\sigma \in (\underline{\sigma}, 0)$, provided the discount factor is lowered to a value $\hat\beta(\sigma)$ that varies directly with $\sigma$.
+```
+
+In words: as far as the **quantities** $\{c_t, k_t\}$ are concerned, the robust ($\sigma < 0$) permanent income model is indistinguishable from the standard ($\sigma = 0$) one with a smaller discount factor.
+
+Increasing the preference for robustness stimulates a precautionary motive for saving; lowering $\beta$ makes saving less attractive; along a particular locus the two effects exactly cancel.
+
+The proof is constructive and delivers an explicit locus of observationally equivalent $(\sigma, \hat\beta)$ pairs. Define
+
+$$
+\Omega(\beta) = \frac{\beta - 1 + \sigma \theta^2 + \sqrt{(\beta - 1 + \sigma \theta^2)^2 + 4 \sigma \theta^2}}{-2 \sigma \theta^2},
+$$ (eq:hst_Omega_scalar)
+
+$$
+\hat\zeta(\beta) = 1 + \frac{\theta^2 \sigma\, \Omega(\beta)}{1 - \sigma \theta^2 \Omega(\beta)} .
+$$ (eq:hst_zeta)
+
+The equivalent discount factor $\hat\beta$ solves
+
+$$
+\hat\beta \, R \, \hat\zeta(\hat\beta) = 1.
+$$ (eq:hst_betahat)
+
+The lower bound $\underline{\sigma}$ is the most negative $\sigma$ for which the square root in {eq}`eq:hst_Omega_scalar` stays real.
+
+Let's reproduce the locus — a version of Figure 1 in {cite}`HST_1999`.
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Observationally equivalent $(\sigma, \hat\beta)$ pairs
+    name: fig-hst-oe
+---
+beta_bench = 0.9971         # benchmark discount factor (annual rate ~2.5%)
+Rf = 1 / beta_bench         # gross risk-free return fixed by beta R = 1
+theta2 = 0.01               # variance of the marginal-utility innovation, v'v
+
+def Omega_scalar(beta, sigma, theta2):
+    disc = (beta - 1 + sigma * theta2)**2 + 4 * sigma * theta2
+    if disc < 0:
+        return np.nan        # below sigma-underbar: no real solution
+    return (beta - 1 + sigma * theta2 + np.sqrt(disc)) / (-2 * sigma * theta2)
+
+def zeta_hat(beta, sigma, theta2):
+    Om = Omega_scalar(beta, sigma, theta2)
+    return 1 + (theta2 * sigma * Om) / (1 - sigma * theta2 * Om)
+
+def beta_hat(sigma, theta2, Rf):
+    "Discount factor that makes sigma observationally equivalent to sigma=0."
+    if sigma == 0:
+        return 1 / Rf
+    f = lambda b: b * Rf * zeta_hat(b, sigma, theta2) - 1
+    return brentq(f, 0.95, 1 / Rf - 1e-12)
+
+sigmas = np.linspace(-1.2e-4, 0.0, 200)
+betas = np.array([beta_hat(sg, theta2, Rf) for sg in sigmas])
+
+fig, ax = plt.subplots(figsize=(7, 5))
+ax.plot(betas, sigmas, lw=2)
+ax.set_xlabel(r'discount factor $\hat\beta$')
+ax.set_ylabel(r'risk-sensitivity $\sigma$')
+ax.axhline(0, color='k', lw=0.8, ls='--')
+plt.tight_layout()
+plt.show()
+```
+
+Every point on this curve generates *exactly the same* consumption and investment data.
+
+Moving down the curve (more negative $\sigma$, i.e. a stronger preference for robustness) requires a *lower* discount factor $\hat\beta$ to keep quantities unchanged.
+
+This is why consumption and savings data alone cannot tell us how much the consumer fears model misspecification.
+
+## Estimation
+
+Section 4 of {cite}`HST_1999` turns the observational-equivalence result into an empirical strategy.
+
+Because the quantity data cannot pin down $\sigma$, the authors first estimate the $\sigma = 0$ version of the model, conditioning the likelihood **only on consumption and investment**, and then use the locus of {prf:ref}`prop:hst_oe` to trace out the family of $(\sigma, \hat\beta)$ pairs consistent with those estimates.
+
+Asset prices (the next section) break the tie.
+
+### The data and the likelihood
+
+The model is fit to U.S. post-war quarterly data, **1970:I–1996:III**.
+
+* **Consumption** is measured as nondurables plus services.
+* **Investment** is measured as durables plus gross private investment.
+
+Both series are deflated by the deterministic growth factor $1.0033^{t}$, so the model is fit to *detrended* data.
+
+The likelihood is Gaussian, built recursively (a Kalman filter), with the unobserved part of the initial state estimated using the methods of Hansen and Sargent.
+
+### Specification
+
+The preference shock is a constant, $b_t = \mu_b$, fixed at $\mu_b = 32$ — recall from {eq}`eq:hst_budget`-discussion that the *level* of $b_t$ does not affect the decision rules, only prices.
+
+The endowment is the sum of a **persistent** and a **transitory** component, each a second-order autoregression driven by orthogonal shocks,
+
+$$
+(1 - \phi_1 L)(1 - \phi_2 L)\, d^{*}_t = c_{d^{*}}\, w^{d^{*}}_t,
+$$ (eq:hst_dstar)
+
+$$
+(1 - \alpha_1 L)(1 - \alpha_2 L)\, \hat d_t = c_{\hat d}\, w^{\hat d}_t,
+$$ (eq:hst_dhat)
+
+with $d_t = \mu_d + d^{*}_t + \hat d_t$.
+
+A likelihood comparison (a gain from AR(1) to AR(2) but not beyond) led the authors to a second-order specification for the transitory component.
+
+The four parameters governing the endogenous dynamics are $(\gamma, \delta_k, \beta, \lambda)$.
+
+The depreciation factor is set to $\delta_k = 0.975$, and the permanent-income restriction $\beta R = 1$ — confirmed by the unrestricted estimates — is imposed with $\beta = 0.9971$, implying a $2.5\%$ annual real interest rate after the growth adjustment.
+
+The maximum-likelihood estimates (with habit persistence) are reproduced below — a version of Table 2 in {cite}`HST_1999`.
+
+| Parameter | Symbol | Estimate |
+|---|---|---|
+| Discount factor | $\beta$ | 0.997 |
+| Habit depreciation | $\delta_h$ | 0.682 |
+| Habit weight | $\lambda$ | 2.443 |
+| Transitory AR roots | $\alpha_1, \alpha_2$ | 0.813, 0.189 |
+| Persistent AR roots | $\phi_1, \phi_2$ | 0.998, 0.704 |
+| Endowment mean | $\mu_d$ | 13.710 |
+| Transitory shock scale | $c_{\hat d}$ | 0.155 |
+| Persistent shock scale | $c_{d^{*}}$ | 0.108 |
+
+The single most striking estimate is the autoregressive root $\phi_1 = 0.998$ of the persistent endowment component — a number all but indistinguishable from a unit root.
+
+### Impulse responses and the permanent income mechanism
+
+The persistence of a shock is what determines how strongly consumption reacts to it.
+
+Under the permanent income logic (with $\beta R = 1$ and, for transparency, *no* habit), consumption jumps on impact by the **annuity value** of the change in wealth and is a martingale thereafter,
+
+$$
+\Delta c = \left(1 - \frac{1}{R}\right) \sum_{j \geq 0} R^{-j}\, \psi_j,
+$$ (eq:hst_pi_mpc)
+
+where $\psi_j = \partial d_{t+j} / \partial w_t$ is the endowment's own impulse response.
+
+A near-permanent shock has a large present value and moves consumption a lot; a transitory shock has a small present value and barely moves it.
+
+Let's compute the endowment responses {eq}`eq:hst_dstar`–{eq}`eq:hst_dhat` and the implied consumption responses.
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Endowment and consumption impulse responses
+    name: fig-hst-irf
+---
+def ar2_irf(r1, r2, c, H):
+    "Impulse response of (1 - r1 L)(1 - r2 L) x = c w to a unit w shock."
+    psi = np.zeros(H)
+    psi[0] = c
+    if H > 1:
+        psi[1] = (r1 + r2) * psi[0]
+    for j in range(2, H):
+        psi[j] = (r1 + r2) * psi[j-1] - r1 * r2 * psi[j-2]
+    return psi
+
+H = 50
+disc = Rf**(-np.arange(H))                      # Rf = 1/beta from the previous cell
+
+psi_p = ar2_irf(0.998, 0.704, 0.108, H)         # persistent endowment d*
+psi_t = ar2_irf(0.813, 0.189, 0.155, H)         # transitory endowment d_hat
+
+# permanent income consumption response: a flat (martingale) jump
+dc_p = (1 - 1/Rf) * np.sum(disc * psi_p)
+dc_t = (1 - 1/Rf) * np.sum(disc * psi_t)
+
+print(f"persistent shock: consumption responds to "
+      f"{100*dc_p/psi_p[0]:.0f}% of the impact")
+print(f"transitory shock: consumption responds to "
+      f"{100*dc_t/psi_t[0]:.0f}% of the impact")
+
+fig, axes = plt.subplots(1, 2, figsize=(11, 4), sharey=True)
+for ax, psi, dc, title in [
+        (axes[0], psi_p, dc_p, 'Persistent endowment shock $d^{*}$'),
+        (axes[1], psi_t, dc_t, 'Transitory endowment shock $\\hat d$')]:
+    ax.plot(psi, lw=2, label='endowment response $\\psi_j$')
+    ax.axhline(dc, color='C3', ls='--', lw=2, label='consumption response $\\Delta c$')
+    ax.set_xlabel('horizon $j$ (quarters)')
+    ax.set_title(title)
+    ax.legend()
+axes[0].set_ylabel('response')
+plt.tight_layout()
+plt.show()
+```
+
+The contrast is the heart of the permanent income hypothesis tested by {cite}`Hall1978` and {cite}`Campbell1987`.
+
+Consumption tracks a large fraction of the **persistent** shock — whose near-unit root makes it almost permanent income — but only a sliver of the **transitory** shock, the bulk of which is saved and shows up as investment.
+
+```{note}
+With habit persistence ($\lambda > 0$) the consumption responses are no longer flat: they become hump-shaped, because services rather than consumption obey the martingale logic. The estimated $\lambda = 2.443$ and $\delta_h = 0.682$ imply economically important habit effects, and a likelihood-ratio comparison strongly rejects $\lambda = 0$. {cite}`HST_1999` compare these magnitudes with the habit estimates in the time-nonseparable preference literature.
+```
+
+## Asset pricing and the market price of risk
+
+Section 5 of {cite}`HST_1999` shows how the observationally-equivalent pairs that look identical in quantity data have *different* implications for asset prices.
+
+### Decentralization
+
+Following {cite}`Lucas1978`, we regard the robust (or risk-sensitive) planning solution as the allocation of a competitive economy populated by a large number of identical agents who trade securities.
+
+Equilibrium prices are the **shadow prices** that leave each agent content to consume the planner's allocation, treating it as an endowment process.
+
+The equilibrium law of motion for the state is
+
+$$
+x_{t+1} = A^{0} x_t + C w_{t+1},
+$$ (eq:hst_equil_lom)
+
+and the value function at the optimum is $U^{e}_t = x_t' \Omega x_t + \rho$.
+
+To support the robust allocation, prices must be computed using the **same** pessimistic, distorted beliefs that rationalize the planner's choices.
+
+This is where risk-sensitivity ($\sigma < 0$) leaves its fingerprint on prices even though, by observational equivalence, it leaves no trace in quantities.
+
+### The twisting operator and distorted beliefs
+
+Pricing a claim to next period's *utility* is trivial under the von Neumann–Morgenstern specification but nontrivial under risk-sensitivity.
+
+The key object is the **twisting operator**
+
+$$
+\mathcal{T}_t U_{t+1} \equiv \frac{\mathbb{E}(V_{t+1} U_{t+1} \mid J_t)}{\mathbb{E}(V_{t+1} \mid J_t)},
+\qquad
+V_{t+1} \equiv \exp\!\left(\frac{\sigma U^{e}_{t+1}}{2}\right),
+$$ (eq:hst_twist)
+
+which re-weights outcomes by the exponential of equilibrium continuation utility.
+
+It satisfies the subgradient inequality
+
+$$
+\mathcal{R}_t(U_{t+1}) - \mathcal{R}_t(U^{e}_{t+1}) \leq \mathcal{T}_t U_{t+1} - \mathcal{T}_t U^{e}_{t+1},
+$$ (eq:hst_subgrad)
+
+so $\mathcal{T}_t$ behaves like a *distorted* conditional expectation — exactly the change of measure used to price derivative claims in {cite}`EpsteinWang1994`.
+
+Concretely, $\mathcal{T}_t$ is the ordinary conditional expectation under a **distorted transition law**
+
+$$
+x_{t+1} = \hat A x_t + \hat C w_{t+1},
+\qquad
+\hat C \hat C' = C (I - \sigma C' \Omega C)^{-1} C',
+$$ (eq:hst_pricing_lom)
+
+with $\hat A$ given by {eq}`eq:hst_D`-style risk corrections.
+
+Because $\sigma < 0$ and $\Omega$ is negative semidefinite, $(I - \sigma C'\Omega C)^{-1}$ exceeds the identity: the pricing measure assigns a **pessimistically shifted conditional mean** *and* an **inflated conditional variance** to next period's state.
+
+These two distortions are precisely what generate risk premia.
+
+### Multi-period claims and the one-period stochastic discount factor
+
+Prices of streams are built by iterating the operator.
+
+Define $\mathcal{S}_{t,\tau} = \mathcal{T}_t \mathcal{T}_{t+1} \cdots \mathcal{T}_{t+\tau-1}$. The time-$t$ price of a claim to the consumption stream $\{c_{t+\tau}\}$ is then a discounted sum of twisted marginal utilities, and the one-period security with payoff $p_{t+1}$ is priced as
+
+$$
+q_t = \mathcal{T}_t\!\left\{ \beta \frac{\mathcal{M}^{c}_{t+1}}{\mathcal{M}^{c}_{t}}\, p_{t+1} \right\}
+    = \mathbb{E}\!\left( m_{t+1,t}\, p_{t+1} \mid J_t \right),
+$$ (eq:hst_oneperiod)
+
+where $\mathcal{M}^{c}_t$ is the marginal utility of consumption and $m_{t+1,t}$ is the one-period **stochastic discount factor** (intertemporal marginal rate of substitution).
+
+Under risk-sensitivity, $m_{t+1,t}$ factors into two pieces,
+
+$$
+m_{t+1,t} = m^{f}_{t+1,t}\; m^{r}_{t+1,t},
+$$ (eq:hst_sdf)
+
+where
+
+$$
+m^{f}_{t+1,t} = \beta \frac{\mathcal{M}^{c}_{t+1}}{\mathcal{M}^{c}_{t}}
+$$
+
+is the *familiar* intertemporal marginal rate of substitution (the only term present when $\sigma = 0$), and
+
+$$
+m^{r}_{t+1,t} = \frac{\exp(\sigma U^{e}_{t+1}/2)}{\mathbb{E}[\exp(\sigma U^{e}_{t+1}/2) \mid J_t]}
+$$ (eq:hst_mr)
+
+is a multiplicative adjustment with conditional mean one. The factor {eq}`eq:hst_mr` is the source of the extra risk premia.
+
+### The market price of risk
+
+Under the *robustness* interpretation, the same multiplicative factor equals a **likelihood ratio** between the worst-case and reference shock densities,
+
+$$
+m^{u}_{t+1,t} = \frac{\exp[-(w_{t+1} - \hat v_t)'(w_{t+1} - \hat v_t)/2]}{\exp(-w_{t+1}' w_{t+1}/2)},
+$$ (eq:hst_mu)
+
+where $\hat v_t$ is the worst-case conditional-mean distortion chosen by the malevolent player.
+
+A direct calculation gives
+
+$$
+\mathbb{E}_t(m^{u}_{t+1,t}) = 1,
+\qquad
+\mathbb{E}_t\big[(m^{u}_{t+1,t})^2\big] = \exp(\hat v_t' \hat v_t),
+$$
+
+so that, for small distortions,
+
+$$
+\operatorname{std}_t(m^{u}_{t+1,t}) = \big[\exp(\hat v_t' \hat v_t) - 1\big]^{1/2} \approx |\hat v_t|.
+$$ (eq:hst_mpr)
+
+The **market price of risk** — the maximal Sharpe ratio attainable, equal to $\operatorname{std}_t(m_{t+1,t}) / \mathbb{E}_t(m_{t+1,t})$ along the efficient frontier (the {cite}`Hansen_Jagannathan_1991` bound) — is therefore approximately equal to the **magnitude of the worst-case distortion** $|\hat v_t|$.
+
+This is the paper's punchline: a conditional-mean misspecification of $x\%$ of a unit-norm direction raises the market price of risk by roughly $x/100$.
+
+A *small*, statistically-hard-to-detect doubt about the model can generate the *large* price of risk seen in the data.
+
+Let's check the key identity {eq}`eq:hst_mpr` by Monte Carlo.
+
+```{code-cell} ipython3
+rng = np.random.default_rng(12345)
+
+def mpr_check(v_hat, n=2_000_000):
+    """
+    Simulate the worst-case likelihood ratio m^u and compare its
+    conditional standard deviation to |v_hat|.
+    """
+    k = len(v_hat)
+    w = rng.standard_normal((n, k))
+    # log likelihood ratio of N(v_hat, I) relative to N(0, I)
+    log_mu = w @ v_hat - 0.5 * v_hat @ v_hat
+    mu = np.exp(log_mu)
+    return mu.mean(), mu.std(), np.linalg.norm(v_hat)
+
+print(f"{'|v_hat|':>10}{'E[m^u]':>12}{'std(m^u)':>12}{'approx |v_hat|':>16}")
+for scale in [0.05, 0.10, 0.20]:
+    v_hat = np.array([scale, 0.0])      # distortion in one direction
+    mean, std, norm_v = mpr_check(v_hat)
+    print(f"{norm_v:10.3f}{mean:12.4f}{std:12.4f}{norm_v:16.3f}")
+```
+
+The simulated conditional mean of $m^{u}$ is one, and its conditional standard deviation tracks $|\hat v_t|$ closely — confirming {eq}`eq:hst_mpr`.
+
+A 10% distortion delivers a market price of risk near 0.10.
+
+## A risk-sensitive regulator
+
+To see the robust decision rules and worst-case shocks concretely, we solve the recursive risk-sensitive control problem {eq}`eq:hst_lom`–{eq}`eq:hst_recursion` directly.
+
+Guess a value function $W(x) = x' \Omega x + \rho$ with $\Omega$ negative semidefinite. The risk-sensitive operator {eq}`eq:hst_R` acting on this quadratic form introduces the **risk adjustment**
+
+$$
+\mathcal{D}(\Omega) = \Omega + \sigma \Omega C (I - \sigma C' \Omega C)^{-1} C' \Omega,
+$$ (eq:hst_D)
+
+so that, replacing $\Omega$ by $\mathcal{D}(\Omega)$, the Bellman equation becomes an ordinary linear-quadratic one.
+
+Iterating
+
+$$
+F = (Q - \beta B' \mathcal{D} B)^{-1}(N - \beta B' \mathcal{D} A),
+$$
+
+$$
+\Omega \leftarrow -R - F' Q F + (F'N + N'F) + \beta (A - BF)' \mathcal{D} (A - BF),
+$$
+
+to a fixed point yields the optimal rule $i_t = -F x_t$.
+
+The worst-case mean distortion is then linear in the state, $\hat v_t = G x_t$, with
+
+$$
+G = \sigma (I - \sigma C' \Omega C)^{-1} C' \Omega (A - B F).
+$$ (eq:hst_G)
+
+When $\sigma = 0$ we have $\mathcal{D}(\Omega) = \Omega$ and $G = 0$, recovering the standard regulator.
+
+```{code-cell} ipython3
+def solve_rslq(A, B, C, Q, R, beta, sigma, N=None,
+               tol=1e-12, max_iter=100_000):
+    """
+    Solve the recursive risk-sensitive LQ problem
+
+        U_t = -(x'R x + i'Q i + 2 i'N x) + beta R_t(U_{t+1})
+        x_{t+1} = A x_t + B i_t + C w_{t+1}
+
+    Returns the feedback rule F (i = -F x), the value matrix Omega,
+    the closed-loop matrix A - B F, and the worst-case loading G (v = G x).
+    """
+    A, B, C, Q, R = map(np.atleast_2d, (A, B, C, Q, R))
+    n, kw = A.shape[0], C.shape[1]
+    if N is None:
+        N = np.zeros((B.shape[1], n))
+    Omega = -np.eye(n)                # negative-definite start
+    Iw = np.eye(kw)
+    for it in range(max_iter):
+        M = Iw - sigma * C.T @ Omega @ C
+        D = Omega + sigma * Omega @ C @ np.linalg.solve(M, C.T @ Omega)
+        F = np.linalg.solve(Q - beta * B.T @ D @ B, N - beta * B.T @ D @ A)
+        Acl = A - B @ F
+        Omega_new = (-R - F.T @ Q @ F + (F.T @ N + N.T @ F)
+                     + beta * Acl.T @ D @ Acl)
+        if np.max(np.abs(Omega_new - Omega)) < tol:
+            Omega = Omega_new
+            break
+        Omega = Omega_new
+    M = Iw - sigma * C.T @ Omega @ C
+    G = sigma * np.linalg.solve(M, C.T @ Omega @ (A - B @ F))
+    return F, Omega, A - B @ F, G
+```
+
+We first verify that at $\sigma = 0$ our solver reproduces QuantEcon's ordinary LQ regulator.
+
+```{code-cell} ipython3
+# a stylized stable regulator with a persistent shock
+A = np.array([[0.9, 0.0],
+              [0.0, 0.8]])
+B = np.array([[1.0],
+              [0.0]])
+C = np.array([[0.3],
+              [0.2]])
+Q = np.array([[1.0]])
+R = np.eye(2)
+beta = 0.95
+
+# QuantEcon ordinary LQ
+lq = qe.LQ(Q, R, A, B, C=C, beta=beta)
+P, F_qe, d = lq.stationary_values()
+
+# our solver at sigma = 0
+F0, Omega0, Acl0, G0 = solve_rslq(A, B, C, Q, R, beta, sigma=0.0)
+
+print("QuantEcon LQ feedback rule F :", F_qe.flatten())
+print("solve_rslq  feedback rule F  :", F0.flatten())
+print("max |difference|             :", np.max(np.abs(F0 - F_qe)))
+```
+
+The two agree to machine precision.
+
+Now we crank up the preference for robustness and inspect how the control rule and the worst-case shock respond.
+
+```{code-cell} ipython3
+sigmas = [0.0, -0.3, -0.6]
+
+print(f"{'sigma':>7}{'F[0]':>10}{'F[1]':>10}{'G[0]':>10}{'G[1]':>10}")
+for sigma in sigmas:
+    F, Omega, Acl, G = solve_rslq(A, B, C, Q, R, beta, sigma)
+    print(f"{sigma:7.2f}{F[0,0]:10.4f}{F[0,1]:10.4f}{G[0,0]:10.4f}{G[0,1]:10.4f}")
+```
+
+As $\sigma$ becomes more negative:
+
+* the feedback gain $F$ grows — the robust agent reacts *more aggressively* to the state, since it fears the worst-case shock will amplify deviations;
+* the worst-case loading $G$ moves away from zero — the malevolent player pushes the shock in the direction that hurts most.
+
+Finally, let's see the worst-case distortion in action by simulating the controlled state under the *reference* model while displaying the conditional mean distortion $\hat v_t = G x_t$ the robust agent is guarding against.
+
+```{code-cell} ipython3
+---
+mystnb:
+  figure:
+    caption: Controlled state and worst-case distortion
+    name: fig-hst-sim
+---
+def simulate(A, B, C, F, G, T=60, seed=0):
+    rng = np.random.default_rng(seed)
+    n, kw = A.shape[0], C.shape[1]
+    x = np.zeros((T + 1, n))
+    v = np.zeros((T, kw))
+    x[0] = np.array([1.0, 1.0])          # initial deviation
+    for t in range(T):
+        v[t] = (G @ x[t]).flatten()
+        w = rng.standard_normal(kw)
+        x[t + 1] = A @ x[t] + (B @ (-F @ x[t])).flatten() + (C @ w).flatten()
+    return x, v
+
+fig, axes = plt.subplots(1, 2, figsize=(11, 4))
+
+for sigma, color in zip([0.0, -0.6], ['C0', 'C3']):
+    F, Omega, Acl, G = solve_rslq(A, B, C, Q, R, beta, sigma)
+    x, v = simulate(A, B, C, F, G, seed=42)
+    axes[0].plot(x[:, 0], color=color, lw=2, label=f'$\\sigma={sigma}$')
+    axes[1].plot(v[:, 0], color=color, lw=2, label=f'$\\sigma={sigma}$')
+
+axes[0].set_title('Controlled state $x_{1,t}$')
+axes[0].set_xlabel('$t$')
+axes[0].axhline(0, color='k', lw=0.8, ls='--')
+axes[0].legend()
+
+axes[1].set_title(r'Worst-case mean distortion $\hat v_{1,t}$')
+axes[1].set_xlabel('$t$')
+axes[1].axhline(0, color='k', lw=0.8, ls='--')
+axes[1].legend()
+
+plt.tight_layout()
+plt.show()
+```
+
+For $\sigma = 0$ the distortion is identically zero — the agent fully trusts the model.
+
+For $\sigma < 0$ the robust agent's decisions are shaped by a nonzero worst-case distortion that feeds back on the state, exactly the mechanism that — through {eq}`eq:hst_mpr` — inflates the market price of risk.
+
+## Exercises
+
+```{exercise}
+:label: hst_ex1
+
+The Gaussian formula {eq}`eq:hst_R_gauss` says that for $U_{t+1} \sim \mathcal{N}(\mu, s^2)$,
+
+$$
+\mathcal{R}_t(U_{t+1}) = \mu + \frac{\sigma}{4} s^2 .
+$$
+
+Derive this result directly from the definition {eq}`eq:hst_R`.
+
+*Hint:* use the moment generating function of a normal random variable, $\mathbb{E}[\exp(a U)] = \exp(a\mu + a^2 s^2 / 2)$.
+```
+
+```{solution-start} hst_ex1
+:class: dropdown
+```
+
+Start from the definition with $a = \sigma/2$:
+
+$$
+\mathcal{R}_t(U_{t+1}) = \frac{2}{\sigma} \log \mathbb{E}\!\left[\exp\!\left(\frac{\sigma}{2} U_{t+1}\right)\right].
+$$
+
+Since $U_{t+1} \sim \mathcal{N}(\mu, s^2)$, the moment generating function gives
+
+$$
+\mathbb{E}\!\left[\exp\!\left(\frac{\sigma}{2} U_{t+1}\right)\right]
+= \exp\!\left(\frac{\sigma}{2}\mu + \frac{1}{2}\frac{\sigma^2}{4} s^2\right).
+$$
+
+Taking logs and multiplying by $2/\sigma$,
+
+$$
+\mathcal{R}_t(U_{t+1})
+= \frac{2}{\sigma}\left(\frac{\sigma}{2}\mu + \frac{\sigma^2}{8} s^2\right)
+= \mu + \frac{\sigma}{4} s^2 .
+$$
+
+Letting $\sigma \to 0$ recovers $\mathcal{R}_t(U_{t+1}) = \mu = \mathbb{E}_t U_{t+1}$, as expected.
+
+```{solution-end}
+```
+
+```{exercise}
+:label: hst_ex2
+
+The observational-equivalence locus has a left endpoint $\underline{\sigma}$: the most negative $\sigma$ for which {eq}`eq:hst_Omega_scalar` has a real solution.
+
+Using the code from the lecture, find $\underline{\sigma}$ numerically for `theta2 = 0.01` and for `theta2 = 0.02`, and explain why a larger $\theta^2$ shrinks the admissible range of $\sigma$.
+
+*Hint:* the square root is real when the discriminant $(\beta - 1 + \sigma\theta^2)^2 + 4\sigma\theta^2 \geq 0$, evaluated at the relevant $\hat\beta$.
+```
+
+```{solution-start} hst_ex2
+:class: dropdown
+```
+
+The boundary $\underline{\sigma}$ is the most negative $\sigma$ at which a valid $\hat\beta$ can still be found. We scan $\sigma$ downward and stop when `beta_hat` can no longer return a real solution.
+
+```{code-cell} ipython3
+def sigma_underbar(theta2, Rf, grid=np.linspace(-1e-6, -5e-4, 5000)):
+    last_ok = 0.0
+    for sg in grid:
+        try:
+            b = beta_hat(sg, theta2, Rf)
+            if np.isnan(Omega_scalar(b, sg, theta2)):
+                break
+            last_ok = sg
+        except ValueError:
+            break
+    return last_ok
+
+for theta2 in [0.01, 0.02]:
+    sb = sigma_underbar(theta2, Rf)
+    print(f"theta2 = {theta2}:  sigma_underbar ≈ {sb:.3e}")
+```
+
+A larger $\theta^2$ means the marginal-utility martingale {eq}`eq:hst_mu_rw` carries a bigger innovation variance, so each unit of $|\sigma|$ generates a larger risk adjustment. The discriminant in {eq}`eq:hst_Omega_scalar`, which contains the term $4\sigma\theta^2 < 0$, turns negative at a *smaller* $|\sigma|$. Hence the admissible range $(\underline{\sigma}, 0)$ shrinks as $\theta^2$ grows.
+
+```{solution-end}
+```
+
+```{exercise}
+:label: hst_ex3
+
+The market-price-of-risk approximation {eq}`eq:hst_mpr` states that
+$\operatorname{std}_t(m^u_{t+1,t}) = [\exp(\hat v_t'\hat v_t) - 1]^{1/2} \approx |\hat v_t|$.
+
+For the risk-sensitive regulator solved in the lecture (with $A$, $B$, $C$, $Q$, $R$, $\beta$ as given there), compute the *exact* market price of risk $[\exp(\hat v_t' \hat v_t) - 1]^{1/2}$ as a function of $\sigma$, evaluated at the state $x = (1, 1)'$, and plot it together with the linear approximation $|\hat v_t|$.
+
+Comment on the range of $\sigma$ over which the approximation is accurate.
+```
+
+```{solution-start} hst_ex3
+:class: dropdown
+```
+
+We solve the regulator for a grid of $\sigma$ values, evaluate $\hat v_t = G x$ at $x = (1,1)'$, and compare the exact and approximate market prices of risk.
+
+```{code-cell} ipython3
+x_eval = np.array([1.0, 1.0])
+sigma_grid = np.linspace(-1.0, 0.0, 80)
+
+exact, approx = [], []
+for sigma in sigma_grid:
+    F, Omega, Acl, G = solve_rslq(A, B, C, Q, R, beta, sigma)
+    v_hat = (G @ x_eval).flatten()
+    nv2 = v_hat @ v_hat
+    exact.append(np.sqrt(np.exp(nv2) - 1))
+    approx.append(np.sqrt(nv2))           # |v_hat|
+
+fig, ax = plt.subplots(figsize=(7, 5))
+ax.plot(sigma_grid, exact, lw=2, label=r'exact $[\exp(\hat v\,\!^\prime \hat v)-1]^{1/2}$')
+ax.plot(sigma_grid, approx, lw=2, ls='--', label=r'approximation $|\hat v|$')
+ax.set_xlabel(r'risk-sensitivity $\sigma$')
+ax.set_ylabel('market price of risk')
+ax.set_title('exact vs. approximate market price of risk')
+ax.legend()
+plt.tight_layout()
+plt.show()
+```
+
+The two curves are nearly indistinguishable for small $|\hat v_t|$ (i.e. $\sigma$ close to zero) because $\exp(z) - 1 \approx z$ when $z$ is small.
+
+They separate only when the preference for robustness — and hence the worst-case distortion — becomes large.
+
+This is precisely the regime {cite}`HST_1999` emphasize: *small*, hard-to-detect distortions map almost linearly into the market price of risk.
+
+```{solution-end}
+```
+
+## Related lectures
+
+This lecture connects to several others in the QuantEcon collection.
+
+The underlying consumption-smoothing economics is developed in {doc}`perm_income` and {doc}`perm_income_cons`, and the linear-quadratic control machinery is laid out in {doc}`lqcontrol`.
+
+The reinterpretation of the market price of risk as a *price of model uncertainty*, calibrated through detection-error probabilities, is the subject of {cite}`BHS_2009`; the broader semigroup treatment of robustness, pricing, and model detection is in {cite}`AHS_2003`.
+
+Both build directly on {cite}`HST_1999`.
diff --git a/lectures/theil_1.md b/lectures/theil_1.md
index 1af931bb9..1cda048ff 100644
--- a/lectures/theil_1.md
+++ b/lectures/theil_1.md
@@ -73,7 +73,7 @@ That volume collected early papers on rational expectations modeling and econome
 
 ## A central problem of empirical economics
 
-To set the stage, {cite:t}`lucas1981rational` stated the central question for empirical economics that had been posed by Leonid Hurwicz ({cite}`Hurwicz:1962`,{cite}`Hurwicz:1966`):
+To set the stage, {cite:t}`lucas1981rational` stated the central question for empirical economics that had been posed by Leonid Hurwicz ({cite}`Hurwicz:1962`):
 
  *  Given observations on an agent's behavior in a particular economic environment, what can we infer about how that behavior *would have differed* had the environment been altered?