diff --git a/lectures/_static/quant-econ.bib b/lectures/_static/quant-econ.bib
index 82f5cc7ec..2701fe981 100644
--- a/lectures/_static/quant-econ.bib
+++ b/lectures/_static/quant-econ.bib
@@ -1,3 +1,169 @@
+@article{BreedenLitzenberger1978,
+ author = {Breeden, Douglas T. and Litzenberger, Robert H.},
+ title = {Prices of State-Contingent Claims Implicit in Option Prices},
+ journal = {Journal of Business},
+ volume = {51},
+ number = {4},
+ pages = {621--651},
+ year = {1978},
+ doi = {10.1086/296025}
+}
+
+@article{CarrYu2012,
+ author = {Carr, Peter and Yu, Jiming},
+ title = {Risk, Return, and {Ross} Recovery},
+ journal = {Journal of Derivatives},
+ volume = {20},
+ number = {1},
+ pages = {38--59},
+ year = {2012},
+ doi = {10.3905/jod.2012.20.1.038}
+}
+
+@article{BlackScholes1973,
+ author = {Black, Fischer and Scholes, Myron},
+ title = {The Pricing of Options and Corporate Liabilities},
+ journal = {Journal of Political Economy},
+ volume = {81},
+ number = {3},
+ pages = {637--654},
+ year = {1973},
+ doi = {10.1086/260062}
+}
+
+@article{Merton1973,
+ author = {Merton, Robert C.},
+ title = {Theory of Rational Option Pricing},
+ journal = {Bell Journal of Economics and Management Science},
+ volume = {4},
+ number = {1},
+ pages = {141--183},
+ year = {1973},
+ doi = {10.2307/3003143}
+}
+
+@article{CoxRossRubinstein1979,
+ author = {Cox, John C. and Ross, Stephen A. and Rubinstein, Mark},
+ title = {Option Pricing: A Simplified Approach},
+ journal = {Journal of Financial Economics},
+ volume = {7},
+ number = {3},
+ pages = {229--263},
+ year = {1979},
+ doi = {10.1016/0304-405X(79)90015-1}
+}
+
+@article{JackwerthRubinstein1996,
+ author = {Jackwerth, Jens Carsten and Rubinstein, Mark},
+ title = {Recovering Probability Distributions from Option Prices},
+ journal = {Journal of Finance},
+ volume = {51},
+ number = {5},
+ pages = {1611--1631},
+ year = {1996},
+ doi = {10.1111/j.1540-6261.1996.tb05219.x}
+}
+
+@article{Weitzman2007,
+ author = {Weitzman, Martin L.},
+ title = {Subjective Expectations and Asset-Return Puzzles},
+ journal = {American Economic Review},
+ volume = {97},
+ number = {4},
+ pages = {1102--1130},
+ year = {2007},
+ doi = {10.1257/aer.97.4.1102}
+}
+
+@article{BorovickaHansenScheinkman2016,
+ author = {Borovička, Jaroslav and Hansen, Lars Peter and Scheinkman, José A.},
+ title = {Misspecified Recovery},
+ journal = {Journal of Finance},
+ volume = {71},
+ number = {6},
+ pages = {2493--2544},
+ year = {2016},
+ doi = {10.1111/jofi.12404}
+}
+
+@article{Ross2015,
+ author = {Ross, Stephen A.},
+ title = {The Recovery Theorem},
+ journal = {Journal of Finance},
+ volume = {70},
+ number = {2},
+ pages = {615--648},
+ year = {2015},
+ doi = {10.1111/jofi.12092}
+}
+
+@article{HansenScheinkman2009,
+ author = {Hansen, Lars Peter and Scheinkman, José A.},
+ title = {Long-Term Risk: An Operator Approach},
+ journal = {Econometrica},
+ volume = {77},
+ number = {1},
+ pages = {177--234},
+ year = {2009},
+ doi = {10.3982/ECTA6761}
+}
+
+@article{AlvarezJermann2005,
+ author = {Alvarez, Fernando and Jermann, Urban J.},
+ title = {Using Asset Prices to Measure the Persistence of the Marginal Utility of Wealth},
+ journal = {Econometrica},
+ volume = {73},
+ number = {6},
+ pages = {1977--2016},
+ year = {2005},
+ doi = {10.1111/j.1468-0262.2005.00643.x}
+}
+
+@article{BakshiChabiYo2012,
+ author = {Bakshi, Gurdip and Chabi-Yo, Fousseni},
+ title = {Variance Bounds on the Permanent and Transitory Components of Stochastic Discount Factors},
+ journal = {Journal of Financial Economics},
+ volume = {105},
+ number = {1},
+ pages = {191--208},
+ year = {2012},
+ doi = {10.1016/j.jfineco.2012.01.003}
+}
+
+@article{BackusGregoryZin1989,
+ author = {Backus, David K. and Gregory, Allan W. and Zin, Stanley E.},
+ title = {Risk Premiums in the Term Structure: Evidence from Artificial Economies},
+ journal = {Journal of Monetary Economics},
+ volume = {24},
+ number = {3},
+ pages = {371--399},
+ year = {1989},
+ doi = {10.1016/0304-3932(89)90027-5}
+}
+
+@article{Hansen2012,
+ author = {Hansen, Lars Peter},
+ title = {Dynamic Valuation Decomposition within Stochastic Economies},
+ journal = {Econometrica},
+ volume = {80},
+ number = {3},
+ pages = {911--967},
+ year = {2012},
+ note = {Fisher--Schultz Lecture},
+ doi = {10.3982/ECTA8070}
+}
+
+@article{BackusChernovZin2014,
+ author = {Backus, David K. and Chernov, Mikhail and Zin, Stanley E.},
+ title = {Sources of Entropy in Representative Agent Models},
+ journal = {Journal of Finance},
+ volume = {69},
+ number = {1},
+ pages = {51--99},
+ year = {2014},
+ doi = {10.1111/jofi.12090}
+}
+
@article{Borovicka2020,
author = {Borovička, Jaroslav},
title = {Survival and Long-Run Dynamics with Heterogeneous Beliefs under Recursive Preferences},
@@ -6,7 +172,8 @@ @article{Borovicka2020
number = {1},
pages = {206--251},
year = {2020},
- publisher = {University of Chicago Press}
+ publisher = {University of Chicago Press},
+ doi = {10.1086/704072}
}
@article{Sandroni2000Markets,
@@ -613,28 +780,34 @@ @book{Sargent_Stachurski_2025
place={Cambridge},
title={Dynamic Programming: Finite States},
publisher={Cambridge University Press},
- author={Sargent, Thomas J and Stachurski, John},
+ author={Sargent, Thomas J. and Stachurski, John},
year={2025}
}
-@incollection{slutsky:1927,
- address = {Moscow},
- author = {Slutsky, Eugen},
- booktitle = {Problems of Economic Conditions},
- date-added = {2021-02-16 14:44:03 -0600},
- date-modified = {2021-02-16 14:44:03 -0600},
- publisher = {The Conjuncture Institute},
- title = {The Summation of Random Causes as the Source of Cyclic Processes},
- volume = {3},
- year = {1927}
+@book{Sargent_Stachurski_2024,
+ place={Cambridge},
+ series={Structural Analysis in the Social Sciences},
+ title={Economic Networks: Theory and Computation},
+ publisher={Cambridge University Press},
+ author={Sargent, Thomas J. and Stachurski, John},
+ year={2024}
+}
+
+@article{slutsky1937,
+ author = {Slutzky, Eugen},
+ title = {The Summation of Random Causes as the Source of Cyclic Processes},
+ journal = {Econometrica},
+ volume = {5},
+ number = {2},
+ pages = {105--146},
+ year = {1937},
+ doi = {10.2307/1907241}
}
@incollection{frisch33,
- author = {Ragar Frisch},
+ author = {Ragnar Frisch},
booktitle = {Economic Essays in Honour of Gustav Cassel},
- date-added = {2015-01-09 21:08:15 +0000},
- date-modified = {2015-01-09 21:08:15 +0000},
- pages = {171-205},
+ pages = {171--203},
publisher = {Allen and Unwin},
title = {Propagation Problems and Impulse Problems in Dynamic Economics},
year = {1933}
@@ -814,8 +987,6 @@ @incollection{Hurwicz:1962
address = {Stanford, CA},
author = {Hurwicz, Leonid},
booktitle = {Logic, Methodology and Philosophy of Science},
- date-added = {2014-12-26 17:45:57 +0000},
- date-modified = {2022-01-09 19:40:37 -0600},
pages = {232-239},
publisher = {Stanford University Press},
title = {On the Structural Form of Interdependent Systems},
@@ -855,8 +1026,8 @@ @article{wecker1979predicting
}
@book{Chadhuri_Mukerjee_88,
- title = {Randomized Response: Theory and Technique},
- author = {A Chadhuri and R Mukerjee},
+ title = {Randomized Response: Theory and Techniques},
+ author = {Chaudhuri, A. and Mukerjee, R.},
year = {1988},
publisher = {Marcel Dekker},
address = {New York}
@@ -979,8 +1150,8 @@ @article{apostolakis1990
}
@unpublished{Greenfield_Sargent_1993,
- author = {Moses A Greenfield and Thomas J Sargent},
- title = {A Probabilistic Analysis of a Catastrophic Transuranic Waste Hoise Accident at the WIPP},
+ author = {Greenfield, Moses A. and Sargent, Thomas J.},
+ title = {A Probabilistic Analysis of a Catastrophic Transuranic Waste Hoist Accident at the WIPP},
year = {1993},
month = {June},
note = {Environmental Evaluation Group, Albuquerque, New Mexico},
@@ -1007,12 +1178,12 @@ @article{Groves_73
}
@article{Clarke_71,
- author = {Clarke, E.},
- year = { 1971},
+ author = {Clarke, Edward H.},
+ year = {1971},
title = {Multipart pricing of public goods},
journal = {Public Choice},
- volume = {8},
- pages = {19-33}
+ volume = {11},
+ pages = {17--33}
}
@article{Vickrey_61,
@@ -1066,8 +1237,6 @@ @article{tu_Rowley
@book{Knight:1921,
author = {Knight, Frank H.},
- date-added = {2020-08-20 10:29:34 -0500},
- date-modified = {2020-08-20 11:10:35 -0500},
keywords = {climate,modeling},
publisher = {Houghton Mifflin},
title = {{Risk, Uncertainty, and Profit}},
@@ -1076,12 +1245,10 @@ @book{Knight:1921
@article{MaccheroniMarinacciRustichini:2006b,
author = {Maccheroni, Fabio and Marinacci, Massimo and Rustichini, Aldo},
- date-added = {2021-05-19 08:04:27 -0500},
- date-modified = {2021-05-19 08:04:27 -0500},
journal = {Econometrica},
keywords = {*file-import-17-01-11},
number = {6},
- pages = {1147--1498},
+ pages = {1447--1498},
title = {{Ambiguity Aversion, Robustness, and the Variational Representation of Preferences}},
volume = {74},
year = {2006}
@@ -1089,8 +1256,6 @@ @article{MaccheroniMarinacciRustichini:2006b
@article{GilboaSchmeidler:1989,
author = {Gilboa, Itzhak and Schmeidler, David},
- date-added = {2020-08-10 09:11:02 -0500},
- date-modified = {2020-08-10 09:11:02 -0500},
journal = {Journal of Mathematical Economics},
keywords = {climate,modeling},
mendeley-groups = {nsfbib},
@@ -1225,8 +1390,8 @@ @book{Galichon_2016
}
@book{DMD_book,
- title = {Dynamic mode decomposition: data-driven modeling of complex systems},
- author = {J. N. Kutz and S. L. Brunton and B. W, Brunton and J. L. Proctor},
+ title = {Dynamic Mode Decomposition: Data-Driven Modeling of Complex Systems},
+ author = {Kutz, J. Nathan and Brunton, Steven L. and Brunton, Bingni W. and Proctor, Joshua L.},
year = {2016},
publisher = {SIAM}
}
@@ -1240,14 +1405,14 @@ @book{DDSE_book
}
@book{bertsimas_tsitsiklis1997,
- author = {Bertsimas, D. & Tsitsiklis, J. N.},
- title = {{Introduction to linear optimization}},
+ author = {Bertsimas, Dimitris and Tsitsiklis, John N.},
+ title = {{Introduction to Linear Optimization}},
publisher = {Athena Scientific},
year = {1997}
}
@book{hu_guo2018,
- author = {Hu, Y. & Guo, Y.},
+ author = {Hu, Yunquan and Guo, Yaohuang},
title = {{Operations research}},
publisher = {Tsinghua University Press},
edition = {5th},
@@ -1256,9 +1421,7 @@ @book{hu_guo2018
@article{definetti,
author = {Bruno de Finetti},
- date-added = {2014-12-26 17:45:57 +0000},
- date-modified = {2014-12-26 17:45:57 +0000},
- journal = {Annales de l'Institute Henri Poincare'},
+ journal = {Annales de l'Institut Henri Poincaré},
note = {English translation in Kyburg and Smokler (eds.), {\it Studies in Subjective Probability}, Wiley, New York, 1964},
pages = {1 - 68},
title = {La Prevision: Ses Lois Logiques, Ses Sources Subjectives},
@@ -1505,11 +1668,12 @@ @article{benhabib2018skewed
year = {2018}
}
-@article{pareto1896cours,
+@book{pareto1896cours,
title = {Cours d'{\'e}conomie politique},
- author = {Vilfredo, Pareto},
- journal = {Rouge, Lausanne},
- volume = {2},
+ author = {Pareto, Vilfredo},
+ publisher = {F. Rouge},
+ address = {Lausanne},
+ volume = {1},
year = {1896}
}
@@ -1662,7 +1826,7 @@ @article{Samuelson1939
title = {Interactions Between the Multiplier Analysis
and the Principle of Acceleration},
author = {Samuelson, Paul A.},
- journal = {Review of Economic Studies},
+ journal = {The Review of Economics and Statistics},
volume = {21},
number = {2},
year = {1939},
@@ -1697,7 +1861,7 @@ @incollection{Koopmans
title = {On the Concept of Optimal Economic Growth},
booktitle = {The Economic Approach to Development Planning},
address = { Chicago},
- publilsher = {Rand McNally},
+ publisher = {Rand McNally},
pages = {225-287}
}
@@ -1789,29 +1953,33 @@ @article{Jovanovic1979
publisher = {The University of Chicago Press}
}
-@article{Deneckere1992,
- title = {Cyclical and chaotic behavior in a dynamic equilibrium model, with implications for fiscal policy},
- author = {Deneckere, Raymond J and Judd, Kenneth L},
- journal = {Cycles and chaos in economic equilibrium},
+@incollection{Deneckere1992,
+ title = {Cyclical and Chaotic Behavior in a Dynamic Equilibrium Model, with Implications for Fiscal Policy},
+ author = {Deneckere, Raymond J. and Judd, Kenneth L.},
+ editor = {Benhabib, Jess},
+ booktitle = {Cycles and Chaos in Economic Equilibrium},
pages = {308--329},
year = {1992},
- publisher = {Princeton University Press}
+ publisher = {Princeton University Press},
+ address = {Princeton}
}
@article{Judd1985,
title = {On the performance of patents},
author = {Judd, Kenneth L},
journal = {Econometrica},
+ volume = {53},
+ number = {3},
pages = {567--585},
- year = {1985},
- publisher = {JSTOR}
+ year = {1985}
}
@book{Helpman1985,
- title = {Market structure and international trade},
- author = {Helpman, Elhanan and Krugman, Paul},
- year = {1985},
- publisher = {MIT Press Cambridge}
+ title = {Market Structure and Foreign Trade: Increasing Returns, Imperfect Competition, and the International Economy},
+ author = {Helpman, Elhanan and Krugman, Paul R.},
+ year = {1985},
+ publisher = {MIT Press},
+ address = {Cambridge, MA}
}
@article{LettLud2004,
@@ -1837,13 +2005,13 @@ @article{LettLud2001
}
@article{CampbellShiller88,
- author = {John Y. Campbell, Robert J. Shiller},
+ author = {Campbell, John Y. and Shiller, Robert J.},
title = {{The Dividend-Price Ratio and Expectations of Future Dividends and Discount Factors}},
journal = {Review of Financial Studies},
year = 1988,
volume = {1},
number = {3},
- pages = {195-228}
+ pages = {195--228}
}
@book{Friedman98,
@@ -1877,9 +2045,9 @@ @book{Kreps88
}
@book{Bertsekas75,
- author = {Dmitri Bertsekas},
+ author = {Bertsekas, Dimitri P.},
title = {Dynamic Programming and Stochastic Control},
- year = {1975},
+ year = {1976},
publisher = {Academic Press},
address = {New York}
}
@@ -2001,9 +2169,9 @@ @book{Orfanidisoptimum1988
}
@book{Athanasios1991,
- title = {Probability, random variables, and stochastic processes},
- author = {Athanasios, Papoulis and Pillai, S Unnikrishna},
- publisher = {Mc-Graw Hill},
+ title = {Probability, Random Variables, and Stochastic Processes},
+ author = {Papoulis, Athanasios},
+ publisher = {McGraw-Hill},
year = {1991}
}
@@ -2033,7 +2201,7 @@ @article{PhelanStacchetti2001
year = 2001,
volume = {69},
number = {6},
- pages = {1491-1518},
+ pages = {1491--1518},
month = {November}
}
@@ -2178,9 +2346,12 @@ @article{arellano2008default
}
@article{davis2006flow,
- title = {The flow approach to labor markets: New data sources, micro-macro links and the recent downturn},
- author = {Davis, Steven J and Faberman, R Jason and Haltiwanger, John},
+ title = {The Flow Approach to Labor Markets: New Data Sources and Micro-Macro Links},
+ author = {Davis, Steven J. and Faberman, R. Jason and Haltiwanger, John},
journal = {Journal of Economic Perspectives},
+ volume = {20},
+ number = {3},
+ pages = {3--26},
year = {2006}
}
@@ -2226,11 +2397,9 @@ @article{Rust1996
}
@book{AKR1990,
- author = {Amman, H. M. and Kendrick, D.A. and Rust, J.},
- address = {Burlington, MA},
- publisher = {Elsevier},
+ editor = {Amman, H. M. and Kendrick, D. A. and Rust, John},
title = {{Handbook of Computational Economics}},
- year = {1990}
+ publisher = {Elsevier}, year = {1996}
}
@book{AndersonMoore2005,
@@ -2424,10 +2593,12 @@ @article{Hall1978
}
@article{HallMishkin1982,
- author = {Hall, Robert E and Mishkin, Frederic S},
- journal = {National Bureau of Economic Research Working Paper Series},
+ author = {Hall, Robert E. and Mishkin, Frederic S.},
+ journal = {Econometrica},
title = {{The Sensitivity of Consumption to Transitory Income: Estimates from Panel Data on Households}},
- volume = {No. 505},
+ volume = {50},
+ number = {2},
+ pages = {461--481},
year = {1982}
}
@@ -2605,23 +2776,23 @@ @article{Kuhn2013
}
@article{KydlandPrescott1977,
- author = {Kydland, Finn E., and Edward C. Prescott},
+ author = {Kydland, Finn E. and Prescott, Edward C.},
journal = {Journal of Political Economy},
- pages = {867-896},
+ pages = {473--492},
title = {Rules Rather than Discretion: The Inconsistency of Optimal Plans},
- volume = {106},
- number = {5},
+ volume = {85},
+ number = {3},
year = {1977}
}
@article{KydlandPrescott1980,
- author = {Kydland, Finn E., and Edward C. Prescott},
- journal = {Econometrics},
- pages = {1345-2370},
+ author = {Kydland, Finn E. and Prescott, Edward C.},
+ journal = {Econometrica},
+ pages = {1345--1370},
title = {Time to Build and Aggregate Fluctuations},
volume = {50},
number = {6},
- year = {1980}
+ year = {1982}
}
@book{LasotaMackey1994,
@@ -2651,9 +2822,9 @@ @article{Lucas1978
}
@article{LucasStokey1983,
- author = {Lucas, Jr., Robert E and Stokey, Nancy L},
- journal = {Journal of monetary Economics},
- number = {3},
+ author = {Lucas, Jr., Robert E. and Stokey, Nancy L.},
+ journal = {Journal of Monetary Economics},
+ number = {1},
pages = {55--93},
title = {{Optimal Fiscal and Monetary Policy in an Economy without Capital}},
volume = {12},
@@ -2661,13 +2832,13 @@ @article{LucasStokey1983
}
@article{MarcetMarimon1994,
- author = {Albert Marcet and Ramon Marimon},
- title = {{Recursive contracts}},
- year = 1994,
- institution = {Department of Economics and Business, Universitat Pompeu Fabra},
- type = {Economics Working Papers},
- url = {http://ideas.repec.org/p/upf/upfgen/337.html},
- number = {337}
+ author = {Marcet, Albert and Marimon, Ramon},
+ title = {{Recursive Contracts}},
+ journal = {Econometrica},
+ volume = {87},
+ number = {5},
+ pages = {1589--1631},
+ year = {2019}
}
@article{MarcetSargent1989,
@@ -2783,12 +2954,13 @@ @article{Pearlman1992
}
@article{PearlmanCurrieLevine1986,
- author = {Pearlman, J.G. and Currie, D.A. and Levine, P.L.},
- title = {Rational expectations with partial information},
- journal = {Economic Modeling},
+ author = {Pearlman, Joseph and Currie, David and Levine, Paul},
+ title = {Rational expectations models with partial information},
+ journal = {Economic Modelling},
volume = {3},
- pages = {90-105},
- year = {1992}
+ number = {2},
+ pages = {90--105},
+ year = {1986}
}
@book{Popper1992,
@@ -2803,9 +2975,9 @@ @article{Prescott1977
author = {Prescott, Edward C.},
year = {1977},
title = {Should Control Theory Be Used for Economic Stabilization?},
- journal = {Journal of Monetary Economics},
+ journal = {Carnegie-Rochester Conference Series on Public Policy},
volume = {7},
- pages = {13-38}
+ pages = {13--38}
}
@article{Rabault2002,
@@ -2841,12 +3013,13 @@ @article{Reiter2009
}
@article{Sargent1979,
- author = {Sargent, T J},
+ author = {Sargent, Thomas J.},
year = {1979},
title = {A Note On Maximum Likelihood Estimation of The Rational Expectations Model of The Term Structure},
journal = {Journal of Monetary Economics},
- volume = {35},
- pages = {245-274}
+ volume = {5},
+ number = {1},
+ pages = {133--143}
}
@book{Sargent1987,
@@ -3077,13 +3250,15 @@ @article{barro2006rare
publisher={MIT Press}
}
-@article{Brock1982,
+@incollection{Brock1982,
title={Asset prices in a production economy},
author={Brock, William A},
- journal={The Economics of Information and Uncertainty},
- pages={1--43},
+ booktitle={The Economics of Information and Uncertainty},
+ editor={McCall, John J.},
+ pages={1--46},
year={1982},
- publisher={University of Chicago Press}
+ publisher={University of Chicago Press},
+ address={Chicago}
}
@article{PrescottMehra1980,
@@ -3371,12 +3546,14 @@ @book{Hans_Sarg_book_2016
}
@article{Neyman_Pearson,
- author = {Neyman, J. and Pearson, E. S},
+ author = {Neyman, J. and Pearson, E. S.},
year = {1933},
title = {On the problem of the most efficient tests of statistical
hypotheses},
- journal = {Phil. Trans. R. Soc. Lond. A. 231 (694–706)},
- pages = {289–337}
+ journal = {Philosophical Transactions of the Royal Society of London, Series A},
+ volume = {231},
+ number = {694--706},
+ pages = {289--337}
}
@article{ma2020income,
@@ -3631,3 +3808,54 @@ @article{shwartz_ziv_tishby2017
journal = {arXiv preprint arXiv:1703.00810},
year = 2017
}
+
+@article{kihlstrom_mirman1975,
+ author = {Kihlstrom, Richard E. and Mirman, Leonard J.},
+ title = {Information and Market Equilibrium},
+ journal = {The Bell Journal of Economics},
+ volume = {6},
+ number = {1},
+ pages = {357--376},
+ year = {1975},
+ publisher = {The RAND Corporation}
+}
+
+@article{muth1961,
+ author = {Muth, John F.},
+ title = {Rational Expectations and the Theory of Price Movements},
+ journal = {Econometrica},
+ volume = {29},
+ number = {3},
+ pages = {315--335},
+ year = {1961}
+}
+
+@article{radner1972,
+ author = {Radner, Roy},
+ title = {Existence of Equilibrium of Plans, Prices, and Price Expectations in a Sequence of Markets},
+ journal = {Econometrica},
+ volume = {40},
+ number = {2},
+ pages = {289--303},
+ year = {1972}
+}
+
+@article{arrow1964,
+ author = {Arrow, Kenneth J.},
+ title = {The Role of Securities in the Optimal Allocation of Risk-bearing},
+ journal = {Review of Economic Studies},
+ volume = {31},
+ number = {2},
+ pages = {91--96},
+ year = {1964}
+}
+
+@article{grossman1976,
+ author = {Grossman, Sanford J.},
+ title = {On the Efficiency of Competitive Stock Markets Where Trades Have Diverse Information},
+ journal = {Journal of Finance},
+ volume = {31},
+ number = {2},
+ pages = {573--585},
+ year = {1976}
+}
diff --git a/lectures/_toc.yml b/lectures/_toc.yml
index f322b2864..078ba02e6 100644
--- a/lectures/_toc.yml
+++ b/lectures/_toc.yml
@@ -43,6 +43,7 @@ parts:
- file: exchangeable
- file: likelihood_bayes
- file: blackwell_kihlstrom
+ - file: information_market_equilibrium
- file: mix_model
- file: navy_captain
- file: merging_of_opinions
@@ -141,6 +142,8 @@ parts:
- file: harrison_kreps
- file: morris_learn
- file: affine_risk_prices
+ - file: ross_recovery
+ - file: misspecified_recovery
- caption: Data and Empirics
numbered: true
chapters:
diff --git a/lectures/blackwell_kihlstrom.md b/lectures/blackwell_kihlstrom.md
index ad1cfb9d3..65192a9ec 100644
--- a/lectures/blackwell_kihlstrom.md
+++ b/lectures/blackwell_kihlstrom.md
@@ -962,7 +962,7 @@ The Blackwell order says that, absent costs, more information is always better f
With costs, the consumer chooses quality investment $\theta$ to maximize *net value*.
-If quality investment translates into experiment accuracy with diminishing returns — say, accuracy $\phi(\theta) = 1 - e^{-a\theta}$ for a rate parameter $a$ — then the marginal value of information eventually decreases in $\theta$.
+If quality investment translates into experiment accuracy with diminishing returns -- say, accuracy $\phi(\theta) = 1 - e^{-a\theta}$ for a rate parameter $a$ -- then the marginal value of information eventually decreases in $\theta$.
With a convex cost $c(\theta) = c \, \theta^2$, the increasing marginal cost eventually overtakes the declining marginal value, producing an interior optimum.
diff --git a/lectures/cass_fiscal.md b/lectures/cass_fiscal.md
index b7e3646df..618549610 100644
--- a/lectures/cass_fiscal.md
+++ b/lectures/cass_fiscal.md
@@ -1133,7 +1133,7 @@ and capital stock across time:
- The jump in $\tau_c$ depresses $\bar{R}$ below $1$, causing a *sharp drop in consumption*.
- After $T = 10$:
- The effects of anticipated distortion are over, and the economy gradually adjusts to the lower capital stock.
- - Capital must now rise, requiring *austerity* —consumption plummets after $t = T$, indicated by lower levels of consumption.
+ - Capital must now rise, requiring *austerity* -- consumption plummets after $t = T$, indicated by lower levels of consumption.
- The interest rate gradually declines, and consumption grows at a diminishing rate along the path to the terminal steady-state.
+++
diff --git a/lectures/cass_fiscal_2.md b/lectures/cass_fiscal_2.md
index 3f396e500..2ff84230b 100644
--- a/lectures/cass_fiscal_2.md
+++ b/lectures/cass_fiscal_2.md
@@ -498,7 +498,7 @@ This means that foreign households begin repaying part of their external debt by
We now explore the impact of an increase in capital taxation in the domestic economy $10$ periods after its announcement at $t = 1$.
-Because the change is anticipated, households in both countries adjust immediately—even though the tax does not take effect until period $t = 11$.
+Because the change is anticipated, households in both countries adjust immediately -- even though the tax does not take effect until period $t = 11$.
```{code-cell} ipython3
shocks_global = {
diff --git a/lectures/chow_business_cycles.md b/lectures/chow_business_cycles.md
index 6fcc777a5..245f04291 100644
--- a/lectures/chow_business_cycles.md
+++ b/lectures/chow_business_cycles.md
@@ -351,9 +351,9 @@ The second equation is the discrete Lyapunov equation for $\Gamma_0$.
> But in reality the cycles ... are generally not damped.
> How can the maintenance of the swings be explained?
> ... One way which I believe is particularly fruitful and promising is to study what would become of the solution of a determinate dynamic system if it were exposed to a stream of erratic shocks ...
-> Thus, by connecting the two ideas: (1) the continuous solution of a determinate dynamic system and (2) the discontinuous shocks intervening and supplying the energy that may maintain the swings—we get a theoretical setup which seems to furnish a rational interpretation of those movements which we have been accustomed to see in our statistical time data.
+> Thus, by connecting the two ideas: (1) the continuous solution of a determinate dynamic system and (2) the discontinuous shocks intervening and supplying the energy that may maintain the swings -- we get a theoretical setup which seems to furnish a rational interpretation of those movements which we have been accustomed to see in our statistical time data.
>
-> — Ragnar Frisch (1933) {cite}`frisch33`
+> -- Ragnar Frisch (1933) {cite}`frisch33`
Chow's main insight is that oscillations in the deterministic system are *neither necessary nor sufficient* for producing "cycles" in the stochastic system.
@@ -844,7 +844,7 @@ The peak appears at $\omega/\pi \approx 0.10$, which corresponds to a cycle leng
### The Slutsky connection
-Chow connects this result to Slutsky's {cite}`slutsky:1927` finding that moving averages of a random series have recurrent cycles.
+Chow connects this result to Slutsky's {cite}`slutsky1937` finding that moving averages of a random series have recurrent cycles.
The VAR(1) model can be written as an infinite moving average:
@@ -1408,7 +1408,7 @@ plt.show()
As $v$ increases, eigenvalues approach the unit circle: oscillations become more persistent in the time domain (left), and the spectral peak becomes sharper in the frequency domain (right).
-Complex roots produce a pronounced peak at interior frequencies—the spectral signature of business cycles.
+Complex roots produce a pronounced peak at interior frequencies -- the spectral signature of business cycles.
```{solution-end}
```
diff --git a/lectures/hansen_singleton_1982.md b/lectures/hansen_singleton_1982.md
index 862b7ba03..560cd5c4b 100644
--- a/lectures/hansen_singleton_1982.md
+++ b/lectures/hansen_singleton_1982.md
@@ -225,7 +225,7 @@ The vector $z_t$ plays the role of **instruments**.
The conditional Euler equation $E_t[M_{t+1}R_{t+1}^i - 1] = 0$ says that the pricing error is unpredictable given *everything* in the agent's time-$t$ information set.
-That is a very strong restriction — it says the pricing error is orthogonal to every time-$t$ measurable random variable.
+That is a very strong restriction -- it says the pricing error is orthogonal to every time-$t$ measurable random variable.
We cannot use the entire information set in practice, but we can pick any finite collection of time-$t$ observable variables $z_t$ and the orthogonality must still hold.
diff --git a/lectures/hansen_singleton_1983.md b/lectures/hansen_singleton_1983.md
index c2df80578..ee70ff25b 100644
--- a/lectures/hansen_singleton_1983.md
+++ b/lectures/hansen_singleton_1983.md
@@ -36,7 +36,7 @@ kernelspec:
> rational expectations econometrics. A rational expectations equilibrium is a
> likelihood function. Maximize it.
>
-> — An Interview with Thomas J. Sargent {cite}`evans2005interview`
+> -- An Interview with Thomas J. Sargent {cite}`evans2005interview`
## Overview
@@ -1869,7 +1869,7 @@ Our estimates reproduce the pattern that {cite:t}`MehraPrescott1985` later calle
- *Low estimated risk aversion:* The estimated $\hat\alpha$ values (and thus risk aversion $-\hat\alpha$) from the table above are similar to those in {cite:t}`hansen1983stochastic`, who report $\hat\alpha$ between $-0.32$ and $-1.25$.
-- *Tiny return predictability:* The unrestricted-VAR $R_R^2$ values are comparable to the 0.02 to 0.06 range in {cite:t}`hansen1983stochastic` — the predictable component of stock returns is small relative to the unpredictable component.
+- *Tiny return predictability:* The unrestricted-VAR $R_R^2$ values are comparable to the 0.02 to 0.06 range in {cite:t}`hansen1983stochastic` -- the predictable component of stock returns is small relative to the unpredictable component.
- *Strong rejection for Treasury bills:* The Euler-equation restrictions are decisively rejected for the nominally risk-free Treasury bill return, just as in Table 4 of {cite:t}`hansen1983stochastic`.
diff --git a/lectures/information_market_equilibrium.md b/lectures/information_market_equilibrium.md
new file mode 100644
index 000000000..2fb1f9257
--- /dev/null
+++ b/lectures/information_market_equilibrium.md
@@ -0,0 +1,1574 @@
+---
+jupytext:
+ text_representation:
+ extension: .md
+ format_name: myst
+ format_version: 0.13
+ jupytext_version: 1.17.1
+kernelspec:
+ display_name: Python 3 (ipykernel)
+ language: python
+ name: python3
+---
+
+(information_market_equilibrium)=
+```{raw} jupyter
+
+```
+
+# Information and Market Equilibrium
+
+```{contents} Contents
+:depth: 2
+```
+
+## Overview
+
+This lecture studies two questions about the **informational role of prices**
+posed and
+answered by {cite:t}`kihlstrom_mirman1975`.
+
+1. *When do prices transmit inside information?*
+ - An informed insider observes a private
+ signal correlated with an unknown state of the world and adjusts demand
+ accordingly.
+ - Equilibrium prices shift.
+ - Under what conditions can an outside observer *infer* the insider's
+ posterior distribution from the equilibrium price?
+
+2. *Do Bayesian price expectations converge?*
+ - In a stationary stochastic exchange
+ economy, an uninformed observer uses the history of market prices and
+ Bayes' Law to form
+ beliefs about the economy's structure and hence about its induced price
+ distribution.
+ - Do those expectations eventually
+ agree with those of a fully informed observer?
+
+Kihlstrom and Mirman's answers rely on two classical ideas from statistics:
+
+- **Blackwell sufficiency**: a random variable $\tilde{y}$ is said to be
+ *sufficient* for a random variable
+ $\tilde{y}'$ with respect to an unknown state if knowing $\tilde{y}$ gives
+ all the
+ information about the state that $\tilde{y}'$ contains.
+- **Bayesian consistency**: as the sample grows, posterior beliefs eliminate
+ models that imply the wrong **price distribution**, so even when structure is
+ not identified from prices the posterior mass on the true **reduced form**
+ still converges to one.
+
+Important findings of {cite:t}`kihlstrom_mirman1975` are:
+
+- Equilibrium prices transmit inside information *if and only if* the map from
+ the
+ insider's posterior distribution to the equilibrium price is one-to-one on
+ the set of
+ posteriors that can actually arise from the signal.
+ - For the two-state case ($S = 2$), invertibility holds when the informed
+ agent's utility is homothetic and the elasticity of substitution is everywhere
+ either below one or above one.
+- In the dynamic economy, as information accumulates, Bayesian price
+ expectations converge to **rational expectations**, even when the deep
+ structure is not identified from prices alone.
+
+```{note}
+{cite:t}`kihlstrom_mirman1975` use the terms "reduced form" and "structural"
+models in a
+way that careful econometricians do.
+
+Reduced-form and structural models come in pairs.
+
+To each structure or structural model
+there is a reduced form, or collection of reduced forms, underlying different
+possible regressions.
+```
+
+The lecture is organized as follows.
+
+1. Set up the static two-commodity model and define equilibrium.
+2. State the price-revelation theorem and the invertibility conditions.
+3. Illustrate invertibility and its failure with numerical examples using CES
+ and
+ Cobb-Douglas preferences.
+4. Introduce the dynamic stochastic economy and derive the Bayesian convergence
+ result.
+5. Simulate Bayesian learning from price observations.
+
+This lecture builds on ideas in {doc}`blackwell_kihlstrom` and
+{doc}`likelihood_bayes`.
+
+We start by importing some Python packages.
+
+```{code-cell} ipython3
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.optimize import brentq
+from scipy.stats import norm
+```
+
+
+## Setup
+
+### Preferences, endowments, and the unknown state
+
+The economy has two goods.
+
+Good 2 is the numeraire (price normalized to 1).
+
+Good 1 trades at price $p > 0$.
+
+An unknown parameter $\bar{a}$ affects the value of good 1.
+
+Agent $i$'s expected utility
+from a bundle $(x_1^i, x_2^i)$ is
+
+$$
+U^i(x_1^i, x_2^i)
+ = \sum_{s=1}^{S} u^i(a_s x_1^i,\, x_2^i)\, P^i(\bar{a} = a_s),
+$$
+
+where $P^i$ is agent $i$'s subjective probability distribution over the finite
+state space
+$A = \{a_1, \ldots, a_S\}$.
+
+Each agent starts with an endowment $w^i$ of good 2 and a share $\theta^i$ of
+the
+representative firm.
+
+In the paper's formal model, a single firm transforms good 2 into good 1
+according to
+$y_1 = f(y_2)$ with $f' < 0$ and chooses production to maximize
+
+$$
+\pi(p) = \max_{y_2 \leq 0} \{p f(y_2) + y_2\}.
+$$
+
+The firm's profit $\pi$ is then distributed to households according to the
+shares
+$\theta^i$.
+
+Agent
+$i$'s budget constraint is
+
+$$
+p x_1^i + x_2^i = w^i + \theta^i \pi.
+$$
+
+Agents maximize expected utility subject to their budget constraints.
+
+A **competitive
+equilibrium** is a price $\hat{p}$ that clears both markets simultaneously.
+
+Under the maintained convexity assumptions equilibrium exists, and following
+{cite:t}`kihlstrom_mirman1975` we assume the equilibrium price is unique, so
+that we can write $\hat p = p(\mu)$ as a well-defined function of the informed
+agent's posterior.
+
+For most of what follows, the production side matters only through the induced
+equilibrium price map, so when we turn to numerical illustrations we will
+suppress production and use a pure-exchange / portfolio interpretation to keep
+the calculations transparent.
+
+### The informed agent's problem
+
+Suppose **agent 1** (the insider) observes a private signal $\tilde{y}$
+correlated with
+$\bar{a}$ before trading, where $\tilde{y}$ takes values in a finite set $Y$.
+
+Before the signal arrives, agent 1 has prior beliefs
+$\mu_0 = P^1$.
+
+Upon observing $\tilde{y} = y$, agent 1 updates to the
+**posterior** $\mu_y = (\mu_{y1}, \ldots, \mu_{yS})$ via Bayes' rule:
+
+$$
+\mu_{ys} = P(\bar{a} = a_s \mid \tilde{y} = y).
+$$
+
+Because agent 1's demand depends on $\mu_y$, the new equilibrium price satisfies
+
+$$
+\hat{p} = p(\mu_y).
+$$
+
+Outside observers who see $\hat{p}$ but not $\tilde{y}$ can try to *back out*
+the
+insider's posterior from the price.
+
+Define the set of realized posteriors
+
+$$
+M = \{\mu_y : y \in Y,\; P(\tilde y = y) > 0\}.
+$$
+
+The key question is whether the map $\mu \mapsto p(\mu)$ is one-to-one on $M$.
+
+To answer that question, we now translate "information in prices" into
+Blackwell's language of sufficiency.
+
+(price_revelation_theorem)=
+## Price revelation
+
+### Blackwell sufficiency
+
+The price variable $p(\mu_{\tilde{y}})$ *accurately transmits* the insider's
+private
+information if observing the equilibrium price is just as informative about
+$\bar{a}$ as
+observing the signal $\tilde{y}$ directly.
+
+In Blackwell's language ({cite:t}`blackwell1951` and {cite:t}`blackwell1953`),
+this means
+$p(\mu_{\tilde{y}})$ is **sufficient** for $\tilde{y}$.
+
+```{prf:definition} Sufficiency
+:label: ime_def_sufficiency
+
+A random variable $\tilde{y}$ is *sufficient* for $\tilde{y}'$ with
+respect to $\bar{a}$ if there exists a conditional distribution $P(y' \mid y)$,
+**independent of** $\bar{a}$, such that
+
+$$
+\phi'_a(y') = \sum_{y \in Y} P(y' \mid y)\, \phi_a(y)
+\quad \text{for all } a \text{ and all } y',
+$$
+
+where $\phi_a(y) = P(\tilde{y} = y \mid \bar{a} = a)$.
+
+Thus, once $\tilde{y}$ is known, $\tilde{y}'$ provides no additional information
+about $\bar{a}$.
+```
+
+{cite:t}`kihlstrom_mirman1975` show that
+
+```{prf:lemma} Posterior Sufficiency
+:label: ime_lemma_posterior_sufficiency
+
+The posterior distribution $\mu_{\tilde{y}}$ is a sufficient statistic for
+$\tilde{y}$.
+```
+
+```{prf:proof} (Sketch)
+The posterior $\mu_{\tilde{y}}$ satisfies
+
+$$
+P(\bar{a} = a_s \mid \mu_{\tilde{y}} = \mu_y,\; \tilde{y} = y) = \mu_{ys}
+ = P(\bar{a} = a_s \mid \mu_{\tilde{y}} = \mu_y).
+$$
+
+This identity says that once the posterior is known, conditioning on the
+original signal
+$\tilde y$ does not change beliefs about $\bar a$.
+
+Equivalently, the conditional law of $\tilde y$ given $\mu_{\tilde y}$ is
+independent of
+$\bar a$, so $\mu_{\tilde y}$ is sufficient for $\tilde y$ in Blackwell's sense.
+```
+
+Now let's think about the mapping from
+belief to price.
+
+```{prf:theorem} Price Revelation
+:label: ime_theorem_price_revelation
+
+In the model outlined above, the price random variable $p(\mu_{\tilde{y}})$ is
+sufficient for the random variable $\tilde{y}$ if and only if the function
+$p(P^1)$ is invertible on the set of prices
+
+$$
+\mathcal{P} = \Bigl\{\, p(\mu_y) : y \in Y,\;
+ P(\tilde{y} = y) = \sum_{a \in A} \phi_a(y)\,\mu_0(a) > 0 \Bigr\}.
+$$
+```
+
+The logic is
+
+$$
+\tilde y \quad \longrightarrow \quad \mu_{\tilde y} \quad \longrightarrow \quad
+p(\mu_{\tilde y}).
+$$
+
+The first arrow loses no information about $\bar a$ by
+{prf:ref}`ime_lemma_posterior_sufficiency`, and the theorem asks when the second
+arrow also loses no information.
+
+The proof has two parts.
+
+If $p(\cdot)$ is one-to-one on $M$, then observing the price is equivalent to
+observing the
+posterior itself because
+
+$$
+P(\mu_{\tilde y} = \mu \mid p(\mu_{\tilde y}) = \hat p)
+= \begin{cases}
+1 & \text{if } \mu = p^{-1}(\hat p), \\
+0 & \text{otherwise,}
+\end{cases}
+$$
+
+This conditional distribution is independent of the state, so price is
+sufficient for the
+posterior; together with {prf:ref}`ime_lemma_posterior_sufficiency`, price is
+therefore
+sufficient for the signal.
+
+Conversely, if two different posteriors in $M$ generated the same price, an
+observer of the price could not tell which posterior had occurred, and the paper
+shows formally that in this case the conditional distribution of the posterior
+given price would depend on the state, so price could not be sufficient.
+
+Before turning to invertibility itself, it helps to keep in mind the two
+economic interpretations emphasized in the paper.
+
+### Two interpretations
+
+#### Insider trading in a stock market
+
+Good 1 is a risky asset with random return $\bar{a}$; good 2 is "money".
+
+An insider's demand reveals private information about the return.
+
+If the invertibility condition holds, outside observers can read the insider's
+posterior distribution -- the useful information the insider's signal carries
+about $\bar a$ -- from the equilibrium stock price.
+
+#### Price as a quality signal
+
+Good 1 has uncertain quality $\bar{a}$.
+
+Experienced consumers (who have sampled the good) observe a signal correlated
+with quality
+and buy accordingly.
+
+Uninformed consumers can infer quality from the market price, provided
+invertibility holds.
+
+(invertibility_conditions)=
+## Invertibility and the elasticity of substitution
+
+When does the belief-to-price map fail to be invertible?
+
+{prf:ref}`ime_theorem_invertibility_conditions`
+shows that for a two-state economy ($S = 2$), the answer depends on the
+**elasticity of
+substitution** $\sigma$ of agent 1's utility function.
+
+Before stating the theorem, it helps to see the two intermediate steps in the
+paper's
+argument.
+
+```{prf:lemma} Same Price Implies Same Allocation
+:label: ime_lemma_same_price_same_allocation
+
+Assume that $u^i$ has continuous first partial derivatives and that $u^i$ is
+quasi-concave.
+
+Let $p \in \mathcal{P}$.
+
+If there exist two measures $\mu^*$ and $\mu'$ in $M$ such that
+$p(\mu^*, P^2, \ldots, P^n) = p(\mu', P^2, \ldots, P^n) = p$, then
+
+$$
+x^i(\mu^*, P^2, \ldots, P^n) = x^i(\mu', P^2, \ldots, P^n), \quad
+i = 1, \ldots, n.
+$$
+```
+
+Fix the beliefs of all agents except agent 1.
+
+The lemma says that if two posterior beliefs $\mu^*$ and $\mu'$ for agent 1
+both support the same equilibrium price $p$, then they support the same
+equilibrium allocation for every trader.
+
+The intuition is that when the price is unchanged, the demands of the
+uninformed traders are unchanged too, so market clearing forces the informed
+agent's bundle to be unchanged as well.
+
+This lemma lets us define the informed agent's equilibrium bundle as a function
+of price alone:
+
+$$
+x(p) = (x_1(p), x_2(p)).
+$$
+
+Throughout, $u^i_j$ denotes the partial derivative of $u^i$ with respect to its
+$j$-th argument.
+
+Whenever the informed agent consumes positive amounts of both goods, optimality
+of $x(p)$
+under posterior $\mu$ gives the interior first-order condition
+
+$$
+p = \frac{\sum_{s=1}^S a_s u_1^1(a_s x_1(p), x_2(p))\, \mu(a_s)}
+ {\sum_{s=1}^S u_2^1(a_s x_1(p), x_2(p))\, \mu(a_s)}.
+$$
+
+For a fixed price $p$, the bundle $x(p)$ is fixed too, so invertibility boils
+down to
+whether this equation admits a unique posterior $\mu$.
+
+```{prf:lemma} Unique Posterior at a Given Price
+:label: ime_lemma_unique_posterior
+
+Assume that the first partial derivatives of $u^1$ exist and that $u^1$ is
+quasi-concave.
+
+Also assume that agent 1 always consumes positive quantities of both goods.
+
+Then $p(P^1)$ is invertible on $\mathcal{P}$ if for each $p \in \mathcal{P}$
+there exists a unique probability measure $\mu \in M$ such that
+
+$$
+\frac{\sum_{s=1}^S a_s\, u^1_1(a_s x_1(p), x_2(p))\, \mu(a_s)}
+ {\sum_{s=1}^S u^1_2(a_s x_1(p), x_2(p))\, \mu(a_s)} = p.
+$$
+```
+
+If two different posteriors gave the same price, then by
+{prf:ref}`ime_lemma_same_price_same_allocation` they would share the same bundle
+$x(p)$, contradicting uniqueness of the posterior that solves the first-order
+condition at that price.
+
+### The two-state first-order condition
+
+With $S = 2$ and $\mu = (q,\, 1-q)$, define
+
+$$
+\alpha_s(p) = a_s\, u^1_1(a_s x_1(p),\, x_2(p)), \qquad
+\beta_s(p) = u^1_2(a_s x_1(p),\, x_2(p)), \qquad s = 1, 2.
+$$
+
+Then the first-order condition becomes
+
+$$
+p = \frac{\alpha_1(p)\, q + \alpha_2(p)\, (1-q)}
+ {\beta_1(p)\, q + \beta_2(p)\, (1-q)}.
+$$
+
+At a fixed price $p$, the quantities $\alpha_s(p)$ and $\beta_s(p)$ are
+constants, so
+uniqueness of the posterior is the same as uniqueness of the scalar $q$ solving
+this
+equation.
+
+```{prf:theorem} Invertibility Conditions
+:label: ime_theorem_invertibility_conditions
+
+Assume that the first partial derivatives of $u^1$ exist and that $u^1$ is
+quasi-concave and homothetic.
+
+Also suppose that the informed agent always consumes positive quantities of
+both goods in all equilibrium allocations.
+
+If $S = 2$ and the elasticity of substitution of $u^1$ is either always less
+than one or always greater than one, then $p(P^1)$ is invertible on
+$\mathcal{P}$.
+
+If $u^1$ is Cobb-Douglas (elasticity of substitution constant and equal to
+one), then $p(P^1)$ is constant on $\mathcal{P}$.
+```
+
+When $\sigma = 1$ the income and substitution effects exactly cancel, so
+agent 1's demand for good 1 does not respond to changes in beliefs about
+$\bar{a}$.
+
+Because the demand is unchanged, the market-clearing price is unchanged too,
+and the price reveals nothing about the insider's signal.
+
+### CES utility
+
+For concreteness we work with a simplified example using the **constant-elasticity-of-substitution** (CES)
+utility
+function
+
+$$
+u(c_1, c_2) = \bigl(c_1^{\rho} + c_2^{\rho}\bigr)^{1/\rho}, \qquad \rho \in
+(-\infty,0) \cup (0,1),
+$$
+
+whose elasticity of substitution is $\sigma = 1/(1-\rho)$.
+
+- $\rho \to 0$: Cobb-Douglas ($\sigma = 1$).
+- $\rho < 0$: $\sigma < 1$ (complements).
+- $0 < \rho < 1$: $\sigma > 1$ (substitutes).
+
+Pertinent partial derivatives are
+
+$$
+u_1(c_1,c_2) = \bigl(c_1^\rho + c_2^\rho\bigr)^{1/\rho - 1}\, c_1^{\rho-1},
+\qquad
+u_2(c_1,c_2) = \bigl(c_1^\rho + c_2^\rho\bigr)^{1/\rho - 1}\, c_2^{\rho-1}.
+$$
+
+This CES example is only an illustration, because the theorem itself covers any
+homothetic utility with elasticity everywhere above one or everywhere below one.
+
+With that example in hand, we can compute the equilibrium price directly as a
+function of the posterior.
+
+### Equilibrium price as a function of the posterior
+
+We focus on agent 1 as the *only* informed trader who absorbs one unit of good 1
+at
+equilibrium (i.e., $x_1 = 1$).
+
+Let $W_1 = w^1 + \theta^1 \pi$ denote agent 1's total wealth (endowment plus
+profit share).
+
+Agent 1's budget constraint then reduces to
+$x_2 = W_1 - p$, and the equilibrium price is the unique $p \in (0, W_1)$
+satisfying
+the first-order condition
+
+$$
+p \bigl[q\, u_2(a_1,\, W_1-p) + (1-q)\, u_2(a_2,\, W_1-p)\bigr]
+= q\, a_1\, u_1(a_1,\, W_1-p) + (1-q)\, a_2\, u_1(a_2,\, W_1-p).
+$$
+
+For Cobb-Douglas utility ($\sigma = 1$), the first-order condition becomes $p =
+W_1 - p$,
+giving $p^* = W_1/2$ regardless of the posterior $q$, confirming that no
+information
+is transmitted through the price in the Cobb-Douglas case.
+
+We compute first-order conditions numerically below.
+
+```{code-cell} ipython3
+def ces_derivatives(c1, c2, ρ):
+    """
+    Return the CES marginal utilities (u1, u2) at the bundle (c1, c2).
+
+    The utility function is u(c1, c2) = (c1**ρ + c2**ρ)**(1/ρ), with
+    elasticity of substitution σ = 1/(1 - ρ).  The CES expression is
+    numerically unstable near ρ = 0, so in that region we use the exact
+    Cobb-Douglas limit u(c1, c2) = sqrt(c1 * c2) instead.
+    """
+    if abs(ρ) < 1e-4:
+        # Cobb-Douglas limit (σ = 1): u = sqrt(c1 * c2)
+        u1 = 0.5 * np.sqrt(c2 / c1)
+        u2 = 0.5 * np.sqrt(c1 / c2)
+    else:
+        # Factor (c1^ρ + c2^ρ)^(1/ρ - 1) is shared by both partial derivatives
+        common = (c1**ρ + c2**ρ)**(1 / ρ - 1)
+        u1 = common * c1**(ρ - 1)
+        u2 = common * c2**(ρ - 1)
+    return u1, u2
+
+
+def eq_price(q, a1, a2, W1, ρ):
+    """Return the equilibrium price for posterior q.
+
+    Solves the two-state interior first-order condition
+        p * [q u2(a1, W1-p) + (1-q) u2(a2, W1-p)]
+          = q a1 u1(a1, W1-p) + (1-q) a2 u1(a2, W1-p)
+    for p in (0, W1), where the informed agent holds x1 = 1 so that
+    x2 = W1 - p.  Returns np.nan if no root is bracketed on (0, W1).
+    """
+    def residual(p):
+        # Difference between the two sides of the first-order condition
+        x2 = W1 - p
+        u1_s1, u2_s1 = ces_derivatives(a1, x2, ρ)
+        u1_s2, u2_s2 = ces_derivatives(a2, x2, ρ)
+        lhs = p * (q * u2_s1 + (1 - q) * u2_s2)
+        rhs = q * a1 * u1_s1 + (1 - q) * a2 * u1_s2
+        return lhs - rhs
+
+    try:
+        return brentq(residual, 1e-6, W1 - 1e-6, xtol=1e-10)
+    except ValueError:
+        # brentq raises ValueError when the residual does not change sign
+        # on the bracket, i.e. no interior equilibrium price was found.
+        return np.nan
+```
+
+```{code-cell} ipython3
+---
+mystnb:
+ figure:
+ caption: equilibrium price vs posterior
+ name: fig-eq-price-posterior
+---
+a1, a2 = 2.0, 0.5  # state values (a1 > a2)
+W1 = 4.0           # informed agent's total wealth
+
+# Interior grid of posterior probabilities q = Pr(a = a1)
+q_grid = np.linspace(0.05, 0.95, 200)
+
+# One curve per elasticity regime: σ < 1, σ = 1, σ > 1
+ρ_values = [-0.5, 0.0, 0.5]
+ρ_labels = [
+    r"$\rho = -0.5$ ($\sigma = 0.67$, complements)",
+    r"$\rho = 0$ ($\sigma = 1$, Cobb-Douglas)",
+    r"$\rho = 0.5$ ($\sigma = 2$, substitutes)",
+]
+
+fig, ax = plt.subplots(figsize=(8, 5))
+
+# Trace the equilibrium price map q ↦ p*(q) for each ρ
+for ρ, label in zip(ρ_values, ρ_labels):
+    prices = [eq_price(q, a1, a2, W1, ρ) for q in q_grid]
+    ax.plot(q_grid, prices, label=label, lw=2)
+
+ax.set_xlabel(r"posterior probability $q = \Pr(\bar{a} = a_1)$", fontsize=12)
+ax.set_ylabel("equilibrium price $p^*(q)$", fontsize=12)
+ax.legend(fontsize=10)
+plt.tight_layout()
+plt.show()
+```
+
+The plot confirms {prf:ref}`ime_theorem_invertibility_conditions`.
+
+For CES with $\sigma \neq 1$, the equilibrium price is strictly monotone in $q$.
+
+An outside observer who knows the equilibrium map $p^*(\cdot)$ can therefore
+invert the price uniquely to recover $q$, so the inside information is fully
+transmitted.
+
+For Cobb-Douglas ($\sigma = 1$), the price is flat in $q$, so information is
+never transmitted through the market.
+
+```{code-cell} ipython3
+# Recompute equilibrium prices under Cobb-Douglas (ρ = 0) across the
+# posterior grid; by the theory every value should equal W1/2.
+p_cd = [eq_price(q, a1, a2, W1, ρ=0.0) for q in q_grid]
+
+print(f"Cobb-Douglas (rho=0): min p* = {min(p_cd):.6f}, "
+      f"max p* = {max(p_cd):.6f}, "
+      f"range = {max(p_cd)-min(p_cd):.2e}")
+print(f"Analytical CD price = W1/2 = {W1/2:.6f}")
+```
+
+Every entry equals $W_1/2 = 2.0$ up to numerical tolerance, confirming the
+analytical result that the Cobb-Douglas
+equilibrium price is independent of $q$ and of the state values $a_1, a_2$.
+
+The numerical plot shows monotonicity, and the next subsection connects that
+pattern back to the proof of {prf:ref}`ime_theorem_invertibility_conditions`.
+
+(price_monotonicity)=
+### Why monotonicity depends on $\sigma$
+
+Fix a price $p$ and treat $\alpha_s(p)$ and $\beta_s(p)$ as constants.
+
+The right-hand side of the two-state first-order condition
+
+$$
+\frac{\alpha_1(p)\, q + \alpha_2(p)\, (1-q)}
+ {\beta_1(p)\, q + \beta_2(p)\, (1-q)}
+$$
+
+is then a function of $q$ alone, with derivative
+
+$$
+\frac{\partial}{\partial q}
+\frac{\alpha_1 q + \alpha_2 (1-q)}
+ {\beta_1 q + \beta_2 (1-q)}
+= \frac{\alpha_1 \beta_2 - \alpha_2 \beta_1}
+ {\bigl[\beta_1 q + \beta_2 (1-q)\bigr]^2}.
+$$
+
+So the sign is determined by $\alpha_1 \beta_2 - \alpha_2 \beta_1$, and if that
+sign is constant then for each fixed price there is at most one posterior weight
+$q$ consistent with the first-order condition, which is exactly what
+{prf:ref}`ime_theorem_invertibility_conditions` requires.
+
+Using
+
+$$
+\frac{\alpha_s}{\beta_s}
+ = \frac{a_s\, u_1(a_s x_1, x_2)}{u_2(a_s x_1, x_2)}
+ = a_s^{(\sigma-1)/\sigma}\,\Bigl(\frac{x_2}{x_1}\Bigr)^{1/\sigma},
+$$
+
+one can show
+
+$$
+\frac{\partial}{\partial a}\,\frac{\alpha}{\beta}
+ = \frac{(\sigma - 1)}{\sigma}\, a^{-1/\sigma}\,
+ \Bigl(\frac{x_2}{x_1}\Bigr)^{1/\sigma}.
+$$
+
+For the CES specification, this derivative is positive when $\sigma > 1$,
+negative when
+$\sigma < 1$, and *zero when $\sigma = 1$*.
+
+In other words, for CES utility the ratio $\alpha_s / \beta_s$ moves
+monotonically with the state value $a_s$ unless $\sigma = 1$, which makes the
+fixed-price first-order-condition expression monotone in $q$ and in turn
+delivers invertibility.
+
+The vanishing derivative in the Cobb-Douglas case means the marginal rate of
+substitution is
+independent of $a_s$, so the informed agent's demand, and hence the equilibrium
+price, does
+not respond to changes in beliefs.
+
+Let us visualize the ratio $\alpha_s / \beta_s$ as a function of $a_s$ for
+different
+values of $\sigma$:
+
+```{code-cell} ipython3
+---
+mystnb:
+ figure:
+ caption: marginal rate of substitution
+ name: fig-mrs-alpha-beta
+---
+# Ratio α_s/β_s = a u1/u2 as a function of the state value a, holding the
+# bundle (x1, x2) fixed; ρ = -1e-6 stands in for the Cobb-Douglas case.
+a_vals = np.linspace(0.3, 3.0, 300)
+x1_fix, x2_fix = 1.0, 1.0
+
+fig, ax = plt.subplots(figsize=(7, 4))
+for ρ in [-0.5, -1e-6, 0.5]:
+    # Elasticity of substitution implied by ρ (σ = 1 in the limit ρ → 0)
+    σ = 1 / (1 - ρ) if abs(ρ) > 1e-8 else 1.0
+    ratios = []
+    for a in a_vals:
+        u1, u2 = ces_derivatives(a * x1_fix, x2_fix, ρ)
+        ratios.append(a * u1 / u2)
+    ax.plot(a_vals, ratios, label=rf"$\sigma = {σ:.2f}$", lw=2)
+
+ax.set_xlabel(r"state value $a_s$", fontsize=12)
+ax.set_ylabel(r"$\alpha_s / \beta_s = a_s u_1 / u_2$", fontsize=12)
+ax.axhline(y=1.0, color="black", lw=0.8, ls="--")
+ax.legend(fontsize=10)
+plt.tight_layout()
+plt.show()
+```
+
+When $\sigma = 1$ the ratio is constant across all $a_s$ values, so
+information about the state has no effect on the marginal rate of substitution.
+
+For $\sigma < 1$ the ratio is decreasing in $a_s$, and for $\sigma > 1$ it is
+increasing, making the equilibrium price strictly monotone in the posterior $q$
+in both cases.
+
+The static analysis asks whether a current price reveals current private
+information, whereas the next section asks what a whole history of prices
+reveals over time.
+
+(bayesian_price_expectations)=
+## Bayesian price expectations in a dynamic economy
+
+We now turn to a question addressed in Section 3 of
+{cite:t}`kihlstrom_mirman1975`.
+
+### A stochastic exchange economy
+
+Time is discrete: $t = 1, 2, \ldots$
+
+In each period $t$:
+
+1. Consumer $i$ receives a random endowment $\omega_i^t$.
+2. Markets open; competitive prices $p^t = p(\omega^t)$ clear all markets.
+3. Consumers trade and consume.
+
+The endowment vectors $\{\tilde{\omega}^t\}$ are **i.i.d.** with density
+$f(\omega^t \mid \lambda)$, where $\lambda = (\lambda_1, \ldots, \lambda_K)$ is
+a **structural parameter vector** (of dimension $K$) that is *fixed but
+unknown*.
+
+The equilibrium price at time $t$ is a deterministic function of $\omega^t$, so
+$\{p^t\}$ is also i.i.d.
+
+For any measurable price set $P$, let
+
+$$
+W(P) = \{\omega^t : p(\omega^t) \in P\}.
+$$
+
+Then
+
+$$
+P_\lambda(p^t \in P) = P_\lambda(\omega^t \in W(P))
+= \int_{W(P)} f(\omega^t \mid \lambda)\, d\omega^t.
+$$
+
+The induced price density is denoted by $g(p^t \mid \lambda)$.
+
+For a given structure $\lambda$, this density is the observable implication of
+the model, and when several structures imply the same density we group them
+into a single reduced-form class.
+
+The next issue is therefore what an observer can and cannot infer about the
+structure from price data alone.
+
+### The identification problem
+
+Because price observations identify only the induced price density
+$g(\cdot \mid \lambda)$, and because the structural-to-reduced-form map
+$\lambda \mapsto g(\cdot \mid \lambda)$ may be many-to-one, price data may
+identify only a reduced-form class rather than the exact structure.
+
+In particular, it may be impossible to recover $\lambda$ from
+$g(p \mid \lambda)$ even with infinite price data.
+
+To handle this, partition $\Lambda$ into equivalence classes $\mu$ such that
+$\lambda \in \mu$ and $\lambda' \in \mu$ whenever $g(p \mid \lambda) = g(p \mid
+\lambda')$
+for all $p$.
+
+The equivalence class $\mu$ containing the true $\lambda$ is the **reduced
+form** relevant for price data.
+
+An observer who knows the infinite price history learns
+$\mu$ but not necessarily $\lambda$.
+
+Once that distinction is clear, Bayesian updating can be written down directly.
+
+### Bayesian updating
+
+An uninformed observer begins with a prior $h(\lambda)$ over $\lambda \in
+\Lambda$.
+
+If the observer could see endowments directly, the posterior would be
+
+$$
+h(\lambda \mid \omega^1, \ldots, \omega^t)
+ = \frac{h(\lambda)\, \prod_{\tau=1}^{t} f(\omega^\tau \mid \lambda)}
+ {\displaystyle\sum_{\lambda' \in \Lambda}
+ h(\lambda')\, \prod_{\tau=1}^{t} f(\omega^\tau \mid \lambda')},
+$$
+
+and the paper appeals to a Bayesian consistency result to conclude that this
+posterior concentrates on the true structure $\bar \lambda$.
+
+After observing the price sequence $(p^1, \ldots, p^t)$, the observer's Bayesian
+posterior is
+
+$$
+h(\lambda \mid p^1, \ldots, p^t)
+ = \frac{h(\lambda)\, \prod_{\tau=1}^{t} g(p^\tau \mid \lambda)}
+ {\displaystyle\sum_{\lambda' \in \Lambda}
+ h(\lambda')\, \prod_{\tau=1}^{t} g(p^\tau \mid \lambda')}.
+$$
+
+Price data cannot distinguish structures inside the same reduced-form class.
+
+Indeed, if
+$\lambda$ and $\lambda'$ belong to the same class $\mu$, then
+$g(\cdot \mid \lambda) = g(\cdot \mid \lambda')$, so
+
+$$
+\frac{h(\lambda \mid p^1, \ldots, p^t)}
+ {h(\lambda' \mid p^1, \ldots, p^t)}
+= \frac{h(\lambda)}{h(\lambda')}
+$$
+
+for every sample history, so the relative odds within an observationally
+equivalent class never change.
+
+At time $t$, the observer's price expectations for the next period are
+
+$$
+g(p^{t+1} \mid p^1, \ldots, p^t)
+ = \sum_{\lambda \in \Lambda} g(p^{t+1} \mid \lambda)\,
+ h(\lambda \mid p^1, \ldots, p^t).
+$$
+
+### The convergence theorem
+
+```{prf:theorem} Bayesian Convergence
+:label: ime_theorem_bayesian_convergence
+
+Let $\bar\lambda$ be the true
+structural parameter and $\bar\mu$ the reduced form that contains $\bar\lambda$.
+
+Assume the prior assigns positive probability to the reduced-form class $\bar\mu$.
+
+Define the posterior mass on a reduced-form class by
+
+$$
+H_t(\mu) = \sum_{\lambda \in \mu} h(\lambda \mid p^1, \ldots, p^t).
+$$
+
+Because all structures inside a class imply the same $g(\cdot \mid \lambda)$,
+the
+predictive density can equivalently be written as
+
+$$
+g(p^{t+1} \mid p^1, \ldots, p^t)
+ = \sum_{\mu} g(p^{t+1} \mid \mu)\, H_t(\mu).
+$$
+
+Then
+
+$$
+\lim_{t \to \infty} H_t(\mu)
+ = \begin{cases} 1 & \text{if } \mu = \bar\mu, \\ 0 & \text{otherwise,}
+ \end{cases}
+$$
+
+with probability one.
+
+Consequently,
+
+$$
+\lim_{t \to \infty} g(p^{t+1} \mid p^1, \ldots, p^t) = g(p \mid \bar\mu),
+$$
+
+which equals the rational-expectations price distribution for a fully informed
+observer.
+```
+
+```{note}
+Note that the theorem only requires the prior to assign positive probability to the reduced-form class $\bar\mu$ that contains the true structure $\bar\lambda$.
+
+This is implied by, but weaker than, assigning positive probability to the true
+structural parameter $\bar\lambda$ itself.
+
+A prior could place zero mass on $\bar\lambda$
+while still placing positive mass on other structures inside $\bar\mu$.
+```
+
+The important distinction is that price observers need not learn $\bar \lambda$
+itself.
+
+They only learn which reduced-form class is correct.
+
+That is enough for forecasting because every $\lambda \in \bar \mu$ generates
+the same price density $g(\cdot \mid \bar \mu)$.
+
+Rational price expectations emerge from
+learning the
+reduced form, not from identifying every structural detail of the economy.
+
+Here "rational expectations" means that the observer's predictive distribution
+for next
+period's price matches the objective price distribution generated by the true
+reduced form.
+
+Let's now turn to a simple simulation.
+
+(bayesian_simulation)=
+## Simulating Bayesian learning from prices
+
+We illustrate the theorem with a two-state example.
+
+Two possible reduced forms $\mu_1$ and $\mu_2$ generate prices
+$p^t \sim N(\bar{p}_i, \sigma_p^2)$ for $i = 1, 2$ respectively.
+
+The observer knows the two possible price distributions (the reduced forms) but
+not which
+one governs the data.
+
+This is a **Bayesian model selection** problem we have seen in {doc}`likelihood_bayes`.
+
+With a prior $h_0$ on $\mu_1$ and the observed price $p^t$, the posterior weight
+on $\mu_1$
+after period $t$ is
+
+$$
+h_t = \frac{h_{t-1}\, g(p^t \mid \mu_1)}{h_{t-1}\, g(p^t \mid \mu_1)
+ + (1-h_{t-1})\, g(p^t \mid \mu_2)}.
+$$
+
+We consider a numerical example with two normal distributions that have different means.
+
+```{code-cell} ipython3
+def simulate_bayesian_learning(
+    p_bar_true, p_bar_alt, σ_p, T, h0, n_paths, seed=42
+):
+    """Simulate posterior learning between two Gaussian reduced forms.
+
+    Prices are drawn i.i.d. from the true model N(p_bar_true, σ_p**2); the
+    observer updates the posterior weight h on the true model by Bayes'
+    rule after each observation.  Returns an (n_paths, T + 1) array of
+    posterior paths whose column 0 is the prior h0.  Results are
+    reproducible via the fixed default seed.
+    """
+    rng = np.random.default_rng(seed)
+    h_paths = np.zeros((n_paths, T + 1))
+    h_paths[:, 0] = h0
+
+    for path in range(n_paths):
+        h = h0
+        prices = rng.normal(p_bar_true, σ_p, size=T)
+        for t, p in enumerate(prices):
+            # Likelihoods of the observed price under each reduced form
+            g_true = norm.pdf(p, loc=p_bar_true, scale=σ_p)
+            g_alt = norm.pdf(p, loc=p_bar_alt, scale=σ_p)
+            # Bayes' rule: h_t = h_{t-1} g_true / (h_{t-1} g_true + (1-h_{t-1}) g_alt)
+            denom = h * g_true + (1 - h) * g_alt
+            h = h * g_true / denom
+            h_paths[path, t + 1] = h
+
+    return h_paths
+
+
+def plot_bayesian_learning(h_paths, p_bar_true, p_bar_alt, ax):
+    """Plot posterior-belief paths over time on the given axis.
+
+    Draws each row of h_paths as a faint line, overlays the cross-path
+    median, and marks the limit weight 1 with a dashed horizontal line.
+    The p_bar_true and p_bar_alt arguments are accepted for interface
+    symmetry with the simulator but are not used in the plot itself.
+    """
+    T = h_paths.shape[1] - 1
+    t_grid = np.arange(T + 1)
+
+    # Individual simulated posterior paths (semi-transparent)
+    for path in h_paths:
+        ax.plot(t_grid, path, alpha=0.25, lw=0.8, color="steelblue")
+
+    # Median posterior across paths at each date
+    median_path = np.median(h_paths, axis=0)
+    ax.plot(t_grid, median_path, color="navy", lw=2, label="median posterior")
+
+    # Limit implied by the Bayesian convergence theorem
+    ax.axhline(
+        y=1.0,
+        color="black",
+        ls="--",
+        lw=1.2,
+        label="true model weight = 1",
+    )
+    ax.set_xlabel("period $t$", fontsize=12)
+    ax.set_ylabel(r"$h_t$ = posterior weight on true model", fontsize=12)
+    ax.legend(fontsize=10)
+```
+
+We consider two cases, one that is easy to learn and another one that is harder to learn,
+using $T = 300$ periods, $n = 40$ simulated paths, a diffuse prior $h_0 = 0.5$, and
+common standard deviation $\sigma_p = 0.4$.
+
+- *Easy case*: true model $N(2.0,\, 0.4^2)$, alternative $N(1.2,\, 0.4^2)$.
+- *Hard case*: true model $N(2.0,\, 0.4^2)$, alternative $N(1.8,\, 0.4^2)$.
+
+Whether learning is easy or hard depends on how close the true distribution is to the
+alternative hypothesis.
+
+```{code-cell} ipython3
+---
+mystnb:
+ figure:
+ caption: bayesian learning across paths
+ name: fig-bayesian-learning
+---
+T = 300        # number of observed periods
+h0 = 0.5       # diffuse prior
+n_paths = 40   # number of simulated posterior paths
+σ_p = 0.4      # common price standard deviation
+
+fig, axes = plt.subplots(1, 2, figsize=(12, 5))
+
+# Distinct reduced forms (easy case: means far apart)
+p_bar_true, p_bar_alt = 2.0, 1.2
+h_paths = simulate_bayesian_learning(p_bar_true, p_bar_alt, σ_p, T, h0, n_paths)
+plot_bayesian_learning(h_paths, p_bar_true, p_bar_alt, axes[0])
+
+# Similar reduced forms (hard case: means close together)
+p_bar_true, p_bar_alt = 2.0, 1.8
+h_paths_hard = simulate_bayesian_learning(
+    p_bar_true, p_bar_alt, σ_p, T, h0, n_paths
+)
+plot_bayesian_learning(h_paths_hard, p_bar_true, p_bar_alt, axes[1])
+
+plt.tight_layout()
+plt.show()
+```
+
+In both panels the posterior weight on the true model converges to 1 with
+probability one,
+though convergence is slower when the two price distributions are similar (right
+panel).
+
+### Price expectations vs. rational expectations
+
+We now verify that the observer's price expectations converge to the
+rational-expectations
+distribution $g(p \mid \bar\mu)$.
+
+We use the parameterization of the "hard-to-learn" example above
+($\bar{p}_{\text{true}} = 2.0$, $\bar{p}_{\text{alt}} = 1.8$, $\sigma_p = 0.4$),
+extending to $T = 1{,}000$ periods with a single simulated path and prior $h_0 = 0.5$.
+
+```{code-cell} ipython3
+---
+mystnb:
+ figure:
+ caption: price distribution convergence
+ name: fig-price-convergence
+---
+def price_expectation(h_t, p_bar_true, p_bar_alt, σ_p, p_grid):
+ """Return the predictive price density at posterior weight h_t."""
+ return (
+ h_t * norm.pdf(p_grid, loc=p_bar_true, scale=σ_p)
+ + (1 - h_t) * norm.pdf(p_grid, loc=p_bar_alt, scale=σ_p)
+ )
+
+
+p_bar_true, p_bar_alt = 2.0, 1.8
+σ_p = 0.4
+n_paths = 1
+T_long = 1000
+
+h_paths_long = simulate_bayesian_learning(
+ p_bar_true, p_bar_alt, σ_p, T_long, h0=0.5, n_paths=n_paths, seed=7
+)
+
+p_grid = np.linspace(0.0, 3.5, 300)
+re_density = norm.pdf(p_grid, loc=p_bar_true, scale=σ_p)
+
+fig, ax = plt.subplots(figsize=(8, 5))
+snapshots = [0, 25, 100, 300, 1000]
+palette = plt.cm.Blues(np.linspace(0.3, 1.0, len(snapshots)))
+
+for t_snap, col in zip(snapshots, palette):
+ h_t = h_paths_long[0, t_snap]
+ dens = price_expectation(h_t, p_bar_true, p_bar_alt, σ_p, p_grid)
+ ax.plot(
+ p_grid,
+ dens,
+ color=col,
+ lw=2,
+ label=rf"$t = {t_snap}$, $h_t = {h_t:.3f}$",
+ )
+
+ax.plot(p_grid, re_density, "k--", lw=2,
+ label=r"rational expectations $g(p \mid \bar{\mu})$")
+ax.set_xlabel("price $p$", fontsize=12)
+ax.set_ylabel("density", fontsize=12)
+ax.legend(fontsize=9)
+plt.tight_layout()
+plt.show()
+```
+
+The sequence of predictive densities (shades of blue) converges to the
+rational-expectations
+density (dashed black line) as experience accumulates.
+
+This illustrates {prf:ref}`ime_theorem_bayesian_convergence`.
+
+We can now sharpen the point by looking at a case in which the reduced form is
+learned but the underlying structure is not.
+
+(km_extension_nonidentification)=
+### Learning the reduced form without identifying the structure
+
+The convergence result is particularly striking because the observer converges
+to
+*rational expectations* even when the underlying **structure** $\lambda$ is
+*not identified* by prices.
+
+To illustrate this, consider a case with *three* possible structures
+$\lambda^{(1)}, \lambda^{(2)}, \lambda^{(3)}$ but only *two* reduced forms
+$\mu_1 = \{\lambda^{(1)}, \lambda^{(2)}\}$ and $\mu_2 = \{\lambda^{(3)}\}$
+(because $\lambda^{(1)}$ and $\lambda^{(2)}$ generate the same price
+distribution).
+
+We continue with the hard-to-learn parameterization, so the three structures
+have price means $\bar{p}_1 = \bar{p}_2 = 2.0$ and $\bar{p}_3 = 1.8$, with
+common standard deviation $\sigma_p = 0.4$, a uniform prior
+$h_0 = (1/3, 1/3, 1/3)$, and $T = 400$ periods over $30$ paths.
+
+The true structure is $\lambda^{(1)}$.
+
+```{code-cell} ipython3
+---
+mystnb:
+ figure:
+ caption: learning with non-identification
+ name: fig-nonidentification
+---
+def simulate_learning_3struct(
+ T, h0_vec, p_bar_vec, σ_p, true_idx, n_paths, seed=0
+):
+ """Simulate learning with three structures and two reduced forms."""
+ rng = np.random.default_rng(seed)
+ h_paths = np.zeros((n_paths, T + 1, 3))
+ h_paths[:, 0, :] = h0_vec
+
+ for path in range(n_paths):
+ h = np.array(h0_vec, dtype=float)
+ prices = rng.normal(p_bar_vec[true_idx], σ_p, size=T)
+ for t, p in enumerate(prices):
+ likelihoods = norm.pdf(p, loc=p_bar_vec, scale=σ_p)
+ h = h * likelihoods
+ h /= h.sum()
+ h_paths[path, t + 1, :] = h
+
+ return h_paths
+
+
+# Structures 0 and 1 share the same reduced form
+p_bar_vec = np.array([2.0, 2.0, 1.8])
+h0_vec = np.array([1 / 3, 1 / 3, 1 / 3])
+σ_p = 0.4
+T = 400
+true_idx = 0 # Structure 0 is observationally equivalent to 1
+
+h_paths_3 = simulate_learning_3struct(
+ T, h0_vec, p_bar_vec, σ_p, true_idx, n_paths=30
+)
+t_grid = np.arange(T + 1)
+
+fig, axes = plt.subplots(1, 3, figsize=(13, 4), sharey=True)
+struct_labels = [
+ r"$\lambda^{(1)}$",
+ r"$\lambda^{(2)}$",
+ r"$\lambda^{(3)}$",
+]
+
+for k, (ax, label) in enumerate(zip(axes, struct_labels)):
+ for path in h_paths_3:
+ ax.plot(t_grid, path[:, k], alpha=0.25, lw=0.8, color="steelblue")
+ ax.plot(t_grid, np.median(h_paths_3[:, :, k], axis=0),
+ color="navy", lw=2, label=f"median weight on {label}")
+ ax.set_xlabel("period $t$", fontsize=11)
+ ax.legend(fontsize=9)
+
+axes[0].set_ylabel("posterior weight", fontsize=11)
+plt.tight_layout()
+plt.show()
+```
+
+The observer correctly rules out $\lambda^{(3)}$ (the wrong reduced form) with
+probability
+one, but cannot distinguish $\lambda^{(1)}$ from $\lambda^{(2)}$ because they
+generate an
+identical price distribution.
+
+Nevertheless, the observer's **price expectations** converge
+to rational expectations because both structures imply the same reduced form
+$\bar\mu$.
+
+
+## Exercises
+
+```{exercise}
+:label: km_ex1
+
+**CARA portfolio utility and the stock-market interpretation.**
+
+Consider a two-state economy ($a_1 = 2$, $a_2 = 0.5$) where the informed agent has
+**CARA** (constant absolute risk aversion) preferences over portfolio wealth:
+
+$$
+u(W) = -e^{-\gamma W}, \quad W = x_2 + \bar{a}\, x_1.
+$$
+
+The agent chooses $x_1$ to maximize
+
+$$
+q\,u(W_1) + (1-q)\,u(W_2), \quad W_s = w - p\,x_1 + a_s\,x_1,
+$$
+
+subject to the budget constraint $p\,x_1 + x_2 = w$.
+
+Total supply of good 1 is $X_1 = 1$.
+
+1. Derive the first-order condition for the informed agent's optimal $x_1$.
+
+1. Use the market-clearing condition $x_1 = 1$ (the informed agent absorbs the
+ entire supply) to obtain an implicit equation for the equilibrium price
+ $p^*(q)$, and solve it numerically for $q \in (0,1)$ and several values of
+ $\gamma$.
+
+1. Show *analytically* that $p^*(q)$ admits the closed form
+
+ $$
+ p^*(q) = \frac{a_2 + R(q,\gamma)\, a_1}{1 + R(q,\gamma)},
+ \qquad R(q,\gamma) = \frac{q}{1-q}\, e^{-\gamma(a_1-a_2)},
+ $$
+
+ and verify that $p^*(q)$ is strictly increasing in $q$.
+```
+
+```{solution-start} km_ex1
+:class: dropdown
+```
+
+For the first-order condition, define $W_s = w + (a_s - p)\,x_1$ for
+$s = 1, 2$.
+
+Then the FOC is
+
+$$
+q\,(a_1 - p)\,\gamma\, e^{-\gamma W_1}
+= (1-q)\,(p - a_2)\,\gamma\, e^{-\gamma W_2},
+$$
+
+or equivalently (dividing by $\gamma$ and rearranging)
+
+$$
+q\,(a_1 - p)\, e^{-\gamma(a_1-p) x_1}
+ = (1-q)\,(p - a_2)\, e^{\gamma(p-a_2) x_1}.
+$$
+
+Setting $x_1 = 1$ (the informed agent absorbs all supply), this becomes a
+scalar root-finding problem in $p$:
+
+$$
+F(p;\,q,\gamma) \equiv
+ q\,(a_1-p)\,e^{-\gamma(a_1-p)} - (1-q)\,(p-a_2)\,e^{\gamma(p-a_2)} = 0.
+$$
+
+```{code-cell} ipython3
+from scipy.optimize import brentq
+
+def F_cara(p, q, a1, a2, γ, x1=1.0):
+ """Residual for the CARA equilibrium condition."""
+ return (q * (a1 - p) * np.exp(-γ * (a1 - p) * x1)
+ - (1 - q) * (p - a2) * np.exp(γ * (p - a2) * x1))
+
+a1, a2 = 2.0, 0.5
+q_grid = np.linspace(0.05, 0.95, 200)
+γ_values = [0.5, 1.0, 2.0, 5.0]
+colors_sol = plt.cm.plasma(np.linspace(0.15, 0.85, len(γ_values)))
+
+fig, ax = plt.subplots(figsize=(8, 5))
+for γ, color in zip(γ_values, colors_sol):
+ p_eq = [brentq(F_cara, a2, a1,
+ args=(q, a1, a2, γ))
+ for q in q_grid]
+ ax.plot(q_grid, p_eq, lw=2, color=color,
+ label=rf"$\gamma = {γ}$")
+
+ax.set_xlabel(r"posterior $q = \Pr(\bar a = a_1)$", fontsize=12)
+ax.set_ylabel("equilibrium price $p^*(q)$", fontsize=12)
+ax.set_title("CARA preferences: equilibrium prices", fontsize=12)
+ax.legend(fontsize=10)
+plt.tight_layout()
+plt.show()
+```
+
+The price is strictly increasing in $q$ for every $\gamma > 0$.
+
+For the closed form, start from the FOC at $x_1 = 1$ and divide both sides by
+$(1-q)\,(p - a_2)\,e^{-\gamma(a_1-p)}$ to combine the exponentials:
+
+$$
+\frac{q\,(a_1 - p)}{(1-q)\,(p - a_2)} = e^{\gamma(a_1 - a_2)}.
+$$
+
+Rearranging gives
+
+$$
+\frac{p - a_2}{a_1 - p} = \frac{q}{1-q}\, e^{-\gamma(a_1 - a_2)}
+\equiv R(q,\gamma),
+$$
+
+and solving the resulting linear equation in $p$ yields
+
+$$
+p^*(q) = \frac{a_2 + R(q,\gamma)\, a_1}{1 + R(q,\gamma)}.
+$$
+
+Since $R(q,\gamma)$ is strictly increasing in $q$ and
+$dp^*/dR = (a_1 - a_2)/(1 + R)^2 > 0$, the equilibrium price $p^*(q)$ is
+strictly increasing in $q$.
+
+This exercise uses the stock-market interpretation emphasized by
+{cite:t}`kihlstrom_mirman1975`.
+
+Portfolio wealth is $W = x_2 + \bar{a}\, x_1$, so $a x_1$ and $x_2$ are perfect
+substitutes in each state.
+
+Hence the elasticity of substitution between the two arguments of
+$u(a x_1, x_2)$ is infinite, corresponding to the $\sigma > 1$ side of
+{prf:ref}`ime_theorem_invertibility_conditions`.
+
+The difference is that this example is not the full equilibrium model the theorem analyzes, but rather a partial equilibrium model with a single informed agent and a fixed supply of the risky asset.
+
+```{solution-end}
+```
+
+```{exercise}
+:label: km_ex2
+
+In the Bayesian learning simulation, the speed of
+convergence to rational expectations is determined by the **Kullback-Leibler
+divergence**
+between the two reduced forms.
+
+The KL divergence $D_{KL}(\mu_1 \| \mu_2)$ from $g(\cdot \mid \mu_1)$ to
+$g(\cdot \mid \mu_2)$, for two normal distributions with means $\bar{p}_1$ and
+$\bar{p}_2$ and common variance $\sigma_p^2$, is
+
+$$
+D_{KL}(\mu_1 \| \mu_2) = \frac{(\bar{p}_1 - \bar{p}_2)^2}{2\sigma_p^2},
+$$
+
+which is symmetric in the two means under equal variances.
+
+1. For the "easy" case ($\bar{p}_1 = 2.0$, $\bar{p}_2 = 1.2$) and the "hard"
+ case
+($\bar{p}_1 = 2.0$, $\bar{p}_2 = 1.8$), compute $D_{KL}$ for $\sigma_p = 0.4$.
+
+1. Re-run the simulations from the lecture for both cases with $n=100$ paths.
+ For each
+path compute the first period $T_{0.99}$ at which $h_t \geq 0.99$. Plot
+histograms of
+$T_{0.99}$ for both cases.
+
+1. How does the median $T_{0.99}$ scale with $D_{KL}$? Verify numerically that
+roughly $T_{0.99} \approx C / D_{KL}$ for some constant $C$.
+```
+
+```{solution-start} km_ex2
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} ipython3
+σ_p = 0.4
+
+def kl_normal(p1, p2, σ):
+ """Return the KL divergence for N(p1, σ^2) and N(p2, σ^2)."""
+ return (p1 - p2)**2 / (2 * σ**2)
+
+cases = [("Easy", 2.0, 1.2), ("Hard", 2.0, 1.8)]
+for name, p1, p2 in cases:
+ kl = kl_normal(p1, p2, σ_p)
+ print(f"{name} case: D_KL = {kl:.4f}")
+
+n_paths = 100
+
+fig, axes = plt.subplots(1, 2, figsize=(11, 4))
+for ax, (name, p1, p2) in zip(axes, cases):
+ kl = kl_normal(p1, p2, σ_p)
+ paths = simulate_bayesian_learning(p1, p2, σ_p, T=2000,
+ h0=0.5, n_paths=n_paths, seed=42)
+ # First period with posterior >= 0.99
+ T99 = []
+ for path in paths:
+ idx = np.where(path >= 0.99)[0]
+ T99.append(idx[0] if len(idx) > 0 else 2001)
+
+ median_T = np.median(T99)
+ ax.hist(T99, bins=20, color="steelblue", edgecolor="white", alpha=0.8)
+ ax.axvline(median_T, color="crimson", lw=2,
+ label=fr"Median $T_{{0.99}} = {median_T:.0f}$")
+ ax.set_title(
+ f"{name}: $D_{{KL}} = {kl:.4f}$, "
+ fr"$\widehat C = T_{{0.99}} D_{{KL}} \approx {median_T * kl:.1f}$",
+ fontsize=11
+ )
+ ax.set_xlabel(r"$T_{0.99}$", fontsize=12)
+ ax.set_ylabel("count", fontsize=11)
+ ax.legend(fontsize=10)
+
+plt.tight_layout()
+plt.show()
+```
+
+The median $T_{0.99}$ scales as approximately $C/D_{KL}$, confirming that
+learning is
+faster when the two reduced forms are more easily distinguished (large
+$D_{KL}$).
+
+```{solution-end}
+```
+
+```{exercise}
+:label: km_ex3
+
+{prf:ref}`ime_theorem_bayesian_convergence` requires the prior to assign
+positive probability to the true reduced-form class $\bar\mu$, equivalently to
+some structure that generates the true price distribution
+$g(\cdot \mid \bar\mu)$.
+
+In this exercise the true reduced form itself is excluded from the prior
+support, so we investigate what happens when no model in the prior generates the
+true price distribution.
+
+Simulate $T = 1{,}000$ periods of prices from $N(2.0, 0.4^2)$ but use a prior
+that places equal weight on two *wrong* models: $N(1.5, 0.4^2)$ and
+$N(2.3, 0.4^2)$.
+
+Plot the posterior weight on each model over time.
+
+Discuss your findings.
+```
+
+```{solution-start} km_ex3
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} ipython3
+def simulate_misspecified(
+ T, p_bar_true, p_bar_wrong, σ_p, h0, n_paths, seed=0
+):
+ """Simulate learning under a misspecified two-model prior."""
+ rng = np.random.default_rng(seed)
+ h_paths = np.zeros((n_paths, T + 1, 2))
+ h_paths[:, 0, :] = h0
+
+ for path in range(n_paths):
+ h = np.array(h0, dtype=float)
+ prices = rng.normal(p_bar_true, σ_p, size=T)
+ for t, price in enumerate(prices):
+ likes = norm.pdf(price, loc=p_bar_wrong, scale=σ_p)
+ h = h * likes
+ h /= h.sum()
+ h_paths[path, t + 1, :] = h
+
+ return h_paths
+
+
+def predictive_density(weights, means, σ_p, p_grid):
+ """Return the predictive density under the current posterior weights."""
+ density = np.zeros_like(p_grid)
+ for weight, mean in zip(weights, means):
+ density += weight * norm.pdf(p_grid, loc=mean, scale=σ_p)
+ return density
+
+
+T = 1000
+p_true = 2.0
+p_wrong = np.array([1.5, 2.3])
+σ_p = 0.4
+h0 = np.array([0.5, 0.5])
+n_paths = 30
+
+h_misspec = simulate_misspecified(T, p_true, p_wrong, σ_p, h0, n_paths)
+
+kl_vals = (p_true - p_wrong)**2 / (2 * σ_p**2)
+for mean, kl in zip(p_wrong, kl_vals):
+ print(f"KL(true || N({mean:.1f}, σ^2)) = {kl:.4f}")
+
+t_grid = np.arange(T + 1)
+fig, axes = plt.subplots(1, 2, figsize=(12, 4))
+
+labels = [r"$N(1.5, \sigma^2)$", r"$N(2.3, \sigma^2)$"]
+for ax, k, label in zip(axes, [0, 1], labels):
+ for path in h_misspec:
+ ax.plot(t_grid, path[:, k], alpha=0.2, lw=0.8, color="steelblue")
+ ax.plot(t_grid, np.median(h_misspec[:, :, k], axis=0),
+ color="navy", lw=2, label="median")
+ ax.set_title(f"Posterior weight on {label}", fontsize=11)
+ ax.set_xlabel("period $t$", fontsize=11)
+ ax.set_ylabel("posterior weight", fontsize=11)
+ ax.legend(fontsize=9)
+
+plt.tight_layout()
+plt.show()
+
+# Predictive density and mean along the median posterior path
+median_path = np.median(h_misspec, axis=0)
+p_grid = np.linspace(0.0, 3.5, 300)
+closer_idx = np.argmin(kl_vals)
+
+fig, ax = plt.subplots(figsize=(8, 4))
+colors = plt.cm.Blues(np.linspace(0.3, 1.0, 4))
+for t_snap, color in zip([0, 10, 100, T], colors):
+ dens = predictive_density(median_path[t_snap], p_wrong, σ_p, p_grid)
+ ax.plot(p_grid, dens, color=color, lw=2, label=f"t = {t_snap}")
+
+ax.plot(
+ p_grid,
+ norm.pdf(p_grid, loc=p_wrong[closer_idx], scale=σ_p),
+ "k--",
+ lw=2,
+ label="KL-best wrong model",
+)
+ax.set_xlabel("price $p$", fontsize=11)
+ax.set_ylabel("density", fontsize=11)
+ax.legend(fontsize=9)
+plt.tight_layout()
+plt.show()
+
+pred_mean = np.median(
+ h_misspec[:, :, 0] * p_wrong[0] + h_misspec[:, :, 1] * p_wrong[1], axis=0
+)
+print(f"True mean: {p_true}")
+print(f"Predictive mean at T={T}: {pred_mean[-1]:.4f}")
+print(f"Closer misspecified mean: {p_wrong[np.argmin(kl_vals)]:.1f}")
+```
+
+Here
+
+$$
+D_{KL}\bigl(N(2.0, 0.4^2)\,\|\,N(2.3, 0.4^2)\bigr)
+<
+D_{KL}\bigl(N(2.0, 0.4^2)\,\|\,N(1.5, 0.4^2)\bigr),
+$$
+
+so the model with mean $2.3$ is the KL-best approximation among the two wrong
+models, and in the simulation posterior weight concentrates on that model.
+
+Posterior odds are cumulative {doc}`likelihood ratios <likelihood_ratio_process>`.
+
+If we compare the two wrong Gaussian models $f$ and $g$, then under the true
+distribution $h$ the average log likelihood ratio satisfies
+
+$$
+\frac{1}{t} E_h[\log L_t] = K(h,g) - K(h,f).
+$$
+
+So if $f$ is KL-closer to $h$ than $g$ is, $\log L_t$ has positive drift and
+posterior odds tilt toward $f$.
+
+```{solution-end}
+```
diff --git a/lectures/inventory_q.md b/lectures/inventory_q.md
index e9c9de9ce..9095b5727 100644
--- a/lectures/inventory_q.md
+++ b/lectures/inventory_q.md
@@ -35,7 +35,7 @@ A firm must decide how much stock to order each period, facing uncertain demand
We approach the problem in two ways.
First, we solve it exactly using dynamic programming, assuming full knowledge of
-the model — the demand distribution, cost parameters, and transition dynamics.
+the model -- the demand distribution, cost parameters, and transition dynamics.
Second, we show how a manager can learn the optimal policy from experience alone, using [Q-learning](https://en.wikipedia.org/wiki/Q-learning).
@@ -475,7 +475,7 @@ All the manager needs to observe at each step is:
4. the discount factor $\beta$, which is determined by the interest rate, and
5. the next inventory level $X_{t+1}$ (which they can read off the warehouse).
-These are all directly observable quantities — no model knowledge is required.
+These are all directly observable quantities -- no model knowledge is required.
### The Q-table and the role of the max
@@ -483,7 +483,7 @@ These are all directly observable quantities — no model knowledge is required.
It is important to understand how the update rule relates to the manager's
actions.
-The manager maintains a **Q-table** — a lookup table storing an estimate $q_t(x,
+The manager maintains a **Q-table** -- a lookup table storing an estimate $q_t(x,
a)$ for every state-action pair $(x, a)$.
At each step, the manager is in some state $x$ and must choose a specific action
@@ -492,7 +492,7 @@ and next state $X_{t+1}$, and updates *that one entry* $q_t(x, a)$ of the
table using the rule above.
It is tempting to read the $\max_{a'}$ in the update rule as prescribing the
-manager's next action — that is, to interpret the update as saying "move to
+manager's next action -- that is, to interpret the update as saying "move to
state $X_{t+1}$ and take an action in $\argmax_{a'} q_t(X_{t+1}, a')$."
But the $\max$ plays a different role.
@@ -512,7 +512,7 @@ The rule governing how the manager chooses actions is called the **behavior poli
Because the $\max$ in the update target always points toward $q^*$
regardless of how the manager selects actions, the behavior policy affects only
-which $(x, a)$ entries get visited — and hence updated — over time.
+which $(x, a)$ entries get visited -- and hence updated -- over time.
In the reinforcement learning literature, this property is called **off-policy**
learning: the convergence target ($q^*$) does not depend on the behavior policy.
@@ -521,8 +521,8 @@ As long as every $(x, a)$ pair is visited infinitely often (so that every entry
of the Q-table receives infinitely many updates) and the learning rates satisfy
standard conditions (see below), the Q-table converges to $q^*$.
-The behavior policy affects the *speed* of convergence — visiting important
-state-action pairs more frequently leads to faster learning — but not the
+The behavior policy affects the *speed* of convergence -- visiting important
+state-action pairs more frequently leads to faster learning -- but not the
*limit*.
In practice, we want the manager to mostly take good actions (to earn reasonable
@@ -555,11 +555,11 @@ The stochastic demand shocks naturally drive the manager across different invent
A simple but powerful technique for accelerating learning is **optimistic initialization**: instead of starting the Q-table at zero, we initialize every entry to a value above the true optimum.
-Because every untried action looks optimistically good, the agent is "disappointed" whenever it tries one — the update pulls that entry down toward reality. This drives the agent to try other actions (which still look optimistically high), producing broad exploration of the state-action space early in training.
+Because every untried action looks optimistically good, the agent is "disappointed" whenever it tries one -- the update pulls that entry down toward reality. This drives the agent to try other actions (which still look optimistically high), producing broad exploration of the state-action space early in training.
This idea is sometimes called **optimism in the face of uncertainty** and is widely used in both bandit and reinforcement learning settings.
-In our problem, the value function $v^*$ ranges from about 13 to 18. We initialize the Q-table at 20 — modestly above the true maximum — to ensure optimistic exploration without being so extreme as to distort learning.
+In our problem, the value function $v^*$ ranges from about 13 to 18. We initialize the Q-table at 20 -- modestly above the true maximum -- to ensure optimistic exploration without being so extreme as to distort learning.
### Implementation
@@ -581,7 +581,7 @@ def greedy_policy_from_q(q, K):
return σ
```
-The Q-learning loop runs for `n_steps` total steps in a single continuous trajectory — just as a real manager would learn from the ongoing stream of data.
+The Q-learning loop runs for `n_steps` total steps in a single continuous trajectory -- just as a real manager would learn from the ongoing stream of data.
At specified step counts (given by `snapshot_steps`), we record the current greedy policy.
diff --git a/lectures/lagrangian_lqdp.md b/lectures/lagrangian_lqdp.md
index f1e680cc6..4d80bb632 100644
--- a/lectures/lagrangian_lqdp.md
+++ b/lectures/lagrangian_lqdp.md
@@ -451,11 +451,16 @@ solves. See {cite}`Ljungqvist2012`, ch 12.
## Application
-Here we demonstrate the computation with an example which is the deterministic version of an example borrowed from this [quantecon lecture](https://python.quantecon.org/lqcontrol.html).
+Here we demonstrate the computation with the deterministic permanent-income example from this {doc}`lqcontrol`.
+
+Because that model is discounted, we apply the invariant-subspace method to the
+equivalent *undiscounted* system obtained from the transformed matrices
+$\hat A = \beta^{1/2} A$ and $\hat B = \beta^{1/2} B$.
```{code-cell} ipython3
# Model parameters
r = 0.05
+β = 1 / (1 + r)
c_bar = 2
μ = 1
@@ -468,7 +473,7 @@ B = [[-1],
[0]]
# Construct an LQ instance
-lq = LQ(Q, R, A, B)
+lq = LQ(Q, R, A, B, beta=β)
```
Given matrices $A$, $B$, $Q$, $R$, we can then compute $L$, $N$, and $M=L^{-1}N$.
@@ -476,7 +481,7 @@ Given matrices $A$, $B$, $Q$, $R$, we can then compute $L$, $N$, and $M=L^{-1}N$
```{code-cell} ipython3
def construct_LNM(A, B, Q, R):
- n, k = lq.n, lq.k
+ n = A.shape[0]
# construct L and N
L = np.zeros((2*n, 2*n))
@@ -496,7 +501,10 @@ def construct_LNM(A, B, Q, R):
```
```{code-cell} ipython3
-L, N, M = construct_LNM(lq.A, lq.B, lq.Q, lq.R)
+A_bar = lq.A * lq.beta ** (1/2)
+B_bar = lq.B * lq.beta ** (1/2)
+
+L, N, M = construct_LNM(A_bar, B_bar, lq.Q, lq.R)
```
```{code-cell} ipython3
@@ -517,7 +525,7 @@ M @ J @ M.T - J
We can compute the eigenvalues of $M$ using `np.linalg.eigvals`, arranged in ascending order.
```{code-cell} ipython3
-eigvals = sorted(np.linalg.eigvals(M))
+eigvals = sorted(np.linalg.eigvals(M), key=lambda z: (abs(z), z.real, z.imag))
eigvals
```
@@ -529,18 +537,14 @@ When we apply Schur decomposition such that $M=V W V^{-1}$, we want
To get what we want, let's define a sorting function that tells `scipy.schur` to sort the corresponding eigenvalues with modulus smaller than 1 to the upper left.
```{code-cell} ipython3
-stable_eigvals = eigvals[:n]
+tol = 1e-10
def sort_fun(x):
- "Sort the eigenvalues with modules smaller than 1 to the top-left."
-
- if x in stable_eigvals:
- stable_eigvals.pop(stable_eigvals.index(x))
- return True
- else:
- return False
+ "Sort the eigenvalues with modulus smaller than 1 to the top-left."
+ return abs(x) < 1 - tol
-W, V, _ = schur(M, sort=sort_fun)
+W, V, stable_dim = schur(M, sort=sort_fun)
+stable_dim
```
```{code-cell} ipython3
@@ -584,25 +588,24 @@ def stable_solution(M, verbose=True):
The matrix represents the linear difference equations system.
"""
n = M.shape[0] // 2
- stable_eigvals = list(sorted(np.linalg.eigvals(M))[:n])
+ tol = 1e-10
def sort_fun(x):
- "Sort the eigenvalues with modules smaller than 1 to the top-left."
-
- if x in stable_eigvals:
- stable_eigvals.pop(stable_eigvals.index(x))
- return True
- else:
- return False
-
- W, V, _ = schur(M, sort=sort_fun)
+ "Sort the eigenvalues with modulus smaller than 1 to the top-left."
+ return abs(x) < 1 - tol
+
+ W, V, stable_dim = schur(M, sort=sort_fun)
+ if stable_dim != n:
+ raise ValueError(
+ f"Expected {n} stable eigenvalues inside the unit circle, found {stable_dim}."
+ )
if verbose:
print('eigenvalues:\n')
print(' W11: {}'.format(np.diag(W[:n, :n])))
print(' W22: {}'.format(np.diag(W[n:, n:])))
- # compute V21 V11^{-1}
- P = V[n:, :n] @ np.linalg.inv(V[:n, :n])
+ # compute V21 V11^{-1} without forming the inverse explicitly
+ P = np.linalg.solve(V[:n, :n].T, V[n:, :n].T).T
return W, V, P
@@ -761,11 +764,6 @@ For example, when $\beta=\frac{1}{1+r}$, we can solve for $P$ with $\hat{A}=\bet
These settings are adopted by default in the function `stationary_P` defined above.
-```{code-cell} ipython3
-β = 1 / (1 + r)
-lq.beta = β
-```
-
```{code-cell} ipython3
stationary_P(lq)
```
diff --git a/lectures/lqcontrol.md b/lectures/lqcontrol.md
index 10f66284f..68edfe99b 100644
--- a/lectures/lqcontrol.md
+++ b/lectures/lqcontrol.md
@@ -1267,7 +1267,7 @@ The parameters are $r = 0.05, \beta = 1 / (1 + r), \bar c = 1.5, \mu = 2, \sigm
Here’s one solution.
-We use some fancy plot commands to get a certain style — feel free to
+We use some fancy plot commands to get a certain style -- feel free to
use simpler ones.
The model is an LQ permanent income / life-cycle model with hump-shaped
diff --git a/lectures/markov_perf.md b/lectures/markov_perf.md
index 49d870890..fd5bcbf70 100644
--- a/lectures/markov_perf.md
+++ b/lectures/markov_perf.md
@@ -140,7 +140,10 @@ v_i(q_i, q_{-i}) = \max_{\hat q_i}
\left\{\pi_i (q_i, q_{-i}, \hat q_i) + \beta v_i(\hat q_i, f_{-i}(q_{-i}, q_i)) \right\}
```
-**Definition** A **Markov perfect equilibrium** of the duopoly model is a pair of value functions $(v_1, v_2)$ and a pair of policy functions $(f_1, f_2)$ such that, for each $i \in \{1, 2\}$ and each possible state,
+```{prf:definition} Markov Perfect Equilibrium
+:label: def-markov-perfect-equilibrium
+
+A **Markov perfect equilibrium** of the duopoly model is a pair of value functions $(v_1, v_2)$ and a pair of policy functions $(f_1, f_2)$ such that, for each $i \in \{1, 2\}$ and each possible state,
* The value function $v_i$ satisfies Bellman equation {eq}`game4`.
* The maximizer on the right side of {eq}`game4` equals $f_i(q_i, q_{-i})$.
@@ -150,6 +153,7 @@ The adjective "Markov" denotes that the equilibrium decision rules depend only o
"Perfect" means complete, in the sense that the equilibrium is constructed by backward induction and hence builds in optimizing behavior for each firm at all possible future states.
* These include many states that will not be reached when we iterate forward on the pair of equilibrium strategies $f_i$ starting from a given initial state.
+```
### Computation
diff --git a/lectures/misspecified_recovery.md b/lectures/misspecified_recovery.md
new file mode 100644
index 000000000..71a073a65
--- /dev/null
+++ b/lectures/misspecified_recovery.md
@@ -0,0 +1,2324 @@
+---
+jupytext:
+ text_representation:
+ extension: .md
+ format_name: myst
+ format_version: 0.13
+ jupytext_version: 1.17.1
+kernelspec:
+ display_name: Python 3 (ipykernel)
+ language: python
+ name: python3
+---
+
+(misspecified_recovery)=
+```{raw} html
+
+```
+
+# Misspecified recovery
+
+```{contents} Contents
+:depth: 2
+```
+
+## Overview
+
+The lecture {doc}`ross_recovery` studies the case in which recovery is valid.
+
+There, **transition independence** lets us use Arrow prices to separate investors'
+beliefs from the pricing kernel.
+
+This lecture asks what the same Perron--Frobenius approach delivers when that
+restriction is not imposed.
+
+We will keep three probability measures separate.
+
+The first is the correctly specified probability measure, which governs the Markov
+state in the model.
+
+The second is the one-period risk-neutral probability measure, which comes from
+normalizing one-period Arrow prices by bond prices.
+
+The third is the probability measure recovered by Perron--Frobenius theory, also called
+the long-term risk-neutral measure.
+
+The central question is whether the recovered probability measure equals the correctly
+specified probability measure.
+
+{cite:t}`BorovickaHansenScheinkman2016` show that, in general, the answer is no.
+
+The paper studies the ratio of the recovered probability measure to the correctly
+specified probability measure.
+
+Recovery can fail because the stochastic discount factor can contain a martingale
+component that changes the probability measure.
+
+If that martingale component is identically one, Ross recovery returns the correctly
+specified transition probabilities.
+
+If it is not identically one, the recovered probability measure absorbs long-term risk
+adjustments, because martingale increments compound along histories.
+
+In the examples below, the recovered probability measure assigns more probability to
+adverse long-run-risk states than the correctly specified probability measure.
+
+We will:
+
+- use results from {doc}`ross_recovery` without re-proving them,
+- study misspecification through the martingale component,
+- show why recursive utility and permanent shocks make the recovered probability
+ measure differ from the correctly specified probability measure,
+- measure the difference in a long-run risk model.
+
+We will use the following imports.
+
+```{code-cell} ipython3
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy import linalg
+from scipy.integrate import solve_ivp
+from scipy.stats import gaussian_kde
+```
+
+The next cell contains code inherited from the previous lecture.
+
+It row-normalizes Arrow prices, finds the Perron--Frobenius eigenvalue and positive
+right eigenvector, and computes stationary distributions.
+
+```{code-cell} ipython3
+:tags: [hide-input]
+
+def risk_neutral_probs(Q):
+    """Normalize Arrow prices by one-period bond prices.
+
+    Parameters
+    ----------
+    Q : ndarray
+        Square matrix of one-period Arrow prices; Q[i, j] is the price in
+        state i of a claim paying one unit in state j next period.
+
+    Returns
+    -------
+    P_bar : ndarray
+        Row-stochastic one-period risk-neutral transition matrix.
+    q_bonds : ndarray
+        One-period discount bond prices, i.e. the row sums of Q.
+    """
+    # The bond price in state i is the price of a sure unit payoff,
+    # which is the sum of Arrow prices across next-period states.
+    q_bonds = Q.sum(axis=1)
+    P_bar = Q / q_bonds[:, None]
+    return P_bar, q_bonds
+
+
+def perron_frobenius(Q):
+    """Perron-Frobenius eigenpair and associated transition matrix.
+
+    Finds the eigenvalue exp(eta) of Q with a strictly positive right
+    eigenvector e, then builds the long-term risk-neutral transition
+    matrix P_hat with entries exp(-eta) * Q[i, j] * e[j] / e[i].
+
+    Parameters
+    ----------
+    Q : ndarray
+        Square nonnegative matrix of one-period Arrow prices.
+
+    Returns
+    -------
+    eta : float
+        Log of the Perron-Frobenius eigenvalue.
+    exp_eta : float
+        The Perron-Frobenius eigenvalue itself.
+    e : ndarray
+        Strictly positive right eigenvector, normalized to sum to one.
+    P_hat : ndarray
+        Long-term risk-neutral transition matrix.
+
+    Raises
+    ------
+    ValueError
+        If no positive eigenpair is found, or if the constructed matrix
+        fails the row-sum or nonnegativity checks.
+    """
+    eigenvalues, eigenvectors = linalg.eig(Q)
+    # Strip negligible imaginary parts introduced by the complex eig solver.
+    eigenvalues = np.real_if_close(eigenvalues, tol=1000)
+    eigenvectors = np.real_if_close(eigenvectors, tol=1000)
+
+    # Keep only exactly-real eigenpairs; complex pairs cannot be the
+    # Perron root of a nonnegative irreducible matrix.
+    real_mask = np.isreal(eigenvalues)
+    vals = np.asarray(eigenvalues[real_mask].real, dtype=float)
+    vecs = np.asarray(eigenvectors[:, real_mask].real, dtype=float)
+
+    # Scan real eigenvalues from largest to smallest, looking for a
+    # strictly positive eigenvector (after fixing an overall sign).
+    for idx in np.argsort(vals)[::-1]:
+        exp_eta = vals[idx]
+        e = vecs[:, idx]
+        if e.sum() < 0:
+            e = -e
+        if exp_eta > 0 and np.all(e > 0):
+            break
+    else:
+        # for-else: no candidate satisfied positivity.
+        raise ValueError("No strictly positive Perron-Frobenius eigenvector found")
+
+    # Normalize the eigenvector scale (eigenvectors are defined up to scale).
+    e = e / e.sum()
+    eta = np.log(exp_eta)
+    # Entry-wise: P_hat[i, j] = exp(-eta) * Q[i, j] * e[j] / e[i].
+    P_hat = (1 / exp_eta) * Q * e[None, :] / e[:, None]
+
+    # Sanity checks: P_hat must be a (numerically) stochastic matrix.
+    if np.max(np.abs(P_hat.sum(axis=1) - 1)) > 1e-8:
+        raise ValueError("Recovered transition matrix is not stochastic")
+    if P_hat.min() < -1e-10:
+        raise ValueError("Recovered transition matrix has negative entries")
+
+    return eta, exp_eta, e, P_hat
+
+
+def stationary_dist(P):
+    """Stationary distribution of an ergodic transition matrix.
+
+    Solves pi' P = pi' together with the normalization sum(pi) = 1 by
+    replacing the last equation of the singular system (P' - I) pi = 0
+    with the adding-up constraint.
+
+    Parameters
+    ----------
+    P : ndarray
+        Row-stochastic transition matrix, assumed ergodic.
+
+    Returns
+    -------
+    ndarray
+        The stationary distribution pi.
+    """
+    n = P.shape[0]
+    A = P.T - np.eye(n)
+    # Overwrite the last (redundant) equation with sum(pi) = 1.
+    A[-1] = 1
+    b = np.zeros(n)
+    b[-1] = 1
+    return linalg.solve(A, b)
+
+
+def martingale_increment(Q, P):
+    """Martingale increment for the recovered probability measure.
+
+    Computes H[i, j] = P_hat[i, j] / P[i, j] on feasible transitions,
+    where P_hat is the long-term risk-neutral transition matrix built
+    from the Arrow price matrix Q via perron_frobenius.
+
+    Parameters
+    ----------
+    Q : ndarray
+        Square matrix of one-period Arrow prices.
+    P : ndarray
+        Correctly specified transition matrix.
+
+    Returns
+    -------
+    H : ndarray
+        One-period martingale increments; entries with P[i, j] == 0 are
+        left at 1 by convention.
+    eta, e, P_hat
+        Pass-through of the Perron-Frobenius log eigenvalue,
+        eigenvector, and recovered transition matrix.
+    """
+    eta, exp_eta, e, P_hat = perron_frobenius(Q)
+    H = np.ones_like(P)
+    # Only take ratios where the true transition probability is positive.
+    # NOTE(review): a transition with P == 0 but P_hat > 0 would be
+    # silently mapped to 1 here — presumably Q inherits P's zero
+    # pattern, so this cannot occur; verify against callers.
+    mask = P > 0
+    H[mask] = P_hat[mask] / P[mask]
+    return H, eta, e, P_hat
+```
+
+## Three transition matrices
+
+Let $\mathbf{P}=[p_{ij}]$ denote the correctly specified transition matrix and
+$\mathbf{Q}=[q_{ij}]$ the Arrow price matrix.
+
+Here "correctly specified" means that $\mathbf{P}$ is the transition matrix that
+governs the Markov state in the model.
+
+The one-period stochastic discount factor (SDF) satisfies
+
+```{math}
+:label: eq-mr-arrow-price-finite
+
+q_{ij} = s_{ij} p_{ij}.
+```
+
+We will compare $\mathbf{P}$ with two probability matrices constructed from
+$\mathbf{Q}$.
+
+The first one is the **one-period risk-neutral matrix**.
+
+It divides each row of $\mathbf{Q}$ by the price of a one-period discount bond in the
+current state:
+
+$$
+\bar p_{ij}
+= \frac{q_{ij}}{\sum_k q_{ik}}.
+$$
+
+This matrix absorbs one-period risk adjustments into transition probabilities.
+
+The second one is the transition matrix associated with the **long-term risk-neutral
+probability**.
+
+It starts from the Perron--Frobenius eigenvalue and positive right eigenvector of
+$\mathbf{Q}$.
+
+Let $(\exp(\hat \eta), \hat e)$ solve
+
+```{math}
+:label: eq-mr-pf-finite
+
+\mathbf{Q}\hat e = \exp(\hat \eta)\hat e.
+```
+
+Then define
+
+```{math}
+:label: eq-mr-phat-finite
+
+\hat p_{ij}
+= \exp(-\hat \eta) q_{ij} \frac{\hat e_j}{\hat e_i}.
+```
+
+The factor $\hat e_j/\hat e_i$ is chosen to cancel any SDF component of the form
+$\exp(\hat \eta)\hat e_i/\hat e_j$.
+
+The result is a stochastic matrix $\hat{\mathbf{P}}$.
+
+This construction assumes that $\mathbf{Q}$ has a unique positive right eigenvector up
+to scale.
+
+For a finite irreducible nonnegative matrix, the Perron root has a strictly positive
+right eigenvector unique up to scale.
+
+For long-horizon dominance and convergence, one typically imposes a stronger condition
+such as primitivity or aperiodicity; the paper uses a positivity condition on
+$\sum_{t=0}^{\infty}\lambda^t\mathbf Q^t$.
+
+In general state spaces this guarantee does not carry over: multiple positive
+eigenfunctions may exist, and an additional selection condition is needed to pin down
+the long-term risk-neutral measure.
+
+The general framework in the next section makes that selection condition explicit.
+
+Following {cite:t}`BorovickaHansenScheinkman2016`, $\hat{\mathbf{P}}$ is called a
+**long-term risk-neutral** transition matrix.
+
+The name means that the Perron--Frobenius eigenvalue and eigenvector isolate the part
+of pricing that dominates long-maturity Arrow claims.
+
+It is not the same transition matrix as the one-period risk-neutral matrix
+$\bar{\mathbf{P}}$.
+
+In {doc}`ross_recovery`, transition independence restricts the SDF to
+
+$$
+s_{ij}=\exp(-\delta)\frac{m_j}{m_i}
+$$
+
+for a positive vector $m$ and scalar $\delta$, which pins down the split between
+$s_{ij}$ and $p_{ij}$.
+
+Here we drop that restriction.
+
+The question is whether the transition matrix associated with the long-term
+risk-neutral probability, $\hat{\mathbf{P}}$, still equals the correctly specified
+matrix $\mathbf{P}$.
+
+### Degenerate martingale component
+
+We start with a three-state economy: recession, normal, and expansion.
+
+The correctly specified transition matrix is deliberately simple.
+
+For trend-stationary consumption and power utility, the SDF is
+
+$$
+s_{ij}=A\left(\frac{c_j}{c_i}\right)^{-\gamma}.
+$$
+
+This is a case where Ross recovery should return the correctly specified transition
+matrix.
+
+```{code-cell} ipython3
+# Correctly specified transition matrix over
+# (recession, normal, expansion).
+P_true = np.array([
+    [0.70, 0.25, 0.05],
+    [0.15, 0.65, 0.20],
+    [0.05, 0.30, 0.65],
+])
+
+# Detrended consumption level in each state.
+c_levels = np.array([0.997, 1.000, 1.003])
+state_names = ['recession', 'normal', 'expansion']
+
+δ = -np.log(0.99)  # monthly subjective discount rate
+γ_power = 5.0  # coefficient of relative risk aversion
+g_c = 0.002  # monthly trend growth of consumption
+
+# Price Arrow claims as actual probabilities times the power-utility SDF:
+# s_ij = exp(-δ - γ g_c) (c_j / c_i)^(-γ), so Q = S ∘ P elementwise.
+S_power = (
+    np.exp(-δ - γ_power * g_c)
+    * (c_levels[None, :] / c_levels[:, None])**(-γ_power)
+)
+Q_power = S_power * P_true
+```
+
+We now compute the one-period risk-neutral matrix and the transition matrix associated
+with the long-term risk-neutral probability from the same Arrow price matrix.
+
+```{code-cell} ipython3
+# One-period risk-neutral matrix (row-normalized Arrow prices).
+P_bar, q_bonds = risk_neutral_probs(Q_power)
+# Long-term risk-neutral matrix from the Perron-Frobenius eigenpair.
+η_hat, exp_η, e_hat, P_hat = perron_frobenius(Q_power)
+# Stationary distributions under the three probability measures.
+π_true = stationary_dist(P_true)
+π_bar = stationary_dist(P_bar)
+π_hat = stationary_dist(P_hat)
+```
+
+These two matrices should not be expected to agree.
+
+The row-normalized matrix $\bar{\mathbf{P}}$ is a short-horizon risk-neutral change of
+measure: it folds the one-period SDF into transition probabilities, so it generally
+differs from the correctly specified matrix $\mathbf{P}$.
+
+The logic comes from the Perron--Frobenius construction in {doc}`ross_recovery`.
+
+In the transition-independent case, the pricing kernel has the form
+$s_{ij}=\exp(\hat\eta)\hat e_i/\hat e_j$.
+
+Substituting this into the Perron--Frobenius transition formula gives
+
+$$
+\hat p_{ij}
+= \exp(-\hat\eta) q_{ij}\frac{\hat e_j}{\hat e_i}
+= \exp(-\hat\eta)
+ \left(\exp(\hat\eta)\frac{\hat e_i}{\hat e_j}p_{ij}\right)
+ \frac{\hat e_j}{\hat e_i}
+=p_{ij}.
+$$
+
+Thus the transition matrix $\hat{\mathbf{P}}$ associated with the long-term
+risk-neutral probability cancels the transition-independent part of the SDF.
+
+In this power-utility benchmark, the whole SDF has exactly that form, so the remaining
+martingale increment should be one and $\hat{\mathbf{P}}$ should coincide with
+$\mathbf{P}$.
+
+The next calculation checks this by comparing the Perron--Frobenius eigenfunction with
+$c_i^\gamma$ and then computing the ratio $\hat{\mathbf{P}}/\mathbf{P}$.
+
+Define the one-period martingale increment
+
+$$
+\hat h_{ij}
+= \frac{\hat p_{ij}}{p_{ij}}
+= \exp(-\hat\eta)s_{ij}\frac{\hat e_j}{\hat e_i}.
+$$
+
+When $\hat h_{ij}=1$ for every transition, $\hat{\mathbf P}$ and $\mathbf P$ are the
+same.
+
+The next section explains why this ratio is the one-period martingale increment.
+
+In the power-utility example, write
+
+$$
+A = \exp(-\delta-\gamma g_c),
+\qquad
+s_{ij}=A\left(\frac{c_j}{c_i}\right)^{-\gamma}.
+$$
+
+Taking $\hat e_i=c_i^\gamma$, up to scale, gives
+
+$$
+[\mathbf{Q}\hat e]_i
+= \sum_j A\left(\frac{c_j}{c_i}\right)^{-\gamma}p_{ij}c_j^\gamma
+= A c_i^\gamma
+= A\hat e_i,
+$$
+
+so $\exp(\hat\eta)=A$.
+
+Consequently,
+
+$$
+\hat h_{ij}
+= A^{-1}A\left(\frac{c_j}{c_i}\right)^{-\gamma}
+ \frac{c_j^\gamma}{c_i^\gamma}
+=1.
+$$
+
+```{code-cell} ipython3
+# Martingale increments h_hat = P_hat / P on feasible transitions;
+# entries with P == 0 default to 1 via the `out`/`where` arguments.
+H_power = np.divide(P_hat, P_true, out=np.ones_like(P_true), where=P_true > 0)
+# Theoretical eigenfunction c^γ, to compare with the numerical one.
+e_theory = c_levels**γ_power
+
+print("Perron-Frobenius eigenfunction: numerical vs c^gamma")
+# Rescale both eigenfunctions by the normal-state entry before comparing.
+for name, e_num, e_th in zip(state_names, e_hat / e_hat[1],
+                             e_theory / e_theory[1]):
+    print(f"{name:9s}: {e_num:.6f}  {e_th:.6f}")
+
+print("\nmartingale increment h_hat = P_hat / P")
+print(np.round(H_power, 6))
+
+# Each row of h_hat should average to one under P (martingale property).
+print("\nconditional means under P")
+print(np.round((P_true * H_power).sum(axis=1), 6))
+
+print(f"\nmax |h_hat - 1| = "
+      f"{np.max(np.abs(H_power[P_true > 0] - 1)):.2e}")
+```
+
+The output separates a short-horizon risk adjustment from the Perron--Frobenius
+approach.
+
+The one-period risk-neutral matrix $\bar{\mathbf{P}}$ is close to, but not the same as,
+the correctly specified matrix $\mathbf{P}$.
+
+It changes the transition probabilities because one-period Arrow prices include
+one-period risk adjustments.
+
+By contrast, the long-term risk-neutral matrix $\hat{\mathbf{P}}$ is exactly the same
+as $\mathbf{P}$ in this example.
+
+The calculation confirms why: the martingale increment $\hat h_{ij}$ is one for every
+transition.
+
+This is the condition under which Ross recovery returns the correctly specified
+transition matrix.
+
+In this example, that cancellation exhausts the SDF, so the martingale component is
+degenerate.
+
+## Martingale component
+
+Let $(\hat \eta, \hat e)$ be the Perron--Frobenius eigenvalue exponent and positive
+right eigenvector of $\mathbf{Q}$:
+
+$$
+\mathbf{Q} \hat e = \exp(\hat\eta) \hat e.
+$$
+
+The associated long-term risk-neutral transition matrix is
+
+$$
+\hat p_{ij}
+= \exp(-\hat\eta) q_{ij} \frac{\hat e_j}{\hat e_i}.
+$$
+
+To see whether recovery has changed the probability measure, compare each recovered
+transition probability with the corresponding correctly specified transition
+probability.
+
+For feasible transitions with $p_{ij}>0$, define the one-period martingale increment
+
+```{math}
+:label: eq-mr-hhat-finite
+
+\hat h_{ij} = \frac{\hat p_{ij}}{p_{ij}}.
+```
+
+If $\hat h_{ij}>1$, the recovered probability measure assigns more probability to
+transition $(i,j)$ than the correctly specified probability measure.
+
+If $\hat h_{ij}<1$, it assigns less probability to that transition.
+
+For a fixed current state $i$, the numbers $\hat h_{ij}$ average to one under the
+correctly specified transition probabilities:
+
+$$
+\sum_j \hat h_{ij} p_{ij}=1.
+$$
+
+Thus $\hat h_{ij}$ is a one-period martingale increment.
+
+Multiplying these increments along a history of states gives the ratio of the recovered
+probability measure to the correctly specified probability measure for the whole
+history.
+
+That ratio process is a martingale, which is why the last term in
+{eq}`eq-mr-finite-sdf-decomposition` is called a martingale component.
+
+Using {eq}`eq-mr-arrow-price-finite`, {eq}`eq-mr-phat-finite`, and
+{eq}`eq-mr-hhat-finite`, the one-period SDF can be written as
+
+```{math}
+:label: eq-mr-finite-sdf-decomposition
+
+s_{ij}
+= \exp(\hat\eta) \frac{\hat e_i}{\hat e_j} \hat h_{ij}.
+```
+
+The Perron--Frobenius approach therefore separates the SDF into:
+
+| Part | Role |
+|---|---|
+| $\exp(\hat\eta)$ | deterministic long-run discounting |
+| $\hat e_i / \hat e_j$ | state-dependent long-run term |
+| $\hat h_{ij}$ | martingale increment that changes probabilities |
+
+If $\hat h_{ij}=1$ for every feasible transition, then the transition matrix associated
+with the recovered probability measure and the correctly specified transition matrix
+are the same.
+
+This is the condition under which Ross recovery returns the correctly specified
+transition matrix.
+
+```{prf:proposition} Finite-state martingale component
+:label: prop-misspecified-recovery-martingale-component
+
+Under the finite-state assumptions used in this lecture, for a Markov model with
+correctly specified transition matrix $\mathbf{P}$ and Arrow matrix $\mathbf{Q}$,
+the probability measure recovered by Perron--Frobenius theory returns the correctly
+specified transition matrix if and only if $\hat h_{ij}=1$ for every transition with
+$p_{ij}>0$.
+
+Equivalently, recovery returns the correctly specified transition matrix if and only if
+the SDF in {eq}`eq-mr-finite-sdf-decomposition` has no nonconstant martingale
+component:
+
+$$
+s_{ij}=\exp(\hat\eta)\frac{\hat e_i}{\hat e_j}.
+$$
+```
+
+```{prf:proof}
+Using $q_{ij}=s_{ij}p_{ij}$,
+
+$$
+\hat h_{ij}
+=\frac{\hat p_{ij}}{p_{ij}}
+=\exp(-\hat\eta)s_{ij}\frac{\hat e_j}{\hat e_i}.
+$$
+
+Thus $\hat{\mathbf{P}}=\mathbf{P}$ if and only if $\hat h_{ij}=1$ on every feasible
+transition.
+
+This condition is the same as saying that the SDF can be written as
+{eq}`eq-mr-finite-sdf-decomposition` with no extra martingale increment.
+```
+
+This finite-state implication is a special case of the paper's general identification
+result.
+
+If a pair $(S,P)$ explains asset prices and $H$ is a positive multiplicative
+martingale, then the same asset prices are also explained by the changed probability
+measure $P^H$ together with the adjusted stochastic discount factor
+
+$$
+S_t^H = S_t\frac{H_0}{H_t}.
+$$
+
+More generally, any strictly positive martingale can change probability measures, but
+multiplicativity preserves the Markov structure used here.
+
+Thus Arrow prices alone cannot usually distinguish a change in beliefs from a change in
+the SDF.
+
+Ross recovery becomes an identification result only after imposing a restriction such
+as
+
+$$
+S_t = \exp(-\delta t)\frac{m(X_t)}{m(X_0)},
+$$
+
+which rules out a nontrivial martingale component.
+
+The power-utility example above illustrates the proposition.
+
+In that benchmark, the martingale increment $\hat h_{ij}$ is identically one.
+
+## From matrices to the general framework
+
+The finite-state calculation has three objects:
+
+1. the correctly specified transition probabilities $p_{ij}$,
+2. the SDF increments $s_{ij}$,
+3. the Arrow prices $q_{ij}=s_{ij}p_{ij}$.
+
+It also has one diagnostic object:
+
+$$
+\hat h_{ij}
+= \frac{\hat p_{ij}}{p_{ij}}
+= \exp(-\hat\eta)s_{ij}\frac{\hat e_j}{\hat e_i}.
+$$
+
+The numbers $\hat h_{ij}$ are one-period martingale increments.
+
+They change the probability of a one-period transition from $p_{ij}$ to
+$\hat p_{ij}=\hat h_{ij}p_{ij}$.
+
+The general framework in {cite:t}`BorovickaHansenScheinkman2016` does the same thing
+without assuming that states are finite.
+
+The transition matrix becomes a Markov probability measure, the Arrow price matrix
+becomes a family of pricing operators, and the one-period ratios $\hat h_{ij}$ become
+increments of a positive multiplicative martingale.
+
+The point of this section is to build that dictionary.
+
+### Probability space and state
+
+Start with a probability space $(\Omega,\mathcal F,P)$.
+
+Here $P$ is the correctly specified probability measure.
+
+In the rational-expectations interpretation of the paper, this is the actual, or
+original, probability measure governing the state.
+
+In the finite-state section, $P$ was represented by the transition matrix
+$\mathbf P=[p_{ij}]$.
+
+The index set is either discrete time, $\mathbb T=\{0,1,2,\ldots\}$, or continuous
+time, $\mathbb T=\mathbb R_+$.
+
+The main state process is $X=\{X_t:t\in\mathbb T\}$, which is stationary and Markov
+under $P$.
+
+A second process $W=\{W_t:t\in\mathbb T\}$ records shocks that drive $X$ and other
+economic quantities.
+
+In discrete time, the shock increment between dates $t$ and $t+1$ is
+
+$$
+\Delta W_{t+1}=W_{t+1}-W_t.
+$$
+
+The known function $\phi_x$ maps today's state and the next shock increment into
+tomorrow's state.
+
+The discrete-time state evolution is
+
+$$
+X_{t+1}=\phi_x(X_t,\Delta W_{t+1}).
+$$
+
+The conditional law generated by this equation is the general-state replacement for
+the finite-matrix row indexed by the current state $x$.
+
+````{prf:assumption} Markov state and shock increments
+:label: assumption-mr-markov-shocks
+
+The process $X$ is ergodic under $P$.
+
+The conditional distribution of $\Delta W_{t+1}$ given $X_t$ is time invariant and
+independent of past shock histories conditioned on $X_t$.
+````
+
+The filtration $\{\mathcal F_t\}$ is generated by the initial condition $X_0$ and by
+the history of shocks through date $t$.
+
+### Information and $Y$
+
+The Markov state and the information that reveals shocks need not coincide.
+
+The state $X_t$ is observed at date $t$.
+
+The next shock $\Delta W_{t+1}$ need not be directly observed from the pair
+$(X_t,X_{t+1})$.
+
+If $(X_t,X_{t+1})$ does reveal $\Delta W_{t+1}$, then $X$ alone carries the relevant
+shock information.
+
+If it does not, introduce an auxiliary process $Y=\{Y_t\}$ with stationary increments.
+
+The known function $\phi_y$ maps today's state and the next shock increment into the
+increment of $Y$.
+
+The discrete-time evolution for the auxiliary increment is
+
+$$
+Y_{t+1}-Y_t=\phi_y(X_t,\Delta W_{t+1}).
+$$
+
+The pair $(X_{t+1},Y_{t+1}-Y_t)$ is then rich enough, together with $X_t$, to recover
+the shock increment $\Delta W_{t+1}$.
+
+This device lets the model handle shocks or growth components that affect payoffs and
+SDFs but are not fully summarized by the next Markov state alone.
+
+Write the enlarged process as $Z=(X,Y)$.
+
+The process $Z$ is Markov with a triangular structure: the conditional distribution of
+$(X_{t+1},Y_{t+1}-Y_t)$ depends on the past only through $X_t$.
+
+Histories of $Z$, together with $X_0$, generate the same information as the shock
+history.
+
+This is why the next Perron--Frobenius problem can first be posed with eigenfunctions
+of $X$ alone.
+
+The section {ref}`mr_additional_state` later returns to what changes when the
+eigenfunction is allowed to depend on $Y$ as well.
+
+### Multiplicative functionals
+
+The general framework needs a way to describe objects that compound over time.
+
+This is the role of a positive multiplicative functional $M=\{M_t\}$.
+
+Its log increment is a function of today's state and the next shock increment:
+
+$$
+\log \frac{M_{t+1}}{M_t}
+= \kappa(X_t,\Delta W_{t+1}).
+$$
+
+Equivalently,
+
+$$
+\frac{M_{t+1}}{M_t}
+= \exp\{\kappa(X_t,\Delta W_{t+1})\}.
+$$
+
+Thus $M_t/M_0$ is a product of positive one-period increments.
+
+This is the Condition-1 version of a multiplicative functional used in the paper.
+
+The formal definition is slightly broader, but this form covers the models studied
+below.
+
+Under {prf:ref}`assumption-mr-markov-shocks`, the logarithm of $M$ has stationary
+increments.
+
+Products and reciprocals of positive multiplicative functionals are again positive
+multiplicative functionals.
+
+Exponential functions of linear combinations of the components of $Y$ are examples.
+
+Stochastic discount factors, stochastic growth factors, and positive multiplicative
+martingales are all modeled this way.
+
+In the finite-state model, the SDF increment $s_{ij}$ is one example of a
+multiplicative-functional increment.
+
+The ratio $h_{ij}$ that changes probabilities is another.
+
+### Stochastic discount factors and pricing operators
+
+A stochastic discount factor $S=\{S_t\}$ is a positive multiplicative functional with
+$S_0=1$ and finite first moments conditional on $X_0$.
+
+Let $\Phi_t$ be a bounded payoff measurable with respect to the date-$t$ information.
+
+The date-$\tau$ price of $\Phi_t$ is
+
+$$
+\Pi_{\tau,t}(\Phi_t)
+= E\left[\frac{S_t}{S_\tau}\Phi_t\mid\mathcal F_\tau\right].
+$$
+
+The ratio $S_t/S_\tau$ is the stochastic discount factor from date $t$ back to date
+$\tau$.
+
+If the payoff is a bounded function $f(X_t)$ of the future Markov state, this pricing
+formula defines a horizon-$t$ operator $Q_t$ by
+
+$$
+[Q_t f](x)
+= E[S_t f(X_t)\mid X_0=x].
+$$
+
+This operator is the general-state analogue of multiplying a payoff vector by the
+Arrow-price matrix $\mathbf Q$.
+
+To see the connection, suppose again that the state space is finite and $t=1$.
+
+Then
+
+$$
+[Q_1 f]_i
+= \sum_j s_{ij}p_{ij}f_j
+= \sum_j q_{ij}f_j.
+$$
+
+Thus $Q_1$ is exactly the matrix $\mathbf Q$.
+
+In discrete time, the multiplicative property of $S$ implies that $Q_t$ is obtained
+by applying the one-period operator $Q_1$ repeatedly.
+
+In continuous time, the family $\{Q_t:t\geq0\}$ is a semigroup of pricing operators.
+
+### Martingales and equivalent probability measures
+
+Different stochastic discount factor / probability pairs can produce the same pricing
+operators, and that flexibility is the source of the identification problem.
+
+For the Markov setting used here, the probability changes of interest are generated by
+positive multiplicative martingales.
+
+At the level of probability changes, let $H=\{H_t\}$ be a strictly positive martingale
+with $E[H_0]=1$ under $P$.
+
+For an event $A$ observable by date $\tau$, the changed probability measure $P^H$ is
+defined by
+
+$$
+P^H(A)=E[1_A H_\tau].
+$$
+
+The law of iterated expectations makes this definition consistent across dates.
+
+When $H$ is also a multiplicative functional, making it a multiplicative martingale,
+the change of probability preserves the Markov structure of $Z$.
+
+The SDF that represents the same prices under $P^H$ is
+
+$$
+S_t^H=S_t\frac{H_0}{H_t}.
+$$
+
+Thus the same pricing operators can be represented by the pair $(S,P)$ or by the
+pair $(S^H,P^H)$.
+
+In the finite-state model, this is just
+
+$$
+p^H_{ij}=h_{ij}p_{ij},
+\qquad
+s^H_{ij}=\frac{s_{ij}}{h_{ij}},
+$$
+
+so that
+
+$$
+s^H_{ij}p^H_{ij}=s_{ij}p_{ij}=q_{ij}.
+$$
+
+The same Arrow prices can therefore be explained by changing the probability measure
+and offsetting that change in the SDF.
+
+This is also the precise sense in which Arrow prices alone do not identify beliefs.
+
+### What Perron--Frobenius recovers
+
+Now return to the Perron--Frobenius step.
+
+The finite-state equation was {eq}`eq-mr-pf-finite`.
+
+The general-state replacement is an eigenfunction problem for the pricing operators:
+find a scalar $\hat\eta$ and a positive function $\hat e$ such that, for every horizon
+$t$,
+
+```{math}
+:label: eq-mr-pf-general
+
+[Q_t\hat e](x)
+=\exp(\hat\eta t)\hat e(x).
+```
+
+The positive function $\hat e$ is the general-state counterpart of the
+Perron--Frobenius eigenvector.
+
+The scalar $\hat\eta$ is the log eigenvalue.
+
+In finite states, $\hat e$ is just a positive vector with one entry for each state.
+
+In general state spaces, $\hat e(x)$ is a positive function of the current state.
+
+Its job is to record the state-dependent part of long-horizon valuation.
+
+Equation {eq}`eq-mr-pf-general` says that a future payoff equal to $\hat e(X_t)$ has
+date-0 price $\exp(\hat\eta t)\hat e(X_0)$.
+
+Thus $\hat\eta$ gives the common growth or discount rate, while $\hat e$ gives the
+state-dependent scaling.
+
+Since eigenfunctions are defined only up to scale, uniqueness always means uniqueness
+up to multiplication by a positive constant.
+
+In general state spaces, existence of a positive eigenfunction is also a substantive
+condition.
+
+The eigenfunction equation implies the conditional moment restriction
+
+$$
+E[S_t\hat e(X_t)\mid\mathcal F_\tau]
+=\exp((t-\tau)\hat\eta)S_\tau\hat e(X_\tau),
+\qquad t\geq \tau.
+$$
+
+Use this restriction to define
+
+```{math}
+:label: eq-mr-hhat-process
+
+\frac{\hat H_t}{\hat H_0}
+=\exp(-\hat\eta t)S_t
+ \frac{\hat e(X_t)}{\hat e(X_0)}.
+```
+
+This process is a martingale because, for $t\geq \tau$,
+
+$$
+\begin{aligned}
+E\left[\frac{\hat H_t}{\hat H_0}\mid\mathcal F_\tau\right]
+&=
+\frac{\exp(-\hat\eta t)}{\hat e(X_0)}
+E[S_t\hat e(X_t)\mid\mathcal F_\tau] \\
+&=
+\frac{\exp(-\hat\eta t)}{\hat e(X_0)}
+\exp((t-\tau)\hat\eta)S_\tau\hat e(X_\tau) \\
+&=
+\exp(-\hat\eta \tau)S_\tau
+\frac{\hat e(X_\tau)}{\hat e(X_0)}
+=\frac{\hat H_\tau}{\hat H_0}.
+\end{aligned}
+$$
+
+The process is positive because $S$ and $\hat e$ are positive.
+
+Its one-period increment is
+
+```{math}
+:label: eq-mr-hhat-increment-general
+
+\frac{\hat H_{t+1}}{\hat H_t}
+= \exp(-\hat\eta)\frac{S_{t+1}}{S_t}
+ \frac{\hat e(X_{t+1})}{\hat e(X_t)}.
+```
+
+In the finite-state model, when $X_t=i$ and $X_{t+1}=j$,
+{eq}`eq-mr-hhat-increment-general` becomes
+
+$$
+\frac{\hat H_{t+1}}{\hat H_t}
+= \exp(-\hat\eta)s_{ij}\frac{\hat e_j}{\hat e_i}
+= \hat h_{ij}
+= \frac{\hat p_{ij}}{p_{ij}}.
+$$
+
+This is the same three-component decomposition as
+{eq}`eq-mr-finite-sdf-decomposition`.
+
+In finite states, {eq}`eq-mr-finite-sdf-decomposition` is equivalent to
+
+$$
+\frac{S_{t+1}}{S_t}
+= \exp(\hat\eta)
+ \frac{\hat e(X_t)}{\hat e(X_{t+1})}
+ \frac{\hat H_{t+1}}{\hat H_t}.
+$$
+
+The only change is notation: $\hat h_{ij}$ is the one-period density ratio in a finite
+Markov chain, while $\hat H_{t+1}/\hat H_t$ is the corresponding one-period density
+ratio in the general Markov setting.
+
+Conditional on $X_0$, the likelihood ratio for histories through date $t$ is
+$\hat H_t/\hat H_0$.
+
+For the unconditional measure on $\mathcal F_t$, the Radon--Nikodym density is
+$\hat H_t$, where $\hat H_0$ adjusts the initial distribution.
+
+If $\hat H_{t+1}/\hat H_t$ is not identically one, the recovered probability measure
+differs from the correctly specified probability measure.
+
+We show the restriction that rules out this difference in the next section.
+
+### Selection and recovery
+
+In finite irreducible matrix problems, Perron--Frobenius theory gives a unique positive
+eigenvector up to scale, so the recovered transition matrix is pinned down by
+$\mathbf Q$.
+
+In general state spaces, a positive eigenfunction need not exist, and multiple
+positive eigenfunctions may solve the same pricing operator problem when one does.
+
+The paper therefore imposes a selection condition on the probability measure induced by
+the candidate eigenfunction.
+
+````{prf:assumption} Ergodicity of the recovered measure
+:label: assumption-mr-ergodicity
+
+The process $X$ is stationary and ergodic under $P^{\hat H}$, the probability measure
+induced by the multiplicative martingale $\hat H$ defined in the previous section.
+````
+
+````{prf:proposition} Uniqueness of the Perron--Frobenius solution
+:label: prop-mr-uniqueness
+
+There is at most one solution $(\hat e, \hat\eta)$ to the Perron--Frobenius problem
+such that $X$ is stationary and ergodic under the induced probability measure
+$P^{\hat H}$.
+````
+
+This selected solution, when it exists, identifies the long-term risk-neutral measure.
+
+It does not by itself identify subjective beliefs.
+
+To make recovery identify beliefs, an additional restriction is needed on the SDF.
+
+The restriction used by Ross recovery is the paper's Condition 4:
+
+````{prf:assumption}
+:label: assumption-mr-condition-4
+
+Let
+
+$$
+S_t=\exp(-\delta t)\frac{m(X_t)}{m(X_0)}
+$$
+
+for some positive function $m$ and real number $\delta$.
+````
+
+This is the transition-independence restriction from {doc}`ross_recovery`, imposed on
+the SDF representation whose probability measure one wants to recover.
+
+Under this restriction, setting $\hat e=1/m$ and $\hat\eta=-\delta$ gives
+
+$$
+\frac{\hat H_t}{\hat H_0}
+= \exp(\delta t)
+ \left[\exp(-\delta t)\frac{m(X_t)}{m(X_0)}\right]
+ \frac{1/m(X_t)}{1/m(X_0)}
+=1.
+$$
+
+Thus the martingale component is identically one after normalization.
+
+This is the general-state version of the finite-state condition
+$\hat h_{ij}=1$ for every feasible transition.
+
+If this martingale is not identically one, the recovered probability measure absorbs it
+and generally differs from the correctly specified probability measure.
+
+We will see a few important examples of this in the next section.
+
+The section {ref}`mr_additional_state` returns to what happens when the eigenfunction
+is allowed to depend on the auxiliary process $Y$.
+
+### Continuous-time version
+
+Let's briefly introduce the model in continuous time before discussing examples where recovery fails.
+
+We introduce the diffusion notation because the long-run risk example below is
+written in continuous time.
+
+The objects are the same as before:
+
+- $X$ is the Markov state,
+- $Y$ records additional growing or shock-revealing components,
+- $M$ is a positive multiplicative functional, such as an SDF, a cash-flow growth
+ process, or a martingale used to change probabilities.
+
+In the continuous-time version, $W$ is a Brownian motion.
+
+The state, auxiliary process, and multiplicative functional satisfy
+
+$$
+\begin{aligned}
+dX_t &= \mu_x(X_t)dt+\sigma_x(X_t)dW_t,\\
+dY_t &= \mu_y(X_t)dt+\sigma_y(X_t)dW_t,\\
+d\log M_t &= \beta(X_t)dt+\alpha(X_t)\cdot dW_t.
+\end{aligned}
+$$
+
+Here $\mu_x$ and $\mu_y$ are drift functions, while $\sigma_x$ and $\sigma_y$ are
+shock-exposure matrices.
+
+The function $\beta$ is the drift of $\log M$, and $\alpha$ is the Brownian shock
+exposure of $\log M$.
+
+The invertibility assumption on the stacked shock-exposure matrix is the continuous-time
+counterpart of the discrete-time condition that $(X_{t+1},Y_{t+1}-Y_t)$ reveals the
+shock increment.
+
+It lets the history of $Z=(X,Y)$ reveal the Brownian information.
+
+For $M$ to be a local martingale, its drift must satisfy
+
+$$
+\beta(x)=-\frac{1}{2}\alpha(x)\cdot\alpha(x).
+$$
+
+This follows from Ito's formula:
+
+$$
+\frac{dM_t}{M_t}
+= \left(\beta(X_t)+\frac{1}{2}\alpha(X_t)\cdot\alpha(X_t)\right)dt
+ + \alpha(X_t)\cdot dW_t.
+$$
+
+A local martingale has zero drift in $dM_t/M_t$, which gives the displayed restriction.
+
+Additional integrability conditions then ensure that this local martingale is a true
+martingale.
+
+Under the probability measure induced by a martingale $H$ with this exposure
+$\alpha$, $\widetilde W_t=W_t-\int_0^t\alpha(X_s)ds$ is Brownian.
+
+With this sign convention, the drift of $X$ changes from $\mu_x$ to
+$\mu_x+\sigma_x\alpha$, and the drift of $Y$ changes from $\mu_y$ to
+$\mu_y+\sigma_y\alpha$.
+
+This is the continuous-time analogue of replacing $p_{ij}$ by $h_{ij}p_{ij}$ in the
+finite-state model.
+
+The Markov and triangular structure of $Z$ is preserved, which is why the same
+Perron--Frobenius decomposition can be applied.
+
+
+## When the recovery fails
+
+Now let's discuss a few examples where the recovered probability measure differs from the correctly specified probability measure.
+
+### Recursive utility
+
+We now use the martingale component to see when the recovered probability measure
+differs from the correctly specified probability measure.
+
+In the previous example, all risk adjustment in the SDF could be written as a ratio of
+a function of today's state to a function of tomorrow's state.
+
+The Perron--Frobenius transition formula cancels exactly that kind of term.
+
+Recursive utility adds a continuation-value term.
+
+The key point is that this term behaves like the martingale increment defined above.
+
+For the unit-EIS Epstein--Zin case in {cite:t}`BorovickaHansenScheinkman2016`, with
+$C_t=\exp(g_c t)c(X_t)$, write the translated continuation value as $V_t=g_c t+v(X_t)$,
+and define
+
+$$
+v_i^*=\exp((1-\gamma)v_i).
+$$
+
+The SDF is
+
+$$
+s_{ij}
+= \exp(-\delta-g_c) \frac{c_i}{c_j}
+ \frac{v_j^*}{\sum_k p_{ik}v_k^*}.
+$$
+
+In this unit-EIS example, the Perron--Frobenius eigenfunction is $\hat e_j=c_j$ and
+$\hat\eta=-(\delta+g_c)$.
+
+Applying the Perron--Frobenius transition formula therefore leaves
+
+$$
+\hat p_{ij}
+= p_{ij}\frac{v_j^*}{\sum_k p_{ik}v_k^*}.
+$$
+
+The denominator is the conditional expectation of $v_j^*$ given current state $i$.
+
+Therefore the last fraction has conditional mean one under $\mathbf{P}$.
+
+It is therefore a martingale increment.
+
+When $v^*$ is not constant, that ratio varies across next-period states.
+
That variation is why the probability measure recovered by Perron--Frobenius theory no
longer gives the correctly specified transition matrix.
+
+The next cell solves the finite-state continuation-value equation and builds the SDF.
+
+```{code-cell} ipython3
+def solve_ez_unit_eis(P, c, δ, γ, g_c, tol=1e-12, max_iter=10_000):
+ """Finite-state unit-EIS Epstein-Zin continuation values and SDF."""
+ β = np.exp(-δ)
+ log_c = np.log(c)
+ n = len(c)
+ flow = (1 - β) * log_c + β * g_c
+
+ if abs(γ - 1) < 1e-10:
+ v = linalg.solve(np.eye(n) - β * P, flow)
+ v_star = np.ones(n)
+ Pv_star = np.ones(n)
+ else:
+ v = log_c.copy()
+ for _ in range(max_iter):
+ v_star = np.exp((1 - γ) * v)
+ Pv_star = P @ v_star
+ v_new = flow + β / (1 - γ) * np.log(Pv_star)
+ if np.max(np.abs(v_new - v)) < tol:
+ v = v_new
+ break
+ v = v_new
+ else:
+ raise ValueError("Epstein-Zin fixed point did not converge.")
+
+ v_star = np.exp((1 - γ) * v)
+ Pv_star = P @ v_star
+
+ S = (
+ np.exp(-δ - g_c)
+ * (c[:, None] / c[None, :])
+ * (v_star[None, :] / Pv_star[:, None])
+ )
+
+ return v, v_star, S
+```
+
+At log utility, $v^*$ is constant and the martingale increment is one.
+
+As risk aversion rises, continuation values matter more.
+
+The recovered probability measure then differs more from the correctly specified
+probability measure.
+
+To make the mechanism visible in a small three-state example, the figure below uses
+the more dispersed consumption vector
+
+$$
+c=(0.85, 1.00, 1.15).
+$$
+
+The heatmap reports percentage deviations of the martingale increment from one:
+$100(\hat h_{ij}-1)$.
+
+Positive entries are transitions that receive more probability under the recovered
+probability measure than under the correctly specified probability measure.
+
+The right panel reports the increase in the recovered recession probability, measured
+in percentage points.
+
+```{code-cell} ipython3
+---
+mystnb:
+ figure:
+ caption: Recursive utility generates a nonconstant martingale increment.
+ name: fig-mr-recursive-martingale
+---
# A more dispersed consumption vector and a high risk aversion, chosen to make
# the martingale increment visibly nonconstant in a three-state example.
c_recursive = np.array([0.85, 1.00, 1.15])
γ_demo = 10.0
# P_true, δ, g_c, martingale_increment, stationary_dist, π_true, state_names
# are defined earlier in the lecture (not in this cell).
_, _, S_demo = solve_ez_unit_eis(P_true, c_recursive, δ, γ_demo, g_c)
# One-period Arrow prices: q_ij = s_ij p_ij.
Q_demo = S_demo * P_true
H_demo, _, _, P_hat_demo = martingale_increment(Q_demo, P_true)
# Percentage deviation of the martingale increment from one: 100(h_ij - 1).
H_dev = 100 * (H_demo - 1)

# Trace how the recovered stationary recession probability moves with γ.
γ_grid = np.linspace(1, 15, 80)
rec_prob = []
for γ in γ_grid:
    _, _, S_g = solve_ez_unit_eis(P_true, c_recursive, δ, γ, g_c)
    Q_g = S_g * P_true
    _, _, _, P_hat_g = martingale_increment(Q_g, P_true)
    # Index 0 is presumably the recession state — consistent with the axis
    # labels below; confirm against state_names defined earlier.
    rec_prob.append(stationary_dist(P_hat_g)[0])
rec_prob = np.array(rec_prob)
# Increase over the correctly specified stationary probability, in pp.
rec_prob_gain = 100 * (rec_prob - π_true[0])

fig, axes = plt.subplots(1, 2, figsize=(12, 4.5))

# Left panel: symmetric diverging heatmap of the deviations.
bound = np.max(np.abs(H_dev))
im = axes[0].imshow(H_dev, cmap='RdBu_r', vmin=-bound, vmax=bound)
axes[0].set_xticks(range(3))
axes[0].set_yticks(range(3))
axes[0].set_xticklabels(state_names, rotation=20)
axes[0].set_yticklabels(state_names)
axes[0].set_xlabel('next state')
axes[0].set_ylabel(r'current state')
axes[0].set_title(r'martingale increment, $\gamma=10$')

# Annotate each heatmap cell with its numeric value.
for i in range(3):
    for j in range(3):
        axes[0].text(j, i, f"{H_dev[i, j]:.1f}",
                     ha='center', va='center', fontsize=9)
plt.colorbar(im, ax=axes[0], fraction=0.046, pad=0.04,
             label=r'$100(\hat h_{ij}-1)$')

# Right panel: recession-probability gain as a function of risk aversion.
axes[1].plot(γ_grid, rec_prob_gain, lw=2.5)
axes[1].axhline(0, ls='--', lw=1.5, color='0.5')
axes[1].set_xlabel(r"risk aversion $\gamma$")
axes[1].set_ylabel('increase in recession probability\n(percentage points)')
axes[1].set_title('recovered recession probability')
axes[1].set_ylim(0, rec_prob_gain.max() * 1.08)

plt.tight_layout()
plt.show()
+```
+
+Recursive utility makes the recovered probability measure assign more probability to
+recession transitions.
+
+At $\gamma=10$, transitions into recession receive more probability under the recovered
+probability measure, while transitions into expansion receive less.
+
+As risk aversion rises, the stationary recession probability under the recovered
+probability measure moves further above its correctly specified value.
+
+Thus, as the continuation-value term creates a nonconstant $\hat h_{ij}$, the transition
+matrix associated with the long-term risk-neutral probability no longer equals the
+correctly specified transition matrix.
+
### Permanent shocks
+
+Recursive utility gives one nonconstant martingale component.
+
+Permanent shocks provide another.
+
+Suppose consumption has a permanent shock,
+
+$$
+\log C_{t+1}-\log C_t
+= g + x(X_{t+1})-x(X_t) + \sigma \varepsilon_{t+1},
+$$
+
+where $\varepsilon_{t+1}$ is independent over time.
+
+With power utility, the SDF contains
+
+$$
+\exp(-\delta-\gamma g)
+\exp\{-\gamma[x(X_{t+1})-x(X_t)]\}
+\exp(-\gamma\sigma\varepsilon_{t+1}).
+$$
+
+The middle term depends only on the current and next Markov states.
+
+It is a ratio of state functions, so the Perron--Frobenius transition formula can cancel
+it.
+
+The permanent shock term depends on the new shock $\varepsilon_{t+1}$.
+
+Because that shock is not summarized by the finite Markov state in this construction,
+there is no state function whose ratio can cancel it.
+
+After dividing by its conditional mean, the shock term becomes a martingale increment:
+
+$$
+\frac{\exp(-\gamma\sigma\varepsilon_{t+1})}
+ {E[\exp(-\gamma\sigma\varepsilon_{t+1})]}.
+$$
+
+Thus permanent consumption shocks can make the recovered probability measure differ
+from investors' beliefs, even under ordinary power utility.
+
+This statement is relative to the Markov state used in the recovery procedure.
+
+Enlarging the state or information structure to account for the shock can accommodate
+it, but doing so leads to the identification problem discussed in
+{ref}`mr_additional_state`.
+
+### Long-run risk
+
+We now move from small finite-state examples to a standard continuous-time
+macro-finance model.
+
+The model is the Bansal--Yaron long-run risk model, using the calibration reported by
+{cite:t}`BorovickaHansenScheinkman2016`.
+
+The point is to compare the recovered probability measure with the correctly specified
+probability measure in a standard macro-finance model.
+
+The construction has the same structure as before.
+
+We first write the correctly specified state dynamics, then compute the probability
+measure implied by the Perron--Frobenius approach.
+
+The state vector $X_t=(X_{1t},X_{2t})'$ follows
+
+$$
+\begin{aligned}
+dX_{1t}
+&= [\mu_{11}(X_{1t}-\iota_1)+\mu_{12}(X_{2t}-\iota_2)]dt
+ + \sqrt{X_{2t}}\sigma_1 dW_t, \\
+dX_{2t}
+&= \mu_{22}(X_{2t}-\iota_2)dt
+ + \sqrt{X_{2t}}\sigma_2 dW_t .
+\end{aligned}
+$$
+
+Here $X_1$ is predictable consumption growth and $X_2$ is stochastic volatility.
+
+The representative agent has Epstein--Zin utility with unit elasticity of intertemporal
+substitution.
+
+The continuation value introduces the continuous-time analogue of the martingale
+component above.
+
+We denote that process by $H^*$, and the SDF satisfies
+
+$$
+d\log S_t = -\delta dt - d\log C_t + d\log H_t^*.
+$$
+
+Here $H^*$ is the continuation-value martingale entering the Epstein--Zin SDF.
+
+The multiplicative martingale $\hat H$ associated with the Perron--Frobenius problem is
+obtained only after also incorporating the Perron--Frobenius eigenfunction.
+
+In models with martingale components in consumption growth, $H^*$ and $\hat H$ need not
+coincide.
+
+The next cell sets the calibration.
+
+```{code-cell} ipython3
# Calibration of the long-run risk model (values as used in this lecture,
# following Borovička-Hansen-Scheinkman).  Time unit is one month — later
# cells annualize with a factor of 12.
lrr_params = dict(
    δ=0.002,    # subjective discount rate
    γ=10.0,     # risk aversion
    μ11=-0.021,  # mean reversion of predictable growth X1
    μ12=0.0,     # loading of the X1 drift on (X2 - ι2)
    μ22=-0.013,  # mean reversion of stochastic volatility X2
    ι1=0.0,      # long-run mean of X1
    ι2=1.0,      # long-run mean of X2
    σ1=np.array([0.0, 0.00034, 0.0]),  # Brownian exposure of X1
    σ2=np.array([0.0, 0.0, -0.038]),   # Brownian exposure of X2
    β_c0=0.0015,  # consumption growth: constant drift term
    β_c1=1.0,     # consumption growth: loading on X1
    β_c2=0.0,     # consumption growth: loading on X2
    α_c=np.array([0.0078, 0.0, 0.0]),  # Brownian exposure of consumption growth
)
+```
+
+The next code block computes how the different probability measures change the drift of
+the state vector.
+
+The first quantity is the continuation value.
+
+In this affine model, the translated continuation value is linear in the state:
+
+$$
+v(x) = v_0 + v_1 x_1 + v_2 x_2.
+$$
+
+This is why we call $v_1$ and $v_2$ slopes.
+
+They are the derivatives of the continuation value with respect to predictable growth
+and volatility.
+
+These slopes enter the continuation-value martingale $H^*$.
+
+In the code, this martingale has shock exposure
+
+$$
+\alpha_{H^*}
+= (1-\gamma)(\alpha_c + \sigma_1 v_1 + \sigma_2 v_2).
+$$
+
+Since the SDF is $d\log S_t=-\delta dt-d\log C_t+d\log H_t^*$, its shock exposure is
+
+$$
+\alpha_S = -\alpha_c + \alpha_{H^*}.
+$$
+
+This vector $\alpha_S$ drives the one-period risk-neutral change of measure.
+
+The second quantity is the Perron--Frobenius eigenfunction.
+
+It is exponential-affine:
+
+$$
+\hat e(x) = \exp(e_0 + e_1 x_1 + e_2 x_2).
+$$
+
+Thus $e_1$ and $e_2$ are slopes of the log eigenfunction.
+
+Because $X_1$ and $X_2$ have shock loadings $\sigma_1$ and $\sigma_2$, the
+Perron--Frobenius eigenfunction contributes the additional shock exposure
+
+$$
+\sigma_1 e_1 + \sigma_2 e_2.
+$$
+
+Therefore the one-period risk-neutral dynamics use only $\alpha_S$, while the dynamics
+under the long-term risk-neutral measure use
+
+$$
+\alpha_S + \sigma_1 e_1 + \sigma_2 e_2.
+$$
+
+The functions below follow this order: compute $(v_1, v_2)$, compute $\alpha_S$ and
+$(e_1, e_2)$, and then translate these shock exposures into drifts for $X$.
+
+```{code-cell} ipython3
def solve_value_function(p):
    """Slopes of the affine continuation value v(x) = v0 + v1 x1 + v2 x2.

    Parameters
    ----------
    p : dict
        Calibration with keys δ, γ, μ11, μ12, μ22, σ1, σ2, β_c1, β_c2, α_c.

    Returns
    -------
    (v1, v2) : floats
        Derivatives of the continuation value with respect to predictable
        growth (x1) and volatility (x2).

    Raises
    ------
    ValueError
        If the defining quadratic for v2 has no real root.
    """
    δ, γ = p["δ"], p["γ"]
    μ11, μ12, μ22 = p["μ11"], p["μ12"], p["μ22"]
    σ1, σ2 = p["σ1"], p["σ2"]
    β_c1, β_c2 = p["β_c1"], p["β_c2"]
    α_c = p["α_c"]

    # v1 is the coefficient on predictable growth in v(x).
    v1 = β_c1 / (δ - μ11)

    # v2 is the coefficient on volatility.
    # In the affine model it is the stable root of a scalar quadratic.
    A_vec = α_c + σ1 * v1
    B_vec = σ2

    a = 0.5 * (1 - γ) * np.dot(B_vec, B_vec)
    b = (μ22 - δ) + (1 - γ) * np.dot(A_vec, B_vec)
    c = β_c2 + μ12 * v1 + 0.5 * (1 - γ) * np.dot(A_vec, A_vec)

    # Degenerate case: a == 0 (e.g. γ = 1, the log-utility limit, or σ2 = 0).
    # The quadratic collapses to a linear equation; the quadratic-root
    # formula below would divide by zero.
    if abs(a) < 1e-14:
        if b == 0:
            raise ValueError("Value function does not exist for these parameters.")
        return v1, -c / b

    disc = b**2 - 4 * a * c
    if disc < 0:
        raise ValueError("Value function does not exist for these parameters.")

    # Stable root of the quadratic.
    v2 = (-b - np.sqrt(disc)) / (2 * a)
    return v1, v2
+
+
def solve_pf_lrr(p, v1, v2):
    """Perron-Frobenius eigenfunction slopes and the SDF diffusion loading.

    Parameters
    ----------
    p : dict
        Model calibration (keys as in ``lrr_params``).
    v1, v2 : float
        Continuation-value slopes from ``solve_value_function``.

    Returns
    -------
    e1, e2 : float
        Slopes in log e(x) = e0 + e1 x1 + e2 x2.
    eta : float
        Log eigenvalue (per model period).
    α_s : ndarray
        Brownian shock exposure of the log SDF.

    Raises
    ------
    ValueError
        If the eigenvalue quadratic for e2 has no real root.
    """
    δ, γ = p["δ"], p["γ"]
    μ11, μ12, μ22 = p["μ11"], p["μ12"], p["μ22"]
    ι1, ι2 = p["ι1"], p["ι2"]
    σ1, σ2 = p["σ1"], p["σ2"]
    α_c = p["α_c"]
    β_c0, β_c1, β_c2 = p["β_c0"], p["β_c1"], p["β_c2"]

    # Continuation-value martingale exposure and SDF exposure.
    α_h_star = (1 - γ) * (α_c + σ1 * v1 + σ2 * v2)
    α_s = -α_c + α_h_star

    # Drift coefficients of log S before the Perron-Frobenius decomposition.
    β_s11 = -β_c1
    β_s12 = -β_c2 - 0.5 * np.dot(α_h_star, α_h_star)
    β_s0 = -δ - β_c0 - 0.5 * ι2 * np.dot(α_h_star, α_h_star)

    # e1 and e2 are coefficients in log e(x) = e0 + e1 x1 + e2 x2.
    e1 = -β_s11 / μ11

    # e2 solves the remaining quadratic from the Perron-Frobenius eigenvalue equation.
    const = (β_s12 + 0.5 * np.dot(α_s, α_s)
             + e1 * (μ12 + np.dot(σ1, α_s))
             + 0.5 * e1**2 * np.dot(σ1, σ1))
    lin = μ22 + np.dot(σ2, α_s) + e1 * np.dot(σ1, σ2)
    quad = 0.5 * np.dot(σ2, σ2)

    disc = lin**2 - 4 * quad * const
    if disc < 0:
        # Fail loudly (consistent with solve_value_function) instead of
        # letting np.sqrt silently produce NaN roots.
        raise ValueError(
            "Perron-Frobenius eigenfunction does not exist for these parameters."
        )

    roots = [(-lin - np.sqrt(disc)) / (2 * quad),
             (-lin + np.sqrt(disc)) / (2 * quad)]

    # Each root implies a candidate log eigenvalue eta.
    candidates = []
    for e2 in roots:
        eta = (β_s0 - β_s11 * ι1 - β_s12 * ι2
               - e1 * (μ11 * ι1 + μ12 * ι2) - e2 * μ22 * ι2)
        candidates.append((eta, e2))

    # Tuples compare on eta first: choose the solution with the smaller
    # eigenvalue exponent.
    eta, e2 = min(candidates)
    return e1, e2, eta, α_s
+
+
def recovered_lrr_dynamics(p, e1, e2, α_s):
    """State dynamics under the long-term risk-neutral measure.

    The change of measure combines the SDF exposure α_s with the
    eigenfunction exposure σ1 e1 + σ2 e2; each state drift then shifts by
    σ_i · α_h, and the result is re-expressed in mean-reversion form.
    Returns a dict with the same keys as the input calibration plus α_h.
    """
    total_exposure = α_s + p["σ1"] * e1 + p["σ2"] * e2

    # Drift shift from the change of measure: each drift gains σ_i · α_h.
    shifted_12 = p["μ12"] + np.dot(p["σ1"], total_exposure)
    shifted_22 = p["μ22"] + np.dot(p["σ2"], total_exposure)

    # Mean-reversion form around the implied new long-run means.
    new_ι2 = (p["μ22"] / shifted_22) * p["ι2"]
    new_ι1 = p["ι1"] + (p["μ12"] * p["ι2"] - shifted_12 * new_ι2) / p["μ11"]

    return dict(
        μ11=p["μ11"],
        μ12=shifted_12,
        μ22=shifted_22,
        ι1=new_ι1,
        ι2=new_ι2,
        σ1=p["σ1"],
        σ2=p["σ2"],
        α_h=total_exposure,
    )
+
+
def risk_neutral_lrr_dynamics(p, α_s):
    """State dynamics under the one-period risk-neutral measure.

    Unlike the long-term measure, this change of measure uses only the SDF
    exposure α_s (no eigenfunction term).  Returns a dict with the same
    calibration keys as the input.
    """
    # Drift shift: each state drift gains σ_i · α_s.
    shifted_12 = p["μ12"] + np.dot(p["σ1"], α_s)
    shifted_22 = p["μ22"] + np.dot(p["σ2"], α_s)

    # Mean-reversion form around the implied new long-run means.
    new_ι2 = (p["μ22"] / shifted_22) * p["ι2"]
    new_ι1 = p["ι1"] + (p["μ12"] * p["ι2"] - shifted_12 * new_ι2) / p["μ11"]

    return dict(
        μ11=p["μ11"],
        μ12=shifted_12,
        μ22=shifted_22,
        ι1=new_ι1,
        ι2=new_ι2,
        σ1=p["σ1"],
        σ2=p["σ2"],
    )
+```
+
+For the calibration used here, the recovered probability measure changes the long-run
+state distribution.
+
+It lowers the mean of expected growth and raises the mean of volatility.
+
+```{code-cell} ipython3
# Solve the model in order: continuation-value slopes, then the
# Perron-Frobenius eigenfunction and SDF exposure, then state dynamics
# under the long-term and one-period risk-neutral measures.
v1, v2 = solve_value_function(lrr_params)
e1, e2, η_lrr, α_s = solve_pf_lrr(lrr_params, v1, v2)
dyn_hat = recovered_lrr_dynamics(lrr_params, e1, e2, α_s)
dyn_bar = risk_neutral_lrr_dynamics(lrr_params, α_s)

print(f"value slopes: v1 = {v1:.4f}, v2 = {v2:.4f}")
print(f"eigenfunction coefficients: e1 = {e1:.4f}, e2 = {e2:.4f}")
# The factor 12 annualizes the monthly log eigenvalue.
print(f"log eigenvalue: eta = {η_lrr:.6f} "
      f"(annualized {12 * η_lrr:.4f})")
print()
# Compare long-run means and the shifted drift coefficients across measures.
print("Long-run means under three measures")
print("measure      iota_1   iota_2    mu_12    mu_22")
print("---------  -------- -------- -------- --------")
print(f"actual     {lrr_params['ι1']:8.5f} {lrr_params['ι2']:8.5f}"
      f" {lrr_params['μ12']:8.5f} {lrr_params['μ22']:8.5f}")
print(f"one-period {dyn_bar['ι1']:8.5f} {dyn_bar['ι2']:8.5f}"
      f" {dyn_bar['μ12']:8.5f} {dyn_bar['μ22']:8.5f}")
print(f"long-term  {dyn_hat['ι1']:8.5f} {dyn_hat['ι2']:8.5f}"
      f" {dyn_hat['μ12']:8.5f} {dyn_hat['μ22']:8.5f}")
+```
+
+These numbers show the mechanism clearly.
+
+The positive value slope $v_1$ says that the continuation value is very sensitive to
+predictable consumption growth.
+
+The volatility slope $v_2$ is negative in this calibration, so higher volatility lowers
+continuation value.
+
+The eigenfunction coefficient $e_1$ has the opposite sign: the long-term change of
+measure loads negatively on predictable growth.
+
+Thus the recovered probability measure assigns more probability to histories with lower
+expected growth.
+
+The positive $e_2$ has the opposite implication for volatility, assigning more
+probability to higher-volatility states.
+
+The table translates those coefficients into state dynamics.
+
+Relative to the correctly specified probability measure, both risk-neutral measures
+lower the long-run mean of predictable growth and raise the long-run mean of
+volatility.
+
+The long-term risk-neutral measure moves further in that direction than the one-period
+risk-neutral measure: $\iota_1$ falls from $0$ to about $-0.0027$, while $\iota_2$
+rises from $1$ to about $1.13$.
+
+The small negative log eigenvalue means that $\exp(\eta)$ is slightly below one; with
+the usual yield sign convention, $-\eta$ is the corresponding long-run discount rate.
+
#### Stationary densities
+
+The coefficient table gives one summary of the difference between probability measures.
+
+A stationary-density plot gives another.
+
+It shows not only that the means of $X_1$ and $X_2$ move, but also which combinations of
+growth and volatility become more likely.
+
+This matters because treating the recovered probability measure as beliefs changes the
+whole forecast distribution, not just a pair of long-run averages.
+
+Under the recovered probability measure, probability mass shifts toward adverse
+long-run-risk states.
+
+These are states with lower predictable growth $X_1$ and higher volatility $X_2$.
+
+The dashed contour adds the one-period risk-neutral probability measure.
+
+In this calibration, the one-period risk-neutral and long-term risk-neutral stationary
+distributions are close to each other, and both are far from the correctly specified
+distribution.
+
+Thus the martingale component accounts for much of the risk adjustment in the
+state dynamics.
+
+The paper's Figure 1 reports model-implied stationary densities; the simulation below
+is a numerical approximation to those densities.
+
+The plot below simulates the state process under each probability measure and estimates
+the stationary joint density of $(X_2, X_1)$.
+
+The horizontal line marks $X_1=0$ and the vertical line marks the correctly specified
+mean of volatility, $X_2=\iota_2$.
+
+```{code-cell} ipython3
+def simulate_lrr(dyn, T=180_000, seed=123):
+ """
+ Euler simulation of the LRR state process under one probability measure.
+ """
+ rng = np.random.default_rng(seed)
+ X1 = np.zeros(T)
+ X2 = np.full(T, dyn["ι2"])
+
+ # Euler step with monthly time increment
+ for t in range(1, T):
+ X2_prev = max(X2[t-1], 1e-9)
+ dW = rng.standard_normal(3)
+ sqrt_X2 = np.sqrt(X2_prev)
+
+ X1[t] = (
+ X1[t-1]
+ + dyn["μ11"] * (X1[t-1] - dyn["ι1"])
+ + dyn["μ12"] * (X2_prev - dyn["ι2"])
+ + sqrt_X2 * np.dot(dyn["σ1"], dW)
+ )
+ X2[t] = max(
+ X2_prev
+ + dyn["μ22"] * (X2_prev - dyn["ι2"])
+ + sqrt_X2 * np.dot(dyn["σ2"], dW),
+ 1e-9,
+ )
+
+ burn = T // 5
+ return X1[burn:], X2[burn:]
+
+
def kde2d_contour(ax, X1, X2, label, levels=7, fill=True,
                  linestyle='solid', outer_only=False):
    """Estimate the stationary density of (X2, X1) by Gaussian KDE and draw
    its contours on ``ax``.

    With ``fill=True`` the density is drawn as filled grey contours; with
    ``fill=False`` only contour lines are drawn.  ``outer_only=True`` keeps
    just the outermost contour level.  An empty ``plot`` call registers the
    legend entry.
    """
    # Thin long simulations down to at most 25,000 evenly spaced draws.
    n_keep = min(25_000, len(X1))
    keep = np.linspace(0, len(X1) - 1, n_keep, dtype=int)
    density = gaussian_kde(np.vstack([X2[keep], X1[keep]]))

    # Fixed evaluation window so all panels share the same grid.
    vol_axis = np.linspace(0.6, 1.6, 140)
    growth_axis = np.linspace(-0.006, 0.006, 140)
    VOL, GROWTH = np.meshgrid(vol_axis, growth_axis)
    Z = density(np.vstack([VOL.ravel(), GROWTH.ravel()])).reshape(VOL.shape)

    peak = Z.max()
    cuts = np.linspace(0.12 * peak, 0.9 * peak, levels)
    if outer_only:
        cuts = cuts[:1]

    if fill:
        ax.contourf(VOL, GROWTH, Z, levels=np.r_[cuts, peak], cmap='Greys',
                    alpha=0.85)
        ax.contour(VOL, GROWTH, Z, levels=cuts, colors='0.55',
                   linewidths=0.4)
        ax.plot([], [], color='0.25', lw=1.5, label=label)
    else:
        ax.contour(VOL, GROWTH, Z, levels=cuts, colors='black',
                   linewidths=1.5, linestyles=linestyle)
        ax.plot([], [], color='black', lw=1.5, ls=linestyle, label=label)
+
+
# Dynamics under the correctly specified measure: the calibration itself.
dyn_true = dict(
    μ11=lrr_params["μ11"],
    μ12=lrr_params["μ12"],
    μ22=lrr_params["μ22"],
    ι1=lrr_params["ι1"],
    ι2=lrr_params["ι2"],
    σ1=lrr_params["σ1"],
    σ2=lrr_params["σ2"],
)

# Simulate the state under each measure (different seeds so densities are
# estimated from independent draws).
X1_P, X2_P = simulate_lrr(dyn_true, seed=1)
X1_H, X2_H = simulate_lrr(dyn_hat, seed=2)
X1_B, X2_B = simulate_lrr(dyn_bar, seed=3)

fig, axes = plt.subplots(1, 2, figsize=(12, 4.8), sharex=True, sharey=True)
# Left: stationary density under P.  Right: recovered measure (filled) with
# the one-period risk-neutral measure overlaid as a dashed outer contour.
kde2d_contour(axes[0], X1_P, X2_P, label=r'correctly specified $\mathbf{P}$')
kde2d_contour(axes[1], X1_H, X2_H,
              label=r'long-term risk-neutral $\hat{\mathbf{P}}$')
kde2d_contour(axes[1], X1_B, X2_B,
              label=r'risk-neutral $\bar{\mathbf{P}}$',
              fill=False, linestyle='--', outer_only=True)

for ax in axes:
    # Reference lines: X1 = 0 and the true long-run mean of volatility.
    ax.axhline(0, lw=0.8, ls='--')
    ax.axvline(lrr_params["ι2"], lw=0.8, ls='--')
    ax.set_xlim(0.6, 1.6)
    ax.set_ylim(-0.006, 0.006)
    ax.set_xlabel(r"conditional volatility $X_2$")
    ax.legend(fontsize=9)

axes[0].set_ylabel(r"mean growth rate $X_1$")
plt.tight_layout()
plt.show()
+```
+
+The movement below the horizontal line means lower expected growth, while movement to
+the right of the vertical line means higher volatility.
+
+#### Yield implications
+
+The difference between probability measures matters for asset-pricing interpretation
+because yields mix two quantities: a payoff forecast and an asset price.
+
+The recovered probability measure is called long-term risk-neutral because it absorbs
+the martingale component that prices long-horizon risk.
+
+For stochastically growing cash flows, the paper's long-horizon result is that risk
+premia relative to maturity-matched bonds vanish under the recovered probability
+measure, subject to the stability and moment conditions used for the limit.
+
+Under the correctly specified probability measure, those same long-term risk premia need
+not vanish.
+
+For a cash flow $G_t$, write expectations under the correctly specified probability
+measure as $E_P$ and expectations under the recovered probability measure as
+$E_{\hat P}$.
+
+The yield computed under the correctly specified probability measure is
+
+$$
+y_t^P[G](x)
+= \frac{1}{t}\log E_P[G_t \mid X_0=x]
+ - \frac{1}{t}\log E_P[S_tG_t \mid X_0=x].
+$$
+
+The first term is the payoff forecast.
+
+The second term is the asset price, written using the original SDF representation.
+
+Arrow prices determine the second term.
+
+The question here is what happens to the first term if an analyst treats the recovered
+probability measure $\hat{\mathbf{P}}$ as investors' beliefs.
+
+In that comparison, prices are held fixed and only the forecast term is recomputed:
+
+$$
+y_t^{\hat P}[G](x)
+= \frac{1}{t}\log E_{\hat P}[G_t \mid X_0=x]
+ - \frac{1}{t}\log E_P[S_tG_t \mid X_0=x].
+$$
+
+For an aggregate-consumption payoff, the answer is substantial.
+
+The recovered probability measure assigns more probability to low-growth,
+high-volatility states, so it forecasts lower future consumption.
+
+Holding prices fixed, that lower forecast translates into lower consumption yields.
+
+The zero-coupon bond is the comparison case.
+
+Its payoff is one, so the forecast term is always $\log E[1]=0$.
+
+Changing beliefs therefore does not move the bond-yield panel.
+
+The same solution to the Perron--Frobenius problem also appears in long-bond and
+forward-measure limits.
+
+The limiting one-period return on a very long bond is
+
+$$
+R^\infty_{t,t+1}
+= \exp(-\hat\eta)\frac{\hat e(X_{t+1})}{\hat e(X_t)}.
+$$
+
+The martingale increment satisfies
+
+$$
+\frac{\hat H_{t+1}}{\hat H_t}
+= \frac{S_{t+1}}{S_t} R^\infty_{t,t+1}.
+$$
+
+Thus the limiting one-period transition from forward measures coincides with the
+transition associated with the long-term risk-neutral probability.
+
+The calculation below uses the affine formulas implied by the long-run risk model.
+
+If a multiplicative functional $M$ has log drift affine in $X$ and diffusion proportional
+to $\sqrt{X_2}$, then
+
+$$
+E[M_t \mid X_0=x]
+= \exp\{\theta_0(t)+\theta_1(t)x_1+\theta_2(t)x_2\},
+$$
+
+where the coefficients solve Riccati equations.
+
+The code below computes these affine expectations under the correctly specified
+measure, recomputes only the consumption forecast under the recovered probability
+measure, and keeps asset prices fixed.
+
+It then plots median and interquartile yield bands across the same simulated initial
+states.
+
+```{code-cell} ipython3
+---
+mystnb:
+ figure:
+ caption: >-
+ Yield implications of using the recovered probability measure as beliefs.
+ Dashed consumption-yield bands use payoff forecasts under the recovered
+ probability measure with prices fixed; bond yields are unchanged because the
+ zero-coupon payoff has no forecast term.
+ name: fig-mr-lrr-figure-2
+---
def affine_expectation_coeffs(dyn, β0, β1, β2, α, horizons):
    """Riccati coefficients (θ0, θ1, θ2) such that
    log E[M_t | X_0 = x] = θ0(t) + θ1(t) x1 + θ2(t) x2.

    Parameters
    ----------
    dyn : dict — state dynamics under the forecasting measure.
    β0, β1, β2 : floats — affine drift coefficients of log M.
    α : ndarray — Brownian shock exposure of log M.
    horizons : ndarray — increasing evaluation times (first ODE time is 0).

    Returns
    -------
    (len(horizons), 3) array of coefficients, one row per horizon.
    """
    μ11, μ12, μ22 = dyn["μ11"], dyn["μ12"], dyn["μ22"]
    ι1, ι2 = dyn["ι1"], dyn["ι2"]
    σ1, σ2 = dyn["σ1"], dyn["σ2"]

    # Inner products are loop-invariant, so compute them once.
    αα = np.dot(α, α)
    σ1α = np.dot(σ1, α)
    σ2α = np.dot(σ2, α)
    σ11 = np.dot(σ1, σ1)
    σ12 = np.dot(σ1, σ2)
    σ22 = np.dot(σ2, σ2)

    def rhs(_, coeffs):
        c0, c1, c2 = coeffs
        d0 = (β0 - β1 * ι1 - β2 * ι2
              - c1 * (μ11 * ι1 + μ12 * ι2)
              - c2 * μ22 * ι2)
        d1 = β1 + μ11 * c1
        # The volatility coefficient carries all quadratic (Riccati) terms.
        d2 = (β2 + μ12 * c1 + μ22 * c2
              + 0.5 * αα + c1 * σ1α + c2 * σ2α
              + 0.5 * c1**2 * σ11 + c1 * c2 * σ12 + 0.5 * c2**2 * σ22)
        return [d0, d1, d2]

    sol = solve_ivp(rhs, (0, horizons[-1]), np.zeros(3),
                    t_eval=horizons, rtol=1e-8, atol=1e-10)
    if not sol.success:
        raise ValueError("Riccati equation failed to solve")
    return sol.y.T
+
+
def log_expectation(θ, X1, X2):
    """Evaluate log E[M_t | X_0 = x] for every horizon (rows of θ) at every
    simulated state (entries of X1, X2); returns a (horizons, states) array."""
    intercept, slope1, slope2 = θ[:, 0], θ[:, 1], θ[:, 2]
    return intercept[:, None] + np.outer(slope1, X1) + np.outer(slope2, X2)
+
+
def yield_quantiles(log_num, log_den, horizons):
    """Quartiles (25th/50th/75th) of annualized yields across initial states.

    Rows of ``log_num``/``log_den`` are horizons, columns are states; the
    factor 12 annualizes the per-period (monthly) yields.
    """
    annualized = 12 * (log_num - log_den) / horizons[:, None]
    return np.quantile(annualized, [0.25, 0.5, 0.75], axis=1)
+
+
def transform_functional(β0, β1, β2, α, dyn_old, dyn_new, α_h):
    """Rewrite a multiplicative functional's drift coefficients after a
    change of probability measure.

    The drift changes because the martingale component changes the Brownian
    shock exposure used to forecast the cash flow; the diffusion loading α
    itself is unchanged.  Returns (β0', β1, β2', α).
    """
    # Level part of the drift, net of the old long-run means.
    level = β0 - β1 * dyn_old["ι1"] - β2 * dyn_old["ι2"]
    # Volatility loading picks up the covariance with the martingale exposure.
    vol_slope = β2 + np.dot(α, α_h)
    intercept = level + β1 * dyn_new["ι1"] + vol_slope * dyn_new["ι2"]
    return intercept, β1, vol_slope, α
+
+
def sdf_coefficients(p, v1, v2):
    """Affine drift coefficients and Brownian exposure of the log SDF, as
    used in the affine expectation calculation.

    Returns (β_s0, β_s1, β_s2, α_s): constant, x1 and x2 drift loadings of
    log S, and its shock exposure.
    """
    δ, γ = p["δ"], p["γ"]
    α_c, σ1, σ2 = p["α_c"], p["σ1"], p["σ2"]

    # Continuation-value martingale exposure, then the SDF exposure.
    exposure_h = (1 - γ) * (α_c + σ1 * v1 + σ2 * v2)
    exposure_s = -α_c + exposure_h

    # Half the squared martingale exposure enters both x2 and constant terms.
    quad_adj = 0.5 * np.dot(exposure_h, exposure_h)
    slope_x1 = -p["β_c1"]
    slope_x2 = -p["β_c2"] - quad_adj
    intercept = -δ - p["β_c0"] - p["ι2"] * quad_adj

    return intercept, slope_x1, slope_x2, exposure_s
+
+
# Maturities: 100 quarters, expressed in model periods (3 months/quarter).
quarters = np.arange(1, 101)
horizons = 3 * quarters

β_c0, β_c1, β_c2 = (lrr_params["β_c0"],
                    lrr_params["β_c1"],
                    lrr_params["β_c2"])
α_c = lrr_params["α_c"]

β_s0, β_s1, β_s2, α_s = sdf_coefficients(lrr_params, v1, v2)

# Numerators and denominators for yields under the correctly specified measure
θ_C_P = affine_expectation_coeffs(dyn_true, β_c0, β_c1, β_c2, α_c, horizons)
θ_S_P = affine_expectation_coeffs(dyn_true, β_s0, β_s1, β_s2, α_s, horizons)
θ_SC_P = affine_expectation_coeffs(
    dyn_true, β_s0 + β_c0, β_s1 + β_c1, β_s2 + β_c2,
    α_s + α_c, horizons
)

# Numerator for the aggregate-consumption payoff under the recovered probability measure
β_Ch0, β_Ch1, β_Ch2, α_Ch = transform_functional(
    β_c0, β_c1, β_c2, α_c, dyn_true, dyn_hat, dyn_hat["α_h"]
)
θ_C_H = affine_expectation_coeffs(dyn_hat, β_Ch0, β_Ch1, β_Ch2,
                                  α_Ch, horizons)

# Evaluate the affine log-expectations at states simulated under P.
log_C_P = log_expectation(θ_C_P, X1_P, X2_P)
log_C_H = log_expectation(θ_C_H, X1_P, X2_P)
log_S_P = log_expectation(θ_S_P, X1_P, X2_P)
log_SC_P = log_expectation(θ_SC_P, X1_P, X2_P)

# Consumption yields: forecast term under P vs. under the recovered measure,
# holding the price term log E[S C] fixed in both.
qC_P = yield_quantiles(log_C_P, log_SC_P, horizons)
qC_H = yield_quantiles(log_C_H, log_SC_P, horizons)
qB_P = yield_quantiles(np.zeros_like(log_S_P), log_S_P, horizons)
# A zero-coupon payoff has the same numerator, log E[1] = 0, under either belief.
qB_H = qB_P.copy()

fig, axes = plt.subplots(1, 2, figsize=(12, 4.8), sharex=True)

def plot_yield_band(ax, x, q, color, label, linestyle='solid',
                    alpha=0.35):
    """Plot quartile band and quartile lines."""
    # Rows of q are the 25th, 50th, 75th percentiles across initial states.
    ax.fill_between(x, q[0], q[2], color=color, alpha=alpha, linewidth=0)
    ax.plot(x, q[1], color=color, lw=2.4, ls=linestyle, label=label)
    ax.plot(x, q[0], color=color, lw=1.3, ls=linestyle)
    ax.plot(x, q[2], color=color, lw=1.3, ls=linestyle)


# Left panel: consumption yields under both beliefs.  Right panel: bond
# yields, identical by construction across beliefs.
plot_yield_band(axes[0], quarters, qC_P, color='0.2',
                label='correctly specified measure', alpha=0.45)
plot_yield_band(axes[0], quarters, qC_H, color='0.65',
                label='recovered probability measure', linestyle='--', alpha=0.35)
plot_yield_band(axes[1], quarters, qB_P, color='0.2',
                label='correctly specified measure', alpha=0.45)
plot_yield_band(axes[1], quarters, qB_H, color='0.65',
                label='recovered probability measure', linestyle='--', alpha=0.25)

axes[0].set_xlabel('maturity (quarters)')
axes[0].set_ylabel('consumption yield to maturity')
axes[1].set_xlabel('maturity (quarters)')
axes[1].set_ylabel('bond yield to maturity')

axes[0].legend(fontsize=9)

plt.tight_layout()
plt.show()
+```
+
+The left panel is the key one: treating the recovered probability measure as beliefs
+assigns more probability to low-growth, high-volatility states, so the implied forecast
+for consumption is lower and consumption yields fall when prices are held fixed.
+
+The bond panel verifies the zero-coupon comparison.
+
+Since $\log E[1]=0$ under any measure, the solid and dashed
+bond-yield bands coincide.
+
+(mr_additional_state)=
+## Additional state vector
+
+{cite:t}`BorovickaHansenScheinkman2016` then asks whether enlarging the state vector
+changes the recovery problem.
+
+So far, the Perron--Frobenius eigenfunction has depended only on the Markov state
+$X_t$.
+
+But many models also contain a growing component $Y_t$, such as log consumption, with
+increments driven by the same shock increments.
+
+Here $\Delta W_{t+1}$ denotes the shock increment between dates $t$ and $t+1$.
+
+The map $\phi_x$ sends today's state and the next shock increment into tomorrow's
+state.
+
+The map $\phi_y$ sends today's state and the next shock increment into the increment
+of $Y$.
+
+$$
+X_{t+1}=\phi_x(X_t,\Delta W_{t+1}),
+\qquad
+Y_{t+1}-Y_t=\phi_y(X_t,\Delta W_{t+1}).
+$$
+
+Let $\varepsilon$ denote an eigenfunction candidate that is allowed to depend on both
+the stationary state $X_t$ and the growing component $Y_t$.
+
+Let $\zeta$ be a vector of loadings on $Y$, and let $e_\zeta$ be a positive function
+of $X$.
+
+Then a natural candidate is
+
+$$
+\varepsilon(x,y)=\exp(\zeta \cdot y)e_\zeta(x).
+$$
+
+This form is natural because $Y$ enters through increments.
+
+Along a path,
+
+$$
+\exp(\zeta \cdot Y_{t+1})
+= \exp(\zeta \cdot Y_t)
+ \exp\{\zeta \cdot (Y_{t+1}-Y_t)\}.
+$$
+
+Since $Y_{t+1}-Y_t$ is a function of $(X_t,\Delta W_{t+1})$, the ratio
+$\exp(\zeta \cdot Y_{t+1})/\exp(\zeta \cdot Y_t)$ is a one-period positive
+multiplicative functional increment.
+
+For a fixed $\zeta$, this factor tilts the one-period pricing operator by
+$\exp\{\zeta \cdot (Y_{t+1}-Y_t)\}$.
+
+The $x$-dependent term is therefore not simply the earlier eigenfunction reused.
+
+For each choice of $\zeta$, the remaining $x$-dependent part solves a different
+Perron--Frobenius problem:
+
+$$
+E\left[
+ \frac{S_{t+1}}{S_t}
+ \exp\{\zeta \cdot (Y_{t+1}-Y_t)\}
+ e_\zeta(X_{t+1})
+ \mid X_t=x
+\right]
+=\exp(\eta_\zeta)e_\zeta(x).
+$$
+
+Changing $\zeta$ changes how much long-run growth risk is loaded into the eigenfunction.
+
+Thus adding $Y_t$ can make the subjective probability measure one of the possible
+solutions, but it also creates a whole family of other solutions, indexed by $\zeta$.
+
+The extra state variable therefore does not remove the identification problem; it
+usually makes the selection problem more explicit.
+
+The paper also points out a related practical issue.
+
+Highly persistent stationary processes can be hard to distinguish from processes with
+stationary increments.
+
+A stationary approximation may have a unique solution to the Perron--Frobenius problem
+for each finite persistence level, but as persistence becomes extreme, the limiting
+problem can have many approximate solutions.
+
+Numerically, this means the solution to the Perron--Frobenius problem can be sensitive
+exactly in the cases where a stationary model is being used to approximate stochastic
+growth.
+
+There is, however, a structured way forward.
+
+If the analyst supplies a reference multiplicative functional $Y^r$ that is known to
+have the same martingale component as the SDF, then one can restrict the enlarged
+eigenfunction to the form
+
+$$
+(Y^r)^{-1}e(x).
+$$
+
+This restriction chooses which long-run martingale component is allowed into the
+eigenfunction.
+
+With this extra structure, Arrow prices can again reveal subjective probabilities.
+
+But the key input is external: the long-run martingale component has been supplied by
+the analyst, not recovered from Arrow prices alone.
+
+## Measuring the martingale component
+
+The paper also asks how large the martingale component is in asset-market data.
+
+Under rational expectations, this measures how important long-term risk adjustments are
+for valuation.
+
+Under a subjective-beliefs interpretation, it measures the discrepancy between
+subjective beliefs and the correctly specified probability measure only after imposing
+that the subjective SDF itself has no martingale component.
+
+With that extra restriction in place, a small martingale component would make the
+recovered probability measure close to beliefs, while a large one would make long-term
+risk adjustments more important for the recovered probability measure.
+
+One family of measures applies a convex function to the martingale increment
+$\hat H_{t+1}/\hat H_t$.
+
+For example, conditional relative entropy uses
+
+$$
+E\left[
+ \frac{\hat H_{t+1}}{\hat H_t}
+ \log\frac{\hat H_{t+1}}{\hat H_t}
+ \mid X_t=x
+\right].
+$$
+
+This expression is nonnegative and equals zero if and only if the martingale increment is identically one.
+
+With incomplete asset-market data, the full martingale increment is not observed.
+
+The paper therefore uses pricing restrictions and long-bond return approximations to
+derive lower bounds on such discrepancy measures.
+
+These bounds are a way to test whether the martingale component is economically small
+without requiring a full set of Arrow prices.
+
+## Lessons
+
+The Perron--Frobenius approach remains useful under misspecification, but it no
+longer solves the belief-recovery problem by itself.
+
+It delivers a probability measure that may include long-horizon risk premia.
+
+That measure equals investors' beliefs only when the martingale component is
+identically one.
+
+Recursive utility, permanent shocks, and long-run risk models give this martingale an
+economically important role, so it should not be overlooked when assessing the
+implications of transition independence for belief recovery.
+
+## Exercises
+
+```{exercise}
+:label: ex_misspecified_recovery_martingale_component
+
+**A two-state martingale component.**
+
+Let
+
+$$
+\mathbf{P} =
+\begin{pmatrix}
+0.8 & 0.2 \\
+0.4 & 0.6
+\end{pmatrix},
+\qquad
+\mathbf{Q} =
+\begin{pmatrix}
+0.72 & 0.15 \\
+0.36 & 0.42
+\end{pmatrix}.
+$$
+
+1. Compute the one-period risk-neutral transition matrix $\bar{\mathbf{P}}$.
+2. Compute the transition matrix $\hat{\mathbf{P}}$ associated with the recovered
+ probability measure.
+3. Compute $\hat h_{ij}=\hat p_{ij}/p_{ij}$ and decide whether recovery returns the
+ correctly specified transition matrix.
+```
+
+```{solution-start} ex_misspecified_recovery_martingale_component
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} ipython3
+P2 = np.array([[0.8, 0.2],
+ [0.4, 0.6]])
+Q2 = np.array([[0.72, 0.15],
+ [0.36, 0.42]])
+
+Pbar2, qb2 = risk_neutral_probs(Q2)
+H2, eta2, e2, Phat2 = martingale_increment(Q2, P2)
+
+print("One-period risk-neutral transition matrix P_bar")
+print(np.round(Pbar2, 4))
+print("\nTransition matrix P_hat associated with the recovered probability measure")
+print(np.round(Phat2, 4))
+print("\nMartingale increment h_hat")
+print(np.round(H2, 4))
+print("\nRecovery returns P:", np.allclose(H2[P2 > 0], 1))
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: ex_power_utility_success
+
+**Power utility benchmark.**
+
+For trend-stationary consumption and power utility,
+
+$$
+s_{ij}=A\left(\frac{c_j}{c_i}\right)^{-\gamma}.
+$$
+
+Show that $\hat e_i=c_i^\gamma$ is the Perron--Frobenius eigenvector and that
+$\hat{\mathbf{P}}=\mathbf{P}$.
+
+Then verify the result numerically using the three-state baseline in the lecture.
+```
+
+```{solution-start} ex_power_utility_success
+:class: dropdown
+```
+
+The analytical check is:
+
+$$
+[\mathbf{Q}\hat e]_i
+=\sum_j A\left(\frac{c_j}{c_i}\right)^{-\gamma}p_{ij}c_j^\gamma
+=A c_i^\gamma
+=A\hat e_i.
+$$
+
+Thus $\exp(\hat\eta)=A$ and
+
+$$
+\hat p_{ij}
+=\frac{1}{A}q_{ij}\frac{\hat e_j}{\hat e_i}
+=p_{ij}.
+$$
+
+Below is the numerical check.
+
+```{code-cell} ipython3
+H_power, _, e_power, P_hat_power = martingale_increment(Q_power, P_true)
+e_theory = c_levels**γ_power
+e_theory = e_theory / e_theory.sum()
+
+print("Perron-Frobenius eigenvector")
+print(np.round(e_power, 6))
+print("\nNormalized c^gamma")
+print(np.round(e_theory, 6))
+print("\nmax |P_hat - P|:",
+ np.max(np.abs(P_hat_power - P_true)))
+print("max |h_hat - 1|:",
+ np.max(np.abs(H_power[P_true > 0] - 1)))
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: ex_recursive_utility_martingale_component
+
+**Recursive utility and risk aversion.**
+
+Using the finite-state Epstein--Zin example with
+$c=(0.85, 1.00, 1.15)$, compute the stationary distribution of
+$\hat{\mathbf{P}}$ for $\gamma \in \{1, 5, 10, 15\}$.
+
+Which state receives the largest increase in stationary probability as $\gamma$ rises?
+```
+
+```{solution-start} ex_recursive_utility_martingale_component
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} ipython3
+for γ in [1, 5, 10, 15]:
+ _, _, S_g = solve_ez_unit_eis(P_true, c_recursive, δ, γ, g_c)
+ Q_g = S_g * P_true
+ _, _, _, P_hat_g = martingale_increment(Q_g, P_true)
+ π_g = stationary_dist(P_hat_g)
+ print(f"gamma={γ:2.0f}: {np.round(π_g, 4)}")
+
+print("\nCorrectly specified:", np.round(π_true, 4))
+```
+
+The recession state receives the largest increase.
+
+```{solution-end}
+```
diff --git a/lectures/multivariate_normal.md b/lectures/multivariate_normal.md
index 7aacee6eb..96adcd647 100644
--- a/lectures/multivariate_normal.md
+++ b/lectures/multivariate_normal.md
@@ -3,8 +3,10 @@ jupytext:
text_representation:
extension: .md
format_name: myst
+ format_version: 0.13
+ jupytext_version: 1.17.1
kernelspec:
- display_name: Python 3
+ display_name: Python 3 (ipykernel)
language: python
name: python3
---
@@ -35,7 +37,7 @@ In this lecture, you will learn formulas for
* marginal distributions for all subvectors of $x$
* conditional distributions for subvectors of $x$ conditional on other subvectors of $x$
-We will use the multivariate normal distribution to formulate some useful models:
+We will use the multivariate normal distribution to formulate some useful models:
* a factor analytic model of an intelligence quotient, i.e., IQ
* a factor analytic model of two independent inherent abilities, say, mathematical and verbal.
@@ -44,7 +46,7 @@ We will use the multivariate normal distribution to formulate some useful model
* time series generated by linear stochastic difference equations
* optimal linear filtering theory
-## The Multivariate Normal Distribution
+## The multivariate normal distribution
This lecture defines a Python class `MultivariateNormal` to be used
to generate **marginal** and **conditional** distributions associated
@@ -58,13 +60,15 @@ For a multivariate normal distribution it is very convenient that
We apply our Python class to some examples.
-We use the following imports:
+We use the following imports:
-```{code-cell} ipython
+```{code-cell} ipython3
import matplotlib.pyplot as plt
import numpy as np
from numba import jit
import statsmodels.api as sm
+
+rng = np.random.default_rng(0)
```
Assume that an $N \times 1$ random vector $z$ has a
@@ -73,11 +77,11 @@ multivariate normal probability density.
This means that the probability density takes the form
$$
-f\left(z;\mu,\Sigma\right)=\left(2\pi\right)^{-\left(\frac{N}{2}\right)}\det\left(\Sigma\right)^{-\frac{1}{2}}\exp\left(-.5\left(z-\mu\right)^{\prime}\Sigma^{-1}\left(z-\mu\right)\right)
+f\left(z;\mu,\Sigma\right)=\left(2\pi\right)^{-\left(\frac{N}{2}\right)}\det\left(\Sigma\right)^{-\frac{1}{2}}\exp\left(-.5\left(z-\mu\right)^\top\Sigma^{-1}\left(z-\mu\right)\right)
$$
where $\mu=Ez$ is the mean of the random vector $z$ and
-$\Sigma=E\left(z-\mu\right)\left(z-\mu\right)^\prime$ is the
+$\Sigma=E\left(z-\mu\right)\left(z-\mu\right)^\top$ is the
covariance matrix of $z$.
The covariance matrix $\Sigma$ is symmetric and positive definite.
@@ -95,7 +99,7 @@ def f(z, μ, Σ):
μ: ndarray(float, dim=1 or 2)
the mean of z, N by 1
Σ: ndarray(float, dim=2)
- the covarianece matrix of z, N by 1
+ the covariance matrix of z, N by N
"""
z = np.atleast_2d(z)
@@ -155,7 +159,7 @@ $$
and covariance matrix
$$
-\hat{\Sigma}_{11}=\Sigma_{11}-\Sigma_{12}\Sigma_{22}^{-1}\Sigma_{21}=\Sigma_{11}-\beta\Sigma_{22}\beta^{\prime}
+\hat{\Sigma}_{11}=\Sigma_{11}-\Sigma_{12}\Sigma_{22}^{-1}\Sigma_{21}=\Sigma_{11}-\beta\Sigma_{22}\beta^\top
$$
where
@@ -186,7 +190,7 @@ class MultivariateNormal:
μ: ndarray(float, dim=1)
the mean of z, N by 1
Σ: ndarray(float, dim=2)
- the covarianece matrix of z, N by 1
+ the covariance matrix of z, N by N
Arguments
---------
@@ -262,7 +266,7 @@ squares regressions.
We’ll compare those linear least squares regressions for the simulated
data to their population counterparts.
-## Bivariate Example
+## Bivariate example
We start with a bivariate normal distribution pinned down by
@@ -296,7 +300,7 @@ Let's illustrate the fact that you _can regress anything on anything else_.
We have computed everything we need to compute two regression lines, one of $z_2$ on $z_1$, the other of $z_1$ on $z_2$.
-We'll represent these regressions as
+We'll represent these regressions as
$$
z_1 = a_1 + b_1 z_2 + \epsilon_1
@@ -320,17 +324,17 @@ $$
E \epsilon_2 z_1 = 0
$$
-Let's compute $a_1, a_2, b_1, b_2$.
+Let's compute $a_1, a_2, b_1, b_2$.
```{code-cell} python3
-beta = multi_normal.βs
+β = multi_normal.βs
-a1 = μ[0] - beta[0]*μ[1]
-b1 = beta[0]
+a1 = μ[0] - β[0]*μ[1]
+b1 = β[0]
-a2 = μ[1] - beta[1]*μ[0]
-b2 = beta[1]
+a2 = μ[1] - β[1]*μ[0]
+b2 = β[1]
```
Let's print out the intercepts and slopes.
@@ -356,7 +360,12 @@ Now let's plot the two regression lines and stare at them.
```{code-cell} python3
-
+---
+mystnb:
+ figure:
+ caption: two regressions
+ name: fig-two-regressions
+---
z2 = np.linspace(-4,4,100)
@@ -383,14 +392,13 @@ ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
plt.ylabel('$z_1$', loc = 'top')
plt.xlabel('$z_2$,', loc = 'right')
-plt.title('two regressions')
plt.plot(z2,z1, 'r', label = "$z_1$ on $z_2$")
plt.plot(z2,z1h, 'b', label = "$z_2$ on $z_1$")
plt.legend()
plt.show()
```
-The red line is the expectation of $z_1$ conditional on $z_2$.
+The red line is the expectation of $z_1$ conditional on $z_2$.
The intercept and slope of the red line are
@@ -410,7 +418,7 @@ print("1/b2 = ", 1/b2)
We can use these regression lines or our code to compute conditional expectations.
-Let's compute the mean and variance of the distribution of $z_2$
+Let's compute the mean and variance of the distribution of $z_2$
conditional on $z_1=5$.
After that we'll reverse what are on the left and right sides of the regression.
@@ -468,7 +476,7 @@ of $\epsilon$ will converge to $\hat{\Sigma}_1$.
n = 1_000_000 # sample size
# simulate multivariate normal random vectors
-data = np.random.multivariate_normal(μ, Σ, size=n)
+data = rng.multivariate_normal(μ, Σ, size=n)
z1_data = data[:, 0]
z2_data = data[:, 1]
@@ -502,17 +510,17 @@ Thus, in each case, for our very large sample size, the sample analogues
closely approximate their population counterparts.
A Law of Large
-Numbers explains why sample analogues approximate population objects.
+Numbers explains why sample analogues approximate population objects.
-## Trivariate Example
+## Trivariate example
Let’s apply our code to a trivariate example.
We’ll specify the mean vector and the covariance matrix as follows.
```{code-cell} python3
-μ = np.random.random(3)
-C = np.random.random((3, 3))
+μ = rng.random(3)
+C = rng.random((3, 3))
Σ = C @ C.T # positive semi-definite
multi_normal = MultivariateNormal(μ, Σ)
@@ -539,7 +547,7 @@ z2 = np.array([2., 5.])
```{code-cell} python3
n = 1_000_000
-data = np.random.multivariate_normal(μ, Σ, size=n)
+data = rng.multivariate_normal(μ, Σ, size=n)
z1_data = data[:, :k]
z2_data = data[:, k:]
```
@@ -568,7 +576,7 @@ multi_normal.βs[0], results.params
Once again, sample analogues do a good job of approximating their
populations counterparts.
-## One Dimensional Intelligence (IQ)
+## One dimensional intelligence (IQ)
Let’s move closer to a real-life example, namely, inferring a
one-dimensional measure of intelligence called IQ from a list of test
@@ -708,7 +716,7 @@ $\theta$ conditional on our test scores.
Let’s do that and then print out some pertinent quantities.
```{code-cell} python3
-x = np.random.multivariate_normal(μ_IQ, Σ_IQ)
+x = rng.multivariate_normal(μ_IQ, Σ_IQ)
y = x[:-1] # test scores
θ = x[-1] # IQ
```
@@ -723,7 +731,7 @@ conditional normal distribution of the IQ $\theta$.
In the following code, `ind` sets the variables on the right side of the regression.
-Given the way we have defined the vector $X$, we want to set `ind=1` in order to make $\theta$ the left side variable in the
+Given the way we have defined the vector $X$, we want to set `ind=1` in order to make $\theta$ the left side variable in the
population regression.
```{code-cell} python3
@@ -809,9 +817,9 @@ Thus, each $y_{i}$ adds information about $\theta$.
If we were to drive the number of tests $n \rightarrow + \infty$, the
conditional standard deviation $\hat{\sigma}_{\theta}$ would
-converge to $0$ at rate $\frac{1}{n^{.5}}$.
+converge to $0$ at rate $\frac{1}{n^{.5}}$.
-## Information as Surprise
+## Information as surprise
By using a different representation, let’s look at things from a
different perspective.
@@ -826,13 +834,13 @@ where $C$ is a lower triangular **Cholesky factor** of
$\Sigma$ so that
$$
-\Sigma \equiv DD^{\prime} = C C^\prime
+\Sigma \equiv DD^\top = C C^\top
$$
and
$$
-E \epsilon \epsilon' = I .
+E \epsilon \epsilon^\top = I .
$$
It follows that
@@ -926,13 +934,13 @@ np.max(np.abs(μθ_hat_arr - μθ_hat_arr_C)) < 1e-10
np.max(np.abs(Σθ_hat_arr - Σθ_hat_arr_C)) < 1e-10
```
-## Cholesky Factor Magic
+## Cholesky factor magic
Evidently, the Cholesky factorizations automatically computes the
population **regression coefficients** and associated statistics
that are produced by our `MultivariateNormal` class.
-The Cholesky factorization computes these things **recursively**.
+The Cholesky factorization computes these things **recursively**.
Indeed, in formula {eq}`mnv_1`,
@@ -942,7 +950,7 @@ Indeed, in formula {eq}`mnv_1`,
- the coefficient $c_i$ is the simple population regression
coefficient of $\theta - \mu_\theta$ on $\epsilon_i$
-## Math and Verbal Intelligence
+## Math and verbal intelligence
We can alter the preceding example to be more realistic.
@@ -997,7 +1005,7 @@ w_{6}
$$
where
-$w \begin{bmatrix} w_1 \cr w_2 \cr \vdots \cr w_6 \end{bmatrix}$
+$w = \begin{bmatrix} w_1 \cr w_2 \cr \vdots \cr w_6 \end{bmatrix}$
is a standard normal random vector.
We construct a Python function `construct_moments_IQ2d` to construct
@@ -1038,7 +1046,7 @@ n = 2
```{code-cell} python3
# take one draw
-x = np.random.multivariate_normal(μ_IQ2d, Σ_IQ2d)
+x = rng.multivariate_normal(μ_IQ2d, Σ_IQ2d)
y1 = x[:n]
y2 = x[n:2*n]
θ = x[2*n]
@@ -1060,7 +1068,7 @@ multi_normal_IQ2d.partition(k)
multi_normal_IQ2d.cond_dist(1, [*y1, *y2])
```
-Now let’s compute distributions of $\theta$ and $\mu$
+Now let’s compute distributions of $\theta$ and $\eta$
separately conditional on various subsets of test scores.
It will be fun to compare outcomes with the help of an auxiliary function
@@ -1093,10 +1101,10 @@ for indices, IQ, conditions in [([*range(2*n), 2*n], 'θ', 'y1, y2, y3, y4'),
f'{μ_hat[0]:1.2f} and {Σ_hat[0, 0]:1.2f} respectively')
```
-Evidently, math tests provide no information about $\mu$ and
-language tests provide no information about $\eta$.
+Evidently, math tests provide no information about $\eta$ and
+language tests provide no information about $\theta$.
-## Univariate Time Series Analysis
+## Univariate time series analysis
We can use the multivariate normal distribution and a little matrix
algebra to present foundations of univariate linear time series
@@ -1108,7 +1116,7 @@ Consider the following model:
$$
\begin{aligned}
-x_0 & \sim N\left(0, \sigma_0^2\right) \\
+x_0 & \sim N\left(0, \sigma_0^2\right) \\
x_{t+1} & = a x_{t} + b w_{t+1}, \quad w_{t+1} \sim N\left(0, 1\right), t \geq 0 \\
y_{t} & = c x_{t} + d v_{t}, \quad v_{t} \sim N\left(0, 1\right), t \geq 0
\end{aligned}
@@ -1164,7 +1172,7 @@ $c$ and $d$ as diagonal respectively.
Consequently, the covariance matrix of $Y$ is
$$
-\Sigma_{y} = E Y Y^{\prime} = C \Sigma_{x} C^{\prime} + D D^{\prime}
+\Sigma_{y} = E Y Y^\top = C \Sigma_{x} C^\top + D D^\top
$$
By stacking $X$ and $Y$, we can write
@@ -1179,8 +1187,8 @@ $$
and
$$
-\Sigma_{z} = EZZ^{\prime}=\left[\begin{array}{cc}
-\Sigma_{x} & \Sigma_{x}C^{\prime}\\
+\Sigma_{z} = EZZ^\top=\left[\begin{array}{cc}
+\Sigma_{x} & \Sigma_{x}C^\top\\
C\Sigma_{x} & \Sigma_{y}
\end{array}\right]
$$
@@ -1255,13 +1263,13 @@ This is going to be very useful for doing the conditioning to be used in
the fun exercises below.
```{code-cell} python3
-z = np.random.multivariate_normal(μz, Σz)
+z = rng.multivariate_normal(μz, Σz)
x = z[:T+1]
y = z[T+1:]
```
-### Smoothing Example
+### Smoothing example
This is an instance of a classic `smoothing` calculation whose purpose
is to compute $E X \mid Y$.
@@ -1295,7 +1303,7 @@ print(" E [ X | Y] = ", )
multi_normal_ex1.cond_dist(0, y)
```
-### Filtering Exercise
+### Filtering exercise
Compute $E\left[x_{t} \mid y_{t-1}, y_{t-2}, \dots, y_{0}\right]$.
@@ -1338,7 +1346,7 @@ sub_y = y[:t]
multi_normal_ex2.cond_dist(0, sub_y)
```
-### Prediction Exercise
+### Prediction exercise
Compute $E\left[y_{t} \mid y_{t-j}, \dots, y_{0} \right]$.
@@ -1378,10 +1386,10 @@ sub_y = y[:t-j+1]
multi_normal_ex3.cond_dist(0, sub_y)
```
-### Constructing a Wold Representation
+### Constructing a Wold representation
Now we’ll apply Cholesky decomposition to decompose
-$\Sigma_{y}=H H^{\prime}$ and form
+$\Sigma_{y}=H H^\top$ and form
$$
\epsilon = H^{-1} Y.
@@ -1412,12 +1420,12 @@ y
This example is an instance of what is known as a **Wold representation** in time series analysis.
-## Stochastic Difference Equation
+## Stochastic difference equation
Consider the stochastic second-order linear difference equation
$$
-y_{t} = \alpha_{0} + \alpha_{1} y_{y-1} + \alpha_{2} y_{t-2} + u_{t}
+y_{t} = \alpha_{0} + \alpha_{1} y_{t-1} + \alpha_{2} y_{t-2} + u_{t}
$$
where $u_{t} \sim N \left(0, \sigma_{u}^{2}\right)$ and
@@ -1474,8 +1482,8 @@ We have
$$
\begin{aligned}
\mu_{y} = A^{-1} \mu_{b} \\
-\Sigma_{y} &= A^{-1} E \left[\left(b - \mu_{b} + u \right) \left(b - \mu_{b} + u \right)^{\prime}\right] \left(A^{-1}\right)^{\prime} \\
- &= A^{-1} \left(\Sigma_{b} + \Sigma_{u} \right) \left(A^{-1}\right)^{\prime}
+\Sigma_{y} &= A^{-1} E \left[\left(b - \mu_{b} + u \right) \left(b - \mu_{b} + u \right)^\top\right] \left(A^{-1}\right)^\top \\
+ &= A^{-1} \left(\Sigma_{b} + \Sigma_{u} \right) \left(A^{-1}\right)^\top
\end{aligned}
$$
@@ -1493,7 +1501,7 @@ $$
$$
\Sigma_{b}=\left[\begin{array}{cc}
-C\Sigma_{\tilde{y}}C^{\prime} & \boldsymbol{0}_{N-2\times N-2}\\
+C\Sigma_{\tilde{y}}C^\top & \boldsymbol{0}_{N-2\times N-2}\\
\boldsymbol{0}_{N-2\times2} & \boldsymbol{0}_{N-2\times N-2}
\end{array}\right],\quad C=\left[\begin{array}{cc}
\alpha_{2} & \alpha_{1}\\
@@ -1512,7 +1520,6 @@ $$
```{code-cell} python3
# set parameters
-T = 80
T = 160
# coefficients of the second order difference equation
𝛼0 = 10
@@ -1520,7 +1527,6 @@ T = 160
𝛼2 = -.9
# variance of u
-σu = 1.
σu = 10.
# distribution of y_{-1} and y_{0}
@@ -1529,7 +1535,7 @@ T = 160
```
```{code-cell} python3
-# construct A and A^{\prime}
+# construct A and A^\top
A = np.zeros((T, T))
for i in range(T):
@@ -1565,7 +1571,7 @@ C = np.array([[𝛼2, 𝛼1], [0, 𝛼2]])
Σy = A_inv @ (Σb + Σu) @ A_inv.T
```
-## Application to Stock Price Model
+## Application to stock price model
Let
@@ -1602,7 +1608,7 @@ we have
$$
\begin{aligned}
\mu_{p} = B \mu_{y} \\
-\Sigma_{p} = B \Sigma_{y} B^{\prime}
+\Sigma_{p} = B \Sigma_{y} B^\top
\end{aligned}
$$
@@ -1639,7 +1645,7 @@ $$
$$
$$
-\Sigma_{z}=D\Sigma_{y}D^{\prime}
+\Sigma_{z}=D\Sigma_{y}D^\top
$$
```{code-cell} python3
@@ -1656,7 +1662,7 @@ conditional mean $E \left[p_{t} \mid y_{t-1}, y_{t}\right]$ using
the `MultivariateNormal` class.
```{code-cell} python3
-z = np.random.multivariate_normal(μz, Σz)
+z = rng.multivariate_normal(μz, Σz)
y, p = z[:T], z[T:]
```
@@ -1688,12 +1694,12 @@ plt.show()
In the above graph, the green line is what the price of the stock would
be if people had perfect foresight about the path of dividends while the
-green line is the conditional expectation $E p_t | y_t, y_{t-1}$, which is what the price would
+red line is the conditional expectation $E p_t | y_t, y_{t-1}$, which is what the price would
be if people did not have perfect foresight but were optimally
predicting future dividends on the basis of the information
$y_t, y_{t-1}$ at time $t$.
-## Filtering Foundations
+## Filtering foundations
Assume that $x_0$ is an $n \times 1$ random vector and that
$y_0$ is a $p \times 1$ random vector determined by the
@@ -1711,7 +1717,7 @@ We consider the problem of someone who
* *observes* $y_0$
* does not observe $x_0$,
-* knows $\hat x_0, \Sigma_0, G, R$ and therefore the joint probability distribution of the vector $\begin{bmatrix} x_0 \cr y_0 \end{bmatrix}$
+* knows $\hat x_0, \Sigma_0, G, R$ and therefore the joint probability distribution of the vector $\begin{bmatrix} x_0 \cr y_0 \end{bmatrix}$
* wants to infer $x_0$ from $y_0$ in light of what he knows about that
joint probability distribution.
@@ -1728,7 +1734,7 @@ $$
G \Sigma_0 & G \Sigma_0 G' + R \end{bmatrix}
$$
-By applying an appropriate instance of the above formulas for the mean vector $\hat \mu_1$ and covariance matrix
+By applying an appropriate instance of the above formulas for the mean vector $\hat \mu_1$ and covariance matrix
$\hat \Sigma_{11}$ of $z_1$ conditional on $z_2$, we find that the probability distribution of
$x_0$ conditional on $y_0$ is
${\mathcal N}(\tilde x_0, \tilde \Sigma_0)$ where
@@ -1834,7 +1840,7 @@ of $x_t$ conditional on
$y_0, y_1, \ldots , y_{t-1} = y^{t-1}$ is
$$
-x_t | y^{t-1} \sim {\mathcal N}(A \tilde x_t , A \tilde \Sigma_t A' + C C' )
+x_t | y^{t-1} \sim {\mathcal N}(A \tilde x_{t-1} , A \tilde \Sigma_{t-1} A' + C C' )
$$
where $\{\tilde x_t, \tilde \Sigma_t\}_{t=1}^\infty$ can be
@@ -1858,7 +1864,7 @@ $$
\Sigma_{t+1}= C C' + A \Sigma_t A' - A \Sigma_t G' (G \Sigma_t G' +R)^{-1} G \Sigma_t A' .
$$
-This is a matrix Riccati difference equation that is closely related to another matrix Riccati difference equation that appears in a quantecon lecture on the basics of linear quadratic control theory.
+This is a matrix Riccati difference equation that is closely related to another matrix Riccati difference equation that appears in a quantecon lecture on the basics of linear quadratic control theory.
That equation has the form
@@ -1874,7 +1880,7 @@ P_{t-1} =R + A' P_t A - A' P_t B
Stare at the two preceding equations for a moment or two, the first being a matrix difference equation for a conditional covariance matrix, the
second being a matrix difference equation in the matrix appearing in a quadratic form for an intertemporal cost of value function.
-Although the two equations are not identical, they display striking family resemblences.
+Although the two equations are not identical, they display striking family resemblances.
* the first equation tells dynamics that work **forward** in time
* the second equation tells dynamics that work **backward** in time
@@ -1895,7 +1901,7 @@ G = np.array([[1., 3.]])
R = np.array([[1.]])
x0_hat = np.array([0., 1.])
-Σ0 = np.array([[1., .5], [.3, 2.]])
+Σ0 = np.array([[1., .5], [.5, 2.]])
μ = np.hstack([x0_hat, G @ x0_hat])
Σ = np.block([[Σ0, Σ0 @ G.T], [G @ Σ0, G @ Σ0 @ G.T + R]])
@@ -1929,7 +1935,7 @@ x1_cond = A @ μ1_hat
x1_cond, Σ1_cond
```
-### Code for Iterating
+### Code for iterating
Here is code for solving a dynamic filtering problem by iterating on our
equations, followed by an example.
@@ -1970,12 +1976,12 @@ iterate(x0_hat, Σ0, A, C, G, R, [2.3, 1.2, 3.2])
The iterative algorithm just described is a version of the celebrated **Kalman filter**.
-We describe the Kalman filter and some applications of it in {doc}`A First Look at the Kalman Filter `
+We describe the Kalman filter and some applications of it in {doc}`A First Look at the Kalman Filter `
-## Classic Factor Analysis Model
+## Classic factor analysis model
-The factor analysis model widely used in psychology and other fields can
+The factor analysis model can
be represented as
$$
@@ -1985,31 +1991,31 @@ $$
where
1. $Y$ is $n \times 1$ random vector,
- $E U U^{\prime} = D$ is a diagonal matrix,
-1. $\Lambda$ is $n \times k$ coefficient matrix,
-1. $f$ is $k \times 1$ random vector,
- $E f f^{\prime} = I$,
-1. $U$ is $n \times 1$ random vector, and $U \perp f$ (i.e., $E U f' = 0 $ )
-1. It is presumed that $k$ is small relative to $n$; often
+ $E U U^\top = D$ is a diagonal matrix,
+2. $\Lambda$ is $n \times k$ coefficient matrix,
+3. $f$ is $k \times 1$ random vector,
+ $E f f^\top = I$,
+4. $U$ is $n \times 1$ random vector, and $U \perp f$ (i.e., $E U f^\top = 0 $ )
+5. It is presumed that $k$ is small relative to $n$; often
$k$ is only $1$ or $2$, as in our IQ examples.
This implies that
$$
\begin{aligned}
-\Sigma_y = E Y Y^{\prime} = \Lambda \Lambda^{\prime} + D \\
-E Y f^{\prime} = \Lambda \\
-E f Y^{\prime} = \Lambda^{\prime}
+\Sigma_y = E Y Y^\top = \Lambda \Lambda^\top + D \\
+E Y f^\top = \Lambda \\
+E f Y^\top = \Lambda^\top
\end{aligned}
$$
Thus, the covariance matrix $\Sigma_Y$ is the sum of a diagonal
matrix $D$ and a positive semi-definite matrix
-$\Lambda \Lambda^{\prime}$ of rank $k$.
+$\Lambda \Lambda^\top$ of rank $k$.
This means that all covariances among the $n$ components of the
$Y$ vector are intermediated by their common dependencies on the
-$k<$ factors.
+$k$ factors.
Form
@@ -2024,9 +2030,9 @@ the covariance matrix of the expanded random vector $Z$ can be
computed as
$$
-\Sigma_{z} = EZZ^{\prime}=\left(\begin{array}{cc}
-I & \Lambda^{\prime}\\
-\Lambda & \Lambda\Lambda^{\prime}+D
+\Sigma_{z} = EZZ^\top=\left(\begin{array}{cc}
+I & \Lambda^\top\\
+\Lambda & \Lambda\Lambda^\top+D
\end{array}\right)
$$
@@ -2093,7 +2099,7 @@ $Z$.
```
```{code-cell} python3
-z = np.random.multivariate_normal(μz, Σz)
+z = rng.multivariate_normal(μz, Σz)
f = z[:k]
y = z[k:]
@@ -2113,7 +2119,7 @@ multi_normal_factor.cond_dist(0, y)
We can verify that the conditional mean
$E \left[f \mid Y=y\right] = B Y$ where
-$B = \Lambda^{\prime} \Sigma_{y}^{-1}$.
+$B = \Lambda^\top \Sigma_{y}^{-1}$.
```{code-cell} python3
B = Λ.T @ np.linalg.inv(Σy)
@@ -2134,7 +2140,7 @@ $\Lambda I^{-1} f = \Lambda f$.
Λ @ f
```
-## PCA and Factor Analysis
+## PCA and factor analysis
To learn about Principal Components Analysis (PCA), please see this lecture {doc}`Singular Value Decompositions `.
@@ -2144,8 +2150,9 @@ model.
-Technically, this means that the PCA model is misspecified. (Can you
-explain why?)
+Technically, this means that the PCA model is misspecified.
+
+(Can you explain why?)
Nevertheless, this exercise will let us study how well the first two
principal components from a PCA can approximate the conditional
@@ -2156,7 +2163,7 @@ governs the data on $Y$ we have generated.
So we compute the PCA decomposition
$$
-\Sigma_{y} = P \tilde{\Lambda} P^{\prime}
+\Sigma_{y} = P \tilde{\Lambda} P^\top
$$
where $\tilde{\Lambda}$ is a diagonal matrix.
@@ -2170,7 +2177,7 @@ $$
and
$$
-\epsilon = P^\prime Y
+\epsilon = P^\top Y
$$
Note that we will arrange the eigenvectors in $P$ in the
@@ -2246,51 +2253,144 @@ Let’s look at them, after which we’ll look at $E f | y = B y$
B @ y
```
-The fraction of variance in $y_{t}$ explained by the first two
-principal components can be computed as below.
+```{note}
+The two largest eigenvalues are both $5.25$ in this example.
+
+
+When an
+eigenvalue is repeated, the associated principal components are not
+individually pinned down: any orthonormal basis for the same
+two-dimensional eigenspace is valid.
+
+For that reason, it is not meaningful to compare $\epsilon_1$ and
+$\epsilon_2$ component-by-component with $E[f \mid Y]$.
+
+The PC scores
+live in a PCA coordinate system, while $E[f \mid Y]$ lives in factor
+space.
+
+Even within the common two-dimensional subspace, the PCA basis can
+be rotated or sign-flipped, and its coordinates need not use the same
+scaling as the factor coordinates.
+
+What is uniquely determined is the two-dimensional subspace spanned by
+the first two columns of $P$.
+
+In this symmetric example, that subspace is
+exactly the column space of $\Lambda$.
+```
+
+The fraction of variance in $y_t$ explained by the first two principal
+components is
```{code-cell} python3
𝜆_tilde[:2].sum() / 𝜆_tilde.sum()
```
-Compute
+To compare PCA with the factor model in observation space, compute
$$
\hat{Y} = P_{j} \epsilon_{j} + P_{k} \epsilon_{k}
$$
-where $P_{j}$ and $P_{k}$ correspond to the largest two
-eigenvalues.
+where $P_j$ and $P_k$ are the eigenvectors associated with the two
+largest eigenvalues.
```{code-cell} python3
y_hat = P[:, :2] @ ε[:2]
```
-In this example, it turns out that the projection $\hat{Y}$ of
-$Y$ on the first two principal components does a good job of
-approximating $Ef \mid y$.
+$\hat{Y}$ is the rank-2 PCA approximation to $Y$ in observation space,
+so it is a 10-vector rather than a 2-vector.
+
+The natural observation-space
+counterpart from the factor model is $\Lambda E[f \mid Y]$, which is
+also a 10-vector.
+
+In this symmetric example, both vectors lie in the same two-dimensional
+subspace, namely the column space of $\Lambda$.
+
+They are therefore close,
+but not identical.
+
+The PCA reconstruction uses the block means directly,
+while $\Lambda E[f \mid Y]$ shrinks those block means toward zero by the
+factor $5/(5+\sigma_u^2) \approx 0.952$.
+
+The next plot makes this comparison concrete.
+
+The two scatter plots, $E[Y \mid f] = \Lambda f$ and $\hat{Y}$, are both
+10-vectors in observation space, so they can be compared directly.
+
+The horizontal lines show the factor values $f_1$ and $f_2$, together
+with their posterior means $E[f_i \mid Y]$.
+
+These are 2-dimensional
+factor-space quantities, drawn over the relevant half of the index set to
+match the block structure of $\Lambda$.
+
+This uses the same idea as the earlier formula
+$E[Y \mid f] = \Lambda f$: the matrix $\Lambda$ maps a 2-vector in factor
+space into a 10-vector in observation space.
-We confirm this in the following plot of $f$,
-$E y \mid f$, $E f \mid y$, and $\hat{y}$ on the
-coordinate axis versus $y$ on the ordinate axis.
+In our example,
+
+$$
+\Lambda a
+=
+\begin{bmatrix}
+a_1 \\
+\vdots \\
+a_1 \\
+a_2 \\
+\vdots \\
+a_2
+\end{bmatrix}
+\quad \text{for any } a = \begin{bmatrix} a_1 \\ a_2 \end{bmatrix},
+$$
+
+because the first five rows of $\Lambda$ are $(1,0)$ and the last five
+rows are $(0,1)$.
+
+Therefore, once we observe $Y=y$, the posterior mean
+$E[f \mid Y=y] = \begin{bmatrix} E[f_1 \mid y] \\ E[f_2 \mid y] \end{bmatrix}$
+is converted into the observation-space vector
+
+$$
+\Lambda E[f \mid Y=y]
+=
+\begin{bmatrix}
+E[f_1 \mid y] \\
+\vdots \\
+E[f_1 \mid y] \\
+E[f_2 \mid y] \\
+\vdots \\
+E[f_2 \mid y]
+\end{bmatrix}.
+$$
+
+So the horizontal line at height $E[f_1 \mid y]$ over the first five
+indices, together with the horizontal line at height $E[f_2 \mid y]$
+over the last five indices, is exactly a picture of
+$\Lambda E[f \mid Y=y]$.
```{code-cell} python3
-plt.scatter(range(N), Λ @ f, label='$Ey|f$')
-plt.scatter(range(N), y_hat, label=r'$\hat{y}$')
+plt.scatter(range(N), Λ @ f, label=r'$E[Y \mid f]$')
+plt.scatter(range(N), y_hat, label=r'$\hat{Y}$')
plt.hlines(f[0], 0, N//2-1, ls='--', label='$f_{1}$')
plt.hlines(f[1], N//2, N-1, ls='-.', label='$f_{2}$')
Efy = B @ y
-plt.hlines(Efy[0], 0, N//2-1, ls='--', color='b', label='$Ef_{1}|y$')
-plt.hlines(Efy[1], N//2, N-1, ls='-.', color='b', label='$Ef_{2}|y$')
+plt.hlines(Efy[0], 0, N//2-1, ls='--', color='b', label=r'$E[f_1 \mid y]$')
+plt.hlines(Efy[1], N//2, N-1, ls='-.', color='b', label=r'$E[f_2 \mid y]$')
plt.legend()
plt.show()
```
-The covariance matrix of $\hat{Y}$ can be computed by first
-constructing the covariance matrix of $\epsilon$ and then use the
-upper left block for $\epsilon_{1}$ and $\epsilon_{2}$.
+To compute the covariance matrix of $\hat{Y}$, first form the covariance
+matrix of $\epsilon$ and then extract the upper-left block corresponding
+to $\epsilon_1$ and $\epsilon_2$.
```{code-cell} python3
Σεjk = (P.T @ Σy @ P)[:2, :2]
@@ -2300,3 +2400,439 @@ Pjk = P[:, :2]
Σy_hat = Pjk @ Σεjk @ Pjk.T
print('Σy_hat = \n', Σy_hat)
```
+
+## Exercises
+
+```{exercise}
+:label: mv_normal_ex1
+
+**Verify conditional mean and variance by simulation**
+
+For the bivariate normal with
+
+$$
+\mu = \begin{bmatrix} 0.5 \\ 1.0 \end{bmatrix}, \quad
+\Sigma = \begin{bmatrix} 1 & 0.5 \\ 0.5 & 1 \end{bmatrix}
+$$
+
+fix $z_2 = 2$.
+
+1. Use `MultivariateNormal` to compute the analytical conditional mean
+$\hat{\mu}_1$ and variance $\hat{\Sigma}_{11}$ of $z_1 \mid z_2 = 2$.
+
+1. Draw $10^6$ samples from the joint distribution.
+
+ Retain only those for which $|z_2 - 2| < 0.05$.
+
+ Compute the sample mean and variance of the retained $z_1$ values.
+
+1. Confirm that the sample estimates are close to the analytical values.
+```
+
+```{solution-start} mv_normal_ex1
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} python3
+μ = np.array([.5, 1.])
+Σ = np.array([[1., .5], [.5, 1.]])
+
+mn = MultivariateNormal(μ, Σ)
+mn.partition(1)
+μ1_hat, Σ11_hat = mn.cond_dist(0, np.array([2.]))
+print(f"Analytical μ1_hat = {μ1_hat[0]:.4f}, Σ11_hat = {Σ11_hat[0,0]:.4f}")
+
+n = 1_000_000
+data = rng.multivariate_normal(μ, Σ, size=n)
+z1_all, z2_all = data[:, 0], data[:, 1]
+
+mask = np.abs(z2_all - 2.) < 0.05
+z1_cond = z1_all[mask]
+print(f"Sample size in band: {mask.sum()}")
+print(f"Sample μ1_hat = {np.mean(z1_cond):.4f}, Σ11_hat = {np.var(z1_cond, ddof=1):.4f}")
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: mv_normal_ex2
+
+**Product of regression slopes equals squared correlation**
+
+For a bivariate normal with standard deviations $\sigma_1 = \sigma_2 = 1$ and
+correlation $\rho$, show analytically that $b_1 b_2 = \rho^2$, where
+$b_1$ is the slope of $z_1$ on $z_2$ and $b_2$ is the slope of $z_2$
+on $z_1$.
+
+Then verify numerically for $\rho \in \{0.2, 0.5, 0.9\}$ that
+`βs[0] * βs[1]` $= \rho^2$ by constructing the appropriate
+`MultivariateNormal` instances.
+```
+
+```{solution-start} mv_normal_ex2
+:class: dropdown
+```
+
+The regression slopes are
+
+$$
+b_1 = \frac{\Sigma_{12}}{\Sigma_{22}} = \frac{\rho \sigma_1 \sigma_2}{\sigma_2^2}
+= \rho \frac{\sigma_1}{\sigma_2}, \qquad
+b_2 = \frac{\Sigma_{21}}{\Sigma_{11}} = \rho \frac{\sigma_2}{\sigma_1}
+$$
+
+so $b_1 b_2 = \rho^2$.
+
+```{code-cell} python3
+for ρ in [0.2, 0.5, 0.9]:
+ Σ = np.array([[1., ρ], [ρ, 1.]])
+ mn = MultivariateNormal(np.zeros(2), Σ)
+ mn.partition(1)
+ product = mn.βs[0].item() * mn.βs[1].item()
+ print(f"ρ={ρ:.1f}: b1*b2 = {product:.4f}")
+ print(f"ρ^2 = {ρ**2:.4f}, match: {np.isclose(product, ρ**2)}")
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: mv_normal_ex3
+
+**IQ inference: effect of the signal-to-noise ratio**
+
+Using the one-dimensional IQ model with $n = 50$ test scores and
+$\mu_\theta = 100$, $\sigma_\theta = 10$:
+
+1. Vary the test-score noise $\sigma_y \in \{1, 5, 10, 20, 50\}$.
+
+   - For each value, plot the posterior standard deviation
+   $\hat{\sigma}_\theta$ as a function of the number of test scores
+   included (from 1 to 50), with all curves on the same axes.
+
+1. Explain intuitively why a larger $\sigma_y$ leads to a slower
+decline of posterior uncertainty.
+```
+
+```{solution-start} mv_normal_ex3
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} python3
+n_max = 50
+μθ_val, σθ_val = 100., 10.
+
+fig, ax = plt.subplots()
+for σy_val in [1., 5., 10., 20., 50.]:
+ σθ_hat_arr = np.empty(n_max)
+ for i in range(1, n_max + 1):
+ μ_i, Σ_i, _ = construct_moments_IQ(i, μθ_val, σθ_val, σy_val)
+ mn_i = MultivariateNormal(μ_i, Σ_i)
+ mn_i.partition(i)
+ _, Σθ_i = mn_i.cond_dist(1, np.zeros(i))
+ σθ_hat_arr[i - 1] = np.sqrt(Σθ_i[0, 0])
+ ax.plot(range(1, n_max + 1), σθ_hat_arr, label=f'σy={σy_val:.0f}')
+
+ax.set_xlabel('number of test scores')
+ax.set_ylabel(r'posterior $\hat{\sigma}_\theta$')
+ax.legend()
+plt.show()
+```
+
+When $\sigma_y$ is large, each test score is a noisy signal about $\theta$,
+so many more observations are required before the posterior variance falls
+appreciably.
+
+In the limit $\sigma_y \to 0$ a single observation pins down
+$\theta$ exactly.
+
+```{solution-end}
+```
+
+````{exercise}
+:label: mv_normal_ex4
+
+**Prior vs. likelihood in IQ inference**
+
+Using the one-dimensional IQ model with $n = 20$ test scores and
+$\mu_\theta = 100$, $\sigma_y = 10$:
+
+1. Vary the prior spread
+$\sigma_\theta \in \{1, 5, 10, 50, 500\}$.
+
+ - For each value compute the
+ posterior mean $\hat{\mu}_\theta$ given the same set of $n = 20$ test
+ scores and plot $\hat{\mu}_\theta$ against $\sigma_\theta$.
+
+1. Show analytically (or verify numerically) that
+
+ - as $\sigma_\theta \to \infty$ the posterior mean converges to the
+ sample mean $\bar{y}$ (the data dominate the prior), and
+ - as $\sigma_\theta \to 0$ the posterior mean converges to the prior
+ mean $\mu_\theta$ (the prior dominates the data).
+
+```{hint}
+The posterior mean formula is
+$\hat{\mu}_\theta = \bigl(\mu_\theta/\sigma_\theta^2 + n\bar{y}/\sigma_y^2\bigr)
+\big/ \bigl(1/\sigma_\theta^2 + n/\sigma_y^2\bigr)$.
+```
+
+Examine each limit by letting $\sigma_\theta$ go to $\infty$ or $0$.
+````
+
+```{solution-start} mv_normal_ex4
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} python3
+n_scores = 20
+μθ_val, σy_val = 100., 10.
+
+rng = np.random.default_rng(42)
+true_θ = 108.
+y_obs = true_θ + σy_val * rng.standard_normal(n_scores)
+y_bar = np.mean(y_obs)
+
+σθ_vals = [1., 5., 10., 50., 500.]
+μθ_hat_vals = []
+
+for σθ_val in σθ_vals:
+ μ_i, Σ_i, _ = construct_moments_IQ(n_scores, μθ_val, σθ_val, σy_val)
+ mn_i = MultivariateNormal(μ_i, Σ_i)
+ mn_i.partition(n_scores)
+ μθ_hat, _ = mn_i.cond_dist(1, y_obs)
+ μθ_hat_vals.append(μθ_hat.item())
+
+def posterior_mean(σθ_val):
+ μ_i, Σ_i, _ = construct_moments_IQ(n_scores, μθ_val, σθ_val, σy_val)
+ mn_i = MultivariateNormal(μ_i, Σ_i)
+ mn_i.partition(n_scores)
+ μθ_hat, _ = mn_i.cond_dist(1, y_obs)
+ return μθ_hat.item()
+
+fig, ax = plt.subplots()
+ax.semilogx(σθ_vals, μθ_hat_vals, 'o-',
+ label=r'$\hat{\mu}_\theta$')
+ax.axhline(y_bar, ls='--', color='r',
+ label=f'sample mean y_bar = {y_bar:.1f}')
+ax.axhline(μθ_val, ls=':', color='g',
+ label=f'prior mean μθ = {μθ_val:.0f}')
+ax.set_xlabel(r'$\sigma_\theta$')
+ax.set_ylabel(r'posterior mean $\hat{\mu}_\theta$')
+ax.legend()
+plt.show()
+
+σθ_small = 1e-2
+σθ_large = 1e4
+
+print(f"y_bar = {y_bar:.4f}")
+print(f"Posterior mean with σθ={σθ_large:.0e}: {posterior_mean(σθ_large):.4f}")
+print(f"Posterior mean with σθ={σθ_small:.0e}: {posterior_mean(σθ_small):.4f}")
+print(f"Prior mean μθ = {μθ_val:.4f}")
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: mv_normal_ex5
+
+**Kalman filter convergence**
+
+Using the `iterate` function from the Filtering Foundations section with
+
+$$
+A = \begin{bmatrix} 0.9 & 0 \\ 0 & 0.5 \end{bmatrix}, \quad
+C = \begin{bmatrix} 1 \\ 1 \end{bmatrix}, \quad
+G = \begin{bmatrix} 1 & 0 \end{bmatrix}, \quad
+R = \begin{bmatrix} 1 \end{bmatrix}
+$$
+
+and initial conditions $\hat{x}_0 = [0, 0]^\top$, $\Sigma_0 = I_2$:
+
+1. Simulate $T = 60$ periods of $\{x_t, y_t\}$ and run the filter.
+
+1. Plot the sequences of conditional variances $\Sigma_t[0,0]$ and
+$\Sigma_t[1,1]$ over time.
+
+ Verify that they converge to a steady state.
+
+1. Plot the filtered state estimates $\tilde{x}_t[0]$ together with the
+true $x_t[0]$ and the raw observations $y_t$ on a single figure.
+```
+
+```{solution-start} mv_normal_ex5
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} python3
+A_ex = np.array([[0.9, 0.], [0., 0.5]])
+C_ex = np.array([[1.], [1.]])
+G_ex = np.array([[1., 0.]])
+R_ex = np.array([[1.]])
+
+T_ex = 60
+x0_hat_ex = np.zeros(2)
+Σ0_ex = np.eye(2)
+
+rng = np.random.default_rng(7)
+x_true = np.zeros((T_ex + 1, 2))
+y_seq_ex = np.zeros(T_ex)
+for t in range(T_ex):
+ x_true[t + 1] = A_ex @ x_true[t] + C_ex[:, 0] * rng.standard_normal()
+ y_seq_ex[t] = (G_ex @ x_true[t]).item() + rng.standard_normal()
+
+x_hat_seq, Σ_hat_seq = iterate(
+ x0_hat_ex, Σ0_ex, A_ex, C_ex, G_ex, R_ex, y_seq_ex)
+
+# x_hat_seq[t] = E[x_t | y^{t-1}] (one-step-ahead prediction)
+# Σ_hat_seq[t] = corresponding prediction-error covariance
+fig, ax = plt.subplots()
+ax.plot(Σ_hat_seq[:, 0, 0], label=r'$\Sigma_t[0,0]$')
+ax.plot(Σ_hat_seq[:, 1, 1], label=r'$\Sigma_t[1,1]$')
+ax.set_xlabel('t')
+ax.set_ylabel('prediction-error variance')
+ax.legend()
+plt.show()
+
+# The `iterate` function stores one-step-ahead predictions.
+# We recover the filtered estimates E[x_t | y^t] by re-applying
+# the measurement-update step at each t.
+n_state = 2
+x_filt_seq = np.empty((T_ex, n_state))
+for t in range(T_ex):
+ xt_hat = x_hat_seq[t]
+ Σt = Σ_hat_seq[t]
+ μ_k = np.hstack([xt_hat, G_ex @ xt_hat])
+ Σ_k = np.block([[Σt, Σt @ G_ex.T ],
+ [G_ex @ Σt, G_ex @ Σt @ G_ex.T + R_ex]])
+ mn_k = MultivariateNormal(μ_k, Σ_k)
+ mn_k.partition(n_state)
+ x_filt_seq[t], _ = mn_k.cond_dist(0, y_seq_ex[t:t+1])
+
+fig, ax = plt.subplots()
+ax.plot(x_true[:-1, 0], label='true $x_t[0]$', alpha=0.7)
+ax.plot(x_filt_seq[:, 0], label=r'filtered $\tilde{x}_t[0]$', ls='--')
+ax.plot(y_seq_ex, label='observations $y_t$', alpha=0.4, lw=0.8)
+ax.set_xlabel('t')
+ax.legend()
+plt.show()
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: mv_normal_ex6
+
+**PCA vs. factor analysis**
+
+In the classic factor analysis model at the end of the lecture the true
+covariance is $\Sigma_y = \Lambda \Lambda^\top + D$.
+
+1. Set $\sigma_u = 2$ (instead of $0.5$).
+
+ - Recompute the fraction of
+ variance explained by the first two principal components and compare
+ it with the $\sigma_u = 0.5$ result.
+ - Explain the change.
+
+1. Show that the observation-space factor-analytic posterior
+ $\Lambda E[f \mid Y] = \Lambda B Y$ (an $N$-vector) is **not** equal to
+ the two-component PCA reconstruction
+ $\hat{Y} = P_{:,1:2}\,\epsilon_{1:2}$ (also an $N$-vector).
+ - Plot both on the same axes.
+
+ *Note:* $E[f \mid Y] = BY$ is a $k$-vector and $\hat{Y}$ is an
+ $N$-vector, so they cannot be compared directly; the comparison must be
+ made in observation space via $\Lambda E[f \mid Y]$.
+
+1. In one or two sentences, explain why PCA is misspecified for
+factor-analytic data.
+```
+
+```{solution-start} mv_normal_ex6
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} python3
+rng = np.random.default_rng(42)
+
+N_fa = 10
+k_fa = 2
+
+Λ_fa = np.zeros((N_fa, k_fa))
+Λ_fa[:N_fa//2, 0] = 1
+Λ_fa[N_fa//2:, 1] = 1
+
+for σu_val in [0.5, 2.0]:
+ D_fa = np.eye(N_fa) * σu_val ** 2
+ Σy_fa = Λ_fa @ Λ_fa.T + D_fa
+
+ λ_fa, P_fa = np.linalg.eigh(Σy_fa)
+ ind_fa = sorted(range(N_fa), key=lambda x: λ_fa[x], reverse=True)
+ P_fa = P_fa[:, ind_fa]
+ λ_fa = λ_fa[ind_fa]
+
+ frac = λ_fa[:2].sum() / λ_fa.sum()
+ print(f"σu={σu_val}: fraction explained by first 2 PCs = {frac:.4f}")
+
+σu_b = 0.5
+D_b = np.eye(N_fa) * σu_b ** 2
+Σy_b = Λ_fa @ Λ_fa.T + D_b
+
+μz_b = np.zeros(k_fa + N_fa)
+Σz_b = np.block([[np.eye(k_fa), Λ_fa.T], [Λ_fa, Σy_b]])
+z_b = rng.multivariate_normal(μz_b, Σz_b)
+f_b = z_b[:k_fa]
+y_b = z_b[k_fa:]
+
+B_b = Λ_fa.T @ np.linalg.inv(Σy_b)
+Efy_b = B_b @ y_b
+
+λ_b, P_b = np.linalg.eigh(Σy_b)
+ind_b = sorted(range(N_fa), key=lambda x: λ_b[x], reverse=True)
+P_b = P_b[:, ind_b]
+ε_b = P_b.T @ y_b
+y_hat_b = P_b[:, :2] @ ε_b[:2]
+
+fig, ax = plt.subplots(figsize=(8, 4))
+ax.scatter(range(N_fa),
+ Λ_fa @ Efy_b, label=r'Factor-analytic $\Lambda E[f\mid y]$')
+ax.scatter(range(N_fa),
+ y_hat_b, marker='x', label=r'PCA projection $\hat{y}$')
+ax.scatter(range(N_fa),
+ Λ_fa @ f_b, marker='^', alpha=0.6, label=r'True signal $\Lambda f$')
+ax.set_xlabel('observation index')
+ax.legend()
+plt.show()
+```
+
+In this symmetric example, PCA does recover the same two-dimensional
+observation-space subspace as the factor model, namely the column space
+of $\Lambda$. But PCA is still misspecified for factor-analytic data,
+because it treats the covariance matrix as an arbitrary matrix to be
+approximated and does not use the special decomposition
+$\Sigma_y = \Lambda \Lambda^\top + D$ into a common part and an
+idiosyncratic noise part.
+
+So the two methods are solving different problems. PCA forms
+$\hat{Y}$ as the best rank-2 approximation to the observed data vector
+$Y$, which in this example amounts to using the block means. The factor
+model instead computes $\Lambda E[f \mid Y]$, the conditional mean of the
+latent common component $\Lambda f$ given the data, and because it
+accounts for noise it shrinks those block means toward zero.
+
+```{solution-end}
+```
diff --git a/lectures/odu.md b/lectures/odu.md
index c62519468..f15c54fb9 100644
--- a/lectures/odu.md
+++ b/lectures/odu.md
@@ -245,12 +245,12 @@ What kind of optimal policy might result from
{eq}`odu_mvf` and the parameterization specified above?
Intuitively, if we accept at $w_a$ and $w_a\leq w_b$,
-then — all other things being given — we should also accept at $w_b$.
+then -- all other things being given -- we should also accept at $w_b$.
This suggests a policy of accepting whenever $w$ exceeds some
threshold value $\bar w$.
-But $\bar w$ should depend on $\pi$ — in
+But $\bar w$ should depend on $\pi$ -- in
fact, it should be decreasing in $\pi$ because
- $f$ is a less attractive offer distribution than $g$
diff --git a/lectures/prob_matrix.md b/lectures/prob_matrix.md
index b142b9e39..b29820c20 100644
--- a/lectures/prob_matrix.md
+++ b/lectures/prob_matrix.md
@@ -1,10 +1,10 @@
---
jupytext:
text_representation:
- extension: .myst
+ extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.13.8
+ jupytext_version: 1.17.1
kernelspec:
display_name: Python 3 (ipykernel)
language: python
@@ -17,7 +17,7 @@ kernelspec:
This lecture uses matrix algebra to illustrate some basic ideas about probability theory.
-After introducing underlying objects, we'll use matrices and vectors to describe probability distributions.
+After introducing underlying objects, we'll use matrices and vectors to describe probability distributions.
Among concepts that we'll be studying include
@@ -29,13 +29,13 @@ Among concepts that we'll be studying include
- couplings
- copulas
- the probability distribution of a sum of two independent random variables
- - convolution of marginal distributions
+ - convolution of marginal distributions
- parameters that define a probability distribution
- sufficient statistics as data summaries
We'll use a matrix to represent a bivariate or multivariate probability distribution and a vector to represent a univariate probability distribution
-This {doc}`companion lecture ` describes some popular probability distributions and describes how to use Python to sample from them.
+This {doc}`companion lecture ` describes some popular probability distributions and describes how to use Python to sample from them.
In addition to what's in Anaconda, this lecture will need the following libraries:
@@ -53,79 +53,94 @@ As usual, we'll start with some imports
import numpy as np
import matplotlib.pyplot as plt
import prettytable as pt
+from scipy import stats
+from scipy.special import comb
from mpl_toolkits.mplot3d import Axes3D
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('retina')
+
+rng = np.random.default_rng(0)
```
-## Sketch of Basic Concepts
+## Sketch of basic concepts
We'll briefly define what we mean by a **probability space**, a **probability measure**, and a **random variable**.
For most of this lecture, we sweep these objects into the background
```{note}
-Nevertheless, they'll be lurking beneath **induced distributions** of random variables that we'll focus on here. These deeper objects are essential for defining and analysing the concepts of stationarity and ergodicity that underly laws of large numbers. For a relatively
-nontechnical presentation of some of these results see this chapter from Lars Peter Hansen and Thomas J. Sargent's online monograph titled "Risk, Uncertainty, and Values":.
+Nevertheless, they'll be lurking beneath **induced distributions** of random variables that we'll focus on here.
+
+These deeper objects are essential for defining and analysing the concepts of stationarity and ergodicity that underlie laws of large numbers.
+
+For a relatively
+nontechnical presentation of some of these results see this chapter from Lars Peter Hansen and Thomas J. Sargent's online monograph titled [*Risk, Uncertainty, and Values*](https://lphansen.github.io/QuantMFR/book/1_stochastic_processes.html).
```
-Let $\Omega$ be a set of possible underlying outcomes and let $\omega \in \Omega$ be a particular underlying outcomes.
+Let $\Omega$ be a set of possible underlying outcomes and let $\omega \in \Omega$ be a particular underlying outcome.
-Let $\mathcal{G} \subset \Omega$ be a subset of $\Omega$.
+Let $\mathcal{F}$ be a collection of subsets of $\Omega$ that we call **events**.
-Let $\mathcal{F}$ be a collection of such subsets $\mathcal{G} \subset \Omega$.
+(Technically, $\mathcal{F}$ is a [$\sigma$-algebra](https://en.wikipedia.org/wiki/Sigma-algebra).)
-The pair $\Omega,\mathcal{F}$ forms our **probability space** on which we want to put a probability measure.
+A **probability measure** $\mu$ maps each event $\mathcal{G} \in \mathcal{F}$ into a scalar number $\mu(\mathcal{G})$ between $0$ and $1$, with $\mu(\Omega)=1$.
-A **probability measure** $\mu$ maps a set of possible underlying outcomes $\mathcal{G} \in \mathcal{F}$ into a scalar number between $0$ and $1$
+The triple $\Omega,\mathcal{F},\mu$ forms our **probability space**.
-- this is the "probability" that $X$ belongs to $A$, denoted by $ \textrm{Prob}\{X\in A\}$.
+A **random variable** $X(\omega)$ is a function of the underlying outcome $\omega \in \Omega$ that assigns a value in some set of possible values.
-A **random variable** $X(\omega)$ is a function of the underlying outcome $\omega \in \Omega$.
+If $A$ is a set of possible values of $X$, then the event that $X$ lies in $A$ is
+$$
+\mathcal{G} = \{\omega \in \Omega : X(\omega) \in A\}.
+$$
-The random variable $X(\omega)$ has a **probability distribution** that is induced by the underlying probability measure $\mu$ and the function
-$X(\omega)$:
+The random variable $X(\omega)$ has a **probability distribution** induced by the probability measure $\mu$:
$$
-\textrm{Prob} (X \in A ) = \int_{\mathcal{G}} \mu(\omega) d \omega
-$$ (eq:CDFfromdensity)
+\textrm{Prob}(X \in A) = \mu(\mathcal{G}).
+$$
-where ${\mathcal G}$ is the subset of $\Omega$ for which $X(\omega) \in A$.
+If $\mu$ has a density $p(\omega)$, then we can also write
+
+$$
+\textrm{Prob}(X \in A) = \int_{\mathcal{G}} p(\omega)\, d \omega
+$$ (eq:CDFfromdensity)
We call this the induced probability distribution of random variable $X$.
-Instead of working explicitly with an underlying probability space $\Omega,\mathcal{F}$ and probability measure $\mu$,
-applied statisticians often proceed simply by specifying a form for an induced distribution for a random variable $X$.
+Instead of working explicitly with an underlying probability space $\Omega,\mathcal{F}$ and probability measure $\mu$,
+applied statisticians often proceed simply by specifying a form for an induced distribution for a random variable $X$.
-That is how we'll proceed in this lecture and in many subsequent lectures.
+That is how we'll proceed in this lecture and in many subsequent lectures.
-## What Does Probability Mean?
+## What does probability mean?
Before diving in, we'll say a few words about what probability theory means and how it connects to statistics.
-We also touch on these topics in the quantecon lectures and .
+We also touch on these topics in {doc}`prob_meaning` and {doc}`navy_captain`.
-For much of this lecture we'll be discussing fixed "population" probabilities.
+For much of this lecture we'll be discussing fixed "population" probabilities.
These are purely mathematical objects.
To appreciate how statisticians connect probabilities to data, the key is to understand the following concepts:
* A single draw from a probability distribution
-* Repeated independently and identically distributed (i.i.d.) draws of "samples" or "realizations" from the same probability distribution
-* A **statistic** defined as a function of a sequence of samples
-* An **empirical distribution** or **histogram** (a binned empirical distribution) that records observed **relative frequencies**
-* The idea that a population probability distribution is what we anticipate **relative frequencies** will be in a long sequence of i.i.d. draws. Here the following mathematical machinery makes precise what is meant by **anticipated relative frequencies**
+* Repeated independently and identically distributed (i.i.d.) draws of "samples" or "realizations" from the same probability distribution
+* A **statistic** defined as a function of a sequence of samples
+* An **empirical distribution** or **histogram** (a binned empirical distribution) that records observed **relative frequencies**
+* The idea that a population probability distribution is what we anticipate **relative frequencies** will be in a long sequence of i.i.d. draws. Here the following mathematical machinery makes precise what is meant by **anticipated relative frequencies**
- **Law of Large Numbers (LLN)**
- - **Central Limit Theorem (CLT)**
+ - **Central Limit Theorem (CLT)**
+### A discrete random variable example
-**Scalar example**
+#### Scalar example
Let $X$ be a scalar random variable that takes on the $I$ possible values
$0, 1, 2, \ldots, I-1$ with probabilities
@@ -147,23 +162,23 @@ $$
as a short-hand way of saying that the random variable $X$ is described by the probability distribution $ \{{f_i}\}_{i=0}^{I-1}$.
-Consider drawing a sample $x_0, x_1, \dots , x_{N-1}$ of $N$ independent and identically distributoed draws of $X$.
+Consider drawing a sample $x_0, x_1, \dots , x_{N-1}$ of $N$ independent and identically distributed draws of $X$.
-What do the "identical" and "independent" mean in IID or iid ("identically and independently distributed")?
+What do "identical" and "independent" mean in IID or iid ("identically and independently distributed")?
- "identical" means that each draw is from the same distribution.
-- "independent" means that joint distribution equal products of marginal distributions, i.e.,
+- "independent" means that the joint distribution equals the product of marginal distributions, i.e.,
$$
\begin{aligned}
-\textrm{Prob}\{x_0 = i_0, x_1 = i_1, \dots , x_{N-1} = i_{N-1}\} &= \textrm{Prob}\{x_0 = i_0\} \cdot \dots \cdot \textrm{Prob}\{x_{I-1} = i_{I-1}\}\\
+\textrm{Prob}\{x_0 = i_0, x_1 = i_1, \dots , x_{N-1} = i_{N-1}\} &= \textrm{Prob}\{x_0 = i_0\} \cdot \dots \cdot \textrm{Prob}\{x_{N-1} = i_{N-1}\}\\
&= f_{i_0} f_{i_1} \cdot \dots \cdot f_{i_{N-1}}\\
\end{aligned}
$$
-We define an **empirical distribution** as follows.
+We define an **empirical distribution** as follows.
-For each $i = 0,\dots,I-1$, let
+For each $i = 0,\dots,I-1$, let
$$
\begin{aligned}
@@ -174,35 +189,31 @@ N & = \sum^{I-1}_{i=0} N_i \quad \text{total number of draws},\\
$$
-Key concepts that connect probability theory with statistics are laws of large numbers and central limit theorems
-
-**LLN:**
+Key concepts that connect probability theory with statistics are laws of large numbers and central limit theorems.
-- A Law of Large Numbers (LLN) states that $\tilde {f_i} \to f_i \text{ as } N \to \infty$
+A Law of Large Numbers (LLN) states that $\tilde {f_i} \to f_i$ as $N \to \infty$.
-**CLT:**
+A Central Limit Theorem (CLT) describes a **rate** at which $\tilde {f_i} \to f_i$.
-- A Central Limit Theorem (CLT) describes a **rate** at which $\tilde {f_i} \to f_i$
+See {doc}`lln_clt` for a detailed treatment of both results.
+### Understanding probability: frequentist vs. Bayesian
-**Remarks**
+For "frequentist" statisticians, **anticipated relative frequency** is **all** that a probability distribution means.
-- For "frequentist" statisticians, **anticipated relative frequency** is **all** that a probability distribution means.
+But for a Bayesian it means something else -- something partly subjective and purely personal.
-- But for a Bayesian it means something else -- something partly subjective and purely personal.
-
- * we say "partly" because a Bayesian also pays attention to relative frequencies
+We say "partly" because a Bayesian also pays attention to relative frequencies.
+## Representing probability distributions
-## Representing Probability Distributions
-
-A probability distribution $\textrm{Prob} (X \in A)$ can be described by its **cumulative distribution function (CDF)**
+A probability distribution $\textrm{Prob} (X \in A)$ can be described by its **cumulative distribution function (CDF)**
$$
F_{X}(x) = \textrm{Prob}\{X\leq x\}.
$$
-Sometimes, but not always, a random variable can also be described by **density function** $f(x)$
+Sometimes, but not always, a random variable can also be described by a **density function** $f(x)$
that is related to its CDF by
$$
@@ -215,13 +226,13 @@ $$
Here $B$ is a set of possible $X$'s whose probability of occurring we want to compute.
-When a probability density exists, a probability distribution can be characterized either by its CDF or by its density.
+When a probability density exists, a probability distribution can be characterized either by its CDF or by its density.
For a **discrete-valued** random variable
* the number of possible values of $X$ is finite or countably infinite
* we replace a **density** with a **probability mass function**, a non-negative sequence that sums to one
-* we replace integration with summation in the formula like {eq}`eq:CDFfromdensity` that relates a CDF to a probability mass function
+* when a density exists, we replace integration with summation in formulas like {eq}`eq:CDFfromdensity`
In this lecture, we mostly discuss discrete random variables.
@@ -231,7 +242,7 @@ Doing this enables us to confine our tool set basically to linear algebra.
Later we'll briefly discuss how to approximate a continuous random variable with a discrete random variable.
-## Univariate Probability Distributions
+## Univariate probability distributions
We'll devote most of this lecture to discrete-valued random variables, but we'll say a few things
about continuous-valued random variables.
@@ -281,15 +292,19 @@ $$
where $\theta $ is a vector of parameters that is of much smaller dimension than $I$.
-**Remarks:**
+A **statistical model** is a joint probability distribution characterized by a list of **parameters**.
+
+The concept of **parameter** is intimately related to the notion of **sufficient statistic**.
+
+A **statistic** is a nonlinear function of a data set.
+
+**Sufficient statistics** summarize all **information** that a data set contains about parameters of a statistical model.
+
+Note that a sufficient statistic corresponds to a particular statistical model.
-- A **statistical model** is a joint probability distribution characterized by a list of **parameters**
-- The concept of **parameter** is intimately related to the notion of **sufficient statistic**.
-- A **statistic** is a nonlinear function of a data set.
-- **Sufficient statistics** summarize all **information** that a data set contains about parameters of statistical model.
- * Note that a sufficient statistic corresponds to a particular statistical model.
- * Sufficient statistics are key tools that AI uses to summarize or compress a **big data** set.
-- R. A. Fisher provided a rigorous definition of **information** -- see
+Sufficient statistics are key tools that AI uses to summarize or compress a **big data** set.
+
+R. A. Fisher provided a rigorous definition of **information** -- see [Fisher information](https://en.wikipedia.org/wiki/Fisher_information).
@@ -298,7 +313,7 @@ An example of a parametric probability distribution is a **geometric distributi
It is described by
$$
-f_{i} = \textrm{Prob}\{X=i\} = (1-\lambda)\lambda^{i},\quad \lambda \in [0,1], \quad i = 0, 1, 2, \ldots
+f_{i} = \textrm{Prob}\{X=i\} = (1-\lambda)\lambda^{i},\quad \lambda \in [0,1), \quad i = 0, 1, 2, \ldots
$$
Evidently, $\sum_{i=0}^{\infty}f_i=1$.
@@ -311,7 +326,7 @@ $$
### Continuous random variable
-Let $X$ be a continous random variable that takes values $X \in \tilde{X}\equiv[X_U,X_L]$ whose distributions have parameters $\theta$.
+Let $X$ be a continuous random variable that takes values in a set $\tilde{X} \subseteq \mathbb{R}$ and whose distribution has parameters $\theta$.
$$
\textrm{Prob}\{X\in A\} = \int_{x\in A} f(x;\theta)\,dx; \quad f(x;\theta)\ge0
@@ -323,7 +338,7 @@ $$
\textrm{Prob}\{X\in \tilde{X}\} =1
$$
-## Bivariate Probability Distributions
+## Bivariate probability distributions
We'll now discuss a bivariate **joint distribution**.
@@ -357,9 +372,9 @@ $$
\sum_{i}\sum_{j}f_{ij}=1
$$
-## Marginal Probability Distributions
+## Marginal probability distributions
-The joint distribution induce marginal distributions
+The joint distribution induces marginal distributions
$$
\textrm{Prob}\{X=i\}= \sum_{j=0}^{J-1}f_{ij} = \mu_i, \quad i=0,\ldots,I-1
@@ -391,7 +406,7 @@ $$
\end{aligned}
$$
-**Digression:** If two random variables $X,Y$ are continuous and have joint density $f(x,y)$, then marginal distributions can be computed by
+As a digression, if two random variables $X,Y$ are continuous and have joint density $f(x,y)$, then marginal distributions can be computed by
$$
\begin{aligned}
@@ -400,7 +415,7 @@ f(y)& = \int_{\mathbb{R}} f(x,y) dx
\end{aligned}
$$
-## Conditional Probability Distributions
+## Conditional probability distributions
Conditional probabilities are defined according to
@@ -422,18 +437,22 @@ where $i=0, \ldots,I-1, \quad j=0,\ldots,J-1$.
Note that
$$
-\sum_{i}\textrm{Prob}\{X_i=i|Y_j=j\}
+\sum_{i}\textrm{Prob}\{X=i|Y=j\}
=\frac{ \sum_{i}f_{ij} }{ \sum_{i}f_{ij}}=1
$$
-**Remark:** The mathematics of conditional probability implies:
+The mathematics of conditional probability implies:
$$
\textrm{Prob}\{X=i|Y=j\} =\frac{\textrm{Prob}\{X=i,Y=j\}}{\textrm{Prob}\{Y=j\}}=\frac{\textrm{Prob}\{Y=j|X=i\}\textrm{Prob}\{X=i\}}{\textrm{Prob}\{Y=j\}}
$$ (eq:condprobbayes)
```{note}
-Formula {eq}`eq:condprobbayes` is also what a Bayesian calls **Bayes' Law**. A Bayesian statistician regards marginal probability distribution $\textrm{Prob}({X=i}), i = 1, \ldots, J$ as a **prior** distribution that describes his personal subjective beliefs about $X$.
+Formula {eq}`eq:condprobbayes` is also what a Bayesian calls **Bayes' Law**.
+
+A Bayesian statistician regards marginal probability distribution $\textrm{Prob}({X=i}), i = 0, \ldots, I-1$ as a **prior** distribution that describes his personal subjective beliefs about $X$.
+
+
He then interprets formula {eq}`eq:condprobbayes` as a procedure for constructing a **posterior** distribution that describes how he would revise his subjective beliefs after observing that $Y$ equals $j$.
```
@@ -446,7 +465,7 @@ $$
$$
-## Transition Probability Matrix
+## Transition probability matrix
Consider the following joint probability distribution of two random variables.
@@ -465,7 +484,7 @@ $$
An associated conditional distribution is
$$
-\textrm{Prob}\{Y=i\vert X=j\} = \frac{\rho_{ij}}{ \sum_{j}\rho_{ij}}
+\textrm{Prob}\{Y=j\vert X=i\} = \frac{\rho_{ij}}{ \sum_{j}\rho_{ij}}
= \frac{\textrm{Prob}\{Y=j, X=i\}}{\textrm{Prob}\{ X=i\}}
$$
@@ -480,8 +499,8 @@ where
$$
\left[
\begin{matrix}
- p_{11} & p_{12}\\
- p_{21} & p_{22}
+ p_{00} & p_{01}\\
+ p_{10} & p_{11}
\end{matrix}
\right]
$$
@@ -491,11 +510,11 @@ The first row is the probability that $Y=j, j=0,1$ conditional on $X=0$.
The second row is the probability that $Y=j, j=0,1$ conditional on $X=1$.
Note that
-- $\sum_{j}\rho_{ij}= \frac{ \sum_{j}\rho_{ij}}{ \sum_{j}\rho_{ij}}=1$, so each row of the transition matrix $P$ is a probability distribution (not so for each column).
+- $\sum_{j}p_{ij}= \frac{ \sum_{j}\rho_{ij}}{ \sum_{j}\rho_{ij}}=1$, so each row of the transition matrix $P$ is a probability distribution (not so for each column).
-## Application: Forecasting a Time Series
+## Application: forecasting a time series
Suppose that there are two time periods.
@@ -508,7 +527,7 @@ Suppose that
$$
\begin{aligned}
-\text{Prob} \{X(0)=i,X(1)=j\} &=f_{ij}≥0,i=0,\cdots,I-1\\
+\text{Prob} \{X(0)=i,X(1)=j\} &=f_{ij}\geq 0, \quad i=0,\cdots,I-1, \quad j=0,\cdots,J-1\\
\sum_{i}\sum_{j}f_{ij}&=1
\end{aligned}
$$
@@ -519,11 +538,10 @@ A conditional distribution is
$$\text{Prob} \{X(1)=j|X(0)=i\}= \frac{f_{ij}}{ \sum_{j}f_{ij}}$$
-**Remark:**
-- This formula is a workhorse for applied economic forecasters.
+This formula is a workhorse for applied economic forecasters.
-## Statistical Independence
+## Statistical independence
Random variables X and Y are statistically **independent** if
@@ -535,8 +553,8 @@ where
$$
\begin{aligned}
-\textrm{Prob}\{X=i\} &=f_i\ge0, \sum{f_i}=1 \cr
-\textrm{Prob}\{Y=j\} & =g_j\ge0, \sum{g_j}=1
+\textrm{Prob}\{X=i\} &=f_i\ge 0, \quad \sum_{i}{f_i}=1 \cr
+\textrm{Prob}\{Y=j\} & =g_j\ge 0, \quad \sum_{j}{g_j}=1
\end{aligned}
$$
@@ -550,7 +568,7 @@ $$
$$
-## Means and Variances
+## Means and variances
The mean and variance of a discrete random variable $X$ are
@@ -562,7 +580,7 @@ $$
\end{aligned}
$$
-A continuous random variable having density $f_{X}(x)$) has mean and variance
+A continuous random variable having density $f_{X}(x)$ has mean and variance
$$
\begin{aligned}
@@ -571,7 +589,7 @@ $$
\end{aligned}
$$
-## Matrix Representations of Some Bivariate Distributions
+## Matrix representations of some bivariate distributions
Let's use matrices to represent a joint distribution, conditional distribution, marginal distribution, and the mean and variance of a bivariate random variable.
@@ -590,12 +608,9 @@ $$ \textrm{Prob}(X=i)=\sum_j{f_{ij}}=u_i $$
$$ \textrm{Prob}(Y=j)=\sum_i{f_{ij}}=v_j $$
-**Sampling:**
-
-Let's write some Python code that let's us draw some long samples and compute relative frequencies.
+Let's write some Python code that lets us draw some long samples and compute relative frequencies.
-The code will let us check whether the "sampling" distribution agrees with the "population" distribution - confirming that
-the population distribution correctly tells us the relative frequencies that we should expect in a large sample.
+The code lets us check whether the "sampling" distribution agrees with the "population" distribution -- confirming that the population distribution correctly tells us the relative frequencies that we should expect in a large sample.
@@ -607,7 +622,7 @@ f = np.array([[0.3, 0.2], [0.1, 0.4]])
f_cum = np.cumsum(f)
# draw random numbers
-p = np.random.rand(1_000_000)
+p = rng.random(1_000_000)
x = np.vstack([xs[1]*np.ones(p.shape), ys[1]*np.ones(p.shape)])
# map to the bivariate distribution
@@ -764,7 +779,7 @@ class discrete_bijoint:
xs = self.xs
ys = self.ys
f_cum = np.cumsum(self.f)
- p = np.random.rand(n)
+ p = rng.random(n)
x = np.empty([2, p.shape[0]])
lf = len(f_cum)
lx = len(xs)-1
@@ -844,7 +859,9 @@ class discrete_bijoint:
Let's apply our code to some examples.
-**Example 1**
+### Numerical examples
+
+#### Example 1
```{code-cell} ipython3
# joint
@@ -863,7 +880,7 @@ d.marg_dist()
d.cond_dist()
```
-**Example 2**
+#### Example 2
```{code-cell} ipython3
xs_new = np.array([10, 20, 30])
@@ -882,7 +899,7 @@ d_new.marg_dist()
d_new.cond_dist()
```
-## A Continuous Bivariate Random Vector
+## A continuous bivariate random vector
A two-dimensional Gaussian distribution has joint density
@@ -891,11 +908,6 @@ $$
f(x,y) =(2\pi\sigma_1\sigma_2\sqrt{1-\rho^2})^{-1}\exp\left[-\frac{1}{2(1-\rho^2)}\left(\frac{(x-\mu_1)^2}{\sigma_1^2}-\frac{2\rho(x-\mu_1)(y-\mu_2)}{\sigma_1\sigma_2}+\frac{(y-\mu_2)^2}{\sigma_2^2}\right)\right]
$$
-
-$$
-\frac{1}{2\pi\sigma_1\sigma_2\sqrt{1-\rho^2}}\exp\left[-\frac{1}{2(1-\rho^2)}\left(\frac{(x-\mu_1)^2}{\sigma_1^2}-\frac{2\rho(x-\mu_1)(y-\mu_2)}{\sigma_1\sigma_2}+\frac{(y-\mu_2)^2}{\sigma_2^2}\right)\right]
-$$
-
We start with a bivariate normal distribution pinned down by
$$
@@ -934,9 +946,11 @@ y = np.linspace(-10, 10, 1_000)
x_mesh, y_mesh = np.meshgrid(x, y, indexing="ij")
```
-**Joint Distribution**
+### Joint, marginal, and conditional distributions
+
+#### Joint distribution
-Let's plot the **population** joint density.
+Let's plot the **population** joint density.
```{code-cell} ipython3
# %matplotlib notebook
@@ -967,18 +981,18 @@ Next we can use a built-in `numpy` function to draw random samples, then calc
μ= np.array([0, 5])
σ= np.array([[5, .2], [.2, 1]])
n = 1_000_000
-data = np.random.multivariate_normal(μ, σ, n)
+data = rng.multivariate_normal(μ, σ, n)
x = data[:, 0]
y = data[:, 1]
```
-**Marginal distribution**
+#### Marginal distribution
```{code-cell} ipython3
plt.hist(x, bins=1_000, alpha=0.6)
μx_hat, σx_hat = np.mean(x), np.std(x)
print(μx_hat, σx_hat)
-x_sim = np.random.normal(μx_hat, σx_hat, 1_000_000)
+x_sim = rng.normal(μx_hat, σx_hat, 1_000_000)
plt.hist(x_sim, bins=1_000, alpha=0.4, histtype="step")
plt.show()
```
@@ -987,19 +1001,19 @@ plt.show()
plt.hist(y, bins=1_000, density=True, alpha=0.6)
μy_hat, σy_hat = np.mean(y), np.std(y)
print(μy_hat, σy_hat)
-y_sim = np.random.normal(μy_hat, σy_hat, 1_000_000)
+y_sim = rng.normal(μy_hat, σy_hat, 1_000_000)
plt.hist(y_sim, bins=1_000, density=True, alpha=0.4, histtype="step")
plt.show()
```
-**Conditional distribution**
+#### Conditional distribution
For a bivariate normal population distribution, the conditional distributions are also normal:
$$
-\begin{aligned} \\
-[X|Y &= y ]\sim \mathbb{N}\bigg[\mu_X+\rho\sigma_X\frac{y-\mu_Y}{\sigma_Y},\sigma_X^2(1-\rho^2)\bigg] \\
-[Y|X &= x ]\sim \mathbb{N}\bigg[\mu_Y+\rho\sigma_Y\frac{x-\mu_X}{\sigma_X},\sigma_Y^2(1-\rho^2)\bigg]
+\begin{aligned}
+X \mid Y = y &\sim \mathbb{N}\bigg[\mu_X+\rho\sigma_X\frac{y-\mu_Y}{\sigma_Y},\sigma_X^2(1-\rho^2)\bigg] \\
+Y \mid X = x &\sim \mathbb{N}\bigg[\mu_Y+\rho\sigma_Y\frac{x-\mu_X}{\sigma_X},\sigma_Y^2(1-\rho^2)\bigg]
\end{aligned}
$$
@@ -1007,30 +1021,33 @@ $$
Please see this {doc}`quantecon lecture ` for more details.
```
-Let's approximate the joint density by discretizing and mapping the approximating joint density into a matrix.
+Let's approximate the joint density by discretizing and mapping the approximating joint density into a matrix.
+
+On an evenly spaced grid, we can approximate the conditional distribution by assigning probability weights proportional to a slice of the joint density.
-We can compute the discretized marginal density by just using matrix algebra and noting that
+For fixed $y$, this means that
$$
-\textrm{Prob}\{X=i|Y=j\}=\frac{f_{ij}}{\sum_{i}f_{ij}}
+z_i
+\equiv \frac{f(x_i,y)}{\sum_k f(x_k,y)}
$$
Fix $y=0$.
```{code-cell} ipython3
-# discretized marginal density
+# discretized conditional distribution of X given Y = 0
x = np.linspace(-10, 10, 1_000_000)
z = func(x, y=0) / np.sum(func(x, y=0))
plt.plot(x, z)
plt.show()
```
-The mean and variance are computed by
+The conditional mean and variance are then approximated by
$$
\begin{aligned}
-\mathbb{E}\left[X\vert Y=j\right] & =\sum_{i}iProb\{X=i\vert Y=j\}=\sum_{i}i\frac{f_{ij}}{\sum_{i}f_{ij}} \\
-\mathbb{D}\left[X\vert Y=j\right] &=\sum_{i}\left(i-\mu_{X\vert Y=j}\right)^{2}\frac{f_{ij}}{\sum_{i}f_{ij}}
+\mathbb{E}\left[X\vert Y=y\right] & \approx \sum_i x_i z_i \\
+\mathbb{D}\left[X\vert Y=y\right] & \approx \sum_i\left(x_i-\mu_{X\vert Y=y}\right)^{2} z_i
\end{aligned}
$$
@@ -1044,7 +1061,7 @@ Let's draw from a normal distribution with above mean and variance and check how
σx = np.sqrt(np.dot((x - μx)**2, z))
# sample
-zz = np.random.normal(μx, σx, 1_000_000)
+zz = rng.normal(μx, σx, 1_000_000)
plt.hist(zz, bins=300, density=True, alpha=0.3, range=[-10, 10])
plt.show()
```
@@ -1052,19 +1069,19 @@ plt.show()
Fix $x=1$.
```{code-cell} ipython3
-y = np.linspace(0, 10, 1_000_000)
+y = np.linspace(-10, 10, 1_000_000)
z = func(x=1, y=y) / np.sum(func(x=1, y=y))
plt.plot(y,z)
plt.show()
```
```{code-cell} ipython3
-# discretized mean and standard deviation
+# discretized conditional mean and standard deviation
μy = np.dot(y,z)
σy = np.sqrt(np.dot((y - μy)**2, z))
# sample
-zz = np.random.normal(μy,σy,1_000_000)
+zz = rng.normal(μy, σy, 1_000_000)
plt.hist(zz, bins=100, density=True, alpha=0.3)
plt.show()
```
@@ -1079,7 +1096,7 @@ print(μy, σy)
print(μ2 + ρ * σ2 * (1 - μ1) / σ1, np.sqrt(σ2**2 * (1 - ρ**2)))
```
-## Sum of Two Independently Distributed Random Variables
+## Sum of two independently distributed random variables
Let $X, Y$ be two independent discrete random variables that take values in $\bar{X}, \bar{Y}$, respectively.
@@ -1127,10 +1144,10 @@ Start with a joint distribution
$$
\begin{aligned}
f_{ij} & =\textrm{Prob}\{X=i,Y=j\}\\
-i& =0, \cdots,I-1\\
-j& =0, \cdots,J-1\\
-& \text{stacked to an }I×J\text{ matrix}\\
-& e.g. \quad I=1, J=1
+i& =0, \cdots, I-1\\
+j& =0, \cdots, J-1\\
+& \text{stacked to an }I\times J\text{ matrix}\\
+& e.g. \quad I=2, J=2
\end{aligned}
$$
@@ -1139,8 +1156,8 @@ where
$$
\left[
\begin{matrix}
- f_{11} & f_{12}\\
- f_{21} & f_{22}
+ f_{00} & f_{01}\\
+ f_{10} & f_{11}
\end{matrix}
\right]
$$
@@ -1149,7 +1166,7 @@ From the joint distribution, we have shown above that we obtain **unique** marg
Now we'll try to go in a reverse direction.
-We'll find that from two marginal distributions, can we usually construct more than one joint distribution that verifies these marginals.
+We'll find that from two marginal distributions we can usually construct more than one joint distribution that satisfies these marginals.
Each of these joint distributions is called a **coupling** of the two marginal distributions.
@@ -1162,9 +1179,7 @@ $$
\end{aligned}
$$
-Given two marginal distribution, $\mu$ for $X$ and $\nu$ for $Y$, a joint distribution $f_{ij}$ is said to be a **coupling** of $\mu$ and $\nu$.
-
-**Example:**
+Given two marginal distributions, $\mu$ for $X$ and $\nu$ for $Y$, a joint distribution $f_{ij}$ is said to be a **coupling** of $\mu$ and $\nu$.
Consider the following bivariate example.
@@ -1174,13 +1189,13 @@ $$
\text{Prob} \{X=1\}=& q =\mu_{1}\\
\text{Prob} \{Y=0\}=& 1-r =\nu_{0}\\
\text{Prob} \{Y=1\}= & r =\nu_{1}\\
-\text{where } 0 \leq q < r \leq 1
+\text{where } 0 \leq q \leq r \leq 1
\end{aligned}
$$
We construct two couplings.
-The first coupling if our two marginal distributions is the joint distribution
+The first coupling of our two marginal distributions is the joint distribution
$$f_{ij}=
\left[
@@ -1199,7 +1214,7 @@ $$
\mu_{0}= (1-q)(1-r)+(1-q)r & =1-q\\
\mu_{1}= q(1-r)+qr & =q\\
\nu_{0}= (1-q)(1-r)+(1-r)q& =1-r\\
-\mu_{1}= r(1-q)+qr& =r
+\nu_{1}= r(1-q)+qr& =r
\end{aligned}
$$
@@ -1216,7 +1231,7 @@ f_{ij}=
\right]
$$
-The verify that this is a coupling, note that
+To verify that this is a coupling, note that
$$
\begin{aligned}
@@ -1234,12 +1249,11 @@ But the joint distributions differ.
Thus, multiple joint distributions $[f_{ij}]$ can have the same marginals.
-**Remark:**
-- Couplings are important in optimal transport problems and in Markov processes. Please see this {doc}`lecture about optimal transport `
+Couplings are important in optimal transport problems and in Markov processes. Please see this {doc}`lecture about optimal transport `.
-## Copula Functions
+## Copula functions
-Suppose that $X_1, X_2, \dots, X_n$ are $N$ random variables and that
+Suppose that $X_1, X_2, \dots, X_N$ are $N$ random variables and that
* their marginal distributions are $F_1(x_1), F_2(x_2),\dots, F_N(x_N)$, and
@@ -1251,12 +1265,16 @@ $$
H(x_1,x_2,\dots,x_N) = C(F_1(x_1), F_2(x_2),\dots,F_N(x_N)).
$$
-We can obtain
+If the marginal distributions are continuous, then the copula is unique.
+
+In that case, we can recover it from the marginal inverses:
$$
-C(u_1,u_2,\dots,u_n) = H[F^{-1}_1(u_1),F^{-1}_2(u_2),\dots,F^{-1}_N(u_N)]
+C(u_1,u_2,\dots,u_N) = H(F^{-1}_1(u_1),F^{-1}_2(u_2),\dots,F^{-1}_N(u_N))
$$
+When marginal distributions are not continuous, one uses generalized inverses, and the copula is uniquely determined only on $\textrm{Ran}(F_1)\times \cdots \times \textrm{Ran}(F_N)$.
+
In a reverse direction of logic, given univariate **marginal distributions**
$F_1(x_1), F_2(x_2),\dots,F_N(x_N)$ and a copula function $C(\cdot)$, the function $H(x_1,x_2,\dots,x_N) = C(F_1(x_1), F_2(x_2),\dots,F_N(x_N))$ is a **coupling** of $F_1(x_1), F_2(x_2),\dots,F_N(x_N)$.
@@ -1265,9 +1283,11 @@ Thus, for given marginal distributions, we can use a copula function to determi
Copula functions are often used to characterize **dependence** of random variables.
-**Discrete marginal distribution**
+### Bivariate examples with discrete and continuous distributions
+
+#### Discrete marginal distribution
-As mentioned above, for two given marginal distributions there can be more than one coupling.
+As mentioned above, for two given marginal distributions there can be more than one coupling.
For example, consider two random variables $X, Y$ with distributions
@@ -1285,23 +1305,23 @@ For these two random variables there can be more than one coupling.
Let's first generate X and Y.
```{code-cell} ipython3
-# define parameters
-mu = np.array([0.6, 0.4])
-nu = np.array([0.3, 0.7])
+μ = np.array([0.6, 0.4])
+ν = np.array([0.3, 0.7])
# number of draws
draws = 1_000_000
-# generate draws from uniform distribution
-p = np.random.rand(draws)
+# generate independent draws from uniform distribution for X and Y
+p_x = rng.random(draws)
+p_y = rng.random(draws)
-# generate draws of X and Y via uniform distribution
+# generate draws of X and Y via independent uniform draws
x = np.ones(draws)
y = np.ones(draws)
-x[p <= mu[0]] = 0
-x[p > mu[0]] = 1
-y[p <= nu[0]] = 0
-y[p > nu[0]] = 1
+x[p_x <= μ[0]] = 0
+x[p_x > μ[0]] = 1
+y[p_y <= ν[0]] = 0
+y[p_y > ν[0]] = 1
```
```{code-cell} ipython3
@@ -1353,9 +1373,9 @@ f1_cum = np.cumsum(f1)
draws1 = 1_000_000
# generate draws from uniform distribution
-p = np.random.rand(draws1)
+p = rng.random(draws1)
-# generate draws of first copuling via uniform distribution
+# generate draws of first coupling via uniform distribution
c1 = np.vstack([np.ones(draws1), np.ones(draws1)])
# X=0, Y=0
c1[0, p <= f1_cum[0]] = 0
@@ -1428,9 +1448,9 @@ f2_cum = np.cumsum(f2)
draws2 = 1_000_000
# generate draws from uniform distribution
-p = np.random.rand(draws2)
+p = rng.random(draws2)
-# generate draws of first coupling via uniform distribution
+# generate draws of second coupling via uniform distribution
c2 = np.vstack([np.ones(draws2), np.ones(draws2)])
# X=0, Y=0
c2[0, p <= f2_cum[0]] = 0
@@ -1454,7 +1474,7 @@ f2_10 = sum((c2[0, :] == 1)*(c2[1, :] == 0))/draws2
f2_11 = sum((c2[0, :] == 1)*(c2[1, :] == 1))/draws2
# print output of second joint distribution
-print("first joint distribution for c2")
+print("second joint distribution for c2")
c2_mtb = pt.PrettyTable()
c2_mtb.field_names = ['c2_x_value', 'c2_y_value', 'c2_prob']
c2_mtb.add_row([0, 0, f2_00])
@@ -1488,3 +1508,337 @@ print(c2_ymtb)
We have verified that both joint distributions, $c_1$ and $c_2$, have identical marginal distributions of $X$ and $Y$, respectively.
So they are both couplings of $X$ and $Y$.
+
+### Gaussian copula example
+
+A **Gaussian copula** uses the bivariate normal distribution to induce dependence between
+arbitrary marginal distributions.
+
+The construction has three steps:
+
+1. Draw $(Z_1, Z_2)$ from a bivariate standard normal with correlation $\rho$.
+2. Apply the standard normal CDF: $U_k = \Phi(Z_k)$.
+ - The pair $(U_1, U_2)$ has uniform marginals but retains the dependence structure of $(Z_1, Z_2)$ --- this is the copula.
+3. Apply the inverse CDF of any desired marginal: $X_k = F_k^{-1}(U_k)$.
+
+The following code illustrates this with exponential marginals.
+
+```{code-cell} ipython3
+---
+mystnb:
+ figure:
+ caption: gaussian copula with exponential marginals
+ name: fig-gaussian-copula
+---
+
+# Gaussian copula parameters
+ρ_cop = 0.8
+n_cop = 100_000
+
+# Draw from bivariate standard normal with correlation ρ_cop
+z = rng.multivariate_normal(
+ [0, 0], [[1, ρ_cop], [ρ_cop, 1]], n_cop
+)
+
+# Apply normal CDF -> uniform marginals (the copula itself)
+u1 = stats.norm.cdf(z[:, 0])
+u2 = stats.norm.cdf(z[:, 1])
+
+# Apply inverse CDFs of desired marginals (here: Exponential)
+x1 = stats.expon.ppf(u1, scale=1.0) # Exp with mean 1
+x2 = stats.expon.ppf(u2, scale=0.5) # Exp with mean 0.5
+
+fig, axes = plt.subplots(1, 2, figsize=(10, 4))
+axes[0].scatter(u1[:3000], u2[:3000], alpha=0.2, s=2)
+axes[0].set_xlabel('$u_1$')
+axes[0].set_ylabel('$u_2$')
+axes[1].scatter(x1[:3000], x2[:3000], alpha=0.2, s=2)
+axes[1].set_xlabel('$x_1$ (Exp, mean=1)')
+axes[1].set_ylabel('$x_2$ (Exp, mean=0.5)')
+plt.show()
+
+print(f"Sample correlation of (x1, x2): {np.corrcoef(x1, x2)[0, 1]:.3f}")
+print(f"Sample correlation of (u1, u2): {np.corrcoef(u1, u2)[0, 1]:.3f}")
+```
+
+The left panel shows the copula itself -- the dependence structure in uniform coordinates, drawn from a bivariate normal with correlation $\rho = 0.8$.
+
+The right panel shows the same dependence translated to exponential marginals.
+
+Changing $\rho$ controls the strength of dependence while the marginals remain unchanged.
+
+## Exercises
+
+```{exercise}
+:label: prob_matrix_ex1
+
+**Independence test**
+
+Consider the joint distribution
+
+$$
+F = \begin{bmatrix} 0.3 & 0.2 \\ 0.1 & 0.4 \end{bmatrix}
+$$
+
+where $X \in \{0,1\}$ and $Y \in \{10, 20\}$.
+
+1. Compute the marginal distributions $\mu_i = \text{Prob}\{X=i\}$ and $\nu_j = \text{Prob}\{Y=j\}$.
+
+1. Form the independence matrix $f^{\perp}_{ij} = \mu_i \nu_j$ (the outer product of the two marginal vectors).
+
+1. Compare $F$ with $f^{\perp}$ and determine whether $X$ and $Y$ are independent.
+
+1. Verify your conclusion by computing $\text{Prob}\{X=0|Y=10\}$ and checking whether it equals $\text{Prob}\{X=0\}$.
+```
+
+```{solution-start} prob_matrix_ex1
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} ipython3
+F = np.array([[0.3, 0.2],
+ [0.1, 0.4]])
+
+μ = F.sum(axis=1)
+ν = F.sum(axis=0)
+print("μ (marginal of X):", μ)
+print("ν (marginal of Y):", ν)
+
+F_indep = np.outer(μ, ν)
+print("\nIndependence matrix (outer product):\n", F_indep)
+print("\nActual joint F:\n", F)
+
+print("\nIndependent (F == μ times ν)?", np.allclose(F, F_indep))
+
+prob_X0_given_Y10 = F[0, 0] / ν[0]
+print(f"\nProb(X=0 | Y=10) = {prob_X0_given_Y10:.4f}")
+print(f"Prob(X=0) = {μ[0]:.4f}")
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: prob_matrix_ex2
+
+**Covariance and correlation**
+
+Using the same joint distribution $F$ and values $X \in \{0,1\}$, $Y \in \{10, 20\}$ as in Exercise 1:
+
+1. Compute $\mathbb{E}[X]$, $\mathbb{E}[Y]$, and $\mathbb{E}[XY] = \sum_i \sum_j x_i y_j f_{ij}$.
+
+1. Compute $\text{Cov}(X,Y) = \mathbb{E}[XY] - \mathbb{E}[X]\mathbb{E}[Y]$.
+
+1. Compute $\text{Cor}(X,Y) = \text{Cov}(X,Y) / (\sigma_X \sigma_Y)$.
+
+1. Show analytically that $X \perp Y$ implies $\text{Cov}(X,Y) = 0$.
+```
+
+```{solution-start} prob_matrix_ex2
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} ipython3
+xs = np.array([0, 1])
+ys = np.array([10, 20])
+F = np.array([[0.3, 0.2],
+ [0.1, 0.4]])
+
+μ = F.sum(axis=1)
+ν = F.sum(axis=0)
+
+E_X = xs @ μ
+E_Y = ys @ ν
+E_XY = sum(xs[i] * ys[j] * F[i, j] for i in range(2) for j in range(2))
+print(f"E[X] = {E_X}, E[Y] = {E_Y}, E[XY] = {E_XY}")
+
+cov_XY = E_XY - E_X * E_Y
+print(f"Cov(X,Y) = {cov_XY:.4f}")
+
+var_X = ((xs - E_X)**2) @ μ
+var_Y = ((ys - E_Y)**2) @ ν
+cor_XY = cov_XY / np.sqrt(var_X * var_Y)
+print(f"Cor(X,Y) = {cor_XY:.4f}")
+```
+
+For part 4: if $X \perp Y$ then $f_{ij} = \mu_i \nu_j$, so
+
+$$
+\mathbb{E}[XY] = \sum_i \sum_j x_i y_j \mu_i \nu_j
+= \left(\sum_i x_i \mu_i\right)\!\left(\sum_j y_j \nu_j\right)
+= \mathbb{E}[X]\,\mathbb{E}[Y]
+$$
+
+and therefore $\text{Cov}(X,Y) = \mathbb{E}[XY] - \mathbb{E}[X]\mathbb{E}[Y] = 0$.
+
+```{solution-end}
+```
+
+```{exercise}
+:label: prob_matrix_ex3
+
+**Sum of two dice**
+
+Let $X$ and $Y$ be **independent** random variables, each uniformly distributed on $\{1,2,3,4,5,6\}$, and let $Z = X + Y$.
+
+1. Use the convolution formula $h_k = \sum_i f_i g_{k-i}$ to compute the distribution of $Z$.
+
+1. Plot the result generated by the formula.
+
+1. Simulate $10^6$ rolls and overlay the empirical histogram on the plot.
+
+1. Compute $\mathbb{E}[Z]$ and $\text{Var}(Z)$ from both the theoretical distribution and the simulation.
+```
+
+```{solution-start} prob_matrix_ex3
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} ipython3
+f = np.ones(6) / 6
+g = np.ones(6) / 6
+h = [
+ sum(f[i]*g[k-i] for i in range(
+ max(0, k-len(g)+1), # f_i exists
+ min(len(f), k+1)) # g_{k-i} exists
+ )
+ for k in range(len(f) + len(g) - 1)]
+z_vals = np.arange(2, 13)
+
+n = 1_000_000
+z_sim = rng.integers(1, 7, n) + rng.integers(1, 7, n)
+counts = np.bincount(z_sim, minlength=13)[2:]
+
+fig, ax = plt.subplots()
+ax.bar(z_vals - 0.2, h, 0.4, alpha=0.7, label='Theoretical')
+ax.bar(z_vals + 0.2, counts / n, 0.4, alpha=0.7, label='Empirical')
+ax.set_xlabel('Z = X + Y')
+ax.set_ylabel('Probability')
+ax.legend()
+plt.show()
+
+E_Z = z_vals @ h
+Var_Z = ((z_vals - E_Z)**2) @ h
+print(f"Theory: E[Z] = {E_Z:.2f}, Var(Z) = {Var_Z:.4f}")
+print(f"Simulation: E[Z] = {np.mean(z_sim):.2f}, Var(Z) = {np.var(z_sim):.4f}")
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: prob_matrix_ex4
+
+**Multi-step transition probabilities**
+
+Consider a two-state Markov chain with transition matrix
+
+$$
+P = \begin{bmatrix} 0.9 & 0.1 \\ 0.2 & 0.8 \end{bmatrix}
+$$
+
+where $p_{ij} = \text{Prob}\{X(t+1)=j \mid X(t)=i\}$.
+
+1. Starting from $\psi_0 = [1, 0]$, compute $\psi_n = \psi_0 P^n$ for $n = 1, 5, 20, 100$.
+
+1. Find the stationary distribution $\psi^*$ satisfying $\psi^* P = \psi^*$ and $\sum_i \psi^*_i = 1$.
+
+1. Verify numerically that $\psi_n \to \psi^*$ as $n$ grows.
+```
+
+```{solution-start} prob_matrix_ex4
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} ipython3
+P = np.array([[0.9, 0.1],
+ [0.2, 0.8]])
+ψ0 = np.array([1.0, 0.0])
+
+for n in [1, 5, 20, 100]:
+ print(f"ψ_{n:3d} = {ψ0 @ np.linalg.matrix_power(P, n)}")
+
+A = np.vstack([P.T - np.eye(2), np.ones(2)])
+b = np.array([0.0, 0.0, 1.0])
+ψ_star, *_ = np.linalg.lstsq(A, b, rcond=None)
+print(f"\nStationary distribution: {ψ_star}")
+
+ψ_100 = ψ0 @ np.linalg.matrix_power(P, 100)
+print(f"ψ_100 close to stationary? {np.allclose(ψ_100, ψ_star, atol=1e-6)}")
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: prob_matrix_ex5
+
+**Bayes' Law with a discrete prior**
+
+A coin has unknown bias $\theta \in \{0.2,\, 0.5,\, 0.8\}$ with prior $\pi = [0.25,\, 0.50,\, 0.25]$.
+
+Assume that, conditional on $\theta$, the coin flips are i.i.d. Bernoulli($\theta$).
+
+1. After observing $k = 7$ heads in $n = 10$ flips, compute the likelihood
+
+ $$
+ \mathcal{L}(\theta \mid \text{data}) = \binom{10}{7}\,\theta^7\,(1-\theta)^3
+ $$
+
+ for each $\theta$.
+
+2. Apply equation {eq}`eq:condprobbayes` to compute the posterior $\pi(\theta \mid \text{data})$.
+
+3. Plot the prior and posterior side by side.
+
+4. Repeat for $k = 3$ heads and describe how the posterior shifts.
+```
+
+```{solution-start} prob_matrix_ex5
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} ipython3
+θ_vals = np.array([0.2, 0.5, 0.8])
+π = np.array([0.25, 0.50, 0.25])
+
+def compute_posterior(k, n, θ_vals, π):
+ likelihood = comb(n, k) * θ_vals**k * (1 - θ_vals)**(n - k)
+ unnorm = likelihood * π
+ return unnorm / unnorm.sum(), likelihood
+
+post7, lik7 = compute_posterior(7, 10, θ_vals, π)
+post3, lik3 = compute_posterior(3, 10, θ_vals, π)
+
+print("k=7: likelihood =", lik7.round(4),
+ " posterior =", post7.round(4))
+print("k=3: likelihood =", lik3.round(4),
+ " posterior =", post3.round(4))
+
+x = np.arange(len(θ_vals))
+w = 0.3
+fig, axes = plt.subplots(1, 2, figsize=(10, 4))
+for ax, post, title in zip(
+ axes, [post7, post3], ['k=7 heads', 'k=3 heads']):
+ ax.bar(x - w/2, π, w, label='Prior', alpha=0.7)
+ ax.bar(x + w/2, post, w, label='Posterior', alpha=0.7)
+ ax.set_xticks(x)
+ ax.set_xticklabels([f'θ={t}' for t in θ_vals])
+ ax.set_ylabel('Probability')
+ ax.set_title(title)
+ ax.legend()
+plt.show()
+```
+
+```{solution-end}
+```
diff --git a/lectures/ross_recovery.md b/lectures/ross_recovery.md
new file mode 100644
index 000000000..3f879d069
--- /dev/null
+++ b/lectures/ross_recovery.md
@@ -0,0 +1,1383 @@
+---
+jupytext:
+ text_representation:
+ extension: .md
+ format_name: myst
+ format_version: 0.13
+ jupytext_version: 1.17.1
+kernelspec:
+ display_name: Python 3 (ipykernel)
+ language: python
+ name: python3
+---
+
+(ross_recovery)=
+```{raw} jupyter
+
+```
+
+# The Recovery Theorem
+
+```{contents} Contents
+:depth: 2
+```
+
+## Overview
+
+Asset prices are forward-looking: they encode investors' expectations about future
+economic states and their valuations of different risks.
+
+A long-standing question in finance is whether one can *recover* the probability
+distribution used by investors -- their subjective beliefs -- from observed asset
+prices alone.
+
+Option prices reveal **state prices**; once these are normalized by the riskless
+discount factor, the resulting probabilities are the **risk-neutral probabilities**
+implied by asset prices after risk adjustments have been folded in.
+
+These are not the **natural probabilities** that investors actually assign to future
+states of the world.
+
+The two differ because risk-neutral probabilities blend together two distinct objects:
+the market's true beliefs about the future, and investors' aversion to risk.
+
+The link between them is the **pricing kernel**, which reweights natural probabilities
+to deliver state prices.
+
+Separating beliefs from risk aversion has traditionally required parametric assumptions
+about the preferences of a representative investor.
+
+{cite:t}`Ross2015` showed otherwise.
+
+Ross's theorem says that, in a finite-state Markov economy, state prices can be
+enough.
+
+Suppose the Arrow–Debreu state-price transition matrix is arbitrage-free and
+irreducible.
+
+If the pricing kernel also satisfies a structural restriction called **transition
+independence**, then state prices uniquely determine both the natural probability
+transition matrix and the transition pricing kernel.
+
+No historical return data or assumed utility function is needed if some assumptions
+about the structure of the pricing kernel hold.
+
+This is the **Recovery Theorem**.
+
+It has several important implications:
+
+* It shows how state-price transition data can identify the market's forward-looking
+  natural distribution when its assumptions hold.
+* It provides tests of the efficient market hypothesis.
+* It sheds light on the "dark matter" of finance: the probability of rare
+  catastrophic events embedded in market prices.
+
+This lecture covers
+
+* the Arrow–Debreu framework linking state prices, risk-neutral probabilities,
+ the pricing kernel, and natural probabilities,
+* Ross's Recovery Theorem and its proof via the Perron–Frobenius theorem,
+* an implementation that recovers the natural distribution from a
+  simulated state-price matrix,
+* how option prices and forward equations can be used to estimate transition
+  state prices, and
+* comparisons between risk-neutral and recovered natural densities.
+
+Let's import the packages we'll need.
+
+```{code-cell} ipython3
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.linalg import eig
+from scipy.stats import norm
+import matplotlib.cm as cm
+```
+
+## Model setup
+
+### Arrow–Debreu state prices
+
+Consider a discrete-time, discrete-state economy.
+
+At each date the economy occupies one of $m$ states $\theta_1, \ldots, \theta_m$.
+
+An **Arrow–Debreu security** pays \$1 if the economy is in state $\theta_j$ next period
+and nothing otherwise.
+
+Denote by $p(\theta_i, \theta_j)$ the price today, when the current state is $\theta_i$,
+of the Arrow–Debreu security paying in state $\theta_j$ next period.
+
+Collect these into an $m \times m$ **state price transition matrix**
+
+$$
+P = [p(\theta_i, \theta_j)]_{i,j=1}^m.
+$$
+
+As in {doc}`ge_arrow`, the row sums give the state-dependent riskless discount factor:
+$\sum_j p(\theta_i, \theta_j) = e^{-r(\theta_i)}$.
+
+Here $r(\theta_i)$ is the one-period continuously compounded riskless rate in
+current state $\theta_i$.
+
+More generally, if an asset pays $g(\theta_j)$ next period, then its price in
+state $\theta_i$ is
+
+$$
+p_g(\theta_i)
+ = \sum_j p(\theta_i, \theta_j) g(\theta_j).
+$$
+
+Let
+
+$$
+b(\theta_i) \equiv \sum_j p(\theta_i, \theta_j) = e^{-r(\theta_i)}
+$$
+
+be the price of a one-period riskless bond in state $\theta_i$.
+
+Normalizing Arrow prices by this bond price gives the **risk-neutral transition
+probabilities**
+
+$$
+q^*(\theta_i, \theta_j)
+ = \frac{p(\theta_i, \theta_j)}{b(\theta_i)}
+ = e^{r(\theta_i)} p(\theta_i, \theta_j).
+$$
+
+Thus the same asset price can be written as
+
+$$
+p_g(\theta_i)
+ = b(\theta_i) \sum_j q^*(\theta_i, \theta_j) g(\theta_j)
+ = e^{-r(\theta_i)} E_i^*[g(\theta_{t+1})].
+$$
+
+Here $E_i^*$ denotes conditional expectation under
+$q^*(\theta_i,\cdot)$.
+
+The asterisk marks the risk-neutral, or martingale, probability measure.
+
+It is useful to separate this one-period normalization from the dynamic
+transition structure.
+
+If $Q(\theta_i,\theta_j,T)$ denotes the risk-neutral probability of moving from
+$\theta_i$ to $\theta_j$ over $T$ periods, then $Q(\cdot,\cdot,T)$ is the $T$-step
+transition law generated by the one-period matrix $[q^*(\theta_i,\theta_j)]$.
+
+In the recovery problem, the unknowns must be positive ($\beta > 0$ and
+$z \gg 0$), and $F$ must have nonnegative entries.
+
+The Perron–Frobenius theorem guarantees that exactly one such pair exists.
+
+```{prf:theorem} Perron--Frobenius
+:label: thm-perron-frobenius
+
+If $A$ is a nonnegative irreducible matrix, then
+
+1. $A$ has a positive real eigenvalue $r$ equal to its spectral radius (the Perron root).
+2. There exists a strictly positive eigenvector $z \gg 0$ with $Az = rz$,
+ unique up to scaling.
+3. No other eigenvector is strictly positive.
+```
+
+Other eigenvalues can have the same modulus when the matrix is imprimitive, but the
+strictly positive eigenvector is unique up to scale.
+
+See Section 1.2.3 of {cite:t}`Sargent_Stachurski_2024` for details.
+
+See also the full statement in {doc}`intro:eigen_II`.
+
+Applied to the recovery problem: the Perron root is $\beta$ (the subjective discount
+factor) and the Perron vector $z$ determines $D$ via $D_{ii} = 1/z_i$.
+
+
+### Ross's recovery theorem
+
+The three assumptions in the theorem each carry a specific role.
+
+Assuming the Arrow–Debreu state prices are identified, no-arbitrage guarantees that
+$P$ has nonnegative entries and that the state prices encode a well-defined pricing
+measure.
+
+Irreducibility ensures the economy is not divided into disconnected sub-economies --
+without it, the Perron–Frobenius theorem gives multiple candidate eigenvectors and
+recovery breaks down.
+
+Transition independence is the key economic restriction.
+
+It says the pricing kernel
+factors as $\beta h(\theta_j)/h(\theta_i)$, so the entire kernel is pinned down by a
+single vector $h$ (or equivalently $z$).
+
+With these in mind, the Recovery Theorem follows from the Perron–Frobenius theorem.
+
+
+```{prf:theorem} Recovery Theorem
+:label: thm-ross-recovery
+
+Suppose prices provide no arbitrage opportunities, that the state
+price transition matrix $P$ is irreducible, and that the pricing kernel is transition
+independent.
+
+Then there exists a positive solution $(\beta, z, F)$ to the recovery problem in which
+$z$ is unique up to normalization, and the implied natural probability transition
+matrix $F$ and transition pricing kernel are unique.
+```
+
+```{prf:proof}
+Because $P$ is nonnegative and irreducible, the Perron–Frobenius theorem gives a unique
+positive eigenvector $z \gg 0$ with positive eigenvalue $\lambda > 0$ satisfying
+$Pz = \lambda z$.
+
+Setting
+
+$$
+\beta = \lambda, \qquad D_{ii} = \frac{1}{z_i},
+$$
+
+the natural probability transition matrix is uniquely recovered as
+
+$$
+f_{ij} = \frac{1}{\beta} \frac{z_j}{z_i} \, p_{ij}.
+$$
+
+To confirm $F$ is stochastic, note that all entries are nonnegative (since
+$p_{ij} \geq 0$ and $z_i, z_j > 0$) and
+
+$$
+\sum_j f_{ij}
+= \frac{1}{\beta z_i} \sum_j z_j \, p_{ij}
+= \frac{[Pz]_i}{\beta z_i}
+= \frac{\beta z_i}{\beta z_i} = 1.
+$$
+
+Uniqueness follows from the uniqueness of the Perron--Frobenius eigenvector.
+```
+
+### Pricing kernel from the eigenvector
+
+The recovered transition-kernel values are
+
+$$
+\phi(\theta_i, \theta_j) = \beta \frac{z_i}{z_j},
+\qquad h(\theta_i) = \frac{\beta}{z_i},
+$$
+
+where $h(\theta_i) = \beta/z_i$ follows from $D_{ii} = h(\theta_i)/\beta = 1/z_i$.
+
+It is useful to distinguish the **full transition kernel** $\phi_{ij} = \beta z_i/z_j$,
+which depends on both origin and destination states, from the **relative kernel
+component** $1/z_j$, which depends only on the destination state.
+
+Ross's Table I reports the destination-state shape $1/z_j$, normalized so that the
+middle state equals one.
+
+Destination states with high $z_j$ have *low* kernel values: for a fixed origin $i$,
+the kernel $\beta z_i/z_j$ is decreasing in $z_j$.
+
+When $h$ represents marginal utility and states are ordered by consumption or
+payoff, larger $z_j$ corresponds to lower marginal utility -- "good times" that
+require less insurance and so receive less pricing weight per unit of natural
+probability.
+
+The same eigenvector argument also yields a useful limiting case.
+
+If the one-period
+bond price is identical in every current state, then the vector of ones is already the
+Perron vector, so recovery has no state-dependent change of measure left to perform.
+
+
+```{prf:corollary}
+:label: cor-risk-neutral-recovery
+
+If the riskless rate is the same in all states ($Pe = b e$ for
+some scalar $b$), then the unique natural distribution consistent with recovery is
+the risk-neutral (martingale) distribution itself: $F = (1/b) P$.
+```
+
+```{prf:proof}
+When $Pe = b e$, the vector of ones $e$ is the Perron eigenvector with eigenvalue
+$b$.
+
+By the uniqueness part of the Perron--Frobenius theorem, $z = e$ (up to scaling) and
+$\beta = b$.
+
+Setting $z = e$ gives $D = I$, so
+
+$$
+F = \frac{1}{\beta} D P D^{-1} = \frac{1}{b} P. \qquad \square
+$$
+```
+
+(ross-recovery-single-crossing)=
+### Single crossing and the risk premium
+
+Ross also uses the representative-agent formula to compare the natural and
+risk-neutral densities directly.
+
+For a fixed current state $\theta_i$,
+
+$$
+\frac{q^*(\theta_i,\theta_j)}{f(\theta_i,\theta_j)}
+ = e^{r(\theta_i)} \phi(\theta_i,\theta_j)
+ = e^{r(\theta_i)} \beta
+ \frac{U'(c(\theta_j))}{U'(c(\theta_i))}.
+$$ (eq:rn-natural-ratio)
+
+If $U'$ is decreasing in consumption, then the ratio in
+{eq}`eq:rn-natural-ratio` is decreasing in next-period consumption
+$c(\theta_j)$.
+
+Since both $q^*(\theta_i,\cdot)$ and $f(\theta_i,\cdot)$ integrate to one, there
+is a crossing point $v$ defined by
+
+$$
+e^{r(\theta_i)} \beta U'(v) = U'(c(\theta_i)).
+$$
+
+Below $v$, risk-neutral probability exceeds natural probability; above $v$, the
+natural probability exceeds the risk-neutral probability.
+
+Hence the natural consumption distribution first-order stochastically dominates
+the risk-neutral one.
+
+In a one-period model where terminal consumption is the market payoff, this
+also gives a positive market risk premium.
+
+Let $R$ denote the market return under the natural
+law, let $R^*$ denote the same return under the risk-neutral law, and let
+$R_f$ denote the riskless return in the same one-period units.
+
+The stochastic-dominance result can be represented as
+
+$$
+R^* \sim R - Z + \epsilon,
+$$
+
+where $Z \geq 0$ captures the downward shift induced by risk adjustment and
+$\epsilon$ is a residual satisfying $E[\epsilon \mid R-Z]=0$.
+
+Taking expectations gives
+
+$$
+E[R] = R_f + E[Z] > R_f.
+$$
+
+## Numerical example
+
+We now demonstrate the Recovery Theorem numerically.
+
+### Building a finite-state example
+
+We build the economy directly
+on a finite grid of log payoff states $s_1, \ldots, s_m$.
+
+On this grid we choose three primitives:
+
+1. a row-stochastic irreducible natural transition matrix $F$,
+2. a subjective discount factor $\beta = e^{-\rho T}$, and
+3. a CRRA transition pricing kernel
+ $\phi_{ij} = \beta e^{-\gamma(s_j-s_i)}$.
+
+The state-price matrix is then constructed from
+
+$$
+p_{ij} = \phi_{ij} f_{ij}.
+$$
+
+This means the Recovery Theorem assumptions hold by construction: $P$ is nonnegative,
+$F$ is a Markov transition matrix, and the kernel is transition independent with
+$z_i \propto e^{\gamma s_i}$.
+
+To keep the example close to Ross's Section IV, we choose $F$ to have lognormal-shaped
+rows.
+
+The continuous benchmark is a lognormal payoff with CRRA utility:
+
+$$
+U(S_T) = \frac{S_T^{1-\gamma}}{1-\gamma},
+\qquad
+S_T = S_0
+ \exp\!\left((\mu-\tfrac{1}{2}\sigma^2)T
+ + \sigma \sqrt{T} \xi\right),
+$$
+
+where $\xi \sim N(0,1)$, $\mu$ is the expected growth-rate parameter,
+$\sigma$ is volatility, $T$ is the horizon, $\gamma$ is the CRRA coefficient,
+and $\rho$ is the continuously compounded subjective discount rate.
+
+The $T$-period pricing kernel is
+
+$$
+\phi_T
+ = e^{-\rho T}\left(\frac{S_T}{S_0}\right)^{-\gamma}.
+$$
+
+Equivalently, if $s=\log S_0$ and $s_T=\log S_T$, then the state-price density
+with respect to the future log state $s_T$ is
+
+$$
+p_T(s,s_T)
+ = e^{-\rho T} e^{-\gamma(s_T-s)}
+ \frac{1}{\sigma \sqrt{T}}
+ n\!\left(
+ \frac{s_T-s-(\mu-\frac{1}{2}\sigma^2)T}
+ {\sigma \sqrt{T}}
+ \right),
+$$
+
+where $n$ is the standard normal density.
+
+Thus the natural log return satisfies
+
+$$
+\log(S_T/S_0) \sim \mathcal{N}\!\left((\mu - \tfrac{1}{2}\sigma^2)T, \sigma^2 T\right).
+$$
+
+Following Ross's Table I, we represent the distribution on a finite grid of states.
+
+This example is Ross-inspired rather than an exact reproduction of Ross's Table I.
+
+Ross's Table I uses a fixed future payoff distribution, so its rows of $F$ are
+identical.
+
+Here the same CRRA/lognormal pricing logic is embedded in a finite Markov
+transition matrix whose rows shift with the current state.
+
+Ross uses states from $-5$ to $+5$ standard deviations; we use
+the same range below.
+
+The truncation is an essential part of the finite-state model: it is what brings the example into the Perron--Frobenius setting.
+
+In the
+unbounded continuous lognormal growth model, Ross shows that recovery is not unique.
+
+On the finite grid, the natural transition probabilities and state prices are
+
+$$
+f_{ij} \propto
+ n\!\left(\frac{s_j - s_i - (\mu - \frac{1}{2}\sigma^2)T}{\sigma\sqrt{T}}\right)
+ \Delta s,
+\qquad
+p_{ij} = e^{-\rho T} e^{-\gamma(s_j - s_i)} f_{ij},
+$$
+
+where $s_i = \ln S_i$, $s_j = \ln S_j$, $n(\cdot)$ is the standard normal density, and
+the discretized probabilities $f_{ij}$ are normalized row by row.
+
+The next cell constructs this finite grid and builds $P$.
+
+```{code-cell} ipython3
+def build_state_price_matrix(μ, σ, γ, ρ, T=1.0, n_states=11, n_σ=5):
+ """Build a discretized lognormal/CRRA state-price matrix.
+
+ Parameters
+ ----------
+ μ, σ : drift and volatility of the lognormal payoff process.
+ γ : CRRA risk-aversion coefficient in the pricing kernel.
+ ρ : continuously compounded subjective discount rate.
+ T : horizon in years.
+ n_states : number of grid points for the log payoff state.
+ n_σ : half-width of the grid in units of σ√T.
+
+ Returns
+ -------
+ P : (n_states, n_states) state-price transition matrix p_ij = φ_ij f_ij.
+ states : the log-payoff grid, symmetric about zero.
+ """
+ # Log-state grid spanning ±n_σ standard deviations of the T-period return
+ states = np.linspace(-n_σ * σ * np.sqrt(T),
+ n_σ * σ * np.sqrt(T),
+ n_states)
+ ds = states[1] - states[0]
+
+ m = n_states
+ P = np.zeros((m, m))
+ F = np.zeros((m, m))
+
+ drift = (μ - 0.5 * σ**2) * T
+
+ # First build a row-stochastic natural transition matrix on the bounded grid
+ for i in range(m):
+ s_i = states[i]
+ for j in range(m):
+ s_j = states[j]
+ log_return = s_j - s_i
+ # Discretized normal density of the log return, scaled by grid spacing
+ F[i, j] = norm.pdf(log_return, loc=drift,
+ scale=σ * np.sqrt(T)) * ds
+
+ # Renormalize the row: truncation at the grid boundary loses mass
+ F[i] = F[i] / F[i].sum()
+
+ # Price each Arrow claim as natural probability times the CRRA kernel
+ for j in range(m):
+ log_return = states[j] - s_i
+ # Transition-independent kernel φ_ij = e^{-ρT} e^{-γ(s_j - s_i)}
+ kernel = np.exp(-ρ * T) * np.exp(-γ * log_return)
+ P[i, j] = kernel * F[i, j]
+
+ return P, states
+```
+
+Now choose a calibration and build the state-price matrix.
+
+```{code-cell} ipython3
+# Baseline calibration for the lognormal/CRRA example
+μ = 0.08 # 8% annual expected return
+σ = 0.20 # 20% annual volatility
+γ = 3.0 # CRRA coefficient
+ρ = 0.02 # 2% annual continuous discount rate
+T = 1.0 # one-year horizon
+
+P, states = build_state_price_matrix(μ, σ, γ, ρ, T,
+ n_states=11, n_σ=5)
+
+# Row sums are state-dependent one-period bond prices e^{-r(θ_i)}
+print("State-price row sums:")
+print(np.round(P.sum(axis=1), 4))
+# NOTE: index 5 hard-codes the middle state of the 11-point grid
+print(f"Middle-state risk-free rate: {-np.log(P[5].sum()):.4f}")
+```
+
+The row sums are the model-implied one-period bond prices in each current state.
+
+They
+vary near the boundaries because the finite grid truncates and renormalizes the
+conditional transition probabilities.
+
+### Applying the recovery theorem
+
+The Recovery Theorem requires computing the **Perron eigenvector** of $P$.
+
+```{code-cell} ipython3
+def recover_natural_distribution(P, tol=1e-10):
+ """
+ Recover natural probabilities and the relative pricing kernel
+ from state prices.
+
+ Implements Ross recovery: find the Perron root β and strictly positive
+ eigenvector z of P, then form F = (1/β) D P D⁻¹ with D = diag(1/z).
+
+ Parameters
+ ----------
+ P : (m, m) nonnegative, irreducible state-price transition matrix.
+ tol : strict-positivity tolerance for eigenvector entries.
+
+ Returns
+ -------
+ F : recovered natural probability transition matrix (row-stochastic).
+ z : Perron eigenvector, normalized so the middle state equals one.
+ β_recovered : Perron eigenvalue (subjective discount factor).
+ φ_relative : destination-state kernel shape 1/z (Ross's Table I form).
+
+ Raises
+ ------
+ ValueError : if no strictly positive real eigenvector is found, or the
+ recovered F fails positivity / row-sum checks.
+ """
+
+ m = P.shape[0]
+
+ eigenvalues, eigenvectors = eig(P)
+ # eig returns complex arrays; drop negligible imaginary parts
+ eigenvalues = np.real_if_close(eigenvalues, tol=1000)
+ eigenvectors = np.real_if_close(eigenvectors, tol=1000)
+
+ # Ross recovery uses the Perron root and its strictly positive eigenvector
+ real_mask = np.isreal(eigenvalues)
+ real_eigenvalues = np.asarray(
+ eigenvalues[real_mask].real, dtype=float)
+ real_eigenvectors = np.asarray(
+ eigenvectors[:, real_mask].real, dtype=float)
+
+ # Scan real eigenvalues from largest to smallest; the Perron root is
+ # the largest one admitting a strictly positive eigenvector
+ order = np.argsort(real_eigenvalues)[::-1]
+
+ for idx in order:
+ β_candidate = real_eigenvalues[idx]
+ z_candidate = real_eigenvectors[:, idx]
+
+ # Eigenvectors are defined up to sign; flip to the positive orientation
+ if np.mean(z_candidate) < 0:
+ z_candidate = -z_candidate
+
+ if β_candidate > 0 and np.all(z_candidate > tol):
+ β_recovered = β_candidate
+ z = z_candidate
+ break
+ else:
+ # for-else: no candidate satisfied positivity — recovery fails
+ raise ValueError("No strictly positive real eigenvector found")
+
+ # Normalize so the middle state has z = 1 (Ross's Table I convention)
+ z = z / z[m // 2]
+
+ D = np.diag(1.0 / z)
+ D_inv = np.diag(z)
+
+ # Converts state prices into probabilities
+ F = (1.0 / β_recovered) * D @ P @ D_inv
+
+ # Sanity checks: F must be (numerically) a stochastic matrix
+ min_entry = F.min()
+ row_sum_error = np.max(np.abs(F.sum(axis=1) - 1.0))
+
+ if min_entry < -tol:
+ raise ValueError(f"Recovered F has negative entries: min={min_entry}")
+
+ if row_sum_error > 1e-8:
+ raise ValueError(
+ f"Recovered F row sums are not one: max error={row_sum_error}"
+ )
+
+ # The kernel relative to the middle state normalization
+ φ_relative = 1.0 / z
+
+ return F, z, β_recovered, φ_relative
+```
+
+The Perron vector also recovers the shape of the pricing kernel.
+
+Ross's Table I reports this shape with the middle state normalized to one, which is
+$1/z_j$ under our normalization $z_{\text{mid}}=1$.
+
+```{code-cell} ipython3
+# Run recovery on the baseline state-price matrix built above
+F, z, β_rec, φ_relative = recover_natural_distribution(P)
+
+print("Ross-normalized kernel 1/z (middle state = 1):")
+print(np.round(φ_relative, 4))
+```
+
+Because we know the data-generating natural transition matrix used to construct
+$P$, we can verify that recovery works in this simulation.
+
+```{code-cell} ipython3
+def true_lognormal_transition_matrix(states, μ, σ, T):
+ """
+ Construct the bounded-grid natural transition matrix used in the simulation.
+
+ Rebuilds the data-generating F (same discretization as
+ build_state_price_matrix) so the recovered F can be checked against it.
+ """
+ m = len(states)
+ ds = states[1] - states[0]
+ drift = (μ - 0.5 * σ**2) * T
+ F_true = np.zeros((m, m))
+
+ for i in range(m):
+ log_returns = states - states[i]
+ F_true[i] = norm.pdf(log_returns, loc=drift,
+ scale=σ * np.sqrt(T)) * ds
+ # Row renormalization compensates for boundary truncation
+ F_true[i] = F_true[i] / F_true[i].sum()
+
+ return F_true
+
+
+F_true = true_lognormal_transition_matrix(states, μ, σ, T)
+# Rebuild P from the recovered objects: p_ij = β (z_i / z_j) f_ij
+P_reconstructed = β_rec * (z[:, None] / z[None, :]) * F
+
+print("Recovery numerical checks")
+print(f"max |F - true F| = {np.max(np.abs(F - F_true)):.2e}")
+print(f"max |P - recovered kernel times F| = "
+ f"{np.max(np.abs(P - P_reconstructed)):.2e}")
+```
+
+Indeed, the discrepancies are at the level of numerical roundoff.
+
+## Natural vs. risk-neutral distributions
+
+A key insight of {cite:t}`Ross2015` is that the natural distribution can differ
+systematically from the risk-neutral one.
+
+In this CRRA example, where states are ordered from low to high payoff, the
+single-crossing argument in {ref}`ross-recovery-single-crossing` implies that
+the natural marginal density **first-order stochastically dominates** the
+risk-neutral density: the CDF of the natural distribution lies *below* that of
+the risk-neutral distribution.
+
+Because the pricing kernel is declining (investors fear bad outcomes), risk-neutral
+probabilities overweight bad states and underweight good states relative to the natural
+measure.
+
+We first plot the natural distribution against the risk-neutral one and the recovered
+relative pricing kernel
+
+```{code-cell} ipython3
+# Condition on the middle (current) state of the grid
+mid = len(states) // 2
+
+row_sums = P.sum(axis=1, keepdims=True)
+
+# Normalize Arrow prices by the one-period riskless bond price in each state
+Q_rn = P / row_sums
+
+# One-period conditional distributions from the middle state
+f_nat = F[mid, :]
+f_rn = Q_rn[mid, :]
+
+# Convert log states to gross returns S_T / S_0 for the x-axis
+gross_returns = np.exp(states)
+
+fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+
+axes[0].plot(gross_returns, f_nat, label='natural (recovered)', lw=2)
+axes[0].plot(gross_returns, f_rn, label='risk-neutral', lw=2)
+axes[0].set_xlabel('gross return $S_T / S_0$')
+axes[0].set_ylabel('probability')
+axes[0].set_title('one-period marginal distributions')
+axes[0].legend()
+
+axes[1].plot(gross_returns, φ_relative, 'g-^', lw=2)
+axes[1].set_xlabel('gross return $S_T / S_0$')
+axes[1].set_ylabel('relative kernel $1/z$')
+axes[1].set_title('recovered relative pricing kernel')
+plt.show()
+```
+
+The CDF clearly shows the first-order stochastic dominance
+
+```{code-cell} ipython3
+# CDFs over the ordered payoff grid; FOSD means natural CDF lies below
+cdf_nat = np.cumsum(f_nat)
+cdf_rn = np.cumsum(f_rn)
+
+fig, ax = plt.subplots(figsize=(9, 5))
+ax.plot(gross_returns, cdf_nat, lw=2, label='natural cdf')
+ax.plot(gross_returns, cdf_rn, lw=2, label='risk-neutral cdf')
+ax.set_xlabel('gross return $S_T / S_0$')
+ax.set_ylabel('cumulative probability')
+ax.legend()
+plt.show()
+
+# Numerical check of first-order stochastic dominance (with float slack)
+print(f"Natural CDF <= Risk-neutral CDF at all states: "
+ f"{np.all(cdf_nat <= cdf_rn + 1e-10)}")
+```
+
+The gap between the two CDFs is generated by the slope of the pricing kernel.
+
+In the
+CRRA benchmark, this slope is controlled by the risk-aversion coefficient $\gamma$.
+
+We next vary $\gamma$ to see how the recovered kernel and the natural/risk-neutral
+wedge change.
+
+## Effect of risk aversion
+
+The shape of the pricing kernel, and hence the gap between natural and risk-neutral
+probabilities, depends on the coefficient of risk aversion $\gamma$.
+
+We illustrate this by plotting the relative pricing kernel $1/z$ and the gap between
+the natural and risk-neutral densities for a range of values of $\gamma$.
+
+```{code-cell} ipython3
+# Sweep the risk-aversion coefficient and redo recovery for each value
+γs = [1.0, 2.0, 3.0, 5.0, 8.0]
+colors = cm.viridis(np.linspace(0.1, 0.9, len(γs)))
+
+fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+
+for γ_val, color in zip(γs, colors):
+ P_g, states_g = build_state_price_matrix(μ, σ, γ_val, ρ, T)
+ F_g, z_g, β_g, φ_relative_g = recover_natural_distribution(P_g)
+ mid_g = len(states_g) // 2
+
+ # Conditional natural and risk-neutral rows from the middle state
+ f_nat_g = F_g[mid_g, :]
+ row_sum = P_g[mid_g].sum()
+ f_rn_g = P_g[mid_g] / row_sum
+
+ gross = np.exp(states_g)
+
+ axes[0].plot(gross, φ_relative_g, color=color, lw=2,
+ label=f'$\\gamma={γ_val:.0f}$')
+ # Wedge between measures: positive where natural exceeds risk-neutral
+ axes[1].plot(gross, f_nat_g - f_rn_g, color=color, lw=2,
+ label=f'$\\gamma={γ_val:.0f}$')
+
+axes[0].set_xlabel('gross return')
+axes[0].set_ylabel('relative kernel $1/z$')
+axes[0].set_title('relative pricing kernel vs risk aversion')
+axes[0].legend(fontsize=9)
+
+axes[1].axhline(0, color='k', lw=0.8, ls='--')
+axes[1].set_xlabel('gross return')
+axes[1].set_ylabel('natural minus risk-neutral probability')
+axes[1].set_title('natural minus risk-neutral density')
+axes[1].legend(fontsize=9)
+
+plt.show()
+```
+
+Because the states are ordered from low to high payoff, the plots show the
+single-crossing property discussed in {ref}`ross-recovery-single-crossing`: for
+returns below some threshold $v$, risk-neutral probability exceeds natural
+probability; above $v$ the natural probability dominates.
+
+A higher $\gamma$ amplifies this wedge.
+
+## Recovering the discount rate
+
+A useful by-product of the Recovery Theorem is the *recovered subjective discount
+factor* $\beta$, which equals the Perron–Frobenius eigenvalue of $P$.
+
+If the horizon is $T$, the corresponding continuously compounded subjective discount
+rate is
+
+$$
+\rho = -\frac{\log \beta}{T}.
+$$
+
+In the numerical examples below, $T=1$, so this reduces to $\rho = -\log \beta$.
+
+Corollary 1 of {cite:t}`Ross2015` states that $\beta$ is bounded above by the largest
+state-dependent one-period discount factor — equivalently, the maximum row sum of $P$:
+
+$$
+\beta \leq \max_i \sum_j p(\theta_i, \theta_j).
+$$
+
+Sweeping the true $\rho$ over a grid and reporting the recovered values alongside the
+recovery error confirms that the eigenvalue calculation pins down $\beta$ accurately:
+
+```{code-cell} ipython3
+# Sweep the true discount rate; recovery should return it via β = e^{-ρT}
+true_ρs = np.linspace(0.00, 0.06, 13)
+recovered_ρs = np.empty_like(true_ρs)
+
+for k, rho in enumerate(true_ρs):
+ P_d, _ = build_state_price_matrix(μ, σ, γ=3.0, ρ=rho, T=1.0)
+ _, _, β_d, _ = recover_natural_distribution(P_d)
+ # With T = 1, the recovered rate is simply -log β
+ recovered_ρs[k] = -np.log(β_d)
+
+print(
+ f"max |true ρ - recovered ρ| = {np.max(np.abs(true_ρs - recovered_ρs)):.2e}")
+np.column_stack([true_ρs, recovered_ρs])
+```
+
+## Tail risk: natural vs. risk-neutral probabilities of catastrophe
+
+One of the most striking applications of the Recovery Theorem is its ability to separate
+the market's recovered natural probability of catastrophes from the risk premium
+attached to them.
+
+{cite:t}`barro2006rare` and {cite:t}`MehraPrescott1985` discuss how rare disasters might
+explain the equity premium puzzle.
+
+The risk-neutral probability of a large decline is elevated both because (a) the market
+assigns a high natural probability to such events and (b) the pricing kernel upweights
+bad outcomes.
+
+Ross's recovery machinery lets us decompose these two forces.
+
+The next cell plots left-tail probabilities under the recovered natural and the
+risk-neutral measures from the middle state, so the gap between the curves isolates
+the pricing-kernel contribution to crash probabilities.
+
+```{code-cell} ipython3
+---
+mystnb:
+ figure:
+ caption: Tail probabilities under the recovered natural and risk-neutral measures
+ name: fig-tail-probs
+---
+# Log-return thresholds from a 40% decline to a 10% gain
+thresholds = np.linspace(-0.40, 0.10, 200)
+
+def tail_prob(f_dist, states, threshold):
+ """Left-tail probability for log returns.
+
+ Sums the probability mass f_dist over grid states at or below threshold.
+ """
+ return float(np.sum(f_dist[states <= threshold]))
+
+# Finer grid (41 states) for smoother tail-probability curves
+P_base, states_base = build_state_price_matrix(
+ μ, σ, γ=3.0, ρ=0.02, T=1.0,
+ n_states=41, n_σ=5)
+F_base, z_base, β_base, φ_relative_base = recover_natural_distribution(P_base)
+
+mid_b = len(states_base) // 2
+f_nat_base = F_base[mid_b]
+f_rn_base = P_base[mid_b] / P_base[mid_b].sum()
+
+prob_nat = [tail_prob(f_nat_base, states_base, t) for t in thresholds]
+prob_rn = [tail_prob(f_rn_base, states_base, t) for t in thresholds]
+
+fig, ax = plt.subplots(figsize=(10, 5))
+ax.plot(np.exp(thresholds), prob_nat, lw=2, label='natural (recovered)')
+ax.plot(np.exp(thresholds), prob_rn, lw=2, label='risk-neutral')
+ax.set_xlabel('gross return threshold')
+ax.set_ylabel('probability of decline below threshold')
+ax.axvline(x=0.75, color='gray', ls=':', lw=1.5, label='25% decline')
+ax.axvline(x=0.70, color='silver', ls=':', lw=1.5, label='30% decline')
+ax.legend()
+plt.show()
+```
+
+This is a simulation illustrating Ross's decomposition.
+
+The risk-neutral density assigns higher probability to large drops than the recovered
+natural density.
+
+In this CRRA
+simulation, increasing risk aversion makes the risk-neutral crash probability rise
+faster than the recovered natural crash probability.
+
+We will say more in {ref}`rt_ex3`.
+
+## From option prices to transition prices
+
+The numerical example above starts from a known state-price transition matrix
+$P$.
+
+Empirically, Ross starts one step earlier: option prices reveal state-price
+densities at different maturities from the current state, and the transition
+matrix must be inferred from those maturity-by-maturity state prices.
+
+Let $C(K,T)$ be the price of a call option with strike $K$ and maturity $T$.
+
+If $p(S,T)$ is the state-price density for terminal index level $S$, then
+
+$$
+C(K,T)
+ = \int_K^\infty (S-K) p(S,T) \, dS.
+$$
+
+Differentiating twice with respect to the strike gives the
+{cite:t}`BreedenLitzenberger1978` formula
+
+$$
+p(K,T) = \frac{\partial^2 C(K,T)}{\partial K^2}.
+$$
+
+After discretizing strikes and maturities, let
+
+$$
+p_t(c) = \big(p_t(c,1), \ldots, p_t(c,m)\big)
+$$
+
+be the vector of state prices at horizon $t$ observed from today's state $c$.
+
+Here $c$ indexes the current state and $t$ counts discrete maturity steps.
+
+The first one-period vector $p_1(c)$ identifies the row of $P$ corresponding to
+the current state $c$, supplying $m$ equations.
+
+If the one-period state-price transition matrix $P$ is time homogeneous, these
+vectors satisfy the forward recursion
+
+$$
+p_{t+1}(c) = p_t(c) P,
+\qquad t=1,\ldots,m-1.
+$$
+
+Componentwise,
+
+$$
+p_{t+1}(c,j) = \sum_k p_t(c,k) p(k,j).
+$$
+
+The remaining $m-1$ forward equations $p_{t+1}(c)=p_t(c)P$, each with $m$
+components, supply the remaining $m(m-1)$ equations.
+
+Together these give $m^2$ equations for the $m^2$ transition prices $p(k,j)$.
+
+In practice this step is numerically delicate because the second derivative in
+the option-price formula amplifies measurement error, and because additional
+shape restrictions such as positivity or unimodality may be needed to obtain a
+reasonable transition matrix.
+
+## Testing efficient markets
+
+The recovered pricing kernel can also be used to test market efficiency, under the assumptions of the Recovery Theorem.
+
+If a trading strategy has a very high Sharpe ratio, then some pricing kernel must be
+volatile enough to price that payoff.
+
+The Hansen--Jagannathan bound {cite}`Hansen_Jagannathan_1991` says that, for any excess
+return with mean $\mu_\text{excess}$ and standard deviation $\sigma_\text{asset}$,
+
+$$
+\frac{|\mu_\text{excess}|}{\sigma_\text{asset}} \leq e^{rT}\, \sigma(M),
+$$
+
+where $M$ is the one-period stochastic discount factor and $r$ is the
+continuously compounded riskless rate over horizon $T$.
+
+Ross's point is that recovery gives an estimate of the relevant volatility
+$\sigma(M)$.
+
+Hence it gives an upper bound on the Sharpe ratio of any strategy based on the same
+stock-market information used in recovery.
+
+If such a strategy has a Sharpe ratio above the bound, then it is too profitable to be
+consistent with efficiency, under the assumptions of the Recovery
+Theorem.
+
+The same logic gives a bound on return predictability.
+
+Suppose excess returns are decomposed as
+
+$$
+x_{t+1} = \mu(I_t) + \epsilon_{t+1},
+$$
+
+where $I_t$ is the stock-market information set and $\epsilon_{t+1}$ is unpredictable
+from $I_t$.
+
+Then the $R^2$ of a forecasting regression based on $I_t$ is bounded above by the
+variance of the recovered kernel:
+
+$$
+R^2 \leq e^{2rT} \, \sigma^2(M).
+$$
+
+Only the component of the kernel projected on this information set is relevant.
+
+Adding unrelated noise to a candidate pricing kernel would raise its variance, but it
+would not justify stronger return predictability from stock-market information.
+
+## Limitations and extensions
+
+The Recovery Theorem is a remarkable theoretical result, but several caveats apply in
+practice.
+
+*Finite state space:*
+
+Ross's theorem is proved for a finite-state irreducible Markov chain; bounded
+continuous-state recovery requires additional results in {doc}`misspecified_recovery`.
+
+In continuous, unbounded state spaces (e.g., a lognormal diffusion), uniqueness fails
+because any exponential $e^{\alpha x}$ satisfies the characteristic equation.
+
+To see the issue, consider the continuous lognormal growth state-price density
+above.
+
+The natural continuous-space analogue of the Perron--Frobenius problem is
+
+$$
+\int p_T(s,y) v(y) \, dy = \lambda v(s).
+$$
+
+Here $y$ is a possible future log state, $v$ is a candidate positive
+eigenfunction, and $\lambda$ is its eigenvalue.
+
+For every real $\alpha$, the exponential function $v_\alpha(s)=e^{\alpha s}$
+solves this equation with eigenvalue
+
+$$
+\lambda(\alpha)
+ =
+ \exp\!\left(
+ -\rho T
+ +(\alpha-\gamma)(\mu-\tfrac{1}{2}\sigma^2)T
+ +\tfrac{1}{2}\sigma^2T(\alpha-\gamma)^2
+ \right).
+$$
+
+The positive eigenfunction is therefore not unique.
+
+This is why truncation or boundedness assumptions matter: they turn the
+continuous operator problem back into a Perron--Frobenius problem with a
+unique positive eigenvector.
+
+{cite:t}`CarrYu2012` establish recovery with a bounded diffusion.
+
+*Transition independence:*
+
+If the kernel is not transition independent, recovery is not guaranteed.
+
+{cite:t}`BorovickaHansenScheinkman2016` show that the Ross recovery can confound the
+long-run risk component of the kernel with the natural probability distribution,
+yielding an incorrect decomposition.
+
+We discuss this in the sequel lecture {doc}`misspecified_recovery`.
+
+*Empirical estimation:*
+
+Extracting reliable state prices from observed option prices requires careful
+interpolation and extrapolation.
+
+The mapping from implied volatilities to state prices via the
+{cite:t}`BreedenLitzenberger1978` formula involves second derivatives, which amplify
+measurement error.
+
+## Exercises
+
+```{exercise}
+:label: rt_ex1
+
+**The Perron--Frobenius vector and the pricing kernel.**
+
+Consider the $3 \times 3$ state price matrix
+
+$$
+P = \begin{pmatrix}
+0.5950 & 0.1700 & 0.0272 \\
+0.159375 & 0.5525 & 0.1360 \\
+0.06640625 & 0.31875 & 0.5525
+\end{pmatrix}.
+$$
+
+1. Compute the Perron eigenvalue $\beta$ and the corresponding eigenvector $z$ of
+$P$.
+
+2. Use $z$ to recover the natural probability transition matrix $F$ via
+
+$$
+f_{ij} = \frac{1}{\beta} \frac{z_j}{z_i} p_{ij}.
+$$
+
+3. Verify that each row of $F$ sums to one and all entries are positive.
+
+4. For destination state $j$, the relative kernel component is $1/z_j$; for a
+transition from state $i$ to state $j$, the full pricing kernel is $\beta z_i/z_j$.
+Compute $1/z_j$ for each state.
+
+Does the kernel decrease as we move from state 1 to state 3 (i.e., from bad to good
+states)?
+```
+
+```{solution-start} rt_ex1
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} ipython3
+P_ex = np.array([  # state-price matrix P from the exercise statement
+ [0.5950, 0.1700, 0.0272],
+ [0.159375, 0.5525, 0.1360],
+ [0.06640625, 0.31875, 0.5525]
+])
+
+eigenvalues, eigenvectors = eig(P_ex)  # eig assumed imported earlier in the lecture
+real_mask = np.isreal(eigenvalues)  # discard complex eigenpairs
+real_ev = eigenvalues[real_mask].real
+real_evec = eigenvectors[:, real_mask].real
+
+idx = np.argmax(real_ev)  # Perron root = largest real eigenvalue
+β_ex = real_ev[idx]  # Perron eigenvalue β
+z_ex = real_evec[:, idx]  # associated eigenvector (determined up to sign and scale)
+if z_ex.min() < 0:  # flip sign so the Perron vector is positive
+ z_ex = -z_ex
+z_ex = z_ex / z_ex[1]  # fix the free scale; the recovered F is invariant to this choice
+
+print(f"β = {β_ex:.6f}")
+print(f"z = {z_ex}")
+
+D_ex = np.diag(1.0 / z_ex)  # D = diag(1/z)
+D_inv_ex = np.diag(z_ex)  # D^{-1} = diag(z)
+F_ex = (1.0 / β_ex) * D_ex @ P_ex @ D_inv_ex  # f_ij = (1/β)(z_j/z_i) p_ij
+
+print("\nRecovered F:")
+print(np.round(F_ex, 4))
+
+print(f"\nRow sums: {np.round(F_ex.sum(axis=1), 8)}")  # each row of F should sum to 1
+print(f"Nonnegative: {(F_ex >= -1e-10).all()}")  # small tolerance for floating-point error
+
+φ_relative_ex = 1.0 / z_ex  # relative kernel component 1/z_j per destination state
+print(f"\nrelative kernel 1/z = {np.round(φ_relative_ex, 4)}")
+print(f"Decreasing: {φ_relative_ex[0] > φ_relative_ex[1] > φ_relative_ex[2]}")  # kernel should fall from bad to good states
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: rt_ex2
+
+**Stochastic dominance.**
+
+Using the recovered $F$ and the normalised risk-neutral matrix $Q = P / \text{row sums}$
+from the exercise above:
+
+1. Compute the one-step marginal distributions $f_j = F_{2,j}$ and $q_j = Q_{2,j}$
+starting from state 2 (index 1 in Python).
+
+2. Compute the CDFs $\hat F_k = \sum_{j \leq k} f_j$ and
+$\hat Q_k = \sum_{j \leq k} q_j$ for each state.
+
+3. Verify numerically that $\hat F_k \leq \hat Q_k$ for every $k$, confirming stochastic
+dominance in this ordered three-state example.
+```
+
+```{solution-start} rt_ex2
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} ipython3
+eigenvalues, eigenvectors = eig(P_ex)  # P_ex comes from the rt_ex1 solution cell
+real_mask = np.isreal(eigenvalues)  # re-derive β, z, F so this cell is self-contained
+real_ev = eigenvalues[real_mask].real
+real_evec = eigenvectors[:, real_mask].real
+idx = np.argmax(real_ev)  # Perron root = largest real eigenvalue
+β_ex = real_ev[idx]
+z_ex = real_evec[:, idx]
+if z_ex.min() < 0:  # flip sign so the Perron vector is positive
+ z_ex = -z_ex
+z_ex = z_ex / z_ex[1]  # fix the free scale
+
+D_ex = np.diag(1.0 / z_ex)
+D_inv_ex = np.diag(z_ex)
+F_ex = (1.0 / β_ex) * D_ex @ P_ex @ D_inv_ex  # recovered natural transition matrix
+
+start = 1  # start from state 2 (0-based index 1)
+f_marg = F_ex[start]  # one-step natural marginal
+q_marg = P_ex[start] / P_ex[start].sum()  # risk-neutral marginal: normalise state prices by the row sum
+
+print("One-step marginals from state 2:")
+print(f"natural = {np.round(f_marg, 4)}")
+print(f"risk-neutral = {np.round(q_marg, 4)}")
+
+cdf_nat = np.cumsum(f_marg)  # CDF over the ordered states
+cdf_rn = np.cumsum(q_marg)
+
+print("\nCDFs:")
+for k in range(3):
+ print(f"state {k+1}: natural = {cdf_nat[k]:.4f}, risk-neutral = {cdf_rn[k]:.4f}")
+
+dominates = np.all(cdf_nat <= cdf_rn + 1e-10)  # first-order stochastic dominance check, with float tolerance
+print(f"\nNatural CDF <= risk-neutral CDF: {dominates}")
+```
+
+```{solution-end}
+```
+
+```{exercise}
+:label: rt_ex3
+
+**Risk aversion and tail risk.**
+
+Write a function `tail_risk_ratio(γ, threshold, μ, σ, ρ, T)` that:
+
+1. Constructs the state price matrix $P$ using `build_state_price_matrix` with
+ the given parameters and `n_states=41`.
+2. Applies `recover_natural_distribution` to obtain $F$.
+3. Computes $P(\text{log-return} \leq \text{threshold})$ under both the natural
+ and risk-neutral distributions starting from the middle state.
+4. Returns the ratio $p_\text{risk-neutral} / p_\text{natural}$.
+
+Using this function, plot the ratio as a function of $\gamma \in [1, 10]$ for a
+30 percent simple decline, i.e. `threshold = np.log(0.70)`.
+
+Explain the economic interpretation: why does a higher $\gamma$ raise the ratio?
+```
+
+```{solution-start} rt_ex3
+:class: dropdown
+```
+
+Here is one solution:
+
+```{code-cell} ipython3
+def tail_risk_ratio(γ, threshold, μ=0.08, σ=0.20, ρ=0.02, T=1.0):
+ """Ratio of risk-neutral to natural P(log-return <= threshold) from the middle state."""
+ P_g, states_g = build_state_price_matrix(
+ μ, σ, γ, ρ, T, n_states=41, n_σ=5)  # helper defined earlier in the lecture
+
+ F_g, _, _, _ = recover_natural_distribution(P_g)  # Ross recovery of the natural F
+
+ mid_g = len(states_g) // 2  # middle grid point as the starting state
+
+ f_nat_g = F_g[mid_g]  # natural one-step marginal
+ f_rn_g = P_g[mid_g] / P_g[mid_g].sum()  # risk-neutral marginal: row of P normalised by row sum
+
+ p_nat = float(np.sum(f_nat_g[states_g <= threshold]))  # left-tail mass under each measure
+ p_rn = float(np.sum(f_rn_g[states_g <= threshold]))
+
+ if p_nat < 1e-12:  # guard against division by a numerically zero tail
+ return np.nan
+ return p_rn / p_nat
+
+
+γs = np.linspace(1.0, 10.0, 20)  # risk-aversion grid γ ∈ [1, 10]
+threshold_30 = np.log(0.70)  # log-return for a 30% simple decline
+ratios = [tail_risk_ratio(g, threshold_30) for g in γs]
+
+plt.figure(figsize=(9, 5))
+plt.plot(γs, ratios, '-o', ms=5, lw=2)
+plt.xlabel('risk aversion coefficient $\\gamma$')
+plt.ylabel('risk-neutral / natural tail probability')
+plt.title('tail risk ratio for a 30% decline vs risk aversion')
+plt.show()
+```
+
+A higher coefficient of risk aversion $\gamma$ makes the pricing kernel steeper: the
+market assigns a larger premium per unit of probability to bad-state payoffs.
+
+Risk-neutral probabilities incorporate this premium, so in this CRRA simulation the
+risk-neutral crash probability rises faster with $\gamma$ than the recovered natural
+crash probability.
+
+Recovery separates the market's estimated natural crash probability from the
+pricing-kernel premium attached to crash states.
+
+```{solution-end}
+```
diff --git a/lectures/rs_inventory_q.md b/lectures/rs_inventory_q.md
index 347932d20..0a65a44c7 100644
--- a/lectures/rs_inventory_q.md
+++ b/lectures/rs_inventory_q.md
@@ -378,7 +378,7 @@ The key is to identify where the randomness in profits actually comes from.
Recall that per-period profit is $\pi(x, a, d) = \min(x, d) - ca - \kappa
\mathbf{1}\{a > 0\}$.
-The ordering cost $ca + \kappa \mathbf{1}\{a > 0\}$ is **deterministic** — it
+The ordering cost $ca + \kappa \mathbf{1}\{a > 0\}$ is **deterministic** -- it
is chosen before the demand shock is realized.
So higher ordering shifts the level of profits down but does not affect their
@@ -387,9 +387,9 @@ variance.
The variance comes from **revenue**: $\min(x, D)$.
When inventory $x$ is high, $\min(x, D) \approx D$ for most demand
-realizations — revenue inherits the full variance of demand.
+realizations -- revenue inherits the full variance of demand.
-When inventory $x$ is low, $\min(x, D) \approx x$ for most realizations —
+When inventory $x$ is low, $\min(x, D) \approx x$ for most realizations --
revenue is nearly deterministic, capped at the inventory level.
A risk-sensitive agent therefore prefers lower inventory because it **caps the
@@ -484,7 +484,7 @@ $$
\right].
$$
-This is a fixed point equation in $q$ alone — $v^*$ has been eliminated.
+This is a fixed point equation in $q$ alone -- $v^*$ has been eliminated.
### The Q-learning update rule
@@ -515,7 +515,7 @@ standard Q-learning.
Notice several differences from the risk-neutral case:
- The Q-values are **positive** (expectations of exponentials) rather than signed.
-- The optimal policy is $\sigma(x) = \argmin_a q(x, a)$ — we **minimize**
+- The optimal policy is $\sigma(x) = \argmin_a q(x, a)$ -- we **minimize**
rather than maximize, because $\psi^{-1}$ is decreasing.
- The observed profit enters through $\exp(-\gamma R_{t+1})$ rather than
additively.
@@ -523,7 +523,7 @@ Notice several differences from the risk-neutral case:
than a scaled sum $\beta \cdot \max_{a'} q_t$.
As before, the agent needs only to observe $x$, $a$, $R_{t+1}$, and
-$X_{t+1}$ — no model knowledge is required.
+$X_{t+1}$ -- no model knowledge is required.
### Implementation plan
@@ -552,7 +552,7 @@ Our implementation follows the same structure as the risk-neutral Q-learning in
As in {doc}`inventory_q`, we use optimistic initialization to accelerate learning.
-The logic is the same — initialize the Q-table so that every untried action looks attractive, driving the agent to explore broadly — but the direction is reversed.
+The logic is the same -- initialize the Q-table so that every untried action looks attractive, driving the agent to explore broadly -- but the direction is reversed.
Since the optimal policy *minimizes* $q$, "optimistic" means initializing the Q-table *below* the true values. When the agent tries an action, the update pushes $q$ upward toward reality, making that entry look worse and prompting the agent to try other actions that still appear optimistically good.
diff --git a/lectures/survival_recursive_preferences.md b/lectures/survival_recursive_preferences.md
index b1347c279..8dfa5b673 100644
--- a/lectures/survival_recursive_preferences.md
+++ b/lectures/survival_recursive_preferences.md
@@ -315,7 +315,7 @@ subject to $z^1 + z^2 \leq 1$.
The first line is the flow payoff from the two agents' felicity functions.
-The second line multiplies $\tilde{J}(\upsilon)$ by a term that combines the agents' discount rates, belief-weighted endowment drift, and a variance correction — these arise from absorbing the $Y^{1-\gamma}$ factor via Itô's lemma.
+The second line multiplies $\tilde{J}(\upsilon)$ by a term that combines the agents' discount rates, belief-weighted endowment drift, and a variance correction -- these arise from absorbing the $Y^{1-\gamma}$ factor via Itô's lemma.
The third line multiplies $\tilde{J}'(\upsilon)$ by the drift of the Pareto share, which depends on the difference in discount rates and the belief-weighted response to endowment risk.
@@ -387,7 +387,7 @@ $$ (eq:wealth_decomp)
The first term measures how much faster agent 1's portfolio grows.
-The second measures how much less agent 1 consumes out of wealth — a lower consumption-wealth ratio means more saving and faster wealth accumulation.
+The second measures how much less agent 1 consumes out of wealth -- a lower consumption-wealth ratio means more saving and faster wealth accumulation.
When this total difference is positive, agent 1 survives; when negative, she shrinks toward extinction.
@@ -508,7 +508,7 @@ $$
= \frac{1-\rho}{\rho} \left[(\omega^1 - \omega^2)\sigma_Y + \frac{(\omega^1 - \omega^2)^2}{2\gamma}\right]
$$ (eq:consumption_rates)
-The term in brackets is the difference in *subjective* expected portfolio returns — what agent 1 believes she earns relative to agent 2.
+The term in brackets is the difference in *subjective* expected portfolio returns -- what agent 1 believes she earns relative to agent 2.
The factor $(1-\rho)/\rho$ translates this perceived return advantage into a saving response.
@@ -772,8 +772,8 @@ plt.show()
Each panel plots two curves in the $(\gamma, \rho)$ plane for a different value of agent 1's belief distortion $\omega^1$ (agent 2 has correct beliefs, $\omega^2 = 0$).
-- The dashed curve (blue) is where the boundary drift at $\upsilon = 0$ equals zero — condition (i) in {prf:ref}`survival_conditions`.
-- The solid curve (red) is where the boundary drift at $\upsilon = 1$ equals zero — condition (ii).
+- The dashed curve (blue) is where the boundary drift at $\upsilon = 0$ equals zero -- condition (i) in {prf:ref}`survival_conditions`.
+- The solid curve (red) is where the boundary drift at $\upsilon = 1$ equals zero -- condition (ii).
- The shaded region between the two curves is where both agents survive.
- The dotted diagonal $\gamma = \rho$ is the separable CRRA case, along which the agent with more accurate beliefs always dominates.
@@ -937,7 +937,7 @@ This is outcome (d) in {prf:ref}`survival_conditions`: neither boundary is repel
As $\gamma$ increases past roughly 1, the blue curve crosses zero and becomes positive while the red curve stays negative.
-Now both boundaries are repelling and we enter the coexistence region — outcome (a).
+Now both boundaries are repelling and we enter the coexistence region -- outcome (a).
## The separable case
@@ -1004,7 +1004,7 @@ This figure simulates 20 sample paths of the Pareto share $\upsilon_t$ under sep
Agent 2 has correct beliefs, so the log-odds drift is negative and all paths trend toward $\upsilon = 0$.
-Agent 1 is driven to extinction — the classical market-selection result of {cite:t}`Blume_Easley2006`.
+Agent 1 is driven to extinction -- the classical market-selection result of {cite:t}`Blume_Easley2006`.
## Asset pricing implications
@@ -1227,7 +1227,7 @@ plt.show()
The left panel shows 20 sample paths of the Pareto share $\upsilon_t$ under parameters inside the coexistence region ($\omega^1 = 0.25$, $\omega^2 = 0$, $\gamma = 5$, IES $\approx 1.49$).
-Unlike the separable case in {numref}`fig-crra-pareto-paths`, the paths do not drift to zero — they repeatedly visit a wide range of values, bouncing between the two repelling boundaries.
+Unlike the separable case in {numref}`fig-crra-pareto-paths`, the paths do not drift to zero -- they repeatedly visit a wide range of values, bouncing between the two repelling boundaries.
The right panel approximates the stationary density by pooling the second half of longer simulations.
diff --git a/lectures/theil_1.md b/lectures/theil_1.md
index caaa874ec..1af931bb9 100644
--- a/lectures/theil_1.md
+++ b/lectures/theil_1.md
@@ -53,7 +53,7 @@ from quantecon import LQ
Their result justifies a convenient two-step algorithm:
1. **Optimize** under perfect foresight (treat future exogenous variables as known).
-2. **Forecast** — substitute optimal forecasts for the unknown future values.
+2. **Forecast** -- substitute optimal forecasts for the unknown future values.
The striking insight is that these two steps are completely separable.
@@ -177,7 +177,7 @@ As part of its computational tractability, this specialization delivers a striki
Under quadratic $V$ and linear $g$, the optimal decision rule $h$ decomposes into two components applied in sequence.
-**Step 1 — Forecasting.** Define the infinite sequence of optimal point forecasts of all current and future states of nature:
+**Step 1 -- Forecasting.** Define the infinite sequence of optimal point forecasts of all current and future states of nature:
```{math}
:label: eq:forecast_sequence_v3
@@ -195,7 +195,7 @@ The optimal forecast sequence is a (generally nonlinear) function of the current
The function $h_2 : S_1 \to S_1^\infty$ depends entirely on the environment $(f, \Phi)$ and is obtained as the solution to a **pure forecasting problem**, with no reference to preferences or technology.
-**Step 2 — Optimization.** Given the forecast sequence $\tilde{z}_t$, the optimal action is a **linear** function of $\tilde{z}_t$ and $x_t$:
+**Step 2 -- Optimization.** Given the forecast sequence $\tilde{z}_t$, the optimal action is a **linear** function of $\tilde{z}_t$ and $x_t$:
```{math}
:label: eq:optimization_rule_v3
@@ -226,21 +226,21 @@ The relationship of original interest, $h = T(f)$, then follows directly from {e
### Certainty equivalence and perfect foresight
-The name "certainty equivalence" reflects a further implication of the LQ structure: the function $h_1$ can be derived as if the agent **knew the future path $z_{t+1}, z_{t+2}, \ldots$ with certainty** — i.e., by solving the deterministic problem in which $\tilde{z}_t$ is treated as the realized path rather than a forecast.
+The name "certainty equivalence" reflects a further implication of the LQ structure: the function $h_1$ can be derived as if the agent **knew the future path $z_{t+1}, z_{t+2}, \ldots$ with certainty** -- i.e., by solving the deterministic problem in which $\tilde{z}_t$ is treated as the realized path rather than a forecast.
Randomness of the environment affects actions only through the forecast $\tilde{z}_t$; conditional on $\tilde{z}_t$, the optimization problem is deterministic.
This means the LQ problem decouples into:
- * **Dynamic optimization under perfect foresight** — solve for $h_1$ from $(V, g)$ by treating $\tilde{z}_t$ as known, yielding a standard deterministic LQ regulator problem independent of the environment $(f, \Phi)$.
+ * **Dynamic optimization under perfect foresight** -- solve for $h_1$ from $(V, g)$ by treating $\tilde{z}_t$ as known, yielding a standard deterministic LQ regulator problem independent of the environment $(f, \Phi)$.
- * **Optimal linear prediction** — solve for $h_2 = S(f)$ from $(f, \Phi)$ using least-squares forecasting theory, which reduces to a standard Kalman/Wiener prediction formula when $f$ is itself linear.
+ * **Optimal linear prediction** -- solve for $h_2 = S(f)$ from $(f, \Phi)$ using least-squares forecasting theory, which reduces to a standard Kalman/Wiener prediction formula when $f$ is itself linear.
### Cross-equation restrictions
A hallmark of the rational expectations hypothesis as it appears in this framework is that it ties together what would otherwise be free parameters in different equations.
-The requirement that $\tilde{z}_t = h_2(z_t) = S(f)(z_t)$ — i.e., that agents' forecasts be *optimal* with respect to the *actual* law of motion $f$ — imposes **cross-equation restrictions** between the parameters of the forecasting rule $h_2$ and the parameters of the environment $f$.
+The requirement that $\tilde{z}_t = h_2(z_t) = S(f)(z_t)$ -- i.e., that agents' forecasts be *optimal* with respect to the *actual* law of motion $f$ -- imposes **cross-equation restrictions** between the parameters of the forecasting rule $h_2$ and the parameters of the environment $f$.
These restrictions, rather than any conditions on distributed lags within a single equation, are the operative empirical content of rational expectations.
@@ -318,7 +318,7 @@ Prior practice, exemplified by the adaptive expectations mechanisms of {cite:t}`
treating the coefficient $\lambda$ as a free parameter to be estimated from data, with no reference to the underlying environment $f$.
-The deficiency is not that {eq}`eq:adaptive_expectations_v3` is a distributed lag — linear forecasting rules are perfectly acceptable simplifications.
+The deficiency is not that {eq}`eq:adaptive_expectations_v3` is a distributed lag -- linear forecasting rules are perfectly acceptable simplifications.
The deficiency is that the **coefficients** of the distributed lag are left unrestricted by theory.
@@ -381,7 +381,7 @@ print(f"Theoretical slope β/(1-β)*P = {theoretical_slope:.4f}")
The slope is indeed $\tfrac{\beta}{1-\beta} P$, confirming the analytic formula.
-The value matrix $P$ is determined entirely by preferences and technology, not by the noise level — a direct consequence of the certainty equivalence principle.
+The value matrix $P$ is determined entirely by preferences and technology, not by the noise level -- a direct consequence of the certainty equivalence principle.
```{solution-end}
```
diff --git a/lectures/theil_2.md b/lectures/theil_2.md
index b342cda1f..7c1dd2a3d 100644
--- a/lectures/theil_2.md
+++ b/lectures/theil_2.md
@@ -41,7 +41,7 @@ problems.
The property justifies a two-step algorithm for computing optimal decision rules:
1. *Optimize* under perfect foresight (treat future exogenous variables as known).
-2. *Forecast* — substitute optimal forecasts for the unknown future values.
+2. *Forecast* -- substitute optimal forecasts for the unknown future values.
This lecture extends the certainty equivalence property in two directions motivated by
{cite}`hansen2004certainty`:
@@ -58,8 +58,8 @@ This lecture extends the certainty equivalence property in two directions motiva
parameter $\theta$ and the risk-sensitivity parameter $\sigma$ are linked by
$\theta = -\sigma^{-1}$.
-We illustrate all three settings — ordinary CE, robust CE, and the permanent income
-application — with Python code using `quantecon`.
+We illustrate all three settings -- ordinary CE, robust CE, and the permanent income
+application -- with Python code using `quantecon`.
### Model features
@@ -138,7 +138,7 @@ gain $F$ is invariant to the noise level $\sigma$ while $d$ grows with it.
---
mystnb:
figure:
- caption: CE principle — policy vs. value
+ caption: CE principle -- policy vs. value
name: fig-ce-policy-value
---
a, b_coeff = 0.9, 1.0
@@ -180,7 +180,7 @@ plt.show()
### Setup and the multiplier problem
-The decision maker in Simon and Theil's setting knows his model exactly — he has
+The decision maker in Simon and Theil's setting knows his model exactly -- he has
no doubt about the transition law {eq}`eq:z_transition_o`.
Now suppose he suspects that the true
@@ -207,7 +207,7 @@ where $\eta_0$ parametrises the tolerated misspecification budget and $\hat{\mat
is the expectation under the distorted law {eq}`eq:distorted_law`.
To construct a *robust* decision rule the decision maker solves the
-**multiplier problem** — a two-player zero-sum dynamic game:
+**multiplier problem** -- a two-player zero-sum dynamic game:
```{math}
:label: eq:multiplier
@@ -555,7 +555,7 @@ equation is
With $\beta R = 1$ (Hall's case), this is
$\mathbb{E}_t[\mu_{c,t+1}] = \mu_{ct}$, i.e., the **marginal utility of
-consumption is a martingale** — equivalently, consumption follows a random walk.
+consumption is a martingale** -- equivalently, consumption follows a random walk.
The optimal policy is $\mu_{ct} = -F y_t$ where, from the solved-forward
Euler equation, $F = [(R-1),\ (R-1)/(R - \rho)]$.
@@ -594,7 +594,7 @@ The consumption rule takes the certainty-equivalent form
\sum_{j=0}^{\infty} R^{-j}(z_{t+j} - b)\right]\right)
```
-where $h_1$ — the first step of the CE algorithm — is *identical* to the
+where $h_1$ -- the first step of the CE algorithm -- is *identical* to the
non-robust case.
Only the expectations operator changes.
@@ -607,7 +607,7 @@ The resulting AR(1) dynamics for $\mu_{ct}$ become:
```
with $\tilde{\varphi} < 1$, implying $\mathbb{E}_t[c_{t+1}] > c_t$ under the
-approximating model — a form of **precautionary saving**.
+approximating model -- a form of **precautionary saving**.
The observational equivalence formula {eq}`eq:oe_locus` (derived below) immediately
gives the robust AR(1) coefficient: $\tilde{\varphi} = 1/(\tilde{\beta} R)$
@@ -722,8 +722,8 @@ plt.show()
```
The plot confirms the paper's key finding: *activating a preference for
-robustness is observationally equivalent — for consumption and saving behaviour
-— to increasing the discount factor*.
+robustness is observationally equivalent -- for consumption and saving behaviour
+-- to increasing the discount factor*.
However, {cite:t}`HST_1999` show that the two
parametrisations do *not* imply the same asset prices.
diff --git a/lectures/two_computation.md b/lectures/two_computation.md
index 606b3daef..38ec17088 100644
--- a/lectures/two_computation.md
+++ b/lectures/two_computation.md
@@ -2078,7 +2078,7 @@ The declining profile among retirees reflects the actuarial calculation: older r
We now plot the aggregate transition paths for the labor tax, government debt, capital, and consumption under both schemes.
```{code-cell} ipython3
-# hh, tech, ss0, ss1 already in scope — just alias from dict for readability
+# hh, tech, ss0, ss1 already in scope -- just alias from dict for readability
ss0_exp1 = exp1_exo['ss0']
ss1_exp1 = exp1_exo['ss1']
diff --git a/lectures/var_dmd.md b/lectures/var_dmd.md
index 8fa8d190f..ee8e8c2bd 100644
--- a/lectures/var_dmd.md
+++ b/lectures/var_dmd.md
@@ -601,9 +601,14 @@ This is a consequence of a result established by Tu et al. {cite}`tu_Rowley` t
-**Proposition** The $p$ columns of $\Phi$ are eigenvectors of $\hat A$.
+```{prf:proposition}
+:label: prop-dmd-eigenvectors
-**Proof:** From formula {eq}`eq:Phiformula` we have
+The $p$ columns of $\Phi$ are eigenvectors of $\hat A$.
+```
+
+```{prf:proof}
+From formula {eq}`eq:Phiformula` we have
$$
\begin{aligned}
@@ -620,20 +625,16 @@ $$
\hat A \Phi = \Phi \Lambda .
$$ (eq:APhiLambda)
-
-
Let $\phi_i$ be the $i$th column of $\Phi$ and $\lambda_i$ be the corresponding $i$ eigenvalue of $\tilde A$ from decomposition {eq}`eq:tildeAeigenred`.
Equating the $m \times 1$ vectors that appear on the two sides of equation {eq}`eq:APhiLambda` gives
-
$$
\hat A \phi_i = \lambda_i \phi_i .
$$
This equation confirms that $\phi_i$ is an eigenvector of $\hat A$ that corresponds to eigenvalue $\lambda_i$ of both $\tilde A$ and $\hat A$.
-
-This concludes the proof.
+```
Also see {cite}`DDSE_book` (p. 238)