forked from shokru/mlfactor.github.io
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbayes.html
More file actions
979 lines (922 loc) · 113 KB
/
bayes.html
File metadata and controls
979 lines (922 loc) · 113 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
<!DOCTYPE html>
<!-- lang fixed: was empty (lang="" xml:lang=""), which is a WCAG 3.1.1 failure;
     document content (title, author, chapter text) is English. -->
<html lang="en" xml:lang="en">
<head>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<title>Chapter 9 Bayesian methods | Machine Learning for Factor Investing</title>
<meta name="description" content="Chapter 9 Bayesian methods | Machine Learning for Factor Investing" />
<meta name="generator" content="bookdown 0.21 and GitBook 2.6.7" />
<!-- Open Graph / Twitter card metadata for link previews -->
<meta property="og:title" content="Chapter 9 Bayesian methods | Machine Learning for Factor Investing" />
<meta property="og:type" content="book" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Chapter 9 Bayesian methods | Machine Learning for Factor Investing" />
<meta name="author" content="Guillaume Coqueret and Tony Guida" />
<meta name="date" content="2021-01-08" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black" />
<!-- sequential-navigation hints used by the GitBook reader (prev/next chapter) -->
<link rel="prev" href="svm.html"/>
<link rel="next" href="valtune.html"/>
<!-- GitBook theme assets and bookdown plugins (generated; load order matters) -->
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-clipboard.css" rel="stylesheet" />
<script src="libs/accessible-code-block-0.0.1/empty-anchor.js"></script>
<link href="libs/anchor-sections-1.0/anchor-sections.css" rel="stylesheet" />
<script src="libs/anchor-sections-1.0/anchor-sections.js"></script>
<script src="libs/kePrint-0.0.1/kePrint.js"></script>
<link href="libs/lightable-0.0.1/lightable.css" rel="stylesheet" />
<!-- Pandoc/bookdown syntax-highlighting styles for fenced code blocks.
     CSS rules preserved verbatim (cascade order matters); only the redundant
     type="text/css" attribute was dropped (it is the HTML5 default). -->
<style>
/* Layout of highlighted source lines */
code.sourceCode > span { display: inline-block; line-height: 1.25; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode { white-space: pre; position: relative; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
code.sourceCode { white-space: pre-wrap; }
code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
/* CSS-counter-based line numbering for numbered code listings */
pre.numberSource code
  { counter-reset: source-line 0; }
pre.numberSource code > span
  { position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
  { content: counter(source-line);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
    color: #aaaaaa;
  }
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
div.sourceCode
  {   }
@media screen {
code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
/* Token colors per pandoc highlight class (two-letter class = token kind) */
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html"><i class="fa fa-check"></i>Preface</a><ul>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#what-this-book-is-not-about"><i class="fa fa-check"></i>What this book is not about</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#the-targeted-audience"><i class="fa fa-check"></i>The targeted audience</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#how-this-book-is-structured"><i class="fa fa-check"></i>How this book is structured</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#companion-website"><i class="fa fa-check"></i>Companion website</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#why-r"><i class="fa fa-check"></i>Why R?</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#coding-instructions"><i class="fa fa-check"></i>Coding instructions</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#acknowledgments"><i class="fa fa-check"></i>Acknowledgments</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#future-developments"><i class="fa fa-check"></i>Future developments</a></li>
</ul></li>
<li class="part"><span><b>I Introduction</b></span></li>
<li class="chapter" data-level="1" data-path="notdata.html"><a href="notdata.html"><i class="fa fa-check"></i><b>1</b> Notations and data</a><ul>
<li class="chapter" data-level="1.1" data-path="notdata.html"><a href="notdata.html#notations"><i class="fa fa-check"></i><b>1.1</b> Notations</a></li>
<li class="chapter" data-level="1.2" data-path="notdata.html"><a href="notdata.html#dataset"><i class="fa fa-check"></i><b>1.2</b> Dataset</a></li>
</ul></li>
<li class="chapter" data-level="2" data-path="intro.html"><a href="intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
<li class="chapter" data-level="2.1" data-path="intro.html"><a href="intro.html#context"><i class="fa fa-check"></i><b>2.1</b> Context</a></li>
<li class="chapter" data-level="2.2" data-path="intro.html"><a href="intro.html#portfolio-construction-the-workflow"><i class="fa fa-check"></i><b>2.2</b> Portfolio construction: the workflow</a></li>
<li class="chapter" data-level="2.3" data-path="intro.html"><a href="intro.html#machine-learning-is-no-magic-wand"><i class="fa fa-check"></i><b>2.3</b> Machine learning is no magic wand</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="factor.html"><a href="factor.html"><i class="fa fa-check"></i><b>3</b> Factor investing and asset pricing anomalies</a><ul>
<li class="chapter" data-level="3.1" data-path="factor.html"><a href="factor.html#introduction"><i class="fa fa-check"></i><b>3.1</b> Introduction</a></li>
<li class="chapter" data-level="3.2" data-path="factor.html"><a href="factor.html#detecting-anomalies"><i class="fa fa-check"></i><b>3.2</b> Detecting anomalies</a><ul>
<li class="chapter" data-level="3.2.1" data-path="factor.html"><a href="factor.html#challenges"><i class="fa fa-check"></i><b>3.2.1</b> Challenges</a></li>
<li class="chapter" data-level="3.2.2" data-path="factor.html"><a href="factor.html#simple-portfolio-sorts"><i class="fa fa-check"></i><b>3.2.2</b> Simple portfolio sorts </a></li>
<li class="chapter" data-level="3.2.3" data-path="factor.html"><a href="factor.html#factors"><i class="fa fa-check"></i><b>3.2.3</b> Factors</a></li>
<li class="chapter" data-level="3.2.4" data-path="factor.html"><a href="factor.html#predictive-regressions-sorts-and-p-value-issues"><i class="fa fa-check"></i><b>3.2.4</b> Predictive regressions, sorts, and p-value issues</a></li>
<li class="chapter" data-level="3.2.5" data-path="factor.html"><a href="factor.html#fama-macbeth-regressions"><i class="fa fa-check"></i><b>3.2.5</b> Fama-Macbeth regressions</a></li>
<li class="chapter" data-level="3.2.6" data-path="factor.html"><a href="factor.html#factor-competition"><i class="fa fa-check"></i><b>3.2.6</b> Factor competition</a></li>
<li class="chapter" data-level="3.2.7" data-path="factor.html"><a href="factor.html#advanced-techniques"><i class="fa fa-check"></i><b>3.2.7</b> Advanced techniques</a></li>
</ul></li>
<li class="chapter" data-level="3.3" data-path="factor.html"><a href="factor.html#factors-or-characteristics"><i class="fa fa-check"></i><b>3.3</b> Factors or characteristics?</a></li>
<li class="chapter" data-level="3.4" data-path="factor.html"><a href="factor.html#hot-topics-momentum-timing-and-esg"><i class="fa fa-check"></i><b>3.4</b> Hot topics: momentum, timing and ESG</a><ul>
<li class="chapter" data-level="3.4.1" data-path="factor.html"><a href="factor.html#factor-momentum"><i class="fa fa-check"></i><b>3.4.1</b> Factor momentum</a></li>
<li class="chapter" data-level="3.4.2" data-path="factor.html"><a href="factor.html#factor-timing"><i class="fa fa-check"></i><b>3.4.2</b> Factor timing</a></li>
<li class="chapter" data-level="3.4.3" data-path="factor.html"><a href="factor.html#the-green-factors"><i class="fa fa-check"></i><b>3.4.3</b> The green factors</a></li>
</ul></li>
<li class="chapter" data-level="3.5" data-path="factor.html"><a href="factor.html#the-links-with-machine-learning"><i class="fa fa-check"></i><b>3.5</b> The links with machine learning</a><ul>
<li class="chapter" data-level="3.5.1" data-path="factor.html"><a href="factor.html#a-short-list-of-recent-references"><i class="fa fa-check"></i><b>3.5.1</b> A short list of recent references</a></li>
<li class="chapter" data-level="3.5.2" data-path="factor.html"><a href="factor.html#explicit-connections-with-asset-pricing-models"><i class="fa fa-check"></i><b>3.5.2</b> Explicit connections with asset pricing models</a></li>
</ul></li>
<li class="chapter" data-level="3.6" data-path="factor.html"><a href="factor.html#coding-exercises"><i class="fa fa-check"></i><b>3.6</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="4" data-path="Data.html"><a href="Data.html"><i class="fa fa-check"></i><b>4</b> Data preprocessing</a><ul>
<li class="chapter" data-level="4.1" data-path="Data.html"><a href="Data.html#know-your-data"><i class="fa fa-check"></i><b>4.1</b> Know your data</a></li>
<li class="chapter" data-level="4.2" data-path="Data.html"><a href="Data.html#missing-data"><i class="fa fa-check"></i><b>4.2</b> Missing data</a></li>
<li class="chapter" data-level="4.3" data-path="Data.html"><a href="Data.html#outlier-detection"><i class="fa fa-check"></i><b>4.3</b> Outlier detection</a></li>
<li class="chapter" data-level="4.4" data-path="Data.html"><a href="Data.html#feateng"><i class="fa fa-check"></i><b>4.4</b> Feature engineering</a><ul>
<li class="chapter" data-level="4.4.1" data-path="Data.html"><a href="Data.html#feature-selection"><i class="fa fa-check"></i><b>4.4.1</b> Feature selection</a></li>
<li class="chapter" data-level="4.4.2" data-path="Data.html"><a href="Data.html#scaling"><i class="fa fa-check"></i><b>4.4.2</b> Scaling the predictors</a></li>
</ul></li>
<li class="chapter" data-level="4.5" data-path="Data.html"><a href="Data.html#labelling"><i class="fa fa-check"></i><b>4.5</b> Labelling</a><ul>
<li class="chapter" data-level="4.5.1" data-path="Data.html"><a href="Data.html#simple-labels"><i class="fa fa-check"></i><b>4.5.1</b> Simple labels</a></li>
<li class="chapter" data-level="4.5.2" data-path="Data.html"><a href="Data.html#categorical-labels"><i class="fa fa-check"></i><b>4.5.2</b> Categorical labels</a></li>
<li class="chapter" data-level="4.5.3" data-path="Data.html"><a href="Data.html#the-triple-barrier-method"><i class="fa fa-check"></i><b>4.5.3</b> The triple barrier method</a></li>
<li class="chapter" data-level="4.5.4" data-path="Data.html"><a href="Data.html#filtering-the-sample"><i class="fa fa-check"></i><b>4.5.4</b> Filtering the sample</a></li>
<li class="chapter" data-level="4.5.5" data-path="Data.html"><a href="Data.html#horizons"><i class="fa fa-check"></i><b>4.5.5</b> Return horizons</a></li>
</ul></li>
<li class="chapter" data-level="4.6" data-path="Data.html"><a href="Data.html#pers"><i class="fa fa-check"></i><b>4.6</b> Handling persistence</a></li>
<li class="chapter" data-level="4.7" data-path="Data.html"><a href="Data.html#extensions"><i class="fa fa-check"></i><b>4.7</b> Extensions</a><ul>
<li class="chapter" data-level="4.7.1" data-path="Data.html"><a href="Data.html#transforming-features"><i class="fa fa-check"></i><b>4.7.1</b> Transforming features</a></li>
<li class="chapter" data-level="4.7.2" data-path="Data.html"><a href="Data.html#macrovar"><i class="fa fa-check"></i><b>4.7.2</b> Macro-economic variables</a></li>
<li class="chapter" data-level="4.7.3" data-path="Data.html"><a href="Data.html#active-learning"><i class="fa fa-check"></i><b>4.7.3</b> Active learning</a></li>
</ul></li>
<li class="chapter" data-level="4.8" data-path="Data.html"><a href="Data.html#additional-code-and-results"><i class="fa fa-check"></i><b>4.8</b> Additional code and results</a><ul>
<li class="chapter" data-level="4.8.1" data-path="Data.html"><a href="Data.html#impact-of-rescaling-graphical-representation"><i class="fa fa-check"></i><b>4.8.1</b> Impact of rescaling: graphical representation</a></li>
<li class="chapter" data-level="4.8.2" data-path="Data.html"><a href="Data.html#impact-of-rescaling-toy-example"><i class="fa fa-check"></i><b>4.8.2</b> Impact of rescaling: toy example</a></li>
</ul></li>
<li class="chapter" data-level="4.9" data-path="Data.html"><a href="Data.html#coding-exercises-1"><i class="fa fa-check"></i><b>4.9</b> Coding exercises</a></li>
</ul></li>
<li class="part"><span><b>II Common supervised algorithms</b></span></li>
<li class="chapter" data-level="5" data-path="lasso.html"><a href="lasso.html"><i class="fa fa-check"></i><b>5</b> Penalized regressions and sparse hedging for minimum variance portfolios</a><ul>
<li class="chapter" data-level="5.1" data-path="lasso.html"><a href="lasso.html#penalized-regressions"><i class="fa fa-check"></i><b>5.1</b> Penalized regressions</a><ul>
<li class="chapter" data-level="5.1.1" data-path="lasso.html"><a href="lasso.html#penreg"><i class="fa fa-check"></i><b>5.1.1</b> Simple regressions</a></li>
<li class="chapter" data-level="5.1.2" data-path="lasso.html"><a href="lasso.html#forms-of-penalizations"><i class="fa fa-check"></i><b>5.1.2</b> Forms of penalizations</a></li>
<li class="chapter" data-level="5.1.3" data-path="lasso.html"><a href="lasso.html#illustrations"><i class="fa fa-check"></i><b>5.1.3</b> Illustrations</a></li>
</ul></li>
<li class="chapter" data-level="5.2" data-path="lasso.html"><a href="lasso.html#sparse-hedging-for-minimum-variance-portfolios"><i class="fa fa-check"></i><b>5.2</b> Sparse hedging for minimum variance portfolios</a><ul>
<li class="chapter" data-level="5.2.1" data-path="lasso.html"><a href="lasso.html#presentation-and-derivations"><i class="fa fa-check"></i><b>5.2.1</b> Presentation and derivations</a></li>
<li class="chapter" data-level="5.2.2" data-path="lasso.html"><a href="lasso.html#sparseex"><i class="fa fa-check"></i><b>5.2.2</b> Example</a></li>
</ul></li>
<li class="chapter" data-level="5.3" data-path="lasso.html"><a href="lasso.html#predictive-regressions"><i class="fa fa-check"></i><b>5.3</b> Predictive regressions</a><ul>
<li class="chapter" data-level="5.3.1" data-path="lasso.html"><a href="lasso.html#literature-review-and-principle"><i class="fa fa-check"></i><b>5.3.1</b> Literature review and principle</a></li>
<li class="chapter" data-level="5.3.2" data-path="lasso.html"><a href="lasso.html#code-and-results"><i class="fa fa-check"></i><b>5.3.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="5.4" data-path="lasso.html"><a href="lasso.html#coding-exercise"><i class="fa fa-check"></i><b>5.4</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="6" data-path="trees.html"><a href="trees.html"><i class="fa fa-check"></i><b>6</b> Tree-based methods</a><ul>
<li class="chapter" data-level="6.1" data-path="trees.html"><a href="trees.html#simple-trees"><i class="fa fa-check"></i><b>6.1</b> Simple trees</a><ul>
<li class="chapter" data-level="6.1.1" data-path="trees.html"><a href="trees.html#principle"><i class="fa fa-check"></i><b>6.1.1</b> Principle</a></li>
<li class="chapter" data-level="6.1.2" data-path="trees.html"><a href="trees.html#treeclass"><i class="fa fa-check"></i><b>6.1.2</b> Further details on classification</a></li>
<li class="chapter" data-level="6.1.3" data-path="trees.html"><a href="trees.html#pruning-criteria"><i class="fa fa-check"></i><b>6.1.3</b> Pruning criteria</a></li>
<li class="chapter" data-level="6.1.4" data-path="trees.html"><a href="trees.html#code-and-interpretation"><i class="fa fa-check"></i><b>6.1.4</b> Code and interpretation</a></li>
</ul></li>
<li class="chapter" data-level="6.2" data-path="trees.html"><a href="trees.html#random-forests"><i class="fa fa-check"></i><b>6.2</b> Random forests</a><ul>
<li class="chapter" data-level="6.2.1" data-path="trees.html"><a href="trees.html#principle-1"><i class="fa fa-check"></i><b>6.2.1</b> Principle</a></li>
<li class="chapter" data-level="6.2.2" data-path="trees.html"><a href="trees.html#code-and-results-1"><i class="fa fa-check"></i><b>6.2.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="6.3" data-path="trees.html"><a href="trees.html#adaboost"><i class="fa fa-check"></i><b>6.3</b> Boosted trees: Adaboost</a><ul>
<li class="chapter" data-level="6.3.1" data-path="trees.html"><a href="trees.html#methodology"><i class="fa fa-check"></i><b>6.3.1</b> Methodology</a></li>
<li class="chapter" data-level="6.3.2" data-path="trees.html"><a href="trees.html#illustration"><i class="fa fa-check"></i><b>6.3.2</b> Illustration</a></li>
</ul></li>
<li class="chapter" data-level="6.4" data-path="trees.html"><a href="trees.html#boosted-trees-extreme-gradient-boosting"><i class="fa fa-check"></i><b>6.4</b> Boosted trees: extreme gradient boosting</a><ul>
<li class="chapter" data-level="6.4.1" data-path="trees.html"><a href="trees.html#managing-loss"><i class="fa fa-check"></i><b>6.4.1</b> Managing loss</a></li>
<li class="chapter" data-level="6.4.2" data-path="trees.html"><a href="trees.html#penalization"><i class="fa fa-check"></i><b>6.4.2</b> Penalization</a></li>
<li class="chapter" data-level="6.4.3" data-path="trees.html"><a href="trees.html#aggregation"><i class="fa fa-check"></i><b>6.4.3</b> Aggregation</a></li>
<li class="chapter" data-level="6.4.4" data-path="trees.html"><a href="trees.html#tree-structure"><i class="fa fa-check"></i><b>6.4.4</b> Tree structure</a></li>
<li class="chapter" data-level="6.4.5" data-path="trees.html"><a href="trees.html#boostext"><i class="fa fa-check"></i><b>6.4.5</b> Extensions</a></li>
<li class="chapter" data-level="6.4.6" data-path="trees.html"><a href="trees.html#boostcode"><i class="fa fa-check"></i><b>6.4.6</b> Code and results</a></li>
<li class="chapter" data-level="6.4.7" data-path="trees.html"><a href="trees.html#instweight"><i class="fa fa-check"></i><b>6.4.7</b> Instance weighting</a></li>
</ul></li>
<li class="chapter" data-level="6.5" data-path="trees.html"><a href="trees.html#discussion"><i class="fa fa-check"></i><b>6.5</b> Discussion</a></li>
<li class="chapter" data-level="6.6" data-path="trees.html"><a href="trees.html#coding-exercises-2"><i class="fa fa-check"></i><b>6.6</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="NN.html"><a href="NN.html"><i class="fa fa-check"></i><b>7</b> Neural networks</a><ul>
<li class="chapter" data-level="7.1" data-path="NN.html"><a href="NN.html#the-original-perceptron"><i class="fa fa-check"></i><b>7.1</b> The original perceptron</a></li>
<li class="chapter" data-level="7.2" data-path="NN.html"><a href="NN.html#multilayer-perceptron"><i class="fa fa-check"></i><b>7.2</b> Multilayer perceptron</a><ul>
<li class="chapter" data-level="7.2.1" data-path="NN.html"><a href="NN.html#introduction-and-notations"><i class="fa fa-check"></i><b>7.2.1</b> Introduction and notations</a></li>
<li class="chapter" data-level="7.2.2" data-path="NN.html"><a href="NN.html#universal-approximation"><i class="fa fa-check"></i><b>7.2.2</b> Universal approximation</a></li>
<li class="chapter" data-level="7.2.3" data-path="NN.html"><a href="NN.html#backprop"><i class="fa fa-check"></i><b>7.2.3</b> Learning via back-propagation</a></li>
<li class="chapter" data-level="7.2.4" data-path="NN.html"><a href="NN.html#further-details-on-classification"><i class="fa fa-check"></i><b>7.2.4</b> Further details on classification</a></li>
</ul></li>
<li class="chapter" data-level="7.3" data-path="NN.html"><a href="NN.html#howdeep"><i class="fa fa-check"></i><b>7.3</b> How deep we should go and other practical issues</a><ul>
<li class="chapter" data-level="7.3.1" data-path="NN.html"><a href="NN.html#architectural-choices"><i class="fa fa-check"></i><b>7.3.1</b> Architectural choices</a></li>
<li class="chapter" data-level="7.3.2" data-path="NN.html"><a href="NN.html#frequency-of-weight-updates-and-learning-duration"><i class="fa fa-check"></i><b>7.3.2</b> Frequency of weight updates and learning duration</a></li>
<li class="chapter" data-level="7.3.3" data-path="NN.html"><a href="NN.html#penalizations-and-dropout"><i class="fa fa-check"></i><b>7.3.3</b> Penalizations and dropout</a></li>
</ul></li>
<li class="chapter" data-level="7.4" data-path="NN.html"><a href="NN.html#code-samples-and-comments-for-vanilla-mlp"><i class="fa fa-check"></i><b>7.4</b> Code samples and comments for vanilla MLP</a><ul>
<li class="chapter" data-level="7.4.1" data-path="NN.html"><a href="NN.html#regression-example"><i class="fa fa-check"></i><b>7.4.1</b> Regression example</a></li>
<li class="chapter" data-level="7.4.2" data-path="NN.html"><a href="NN.html#classification-example"><i class="fa fa-check"></i><b>7.4.2</b> Classification example</a></li>
<li class="chapter" data-level="7.4.3" data-path="NN.html"><a href="NN.html#custloss"><i class="fa fa-check"></i><b>7.4.3</b> Custom losses</a></li>
</ul></li>
<li class="chapter" data-level="7.5" data-path="NN.html"><a href="NN.html#recurrent-networks"><i class="fa fa-check"></i><b>7.5</b> Recurrent networks</a><ul>
<li class="chapter" data-level="7.5.1" data-path="NN.html"><a href="NN.html#presentation"><i class="fa fa-check"></i><b>7.5.1</b> Presentation</a></li>
<li class="chapter" data-level="7.5.2" data-path="NN.html"><a href="NN.html#code-and-results-2"><i class="fa fa-check"></i><b>7.5.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="7.6" data-path="NN.html"><a href="NN.html#other-common-architectures"><i class="fa fa-check"></i><b>7.6</b> Other common architectures</a><ul>
<li class="chapter" data-level="7.6.1" data-path="NN.html"><a href="NN.html#generative-aversarial-networks"><i class="fa fa-check"></i><b>7.6.1</b> Generative adversarial networks</a></li>
<li class="chapter" data-level="7.6.2" data-path="NN.html"><a href="NN.html#autoencoders"><i class="fa fa-check"></i><b>7.6.2</b> Autoencoders</a></li>
<li class="chapter" data-level="7.6.3" data-path="NN.html"><a href="NN.html#a-word-on-convolutional-networks"><i class="fa fa-check"></i><b>7.6.3</b> A word on convolutional networks</a></li>
<li class="chapter" data-level="7.6.4" data-path="NN.html"><a href="NN.html#advanced-architectures"><i class="fa fa-check"></i><b>7.6.4</b> Advanced architectures</a></li>
</ul></li>
<li class="chapter" data-level="7.7" data-path="NN.html"><a href="NN.html#coding-exercise-1"><i class="fa fa-check"></i><b>7.7</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="svm.html"><a href="svm.html"><i class="fa fa-check"></i><b>8</b> Support vector machines</a><ul>
<li class="chapter" data-level="8.1" data-path="svm.html"><a href="svm.html#svm-for-classification"><i class="fa fa-check"></i><b>8.1</b> SVM for classification</a></li>
<li class="chapter" data-level="8.2" data-path="svm.html"><a href="svm.html#svm-for-regression"><i class="fa fa-check"></i><b>8.2</b> SVM for regression</a></li>
<li class="chapter" data-level="8.3" data-path="svm.html"><a href="svm.html#practice"><i class="fa fa-check"></i><b>8.3</b> Practice</a></li>
<li class="chapter" data-level="8.4" data-path="svm.html"><a href="svm.html#coding-exercises-3"><i class="fa fa-check"></i><b>8.4</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="9" data-path="bayes.html"><a href="bayes.html"><i class="fa fa-check"></i><b>9</b> Bayesian methods</a><ul>
<li class="chapter" data-level="9.1" data-path="bayes.html"><a href="bayes.html#the-bayesian-framework"><i class="fa fa-check"></i><b>9.1</b> The Bayesian framework</a></li>
<li class="chapter" data-level="9.2" data-path="bayes.html"><a href="bayes.html#bayesian-sampling"><i class="fa fa-check"></i><b>9.2</b> Bayesian sampling</a><ul>
<li class="chapter" data-level="9.2.1" data-path="bayes.html"><a href="bayes.html#gibbs-sampling"><i class="fa fa-check"></i><b>9.2.1</b> Gibbs sampling</a></li>
<li class="chapter" data-level="9.2.2" data-path="bayes.html"><a href="bayes.html#metropolis-hastings-sampling"><i class="fa fa-check"></i><b>9.2.2</b> Metropolis-Hastings sampling</a></li>
</ul></li>
<li class="chapter" data-level="9.3" data-path="bayes.html"><a href="bayes.html#bayesian-linear-regression"><i class="fa fa-check"></i><b>9.3</b> Bayesian linear regression</a></li>
<li class="chapter" data-level="9.4" data-path="bayes.html"><a href="bayes.html#naive-bayes-classifier"><i class="fa fa-check"></i><b>9.4</b> Naive Bayes classifier</a></li>
<li class="chapter" data-level="9.5" data-path="bayes.html"><a href="bayes.html#BART"><i class="fa fa-check"></i><b>9.5</b> Bayesian additive trees</a><ul>
<li class="chapter" data-level="9.5.1" data-path="bayes.html"><a href="bayes.html#general-formulation"><i class="fa fa-check"></i><b>9.5.1</b> General formulation</a></li>
<li class="chapter" data-level="9.5.2" data-path="bayes.html"><a href="bayes.html#priors"><i class="fa fa-check"></i><b>9.5.2</b> Priors</a></li>
<li class="chapter" data-level="9.5.3" data-path="bayes.html"><a href="bayes.html#sampling-and-predictions"><i class="fa fa-check"></i><b>9.5.3</b> Sampling and predictions</a></li>
<li class="chapter" data-level="9.5.4" data-path="bayes.html"><a href="bayes.html#code"><i class="fa fa-check"></i><b>9.5.4</b> Code</a></li>
</ul></li>
</ul></li>
<li class="part"><span><b>III From predictions to portfolios</b></span></li>
<li class="chapter" data-level="10" data-path="valtune.html"><a href="valtune.html"><i class="fa fa-check"></i><b>10</b> Validating and tuning</a><ul>
<li class="chapter" data-level="10.1" data-path="valtune.html"><a href="valtune.html#mlmetrics"><i class="fa fa-check"></i><b>10.1</b> Learning metrics</a><ul>
<li class="chapter" data-level="10.1.1" data-path="valtune.html"><a href="valtune.html#regression-analysis"><i class="fa fa-check"></i><b>10.1.1</b> Regression analysis</a></li>
<li class="chapter" data-level="10.1.2" data-path="valtune.html"><a href="valtune.html#classification-analysis"><i class="fa fa-check"></i><b>10.1.2</b> Classification analysis</a></li>
</ul></li>
<li class="chapter" data-level="10.2" data-path="valtune.html"><a href="valtune.html#validation"><i class="fa fa-check"></i><b>10.2</b> Validation</a><ul>
<li class="chapter" data-level="10.2.1" data-path="valtune.html"><a href="valtune.html#the-variance-bias-tradeoff-theory"><i class="fa fa-check"></i><b>10.2.1</b> The variance-bias tradeoff: theory</a></li>
<li class="chapter" data-level="10.2.2" data-path="valtune.html"><a href="valtune.html#the-variance-bias-tradeoff-illustration"><i class="fa fa-check"></i><b>10.2.2</b> The variance-bias tradeoff: illustration</a></li>
<li class="chapter" data-level="10.2.3" data-path="valtune.html"><a href="valtune.html#the-risk-of-overfitting-principle"><i class="fa fa-check"></i><b>10.2.3</b> The risk of overfitting: principle</a></li>
<li class="chapter" data-level="10.2.4" data-path="valtune.html"><a href="valtune.html#the-risk-of-overfitting-some-solutions"><i class="fa fa-check"></i><b>10.2.4</b> The risk of overfitting: some solutions</a></li>
</ul></li>
<li class="chapter" data-level="10.3" data-path="valtune.html"><a href="valtune.html#the-search-for-good-hyperparameters"><i class="fa fa-check"></i><b>10.3</b> The search for good hyperparameters</a><ul>
<li class="chapter" data-level="10.3.1" data-path="valtune.html"><a href="valtune.html#methods"><i class="fa fa-check"></i><b>10.3.1</b> Methods</a></li>
<li class="chapter" data-level="10.3.2" data-path="valtune.html"><a href="valtune.html#example-grid-search"><i class="fa fa-check"></i><b>10.3.2</b> Example: grid search</a></li>
<li class="chapter" data-level="10.3.3" data-path="valtune.html"><a href="valtune.html#example-bayesian-optimization"><i class="fa fa-check"></i><b>10.3.3</b> Example: Bayesian optimization</a></li>
</ul></li>
<li class="chapter" data-level="10.4" data-path="valtune.html"><a href="valtune.html#short-discussion-on-validation-in-backtests"><i class="fa fa-check"></i><b>10.4</b> Short discussion on validation in backtests</a></li>
</ul></li>
<li class="chapter" data-level="11" data-path="ensemble.html"><a href="ensemble.html"><i class="fa fa-check"></i><b>11</b> Ensemble models</a><ul>
<li class="chapter" data-level="11.1" data-path="ensemble.html"><a href="ensemble.html#linear-ensembles"><i class="fa fa-check"></i><b>11.1</b> Linear ensembles</a><ul>
<li class="chapter" data-level="11.1.1" data-path="ensemble.html"><a href="ensemble.html#principles"><i class="fa fa-check"></i><b>11.1.1</b> Principles</a></li>
<li class="chapter" data-level="11.1.2" data-path="ensemble.html"><a href="ensemble.html#example"><i class="fa fa-check"></i><b>11.1.2</b> Example</a></li>
</ul></li>
<li class="chapter" data-level="11.2" data-path="ensemble.html"><a href="ensemble.html#stacked-ensembles"><i class="fa fa-check"></i><b>11.2</b> Stacked ensembles</a><ul>
<li class="chapter" data-level="11.2.1" data-path="ensemble.html"><a href="ensemble.html#two-stage-training"><i class="fa fa-check"></i><b>11.2.1</b> Two-stage training</a></li>
<li class="chapter" data-level="11.2.2" data-path="ensemble.html"><a href="ensemble.html#code-and-results-3"><i class="fa fa-check"></i><b>11.2.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="11.3" data-path="ensemble.html"><a href="ensemble.html#extensions-1"><i class="fa fa-check"></i><b>11.3</b> Extensions</a><ul>
<li class="chapter" data-level="11.3.1" data-path="ensemble.html"><a href="ensemble.html#exogenous-variables"><i class="fa fa-check"></i><b>11.3.1</b> Exogenous variables</a></li>
<li class="chapter" data-level="11.3.2" data-path="ensemble.html"><a href="ensemble.html#shrinking-inter-model-correlations"><i class="fa fa-check"></i><b>11.3.2</b> Shrinking inter-model correlations</a></li>
</ul></li>
<li class="chapter" data-level="11.4" data-path="ensemble.html"><a href="ensemble.html#exercise"><i class="fa fa-check"></i><b>11.4</b> Exercise</a></li>
</ul></li>
<li class="chapter" data-level="12" data-path="backtest.html"><a href="backtest.html"><i class="fa fa-check"></i><b>12</b> Portfolio backtesting</a><ul>
<li class="chapter" data-level="12.1" data-path="backtest.html"><a href="backtest.html#protocol"><i class="fa fa-check"></i><b>12.1</b> Setting the protocol</a></li>
<li class="chapter" data-level="12.2" data-path="backtest.html"><a href="backtest.html#turning-signals-into-portfolio-weights"><i class="fa fa-check"></i><b>12.2</b> Turning signals into portfolio weights</a></li>
<li class="chapter" data-level="12.3" data-path="backtest.html"><a href="backtest.html#perfmet"><i class="fa fa-check"></i><b>12.3</b> Performance metrics</a><ul>
<li class="chapter" data-level="12.3.1" data-path="backtest.html"><a href="backtest.html#discussion-1"><i class="fa fa-check"></i><b>12.3.1</b> Discussion</a></li>
<li class="chapter" data-level="12.3.2" data-path="backtest.html"><a href="backtest.html#pure-performance-and-risk-indicators"><i class="fa fa-check"></i><b>12.3.2</b> Pure performance and risk indicators</a></li>
<li class="chapter" data-level="12.3.3" data-path="backtest.html"><a href="backtest.html#factor-based-evaluation"><i class="fa fa-check"></i><b>12.3.3</b> Factor-based evaluation</a></li>
<li class="chapter" data-level="12.3.4" data-path="backtest.html"><a href="backtest.html#risk-adjusted-measures"><i class="fa fa-check"></i><b>12.3.4</b> Risk-adjusted measures</a></li>
<li class="chapter" data-level="12.3.5" data-path="backtest.html"><a href="backtest.html#transaction-costs-and-turnover"><i class="fa fa-check"></i><b>12.3.5</b> Transaction costs and turnover</a></li>
</ul></li>
<li class="chapter" data-level="12.4" data-path="backtest.html"><a href="backtest.html#common-errors-and-issues"><i class="fa fa-check"></i><b>12.4</b> Common errors and issues</a><ul>
<li class="chapter" data-level="12.4.1" data-path="backtest.html"><a href="backtest.html#forward-looking-data"><i class="fa fa-check"></i><b>12.4.1</b> Forward looking data</a></li>
<li class="chapter" data-level="12.4.2" data-path="backtest.html"><a href="backtest.html#backov"><i class="fa fa-check"></i><b>12.4.2</b> Backtest overfitting</a></li>
<li class="chapter" data-level="12.4.3" data-path="backtest.html"><a href="backtest.html#simple-safeguards"><i class="fa fa-check"></i><b>12.4.3</b> Simple safeguards</a></li>
</ul></li>
<li class="chapter" data-level="12.5" data-path="backtest.html"><a href="backtest.html#implication-of-non-stationarity-forecasting-is-hard"><i class="fa fa-check"></i><b>12.5</b> Implication of non-stationarity: forecasting is hard</a><ul>
<li class="chapter" data-level="12.5.1" data-path="backtest.html"><a href="backtest.html#general-comments"><i class="fa fa-check"></i><b>12.5.1</b> General comments</a></li>
<li class="chapter" data-level="12.5.2" data-path="backtest.html"><a href="backtest.html#the-no-free-lunch-theorem"><i class="fa fa-check"></i><b>12.5.2</b> The no free lunch theorem</a></li>
</ul></li>
<li class="chapter" data-level="12.6" data-path="backtest.html"><a href="backtest.html#first-example-a-complete-backtest"><i class="fa fa-check"></i><b>12.6</b> First example: a complete backtest</a></li>
<li class="chapter" data-level="12.7" data-path="backtest.html"><a href="backtest.html#second-example-backtest-overfitting"><i class="fa fa-check"></i><b>12.7</b> Second example: backtest overfitting</a></li>
<li class="chapter" data-level="12.8" data-path="backtest.html"><a href="backtest.html#coding-exercises-4"><i class="fa fa-check"></i><b>12.8</b> Coding exercises</a></li>
</ul></li>
<li class="part"><span><b>IV Further important topics</b></span></li>
<li class="chapter" data-level="13" data-path="interp.html"><a href="interp.html"><i class="fa fa-check"></i><b>13</b> Interpretability</a><ul>
<li class="chapter" data-level="13.1" data-path="interp.html"><a href="interp.html#global-interpretations"><i class="fa fa-check"></i><b>13.1</b> Global interpretations</a><ul>
<li class="chapter" data-level="13.1.1" data-path="interp.html"><a href="interp.html#surr"><i class="fa fa-check"></i><b>13.1.1</b> Simple models as surrogates</a></li>
<li class="chapter" data-level="13.1.2" data-path="interp.html"><a href="interp.html#variable-importance"><i class="fa fa-check"></i><b>13.1.2</b> Variable importance (tree-based)</a></li>
<li class="chapter" data-level="13.1.3" data-path="interp.html"><a href="interp.html#variable-importance-agnostic"><i class="fa fa-check"></i><b>13.1.3</b> Variable importance (agnostic)</a></li>
<li class="chapter" data-level="13.1.4" data-path="interp.html"><a href="interp.html#partial-dependence-plot"><i class="fa fa-check"></i><b>13.1.4</b> Partial dependence plot</a></li>
</ul></li>
<li class="chapter" data-level="13.2" data-path="interp.html"><a href="interp.html#local-interpretations"><i class="fa fa-check"></i><b>13.2</b> Local interpretations</a><ul>
<li class="chapter" data-level="13.2.1" data-path="interp.html"><a href="interp.html#lime"><i class="fa fa-check"></i><b>13.2.1</b> LIME</a></li>
<li class="chapter" data-level="13.2.2" data-path="interp.html"><a href="interp.html#shapley-values"><i class="fa fa-check"></i><b>13.2.2</b> Shapley values</a></li>
<li class="chapter" data-level="13.2.3" data-path="interp.html"><a href="interp.html#breakdown"><i class="fa fa-check"></i><b>13.2.3</b> Breakdown</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="14" data-path="causality.html"><a href="causality.html"><i class="fa fa-check"></i><b>14</b> Two key concepts: causality and non-stationarity</a><ul>
<li class="chapter" data-level="14.1" data-path="causality.html"><a href="causality.html#causality-1"><i class="fa fa-check"></i><b>14.1</b> Causality</a><ul>
<li class="chapter" data-level="14.1.1" data-path="causality.html"><a href="causality.html#granger"><i class="fa fa-check"></i><b>14.1.1</b> Granger causality</a></li>
<li class="chapter" data-level="14.1.2" data-path="causality.html"><a href="causality.html#causal-additive-models"><i class="fa fa-check"></i><b>14.1.2</b> Causal additive models</a></li>
<li class="chapter" data-level="14.1.3" data-path="causality.html"><a href="causality.html#structural-time-series-models"><i class="fa fa-check"></i><b>14.1.3</b> Structural time series models</a></li>
</ul></li>
<li class="chapter" data-level="14.2" data-path="causality.html"><a href="causality.html#nonstat"><i class="fa fa-check"></i><b>14.2</b> Dealing with changing environments</a><ul>
<li class="chapter" data-level="14.2.1" data-path="causality.html"><a href="causality.html#non-stationarity-yet-another-illustration"><i class="fa fa-check"></i><b>14.2.1</b> Non-stationarity: yet another illustration</a></li>
<li class="chapter" data-level="14.2.2" data-path="causality.html"><a href="causality.html#online-learning"><i class="fa fa-check"></i><b>14.2.2</b> Online learning</a></li>
<li class="chapter" data-level="14.2.3" data-path="causality.html"><a href="causality.html#homogeneous-transfer-learning"><i class="fa fa-check"></i><b>14.2.3</b> Homogeneous transfer learning</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="15" data-path="unsup.html"><a href="unsup.html"><i class="fa fa-check"></i><b>15</b> Unsupervised learning</a><ul>
<li class="chapter" data-level="15.1" data-path="unsup.html"><a href="unsup.html#corpred"><i class="fa fa-check"></i><b>15.1</b> The problem with correlated predictors</a></li>
<li class="chapter" data-level="15.2" data-path="unsup.html"><a href="unsup.html#principal-component-analysis-and-autoencoders"><i class="fa fa-check"></i><b>15.2</b> Principal component analysis and autoencoders</a><ul>
<li class="chapter" data-level="15.2.1" data-path="unsup.html"><a href="unsup.html#a-bit-of-algebra"><i class="fa fa-check"></i><b>15.2.1</b> A bit of algebra</a></li>
<li class="chapter" data-level="15.2.2" data-path="unsup.html"><a href="unsup.html#pca"><i class="fa fa-check"></i><b>15.2.2</b> PCA</a></li>
<li class="chapter" data-level="15.2.3" data-path="unsup.html"><a href="unsup.html#ae"><i class="fa fa-check"></i><b>15.2.3</b> Autoencoders</a></li>
<li class="chapter" data-level="15.2.4" data-path="unsup.html"><a href="unsup.html#application"><i class="fa fa-check"></i><b>15.2.4</b> Application</a></li>
</ul></li>
<li class="chapter" data-level="15.3" data-path="unsup.html"><a href="unsup.html#clustering-via-k-means"><i class="fa fa-check"></i><b>15.3</b> Clustering via k-means</a></li>
<li class="chapter" data-level="15.4" data-path="unsup.html"><a href="unsup.html#nearest-neighbors"><i class="fa fa-check"></i><b>15.4</b> Nearest neighbors</a></li>
<li class="chapter" data-level="15.5" data-path="unsup.html"><a href="unsup.html#coding-exercise-2"><i class="fa fa-check"></i><b>15.5</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="16" data-path="RL.html"><a href="RL.html"><i class="fa fa-check"></i><b>16</b> Reinforcement learning</a><ul>
<li class="chapter" data-level="16.1" data-path="RL.html"><a href="RL.html#theoretical-layout"><i class="fa fa-check"></i><b>16.1</b> Theoretical layout</a><ul>
<li class="chapter" data-level="16.1.1" data-path="RL.html"><a href="RL.html#general-framework"><i class="fa fa-check"></i><b>16.1.1</b> General framework</a></li>
<li class="chapter" data-level="16.1.2" data-path="RL.html"><a href="RL.html#q-learning"><i class="fa fa-check"></i><b>16.1.2</b> Q-learning</a></li>
<li class="chapter" data-level="16.1.3" data-path="RL.html"><a href="RL.html#sarsa"><i class="fa fa-check"></i><b>16.1.3</b> SARSA</a></li>
</ul></li>
<li class="chapter" data-level="16.2" data-path="RL.html"><a href="RL.html#the-curse-of-dimensionality"><i class="fa fa-check"></i><b>16.2</b> The curse of dimensionality</a></li>
<li class="chapter" data-level="16.3" data-path="RL.html"><a href="RL.html#policy-gradient"><i class="fa fa-check"></i><b>16.3</b> Policy gradient</a><ul>
<li class="chapter" data-level="16.3.1" data-path="RL.html"><a href="RL.html#principle-2"><i class="fa fa-check"></i><b>16.3.1</b> Principle</a></li>
<li class="chapter" data-level="16.3.2" data-path="RL.html"><a href="RL.html#extensions-2"><i class="fa fa-check"></i><b>16.3.2</b> Extensions</a></li>
</ul></li>
<li class="chapter" data-level="16.4" data-path="RL.html"><a href="RL.html#simple-examples"><i class="fa fa-check"></i><b>16.4</b> Simple examples</a><ul>
<li class="chapter" data-level="16.4.1" data-path="RL.html"><a href="RL.html#q-learning-with-simulations"><i class="fa fa-check"></i><b>16.4.1</b> Q-learning with simulations</a></li>
<li class="chapter" data-level="16.4.2" data-path="RL.html"><a href="RL.html#RLemp2"><i class="fa fa-check"></i><b>16.4.2</b> Q-learning with market data</a></li>
</ul></li>
<li class="chapter" data-level="16.5" data-path="RL.html"><a href="RL.html#concluding-remarks"><i class="fa fa-check"></i><b>16.5</b> Concluding remarks</a></li>
<li class="chapter" data-level="16.6" data-path="RL.html"><a href="RL.html#exercises"><i class="fa fa-check"></i><b>16.6</b> Exercises</a></li>
</ul></li>
<li class="part"><span><b>V Appendix</b></span></li>
<li class="chapter" data-level="17" data-path="data-description.html"><a href="data-description.html"><i class="fa fa-check"></i><b>17</b> Data description</a></li>
<li class="chapter" data-level="18" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html"><i class="fa fa-check"></i><b>18</b> Solutions to exercises</a><ul>
<li class="chapter" data-level="18.1" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-3"><i class="fa fa-check"></i><b>18.1</b> Chapter 3</a></li>
<li class="chapter" data-level="18.2" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-4"><i class="fa fa-check"></i><b>18.2</b> Chapter 4</a></li>
<li class="chapter" data-level="18.3" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-5"><i class="fa fa-check"></i><b>18.3</b> Chapter 5</a></li>
<li class="chapter" data-level="18.4" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-6"><i class="fa fa-check"></i><b>18.4</b> Chapter 6</a></li>
<li class="chapter" data-level="18.5" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-7-the-autoencoder-model"><i class="fa fa-check"></i><b>18.5</b> Chapter 7: the autoencoder model</a></li>
<li class="chapter" data-level="18.6" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-8"><i class="fa fa-check"></i><b>18.6</b> Chapter 8</a></li>
<li class="chapter" data-level="18.7" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-11-ensemble-neural-network"><i class="fa fa-check"></i><b>18.7</b> Chapter 11: ensemble neural network</a></li>
<li class="chapter" data-level="18.8" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-12"><i class="fa fa-check"></i><b>18.8</b> Chapter 12</a><ul>
<li class="chapter" data-level="18.8.1" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#ew-portfolios-with-the-tidyverse"><i class="fa fa-check"></i><b>18.8.1</b> EW portfolios with the tidyverse</a></li>
<li class="chapter" data-level="18.8.2" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#advanced-weighting-function"><i class="fa fa-check"></i><b>18.8.2</b> Advanced weighting function</a></li>
<li class="chapter" data-level="18.8.3" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#functional-programming-in-the-backtest"><i class="fa fa-check"></i><b>18.8.3</b> Functional programming in the backtest</a></li>
</ul></li>
<li class="chapter" data-level="18.9" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-15"><i class="fa fa-check"></i><b>18.9</b> Chapter 15</a></li>
<li class="chapter" data-level="18.10" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-16"><i class="fa fa-check"></i><b>18.10</b> Chapter 16</a></li>
</ul></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Machine Learning for Factor Investing</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="bayes" class="section level1">
<h1><span class="header-section-number">Chapter 9</span> Bayesian methods</h1>
<p>This section is dedicated to the subset of machine learning that makes prior assumptions on parameters. Before we explain how Bayes’ theorem can be applied to simple building blocks in machine learning, we introduce some notations and concepts in the subsection below. Good references for Bayesian analysis are <span class="citation">Gelman et al. (<a href="#ref-gelman2013bayesian" role="doc-biblioref">2013</a>)</span> and <span class="citation">Kruschke (<a href="#ref-kruschke2014doing" role="doc-biblioref">2014</a>)</span>. The latter, like the present book, illustrates the concepts with many lines of R code.</p>
<div id="the-bayesian-framework" class="section level2">
<h2><span class="header-section-number">9.1</span> The Bayesian framework</h2>
<p>Up to now, the models that have been presented rely on data only. This approach is often referred to as ‘<strong>frequentist</strong>’. Given one dataset, a frequentist will extract (i.e., estimate) a unique set of optimal parameters and consider it to be the best model. Bayesians, on the other hand, consider datasets as <strong>snapshots of reality</strong> and, for them, parameters are thus random! Instead of estimating one value for parameters (e.g., a coefficient in a linear model), they are more ambitious and try to determine the <strong>whole distribution</strong> of the parameter.</p>
<p>In order to outline how that can be achieved, we introduce basic notations and results. The foundational concept in Bayesian analysis is the <strong>conditional probability</strong>. Given two random sets (or events) <span class="math inline">\(A\)</span> and <span class="math inline">\(B\)</span>, we define the probability of <span class="math inline">\(A\)</span> knowing <span class="math inline">\(B\)</span> (equivalently, the odds of having <span class="math inline">\(A\)</span>, conditionally on having <span class="math inline">\(B\)</span>) as
<span class="math display">\[P[A|B]=\frac{P[A \cap B]}{P[B]},\]</span>
that is, the probability of the intersection between the two sets divided by the probability of <span class="math inline">\(B\)</span>. Likewise, the probability that both events occur is equal to <span class="math inline">\(P[A \cap B] = P[A]P[B|A]\)</span>. Given <span class="math inline">\(n\)</span> disjoint events <span class="math inline">\(A_i\)</span>, <span class="math inline">\(i=1,...n\)</span> such that <span class="math inline">\(\sum_{i=1}^nP(A_i)=1\)</span>, then for any event <span class="math inline">\(B\)</span>, the law of total probabilities is (or implies)
<span class="math display">\[P(B)=\sum_{i=1}^nP(B \cap A_i)= \sum_{i=1}^nP(B|A_i)P(A_i).\]</span></p>
<p>Given this expression, we can formulate a general version of Bayes’ theorem:
<span class="math display" id="eq:bayes">\[\begin{equation}
\tag{9.1}
P(A_i|B)=\frac{P(A_i)P(B|A_i)}{P(B)}= \frac{P(A_i)P(B|A_i)}{\sum_{j=1}^nP(B|A_j)P(A_j)}.
\end{equation}\]</span></p>
<p>Endowed with this result, we can move forward to the core topic of this section, which is the estimation of some parameter <span class="math inline">\(\boldsymbol{\theta}\)</span> (possibly a vector) given a dataset, which we denote with <span class="math inline">\(\textbf{y}\)</span> thereby following the conventions from <span class="citation">Gelman et al. (<a href="#ref-gelman2013bayesian" role="doc-biblioref">2013</a>)</span>. This notation is nonetheless suboptimal in this book, because in all other chapters <span class="math inline">\(\textbf{y}\)</span> stands for the label of a dataset.</p>
<p>In Bayesian analysis, one sophistication (compared to a frequentist approach) comes from the fact that the data is not almighty. The distribution of the parameter <span class="math inline">\(\boldsymbol{\theta}\)</span> will be a mix between some <strong>prior</strong> distribution set by the statistician (the user, the analyst) and the empirical distribution from the data. More precisely, a simple application of Bayes’ formula yields
<span class="math display" id="eq:bayes2">\[\begin{equation}
\tag{9.2}
p(\boldsymbol{\theta}| \textbf{y})=\frac{p(\boldsymbol{\theta})p(\textbf{y} |\boldsymbol{\theta})}{p(\textbf{y})} \propto p(\boldsymbol{\theta})p(\textbf{y} |\boldsymbol{\theta}).
\end{equation}\]</span></p>
<p>The interpretation is immediate: the distribution of <span class="math inline">\(\boldsymbol{\theta}\)</span> knowing the data <span class="math inline">\(\textbf{y}\)</span> is proportional to the distribution of <span class="math inline">\(\boldsymbol{\theta}\)</span> times the distribution of <span class="math inline">\(\textbf{y}\)</span> knowing <span class="math inline">\(\boldsymbol{\theta}\)</span>. The term <span class="math inline">\(p(\textbf{y})\)</span> is often omitted because it is simply a scaling number that ensures that the density sums or integrates to one.</p>
<p>We use a slightly different notation between Equation <a href="bayes.html#eq:bayes">(9.1)</a> and Equation <a href="bayes.html#eq:bayes2">(9.2)</a>. In the former, <span class="math inline">\(P\)</span> denotes a true probability, i.e., it is a number. In the latter, <span class="math inline">\(p\)</span> stands for the whole probability density function of <span class="math inline">\(\boldsymbol{\theta}\)</span> or <span class="math inline">\(\textbf{y}\)</span>.</p>
<p>The whole purpose of Bayesian analysis is to compute the so-called <strong>posterior</strong> distribution <span class="math inline">\(p(\boldsymbol{\theta}| \textbf{y})\)</span> via the <strong>prior</strong> distribution <span class="math inline">\(p(\boldsymbol{\theta})\)</span> and the <strong>likelihood function</strong> <span class="math inline">\(p(\textbf{y} |\boldsymbol{\theta})\)</span>. Priors are sometimes qualified as informative, weakly informative or uninformative, depending on the degree to which the user is confident on the relevance and robustness of the prior. The simplest way to define a non-informative prior is to set a constant (uniform) distribution over some realistic interval(s).</p>
<p>The most challenging part is usually the likelihood function. The easiest way to solve the problem is to resort to a specific distribution (possibly a parametric family) for the distribution of the data and then consider that observations are i.i.d., just as in a simple maximum likelihood inference. If we assume that new parameters for the distributions are gathered into <span class="math inline">\(\boldsymbol{\lambda}\)</span>, then the likelihood can be written as
<span class="math display" id="eq:likelihood">\[\begin{equation}
\tag{9.3}
p(\textbf{y} |\boldsymbol{\theta}, \boldsymbol{\lambda})=\prod_{i=1}^I f_{\boldsymbol{\lambda}}(y_i; \boldsymbol{\theta}),
\end{equation}\]</span>
but in this case the problem becomes slightly more complex because adding new parameters changes the posterior distribution to <span class="math inline">\(p(\boldsymbol{\theta}, \boldsymbol{\lambda}|\textbf{y})\)</span>. The user must find out the joint distribution of <span class="math inline">\(\boldsymbol{\theta}\)</span> and <span class="math inline">\(\boldsymbol{\lambda}\)</span> - given <span class="math inline">\(\textbf{y}\)</span>. Because of their nested structure, these models are often called <strong>hierarchical models</strong>.</p>
<p>Bayesian methods are widely used for portfolio choice. The rationale is that the distribution of asset returns depends on some parameter and the main issue is to determine the posterior distribution. We very briefly review a vast literature below. Bayesian asset allocation is investigated in <span class="citation">Lai et al. (<a href="#ref-lai2011mean" role="doc-biblioref">2011</a>)</span> (via stochastic optimization), <span class="citation">Guidolin and Liu (<a href="#ref-guidolin2016ambiguity" role="doc-biblioref">2016</a>)</span> and <span class="citation">Dangl and Weissensteiner (<a href="#ref-dangl2020optimal" role="doc-biblioref">2020</a>)</span>. Shrinkage techniques (of means and covariance matrices) are tested in <span class="citation">Frost and Savarino (<a href="#ref-frost1986empirical" role="doc-biblioref">1986</a>)</span>, <span class="citation">Kan and Zhou (<a href="#ref-kan2007optimal" role="doc-biblioref">2007</a>)</span> and <span class="citation">DeMiguel, Martı́n-Utrera, and Nogales (<a href="#ref-demiguel2015parameter" role="doc-biblioref">2015</a>)</span>. In a similar vein, <span class="citation">Tu and Zhou (<a href="#ref-tu2010incorporating" role="doc-biblioref">2010</a>)</span> build priors that are coherent with asset pricing theories. Finally, <span class="citation">Bauder et al. (<a href="#ref-bauder2020bayesian" role="doc-biblioref">2020</a>)</span> sample portfolio returns, which makes it possible to derive a Bayesian optimal frontier. We invite the interested reader to also delve into the references that are cited within these few articles.</p>
</div>
<div id="bayesian-sampling" class="section level2">
<h2><span class="header-section-number">9.2</span> Bayesian sampling</h2>
<div id="gibbs-sampling" class="section level3">
<h3><span class="header-section-number">9.2.1</span> Gibbs sampling</h3>
<p>
One adjacent field of applications of Bayes’ theorem is <strong>simulation</strong>. Suppose we want to simulate the multivariate distribution of a random vector <span class="math inline">\(\textbf{X}\)</span> given by its density <span class="math inline">\(p=p(x_1,\dots,x_J)\)</span>. Often, the full distribution is complex, but its marginals are more accessible. Indeed, they are simpler because they depend on only one variable (when all other values are known):
<span class="math display">\[p(X_j=x_j|X_1= x_1,\dots,X_{j-1}=x_{j-1},X_{j+1}=x_{j+1},\dots,X_J=x_J)=p(X_j=x_j|\textbf{X}_{-j}=\textbf{x}_{-j}),\]</span>
where we use the compact notation <span class="math inline">\(\textbf{X}_{-j}\)</span> for all variables except <span class="math inline">\(X_j\)</span>. One way to generate samples with law <span class="math inline">\(p\)</span> is the following and relies both on the knowledge of the conditionals <span class="math inline">\(p(x_j|\textbf{x}_{-j})\)</span> and on the notion of <strong>Markov Chain Monte Carlo</strong>, which we outline below. The process is iterative and assumes that it is possible to draw samples of the aforementioned conditionals. We write <span class="math inline">\(x_j^{m}\)</span> for the <span class="math inline">\(m^{th}\)</span> sample of the <span class="math inline">\(j^{th}\)</span> variable (<span class="math inline">\(X_j\)</span>). The simulation starts with a prior (or fixed, or random) sample <span class="math inline">\(\textbf{x}^0=(x^0_1,\dots,x^0_J)\)</span>. Then, for a sufficiently large number of times, say <span class="math inline">\(T\)</span>, new samples are drawn according to
<span class="math display">\[\begin{align*}
x_1^{m+1} &\sim p(X_1|X_2=x_2^{m}, \dots ,X_J=x_J^m) ;\\
x_2^{m+1} &\sim p(X_2|X_1=x_1^{m+1}, X_3=x^{m}_3, \dots, X_J=x_J^m); \\
\dots& \\
x_J^{m+1}&\sim p(X_J|X_1=x_1^{m+1}, X_2=x_2^{m+1}, \dots, X_{J-1}=x_{J-1}^{m+1}).
\end{align*}\]</span></p>
<p>The important detail is that after each line, the value of the variable is updated. Hence, in the second line, <span class="math inline">\(X_2\)</span> is sampled with the knowledge of <span class="math inline">\(X_1=x_1^{m+1}\)</span> and in the last line, all variables except <span class="math inline">\(X_J\)</span> have been updated to their <span class="math inline">\((m+1)^{th}\)</span> state. The above algorithm is called Gibbs sampling. It relates to Markov chains because each new iteration depends only on the previous one.</p>
<p>Under some technical assumptions, as <span class="math inline">\(T\)</span> increases, the distribution of <span class="math inline">\(\textbf{x}_T\)</span> converges to that of <span class="math inline">\(p\)</span>. The conditions under which the convergence occurs have been widely discussed in series of articles in the 1990s. The interested reader can have a look for instance at <span class="citation">Tierney (<a href="#ref-tierney1994markov" role="doc-biblioref">1994</a>)</span>, <span class="citation">Roberts and Smith (<a href="#ref-roberts1994simple" role="doc-biblioref">1994</a>)</span>, as well as at section 11.7 of <span class="citation">Gelman et al. (<a href="#ref-gelman2013bayesian" role="doc-biblioref">2013</a>)</span>.</p>
<p>Sometimes, the full distribution is complex and the conditional laws are hard to determine and to sample. Then, a more general method, called Metropolis-Hastings, can be used that relies on the rejection method for the simulation of random variables.</p>
</div>
<div id="metropolis-hastings-sampling" class="section level3">
<h3><span class="header-section-number">9.2.2</span> Metropolis-Hastings sampling</h3>
<p>
The Gibbs algorithm can be considered as a particular case of the Metropolis-Hastings (MH) method, which, in its simplest version, was introduced in <span class="citation">Metropolis and Ulam (<a href="#ref-metropolis1949monte" role="doc-biblioref">1949</a>)</span>. The premise is similar: the aim is to simulate random variables that follow <span class="math inline">\(p(\textbf{x})\)</span> with the ability to sample from a simpler form <span class="math inline">\(p(\textbf{x}|\textbf{y})\)</span> which gives the probability of the future state <span class="math inline">\(\textbf{x}\)</span>, given the past one <span class="math inline">\(\textbf{y}\)</span>.</p>
<p>Once an initial value for <span class="math inline">\(\textbf{x}\)</span> has been sampled (<span class="math inline">\(\textbf{x}_0\)</span>), each new iteration (<span class="math inline">\(m\)</span>) of the simulation takes place in three stages:</p>
<ol style="list-style-type: decimal">
<li>generate a candidate value <span class="math inline">\(\textbf{x}'_{m+1}\)</span> from <span class="math inline">\(p(\textbf{x}|\textbf{x}_m)\)</span>,<br />
</li>
<li>compute the acceptance ratio <span class="math inline">\(\alpha=\min\left(1,\frac{p(\textbf{x}'_{m+1})p(\textbf{x}_{m}|\textbf{x}'_{m+1})}{p(\textbf{x}_{m})p(\textbf{x}'_{m+1}|\textbf{x}_{m})} \right)\)</span><br />
</li>
<li>pick <span class="math inline">\(\textbf{x}_{m+1}=\textbf{x}'_{m+1}\)</span> with probability <span class="math inline">\(\alpha\)</span> or stick with the previous value (<span class="math inline">\(\textbf{x}_{m+1}=\textbf{x}_{m}\)</span>) with probability <span class="math inline">\(1-\alpha\)</span>.</li>
</ol>
<p>The interpretation of the acceptance ratio is not straightforward in the general case. When the sampling generator is symmetric (<span class="math inline">\(p(\textbf{x}|\textbf{y})=p(\textbf{y}|\textbf{x})\)</span>), the candidate is always chosen whenever <span class="math inline">\(p(\textbf{x}'_{m+1})\ge p(\textbf{x}_{m})\)</span>. If the reverse condition holds (<span class="math inline">\(p(\textbf{x}'_{m+1})< p(\textbf{x}_{m})\)</span>), then the candidate is retained with probability equal to <span class="math inline">\(p(\textbf{x}'_{m+1})/p(\textbf{x}_{m})\)</span>, which is the ratio of likelihoods. The more likely the new proposal, the higher the odds of retaining it.</p>
<p>Often, the first simulations are discarded in order to leave time to the chain to converge to a high probability region. This procedure (often called ‘<em>burn in</em>’) ensures that the first retained samples are located in a zone that is likely, i.e., that they are more representative of the law we are trying to simulate.</p>
<p>For the sake of brevity, we stick to a succinct presentation here, but some additional details are outlined in section 11.2 of <span class="citation">Gelman et al. (<a href="#ref-gelman2013bayesian" role="doc-biblioref">2013</a>)</span> and in chapter 7 of <span class="citation">Kruschke (<a href="#ref-kruschke2014doing" role="doc-biblioref">2014</a>)</span>.</p>
</div>
</div>
<div id="bayesian-linear-regression" class="section level2">
<h2><span class="header-section-number">9.3</span> Bayesian linear regression</h2>
<p>
Because Bayesian concepts are rather abstract, it is useful to illustrate the theoretical notions with a simple example. In a linear model, <span class="math inline">\(y_i=\textbf{x}_i\textbf{b}+\epsilon_i\)</span> and it is often statistically assumed that the <span class="math inline">\(\epsilon_i\)</span> are i.i.d. and normally distributed with zero mean and variance <span class="math inline">\(\sigma^2\)</span>. Hence, the likelihood of Equation <a href="bayes.html#eq:likelihood">(9.3)</a> translates into
<span class="math display">\[p(\boldsymbol{\epsilon}|\textbf{b}, \sigma)=\prod_{i=1}^I\frac{e^{-\frac{\epsilon_i^2}{2\sigma^2}}}{\sigma\sqrt{2\pi}}=(\sigma\sqrt{2\pi})^{-I}e^{-\sum_{i=1}^I\frac{\epsilon_i^2}{2\sigma^2}}.\]</span></p>
<p>In a regression analysis, the data is given both by <span class="math inline">\(\textbf{y}\)</span> and by <span class="math inline">\(\textbf{X}\)</span>, hence both are reported in the notations. Simply acknowledging that <span class="math inline">\(\boldsymbol{\epsilon}=\textbf{y}-\textbf{Xb}\)</span>, we get
<span class="math display" id="eq:linlike">\[\begin{align}
p(\textbf{y},\textbf{X}|\textbf{b}, \sigma)&=\prod_{i=1}^I\frac{e^{-\frac{\epsilon_i^2}{2\sigma^2}}}{\sigma\sqrt{2\pi}}\\
&=(\sigma\sqrt{2\pi})^{-I}e^{-\sum_{i=1}^I\frac{\left(y_i-\textbf{x}_i'\textbf{b}\right)^2}{2\sigma^2}}=(\sigma\sqrt{2\pi})^{-I} e^{-\frac{\left(\textbf{y}-\textbf{X}\textbf{b}\right)' \left(\textbf{y}-\textbf{X}\textbf{b}\right)}{2\sigma^2}} \nonumber \\ \tag{9.4}
&=\underbrace{(\sigma\sqrt{2\pi})^{-I} e^{-\frac{\left(\textbf{y}-\textbf{X}\hat{\textbf{b}}\right)' \left(\textbf{y}-\textbf{X}\hat{\textbf{b}}\right)}{2\sigma^2}}}_{\text{depends on } \sigma, \text{ not } \textbf{b}}\times \underbrace{e^{-\frac{(\textbf{b}-\hat{\textbf{b}})'\textbf{X}'\textbf{X}(\textbf{b}-\hat{\textbf{b}})}{2\sigma^2}}}_{\text{ depends on both } \sigma, \text{ and } \textbf{b} }.
\end{align}\]</span>
In the last line, the second term is a function of the difference <span class="math inline">\(\textbf{b}-\hat{\textbf{b}}\)</span>, where <span class="math inline">\(\hat{\textbf{b}}=(\textbf{X}'\textbf{X})^{-1}\textbf{X}'\textbf{y}\)</span>. This is not surprising: <span class="math inline">\(\hat{\textbf{b}}\)</span> is a natural benchmark for the mean of <span class="math inline">\(\textbf{b}\)</span>. Moreover, introducing <span class="math inline">\(\hat{\textbf{b}}\)</span> yields a relatively simple form for the probability.</p>
<p>The above expression is the frequentist (data-based) block of the posterior: the likelihood. If we want to obtain a tractable expression for the posterior, we need to find a prior component that has a form that will combine well with this likelihood. These forms are called <strong>conjugate priors</strong>. A natural candidate for the right part (that depends on both <strong>b</strong> and <span class="math inline">\(\sigma\)</span>) is the multivariate Gaussian density:
<span class="math display" id="eq:linprior">\[\begin{equation}
\tag{9.5}
p[\textbf{b}|\sigma]=\sigma^{-k}e^{-\frac{(\textbf{b}-\textbf{b}_0)'\boldsymbol{\Lambda}_0(\textbf{b}-\textbf{b}_0)}{2\sigma^2}},
\end{equation}\]</span>
where we are obliged to condition with respect to <span class="math inline">\(\sigma\)</span>. The density has prior mean <span class="math inline">\(\textbf{b}_0\)</span> and prior covariance matrix <span class="math inline">\(\boldsymbol{\Lambda}_0^{-1}\)</span>. This prior gets us one step closer to the posterior because
<span class="math display" id="eq:cascade">\[\begin{align}
p[\textbf{b},\sigma|\textbf{y},\textbf{X}]& \propto p[\textbf{y},\textbf{X}|\textbf{b},\sigma]p[\textbf{b},\sigma] \nonumber \\
\tag{9.6}
&\propto p[\textbf{y},\textbf{X}|\textbf{b},\sigma]p[\textbf{b}|\sigma]p[\sigma].
\end{align}\]</span></p>
<p>In order to fully specify the cascade of probabilities, we need to take care of <span class="math inline">\(\sigma\)</span> and set a density of the form
<span class="math display" id="eq:linsig">\[\begin{equation}
\tag{9.7}
p[\sigma^2]\propto (\sigma^2)^{-1-a_0}e^{-\frac{b_0}{2\sigma^2}},
\end{equation}\]</span>
which is close to that of the left part of <a href="bayes.html#eq:linlike">(9.4)</a>. This corresponds to an inverse gamma distribution for the variance with prior parameters <span class="math inline">\(a_0\)</span> and <span class="math inline">\(b_0\)</span> (this scalar notation is not optimal because it can be confused with the prior mean <span class="math inline">\(\textbf{b}_0\)</span> so we must pay extra attention).</p>
<p>Now, we can simplify <span class="math inline">\(p[\textbf{b},\sigma|\textbf{y},\textbf{X}]\)</span> with <a href="bayes.html#eq:linlike">(9.4)</a>, <a href="bayes.html#eq:linprior">(9.5)</a> and <a href="bayes.html#eq:linsig">(9.7)</a>:
<span class="math display">\[\begin{align*}
p[\textbf{b},\sigma|\textbf{y},\textbf{X}]& \propto
(\sigma\sqrt{2\pi})^{-I} \sigma^{-2(1+a_0)} e^{-\frac{\left(\textbf{y}-\textbf{X}\hat{\textbf{b}}\right)' \left(\textbf{y}-\textbf{X}\hat{\textbf{b}}\right)}{2\sigma^2}} \\
&\quad \times e^{-\frac{(\textbf{b}-\hat{\textbf{b}})'\textbf{X}'\textbf{X}(\textbf{b}-\hat{\textbf{b}})}{2\sigma^2}}\sigma^{-k}e^{-\frac{(\textbf{b}-\textbf{b}_0)'\boldsymbol{\Lambda}_0(\textbf{b}-\textbf{b}_0)}{2\sigma^2}}e^{-\frac{b_0}{2\sigma^2}} \\
\end{align*}\]</span>
which can be rewritten
<span class="math display">\[\begin{align*}
p[\textbf{b},\sigma|\textbf{y},\textbf{X}]& \propto \sigma^{-I-k-2(1+a_0)} \\
&\times \exp\left(-\frac{\left(\textbf{y}-\textbf{X}\hat{\textbf{b}}\right)' \left(\textbf{y}-\textbf{X}\hat{\textbf{b}}\right) + (\textbf{b}-\hat{\textbf{b}})'\textbf{X}'\textbf{X}(\textbf{b}-\hat{\textbf{b}}) + (\textbf{b}-\textbf{b}_0)'\boldsymbol{\Lambda}_0(\textbf{b}-\textbf{b}_0)+b_0}{2\sigma^2} \right) .
\end{align*}\]</span></p>
<p>The above expression is simply a quadratic form in <span class="math inline">\(\textbf{b}\)</span> and it can be rewritten after burdensome algebra in a much more compact manner:
<span class="math display">\[\begin{equation}
\label{eq:linpost}
p(\textbf{b}|\textbf{y},\textbf{X},\sigma) \propto \left[\sigma^{-k}e^{-\frac{(\textbf{b}-\textbf{b}_*)'\boldsymbol{\Lambda}_*(\textbf{b}-\textbf{b}_*)}{2\sigma^2}}\right] \times \left[ (\sigma^2)^{-1-a_*}e^{-\frac{b_*}{2\sigma^2}} \right],
\end{equation}\]</span></p>
<p>where
<span class="math display">\[\begin{align*}
\boldsymbol{\Lambda}_* &= \textbf{X}'\textbf{X}+\boldsymbol{\Lambda}_0 \\
\textbf{b}_*&= \boldsymbol{\Lambda}_*^{-1}(\boldsymbol{\Lambda}_0\textbf{b}_0+\textbf{X}'\textbf{X}\hat{\textbf{b}}) \\
a_* & = a_0 + I/2 \\
b_* &=b_0+\frac{1}{2}\left(\textbf{y}'\textbf{y}+ \textbf{b}_0'\boldsymbol{\Lambda}_0\textbf{b}_0-\textbf{b}_*'\boldsymbol{\Lambda}_*\textbf{b}_* \right).\\
\end{align*}\]</span></p>
<p>This expression has two parts: the Gaussian component which relates mostly to <span class="math inline">\(\textbf{b}\)</span>, and the inverse gamma component, entirely dedicated to <span class="math inline">\(\sigma\)</span>. The mix between the prior and the data is clear. The posterior precision matrix of the Gaussian part (<span class="math inline">\(\boldsymbol{\Lambda}_*\)</span>) is the sum between the prior and a quadratic form from the data. The posterior mean <span class="math inline">\(\textbf{b}_*\)</span> is a weighted average of the prior <span class="math inline">\(\textbf{b}_0\)</span> and the sample estimator <span class="math inline">\(\hat{\textbf{b}}\)</span>. Such blends of quantities estimated from data and a user-supplied version are often called <strong>shrinkages</strong>. For instance, the original matrix of cross-terms <span class="math inline">\(\textbf{X}'\textbf{X}\)</span> is shrunk towards the prior <span class="math inline">\(\boldsymbol{\Lambda}_0\)</span>. This can be viewed as a <strong>regularization</strong> procedure: the pure fit originating from the data is mixed with some ‘external’ ingredient to give some structure to the final estimation.</p>
<p>The interested reader can also have a look at section 16.3 of <span class="citation">Greene (<a href="#ref-greene2018econometric" role="doc-biblioref">2018</a>)</span> (the case of conjugate priors is treated in subsection 16.3.2).</p>
<p>The formulae above can be long and risky to implement. Luckily, there is an R package (<span class="math inline">\(spBayes\)</span>) that performs Bayesian inference for linear regression using the conjugate priors. Below, we provide one example of how it works. To simplify the code and curtail computation times, we consider two predictors: market capitalization (size anomaly) and price-to-book ratio (value anomaly). In statistics, the <strong>precision matrix</strong> is the inverse of the covariance matrix. In the parameters, the first two priors relate to the Gaussian law and the last two to the inverse gamma distribution:
<span class="math display">\[f_\text{invgamma}(x, \alpha, \beta)=\frac{\beta^\alpha}{\Gamma(\alpha)}x^{-1-\alpha}e^{-\frac{\beta}{x}},\]</span>
where <span class="math inline">\(\alpha\)</span> is the shape and <span class="math inline">\(\beta\)</span> is the scale.</p>
<div class="sourceCode" id="cb108"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb108-1"><a href="bayes.html#cb108-1"></a>prior_mean <-<span class="st"> </span><span class="kw">c</span>(<span class="fl">0.01</span>,<span class="fl">0.1</span>,<span class="fl">0.1</span>) <span class="co"># Average value of parameters (prior)</span></span>
<span id="cb108-2"><a href="bayes.html#cb108-2"></a>precision_mat <-<span class="st"> </span><span class="kw">diag</span>(prior_mean<span class="op">^</span><span class="dv">2</span>) <span class="op">%>%</span><span class="st"> </span><span class="kw">solve</span>() <span class="co"># Inverse cov matrix of parameters (prior)</span></span>
<span id="cb108-3"><a href="bayes.html#cb108-3"></a>fit_lmBayes <-<span class="st"> </span><span class="kw">bayesLMConjugate</span>(</span>
<span id="cb108-4"><a href="bayes.html#cb108-4"></a> R1M_Usd <span class="op">~</span><span class="st"> </span>Mkt_Cap_3M_Usd <span class="op">+</span><span class="st"> </span>Pb, <span class="co"># Model: size and value</span></span>
<span id="cb108-5"><a href="bayes.html#cb108-5"></a> <span class="dt">data =</span> testing_sample, <span class="co"># Data source, here, the test sample</span></span>
<span id="cb108-6"><a href="bayes.html#cb108-6"></a> <span class="dt">n.samples =</span> <span class="dv">2000</span>, <span class="co"># Number of samples used</span></span>
<span id="cb108-7"><a href="bayes.html#cb108-7"></a> <span class="dt">beta.prior.mean =</span> prior_mean, <span class="co"># Avg prior: size & value rewarded & unit beta</span></span>
<span id="cb108-8"><a href="bayes.html#cb108-8"></a> <span class="dt">beta.prior.precision =</span> precision_mat, <span class="co"># Precision matrix</span></span>
<span id="cb108-9"><a href="bayes.html#cb108-9"></a> <span class="dt">prior.shape =</span> <span class="fl">0.5</span>, <span class="co"># Shape for prior distribution of sigma</span></span>
<span id="cb108-10"><a href="bayes.html#cb108-10"></a> <span class="dt">prior.rate =</span> <span class="fl">0.5</span>) <span class="co"># Scale for prior distribution of sigma</span></span></code></pre></div>
<p>In the above specification, we must also provide a prior for the constant. By default, we set its average value to 0.01, which corresponds to a 1% average monthly return. Once the model has been estimated, we can plot the distribution of coefficient estimates.</p>
<div class="sourceCode" id="cb109"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb109-1"><a href="bayes.html#cb109-1"></a>fit_lmBayes<span class="op">$</span>p.beta.tauSq.samples[,<span class="dv">1</span><span class="op">:</span><span class="dv">3</span>] <span class="op">%>%</span><span class="st"> </span><span class="kw">as_tibble</span>() <span class="op">%>%</span></span>
<span id="cb109-2"><a href="bayes.html#cb109-2"></a><span class="st"> `</span><span class="dt">colnames<-</span><span class="st">`</span>(<span class="kw">c</span>(<span class="st">"Intercept"</span>, <span class="st">"Size"</span>, <span class="st">"Value"</span>)) <span class="op">%>%</span></span>
<span id="cb109-3"><a href="bayes.html#cb109-3"></a><span class="st"> </span><span class="kw">gather</span>(<span class="dt">key =</span> coefficient, <span class="dt">value =</span> value) <span class="op">%>%</span></span>
<span id="cb109-4"><a href="bayes.html#cb109-4"></a><span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> value, <span class="dt">fill =</span> coefficient)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_histogram</span>(<span class="dt">alpha =</span> <span class="fl">0.5</span>)</span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:lmBayesplot"></span>
<img src="ML_factor_files/figure-html/lmBayesplot-1.png" alt="Distribution of linear regression coefficients (betas)." width="320px" />
<p class="caption">
FIGURE 9.1: Distribution of linear regression coefficients (betas).
</p>
</div>
<p>The distribution of the constant in Figure <a href="bayes.html#fig:lmBayesplot">9.1</a> is firmly to the right with a small dispersion, hence it is solidly positive. For the size coefficient, it is the opposite; it is negative (small firms are more profitable). With regard to value, it is hard to conclude, the distribution is balanced around zero: there is no clear exposition to the price-to-book ratio variable.</p>
</div>
<div id="naive-bayes-classifier" class="section level2">
<h2><span class="header-section-number">9.4</span> Naive Bayes classifier</h2>
<p>
</p>
<p>Bayes’ theorem can also be easily applied to <strong>classification</strong>. We formulate it with respect to the label and features and write
<span class="math display" id="eq:naivebayes">\[\begin{equation}
\tag{9.8}
P[\textbf{y} | \textbf{X}] = \frac{P[ \textbf{X} | \textbf{y}]P[\textbf{y}]}{P[\textbf{X}]} \propto P[ \textbf{X} | \textbf{y}]P[\textbf{y}],
\end{equation}\]</span>
and then split the input matrix into its column vectors <span class="math inline">\(\textbf{X}=(\textbf{x}_1,\dots,\textbf{x}_K)\)</span>. This yields
<span class="math display" id="eq:naivebayes2">\[\begin{equation}
\tag{9.9}
P[\textbf{y} | \textbf{x}_1,\dots,\textbf{x}_K] \propto P[\textbf{x}_1,\dots,\textbf{x}_K| \textbf{y}]P[\textbf{y}].
\end{equation}\]</span></p>
<p>The ‘naive’ qualification of the method comes from a simplifying assumption on the features.<a href="#fn19" class="footnote-ref" id="fnref19"><sup>19</sup></a> If they are all mutually independent, then the likelihood in the above expression can be expanded into
<span class="math display" id="eq:naivebayes3">\[\begin{equation}
\tag{9.10}
P[\textbf{y} | \textbf{x}_1,\dots,\textbf{x}_K] \propto P[\textbf{y}]\prod_{k=1}^K P[\textbf{x}_k| \textbf{y}].
\end{equation}\]</span></p>
<p>The next step is to be more specific about the likelihood. This can be done non-parametrically (via kernel estimation) or with common distributions (Gaussian for continuous data, Bernoulli for binary data). In factor investing, the features are continuous, thus the Gaussian law is more adequate:
<span class="math display">\[P[x_{i,k}=z|\textbf{y}_i= c]=\frac{e^{-\frac{(z-m_c)^2}{2\sigma_c^2}}}{\sigma_c\sqrt{2\pi}},\]</span>
where <span class="math inline">\(c\)</span> is the value of the classes taken by <span class="math inline">\(y\)</span> and <span class="math inline">\(\sigma_c\)</span> and <span class="math inline">\(m_c\)</span> are the standard deviation and mean of <span class="math inline">\(x_{i,k}\)</span>, conditional on <span class="math inline">\(y_i\)</span> being equal to <span class="math inline">\(c\)</span>. In practice, each class is spanned, the training set is filtered accordingly and <span class="math inline">\(\sigma_c\)</span> and <span class="math inline">\(m_c\)</span> are taken to be the sample statistics. This Gaussian parametrization is probably ill-suited to our dataset because the features are uniformly distributed. Even after conditioning, it is unlikely that the distribution will be even remotely close to Gaussian. Technically, this can be overcome via a double transformation method. Given a vector of features <span class="math inline">\(\textbf{x}_k\)</span> with empirical cdf <span class="math inline">\(F_{\textbf{x}_k}\)</span>, the variable
<span class="math display" id="eq:transf">\[\begin{equation}
\tag{9.11}
\tilde{\textbf{x}}_k=\Phi^{-1}\left(F_{\textbf{x}_k}(\textbf{x}_k) \right),
\end{equation}\]</span>
will have a standard normal law whenever <span class="math inline">\(F_{\textbf{x}_k}\)</span> is not pathological. Non-pathological cases are when the cdf is continuous and strictly increasing and when observations lie in the open interval (0,1). If all features are independent, the transformation should not have any impact on the correlation structure. Otherwise, we refer to the literature on the NORmal-To-Anything (NORTA) method (see, e.g., <span class="citation">Chen (<a href="#ref-chen2001initialization" role="doc-biblioref">2001</a>)</span> and <span class="citation">Coqueret (<a href="#ref-coqueret2017approximate" role="doc-biblioref">2017</a>)</span>).</p>
<p>Lastly, the prior <span class="math inline">\(P[\textbf{y}]\)</span> in Equation <a href="bayes.html#eq:naivebayes3">(9.10)</a> is often either taken to be uniform across the classes (<span class="math inline">\(1/K\)</span> for all <span class="math inline">\(k\)</span>) or equal to the sample distribution.</p>
<p>We illustrate the naive Bayes classification tool with a simple example. While the package <em>e1071</em> embeds such a classifier, the <em>naivebayes</em> library offers more options (Gaussian, Bernoulli, multinomial and nonparametric likelihoods). Below, since the features are uniformly distributed, the transformation in <a href="bayes.html#eq:transf">(9.11)</a> amounts to applying the Gaussian quantile function (inverse cdf).</p>
<p>For visual clarity, we only use the small set of features.</p>
<div class="sourceCode" id="cb110"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb110-1"><a href="bayes.html#cb110-1"></a><span class="kw">library</span>(naivebayes) <span class="co"># Load package</span></span>
<span id="cb110-2"><a href="bayes.html#cb110-2"></a>gauss_features_train <-<span class="st"> </span>training_sample <span class="op">%>%</span><span class="st"> </span><span class="co"># Build sample</span></span>
<span id="cb110-3"><a href="bayes.html#cb110-3"></a><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb110-4"><a href="bayes.html#cb110-4"></a><span class="st"> </span><span class="kw">as.matrix</span>() <span class="op">%>%</span></span>
<span id="cb110-5"><a href="bayes.html#cb110-5"></a><span class="st"> `</span><span class="dt">*</span><span class="st">`</span>(<span class="fl">0.999</span>) <span class="op">%>%</span><span class="st"> </span><span class="co"># Features smaller than 1</span></span>
<span id="cb110-6"><a href="bayes.html#cb110-6"></a><span class="st"> </span><span class="op">+</span><span class="st"> </span>(<span class="fl">0.0001</span>) <span class="op">%>%</span><span class="st"> </span><span class="co"># Features larger than 0</span></span>
<span id="cb110-7"><a href="bayes.html#cb110-7"></a><span class="st"> </span><span class="kw">qnorm</span>() <span class="op">%>%</span><span class="st"> </span><span class="co"># Inverse Gaussian cdf</span></span>
<span id="cb110-8"><a href="bayes.html#cb110-8"></a><span class="st"> `</span><span class="dt">colnames<-</span><span class="st">`</span>(features_short)</span>
<span id="cb110-9"><a href="bayes.html#cb110-9"></a>fit_NB_gauss <-<span class="st"> </span><span class="kw">naive_bayes</span>(<span class="dt">x =</span> gauss_features_train, <span class="co"># Transformed features</span></span>
<span id="cb110-10"><a href="bayes.html#cb110-10"></a> <span class="dt">y =</span> training_sample<span class="op">$</span>R1M_Usd_C) <span class="co"># Label</span></span>
<span id="cb110-11"><a href="bayes.html#cb110-11"></a><span class="kw">layout</span>(<span class="kw">matrix</span>(<span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">2</span>,<span class="dv">3</span>,<span class="dv">4</span>,<span class="dv">5</span>,<span class="dv">6</span>,<span class="dv">7</span>), <span class="dv">4</span>, <span class="dv">2</span>, <span class="dt">byrow =</span> <span class="ot">TRUE</span>), <span class="co"># Organize graphs</span></span>
<span id="cb110-12"><a href="bayes.html#cb110-12"></a> <span class="dt">widths=</span><span class="kw">c</span>(<span class="fl">0.9</span>,<span class="fl">0.45</span>))</span>
<span id="cb110-13"><a href="bayes.html#cb110-13"></a><span class="kw">par</span>(<span class="dt">mar=</span><span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">1</span>, <span class="dv">1</span>, <span class="dv">1</span>))</span>
<span id="cb110-14"><a href="bayes.html#cb110-14"></a><span class="kw">plot</span>(fit_NB_gauss, <span class="dt">prob =</span> <span class="st">"conditional"</span>)</span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:NB"></span>
<img src="ML_factor_files/figure-html/NB-1.png" alt="Distributions of predictor variables, conditional on the class of the label. TRUE is when the instance corresponds to an above median return and FALSE to a below median return." width="500px" />
<p class="caption">
FIGURE 9.2: Distributions of predictor variables, conditional on the class of the label. TRUE is when the instance corresponds to an above median return and FALSE to a below median return.
</p>
</div>
<p>The plots in Figure <a href="bayes.html#fig:NB">9.2</a> show the distributions of the features, conditionally on each value of the label. Essentially, those are the densities <span class="math inline">\(P[\textbf{x}_k| \textbf{y}]\)</span>. For each feature, both distributions are very similar.</p>
<p>As usual, once the model has been trained, the accuracy of predictions can be evaluated.</p>
<div class="sourceCode" id="cb111"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb111-1"><a href="bayes.html#cb111-1"></a>gauss_features_test <-<span class="st"> </span>testing_sample <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb111-2"><a href="bayes.html#cb111-2"></a><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb111-3"><a href="bayes.html#cb111-3"></a><span class="st"> </span><span class="kw">as.matrix</span>() <span class="op">%>%</span></span>
<span id="cb111-4"><a href="bayes.html#cb111-4"></a><span class="st"> `</span><span class="dt">*</span><span class="st">`</span>(<span class="fl">0.999</span>) <span class="op">%>%</span></span>
<span id="cb111-5"><a href="bayes.html#cb111-5"></a><span class="st"> </span><span class="op">+</span><span class="st"> </span>(<span class="fl">0.0001</span>) <span class="op">%>%</span></span>
<span id="cb111-6"><a href="bayes.html#cb111-6"></a><span class="st"> </span><span class="kw">qnorm</span>() <span class="op">%>%</span></span>
<span id="cb111-7"><a href="bayes.html#cb111-7"></a><span class="st"> `</span><span class="dt">colnames<-</span><span class="st">`</span>(features_short)</span>
<span id="cb111-8"><a href="bayes.html#cb111-8"></a><span class="kw">mean</span>(<span class="kw">predict</span>(fit_NB_gauss, gauss_features_test) <span class="op">==</span><span class="st"> </span>testing_sample<span class="op">$</span>R1M_Usd_C) <span class="co"># Hit ratio</span></span></code></pre></div>
<pre><code>## [1] 0.4956985</code></pre>
<p>The performance of the classifier is not satisfactory as it underperforms a random guess.</p>
</div>
<div id="BART" class="section level2">
<h2><span class="header-section-number">9.5</span> Bayesian additive trees</h2>
<p></p>
<div id="general-formulation" class="section level3">
<h3><span class="header-section-number">9.5.1</span> General formulation</h3>
<p>Bayesian additive regression trees (BARTs) are an ensemble technique that mixes Bayesian thinking and regression trees. In spirit, they are close to the tree ensembles seen in Chapter <a href="trees.html#trees">6</a>, but they differ greatly in their implementation. In BARTs like in Bayesian regressions, the regularization comes from the prior. The original article is <span class="citation">Chipman, George, and McCulloch (<a href="#ref-chipman2010bart" role="doc-biblioref">2010</a>)</span> and the implementation (in R) follows <span class="citation">Sparapani, Spanbauer, and McCulloch (<a href="#ref-sparapani2019r" role="doc-biblioref">2019</a>)</span>.</p>
<p>Formally, the model is an aggregation of <span class="math inline">\(M\)</span> models, which we write as
<span class="math display" id="eq:BART">\[\begin{equation}
\tag{9.12}
y = \sum_{m=1}^M\mathcal{T}_m(q_m,\textbf{w}_m, \textbf{x}) + \epsilon,
\end{equation}\]</span>
where <span class="math inline">\(\epsilon\)</span> is a Gaussian noise with variance <span class="math inline">\(\sigma^2\)</span>, and the <span class="math inline">\(\mathcal{T}_m=\mathcal{T}_m(q_m,\textbf{w}_m, \textbf{x})\)</span> are decision trees with structure <span class="math inline">\(q_m\)</span> and weights vectors <span class="math inline">\(\textbf{w}_m\)</span>. This decomposition of the tree is the one we used for boosted trees and is illustrated in Figure <a href="trees.html#fig:treeq">6.5</a>. <span class="math inline">\(q_m\)</span> codes all splits (variables chosen for the splits and levels of the splits) and the vectors <span class="math inline">\(\textbf{w}_m\)</span> correspond to the leaf values (at the terminal nodes).</p>
<p>At the macro-level, BARTs can be viewed as traditional Bayesian objects, where the parameters <span class="math inline">\(\boldsymbol{\theta}\)</span> are all of the unknowns coded through <span class="math inline">\(q_m\)</span>, <span class="math inline">\(\textbf{w}_m\)</span> and <span class="math inline">\(\sigma^2\)</span> and where the focus is set on determining the posterior
<span class="math display" id="eq:bartpost">\[\begin{equation}
\tag{9.13}
\left(q_m,\textbf{w}_m,\sigma^2\right) | (\textbf{X}, \textbf{Y}).
\end{equation}\]</span></p>
<p>Given particular forms of priors for <span class="math inline">\(\left(q_m,\textbf{w}_m,\sigma^2\right)\)</span>, the algorithm draws the parameters using a combination of Metropolis-Hastings <em>and</em> Gibbs samplers.</p>
</div>
<div id="priors" class="section level3">
<h3><span class="header-section-number">9.5.2</span> Priors</h3>
<p>
The definition of priors in tree models is delicate and intricate. The first important assumption is independence: independence between <span class="math inline">\(\sigma^2\)</span> and all other parameters and independence between trees, that is, between couples <span class="math inline">\((q_m,\textbf{w}_m)\)</span> and <span class="math inline">\((q_n,\textbf{w}_n)\)</span> for <span class="math inline">\(m\neq n\)</span>. This assumption makes BARTs closer to random forests in spirit and further from boosted trees. This independence entails</p>
<p><span class="math display">\[P(\left(q_1,\textbf{w}_1\right),\dots,\left(q_M,\textbf{w}_M\right),\sigma^2)=P(\sigma^2)\prod_{m=1}^MP\left(q_m,\textbf{w}_m\right).\]</span></p>
<p>Moreover, it is customary (for simplicity) to separate the structure of the tree (<span class="math inline">\(q_m\)</span>) and the terminal weights (<span class="math inline">\(\textbf{w}_m\)</span>), so that by a Bayesian conditioning</p>
<p><span class="math display" id="eq:bart1">\[\begin{equation}
\tag{9.14}
P(\left(q_1,\textbf{w}_1\right),\dots,\left(q_M,\textbf{w}_M\right),\sigma^2)=\underbrace{P(\sigma^2)}_{\text{noise term}}\prod_{m=1}^M\underbrace{P\left(\textbf{w}_m|q_m\right)}_{\text{tree weights}}\underbrace{P(q_m)}_{\text{tree struct.}}
\end{equation}\]</span></p>
<p>It remains to formulate the assumptions for each of the three parts.</p>
<p>We start with the trees’ structures, <span class="math inline">\(q_m\)</span>. Trees are defined by their splits (at nodes) and these splits are characterized by the splitting variable and the splitting level. First, the size of trees is parametrized such that a node at depth <span class="math inline">\(d\)</span> is nonterminal with probability given by
<span class="math display" id="eq:bartnode">\[\begin{equation}
\tag{9.15}
\alpha(1+d)^{-\beta}, \quad \alpha \in (0,1), \quad \beta >0.
\end{equation}\]</span>
The authors recommend to set <span class="math inline">\(\alpha = 0.95\)</span> and <span class="math inline">\(\beta=2\)</span>. This gives a probability of 5% to have 1 node, 55% to have 2 nodes, 28% to have 3 nodes, 9% to have 4 nodes and 3% to have 5 nodes. Thus, the aim is to force relatively shallow structures.</p>
<p>Second, the choice of splitting variables is driven by a generalized Bernoulli (categorical) distribution which defines the odds of picking one particular feature. In the original paper by <span class="citation">Chipman, George, and McCulloch (<a href="#ref-chipman2010bart" role="doc-biblioref">2010</a>)</span>, the vector of probabilities was uniform (each predictor has the same odds of being chosen for the split). This vector can also be random and sampled from a more flexible Dirichlet distribution. The level of the split is drawn uniformly on the set of possible values for the chosen predictor.</p>
<p>Having determined the prior of the structure of the tree <span class="math inline">\(q_m\)</span>, it remains to fix the terminal values at the leaves (<span class="math inline">\(\textbf{w}_m|q_m\)</span>). The weights at all leaves are assumed to follow a Gaussian distribution <span class="math inline">\(\mathcal{N}(\mu_\mu,\sigma_\mu^2)\)</span>, where <span class="math inline">\(\mu_\mu=(y_\text{min}+y_\text{max})/2\)</span> is the center of the range of the label values. The variance <span class="math inline">\(\sigma_\mu^2\)</span> is chosen such that <span class="math inline">\(\mu_\mu\)</span> plus or minus two times the standard deviation <span class="math inline">\(\sigma_\mu\)</span> covers 95% of the range observed in the training dataset. Those are default values and can be altered by the user.</p>
<p>Lastly, for computational purposes similar to those of linear regressions, the parameter <span class="math inline">\(\sigma^2\)</span> (the variance of <span class="math inline">\(\epsilon\)</span> in <a href="bayes.html#eq:BART">(9.12)</a>) is assumed to follow an inverse Gamma law <span class="math inline">\(\text{IG}(\nu/2,\lambda \nu/2)\)</span> akin to that used in Bayesian regressions. The parameters are by default computed from the data so that the distribution of <span class="math inline">\(\sigma^2\)</span> is realistic and prevents overfitting. We refer to the original article, section 2.2.4, for more details on this topic.</p>
<p>In sum, in addition to <span class="math inline">\(M\)</span> (number of trees), the prior depends on a small number of parameters: <span class="math inline">\(\alpha\)</span> and <span class="math inline">\(\beta\)</span> (for the tree structure), <span class="math inline">\(\mu_\mu\)</span> and <span class="math inline">\(\sigma_\mu^2\)</span> (for the tree weights) and <span class="math inline">\(\nu\)</span> and <span class="math inline">\(\lambda\)</span> (for the noise term).</p>
</div>
<div id="sampling-and-predictions" class="section level3">
<h3><span class="header-section-number">9.5.3</span> Sampling and predictions</h3>
<p>
The posterior distribution in <a href="bayes.html#eq:bartpost">(9.13)</a> cannot be obtained analytically but simulations are an efficient shortcut to the model <a href="bayes.html#eq:BART">(9.12)</a>. Just as in Gibbs and Metropolis-Hastings sampling, the distribution of simulations is expected to converge to the sought posterior. After some burn-in sample, a prediction for a newly observed set <span class="math inline">\(\textbf{x}_*\)</span> will simply be the average (or median) of the predictions from the simulations. If we assume <span class="math inline">\(S\)</span> simulations after burn-in, then the average is equal to
<span class="math display">\[\tilde{y}(\textbf{x}_*):=\frac{1}{S}\sum_{s=1}^S\sum_{m=1}^M\mathcal{T}_m\left(q_m^{(s)},\textbf{w}_m^{(s)}, \textbf{x}_*\right).\]</span></p>
<p>The complex part is naturally to generate the simulations. Each tree is sampled using the Metropolis-Hastings method: a tree is proposed, but it replaces the existing one only under some (possibly random) criterion. This procedure is then repeated in a Gibbs-like fashion.</p>
<p>Let us start with the MH building block. We seek to simulate the conditional distribution</p>
<p><span class="math display">\[(q_m,\textbf{w}_m) \ | \ (q_{-m},\textbf{w}_{-m},\sigma^2, \textbf{y}, \textbf{x}),\]</span></p>
<p>where <span class="math inline">\(q_{-m}\)</span> and <span class="math inline">\(\textbf{w}_{-m}\)</span> collect the structures and weights of all trees except for tree number <span class="math inline">\(m\)</span>. One tour de force in BART is to simplify the above Gibbs draws to
<span class="math display">\[(q_m,\textbf{w}_m) \ | \ (\textbf{R}_{m},\sigma^2 ),\]</span>
where <span class="math inline">\(\textbf{R}_{m}=\textbf{y}-\sum_{l \neq m}\mathcal{T}_l(q_l,\textbf{w}_l, \textbf{x})\)</span> is the partial residual on a prediction that excludes the <span class="math inline">\(m^{th}\)</span> tree.</p>
<p>The new MH proposition for <span class="math inline">\(q_m\)</span> is based on the previous tree and there are three possible (and random) alterations to the tree:<br />
- growing a terminal node (increase the complexity of the tree by adding a supplementary leaf);<br />
- pruning a pair of terminal nodes (the opposite operation: reducing complexity);<br />
- changing splitting rules.</p>
<p>For simplicity, the third option is often excluded. Once the tree structure is defined (i.e., sampled), the terminal weights are independently drawn according to a Gaussian distribution <span class="math inline">\(\mathcal{N}(\mu_\mu, \sigma_\mu^2)\)</span>.</p>
<p>After the tree is sampled, the MH principle requires that it be accepted or rejected based on some probability. This probability increases with the odds that the new tree increases the likelihood of the model. Its detailed computation is cumbersome and we refer to section 2.2 in <span class="citation">Sparapani, Spanbauer, and McCulloch (<a href="#ref-sparapani2019r" role="doc-biblioref">2019</a>)</span> for details on the matter.</p>
<p>Now, we must outline the overarching Gibbs procedure. First, the algorithm starts with trees that are simple nodes. Then, a specified number of loops include the following <em>sequential</em> steps:</p>
<table>
<colgroup>
<col width="55%" />
<col width="44%" />
</colgroup>
<thead>
<tr class="header">
<th align="center">Step</th>
<th align="left">Task</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="center">1</td>
<td align="left">sample <span class="math inline">\((q_1,\textbf{w}_1) \ | \ (\textbf{R}_{1},\sigma^2 )\)</span>;</td>
</tr>
<tr class="even">
<td align="center">2</td>
<td align="left">sample <span class="math inline">\((q_2,\textbf{w}_2) \ | \ (\textbf{R}_{2},\sigma^2 )\)</span>;</td>
</tr>
<tr class="odd">
<td align="center">…</td>
<td align="left">…;</td>
</tr>
<tr class="even">
<td align="center">m</td>
<td align="left">sample <span class="math inline">\((q_m,\textbf{w}_m) \ | \ (\textbf{R}_{m},\sigma^2 )\)</span>;</td>
</tr>
<tr class="odd">
<td align="center">…</td>
<td align="left">…;</td>
</tr>
<tr class="even">
<td align="center">M</td>
<td align="left">sample <span class="math inline">\((q_M,\textbf{w}_M) \ | \ (\textbf{R}_{M},\sigma^2 )\)</span>; (last tree)</td>
</tr>
<tr class="odd">
<td align="center">M+1</td>
<td align="left">sample <span class="math inline">\(\sigma^2\)</span> given the full residual <span class="math inline">\(\textbf{R}=\textbf{y}-\sum_{l=1}^M\mathcal{T}_l(q_l,\textbf{w}_l, \textbf{x})\)</span></td>
</tr>
</tbody>
</table>
<p>At each step <span class="math inline">\(m\)</span>, the residual <span class="math inline">\(\textbf{R}_{m}\)</span> is updated with the values from step <span class="math inline">\(m-1\)</span>. We illustrate this process in Figure <a href="bayes.html#fig:bartfig">9.3</a> in which <span class="math inline">\(M=3\)</span>. At step 1, a partition is proposed for the first tree, which is a simple node. In this particular case, the tree is accepted. In this scheme, the terminal weights are omitted for simplicity. At step 2, another partition is proposed for the second tree, but it is rejected. In the third step, the proposition for the third tree is accepted. After the third step, a new value for <span class="math inline">\(\sigma^2\)</span> is drawn and a new round of Gibbs sampling can commence.</p>
<div class="figure" style="text-align: center"><span id="fig:bartfig"></span>
<img src="images/bart.png" alt="Diagram of the MH/Gibbs sampling of BARTs. At step 2, the proposed tree is not validated." width="260px" />
<p class="caption">
FIGURE 9.3: Diagram of the MH/Gibbs sampling of BARTs. At step 2, the proposed tree is not validated.
</p>
</div>
</div>
<div id="code" class="section level3">
<h3><span class="header-section-number">9.5.4</span> Code</h3>
<p>There are several R packages that implement BART methods: <em>BART</em>, <em>bartMachine</em> and an older one (the original), <em>BayesTree</em>. The first one is highly efficient, hence we work with it. We resort to only a few parameters, like the power and base, which are the <span class="math inline">\(\beta\)</span> and <span class="math inline">\(\alpha\)</span> defined in <a href="bayes.html#eq:bartnode">(9.15)</a>. The program is a bit verbose and delivers a few parametric details.</p>
<div class="sourceCode" id="cb113"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb113-1"><a href="bayes.html#cb113-1"></a><span class="kw">library</span>(BART) <span class="co"># Load package</span></span>
<span id="cb113-2"><a href="bayes.html#cb113-2"></a>fit_bart <-<span class="st"> </span><span class="kw">gbart</span>( <span class="co"># Main function</span></span>
<span id="cb113-3"><a href="bayes.html#cb113-3"></a> <span class="dt">x.train =</span> dplyr<span class="op">::</span><span class="kw">select</span>(training_sample, features_short) <span class="op">%>%</span><span class="st"> </span><span class="co"># Training features</span></span>
<span id="cb113-4"><a href="bayes.html#cb113-4"></a><span class="st"> </span><span class="kw">data.frame</span>(), </span>
<span id="cb113-5"><a href="bayes.html#cb113-5"></a> <span class="dt">y.train =</span> dplyr<span class="op">::</span><span class="kw">select</span>(training_sample, R1M_Usd) <span class="op">%>%</span><span class="st"> </span><span class="co"># Training label</span></span>
<span id="cb113-6"><a href="bayes.html#cb113-6"></a><span class="st"> </span><span class="kw">as.matrix</span>() , </span>
<span id="cb113-7"><a href="bayes.html#cb113-7"></a> <span class="dt">x.test =</span> dplyr<span class="op">::</span><span class="kw">select</span>(testing_sample, features_short) <span class="op">%>%</span><span class="st"> </span><span class="co"># Testing features</span></span>
<span id="cb113-8"><a href="bayes.html#cb113-8"></a><span class="st"> </span><span class="kw">data.frame</span>(), </span>
<span id="cb113-9"><a href="bayes.html#cb113-9"></a> <span class="dt">type =</span> <span class="st">"wbart"</span>, <span class="co"># Option: label is continuous</span></span>
<span id="cb113-10"><a href="bayes.html#cb113-10"></a> <span class="dt">ntree =</span> <span class="dv">20</span>, <span class="co"># Number of trees in the model </span></span>
<span id="cb113-11"><a href="bayes.html#cb113-11"></a> <span class="dt">nskip =</span> <span class="dv">100</span>, <span class="co"># Size of burn-in sample</span></span>
<span id="cb113-12"><a href="bayes.html#cb113-12"></a> <span class="dt">ndpost =</span> <span class="dv">200</span>, <span class="co"># Number of posteriors drawn</span></span>
<span id="cb113-13"><a href="bayes.html#cb113-13"></a> <span class="dt">power =</span> <span class="dv">2</span>, <span class="co"># beta in the tree structure prior</span></span>
<span id="cb113-14"><a href="bayes.html#cb113-14"></a> <span class="dt">base =</span> <span class="fl">0.95</span>) <span class="co"># alpha in the tree structure prior</span></span></code></pre></div>
<pre><code>## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 198128, 7, 70208
## y1,yn: -0.049921, 0.024079
## x1,x[n*p]: 0.010000, 0.810000
## xp1,xp[np*p]: 0.270000, 0.880000
## *****Number of Trees: 20
## *****Number of Cut Points: 100 ... 100
## *****burn,nd,thin: 100,200,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,1.57391,3,2.84908e-31,0.0139209
## *****sigma: 0.000000
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,7,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 300)
## done 100 (out of 300)
## done 200 (out of 300)
## time: 30s
## trcnt,tecnt: 200,200</code></pre>
<p>Once the model is trained,<a href="#fn20" class="footnote-ref" id="fnref20"><sup>20</sup></a> we evaluate its performance. We simply compute the hit ratio. The predictions are embedded within the <em>fit_bart</em> variable, under the name ‘<em>yhat.test</em>’.</p>
<div class="sourceCode" id="cb115"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb115-1"><a href="bayes.html#cb115-1"></a><span class="kw">mean</span>(fit_bart<span class="op">$</span>yhat.test <span class="op">*</span><span class="st"> </span>testing_sample<span class="op">$</span>R1M_Usd <span class="op">></span><span class="st"> </span><span class="dv">0</span>)</span></code></pre></div>
<pre><code>## [1] 0.5433102</code></pre>
<p>The performance <em>seems</em> reasonable but is by no means impressive. The data from all sampled trees is available in the <em>fit_bart</em> variable. It has nonetheless a complex structure (as is often the case with trees). The simplest information we can extract is the value of <span class="math inline">\(\sigma\)</span> across all 300 simulations (see Figure <a href="bayes.html#fig:bartsigplot">9.4</a>).</p>
<div class="sourceCode" id="cb117"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb117-1"><a href="bayes.html#cb117-1"></a><span class="kw">data.frame</span>(<span class="dt">simulation =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">300</span>, <span class="dt">sigma =</span> fit_bart<span class="op">$</span>sigma) <span class="op">%>%</span></span>
<span id="cb117-2"><a href="bayes.html#cb117-2"></a><span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> simulation, <span class="dt">y =</span> sigma)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_point</span>(<span class="dt">size =</span> <span class="fl">0.7</span>)</span></code></pre></div>
<div class="figure"><span id="fig:bartsigplot"></span>
<img src="ML_factor_files/figure-html/bartsigplot-1.png" alt="Evolution of sigma across BART simulations." width="384" />
<p class="caption">
FIGURE 9.4: Evolution of sigma across BART simulations.
</p>
</div>
<p>And we see that, as the number of samples increases, <span class="math inline">\(\sigma\)</span> decreases.</p>
</div>
</div>
</div>
<h3>References</h3>
<div id="refs" class="references">
<div id="ref-bauder2020bayesian">
<p>Bauder, David, Taras Bodnar, Nestor Parolya, and Wolfgang Schmid. 2020. “Bayesian Inference of the Multi-Period Optimal Portfolio for an Exponential Utility.” <em>Journal of Multivariate Analysis</em> 175: 104544.</p>
</div>
<div id="ref-chen2001initialization">
<p>Chen, Huifen. 2001. “Initialization for NORTA: Generation of Random Vectors with Specified Marginals and Correlations.” <em>INFORMS Journal on Computing</em> 13 (4): 312–31.</p>
</div>
<div id="ref-chipman2010bart">
<p>Chipman, Hugh A, Edward I George, and Robert E McCulloch. 2010. “BART: Bayesian Additive Regression Trees.” <em>Annals of Applied Statistics</em> 4 (1): 266–98.</p>
</div>
<div id="ref-coqueret2017approximate">
<p>Coqueret, Guillaume. 2017. “Approximate NORTA Simulations for Virtual Sample Generation.” <em>Expert Systems with Applications</em> 73: 69–81.</p>
</div>
<div id="ref-dangl2020optimal">
<p>Dangl, Thomas, and Alex Weissensteiner. 2020. “Optimal Portfolios Under Time-Varying Investment Opportunities, Parameter Uncertainty, and Ambiguity Aversion.” <em>Journal of Financial and Quantitative Analysis</em> 55 (4): 1163–98.</p>
</div>
<div id="ref-demiguel2015parameter">
<p>DeMiguel, Victor, Alberto Martı́n-Utrera, and Francisco J Nogales. 2015. “Parameter Uncertainty in Multiperiod Portfolio Optimization with Transaction Costs.” <em>Journal of Financial and Quantitative Analysis</em> 50 (6): 1443–71.</p>
</div>
<div id="ref-frost1986empirical">
<p>Frost, Peter A, and James E Savarino. 1986. “An Empirical Bayes Approach to Efficient Portfolio Selection.” <em>Journal of Financial and Quantitative Analysis</em> 21 (3): 293–305.</p>
</div>
<div id="ref-gelman2013bayesian">
<p>Gelman, Andrew, John B Carlin, Hal S Stern, David B Dunson, Aki Vehtari, and Donald B Rubin. 2013. <em>Bayesian Data Analysis, 3rd Edition</em>. Chapman & Hall / CRC.</p>
</div>
<div id="ref-greene2018econometric">
<p>Greene, William H. 2018. <em>Econometric Analysis, Eighth Edition</em>. Pearson Education.</p>
</div>
<div id="ref-guidolin2016ambiguity">
<p>Guidolin, Massimo, and Hening Liu. 2016. “Ambiguity Aversion and Underdiversification.” <em>Journal of Financial and Quantitative Analysis</em> 51 (4): 1297–1323.</p>
</div>
<div id="ref-kan2007optimal">
<p>Kan, Raymond, and Guofu Zhou. 2007. “Optimal Portfolio Choice with Parameter Uncertainty.” <em>Journal of Financial and Quantitative Analysis</em> 42 (3): 621–56.</p>
</div>
<div id="ref-kruschke2014doing">
<p>Kruschke, John. 2014. <em>Doing Bayesian Data Analysis: A Tutorial with R, Jags, and Stan (2nd Ed.)</em>. Academic Press.</p>
</div>
<div id="ref-lai2011mean">
<p>Lai, Tze Leung, Haipeng Xing, Zehao Chen, and others. 2011. “Mean–Variance Portfolio Optimization When Means and Covariances Are Unknown.” <em>Annals of Applied Statistics</em> 5 (2A): 798–823.</p>
</div>
<div id="ref-metropolis1949monte">
<p>Metropolis, Nicholas, and Stanislaw Ulam. 1949. “The Monte Carlo Method.” <em>Journal of the American Statistical Association</em> 44 (247): 335–41.</p>
</div>
<div id="ref-roberts1994simple">
<p>Roberts, Gareth O, and Adrian FM Smith. 1994. “Simple Conditions for the Convergence of the Gibbs Sampler and Metropolis-Hastings Algorithms.” <em>Stochastic Processes and Their Applications</em> 49 (2): 207–16.</p>
</div>
<div id="ref-sparapani2019r">
<p>Sparapani, Rodney, Charles Spanbauer, and Robert McCulloch. 2019. “The BART R Package.” Comprehensive R Archive Network. <a href="https://cran.r-project.org/web/packages/BART/vignettes/the-BART-R-package.pdf">https://cran.r-project.org/web/packages/BART/vignettes/the-BART-R-package.pdf</a>.</p>
</div>
<div id="ref-tierney1994markov">
<p>Tierney, Luke. 1994. “Markov Chains for Exploring Posterior Distributions.” <em>Annals of Statistics</em>, 1701–28.</p>
</div>
<div id="ref-tu2010incorporating">
<p>Tu, Jun, and Guofu Zhou. 2010. “Incorporating Economic Objectives into Bayesian Priors: Portfolio Choice Under Parameter Uncertainty.” <em>Journal of Financial and Quantitative Analysis</em> 45 (4): 959–86.</p>
</div>
</div>
<div class="footnotes">
<hr />
<ol start="19">
<li id="fn19"><p>This assumption can be relaxed, but the algorithms then become more complex and are out of the scope of the current book. One such example that generalizes the naive Bayes approach is <span class="citation">Friedman, Geiger, and Goldszmidt (<a href="#ref-friedman1997bayesian" role="doc-biblioref">1997</a>)</span>.<a href="bayes.html#fnref19" class="footnote-back">↩︎</a></p></li>
<li id="fn20"><p>In the case of BARTs, the training consists exactly in the drawing of posterior samples.<a href="bayes.html#fnref20" class="footnote-back">↩︎</a></p></li>
</ol>
</div>
</section>
</div>
</div>
</div>
<a href="svm.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
<a href="valtune.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
</div>
</div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/clipboard.min.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-clipboard.js"></script>
<script>
// Initialize the gitbook front-end for this page (emitted by bookdown's
// gitbook output format). The configuration object below is consumed by
// the gitbook plugins loaded just above (plugin-sharing, plugin-search,
// plugin-fontsettings, plugin-bookdown, ...).
gitbook.require(["gitbook"], function(gitbook) {
gitbook.start({
// Social-sharing buttons: which ones appear in the toolbar ("twitter",
// "linkedin") and which are listed in the "all" drop-down menu.
"sharing": {
"github": false,
"facebook": false,
"twitter": true,
"linkedin": true,
"weibo": false,
"instapaper": false,
"vk": false,
"all": ["facebook", "twitter", "linkedin", "weibo", "instapaper"]
},
// Defaults for the reader's font-settings panel (theme, family, size).
"fontsettings": {
"theme": "white",
"family": "sans",
"size": 2
},
// "edit"/"history"/"view"/"download" are null: the corresponding
// toolbar links are disabled for this build.
"edit": null,
"history": {
"link": null,
"text": null
},
"view": {
"link": null,
"text": null
},
"download": null,
// Table of contents: collapse to the current section and highlight the
// entry for the heading currently scrolled into view.
"toc": {
"collapse": "section",
"scroll_highlight": true
},
"toolbar": {
"position": "fixed",
"download": false
},
"search": true,
"info": true
});
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
// Load MathJax at runtime by appending a <script> tag to <head>; this keeps
// the page compatible with pandoc's "self-contained" output option.
(function () {
  // Resolve the MathJax URL. Pandoc substitutes the real location for the
  // placeholder below; "" or "true" means "fall back to the default CDN".
  var src = "true";
  if (src === "" || src === "true") src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML";
  // When the page is served over http(s), switch to a protocol-relative URL
  // so the request matches the page's own scheme.
  if (location.protocol !== "file:")
    if (/^https?:/.test(src))
      src = src.replace(/^https?:/, '');
  // Build the <script> element and inject it into the document head.
  var tag = document.createElement("script");
  tag.type = "text/javascript";
  tag.src = src;
  document.getElementsByTagName("head")[0].appendChild(tag);
})();
</script>
</body>
</html>