@article{culbertson_bayesian_2013,
title = {Bayesian machine learning via category theory},
url = {http://arxiv.org/abs/1312.1445},
abstract = {From the Bayesian perspective, the category of conditional probabilities (a variant of the Kleisli category of the Giry monad, whose objects are measurable spaces and arrows are Markov kernels) gives a nice framework for conceptualization and analysis of many aspects of machine learning. Using categorical methods, we construct models for parametric and nonparametric Bayesian reasoning on function spaces, thus providing a basis for the supervised learning problem. In particular, stochastic processes are arrows to these function spaces which serve as prior probabilities. The resulting inference maps can often be analytically constructed in this symmetric monoidal weakly closed category. We also show how to view general stochastic processes using functor categories and demonstrate the Kalman filter as an archetype for the hidden Markov model.},
urldate = {2019-11-22},
eprinttype = {arXiv},
eprint = {1312.1445},
author = {Culbertson, Jared and Sturtz, Kirk},
month = dec,
year = {2013},
note = {ZSCC: 0000006},
keywords = {Bayesianism, Categorical ML, Categorical probability theory, Purely theoretical}
}
@techreport{heckerman_tutorial_1995,
title = {A {Tutorial} on {Learning} {With} {Bayesian} {Networks}},
url = {https://www.microsoft.com/en-us/research/publication/a-tutorial-on-learning-with-bayesian-networks/},
abstract = {A Bayesian network is a graphical model that encodes probabilistic relationships among variables of interest. When used in conjunction with statistical techniques, the graphical model has several advantages for data analysis. One, because the model encodes dependencies among all variables, it readily handles situations where some data entries are missing. Two, a Bayesian network can …},
language = {en-US},
urldate = {2019-11-22},
institution = {Microsoft Research},
number = {MSR-TR-95-06},
author = {Heckerman, David},
month = mar,
year = {1995},
note = {ZSCC: 0000058},
keywords = {Bayesianism, Classical ML, Machine learning}
}
@article{jacobs_categorical_2018,
title = {Categorical {Aspects} of {Parameter} {Learning}},
url = {http://arxiv.org/abs/1810.05814},
abstract = {Parameter learning is the technique for obtaining the probabilistic parameters in conditional probability tables in Bayesian networks from tables with (observed) data --- where it is assumed that the underlying graphical structure is known. There are basically two ways of doing so, referred to as maximal likelihood estimation (MLE) and as Bayesian learning. This paper provides a categorical analysis of these two techniques and describes them in terms of basic properties of the multiset monad M, the distribution monad D and the Giry monad G. In essence, learning is about the relationships between multisets (used for counting) on the one hand and probability distributions on the other. These relationships will be described as suitable natural transformations.},
urldate = {2019-11-21},
eprinttype = {arXiv},
eprint = {1810.05814},
author = {Jacobs, Bart},
month = oct,
year = {2018},
note = {ZSCC: 0000001},
keywords = {Bayesianism, Categorical ML, Categorical probability theory, Machine learning}
}
@article{jacobs_predicate/state_2016,
series = {The {Thirty}-second {Conference} on the {Mathematical} {Foundations} of {Programming} {Semantics} ({MFPS} {XXXII})},
title = {A {Predicate}/{State} {Transformer} {Semantics} for {Bayesian} {Learning}},
volume = {325},
issn = {1571-0661},
url = {http://www.sciencedirect.com/science/article/pii/S1571066116300883},
doi = {10.1016/j.entcs.2016.09.038},
abstract = {This paper establishes a link between Bayesian inference (learning) and predicate and state transformer operations from programming semantics and logic. Specifically, a very general definition of backward inference is given via first applying a predicate transformer and then conditioning. Analogously, forward inference involves first conditioning and then applying a state transformer. These definitions are illustrated in many examples in discrete and continuous probability theory and also in quantum theory.},
language = {en},
urldate = {2019-11-24},
journal = {Electronic Notes in Theoretical Computer Science},
author = {Jacobs, Bart and Zanasi, Fabio},
month = oct,
year = {2016},
note = {ZSCC: 0000030},
keywords = {Bayesianism, Categorical ML, Categorical probability theory, Effectus theory, Programming language theory, Semantics},
pages = {185--200}
}
@article{mccullagh_what_2002,
title = {What is a statistical model?},
volume = {30},
url = {http://projecteuclid.org/euclid.aos/1035844977},
doi = {10.1214/aos/1035844977},
language = {en},
number = {5},
urldate = {2019-11-22},
journal = {The Annals of Statistics},
author = {McCullagh, Peter},
month = oct,
year = {2002},
note = {ZSCC: 0000230},
keywords = {Bayesianism, Categorical ML, Categorical probability theory, Compendium, Purely theoretical, Statistical learning theory},
pages = {1225--1310}
}
@book{watanabe_algebraic_2009,
title = {Algebraic {Geometry} and {Statistical} {Learning} {Theory}},
url = {https://www.cambridge.org/core/books/algebraic-geometry-and-statistical-learning-theory/9C8FD1BDC817E2FC79117C7F41544A3A},
publisher = {Cambridge University Press},
address = {Cambridge},
language = {en},
urldate = {2019-11-22},
author = {Watanabe, Sumio},
month = aug,
year = {2009},
doi = {10.1017/CBO9780511800474},
note = {ZSCC: 0000276},
keywords = {Algebra, Bayesianism, Purely theoretical, Statistical learning theory}
}
@incollection{wermuth_graphical_2001,
address = {Oxford},
title = {Graphical {Models}: {Overview}},
isbn = {978-0-08-043076-8},
shorttitle = {Graphical {Models}},
url = {http://www.sciencedirect.com/science/article/pii/B008043076700440X},
abstract = {Graphical Markov models provide a method of representing possibly complicated multivariate dependencies in such a way that the general qualitative features can be understood, that statistical independencies are highlighted, and that some properties can be derived directly. Variables are represented by the nodes of a graph. Pairs of nodes may be joined by an edge. Edges are directed if one variable is a response to the other variable considered as explanatory, but are undirected if the variables are on an equal footing. Absence of an edge typically implies statistical independence, conditional, or marginal depending on the kind of graph. The need for a number of types of graph arises because it is helpful to represent a number of different kinds of dependence structures. Of special importance are chain graphs in which variables are arranged in a sequence or chain of blocks, the variables in any one block being on an equal footing, some being possibly joint responses to variables in the past and some being jointly explanatory to variables in the future of the block considered. Some main properties of such systems are outlined, and recent research results are sketched. Suggestions for further reading are given. As an illustrative example, some analysis of data on the treatment of chronic pain is presented.},
language = {en},
urldate = {2019-11-22},
booktitle = {International {Encyclopedia} of the {Social} \& {Behavioral} {Sciences}},
publisher = {Pergamon},
author = {Wermuth, N. and Cox, D. R.},
editor = {Smelser, Neil J. and Baltes, Paul B.},
month = jan,
year = {2001},
doi = {10.1016/B0-08-043076-7/00440-X},
note = {ZSCC: NoCitationData[s0] },
keywords = {Bayesianism, Classical ML, Machine learning},
pages = {6379--6386}
}