      \chapter{Notation}
      Most of this thesis is formatted in one and a half columns, which means that a large right margin is filled with complementary material.
      This includes figures, tables and algorithms when space allows, but also epigraphs and marginal notes with supplementary details and comments.
      The titles of important bibliographical references are also given in the margin, to the right of their first mention in each section.
      Some marginal paragraphs are left unnumbered and provide material about the passage broadly adjacent to them.
      When a section seems unclear, we invite the reader to look for additional information in the margin.
      For example, while relation algebra is introduced in Section~\ref{sec:context:relation algebra}, we do not expect most readers to be familiar with its notation.
      Therefore, we systematically provide a plain-English interpretation of relation algebra formulae in unnumbered marginal paragraphs.
     % Vertical gap between the introductory paragraph and the notation table.
     \bigskip
     % \notationsWidest holds the natural width of the reference entry used to size the symbol column.
     % NOTE(review): this assumes \jsd(P\|Q) is the widest entry of the first column; rows listing several symbols may be wider -- confirm.
     \newlength{\notationsWidest}
     \settowidth{\notationsWidest}{\(\jsd(P\mathrel{\|}Q)\)}
     % Two columns with the outer padding removed (@{}): a centred symbol column and a paragraph column for descriptions.
     % The paragraph column takes what is left of \textwidth after the reference symbol width and two \tabcolsep (the padding between the two columns).
     \begin{longtable}{@{}c p{\dimexpr\textwidth-\tabcolsep*2-\notationsWidest\relax}@{}}
     \multicolumn{2}{@{}c@{}}{\textbf{Domain of Variables}} \\
     \(x\) & A scalar \\
     \(\vctr{x}\) & A vector; its elements are indexed \(x_i\) \\
     \(\mtrx{X}\) & A matrix; its rows are indexed \(\vctr{x}_i\) and its elements \(x_{ij}\) \\
     \(\tnsr{X}\) & A (three-way) tensor, indexed \(\mtrx{X}_i\), \(\vctr{x}_{ij}\), \(x_{ijk}\) \\
     \(\rndm{x}\) & A random variable (sometimes \(\rndm{X}\) to avoid confusion) \\
     \(\rndmvctr{x}\) & A random vector \\
     \(\symbb{R}\) & The set of real numbers \\
     \(\symbb{R}^n\) & The set of real-valued vectors of length \(n\) \\
     \(\symbb{R}^{n\times m}\) & The set of real-valued matrices with \(n\) rows and \(m\) columns \\
     \(B^A\) & The set of functions from \(A\) to \(B\); in particular, \(2^A\) denotes the power set of \(A\) \\
     \multicolumn{2}{@{}b{\textwidth}@{}}{
     To describe the set of real-valued vectors with the same number of elements as a set \(A\), we abuse the morphism from the functions \(\symbb{R}^A\) to the vectors \(\symbb{R}^{|A|}\) and simply write \(\vctr{x}\in\symbb{R}^A\) to denote that \(\vctr{x}\) is a vector with \(|A|\) elements.
     } \\[5mm]
     \multicolumn{2}{@{}c@{}}{\textbf{Relation Algebra}} \\
     \multicolumn{2}{@{}l@{}}{Relation algebra is described in more detail in Section~\ref{sec:context:relation algebra}.} \\
     \(\relationZero\) & Empty relation \\
     \(\relationOne\) & Complete relation \\
     \(\relationIdentity\) & Identity relation \\
     \(\bar{r}\) & Complementary relation \\
     \(\breve{r}\) & Converse relation (reversed orientation); when applied to a surface form, we write \(\widebreve{\textsl{born in}}\) \\
     \(\relationComposition\) & Relation composition \\[5mm]
     \multicolumn{2}{@{}c@{}}{\textbf{Probability and Information Theory}} \\
     \(P(\rndm{x})\), \(Q(\rndm{x})\) & Probability distribution over \(\rndm{x}\); by default we heavily overload \(P\) (as is customary), and when confusion is possible we disambiguate by using \(Q\) \\
     \(\empP(\rndm{x})\) & Empirical distribution over \(\rndm{x}\) (as defined by the dataset) \\
     \(\rndm{x} \independent \rndm{y} \mid \rndm{z}\) & Conditional independence of \(\rndm{x}\) and \(\rndm{y}\) given \(\rndm{z}\) \\
     \(\rndm{x} \notindependent \rndm{y}\) & \(\rndm{x}\) and \(\rndm{y}\) are not independent \\
     \(\uniformDistribution(X)\) & Uniform distribution over the set \(X\) \\
     \(\normalDistribution(\mu, \sigma^2)\) & Normal distribution of mean \(\mu\) and variance \(\sigma^2\) (also used for the multivariate case) \\
     \(\entropy(\rndm{x})\) & Shannon entropy of the random variable \(\rndm{x}\); \(\entropy(\rndm{x}, \rndm{y})\) denotes the joint entropy \\
     \(\entropy(\rndm{x}\mid\rndm{y})\) & Conditional entropy of \(\rndm{x}\) given \(\rndm{y}\) \\
     \(\entropy_Q(P)\) & Cross-entropy of \(P\) relative to \(Q\) \\
     \(\operatorname{I}(\rndm{x}; \rndm{y})\) & Mutual information of \(\rndm{x}\) and \(\rndm{y}\) \\
     \(\pmi(x, y)\) & Pointwise mutual information of events \(x\) and \(y\) \\
     \(\kl(P\mathrel{\|}Q)\) & Kullback--Leibler divergence from \(Q\) to \(P\) \\
     \(\jsd(P\mathrel{\|}Q)\) & Jensen--Shannon divergence between \(P\) and \(Q\) \\
     \(W_1(P, Q)\) & 1-Wasserstein distance between \(P\) and \(Q\) \\[5mm]
     \multicolumn{2}{@{}c@{}}{\textbf{Machine Learning}} \\
     \(\sigmoid(x)\) & Logistic sigmoid \(\sigmoid(x) = 1 \divslash (1 + \exp(-x))\) \\
     \(\ReLU(x)\) & Rectified linear unit \(\ReLU(x) = \max(0, x)\); we use \(\ReLU_{\halfCircleScript}\) to refer to the ReLU activation applied to half of the units (see Section~\ref{sec:context:attention lm}) \\
     \(\symcal{L}\) & Loss (to be minimized) \\
     \(J\) & Objective (to be maximized) \\
     \(\overDirected{\fone}\), \(\overUndirected{\fone}\), \(\overHalfdirected{\fone}\) & Directed, undirected and half-directed \fone{} measures (see Section~\ref{sec:relation extraction:supervised evaluation}) \\[5mm]
     \multicolumn{2}{@{}c@{}}{\textbf{Graph Operations}} \\
     \(\gfsource(a)\) & Source vertex of the arc \(a\) \\
     \(\gftarget(a)\) & Target vertex of the arc \(a\) \\
     \(\gfrelation(a)\) & Relation conveyed by the arc \(a\) \\
     \(\gfsentence(a)\) & Sentence corresponding to the arc \(a\) \\
     \(\gfneighbors(e)\) & Vertices neighboring the vertex \(e\) \\
     \(\gfincidents(e)\) & Arcs incident to the vertex \(e\) \\
     \(\gfeneighbors(a)\) & Arcs neighboring the arc \(a\) \\[5mm]
     \multicolumn{2}{@{}c@{}}{\textbf{Other Operations}} \\
     \(\odot\) & Element-wise (Hadamard) product \\
     \(*\) & Convolution \\
     \(\bowtie\) & Natural join \\
     \(\times_A\) & Pullback with common codomain \(A\) \\
     \(\delta_{i,j}\) & Kronecker delta: 1 if \(i=j\), 0 otherwise \\
     \end{longtable}