PhD

The LaTeX sources of my Ph.D. thesis
git clone https://esimon.eu/repos/PhD.git
Log | Files | Refs | README | LICENSE

bpe.tex (631B)


      1 \begin{algorithmic}
        % Pseudocode for byte-pair encoding (BPE) vocabulary construction:
        % starting from the unique characters of the corpus, repeatedly replace
        % the most common bigram by a fresh token until the vocabulary holds
        % n entries.
        % NOTE(review): \FunctionInputs, \FunctionOutput, \Output and \vctr are
        % not defined in this file -- presumably project macros; confirm in the
        % preamble.
      2 	\Function{bpe}{}
        	% Signature: inputs n (target vocabulary size) and t (corpus),
        	% output V (vocabulary).
        	% NOTE(review): the starred \FunctionInputs* presumably continues the
        	% input list without repeating its label -- confirm.
      3 		\FunctionInputs{}  \(n\) the vocabulary size
      4 		\FunctionInputs*{} \(\vctr{t}\) the corpus
      5 		\FunctionOutput{}  \(V\) the vocabulary
        		% Empty statement; presumably a blank line separating the signature
        		% from the body.
      6 		\State
        		% Initial vocabulary: one entry per distinct character of the corpus.
      7 		\State \(V \gets\) all unique characters in \(\vctr{t}\)
        		% Each iteration adds exactly one token to V.
        		% \lvert/\rvert (amsmath, which \text below already relies on) mark
        		% the bars as an opening/closing delimiter pair rather than two
        		% ordinary `|' symbols.
      8 		\While{\(\lvert V\rvert < n\)}
        			% `~' ties the arrow to the following text. The top-aligned
        			% \parbox lets the description wrap with its first line level
        			% with the assignment; the fixed widths are hard-coded,
        			% presumably tuned to the available width -- TODO confirm if
        			% the layout changes.
      9 			\State \(c_1c_2 \gets\)~\parbox[t]{28mm}{most common bigram in \(\vctr{t}\)}
     10 			\State \(c_\text{new} \gets\)~new token not in \(V\)
        			% Rewrite the corpus so that the merged bigram becomes a single token.
     11 			\State \(\vctr{t}\gets\)~\parbox[t]{32mm}{replace all occurrences of \(c_1c_2\) in \(\vctr{t}\) by \(c_\text{new}\)}
     12 			\State \(V \gets V \cup \{c_\text{new}\}\)
     13 		\EndWhile
     14 		\State \Output \(V\)
     15 	\EndFunction
     16 \end{algorithmic}