diff --git a/latex/bib/acmart.bib b/latex/bib/acmart.bib index b32af546..ee8c506e 100644 --- a/latex/bib/acmart.bib +++ b/latex/bib/acmart.bib @@ -2225,4 +2225,11 @@ @article{bendkowski2022automatic author={Bendkowski, Maciej}, journal={arXiv preprint arXiv:2206.06668}, year={2022} +} + +@book{flajolet2009analytic, + title={Analytic Combinatorics}, + author={Flajolet, Philippe and Sedgewick, Robert}, + year={2009}, + publisher={Cambridge University Press} } \ No newline at end of file diff --git a/latex/thesis/Thesis.pdf b/latex/thesis/Thesis.pdf index e89eb9bf..acb58e6c 100644 Binary files a/latex/thesis/Thesis.pdf and b/latex/thesis/Thesis.pdf differ diff --git a/latex/thesis/Thesis.tex b/latex/thesis/Thesis.tex index 676e5b1d..85eee9c1 100644 --- a/latex/thesis/Thesis.tex +++ b/latex/thesis/Thesis.tex @@ -18,7 +18,7 @@ \newcommand{\hsp}{\hspace{20pt}} \titleformat{\chapter}[display]{\filleft\Huge\bfseries}{\fontsize{100}{100}\selectfont\textcolor{gray75}\thechapter}{1ex}{}[]% -\input{preamble.tex} +\input{preamble} % Some metadata for your generated PDF \title{An Edit Calculus for Probabilistic Program Repair} @@ -26,7 +26,6 @@ \date{\today} \titlespacing*{\chapter}{0pt}{-40pt}{40pt} -\usepackage{amsmath} % DOCUMENT BEGINS \begin{document} @@ -38,7 +37,7 @@ % TITLE PAGE \begin{onehalfspacing} \pagestyle{empty} -\input{content/TitlePage.tex} +\input{content/TitlePage} \cleardoublepage \end{onehalfspacing} diff --git a/latex/thesis/content/Acronyms.tex b/latex/thesis/content/Acronyms.tex index 1ef2fd0b..e7347467 100644 --- a/latex/thesis/content/Acronyms.tex +++ b/latex/thesis/content/Acronyms.tex @@ -1,8 +1,14 @@ \chapter*{\rm\bfseries Acronyms} \label{ch:acronyms} -This part is likewise optional. 
But it does not hurt to provide a list of all acronyms, e.g.: +Below is a list of acronyms used in the construction of this thesis: \begin{itemize} - \item{\textbf{REST}: \textbf{Re}presentational \textbf{S}tate \textbf{T}ransfer} + \item{\textbf{NFA}: \textbf{N}ondeterministic \textbf{F}inite \textbf{A}utomaton} + \item{\textbf{DFA}: \textbf{D}eterministic \textbf{F}inite \textbf{A}utomaton} + \item{\textbf{CFG}: \textbf{C}ontext \textbf{F}ree \textbf{G}rammar} + \item{\textbf{CFL}: \textbf{C}ontext \textbf{F}ree \textbf{L}anguage} + \item{\textbf{CNF}: \textbf{C}homsky \textbf{N}ormal \textbf{F}orm} + \item {\textbf{GRE}: \textbf{G}eneralized \textbf{R}egular \textbf{E}xpression} + \item {\textbf{OGF}: \textbf{O}rdinary \textbf{G}enerating \textbf{F}unction} \end{itemize} diff --git a/latex/thesis/content/Ch2_Formal_Language_Theory.tex b/latex/thesis/content/Ch2_Formal_Language_Theory.tex index bbdd424f..08e7e189 100644 --- a/latex/thesis/content/Ch2_Formal_Language_Theory.tex +++ b/latex/thesis/content/Ch2_Formal_Language_Theory.tex @@ -3,11 +3,11 @@ \chapter{\rm\bfseries Formal Language Theory} In computer science, it is common to conflate two distinct notions for a set. The first is a collection sitting on some storage device, e.g., a dataset. The second is a lazy construction: not an explicit collection of objects, but a representation that allows us to efficiently determine membership on demand. This lets us represent infinite sets without requiring an infinite amount of storage. Inclusion then, instead of being simply a lookup query, becomes a decision procedure. This is the basis of formal language theory. -The representation we are chiefly interested in is the grammar, a common metanotation for specifying the syntactic constraints on programs, shared by nearly every programming language. 
Programming language grammars are overapproximations to the true language of interest, but provide a fast procedure for rejecting invalid programs and parsing valid ones. +The representation we are chiefly interested in is the grammar, a common metanotation for specifying the syntactic constraints on programs, shared by nearly every programming language. Programming language grammars are overapproximations to the true language of interest, but provide a reasonably fast procedure for rejecting invalid programs and parsing valid ones. -Formal languages are arranged in a hierarchy of containment, where each language family strictly contains its predecessors. On the lowest level are the finite languages. Type 3 contains infinite languages generated by a regular grammar. Level 2 contains context-free languages, which admit parenthetical nesting. Supersets, such as the recursively enumerable sets, are also possible. There are other kinds of formal languages, such as logics and circuits, which are incomparable with the Chomsky hierarchy. +Formal languages are arranged in a hierarchy of containment, where each language family strictly contains its predecessors. On the lowest level of the hierarchy are finite languages. Type 3 contains finite and infinite languages generated by a regular grammar. Type 2 contains context-free languages, which admit parenthetical nesting. Supersets, such as the recursively enumerable sets, are Type 0. There are other kinds of formal languages, such as logics and circuits, which are incomparable with the Chomsky hierarchy. -For most programming languages, they leave level 2 after the parsing stage, and enter the realm of type theory and static analysis. At this point, compiler authors layer additional semantic constraints on top of the syntactic ones, but must deal with phase ordering problems related to the sequencing of such analyzers, breaking commutativity. 
+Most programming languages leave Type 2 after the parsing stage, and enter the realm of type theory. At this point, compiler authors layer additional semantic refinements on top of syntax, but must deal with phase ordering problems related to the sequencing of such analyzers, breaking commutativity and posing challenges for parallelization. This lack of compositionality is a major obstacle to the development of modular static analyses. The advantage of dealing with formal language representations is that we can reason about them algebraically. Consider the context-free grammar: the arrow $\rightarrow$ becomes an $=$ sign, $\mid$ becomes $+$ and $AB$ becomes $A \times B$. The ambiguous Dyck grammar, then, can be seen as a system of equations. @@ -19,35 +19,63 @@ \chapter{\rm\bfseries Formal Language Theory} \begin{align*} f(x) &= x^2 + x^2 f(x) + f(x)^2\\ - 0 &= f(x)^2 + x^2 f(x) - f(x) + x^2\\ + 0 &= f(x)^2 + x^2 f(x) - f(x) + x^2 \end{align*} \noindent Now, using the quadratic equation, where $a = 1, b = x^2 - 1, c = x^2$, we have: \begin{align*} - f(x) &= \frac{-b \pm \sqrt{b^2 - 4ac}}{2a} \\ - f(x) &= \frac{-x^2 + 1 \pm \sqrt{x^4 - 6x^2 + 1}}{2} + f(x) &= \frac{-b \pm \sqrt{b^2 - 4ac}}{2a} = \frac{-x^2 + 1 \pm \sqrt{x^4 - 6x^2 + 1}}{2} \end{align*} -\noindent We also have that $f(x)=\sum _{n=0}^{\infty }f_nx^{n}$, so to obtain the number of ambiguous Dyck trees of size $n$, we can extract the $n$th coefficient of $f_n$ using the binomial series. Expanding $\sqrt{x^4 - 6x^2 + 1}$, we obtain: +\noindent Note there are two solutions, but only one where $\lim_{x\rightarrow 0} f(x) = 0$. From the ordinary generating function (OGF), we also have that $f(x)=\sum _{n=0}^{\infty }f_nx^{n}$. 
Expanding $\sqrt{x^4 - 6x^2 + 1}$ via the generalized binomial theorem, we have: \begin{align*} -f(x) = (1+x)^{\alpha }&=\sum _{k=0}^{\infty }\;{\binom {\alpha }{k}}\;x^{k}\\ - &=\sum _{k=0}^{\infty }\;{\binom {\frac{1}{2} }{k}}\;(x^4 - 6x^2)^{k}\\ +\sqrt{x^4 - 6x^2 + 1} = (1+u)^{\alpha }&=\sum _{k=0}^{\infty }\;{\binom {\alpha }{k}}\;u^{k}\\ + &=\sum _{k=0}^{\infty }\;{\binom {\frac{1}{2} }{k}}\;(x^4 - 6x^2)^{k} \text{ where } u = x^4-6x^2 \end{align*} +Now, to obtain the number of ambiguous Dyck trees of size $n$, we can extract the $n$th coefficient using the binomial series: + \begin{align*} [x^n]f(x) &= [x^n]\frac{-x^2 + 1}{2} + \frac{1}{2}[x^n]\sum _{k=0}^{\infty }\;{\binom {\frac{1}{2} }{k}}\;(x^4 - 6x^2)^{k}\\ [x^n]f(x) &= \frac{1}{2}{\binom {\frac{1}{2} }{n}}\;[x^n](x^4 - 6x^2)^n = \frac{1}{2}{\binom {\frac{1}{2} }{n}}\;[x^n](x^2 - 6x)^n \end{align*} -This lets us understand grammars as a kind of algebra, which is useful for enumerative combinatorics on words and syntax-guided synthesis. - - - -Like sets, it is possible to abstractly combine languages by manipulating their grammars, mirroring the setwise operations of union, intersection, and difference over languages. These operations are convenient for combining, for example, syntactic and semantic constraints on programs. For example, we might have two grammars, $G_a, G_b$ representing two properties that are both necessary for a program to be considered valid. We can treat valid programs $P$ as a subset of the language intersection $P \subseteq \mathcal{L}(G_a) \cap \mathcal{L}(G_b)$. +We can use this technique, first described by Flajolet \& Sedgewick~\cite{flajolet2009analytic}, to count the number of trees in any CFL or distinct words in an unambiguous CFL. This lets us understand grammars as a kind of algebra, which is useful for enumerative combinatorics on words and syntax-guided synthesis. + +Like algebra, there is also a kind of calculus to formal languages. 
Janusz Brzozowski~\cite{brzozowski1964derivatives} introduced the derivative operator for regular languages, which can be used to determine membership, and extract subwords from the language. This operator has been extended to CFLs by Might et al.~\cite{might2011parsing}, and is the basis for a number of elegant parsing algorithms. + +The Brzozowski derivative has an extensional and intensional form. Extensionally we have $\partial_a L = \{b \in \Sigma^* \mid ab \in L\}$. Intensionally, we have an induction over generalized regular expressions (GREs), which are a superset of regular expressions that also support intersection and negation.\vspace{-1cm} + +\begin{multicols}{2} + \begin{eqnarray*} + \phantom{-}\partial_a( & \varnothing & )= \varnothing \\ + \phantom{-}\partial_a( & \varepsilon & )= \varnothing \\ + \phantom{-}\partial_a( & a & )= \varepsilon \\ + \phantom{-}\partial_a( & b & )= \varnothing \text{ for each } a \neq b \\ + \phantom{-}\partial_a( & R^* & )= (\partial_a R)\cdot R^* \\ + \phantom{-}\partial_a( & \neg R & )= \neg \partial_a R \\ + \phantom{-}\partial_a( & R\cdot S & )= (\partial_a R)\cdot S \vee \delta(R)\cdot\partial_a S \\ + \phantom{-}\partial_a( & R\vee S & )= \partial_a R \vee \partial_a S \\ + \phantom{-}\partial_a( & R\land S & )= \partial_a R \land \partial_a S + \end{eqnarray*} \break\vspace{-0.5cm} + \begin{eqnarray*} + \phantom{---}\delta(& \varnothing &)= \varnothing \\ + \phantom{---}\delta(& \varepsilon &)= \varepsilon \\ + \phantom{---}\delta(& a &)= \varnothing \\ + \phantom{---}\delta(& R^* &)= \varepsilon \\ + \phantom{---}\delta(& \neg R &)= \varepsilon \text{ if } \delta(R) = \varnothing \\ + \phantom{---}\delta(& \neg R &)= \varnothing \text{ if } \delta(R) = \varepsilon \\ + \phantom{---}\delta(& R\cdot S &)= \delta(R) \land \delta(S) \\ + \phantom{---}\delta(& R\vee S &)= \delta(R) \vee \delta(S) \\ + \phantom{---}\delta(& R\land S &)= \delta(R) \land \delta(S) + \end{eqnarray*} +\end{multicols} + + +Like sets, it is 
possible to abstractly combine languages by manipulating their grammars, mirroring the setwise operations of union, intersection, and difference over languages. These operations are convenient for combining, for example, syntactic and semantic constraints on programs. For example, we might have two grammars, $G_a, G_b$ representing two properties that are both necessary for a program to be considered valid. %We can treat valid programs $P$ as a subset of the language intersection $P \subseteq \mathcal{L}(G_a) \cap \mathcal{L}(G_b)$. Like all representations, grammars are a trade-off between expressiveness and efficiency. It is often possible to represent the same finite set with multiple representations of varying complexity. For example, the set of strings containing ten or fewer balanced parentheses can be expressed as deterministic finite automaton containing millions of states, or a simple conjunctive grammar containing a few productions, $\mathcal{L}\Big(S \rightarrow ( ) \mid (S) \mid S S \Big) \cap \Sigma^{[0,10]}$. -Like algebra, there is also a kind of calculus to formal languages. Janusz Brzozowski introduced the derivative operator for regular languages, which can be used to determine whether a string is in a language, and to extract subwords from the language. This operator has been extended to context-free languages by Might et al., and is the basis for many parsing algorithms. 
\clearpage \ No newline at end of file diff --git a/latex/thesis/preamble.tex b/latex/thesis/preamble.tex index df2a0cf3..33b6a727 100644 --- a/latex/thesis/preamble.tex +++ b/latex/thesis/preamble.tex @@ -2,3 +2,6 @@ \usepackage{blindtext} \usepackage[hidelinks]{hyperref} \usepackage{epigraph} +\usepackage{multicol} +\usepackage{amsmath} +\usepackage{amssymb} diff --git a/src/jvmMain/kotlin/ai/hypergraph/kaliningraph/automata/JFSA.kt b/src/jvmMain/kotlin/ai/hypergraph/kaliningraph/automata/JFSA.kt index 16dd3fb4..08773792 100644 --- a/src/jvmMain/kotlin/ai/hypergraph/kaliningraph/automata/JFSA.kt +++ b/src/jvmMain/kotlin/ai/hypergraph/kaliningraph/automata/JFSA.kt @@ -154,5 +154,5 @@ fun BAutomaton.decodeDFA( fun BAutomaton.decodeDFA( dec: Map, // Maps unicode characters back to strings because BAutomata uses Unicode - take: Int = 1000, + take: Int = 10_000, ) = getFiniteStrings(take).map { it.map { dec[it]!! }.joinToString(" ") } \ No newline at end of file diff --git a/src/jvmTest/kotlin/ai/hypergraph/kaliningraph/automata/WFSATest.kt b/src/jvmTest/kotlin/ai/hypergraph/kaliningraph/automata/WFSATest.kt index 9e5ae8d0..10a39cdf 100644 --- a/src/jvmTest/kotlin/ai/hypergraph/kaliningraph/automata/WFSATest.kt +++ b/src/jvmTest/kotlin/ai/hypergraph/kaliningraph/automata/WFSATest.kt @@ -7,6 +7,7 @@ import ai.hypergraph.kaliningraph.repair.LangCache import ai.hypergraph.kaliningraph.repair.MAX_RADIUS import ai.hypergraph.kaliningraph.repair.MAX_TOKENS import ai.hypergraph.kaliningraph.repair.vanillaS2PCFG +import ai.hypergraph.kaliningraph.repair.vanillaS2PCFGWE import ai.hypergraph.markovian.mcmc.MarkovChain import dk.brics.automaton.Automaton import dk.brics.automaton.RegExp @@ -187,6 +188,23 @@ class WFSATest { println("Found $distinct unique repairs by enumerating PTree") } + /* +./gradlew jvmTest --tests "ai.hypergraph.kaliningraph.automata.WFSATest.testSelfInt" +*/ + @Test + fun testSelfInt() { + val startTime = System.currentTimeMillis() + val toRepair = "_ _ _ 
_".split(" ") + val toRepair1 = "True _ _ True _ _".split(" ") + val pt = vanillaS2PCFG.startPTree(toRepair)!! + val pt1 = vanillaS2PCFGWE.startPTree(toRepair1)!! + val int = pt.toDFA(true)!!.intersection(pt1.toDFA(true)) + assertFalse(int.isEmpty) + int.decodeDFA(pt1.termDict).toSet() + .also { it.forEach { println(it) }; println("Size: ${it.size}") } + println("Total time: ${System.currentTimeMillis() - startTime} ms") + } + /* ./gradlew jvmTest --tests "ai.hypergraph.kaliningraph.automata.WFSATest.testRepulsivePointProcess" */