diff --git a/latex/splash2024/preamble.tex b/latex/splash2024/preamble.tex index 4b322c38..b4cb6cad 100644 --- a/latex/splash2024/preamble.tex +++ b/latex/splash2024/preamble.tex @@ -96,13 +96,13 @@ \usepackage[skins,breakable,listings]{tcolorbox} -\lstdefinelanguage{kotlin}{ +\lstdefinelanguage{python}{ comment=[l]{//}, commentstyle={\color{gray}\ttfamily}, emph={delegate, filter, firstOrNull, forEach, it, lazy, mapNotNull, println, repeat, assert, with, head, tail, len, return@}, numberstyle=\noncopyable, identifierstyle=\color{black}, - keywords={abstract, actual, as, as?, break, by, class, companion, continue, data, do, dynamic, else, enum, expect, false, final, for, fun, get, if, import, in, infix, interface, internal, is, null, object, open, operator, override, package, private, public, return, sealed, set, super, suspend, this, throw, true, try, catch, typealias, val, var, vararg, when, where, while, tailrec, reified}, + keywords={abstract, actual, as, as?, break, by, class, companion, continue, data, do, dynamic, else, enum, expect, false, final, for, fun, get, if, import, in, infix, interface, internal, is, null, object, open, operator, override, package, private, public, return, sealed, set, super, suspend, this, throw, true, try, catch, typealias, val, var, vararg, when, where, while, tailrec, reified, from, import, def, yield, lambda, as, in, return, else, pass}, keywordstyle={\bfseries}, morecomment=[s]{/*}{*/}, morestring=[b]", @@ -114,6 +114,7 @@ literate={`}{{\char0}}1, escapeinside={(*@}{@*)} } + \lstdefinelanguage{tidy}{ comment=[l]{//}, commentstyle={\color{gray}\ttfamily}, diff --git a/latex/splash2024/splash.pdf b/latex/splash2024/splash.pdf index bfdd56d0..19c4c8d3 100644 Binary files a/latex/splash2024/splash.pdf and b/latex/splash2024/splash.pdf differ diff --git a/latex/splash2024/splash.tex b/latex/splash2024/splash.tex index 6034b239..35cf7d48 100644 --- a/latex/splash2024/splash.tex +++ b/latex/splash2024/splash.tex @@ -243,7 +243,7 @@ 
Likewise, a finite state automaton is a quintuple $\mathcal{A} = \langle Q, \Sigma, \delta, I, F\rangle$, where $Q$ is a finite set of states, $\Sigma$ is a finite alphabet, $\delta \subseteq Q \times \Sigma \times Q$ is the transition function, and $I, F \subseteq Q$ are the set of initial and final states, respectively. We will adhere to this notation in the following sections. - \pagebreak\subsection{The nominal Levenshtein automaton}\label{sec:lev_nfa} + \pagebreak\subsection{Modeling code edits with the Levenshtein automaton}\label{sec:lev_nfa} \begin{wrapfigure}{r}{0.5\textwidth} \vspace{-0.3cm} @@ -446,7 +446,7 @@ Nominalizing the NFA eliminates the creation of $e=2(|\Sigma| - 1)\cdot|\sigma|\cdot d_\max$ unnecessary arcs over the entire Levenshtein automaton and drastically reduces the size of the construction to follow, but does not affect the underlying semantics. Thus, it is essential to first nominalize the automaton before proceeding to avoid a large blowup in the intermediate grammar. - \subsection{Levenshtein-Bar-Hillel Construction}\label{sec:lev_bh} + \subsection{Recognizing syntactically valid code changes via language intersection}\label{sec:lev_bh} We now describe the Bar-Hillel construction, which generates a grammar recognizing the intersection between a regular and a context-free language, then specialize it to Levenshtein intersections. @@ -1265,20 +1265,115 @@ % References will then be sorted and formatted in the correct style. % % \bibliographystyle{splncs04} - \bibliography{../bib/acmart} + \pagebreak\bibliography{../bib/acmart} -\pagebreak \appendix + \pagebreak\appendix - \section{Raw data} \label{sec:appendix} + \section{Example Repairs}\label{sec:example_repairs} - Raw data from Precision@k experiments across snippet length and Levenshtein distance from \S~\ref{sec:stackoverflow}. 
+ Below, we provide a few examples of broken code snippets and their corresponding human repairs that were successfully discovered and ranked first by our method. On the left is a complete snippet fed to the model and on the right, the corresponding human repair that was correctly predicted. + + \begin{figure}[H] + \begin{tabular}{|m{6.6cm}|m{6.6cm}|} + \hline \rule{0pt}{2.5ex}\textbf{Original broken code}\rule[-1ex]{0pt}{2ex} & \rule{0pt}{2.5ex}\textbf{First predicted repair}\rule[-1ex]{0pt}{2ex} \\\hline + \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python] + + (*@\hlorange{form}@*) sympy import * + x = Symbol('x', real=True) + x, re(x), im(x) + + \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python] + + (*@\hlorange{\textbf{from}}@*) sympy import * + x = Symbol('x', real=True) + x, re(x), im(x) + + \end{lstlisting} \\\hline + \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python] + + result = (*@\hlorange{yeald}@*) From(item.create()) + raise Return(result) + + \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python] + + result = (*@\hlorange{\textbf{yield}}@*) From(item.create()) + raise Return(result) + + \end{lstlisting} \\\hline + \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python] + + return 1/sum_p if sum_p \ + (*@\hlorange{\textbf{return}}@*) 0 (*@\hlred{\textbf{else}}@*) + + \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python] + + return 1/sum_p if sum_p \ + (*@\hlorange{\textbf{else}}@*) 0 + + \end{lstlisting} \\\hline + \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python] + + sum(len(v) for v items.values())(*@\hlred{)}@*) + + \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, 
language=python] + + sum(len(v) for v (*@\hlgreen{\textbf{in}}@*) items.values()) + + \end{lstlisting} \\\hline + \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python] + + df.apply(lambda row: list(set(row['ids'(*@\hlorange{)}@*)))) + + \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python] + + df.apply(lambda row: list(set(row['ids'(*@\hlorange{]}@*)))) + + \end{lstlisting} \\\hline + \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python] + + import numpy (*@\hlorange{ad}@*) np + A_concate = np.array([a_0, a_1, a_2,..., a_n]) + + \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python] + + import numpy (*@\hlorange{\textbf{as}}@*) np + A_concate = np.array([a_0, a_1, a_2,..., a_n]) + + \end{lstlisting} \\\hline + \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python] + + class MixIn(object) + def m(): + pass + + class classA(MixIn): + + class classB(MixIn): + + \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python] + + class MixIn(object)(*@\hlgreen{:}@*) + def m(): + pass + + class classA(MixIn): (*@\hlgreen{\textbf{pass}}@*) + + class classB(MixIn): (*@\hlgreen{\textbf{pass}}@*) + + \end{lstlisting} \\\hline + \end{tabular} + \end{figure} + + \clearpage\section{Raw data}\label{sec:raw_prec_data} + + Raw data from Precision@k experiments across snippet length and Levenshtein distance from \S~\ref{sec:stackoverflow}. $|\err\sigma|$ indicates the snippet length and $\Delta$ indicates the Levenshtein distance between the broken code and the human fix, computed over lexical tokens. 
\begin{table}[!h] \centering \begin{tabular}{c|c|cccccccc} \hline\hline & $\Delta$ & \multicolumn{8}{c}{Precision@1} \\ \hline - $|\sigma|$ & & $(0,10)$ & $[10,20)$ & $[20,30)$ & $[30, 40)$ & $[40,50)$ & $[50, 60)$ & $[60,70)$ & $[70, 80)$ \\ \hline + $|\err\sigma|$ & & $(0,10)$ & $[10,20)$ & $[20,30)$ & $[30, 40)$ & $[40,50)$ & $[50, 60)$ & $[60,70)$ & $[70, 80)$ \\ \hline Tidyparse & 1 & 1.00 & 1.00 & 0.98 & 0.98 & 1.00 & 1.00 & 0.95 & 0.90 \\ & 2 & 0.51 & 0.36 & 0.24 & 0.26 & 0.24 & 0.23 & 0.12 & 0.10 \\ @@ -1302,5 +1397,6 @@ & 3 & 0.20 & 0.13 & 0.08 & 0.17 & 0.15 & 0.18 & 0.17 & 0.07 \\ \hline\hline \end{tabular} \end{table} +\end{document} + -\end{document} \ No newline at end of file diff --git a/src/commonMain/kotlin/ai/hypergraph/kaliningraph/repair/SyntaxRepair.kt b/src/commonMain/kotlin/ai/hypergraph/kaliningraph/repair/SyntaxRepair.kt index c698c689..033e5c13 100644 --- a/src/commonMain/kotlin/ai/hypergraph/kaliningraph/repair/SyntaxRepair.kt +++ b/src/commonMain/kotlin/ai/hypergraph/kaliningraph/repair/SyntaxRepair.kt @@ -14,6 +14,7 @@ var CFG_THRESH = 20_000 var MAX_UNIQUE = 20_000 // Maximum number of unique samples to generate var MAX_SAMPLE = 20 // Maximum number of repairs to sample var MAX_TOKENS = 40 // Maximum number of tokens per repair +var MIN_TOKENS = 3 var MAX_RADIUS = 3 var TIMEOUT_MS = 90_000 // Timeout for each repair attempt (default, modify elsewhere) var MAX_REPAIR = 2 // Maximum number of edits per repair