Skip to content

Commit

Permalink
Merge pull request #15 from trailofbits/13-ignore-list-edits
Browse files Browse the repository at this point in the history
Ignore List Edits
  • Loading branch information
ESultanik authored May 19, 2020
2 parents cee6b0e + 6159aeb commit 3563ae2
Show file tree
Hide file tree
Showing 17 changed files with 351 additions and 110 deletions.
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,20 @@ Likewise, use the `--join-dict-items` or `-jd` option to suppress linebreaks aft
3
], "bar": "baz"}
```
Finally, use `--condensed` or `-j` to apply both of these options:
Use `--condensed` or `-j` to apply both of these options:
```json
{"foo": [1, 2, 3], "bar": "baz"}
```

The `--only-edits` or `-e` option will print out a list of edits rather than applying them to the input file in place.

### Matching Options
By default, Graphtage tries to match all possible pairs of elements in a dictionary. While computationally tractable,
this can sometimes be onerous for input files with huge dictionaries. The `--no-key-edits` or `-k` option will instead
only attempt to match dictionary items that share the same key, drastically reducing computation.
only attempt to match dictionary items that share the same key, drastically reducing computation. Likewise, the
`--no-list-edits` or `-l` option will not consider interstitial insertions and removals when comparing two lists. The
`--no-list-edits-when-same-length` or `-ll` option is a less drastic version of `-l` that will behave normally for lists
that are of different lengths, but behave like `-l` for lists that are of the same length.

### ANSI Color
By default, Graphtage will only use ANSI color in its output if it is run from a TTY. If, for example, you would like
Expand Down
24 changes: 21 additions & 3 deletions graphtage/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from . import version
from .printer import HTMLPrinter, Printer
from .utils import Tempfile
from .yaml import YAMLFormatter


log = logging.getLogger('graphtage')
Expand Down Expand Up @@ -145,6 +144,19 @@ def main(argv=None) -> int:
action='store_true',
help='only match dictionary entries if they share the same key. This drastically reduces computation.'
)
list_edit_group = parser.add_mutually_exclusive_group()
list_edit_group.add_argument(
'--no-list-edits',
'-l',
action='store_true',
help='do not consider removal and insertion when comparing lists'
)
list_edit_group.add_argument(
'--no-list-edits-when-same-length',
'-ll',
action='store_true',
help='do not consider removal and insertion when comparing lists that are the same length'
)
parser.add_argument(
'--no-status',
action='store_true',
Expand Down Expand Up @@ -257,17 +269,23 @@ def printer_type(*pos_args, **kwargs):
else:
match_unless = None

options = graphtage.BuildOptions(
allow_key_edits=not args.no_key_edits,
allow_list_edits=not args.no_list_edits,
allow_list_edits_when_same_length=not args.no_list_edits_when_same_length
)

with printer:
with PathOrStdin(args.FROM_PATH) as from_path:
with PathOrStdin(args.TO_PATH) as to_path:
from_format = graphtage.get_filetype(from_path, from_mime)
to_format = graphtage.get_filetype(to_path, to_mime)
from_tree = from_format.build_tree_handling_errors(from_path, allow_key_edits=not args.no_key_edits)
from_tree = from_format.build_tree_handling_errors(from_path, options)
if isinstance(from_tree, str):
sys.stderr.write(from_tree)
sys.stderr.write('\n\n')
sys.exit(1)
to_tree = to_format.build_tree_handling_errors(to_path, allow_key_edits=not args.no_key_edits)
to_tree = to_format.build_tree_handling_errors(to_path, options)
if isinstance(to_tree, str):
sys.stderr.write(to_tree)
sys.stderr.write('\n\n')
Expand Down
29 changes: 29 additions & 0 deletions graphtage/bounds.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
"""

import logging
from functools import wraps
from typing import Iterable, Iterator, Optional, TypeVar, Union
from typing_extensions import Protocol

Expand All @@ -30,6 +32,9 @@
from .fibonacci import FibonacciHeap


log = logging.getLogger(__name__)


class Infinity:
"""A class for representing infinite values. This is primarily used for unbounded ranges."""
def __init__(self, positive=True):
Expand Down Expand Up @@ -230,6 +235,30 @@ def bounds(self) -> Range:
raise NotImplementedError(f"Class {self.__class__.__name__} must implement bounds")


def repeat_until_tightened(func):
    """Decorate a bounds-tightening method so it is re-invoked until the object's bounds improve.

    Intended for :meth:`Bounded.tighten_bounds`. The value returned by the decorated function is ignored;
    progress is judged solely by comparing ``self.bounds()`` before and after each call.

    Args:
        func: The method to wrap. It is called as ``func(self, *args, **kwargs)``.

    Returns:
        A wrapper that returns :const:`False` without calling ``func`` if the bounds were already
        definitive, and :const:`True` once the bounds are definitive or strictly tighter than they were when
        the wrapper was entered.

    """
    @wraps(func)
    def wrapper(self: Bounded, *args, **kwargs):
        starting_bounds = self.bounds()
        if starting_bounds.definitive():
            # Nothing left to tighten, so the wrapped function is never invoked.
            return False

        tightened = False
        while not tightened:
            func(self, *args, **kwargs)
            new_bounds = self.bounds()
            loosened = (
                new_bounds.lower_bound < starting_bounds.lower_bound
                or new_bounds.upper_bound > starting_bounds.upper_bound
            )
            if loosened:
                # The bounds got worse on at least one side: report it and try again.
                log.warning(f"The most recent call to {func} on {self} returned bounds {new_bounds} when the previous bounds were {starting_bounds}")
                continue
            tightened = (
                new_bounds.definitive()
                or new_bounds.lower_bound > starting_bounds.lower_bound
                or new_bounds.upper_bound < starting_bounds.upper_bound
            )
        return True

    return wrapper


class ConstantBound(Bounded):
"""An object with constant bounds."""
def __init__(self, value: RangeValue):
Expand Down
17 changes: 9 additions & 8 deletions graphtage/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import csv
from io import StringIO
from typing import Optional

from . import graphtage, json
from .json import JSONFormatter
Expand All @@ -30,17 +31,17 @@ def __eq__(self, other: 'CSVNode'):
return self._children == other._children or (not self and not other)


def build_tree(path: str, allow_key_edits=True, *args, **kwargs) -> CSVNode:
def build_tree(path: str, options: Optional[graphtage.BuildOptions] = None, *args, **kwargs) -> CSVNode:
"""Constructs a :class:`CSVNode` from a CSV file.
The file is parsed using Python's :func:`csv.reader`. The elements in each row are constructed by delegating to
:func:`graphtage.json.build_tree`::
CSVRow([json.build_tree(i, allow_key_edits=allow_key_edits) for i in row])
CSVRow([json.build_tree(i, options=options) for i in row])
Args:
path: The path to the file to be parsed.
allow_key_edits: This is effectively ignored since CSV files cannot contain mappings.
options: Optional build options to pass on to :meth:`graphtage.json.build_tree`.
*args: Any extra positional arguments are passed on to :func:`csv.reader`.
**kwargs: Any extra keyword arguments are passed on to :func:`csv.reader`.
Expand All @@ -51,7 +52,7 @@ def build_tree(path: str, allow_key_edits=True, *args, **kwargs) -> CSVNode:
csv_data = []
with open(path) as f:
for row in csv.reader(f, *args, **kwargs):
rowdata = [json.build_tree(i, allow_key_edits=allow_key_edits) for i in row]
rowdata = [json.build_tree(i, options=options) for i in row]
for col in rowdata:
if isinstance(col, graphtage.StringNode):
col.quoted = False
Expand Down Expand Up @@ -164,12 +165,12 @@ def __init__(self):
'text/csv'
)

def build_tree(self, path: str, allow_key_edits: bool = True) -> TreeNode:
def build_tree(self, path: str, options: Optional[graphtage.BuildOptions] = None) -> TreeNode:
"""Equivalent to :func:`build_tree`"""
return build_tree(path, allow_key_edits=allow_key_edits)
return build_tree(path, options=options)

def build_tree_handling_errors(self, path: str, allow_key_edits: bool = True) -> TreeNode:
return self.build_tree(path=path, allow_key_edits=allow_key_edits)
def build_tree_handling_errors(self, path: str, options: Optional[graphtage.BuildOptions] = None) -> TreeNode:
return self.build_tree(path=path, options=options)

def get_default_formatter(self) -> CSVFormatter:
return CSVFormatter.DEFAULT_INSTANCE
76 changes: 59 additions & 17 deletions graphtage/graphtage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
from typing import Any, Collection, Dict, Generic, Iterable, Iterator, List, Optional, Tuple, Type, TypeVar, Union

from .bounds import Range
from .edits import AbstractEdit, EditCollection, EditSequence
from .edits import AbstractEdit, EditCollection
from .edits import Insert, Match, Remove, Replace, AbstractCompoundEdit
from .levenshtein import EditDistance, levenshtein_distance
from .multiset import MultiSetEdit
from .printer import Back, Fore, NullANSIContext, Printer
from .sequences import SequenceEdit, SequenceNode
from .sequences import FixedLengthSequenceEdit, SequenceEdit, SequenceNode
from .tree import ContainerNode, Edit, GraphtageFormatter, TreeNode
from .utils import HashableCounter

Expand Down Expand Up @@ -213,7 +213,7 @@ def print(self, printer: Printer):
self.value.print(printer)

def calculate_total_size(self):
return self.key.total_size + self.value.total_size
return self.key.total_size + self.value.total_size + 2

def __lt__(self, other):
""" Compares this key/value pair to another.
Expand Down Expand Up @@ -278,8 +278,22 @@ def __str__(self):
class ListNode(SequenceNode[Tuple[T, ...]], Generic[T]):
"""A node containing an ordered sequence of nodes."""

def __init__(self, nodes: Iterable[T]):
def __init__(
self, nodes: Iterable[T],
allow_list_edits: bool = True,
allow_list_edits_when_same_length: bool = True
):
"""Initializes a List node.
Args:
nodes: The set of nodes in this list.
allow_list_edits: Whether to consider removal and insertion when editing this list.
allow_list_edits_when_same_length: Whether to consider removal and insertion when comparing this list to
another list of the same length.
"""
super().__init__(tuple(nodes))
self.allow_list_edits: bool = allow_list_edits
self.allow_list_edits_when_same_length: bool = allow_list_edits_when_same_length

def to_obj(self):
return [n.to_obj() for n in self]
Expand All @@ -296,15 +310,15 @@ def container_type(self) -> Type[Tuple[T, ...]]:

def edits(self, node: TreeNode) -> Edit:
if isinstance(node, ListNode):
if len(self._children) == len(node._children) == 0:
return Match(self, node, 0)
elif len(self._children) == len(node._children) == 1:
return EditSequence(from_node=self, to_node=node, edits=iter((
Match(self, node, 0),
self._children[0].edits(node._children[0])
)))
elif self._children == node._children:
if self._children == node._children:
return Match(self, node, 0)
elif not self.allow_list_edits or (len(self._children) == len(node._children) and (
not self.allow_list_edits_when_same_length or len(self._children) == 1
)):
return FixedLengthSequenceEdit(
from_node=self,
to_node=node
)
else:
if self.all_children_are_leaves() and node.all_children_are_leaves():
insert_remove_penalty = 0
Expand Down Expand Up @@ -348,7 +362,7 @@ def edits(self, node: TreeNode) -> Edit:
return Replace(self, node)

def calculate_total_size(self):
return sum(c.total_size * count for c, count in self._children.items())
return sum((c.total_size + 1) * count for c, count in self._children.items())

def __len__(self):
return sum(self._children.values())
Expand Down Expand Up @@ -861,6 +875,34 @@ def __init__(cls, name, bases, clsdict):
super().__init__(name, bases, clsdict)


class BuildOptions:
    """A class for passing options to tree building functions in :class:`Filetype`.

    The built-in options (``allow_key_edits``, ``allow_list_edits``, and
    ``allow_list_edits_when_same_length``) default to :const:`True`. Any other option that was never set reads
    as :const:`False`.

    """

    def __init__(self, *,
                 allow_key_edits=True,
                 allow_list_edits=True,
                 allow_list_edits_when_same_length=True,
                 **kwargs
                 ):
        """Initializes the options. All keyword values will be set as attributes of this class.

        Args:
            allow_key_edits: Whether to consider editing keys when matching :class:`KeyValuePairNode` objects.
            allow_list_edits: Whether to consider insert and remove edits to lists.
            allow_list_edits_when_same_length: Whether to consider insert and remove edits on lists that are
                the same length.
            **kwargs: Any additional options; each one is stored as an attribute of the same name.

        """
        self.allow_key_edits = allow_key_edits
        """Whether to consider editing keys when matching :class:`KeyValuePairNode` objects"""
        self.allow_list_edits = allow_list_edits
        """Whether to consider insert and remove edits to lists"""
        self.allow_list_edits_when_same_length = allow_list_edits_when_same_length
        """Whether to consider insert and remove edits on lists that are the same length"""
        for attr, value in kwargs.items():
            setattr(self, attr, value)

    def __getattr__(self, item):
        """Default all undefined options to :const:`False`.

        This is only invoked when normal attribute lookup fails. Special ("dunder") names are *not* given a
        default: protocols such as :mod:`copy` and :mod:`pickle` probe for optional hooks like
        ``__deepcopy__`` and ``__setstate__`` and would otherwise receive :const:`False` and try to call it.

        Raises:
            AttributeError: If ``item`` is a special name of the form ``__name__``.

        """
        if item.startswith('__') and item.endswith('__'):
            raise AttributeError(f"{self.__class__.__name__!r} object has no attribute {item!r}")
        return False


class Filetype(metaclass=FiletypeWatcher):
"""Abstract base class from which all Graphtage file formats should extend.
Expand Down Expand Up @@ -898,12 +940,12 @@ def __init__(self, type_name: str, default_mimetype: str, *mimetypes: str):
FILETYPES_BY_TYPENAME[self.name] = self

@abstractmethod
def build_tree(self, path: str, allow_key_edits: bool = True) -> TreeNode:
def build_tree(self, path: str, options: Optional[BuildOptions] = None) -> TreeNode:
"""Builds an intermediate representation tree from a file of this :class:`Filetype`.
Args:
path: Path to the file to parse
allow_key_edits: Whether to allow dictionary keys to be editable
options: An optional set of options for building the tree
Returns:
TreeNode: The root tree node of the provided file
Expand All @@ -912,14 +954,14 @@ def build_tree(self, path: str, allow_key_edits: bool = True) -> TreeNode:
raise NotImplementedError()

@abstractmethod
def build_tree_handling_errors(self, path: str, allow_key_edits: bool = True) -> Union[str, TreeNode]:
def build_tree_handling_errors(self, path: str, options: Optional[BuildOptions] = None) -> Union[str, TreeNode]:
"""Same as :meth:`Filetype.build_tree`, but it should return a human-readable error string on failure.
This function should never throw an exception.
Args:
path: Path to the file to parse
allow_key_edits: Whether to allow dictionary keys to be editable
options: An optional set of options for building the tree
Returns:
Union[str, TreeNode]: On success, the root tree node, or a string containing the error message on failure.
Expand Down
Loading

0 comments on commit 3563ae2

Please sign in to comment.