Skip to content


chg: raise an error if a pyxform reference is malformed
Browse files Browse the repository at this point in the history
  - new module to check pyxform reference syntax
  - unit test generates 2874 unique usage permutations
  - remove extra iteration of sheet data in replace_smart_quotes by
    combining it with clean_text_values and calling that instead
  - move expression parser from utils to here
  - update usages of lexer to reference a cached wrapper func instead
  - use compiled regex patterns directly instead of importing re as well
  • Loading branch information
lindsay-stevens committed Oct 29, 2024
1 parent a724215 commit 1a01e17
Show file tree
Hide file tree
Showing 12 changed files with 305 additions and 141 deletions.
99 changes: 97 additions & 2 deletions pyxform/parsing/
Original file line number Diff line number Diff line change
@@ -1,13 +1,108 @@
import re
from import Iterable
from functools import lru_cache
from typing import NamedTuple

from pyxform.utils import parse_expression

def get_expression_lexer() -> re.Scanner:
Get an expression lexer (scanner) for parsing.
# ncname regex adapted from eulxml
# (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
# They in turn adapted it from
# and
namestartchar = (
+ r"[\u0370-\u037d]|[\u037f-\u1fff]|[\u200c-\u200d]|[\u2070-\u218f]|"
+ r"[\u2c00-\u2fef]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]"
+ r"|[\U00010000-\U000EFFFF])"
# additional characters allowed in NCNames after the first character
namechar_extra = r"[-.0-9\xb7\u0300-\u036f\u203f-\u2040]"
ncname_regex = (
r"(" + namestartchar + r")(" + namestartchar + r"|" + namechar_extra + r")*"
ncname_regex = ncname_regex + r"(:" + ncname_regex + r")?"

# Date literal: optional leading minus, then YYYY-MM-DD.
date_regex = r"-?\d{4}-\d{2}-\d{2}"
# Time literal: HH:MM:SS, optional fractional seconds, optional offset or "Z".
# The fraction must be digits after the dot; the previous `\.\s+` required
# whitespace there, so a value such as "12:30:45.123" was never matched whole.
time_regex = r"\d{2}:\d{2}:\d{2}(\.\d+)?(((\+|\-)\d{2}:\d{2})|Z)?"
# Combined form: a date and a time joined by a literal "T".
date_time_regex = date_regex + "T" + time_regex

# Rule order is significant - match priority runs top to bottom.
# Maps token name -> regex pattern. The (pattern, tokenizer) pairs given to the
# scanner are built from this mapping's items, so insertion order is rule order.
lexer_rules = {
"DATETIME": date_time_regex,
"DATE": date_regex,
"TIME": time_regex,
"NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
"OPS_MATH": r"[\*\+\-]|mod|div",
# NOTE(review): regex alternation is tried left to right, so "\<" and "\>" win
# before "\<=" and ">=" are considered; "<=" therefore lexes as two tokens
# ("<" then "="). Listing the two-character operators first would yield a single
# token - confirm no caller relies on the current split before changing it.
"OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=",
"OPS_BOOL": r"and|or",
"OPS_UNION": r"\|",
"OPEN_PAREN": r"\(",
"CLOSE_PAREN": r"\)",
# NOTE(review): this pattern matches only the literal four-character sequence
# "[]{}", not any single bracket. A character class may have been intended, but
# one placed here would take priority over XPATH_PRED_END below - confirm intent.
"BRACKET": r"\[\]\{\}",
"PARENT_REF": r"\.\.",
"SELF_REF": r"\.",
"PATH_SEP": r"\/", # javarosa.xpath says "//" is an "unsupported construct".
"SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
"COMMA": r",",
"WHITESPACE": r"\s+",
# A "${name}" reference, optionally "${name#name}".
"PYXFORM_REF": r"\$\{" + ncname_regex + r"(#" + ncname_regex + r")?" + r"\}",
"FUNC_CALL": ncname_regex + r"\(",
"XPATH_PRED_START": ncname_regex + r"\[",
"XPATH_PRED_END": r"\]",
"URI_SCHEME": ncname_regex + r"://",
"NAME": ncname_regex, # Must be after rules containing ncname_regex.
"OTHER": r".+?", # Catch any other character so that parsing doesn't stop.

def get_tokenizer(name):
def tokenizer(scan, value):
return ExpLexerToken(name, value, scan.match.start(), scan.match.end())

return tokenizer

lexicon = [(v, get_tokenizer(k)) for k, v in lexer_rules.items()]
# re.Scanner is undocumented but has been around since at least 2003
return re.Scanner(lexicon)

# Scanner takes a few 100ms to compile, so build it once and share the _EXPRESSION_LEXER instance defined below.
class ExpLexerToken(NamedTuple):
    """A single token emitted by the expression lexer."""

    # Name of the lexer rule that produced the token, e.g. "NAME".
    name: str
    # The text that the rule matched.
    value: str
    # Start and end offsets of the match within the scanned text.
    start: int
    end: int

_EXPRESSION_LEXER = get_expression_lexer()

@lru_cache(maxsize=128)
def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
    """
    Parse an expression.

    Use this function instead of _EXPRESSION_LEXER to take advantage of caching.
    Results are memoised per input string, so the returned token list is shared
    between callers with the same input and must be treated as read-only.

    :param text: The expression.
    :return: The parsed tokens, and any remaining unparsed text.
    """
    tokens, remainder = _EXPRESSION_LEXER.scan(text)
    return tokens, remainder

def is_single_token_expression(expression: str, token_types: Iterable[str]) -> bool:
Does the expression contain a single token of one of the provided token types?
tokens, _ = parse_expression(text=expression.strip())
tokens, _ = parse_expression(expression.strip())
if 1 == len(tokens) and tokens[0].name in token_types:
return True
Expand Down
9 changes: 4 additions & 5 deletions pyxform/parsing/
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
from typing import TYPE_CHECKING

from pyxform.utils import BRACKETED_TAG_REGEX, EXPRESSION_LEXER, ExpLexerToken, node
from pyxform.parsing.expression import ExpLexerToken, parse_expression
from pyxform.utils import BRACKETED_TAG_REGEX, node

from pyxform.survey import Survey
Expand Down Expand Up @@ -37,7 +37,7 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
path_enter = False
pred_enter = False
last_token = None
tokens, _ = EXPRESSION_LEXER.scan(xml_text)
tokens, _ = parse_expression(xml_text)
boundaries = []

for t in tokens:
Expand Down Expand Up @@ -111,8 +111,7 @@ def replace_with_output(xml_text: str, context: "SurveyElement", survey: "Survey
old_str = xml_text[start:end]
# Pass the new string through the pyxform reference replacer.
# noinspection PyProtectedMember
new_str = re.sub(
new_str = BRACKETED_TAG_REGEX.sub(
lambda m: survey._var_repl_function(m, context),
Expand Down
96 changes: 3 additions & 93 deletions pyxform/
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,18 @@
import re
from io import StringIO
from json.decoder import JSONDecodeError
from typing import Any, NamedTuple
from typing import Any
from xml.dom import Node
from xml.dom.minidom import Element, Text, _write_data

from defusedxml.minidom import parseString

from pyxform import constants as const
from pyxform.errors import PyXFormError
from pyxform.parsing.expression import parse_expression

SEP = "_"

INVALID_XFORM_TAG_REGEXP = r"[^a-zA-Z:_][^a-zA-Z:_0-9\-.]*"

INVALID_XFORM_TAG_REGEXP = re.compile(r"[^a-zA-Z:_][^a-zA-Z:_0-9\-.]*")
BRACKETED_TAG_REGEX = re.compile(r"\${(last-saved#)?(.*?)}")
LAST_SAVED_REGEX = re.compile(r"\${last-saved#(.*?)}")
Expand Down Expand Up @@ -334,94 +333,5 @@ def levenshtein_distance(a: str, b: str) -> int:
return v0[n]

def get_expression_lexer() -> re.Scanner:
Get a expression lexer (scanner) for parsing.
# ncname regex adapted from eulxml
# (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
# They in turn adapted it from
# and
namestartchar = (
+ r"[\u0370-\u037d]|[\u037f-\u1fff]|[\u200c-\u200d]|[\u2070-\u218f]|"
+ r"[\u2c00-\u2fef]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]"
+ r"|[\U00010000-\U000EFFFF])"
# additional characters allowed in NCNames after the first character
namechar_extra = r"[-.0-9\xb7\u0300-\u036f\u203f-\u2040]"
ncname_regex = (
r"(" + namestartchar + r")(" + namestartchar + r"|" + namechar_extra + r")*"
ncname_regex = ncname_regex + r"(:" + ncname_regex + r")?"

date_regex = r"-?\d{4}-\d{2}-\d{2}"
time_regex = r"\d{2}:\d{2}:\d{2}(\.\s+)?(((\+|\-)\d{2}:\d{2})|Z)?"
date_time_regex = date_regex + "T" + time_regex

# Rule order is significant - match priority runs top to bottom.
lexer_rules = {
"DATETIME": date_time_regex,
"DATE": date_regex,
"TIME": time_regex,
"NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
"OPS_MATH": r"[\*\+\-]|mod|div",
"OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=",
"OPS_BOOL": r"and|or",
"OPS_UNION": r"\|",
"OPEN_PAREN": r"\(",
"CLOSE_PAREN": r"\)",
"BRACKET": r"\[\]\{\}",
"PARENT_REF": r"\.\.",
"SELF_REF": r"\.",
"PATH_SEP": r"\/", # javarosa.xpath says "//" is an "unsupported construct".
"SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
"COMMA": r",",
"WHITESPACE": r"\s+",
"PYXFORM_REF": r"\$\{" + ncname_regex + r"(#" + ncname_regex + r")?" + r"\}",
"FUNC_CALL": ncname_regex + r"\(",
"XPATH_PRED_START": ncname_regex + r"\[",
"XPATH_PRED_END": r"\]",
"URI_SCHEME": ncname_regex + r"://",
"NAME": ncname_regex, # Must be after rules containing ncname_regex.
"OTHER": r".+?", # Catch any other character so that parsing doesn't stop.

def get_tokenizer(name):
def tokenizer(scan, value):
return ExpLexerToken(name, value, scan.match.start(), scan.match.end())

return tokenizer

lexicon = [(v, get_tokenizer(k)) for k, v in lexer_rules.items()]
# re.Scanner is undocumented but has been around since at least 2003
return re.Scanner(lexicon)

# Scanner takes a few 100ms to compile so use this shared instance.
class ExpLexerToken(NamedTuple):
name: str
value: str
start: int
end: int

EXPRESSION_LEXER = get_expression_lexer()

def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
Parse a "default" expression, well enough to identify dynamic defaults vs. not.
:param text: The expression.
:return: The parsed tokens, and any remaining unparsed text.
tokens, remainder = EXPRESSION_LEXER.scan(text)
return tokens, remainder

def coalesce(*args):
return next((a for a in args if a is not None), None)
8 changes: 5 additions & 3 deletions pyxform/validators/
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import re

ERROR_MESSAGE_REGEX = re.compile(r"(/[a-z0-9\-_]+(?:/[a-z0-9\-_]+)+)", flags=re.I)

class ErrorCleaner:
"""Cleans up raw error messages from XForm validators for end users."""
Expand All @@ -22,9 +24,9 @@ def _replace_xpath_with_tokens(match):

def _cleanup_errors(error_message):
pattern = r"(/[a-z0-9\-_]+(?:/[a-z0-9\-_]+)+)"
error_message = re.sub(
pattern, ErrorCleaner._replace_xpath_with_tokens, error_message, flags=re.I
error_message = ERROR_MESSAGE_REGEX.sub(
lines = str(error_message).strip().splitlines()
no_dupes = [
Expand Down
53 changes: 53 additions & 0 deletions pyxform/validators/pyxform/
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from errors import PyXFormError
from pyxform import constants as co
from pyxform.parsing.expression import parse_expression

"[row : {row_number}] On the '{sheet}' sheet, the '{column}' value is invalid. "
"Reference expressions must only include question names, and end with '}}'."

# Raise PyXFormError when `value` contains a malformed pyxform reference.
# The value is tokenised with parse_expression and the tokens are scanned in order,
# tracking the token that opened the reference currently being read (if any).
# NOTE(review): several statements appear to be missing from this listing (the
# bodies of the skip checks, the attribute compared against the token names, and
# the construction of `msg`) - confirm against the original module.
def validate_pyxform_reference_syntax(
value: str, sheet_name: str, row_number: int, key: str
) -> None:
# Skip columns in potentially large sheets where references are not allowed.
if sheet_name == co.SURVEY:
if key in (co.TYPE, co.NAME):
elif sheet_name == co.CHOICES:
if key in (co.LIST_NAME_S, co.LIST_NAME_U, co.NAME):
elif sheet_name == co.ENTITIES:
# NOTE(review): `key == (a, b)` compares `key` with a tuple, unlike the `in`
# tests used for the other sheets. With `key: str` (per the signature) this is
# never true, so the entities columns are not skipped; `in` was probably intended.
if key == (co.LIST_NAME_S, co.LIST_NAME_U):

tokens, _ = parse_expression(value)
# The token that opened the reference being read; None when outside a reference.
start_token = None

for t in tokens:
# The start of an expression.
if t is not None and == "PYXFORM_REF_START" and start_token is None:
start_token = t
# Tokens that are part of an expression.
elif start_token is not None:
if == "NAME":
elif == "PYXFORM_REF_END":
start_token = None
sheet=sheet_name, row_number=row_number, column=key
raise PyXFormError(msg)
sheet=sheet_name, row_number=row_number, column=key
raise PyXFormError(msg)

# A reference was opened but never closed before the tokens ran out.
if start_token is not None:
sheet=sheet_name, row_number=row_number, column=key
raise PyXFormError(msg)
4 changes: 1 addition & 3 deletions pyxform/validators/pyxform/
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
Validations for question types.

import re

from pyxform.errors import PyXFormError
from pyxform.parsing.expression import is_single_token_expression
from pyxform.utils import PYXFORM_REFERENCE_REGEX
Expand Down Expand Up @@ -37,7 +35,7 @@ def validate_background_geopoint_trigger(row: dict, row_num: int) -> bool:
def validate_references(referrers: list[tuple[dict, int]], questions: set[str]) -> bool:
"""Triggers must refer to a question that exists."""
for row, row_num in referrers:
matches = re.match(PYXFORM_REFERENCE_REGEX, row["trigger"])
matches = PYXFORM_REFERENCE_REGEX.match(row["trigger"])
if matches is not None:
trigger = matches.groups()[0]
if trigger not in questions:
Expand Down
2 changes: 1 addition & 1 deletion pyxform/validators/pyxform/
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from import Sequence

SheetData = tuple[tuple[str, ...]]
SheetData = tuple[tuple[str, ...], ...]
Warnings = list[str]

Expand Down

0 comments on commit 1a01e17

Please sign in to comment.