Skip to content

Commit

Permalink
feat(search): Implemented additional search connectors
Browse files Browse the repository at this point in the history
- Added tests for additional search connectors
- Disable expensive wildcard queries

Fixes: #3888
  • Loading branch information
albertisfu committed Jan 10, 2025
1 parent 9f67ed6 commit 57e3fe7
Show file tree
Hide file tree
Showing 6 changed files with 374 additions and 68 deletions.
1 change: 0 additions & 1 deletion cl/lib/elasticsearch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,6 @@ def build_text_filter(field: str, value: str) -> List:
if value:
if isinstance(value, str):
validate_query_syntax(value, QueryType.FILTER)

return [
Q(
"query_string",
Expand Down
79 changes: 77 additions & 2 deletions cl/lib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from cl.lib.crypto import sha256
from cl.lib.model_helpers import clean_docket_number, is_docket_number
from cl.lib.types import CleanData
from cl.search.exception import DisallowedWildcardPattern, QueryType


class _UNSPECIFIED:
Expand Down Expand Up @@ -232,9 +233,79 @@ def modify_court_id_queries(query_str: str) -> str:
return modified_query


def check_query_for_disallowed_wildcards(query_string: str) -> bool:
    """Report whether the query contains a wildcard that can be really
    expensive to run.

    https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-wildcard

    Terms are delimited by whitespace or by the boundaries of the string.
    A term is disallowed when it has:

    - ``*`` at the beginning, followed by at least one word character,
      like ``*ing``.
    - ``*`` at the end of a term with fewer than 3 word characters,
      like ``a*``.
    - ``!`` at the beginning of a term with fewer than 3 characters,
      like ``!a``.

    :param query_string: The query string to be checked.
    :return: True if the query string contains a disallowed wildcard,
    otherwise False.
    """
    disallowed_patterns = (
        # Any term that starts with *
        r"(?:^|\s)\*\w+",
        # Any term with fewer than 3 chars that ends with *
        r"(?:^|\s)\w{1,2}\*(?=$|\s)",
        # Any term with fewer than 3 chars that starts with !
        r"(?:^|\s)\![^\s]{1,2}(?=$|\s)",
    )

    # any() already yields a bool, so return it directly.
    return any(
        re.search(pattern, query_string) for pattern in disallowed_patterns
    )


def perform_special_character_replacements(query_string: str) -> str:
    """Perform a series of special character replacements in the given query
    string to clean it up and support the %, &, ! and * search connectors.

    The replacements are applied in this order:

    1. Curly double quotes are replaced by straight double quotes.
    2. `` % `` (the "but not" connector) is replaced by `` NOT ``.
    3. `` & `` is replaced by `` AND ``.
    4. ``!`` (root expander) in front of a word that starts with an ASCII
       letter is dropped and ``*`` is appended to the end of that word.
    5. ``*`` (universal character) directly followed by a word character is
       replaced by ``?``.

    :param query_string: The user query string.
    :return: The transformed query string with the specified replacements
    applied.
    """

    # Replace smart quotes with standard double quotes for consistency.
    query_string = re.sub(r"[“”]", '"', query_string)

    # Replace % (the "but not" connector) with NOT.
    query_string = query_string.replace(" % ", " NOT ")

    # Replace & with AND.
    query_string = query_string.replace(" & ", " AND ")

    # Replace ! (root expander) at the beginning of words with * at the end.
    # The whole word is captured (\w*), not just its leading ASCII letters;
    # otherwise a word such as "!abc123" would become "abc*123" and the
    # universal-character step below would turn that * into ?.
    root_expander_pattern = r"(^|\s)!([a-zA-Z]\w*)"
    root_expander_replacement = r"\1\2*"
    query_string = re.sub(
        root_expander_pattern, root_expander_replacement, query_string
    )

    # Replace * (universal character) that is not at the end of a word with ?.
    universal_char_pattern = r"\*(?=\w)"
    universal_char_replacement = "?"
    query_string = re.sub(
        universal_char_pattern, universal_char_replacement, query_string
    )

    return query_string


def cleanup_main_query(query_string: str) -> str:
"""Enhance the query string with some simple fixes
- Check for expensive wildcards and throw an error if found.
- Perform special character replacements for search connectors.
- Make any numerical queries into phrases (except dates)
- Add hyphens to district docket numbers that lack them
- Ignore tokens inside phrases
Expand All @@ -249,8 +320,12 @@ def cleanup_main_query(query_string: str) -> str:
"""
inside_a_phrase = False
cleaned_items = []
# Replace smart quotes with standard double quotes for consistency.
query_string = re.sub(r"[“”]", '"', query_string)

if check_query_for_disallowed_wildcards(query_string):
raise DisallowedWildcardPattern(QueryType.QUERY_STRING)

query_string = perform_special_character_replacements(query_string)

for item in re.split(r'([^a-zA-Z0-9_\-^~":]+)', query_string):
if not item:
continue
Expand Down
6 changes: 6 additions & 0 deletions cl/search/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,9 @@ class ElasticBadRequestError(APIException):
"Elasticsearch Bad request error. Please review your query."
)
default_code = "bad_request"


class DisallowedWildcardPattern(SyntaxQueryError):
    """Query contains a disallowed wildcard pattern.

    Specialization of SyntaxQueryError that only overrides the error
    message; everything else is inherited from the base class.
    """

    # Human-readable description of the error.
    # NOTE(review): how SyntaxQueryError consumes `message` is not visible
    # here -- confirm against the base class.
    message = "The query contains a disallowed wildcard pattern."
2 changes: 2 additions & 0 deletions cl/search/templates/includes/no_results.html
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ <h2 class="alt">
Did you forget to close one or more parentheses?
{% elif error_message == "unbalanced_quotes" %}
Did you forget to close one or more quotes?
{% elif error_message == "disallowed_wildcard_pattern" %}
The query contains a disallowed wildcard pattern.
{% endif %}
{% else %}
encountered an error.
Expand Down
Loading

0 comments on commit 57e3fe7

Please sign in to comment.