Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Lucene compliant regex filter expression #675

Merged
merged 35 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
21acdd5
Rebase
djkhl Sep 26, 2024
24ad37f
Adding lucine compliance unit test for development
fabian-moessner Sep 27, 2024
a1f2f47
Adding lucene compliance for filter parsing of a rule.
fabian-moessner Oct 2, 2024
e33bebd
Adding logger with deprecation warning for regex_fields
fabian-moessner Oct 7, 2024
af3524e
Add comment and documentation for lucene regex filter annotation
fabian-moessner Oct 7, 2024
62a777a
Quickfix for lucene regex filter
fabian-moessner Oct 7, 2024
7988810
Adjusting Format
fabian-moessner Oct 7, 2024
6815fb4
Adjusting Format 2
fabian-moessner Oct 7, 2024
8e98add
Adjusting Format 3
fabian-moessner Oct 7, 2024
ff0071c
Attempting to remove indeces for regex filter string
fabian-moessner Oct 9, 2024
7695373
Adding notebook for lucene regex filter development
fabian-moessner Oct 9, 2024
fdf8c37
WIP notebook for lucene regex filter development
fabian-moessner Oct 10, 2024
a6fcb3e
Adding Notebook for lucene regex filter testing.
Oct 22, 2024
1c8a302
Adding Notebook for lucene regex filter testing same results as unit …
Oct 23, 2024
5a3d347
Adding first running version of lucene regex filter
Oct 23, 2024
dbf63d2
Improving notebook for lucene conform regex filter.
Oct 24, 2024
b88683d
Improving notebook for lucene conform regex filter 2.
Oct 24, 2024
9d0ce73
Slight improve
Oct 24, 2024
04a0a09
Bug fix in regex notebook.
Oct 24, 2024
74361b5
Adding Deprecated Warning
Oct 24, 2024
41b0e6a
Removing temporary test
Oct 25, 2024
d6dc4c4
Adding rule tests for lucene compliance
Oct 25, 2024
5b190b3
Black formatting
Oct 25, 2024
4adeb42
Black formatting
Oct 25, 2024
002e09a
Remove prototypey
Oct 28, 2024
c1fb5ad
add changelog entry and some prototypey things that actually do nothi…
djkhl Sep 26, 2024
029766d
Adding lucine compliance unit test for development
fabian-moessner Sep 27, 2024
e14fe60
Adding lucene compliance for filter parsing of a rule.
fabian-moessner Oct 2, 2024
68aa1ca
Quickfix for lucene regex filter
fabian-moessner Oct 7, 2024
148b36c
Adjusting Format 2
fabian-moessner Oct 7, 2024
e62f4be
Adding Deprecated Warning
Oct 24, 2024
6675c41
Black formatting
Oct 25, 2024
308482d
Add documentation
Oct 28, 2024
4adb5d4
Delete prototypeclass
Oct 28, 2024
42bf7d2
add notebook to documentation
djkhl Oct 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
* adds `desired_cluster_status` option to opensearch output to signal healthy cluster status
* initially run health checks on setup for every configured component
* make `imagePullPolicy` configurable for helm chart deployments
* it is now possible to use Lucene compliant Filter Expressions
* make `terminationGracePeriodSeconds` configurable in helm chart values


Expand Down
1 change: 1 addition & 0 deletions doc/source/development/coding_examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Processor Case Examples
.. toctree::
:maxdepth: 1

notebooks/processor_examples/regex.ipynb
notebooks/processor_examples/concatenator.ipynb
notebooks/processor_examples/calculator.ipynb
notebooks/processor_examples/dissector.ipynb
Expand Down
206 changes: 206 additions & 0 deletions doc/source/development/notebooks/processor_examples/regex.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lucene regex filter\n",
"This notebook contains an example of a filter with a Lucene-compliant regular expression. \n",
"A concatenator that merges different fields from an event is used as the processor for demonstrating the filter function. \n",
"\n",
"Until now it was necessary to flag keys of values that contain a regular expression with regex_fields. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"document = {\n",
" 'data_stream': {\n",
" 'dataset': 'windows', \n",
" 'namespace': 'devopslab', \n",
" 'type': 'logs'\n",
" }, \n",
" '_op_type': 'create'\n",
" }\n",
"\n",
"expected = {\n",
" 'data_stream': {\n",
" 'dataset': 'windows', \n",
" 'namespace': 'devopslab', \n",
" 'type': 'logs'\n",
" }, \n",
" '_op_type': 'create', \n",
" '_index': 'logs-windows-devopslab'\n",
" }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define process"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0,\"../../../../../\")\n",
"import tempfile\n",
"from copy import deepcopy\n",
"from pathlib import Path\n",
"\n",
"from unittest import mock\n",
"from logprep.factory import Factory\n",
"\n",
"rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n",
"rule_path.mkdir(exist_ok=True)\n",
"rule_file = rule_path / \"data-stream.yml\"\n",
"\n",
"if rule_file.exists():\n",
" rule_file.unlink()\n",
"\n",
"processor_config = {\n",
" \"myconcatenator\":{ \n",
" \"type\": \"concatenator\",\n",
" \"specific_rules\": [str(rule_path)],\n",
" \"generic_rules\": [\"/dev\"],\n",
" }\n",
" }\n",
"\n",
"def concat_with_rule(rule_yaml):\n",
" mydocument = deepcopy(document)\n",
" if rule_file.exists():\n",
" rule_file.unlink()\n",
" rule_file.write_text(rule_yaml)\n",
" concatenator = Factory.create(processor_config)\n",
" print(f\"before: {mydocument}\")\n",
" concatenator.process(mydocument)\n",
" print(f\"after: {mydocument}\")\n",
" print(mydocument == expected)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### regex_fields version"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Deprecated]: regex_fields are no longer necessary. Use Lucene regex annotation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"[Deprecation warning]: regex_fields are no longer necessary. Use lucene regex annotation.\n",
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
"True\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: \".*lo.*\"' \n",
"regex_fields:\n",
" - \"data_stream.type\"\n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
" - data_stream.dataset\n",
" - data_stream.namespace\n",
" target_field: _index\n",
" separator: \"-\"\n",
" overwrite_target: false\n",
" delete_source_fields: false\n",
"\"\"\"\n",
"\n",
"concat_with_rule(rule_yaml)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lucene conform version without the need of regex_fields"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
"True\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: \"/.*lo.*/\"' \n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
" - data_stream.dataset\n",
" - data_stream.namespace\n",
" target_field: _index\n",
" separator: \"-\"\n",
" overwrite_target: false\n",
" delete_source_fields: false\n",
"\"\"\"\n",
"concat_with_rule(rule_yaml)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
},
"vscode": {
"interpreter": {
"hash": "586280540a85d3e21edc698fe7b86af2848b9b02644e6c22463da25c40a3f1be"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
2 changes: 1 addition & 1 deletion logprep/filter/expression/filter_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
from abc import ABC, abstractmethod
from itertools import chain, zip_longest
from typing import List, Any
from typing import Any, List


class FilterExpressionError(BaseException):
Expand Down
60 changes: 49 additions & 11 deletions logprep/filter/lucene_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,18 @@
------------

It is possible to use regex expressions to match values.
For this, the field with the regex pattern must be added to the optional field
To be recognized as a regular expression, the filter value has to start and end with
:code:`/`.


.. code-block:: yaml
:linenos:
:caption: Example

filter: 'ip_address: "/192\.168\.0\..*/"'


[Deprecated, but still functional] The field with the regex pattern must be added to the optional field
:code:`regex_fields` in the rule definition.

In the following example the field :code:`ip_address` is defined as regex field.
Expand All @@ -84,24 +95,39 @@
from itertools import chain, zip_longest

# pylint: enable=anomalous-backslash-in-string
from typing import List, Union, Optional
from typing import List, Optional, Union

import logging
import luqum
from luqum.parser import parser, ParseSyntaxError, IllegalCharacterError
from luqum.tree import OrOperation, AndOperation, Group, FieldGroup, SearchField, Phrase, Word, Not
from luqum.parser import IllegalCharacterError, ParseSyntaxError, parser
from luqum.tree import (
AndOperation,
FieldGroup,
Group,
Not,
OrOperation,
Phrase,
Regex,
SearchField,
Word,
)

from logprep.filter.expression.filter_expression import (
Or,
Always,
And,
StringFilterExpression,
SigmaFilterExpression,
RegExFilterExpression,
Not as NotExpression,
Exists,
Null,
Always,
FilterExpression,
)
from logprep.filter.expression.filter_expression import Not as NotExpression
from logprep.filter.expression.filter_expression import (
Null,
Or,
RegExFilterExpression,
SigmaFilterExpression,
StringFilterExpression,
)

logger = logging.getLogger("LuceneFilter")


class LuceneFilterError(BaseException):
Expand Down Expand Up @@ -309,10 +335,22 @@ def _get_filter_expression(
return RegExFilterExpression(key[:-1] + key_and_modifier[:-1], value)

dotted_field = ".".join(key)

if self._special_fields.items():
for sf_key, sf_value in self._special_fields.items():
if sf_value is True or dotted_field in sf_value:
if sf_key == "regex_fields":
logger.warning(
"[Deprecated]: regex_fields are no longer necessary. "
"Use Lucene regex annotation."
)

return self._special_fields_map[sf_key](key, value)

if value.startswith("/") and value.endswith("/"):
value = value.strip("/")
return RegExFilterExpression(key, value)

return StringFilterExpression(key, value)

@staticmethod
Expand Down
41 changes: 36 additions & 5 deletions tests/unit/filter/test_lucene_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,20 @@
import pytest
from pytest import raises

from logprep.filter.lucene_filter import LuceneFilter, LuceneFilterError, LuceneTransformer
from logprep.filter.expression.filter_expression import (
StringFilterExpression,
RegExFilterExpression,
Or,
And,
Null,
Always,
Exists,
Not,
Null,
Or,
RegExFilterExpression,
StringFilterExpression,
)
from logprep.filter.lucene_filter import (
LuceneFilter,
LuceneFilterError,
LuceneTransformer,
)


Expand Down Expand Up @@ -451,3 +455,30 @@ def test_create_filter_success(self, testcase, input_str, cleaned_str):
def test_create_filter_error(self, testcase, input_str, message):
with raises(LuceneFilterError, match=re.escape(message)):
LuceneFilter.create(f'foo: "{input_str}"')

def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self):
    """Two slash-delimited values joined by AND both yield regex expressions.

    The expected patterns carry no surrounding ``/`` characters.
    """
    query = 'regex_key_one: "/.*value.*/" AND regex_key_two: "/.*value.*/"'
    expected = And(
        RegExFilterExpression(["regex_key_one"], ".*value.*"),
        RegExFilterExpression(["regex_key_two"], ".*value.*"),
    )

    assert LuceneFilter.create(query) == expected

def test_creates_lucene_compliance_filter_one_regex_key(self):
    """A single slash-delimited value yields one regex expression without the slashes."""
    expected = RegExFilterExpression(["regex_key_one"], ".*value.*")

    assert LuceneFilter.create('regex_key_one: "/.*value.*/"') == expected

def test_creates_lucene_compliance_filter_one_matching_one_missmatch_regex_key_of_two(self):
    """Only the slash-delimited value becomes a regex; the plain value stays a string match."""
    query = 'regex_key_one: "/.*value.*/" AND key_two: "value"'
    regex_part = RegExFilterExpression(["regex_key_one"], ".*value.*")
    string_part = StringFilterExpression(["key_two"], "value")

    assert LuceneFilter.create(query) == And(regex_part, string_part)
Loading
Loading