{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Lucene regex filter\n", "This presentation contains an example of a filter with a Lucene-conformant regular expression. \n", "A concatenator that merges different fields from an event is used as the processor to demonstrate the filter function. \n", "\n", "Until now, it was necessary to list a key in regex_fields whenever its value contained a regular expression. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set up the document and define the concatenator processor to test the filter" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.insert(0,\"../../../../../\")\n", "import tempfile\n", "from copy import deepcopy\n", "from pathlib import Path\n", "\n", "from unittest import mock\n", "from logprep.factory import Factory\n", "\n", "document = {\n", " 'data_stream': {\n", " 'dataset': 'windows', \n", " 'namespace': 'devopslab', \n", " 'type': '/logs/'\n", " }, \n", " '_op_type': 'create'\n", " }\n", "\n", "expected = {\n", " 'data_stream': {\n", " 'dataset': 'windows', \n", " 'namespace': 'devopslab', \n", " 'type': 'logs'\n", " }, \n", " '_op_type': 'create', \n", " '_index': 'logs-windows-devopslab'\n", " }\n", "\n", "rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n", "rule_path.mkdir(exist_ok=True)\n", "rule_file = rule_path / \"data-stream.yml\"\n", "\n", "if rule_file.exists():\n", " rule_file.unlink()\n", "\n", "processor_config = {\n", " \"myconcatenator\":{ \n", " \"type\": \"concatenator\",\n", " \"rules\": [str(rule_path), \"/dev\"],\n", " }\n", " }\n", "\n", "concatenator = Factory.create(processor_config)\n", "\n", "def concat_with_rule(rule_yaml):\n", " mydocument = deepcopy(document)\n", " if rule_file.exists():\n", " rule_file.unlink()\n", " rule_file.write_text(rule_yaml)\n", " concatenator = Factory.create(processor_config)\n", " print(f\"before: {mydocument}\")\n", " 
concatenator.process(mydocument)\n", " print(f\"after: {mydocument}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Former version with explicit regex_fields annotation" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Deprecated]: regex_fields are no longer necessary. Use Lucene regex annotation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n", "after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n" ] } ], "source": [ "rule_yaml = \"\"\"---\n", "filter: 'data_stream.type: \".*lo.*\"'\n", "regex_fields:\n", " - \"data_stream.type\"\n", "concatenator:\n", " source_fields:\n", " - data_stream.type\n", " - data_stream.dataset\n", " - data_stream.namespace\n", " target_field: _index\n", " separator: \"-\"\n", " overwrite_target: false\n", " delete_source_fields: false\n", "\"\"\"\n", "\n", "\n", "concat_with_rule(rule_yaml)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### New Lucene-conformant version without the need for regex_fields" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n", "after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n" ] } ], "source": [ "rule_yaml = \"\"\"---\n", "filter: 'data_stream.type: /.*log.*/' \n", "concatenator:\n", " source_fields:\n", " - data_stream.type\n", " - data_stream.dataset\n", " - data_stream.namespace\n", " target_field: _index\n", " separator: \"-\"\n", " overwrite_target: 
false\n", " delete_source_fields: false\n", "\"\"\"\n", "concat_with_rule(rule_yaml)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Escaping a slash: one backslash is needed for the YAML format, the other one for the Lucene syntax. " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n", "after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n" ] } ], "source": [ "rule_yaml = \"\"\"---\n", "filter: 'data_stream.type: /\\/lo.*/' \n", " \n", "concatenator:\n", " source_fields:\n", " - data_stream.type\n", " - data_stream.dataset\n", " - data_stream.namespace\n", " target_field: _index\n", " separator: \"-\"\n", " overwrite_target: false\n", " delete_source_fields: false\n", "\"\"\"\n", "concat_with_rule(rule_yaml)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" }, "vscode": { "interpreter": { "hash": "586280540a85d3e21edc698fe7b86af2848b9b02644e6c22463da25c40a3f1be" } } }, "nbformat": 4, "nbformat_minor": 4 }