-
Notifications
You must be signed in to change notification settings - Fork 553
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
14 changed files
with
559 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# Logram | ||
|
||
Logram is an automated log parsing technique, which leverages n-gram dictionaries to achieve efficient log parsing. | ||
|
||
Read more information about Logram from the following paper: | ||
|
||
+ Hetong Dai, Heng Li, Che-Shao Chen, Weiyi Shang, and Tse-Hsun (Peter) Chen. [Logram: Efficient Log Parsing Using n-Gram | ||
Dictionaries](https://arxiv.org/pdf/2001.03038.pdf), *IEEE Transactions on Software Engineering (TSE)*, 2020. | ||
|
||
### Running | ||
|
||
The code has been tested in the following environment: | ||
+ python 3.7.6 | ||
+ regex 2022.3.2 | ||
+ pandas 1.0.1 | ||
+ numpy 1.18.1 | ||
+ scipy 1.4.1 | ||
|
||
Run the following scripts to start the demo: | ||
|
||
``` | ||
python demo.py | ||
``` | ||
|
||
Run the following scripts to execute the benchmark: | ||
|
||
``` | ||
python benchmark.py | ||
``` | ||
|
||
### Benchmark | ||
|
||
Running the benchmark script on the Loghub_2k datasets, you can obtain the following results. | ||
|
||
| Dataset | F1_measure | Accuracy | | ||
|:-----------:|:----------|:--------| | ||
| HDFS | 0.990518 | 0.93 | | ||
| Hadoop | 0.78249 | 0.451 | | ||
| Spark | 0.479691 | 0.282 | | ||
| Zookeeper | 0.923936 | 0.7235 | | ||
| BGL | 0.956032 | 0.587 | | ||
| HPC | 0.993748 | 0.9105 | | ||
| Thunderbird | 0.993876 | 0.554 | | ||
| Windows | 0.913735 | 0.694 | | ||
| Linux | 0.541378 | 0.361 | | ||
| Android | 0.975017 | 0.7945 | | ||
| HealthApp | 0.587935 | 0.2665 | | ||
| Apache | 0.637665 | 0.3125 | | ||
| Proxifier | 0.750476 | 0.5035 | | ||
| OpenSSH | 0.979348 | 0.6115 | | ||
| OpenStack | 0.742866 | 0.3255 | | ||
| Mac | 0.892896 | 0.568 | | ||
|
||
|
||
### Citation | ||
|
||
:telescope: If you use our logparser tools or benchmarking results in your publication, please kindly cite the following papers. | ||
|
||
+ [**ICSE'19**] Jieming Zhu, Shilin He, Jinyang Liu, Pinjia He, Qi Xie, Zibin Zheng, Michael R. Lyu. [Tools and Benchmarks for Automated Log Parsing](https://arxiv.org/pdf/1811.03509.pdf). *International Conference on Software Engineering (ICSE)*, 2019. | ||
+ [**DSN'16**] Pinjia He, Jieming Zhu, Shilin He, Jian Li, Michael R. Lyu. [An Evaluation Study on Log Parsing and Its Use in Log Mining](https://jiemingzhu.github.io/pub/pjhe_dsn2016.pdf). *IEEE/IFIP International Conference on Dependable Systems and Networks (DSN)*, 2016. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .src.Logram import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,183 @@ | ||
# ========================================================================= | ||
# Copyright (C) 2016-2023 LOGPAI (https://github.com/logpai). | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# ========================================================================= | ||
|
||
|
||
import sys | ||
sys.path.append("../../") | ||
from logparser.Logram import LogParser | ||
from logparser.utils import evaluator | ||
import os | ||
import pandas as pd | ||
|
||
|
||
input_dir = "../../data/loghub_2k/" # The input directory of log file | ||
output_dir = "Logram_result/" # The output directory of parsing results | ||
|
||
# Per-dataset configuration for the Loghub_2k benchmark. Each entry holds:
#   log_file:        path of the 2k-line sample log, relative to input_dir
#   log_format:      header template; each <Field> becomes a named capture
#                    group, and <Content> is the free-text message to parse
#   regex:           preprocessing patterns whose matches are masked as <*>
#   doubleThreshold / triThreshold:
#                    NOTE(review): presumably the 2-gram / 3-gram dictionary
#                    frequency thresholds from the Logram paper — confirm
#                    against the LogParser implementation.
benchmark_settings = {
    "HDFS": {
        "log_file": "HDFS/HDFS_2k.log",
        "log_format": "<Date> <Time> <Pid> <Level> <Component>: <Content>",
        "regex": [
            r"blk_(|-)[0-9]+",  # block id
            r"(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)",  # IP
            r"(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$",
        ],
        "doubleThreshold": 15,
        "triThreshold": 10,
    },
    "Hadoop": {
        "log_file": "Hadoop/Hadoop_2k.log",
        "log_format": "<Date> <Time> <Level> \[<Process>\] <Component>: <Content>",
        "regex": [r"(\d+\.){3}\d+"],
        "doubleThreshold": 9,
        "triThreshold": 10,
    },
    "Spark": {
        "log_file": "Spark/Spark_2k.log",
        "log_format": "<Date> <Time> <Level> <Component>: <Content>",
        "regex": [r"(\d+\.){3}\d+", r"\b[KGTM]?B\b", r"([\w-]+\.){2,}[\w-]+"],
        "doubleThreshold": 15,
        "triThreshold": 10,
    },
    "Zookeeper": {
        "log_file": "Zookeeper/Zookeeper_2k.log",
        "log_format": "<Date> <Time> - <Level> \[<Node>:<Component>@<Id>\] - <Content>",
        "regex": [r"(/|)(\d+\.){3}\d+(:\d+)?"],
        "doubleThreshold": 15,
        "triThreshold": 10,
    },
    "BGL": {
        "log_file": "BGL/BGL_2k.log",
        "log_format": "<Label> <Timestamp> <Date> <Node> <Time> <NodeRepeat> <Type> <Component> <Level> <Content>",
        "regex": [r"core\.\d+"],
        "doubleThreshold": 92,
        "triThreshold": 4,
    },
    "HPC": {
        "log_file": "HPC/HPC_2k.log",
        "log_format": "<LogId> <Node> <Component> <State> <Time> <Flag> <Content>",
        "regex": [r"=\d+"],
        "doubleThreshold": 15,
        "triThreshold": 10,
    },
    "Thunderbird": {
        "log_file": "Thunderbird/Thunderbird_2k.log",
        "log_format": "<Label> <Timestamp> <Date> <User> <Month> <Day> <Time> <Location> <Component>(\[<PID>\])?: <Content>",
        "regex": [r"(\d+\.){3}\d+"],
        "doubleThreshold": 35,
        "triThreshold": 32,
    },
    "Windows": {
        "log_file": "Windows/Windows_2k.log",
        "log_format": "<Date> <Time>, <Level> <Component> <Content>",
        "regex": [r"0x.*?\s"],
        "doubleThreshold": 15,
        "triThreshold": 10,
    },
    "Linux": {
        "log_file": "Linux/Linux_2k.log",
        "log_format": "<Month> <Date> <Time> <Level> <Component>(\[<PID>\])?: <Content>",
        "regex": [r"(\d+\.){3}\d+", r"\d{2}:\d{2}:\d{2}"],
        "doubleThreshold": 120,
        "triThreshold": 100,
    },
    "Android": {
        "log_file": "Android/Android_2k.log",
        "log_format": "<Date> <Time> <Pid> <Tid> <Level> <Component>: <Content>",
        "regex": [
            r"(/[\w-]+)+",
            r"([\w-]+\.){2,}[\w-]+",
            r"\b(\-?\+?\d+)\b|\b0[Xx][a-fA-F\d]+\b|\b[a-fA-F\d]{4,}\b",
        ],
        "doubleThreshold": 15,
        "triThreshold": 10,
    },
    "HealthApp": {
        "log_file": "HealthApp/HealthApp_2k.log",
        "log_format": "<Time>\|<Component>\|<Pid>\|<Content>",
        "regex": [],
        "doubleThreshold": 15,
        "triThreshold": 10,
    },
    "Apache": {
        "log_file": "Apache/Apache_2k.log",
        "log_format": "\[<Time>\] \[<Level>\] <Content>",
        "regex": [r"(\d+\.){3}\d+"],
        "doubleThreshold": 15,
        "triThreshold": 10,
    },
    "Proxifier": {
        "log_file": "Proxifier/Proxifier_2k.log",
        "log_format": "\[<Time>\] <Program> - <Content>",
        "regex": [
            r"<\d+\ssec",
            r"([\w-]+\.)+[\w-]+(:\d+)?",
            r"\d{2}:\d{2}(:\d{2})*",
            r"[KGTM]B",
        ],
        "doubleThreshold": 500,
        "triThreshold": 470,
    },
    "OpenSSH": {
        "log_file": "OpenSSH/OpenSSH_2k.log",
        "log_format": "<Date> <Day> <Time> <Component> sshd\[<Pid>\]: <Content>",
        "regex": [r"(\d+\.){3}\d+", r"([\w-]+\.){2,}[\w-]+"],
        "doubleThreshold": 88,
        "triThreshold": 81,
    },
    "OpenStack": {
        "log_file": "OpenStack/OpenStack_2k.log",
        "log_format": "<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>",
        "regex": [r"((\d+\.){3}\d+,?)+", r"/.+?\s", r"\d+"],
        "doubleThreshold": 30,
        "triThreshold": 25,
    },
    "Mac": {
        "log_file": "Mac/Mac_2k.log",
        "log_format": "<Month> <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>",
        "regex": [r"([\w-]+\.){2,}[\w-]+"],
        "doubleThreshold": 2,
        "triThreshold": 2,
    },
}
|
||
# Run the Logram parser on every dataset and evaluate against ground truth.
# Fix: the accumulator and the output CSV were misspelled "bechmark".
benchmark_result = []  # one [dataset, F1_measure, accuracy] row per dataset
for dataset, setting in benchmark_settings.items():
    print("\n=== Evaluation on %s ===" % dataset)
    indir = os.path.join(input_dir, os.path.dirname(setting["log_file"]))
    log_file = os.path.basename(setting["log_file"])

    parser = LogParser(
        log_format=setting["log_format"],
        indir=indir,
        outdir=output_dir,
        rex=setting["regex"],
        doubleThreshold=setting["doubleThreshold"],
        triThreshold=setting["triThreshold"],
    )
    parser.parse(log_file)

    # Compare the parser's structured output with the labeled ground truth.
    F1_measure, accuracy = evaluator.evaluate(
        groundtruth=os.path.join(indir, log_file + "_structured.csv"),
        parsedresult=os.path.join(output_dir, log_file + "_structured.csv"),
    )
    benchmark_result.append([dataset, F1_measure, accuracy])

print("\n=== Overall evaluation results ===")
df_result = pd.DataFrame(benchmark_result, columns=["Dataset", "F1_measure", "Accuracy"])
df_result.set_index("Dataset", inplace=True)
print(df_result)
# NOTE(review): filename typo fixed ("bechmark" -> "benchmark"); update any
# downstream consumer that expected the old misspelled name.
df_result.to_csv("Logram_benchmark_result.csv", float_format="%.6f")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/usr/bin/env python | ||
|
||
import sys | ||
sys.path.append('../../') | ||
from logparser.Logram import LogParser | ||
|
||
# Demo: parse the HDFS 2k sample log with the Logram parser.
input_dir = "../../data/loghub_2k/HDFS/"  # directory holding the input log
output_dir = "demo_result/"  # directory for the structured parsing results
log_file = "HDFS_2k.log"  # the input log file name
log_format = "<Date> <Time> <Pid> <Level> <Component>: <Content>"  # HDFS log format

# Optional preprocessing patterns (default: []); matches are masked as <*>.
regex = [
    r"blk_(|-)[0-9]+",  # block id
    r"(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)",  # IP
    r"(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$",  # Numbers
]
doubleThreshold = 15
triThreshold = 10

parser = LogParser(
    log_format,
    indir=input_dir,
    outdir=output_dir,
    rex=regex,
    doubleThreshold=doubleThreshold,
    triThreshold=triThreshold,
)
parser.parse(log_file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
pandas | ||
regex==2022.3.2 | ||
numpy | ||
scipy |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
""" | ||
This file is modified from: | ||
https://github.com/BlueLionLogram/Logram/tree/master/Evaluation | ||
""" | ||
|
||
import regex as re | ||
|
||
# Default masking patterns (HDFS-oriented). Each match is replaced with "<*>"
# during preprocessing; order matters because patterns are applied in turn.
MyRegex = [
    r"blk_(|-)[0-9]+",  # block id
    r"(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)",  # IP
    r"(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$",  # Numbers
]
|
||
|
||
def preprocess(logLine, specialRegex):
    """Mask the variable parts of *logLine*, replacing each match with "<*>".

    A single space is prepended so look-behind patterns such as
    ``(?<=[^A-Za-z0-9])`` can match at the very start of the line; callers
    strip the result before tokenizing.
    """
    # Bug fix: apply every pattern cumulatively. The original re-applied each
    # pattern to the raw logLine, so only the LAST pattern's substitutions
    # survived the loop (and the loop variable shadowed the `re`-like module
    # name `regex`).
    line = " " + logLine
    for pattern in specialRegex:
        line = re.sub(pattern, "<*>", line)
    return line
|
||
|
||
def tokenSpliter(logLine, regex, specialRegex):
    """Split a raw log line into the tokens of its <Content> field.

    Parameters
    ----------
    logLine : str
        Raw log line.
    regex : compiled pattern
        Header pattern (e.g. from ``regexGenerator``) that must define a
        named group ``Content``.
    specialRegex : list[str]
        Masking patterns forwarded to ``preprocess``.

    Returns
    -------
    tuple
        ``(tokens, message)`` — the token list and the raw <Content> string,
        or ``(None, None)`` when the line does not match the header pattern.
    """
    match = regex.search(logLine.strip())
    if match is None:
        # Bug fix: `message` was previously left unbound on this path, so
        # `return tokens, message` raised UnboundLocalError for any line
        # that did not match the header pattern.
        return None, None
    message = match.group("Content")
    line = preprocess(message, specialRegex)
    tokens = line.strip().split()
    return tokens, message
|
||
|
||
def regexGenerator(logformat):
    """Compile a header-matching regex from a log format template.

    Each ``<Field>`` placeholder becomes a non-greedy named group
    ``(?P<Field>.*?)`` and each run of literal spaces becomes ``\\s+``;
    the resulting pattern is anchored to the whole line.
    """
    parts = []
    # re.split with a capturing group alternates literal text (even indices)
    # and "<Field>" placeholders (odd indices).
    for idx, piece in enumerate(re.split(r"(<[^<>]+>)", logformat)):
        if idx % 2 == 1:
            # Placeholder: "<Date>" -> named capture group.
            parts.append("(?P<%s>.*?)" % piece.strip("<>"))
        else:
            # Literal text: collapse space runs into a whitespace matcher.
            parts.append(re.sub(" +", r"\\s+", piece))
    return re.compile("^" + "".join(parts) + "$")
Oops, something went wrong.