-
Notifications
You must be signed in to change notification settings - Fork 71
/
add_markdown_info.py
81 lines (63 loc) · 2.66 KB
/
add_markdown_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pandas as pd
import re
import os
import tiktoken
import argparse
from tqdm import tqdm
from glob import glob
# tqdm.pandas()
def count_markdown_elements(markdown_text, suffix):
    """Count markdown headers, list items and bold spans in a text.

    Args:
        markdown_text: The markdown string to scan.
        suffix: String appended to each top-level key of the result
            (``header_count``, ``list_count``, ``bold_count``).

    Returns:
        A dict with three entries, each itself a dict of integer counts:
        headers per level (``h1`` .. ``h6``), list items (``ordered`` and
        ``unordered``) and bold spans per delimiter (``**`` and ``__``).
    """

    def tally(regex, flags=0):
        # Number of non-overlapping matches of ``regex`` in the text.
        return len(re.findall(regex, markdown_text, flags))

    # A level-N header is exactly N '#' characters at the start of a line
    # followed by a whitespace character.
    headers = {
        f"h{level}": tally(rf"^#{{{level}}}\s", re.MULTILINE)
        for level in range(1, 7)
    }

    # List markers may be preceded by whitespace and must be followed by
    # a whitespace character.
    lists = {
        "ordered": tally(r"^\s*\d+\.\s", re.MULTILINE),
        "unordered": tally(r"^\s*[-*+]\s", re.MULTILINE),
    }

    # Bold spans must stay on one line and must not contain their own
    # delimiter character.
    bold = {
        "**": tally(r"\*\*[^*\n]+\*\*"),
        "__": tally(r"__[^_\n]+__"),
    }

    return {
        f"header_count{suffix}": headers,
        f"list_count{suffix}": lists,
        f"bold_count{suffix}": bold,
    }
def remove_pattern(answer, pattern):
    """Return ``answer`` with every match of ``pattern`` removed.

    Args:
        answer: The text to clean.
        pattern: A compiled regular expression. Each non-overlapping match
            is deleted in full, including any delimiters that surround a
            capture group.

    Returns:
        The cleaned string; ``answer`` unchanged if nothing matches.
    """
    # Substitute at the match positions instead of collecting the matched
    # text and calling str.replace on it: with a capture group, findall
    # yields only the group's content, and replacing that substring
    # everywhere deletes unrelated text (a block containing just "\n" would
    # strip every newline from the answer).
    return pattern.sub("", answer)
def get_element_counts(df, column):
    """Compute markdown element counts for the answer stored in each row.

    Args:
        df: DataFrame holding the answers.
        column: Name of the column whose cells are indexed as
            ``cell[0]["turns"][0]["content"]`` to obtain the answer text.

    Returns:
        A list with one ``count_markdown_elements`` result per row, in row
        order, computed with an empty key suffix.
    """
    fenced_code = re.compile("```([^`]*)```")

    def first_answer(choices):
        return choices[0]["turns"][0]["content"]

    def markdown_stats(answer):
        # Remove code block first
        prose_only = remove_pattern(answer, fenced_code)
        return count_markdown_elements(prose_only, suffix="")

    answer_texts = df[column].map(first_answer)
    return answer_texts.map(markdown_stats).tolist()
def add_markdown_meta(row, encoder):
    """Merge the token length of a row's answer with its markdown counts.

    Args:
        row: Mapping-like record providing ``choices`` (indexed as
            ``[0]["turns"][0]["content"]`` to get the answer text) and
            ``markdown_meta`` (a dict of counts).
        encoder: Object whose ``encode(text, disallowed_special=())`` returns
            a sized sequence of tokens.

    Returns:
        A new dict with ``token_len`` first, followed by the entries of
        ``row["markdown_meta"]``; on a key clash the latter wins.
    """
    answer_text = row["choices"][0]["turns"][0]["content"]
    # NOTE(review): disallowed_special=() presumably keeps the encoder from
    # rejecting special-token text inside answers; confirm against tiktoken.
    tokens = encoder.encode(answer_text, disallowed_special=())
    length_meta = {"token_len": len(tokens)}
    return length_meta | row["markdown_meta"]
if __name__ == "__main__":
    # Script entry point: for every *.jsonl file directly inside --dir, add
    # a "conv_metadata" column (token length plus markdown element counts of
    # each row's answer) and write the result under --output-dir using the
    # same file name.
    parser = argparse.ArgumentParser()
    parser.add_argument("--dir", type=str, required=True)
    parser.add_argument("--output-dir", type=str, required=True)
    args = parser.parse_args()

    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")

    print("loading file...")
    for file in tqdm(glob(f"{args.dir}/*.jsonl")):
        data = pd.read_json(file, lines=True)

        # Work on a copy so the helper column "markdown_meta" never ends up
        # in the written output.
        temp = data[["question_id", "choices"]].copy()
        temp["markdown_meta"] = get_element_counts(data, column="choices")
        data["conv_metadata"] = temp.apply(
            lambda row: add_markdown_meta(row, encoder), axis=1
        )

        # Derive the destination from the file's base name. A plain
        # str.replace of the input directory would also rewrite any other
        # occurrence of that text in the path (e.g. --dir "." replaces the
        # dot before the extension).
        output_file = os.path.join(args.output_dir, os.path.basename(file))
        output_parent = os.path.dirname(output_file)
        if output_parent:
            os.makedirs(output_parent, exist_ok=True)
        data.to_json(output_file, orient="records", lines=True)