data_cleaner.py
import pandas as pd
import re
import nltk
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# If the required NLTK resources have not been downloaded yet, run these once:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt_tab")
# Read the raw CSV file
input_file = "cnn_articles_raw.csv"
df = pd.read_csv(input_file)
# Define the stop-word set and the lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
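# For example (assuming the standard NLTK English data):
#   lemmatizer.lemmatize("articles")  ->  "article"
#   "the" in stop_words              ->  True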
def clean_text(text):
    # Guard against missing values (e.g. NaN), which BeautifulSoup cannot parse
    if not isinstance(text, str):
        return ""
    # Strip HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Remove digits
    text = re.sub(r"\d+", "", text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Drop stop words and lemmatize the remaining tokens
    tokens = [
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    ]
    # Join the tokens back into a single string
    return " ".join(tokens)
# Clean the 'text' column
df["text"] = df["text"].apply(clean_text)
# Write the cleaned data to a new CSV file
output_file = "cnn_articles_cleaned.csv"
df.to_csv(output_file, index=False, encoding="utf-8")
print(f"Cleaned data saved to {output_file}")