add structure feature & fix save models

chakki-works · Apr 7, 2017 · 2f2f64d
1 parent 9c5c707
commit 2f2f64d
Show file tree

Hide file tree

Showing 6 changed files with 103 additions and 40 deletions.
diff --git a/models/banana.pkl b/models/banana.pkl
diff --git a/models/banana_list.txt b/models/banana_list.txt
@@ -1 +1 @@
-header1_mean_length header2_mean_length hiragana_ratio kanji_ratio katakana_ratio number_ratio section_count sentence_max_length sentence_mean_length sentence_min_length title_length user_followers_count
+hiragana_ratio image_count image_ratio kanji_ratio katakana_ratio number_ratio punctuation_ratio section_count sentence_max_length sentence_mean_length title_length user_followers_count
diff --git a/models/banana_scaler.pkl b/models/banana_scaler.pkl
diff --git a/notebooks/feature_test.ipynb b/notebooks/feature_test.ipynb
diff --git a/scripts/features/structure_extractor.py b/scripts/features/structure_extractor.py
@@ -11,6 +11,7 @@ def extract(self, post, extracted=None):
 
         return count
 
+
 class ImageCountExtractor(FeatureExtractor):
 
     def extract(self, post, extracted=None):
@@ -26,3 +27,43 @@ def extract(self, post, extracted=None):
         count = len(re.findall(r'\$.*?\$+', post.rendered_body))
         return count
 
+
+class ItemizationRatioExtractor(FeatureExtractor):
+
+    def __init__(self, text):
+        self.text = text
+
+    def extract(self, post, extracted=None):
+        soup = BeautifulSoup(post.rendered_body, "html.parser")
+        target_count = len(soup.find_all("ul"))
+        lines_count = len(self.text.split("。"))
+
+        ratio = target_count / lines_count if target_count != 0 else 0
+        return ratio
+
+
+class ImageRatioExtractor(FeatureExtractor):
+
+    def __init__(self, text):
+        self.text = text
+
+    def extract(self, post, extracted=None):
+        soup = BeautifulSoup(post.rendered_body, "html.parser")
+        target_count = len(soup.find_all("img"))
+        lines_count = len(self.text.split("。"))
+
+        ratio = target_count / lines_count if target_count != 0 else 0
+        return ratio
+
+
+class FormulaRatioExtractor(FeatureExtractor):
+
+    def __init__(self, text):
+        self.text = text
+
+    def extract(self, post, extracted=None):
+        target_count = len(re.findall(r'\$.*?\$+', post.rendered_body))
+        lines_count = len(self.text.split("。"))
+
+        ratio = target_count / lines_count if target_count != 0 else 0
+        return ratio
diff --git a/scripts/models/save_models.py b/scripts/models/save_models.py
@@ -4,20 +4,27 @@
 
 class SaveModelsScalor():
 
-    def __init__(self, clf, scaler, data_folder=""):
+    def __init__(self, clf, scaler, pf_df, data_folder=""):
         model_file_name = "banana.pkl"
         scaler_file_name = "banana_scaler.pkl"
+        list_file_name = "banana_list.txt"
 
         def_file_path = "../../models/"
         self.data_folder = data_folder
 
         if not data_folder:
             model_file = os.path.join(os.path.dirname(__file__), def_file_path) + model_file_name
             scaler_file = os.path.join(os.path.dirname(__file__), def_file_path) + scaler_file_name
+            list_file = os.path.join(os.path.dirname(__file__), def_file_path) + list_file_name
         else:
             model_file = self.data_folder + model_file_name
-            scaler_file = self.data_folder + model_file_name
+            scaler_file = self.data_folder + scaler_file_name
+            list_file = self.data_folder + list_file_name
+
 
         joblib.dump(clf, model_file)
         joblib.dump(scaler, scaler_file)
 
+        with open(list_file, "w") as f:
+            f.write(" ".join(pf_df.columns.tolist()))
+
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-header1_mean_length header2_mean_length hiragana_ratio kanji_ratio katakana_ratio number_ratio section_count sentence_max_length sentence_mean_length sentence_min_length title_length user_followers_count
		+hiragana_ratio image_count image_ratio kanji_ratio katakana_ratio number_ratio punctuation_ratio section_count sentence_max_length sentence_mean_length title_length user_followers_count