Skip to content
This repository has been archived by the owner on Apr 10, 2019. It is now read-only.

Commit

Permalink
add structure feature & fix save models
Browse files Browse the repository at this point in the history
  • Loading branch information
kamujun committed Apr 7, 2017
1 parent 9c5c707 commit 2f2f64d
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 40 deletions.
Binary file modified models/banana.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion models/banana_list.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
header1_mean_length header2_mean_length hiragana_ratio kanji_ratio katakana_ratio number_ratio section_count sentence_max_length sentence_mean_length sentence_min_length title_length user_followers_count
hiragana_ratio image_count image_ratio kanji_ratio katakana_ratio number_ratio punctuation_ratio section_count sentence_max_length sentence_mean_length title_length user_followers_count
Binary file modified models/banana_scaler.pkl
Binary file not shown.
89 changes: 52 additions & 37 deletions notebooks/feature_test.ipynb

Large diffs are not rendered by default.

41 changes: 41 additions & 0 deletions scripts/features/structure_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def extract(self, post, extracted=None):

return count


class ImageCountExtractor(FeatureExtractor):

def extract(self, post, extracted=None):
Expand All @@ -26,3 +27,43 @@ def extract(self, post, extracted=None):
count = len(re.findall(r'\$.*?\$+', post.rendered_body))
return count


class ItemizationRatioExtractor(FeatureExtractor):
    """Ratio of ``<ul>`` elements in a post to the segment count of a text.

    The segment count is the number of pieces obtained by splitting the text
    given at construction time on the full-width period "。".
    """

    def __init__(self, text):
        # presumably the plain-text body of the same post -- confirm with caller
        self.text = text

    def extract(self, post, extracted=None):
        """Return the ``<ul>`` count divided by the segment count.

        ``post.rendered_body`` is parsed as HTML. ``extracted`` is accepted
        but not used here. When the post has no ``<ul>`` element the integer
        0 is returned without dividing.
        """
        parsed = BeautifulSoup(post.rendered_body, "html.parser")
        ul_total = len(parsed.find_all("ul"))
        # str.split always yields at least one piece, so this is never zero.
        segment_total = len(self.text.split("。"))

        if ul_total == 0:
            return 0
        return ul_total / segment_total


class ImageRatioExtractor(FeatureExtractor):
    """Ratio of ``<img>`` elements in a post to the segment count of a text.

    The segment count is the number of pieces obtained by splitting the text
    given at construction time on the full-width period "。".
    """

    def __init__(self, text):
        # presumably the plain-text body of the same post -- confirm with caller
        self.text = text

    def extract(self, post, extracted=None):
        """Return the ``<img>`` count divided by the segment count.

        ``post.rendered_body`` is parsed as HTML. ``extracted`` is accepted
        but not used here. When the post has no ``<img>`` element the integer
        0 is returned without dividing.
        """
        parsed = BeautifulSoup(post.rendered_body, "html.parser")
        img_total = len(parsed.find_all("img"))
        # str.split always yields at least one piece, so this is never zero.
        segment_total = len(self.text.split("。"))

        if img_total == 0:
            return 0
        return img_total / segment_total


class FormulaRatioExtractor(FeatureExtractor):
    """Ratio of dollar-delimited spans in a post to the segment count of a text.

    The segment count is the number of pieces obtained by splitting the text
    given at construction time on the full-width period "。".
    """

    def __init__(self, text):
        # presumably the plain-text body of the same post -- confirm with caller
        self.text = text

    def extract(self, post, extracted=None):
        """Return the number of ``$...$`` matches divided by the segment count.

        Matches are searched in ``post.rendered_body``. ``extracted`` is
        accepted but not used here. When nothing matches the integer 0 is
        returned without dividing.
        """
        # Non-greedy match from one "$" up to the next run of "$" characters.
        matches = re.findall(r'\$.*?\$+', post.rendered_body)
        formula_total = len(matches)
        # str.split always yields at least one piece, so this is never zero.
        segment_total = len(self.text.split("。"))

        if formula_total == 0:
            return 0
        return formula_total / segment_total
11 changes: 9 additions & 2 deletions scripts/models/save_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,27 @@

class SaveModelsScalor():
    """Write a classifier, its scaler and the feature-name list to disk.

    All work happens in the constructor; the instance only keeps
    ``data_folder`` afterwards.
    """

    def __init__(self, clf, scaler, pf_df, data_folder=""):
        """Dump the model artefacts into ``data_folder``.

        Args:
            clf: object dumped with ``joblib.dump`` to ``banana.pkl``.
            scaler: object dumped with ``joblib.dump`` to
                ``banana_scaler.pkl``.
            pf_df: object whose ``columns.tolist()`` gives the feature names
                (presumably a pandas DataFrame -- confirm with caller). The
                names are written space-separated to ``banana_list.txt``.
            data_folder: target directory. When empty, ``../../models/``
                relative to this source file is used.
        """
        model_file_name = "banana.pkl"
        scaler_file_name = "banana_scaler.pkl"
        list_file_name = "banana_list.txt"

        def_file_path = "../../models/"
        self.data_folder = data_folder

        if data_folder:
            target_dir = data_folder
        else:
            target_dir = os.path.join(os.path.dirname(__file__), def_file_path)

        # os.path.join adds a separator only when one is missing, so a
        # data_folder given without a trailing slash no longer produces a
        # path like "dirbanana.pkl".
        model_file = os.path.join(target_dir, model_file_name)
        scaler_file = os.path.join(target_dir, scaler_file_name)
        list_file = os.path.join(target_dir, list_file_name)

        joblib.dump(clf, model_file)
        joblib.dump(scaler, scaler_file)

        with open(list_file, "w") as f:
            f.write(" ".join(pf_df.columns.tolist()))

0 comments on commit 2f2f64d

Please sign in to comment.