Update baseline.py

nowickit-umich · Dec 15, 2024 · b982116 · b982116
1 parent 27f01db
commit b982116
Showing 1 changed file with 40 additions and 2 deletions.
diff --git a/baseline.py b/baseline.py
@@ -1,17 +1,55 @@
 import numpy as np
 import random
 
+# Words that similarity() skips when counting question/answer word matches.
+# Every entry is lowercase and the membership test there is case-sensitive,
+# so a capitalized token such as "The" is not filtered out.
+stopwords = [
+    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at",
+    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could",
+    "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for",
+    "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's",
+    "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
+    "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't",
+    "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours",
+    "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so",
+    "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
+    "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
+    "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when",
+    "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would",
+    "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"
+]
+
+def similarity(question, answer):
+    """Score the word overlap between a question and a candidate answer.
+
+    Both strings are split on whitespace and compared case-sensitively.
+    Every (question word, answer word) pair of equal, non-stopword tokens
+    counts as one match, so repeated words are counted once per pair.
+
+    Args:
+        question: The question text.
+        answer: The candidate answer text.
+
+    Returns:
+        The match count divided by the total number of words in both
+        strings (stopwords included), or 0.0 when neither string has words.
+    """
+    question_words = question.split()
+    answer_words = answer.split()
+    count = len(question_words) + len(answer_words)
+    if count == 0:
+        # Avoid ZeroDivisionError on empty or whitespace-only input.
+        return 0.0
+    # Filter the answer once instead of re-checking stopwords per question word.
+    answer_content = [word for word in answer_words if word not in stopwords]
+    match = 0
+    for wordq in question_words:
+        if wordq in stopwords:
+            continue
+        match += answer_content.count(wordq)
+    return match / count
+
 # Load the .npy file
 file_path = 'WP-train.npy'
 data = np.load(file_path, allow_pickle=True)
 
 # Baseline
+# choose the answer which has the most words in common with the question
 count = 0
 correct = 0
 for item in data:
     count += 1
-    guess = item["choice_list"][random.randint(0, 2)]
+    # Start at choice 0 with score 0; the strict ">" keeps the earliest
+    # choice on ties, so choice 0 is also the guess when nothing matches.
+    best_sim = 0
+    best_choice = 0
+    for i, choice in enumerate(item["choice_list"]):
+        sim = similarity(item["question"], choice)
+        if sim > best_sim:
+            best_sim = sim
+            best_choice = i
+    guess = item["choice_list"][best_choice]
     if guess == item["answer"]:
         correct += 1
-    
+
 print(correct / count)