update project

moh4med · Dec 29, 2016 · ac06627 · ac06627
1 parent a591ab1
commit ac06627
Show file tree

Hide file tree

Showing 8,294 changed files with 593,587 additions and 0 deletions.
diff --git a/.idea/filtering.iml b/.idea/filtering.iml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
diff --git a/Classifier.py b/Classifier.py
@@ -0,0 +1,85 @@
+import math
+class classifier:
+    """Feature-counting base classifier.
+
+    Stores how often each feature was seen per category (``fc``) and how
+    many items were trained per category (``cc``), and derives feature
+    probabilities from those counts.
+
+    ``classify`` calls ``self.prob(item, cat)``, which this class does not
+    define; a subclass has to provide it. ``classify`` adds the base-10
+    log of the threshold to the scores, i.e. it treats them as log10
+    probabilities.
+    """
+
+    def __init__(self, getfeatures, filename=None):
+        """Create an empty classifier.
+
+        getfeatures -- callable mapping an item to an iterable of features.
+        filename    -- accepted for compatibility; not used by this class.
+        """
+        self.fc = {}          # feature -> {category: count}
+        self.cc = {}          # category -> number of trained items
+        self.getfeatures = getfeatures
+        self.thresholds = {}  # category -> threshold used by classify()
+
+    def incf(self, f, cat):
+        """Increase the count of feature ``f`` in category ``cat`` by one."""
+        per_category = self.fc.setdefault(f, {})
+        per_category[cat] = per_category.get(cat, 0) + 1
+
+    def incc(self, cat):
+        """Increase the number of items trained for ``cat`` by one."""
+        self.cc[cat] = self.cc.get(cat, 0) + 1
+
+    def setthreshold(self, cat, t):
+        """Set the threshold that ``classify`` applies when ``cat`` wins."""
+        self.thresholds[cat] = t
+
+    def getthreshold(self, cat):
+        """Return the threshold for ``cat``; 1.0 when none was set."""
+        return self.thresholds.get(cat, 1.0)
+
+    def fcount(self, f, cat):
+        """Return, as a float, how often ``f`` was seen in ``cat``."""
+        return float(self.fc.get(f, {}).get(cat, 0))
+
+    def totalfeature(self, f):
+        """Return how often ``f`` was seen across all known categories."""
+        return sum([self.fcount(f, c) for c in self.categories()])
+
+    def catcount(self, cat):
+        """Return, as a float, the number of items trained for ``cat``.
+
+        Unknown categories yield 0.0, so the return type is always float.
+        """
+        return float(self.cc.get(cat, 0))
+
+    def totalcount(self):
+        """Return the total number of trained items."""
+        return sum(self.cc.values())
+
+    def categories(self):
+        """Return the categories seen during training."""
+        return self.cc.keys()
+
+    def train(self, item, cat):
+        """Count every feature of ``item`` for ``cat`` and count the item."""
+        for f in self.getfeatures(item):
+            self.incf(f, cat)
+        self.incc(cat)
+
+    def fprob(self, f, cat):
+        """Return count(f in cat) / count(items in cat); 0 if cat is empty."""
+        items_in_cat = self.catcount(cat)
+        if items_in_cat == 0:
+            return 0
+        return self.fcount(f, cat) / items_in_cat
+
+    def weightedprob(self, f, cat, prf, weight=3.0, ap=0.5):
+        """Blend ``prf(f, cat)`` with an assumed probability.
+
+        The result is the average of ``ap`` (counted ``weight`` times) and
+        ``prf(f, cat)`` (counted once per occurrence of ``f`` in any
+        category), so features with few observations stay close to ``ap``.
+        """
+        basicprob = prf(f, cat)
+        totals = self.totalfeature(f)
+        return ((weight * ap) + (totals * basicprob)) / (weight + totals)
+
+    def acceptword(self, word, cat):
+        """Return 1 if ``word`` should contribute to the score of ``cat``.
+
+        Every word is accepted. The probability filter that used to follow
+        an unconditional ``return 1`` could never run and was removed.
+        """
+        return 1
+
+    def classify(self, item, default=None):
+        """Return the best-scoring category for ``item`` or ``default``.
+
+        ``default`` is returned when no category scores above -inf, when
+        the best score is exactly 0, or when another category's score plus
+        log10(threshold of the winner) exceeds the winner's score.
+        """
+        probs = {}
+        best_score = float('-inf')
+        best = default
+        for cat in self.categories():
+            probs[cat] = self.prob(item, cat)
+            if probs[cat] > best_score:
+                best_score = probs[cat]
+                best = cat
+        # Nothing trained, or no category produced a usable score.
+        if best_score == float('-inf'):
+            return default
+        # NOTE(review): kept from the original. The threshold test below
+        # works on log10 scores, where 0 would mean probability 1; this
+        # check looks like a leftover from a non-log version - confirm.
+        if best_score == 0:
+            return default
+        rivals = [cat for cat in probs if cat != best]
+        if not rivals:
+            return best
+        # Make sure the probability exceeds threshold * next best; in log
+        # space the product becomes a sum, computed once for all rivals.
+        margin = math.log10(self.getthreshold(best))
+        for cat in rivals:
+            if probs[cat] + margin > probs[best]:
+                return default
+        return best
diff --git a/Classifier.pyc b/Classifier.pyc
diff --git a/Naivebayes.py b/Naivebayes.py
@@ -0,0 +1,46 @@
+from Classifier import *
+class naivebayes(classifier):
+    """Naive Bayes scoring on top of ``classifier`` using log10 sums."""
+
+    def docprob(self, item, cat):
+        """Return the summed log10 weighted probability of item's features.
+
+        Only features for which ``acceptword`` is truthy contribute. An
+        item without accepted features scores 0.
+        """
+        p = 0
+        for f in self.getfeatures(item):
+            if self.acceptword(f, cat):
+                p += math.log10(self.weightedprob(f, cat, self.fprob))
+        return p
+
+    def prob(self, item, cat):
+        """Return log10(P(cat)) + docprob(item, cat).
+
+        A category with no trained items has prior probability 0, so its
+        score is -inf. This also covers an untrained classifier, and it
+        avoids the division by zero / log10(0) errors those cases raised.
+        """
+        in_cat = self.catcount(cat)
+        if in_cat == 0:
+            return float('-inf')
+        catprob = float(in_cat) / self.totalcount()
+        return self.docprob(item, cat) + math.log10(catprob)
+
+    def handlezero(self, item, default):
+        """Choose between 'spam' and 'ham' from plain probability products.
+
+        Only the first half of the item's features is looked at (the
+        budget is ``len(features) // 2``). Returns ``default`` when nothing
+        was trained or when either product is 0; otherwise returns the
+        label whose product times its prior is larger, with ties going to
+        'ham'.
+        """
+        total = self.totalcount()
+        if total == 0:
+            # No priors can be computed without training data.
+            return default
+        features = self.getfeatures(item)
+        cat = 'spam'
+        acat = 'ham'
+        catprob = self.catcount(cat) / float(total)
+        acatprob = self.catcount(acat) / float(total)
+        p = 1.0
+        l = 1.0
+        # Floor division keeps the budget an integer on Python 2 and 3.
+        budget = len(features) // 2
+        # Walk the extracted features (as docprob does), not the raw item:
+        # the budget above is measured in features.
+        for index, f in enumerate(features):
+            if index >= budget:
+                break
+            if self.acceptword(f, cat):
+                p *= self.weightedprob(f, cat, self.fprob)
+            if self.acceptword(f, acat):
+                l *= self.weightedprob(f, acat, self.fprob)
+        if p == 0 or l == 0:
+            return default
+        if p * catprob > l * acatprob:
+            return cat
+        return acat
diff --git a/Naivebayes.pyc b/Naivebayes.pyc
diff --git a/SPAM FILTER.docx b/SPAM FILTER.docx