Commit

Merge branch 'main' into feature-implement-apriori-algorithm
Elwizzy12 authored Dec 11, 2023
2 parents a82917c + 9f17953 commit 186058e
Showing 9 changed files with 245 additions and 40 deletions.
4 changes: 2 additions & 2 deletions docs/apriori.md
@@ -1,4 +1,4 @@
-# Support
+# Apriori

In association rules mining, the Apriori algorithm helps to find all frequent
itemsets in a dataset and generate strong association rules.
@@ -28,4 +28,4 @@ Output: Frequent Itemsets, Strong Rules
      ii. If Calculate_Confidence(D, Rule) >= min_confidence:
          a. Add Rule to Strong Rules
5. Return Frequent Itemsets, Strong Rules
-```
+```
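For orientation, here is a minimal, self-contained Python sketch of the flow the pseudocode above describes. It is not the repository's implementation: `apriori_sketch` and its internals are illustrative names, and the helper logic is inlined rather than imported from `src/`.

```python
from itertools import combinations

def apriori_sketch(transactions, min_support=0.5, min_confidence=0.5):
    """Toy Apriori: return (frequent itemsets with supports, strong rules with confidences)."""
    n = len(transactions)
    rows = [frozenset(t) for t in transactions]
    support = lambda items: sum(items <= row for row in rows) / n

    # L1 = {frequent 1-itemsets}
    frequent = {frozenset([i]): support(frozenset([i]))
                for row in rows for i in row
                if support(frozenset([i])) >= min_support}

    # Level-wise: build Ck from Lk-1, keep candidates that clear min_support
    level, k = list(frequent), 2
    while level:
        candidates = {a | b for a in level for b in level if len(a | b) == k}
        level = [c for c in candidates if support(c) >= min_support]
        frequent.update({c: support(c) for c in level})
        k += 1

    # For each frequent l and non-empty proper subset s: Rule = s -> (l - s)
    rules = {}
    for itemset, itemset_support in frequent.items():
        for r in range(1, len(itemset)):
            for s in map(frozenset, combinations(itemset, r)):
                conf = itemset_support / support(s)
                if conf >= min_confidence:
                    rules[(s, itemset - s)] = conf
    return frequent, rules

print(apriori_sketch([[1, 2, 3], [1, 2, 3], [1, 2, 3]], 0.3, 0.7))
```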
4 changes: 2 additions & 2 deletions docs/confidence.md
@@ -1,4 +1,4 @@
-# Support
+# Confidence

In association rules mining, confidence helps to measure the
likelihood of occurrence of an itemset given another itemset.
@@ -21,4 +21,4 @@ Output: Confidence value for Rule
   Else:
      a. Confidence = 0
4. Return Confidence
-```
+```
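The quantity this pseudocode computes is the standard definition: confidence(A → B) = support(A ∪ B) / support(A), with 0 returned when the antecedent never occurs. A minimal sketch, assuming transactions are lists of items (the names here are illustrative, not the repository's API):

```python
def confidence_sketch(transactions, antecedent, consequent):
    """confidence(A -> B) = support(A ∪ B) / support(A), guarding division by zero."""
    a = frozenset(antecedent)
    ab = a | frozenset(consequent)
    count_a = sum(a <= frozenset(t) for t in transactions)
    count_ab = sum(ab <= frozenset(t) for t in transactions)
    return count_ab / count_a if count_a else 0.0
```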
2 changes: 2 additions & 0 deletions main.py
@@ -2,5 +2,7 @@
if __name__ == '__main__':
    result = apriori([[1, 2, 3], [1, 2, 3], [1, 2, 3]], 0.3, 0.7)
+
-    print(result)
+    print(result)
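A quick read of this demo: with three identical transactions, every non-empty subset of {1, 2, 3} appears in 3 of 3 transactions (support 1.0 ≥ 0.3), and every candidate rule has confidence 1.0 ≥ 0.7, so `result` should contain all seven non-empty itemsets together with their rules.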

45 changes: 37 additions & 8 deletions notebook/apriori.ipynb
@@ -3,14 +3,38 @@
 {
  "cell_type": "code",
  "execution_count": null,
- "id": "initial_id",
- "metadata": {
-  "collapsed": true
- },
+ "id": "54e2e8be",
+ "metadata": {},
  "outputs": [],
  "source": [
-  "from src.apriori import apriori"
+  "# Assuming apriori.py is in the same directory or update the path accordingly\n",
+  "from apriori import apriori\n",
+  "\n",
+  "# Define your dataset\n",
+  "dataset = [\n",
+  "    ['Milk', 'Bread', 'Eggs'],\n",
+  "    ['Milk', 'Diapers'],\n",
+  "    ['Bread', 'Butter', 'Eggs'],\n",
+  "]\n",
+  "\n",
+  "# Set your minimum support and confidence thresholds\n",
+  "min_support = 0.4\n",
+  "min_confidence = 0.6\n",
+  "\n",
+  "# Call the apriori function\n",
+  "frequent_itemsets, strong_rules = apriori(dataset, min_support, min_confidence)\n",
+  "\n",
+  "# Display the results\n",
+  "print(\"Frequent Itemsets:\")\n",
+  "for itemset, support_value in frequent_itemsets.items():\n",
+  "    print(f\"{itemset}: {support_value}\")\n",
+  "\n",
+  "print(\"\\nStrong Rules:\")\n",
+  "for rule, confidence_value in strong_rules.items():\n",
+  "    print(f\"{rule[0]} -> {rule[1]} (Confidence: {confidence_value})\")"
+ ]
 },
 {
  "cell_type": "code",
@@ -19,10 +43,15 @@
  "metadata": {
   "collapsed": false
  },
  "outputs": [],
  "source": [
   "apriori([[1, 3], [1, 2], [1, 2, 3]], 0.5)"
  ],
+ "id": "4d2a7fe66b2d70bb"
 }
],
"metadata": {
@@ -34,14 +63,14 @@
 "language_info": {
  "codemirror_mode": {
   "name": "ipython",
-  "version": 2
+  "version": 3
  },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
-  "pygments_lexer": "ipython2",
-  "version": "2.7.6"
+  "pygments_lexer": "ipython3",
+  "version": "3.10.12"
 }
},
"nbformat": 4,
40 changes: 30 additions & 10 deletions notebook/confidence.ipynb
@@ -2,13 +2,33 @@
"cells": [
 {
  "cell_type": "code",
- "execution_count": null,
- "id": "initial_id",
- "metadata": {
-  "collapsed": true
- },
- "outputs": [],
- "source": []
+ "execution_count": 10,
+ "id": "0df8ea7c",
+ "metadata": {},
+ "outputs": [
+  {
+   "name": "stdout",
+   "output_type": "stream",
+   "text": [
+    "Confidence for rule (['bread'], ['milk']): 0.7499999999999999\n"
+   ]
+  }
+ ],
+ "source": [
+  "from confidence import confidence\n",
+  "\n",
+  "dataset = [\n",
+  "    ['bread', 'milk'],\n",
+  "    ['bread', 'diaper', 'beer', 'eggs'],\n",
+  "    ['milk', 'diaper', 'beer', 'cola'],\n",
+  "    ['bread', 'milk', 'diaper', 'beer'],\n",
+  "    ['bread', 'milk', 'diaper', 'cola']\n",
+  "]\n",
+  "rule = (['bread'], ['milk'])\n",
+  "\n",
+  "confidence_value = confidence(dataset, rule)\n",
+  "print(f\"Confidence for rule {rule}: {confidence_value}\")"
+ ]
 }
],
"metadata": {
@@ -20,14 +40,14 @@
 "language_info": {
  "codemirror_mode": {
   "name": "ipython",
-  "version": 2
+  "version": 3
  },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
-  "pygments_lexer": "ipython2",
-  "version": "2.7.6"
+  "pygments_lexer": "ipython3",
+  "version": "3.10.12"
 }
},
"nbformat": 4,
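Sanity check on the printed value: {bread, milk} occurs in 3 of the 5 transactions (support 0.6) and {bread} in 4 of 5 (support 0.8), so confidence(bread → milk) = 0.6 / 0.8 = 0.75; the trailing ...9999 is ordinary floating-point rounding from dividing the two supports.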
41 changes: 31 additions & 10 deletions notebook/support.ipynb
@@ -2,13 +2,34 @@
"cells": [
 {
  "cell_type": "code",
- "execution_count": null,
- "id": "initial_id",
- "metadata": {
-  "collapsed": true
- },
- "outputs": [],
- "source": []
+ "execution_count": 9,
+ "id": "fb107e08",
+ "metadata": {},
+ "outputs": [
+  {
+   "name": "stdout",
+   "output_type": "stream",
+   "text": [
+    "Support for ['bread', 'milk']: 0.6\n"
+   ]
+  }
+ ],
+ "source": [
+  "from support import support\n",
+  "\n",
+  "dataset = [\n",
+  "    ['bread', 'milk'],\n",
+  "    ['bread', 'diaper', 'beer', 'eggs'],\n",
+  "    ['milk', 'diaper', 'beer', 'cola'],\n",
+  "    ['bread', 'milk', 'diaper', 'beer'],\n",
+  "    ['bread', 'milk', 'diaper', 'cola']\n",
+  "]\n",
+  "itemset = ['bread', 'milk']\n",
+  "\n",
+  "# Calling the support function\n",
+  "support_value = support(itemset, dataset)\n",
+  "print(f\"Support for {itemset}: {support_value}\")"
+ ]
 }
],
"metadata": {
@@ -20,14 +41,14 @@
 "language_info": {
  "codemirror_mode": {
   "name": "ipython",
-  "version": 2
+  "version": 3
  },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
-  "pygments_lexer": "ipython2",
-  "version": "2.7.6"
+  "pygments_lexer": "ipython3",
+  "version": "3.10.12"
 }
},
"nbformat": 4,
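Checking the output by hand: ['bread', 'milk'] is a subset of transactions 1, 4, and 5 but not of 2 (no milk) or 3 (no bread), so support = 3/5 = 0.6, matching the cell's printed value.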
103 changes: 99 additions & 4 deletions src/apriori.py
@@ -1,15 +1,25 @@
-sub_dataset_type = tuple[object]
+from typing import List, Tuple, Dict
+from itertools import chain, combinations
+
+# Importing support and confidence functions
+from support import support
+from confidence import confidence
+
+sub_dataset_type = Tuple[object]
support_type = float
-strong_rules = list[object], list[object]
+strong_rules_type = Tuple[List[object], List[object]]
confidence_type = float


-def apriori(transactions: list[list[object]], min_support: float = 0.7, min_confidence: float = 0.5) \
-        -> tuple[dict[sub_dataset_type, support_type], dict[strong_rules, confidence_type]]:
+def apriori(transactions: List[List[object]], min_support: float = 0.7, min_confidence: float = 0.5) \
+        -> Tuple[Dict[sub_dataset_type, support_type], Dict[strong_rules_type, confidence_type]]:
    """
    To find all frequent itemsets in a dataset and generate strong association rules.
    """
-
-    return {}, {}
-    from itertools import combinations
@@ -157,4 +167,89 @@ def apriori(dataset, min_support, min_confidence):

print("\nStrong Rules:")
for rule, confidence in strong_rules:
-    print(rule, "Confidence:", confidence)
+    print(rule, "Confidence:", confidence)

# Initialize dictionaries to store frequent itemsets and strong rules.
frequent_itemsets = {}
strong_rules = {}

# Create the unique 1-itemsets and calculate their support using the support function.
# Initialize L1 = {frequent 1-itemsets}
unique_items = set(item for transaction in transactions for item in transaction)
candidates_1 = [frozenset([item]) for item in unique_items]
frequent_itemsets[1] = {candidate: support(list(candidate), transactions) for candidate in candidates_1}

# Iterate over the levels of itemsets (k), generating candidate sets
# until no more frequent itemsets are found.
# For (k = 2; Lk-1 is not empty; k++):
k = 2
while len(frequent_itemsets[k - 1]) > 0:
    # Generate Ck, candidate k-itemsets, from Lk-1
    candidates_k = generate_candidates(list(frequent_itemsets[k - 1]), k)

    # For each transaction t in D:
    # increment the count of every candidate in Ck that is contained in t
    candidate_counts = {candidate: 0 for candidate in candidates_k}
    for transaction in transactions:
        for candidate in candidates_k:
            if candidate.issubset(transaction):
                candidate_counts[candidate] += 1

    # Lk = {c in Ck | support(c) >= min_support}
    frequent_itemsets[k] = {candidate: count / len(transactions)
                            for candidate, count in candidate_counts.items()
                            if count / len(transactions) >= min_support}

    k += 1

# Frequent Itemsets = Union of all Lk
frequent_itemsets = {itemset: support_value for itemsets in frequent_itemsets.values()
                     for itemset, support_value in itemsets.items()}

# For each frequent itemset l in Frequent Itemsets:
for itemset in frequent_itemsets.keys():

    # Generate all non-empty proper subsets of l
    subsets = get_subsets(list(itemset))

    # For every non-empty subset s of l:
    for subset in subsets:
        # Rule = s -> (l - s)
        rule = (subset, list(set(itemset) - set(subset)))

        # If Calculate_Confidence(D, Rule) >= min_confidence:
        confidence_value = confidence(transactions, rule)
        if confidence_value >= min_confidence:
            # Add Rule to Strong Rules (tuple keys keep the dict key hashable)
            strong_rules[(tuple(rule[0]), tuple(rule[1]))] = confidence_value

# Return Frequent Itemsets, Strong Rules
return frequent_itemsets, strong_rules


def generate_candidates(frequent_itemsets: List[frozenset], k: int) -> List[frozenset]:
    """
    Generate candidate k-itemsets from frequent (k-1)-itemsets.
    """
    candidates = []
    n = len(frequent_itemsets)

    for i in range(n):
        for j in range(i + 1, n):
            # Merge two frequent (k-1)-itemsets to generate a candidate
            candidate = frozenset(set(frequent_itemsets[i]).union(frequent_itemsets[j]))

            # Keep the candidate only if it has length k and is not a duplicate
            if len(candidate) == k and candidate not in candidates:
                candidates.append(candidate)

    return candidates



def get_subsets(itemset: List[object]) -> List[List[object]]:
    """
    Generate all non-empty proper subsets of an itemset.
    """
    return [list(subset) for subset in
            chain.from_iterable(combinations(itemset, r) for r in range(1, len(itemset)))]
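As a quick, hedged check of the two helpers above (the expected outputs are reasoned from the code, not captured from a run):

```python
# Assuming these are importable from the module above, e.g.:
# from apriori import generate_candidates, get_subsets

l2 = [frozenset({'A', 'B'}), frozenset({'A', 'C'}), frozenset({'B', 'C'})]
print(generate_candidates(l2, 3))
# -> [frozenset({'A', 'B', 'C'})]: every pair merges to the same 3-itemset, deduplicated

print(get_subsets(['A', 'B', 'C']))
# -> the six non-empty proper subsets: ['A'], ['B'], ['C'], ['A', 'B'], ['A', 'C'], ['B', 'C']
```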



23 changes: 19 additions & 4 deletions src/confidence.py
@@ -1,11 +1,15 @@
dataset_type = list[list[object]]
rule_type = tuple[list[object], list[object]]
from typing import List, Tuple
from support import support

dataset_type = List[List[object]]
rule_type = Tuple[List[object], List[object]]

def confidence(data_set: dataset_type, rule: rule_type) -> float:

def confidence(dataset: dataset_type, rule: rule_type) -> float:
"""
To measure the likelihood of occurrence of an itemset given another itemset.
"""

pass
def calculate_support(dataset, itemset):
count = 0
@@ -42,4 +46,15 @@ def calculate_confidence(dataset, rule):
rule = (['A'], ['C'])

confidence = calculate_confidence(dataset, rule)
-print("Confidence:", confidence)
+print("Confidence:", confidence)

antecedent, consequent = rule
antecedent_support = support(antecedent, dataset)
rule_support = support(antecedent + consequent, dataset)

if antecedent_support == 0:
    return 0  # Avoid division by zero

confidence_value = rule_support / antecedent_support
return confidence_value
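A minimal usage sketch for the function above, assuming support(itemset, dataset) returns the fraction of transactions containing the itemset (the dataset mirrors the one in notebook/confidence.ipynb):

```python
from confidence import confidence  # assuming the module layout used in the notebooks

dataset = [
    ['bread', 'milk'],
    ['bread', 'diaper', 'beer', 'eggs'],
    ['milk', 'diaper', 'beer', 'cola'],
    ['bread', 'milk', 'diaper', 'beer'],
    ['bread', 'milk', 'diaper', 'cola'],
]

# support(['bread'], dataset) -> 0.8 and support(['bread', 'milk'], dataset) -> 0.6,
# so the rule bread -> milk should come out at 0.6 / 0.8 = 0.75.
print(confidence(dataset, (['bread'], ['milk'])))
```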
