From 8ddc6b2c20149b0d4c7556abd775e824954e547b Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Mon, 25 Nov 2024 18:01:41 +0530
Subject: [PATCH 1/3] Feat: added argument csv_report to export data as csv

---
 main.py | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 86 insertions(+), 11 deletions(-)

diff --git a/main.py b/main.py
index f9b0196..9b45338 100644
--- a/main.py
+++ b/main.py
@@ -6,6 +6,7 @@
 import sys
 import time
 import textwrap
+import csv
 from dataclasses import dataclass
 from datetime import datetime
 from functools import partial
@@ -15,7 +16,7 @@
 from tqdm import tqdm
 from unstract.api_deployments.client import APIDeploymentsClient
 
-DB_NAME = "file_processing.db"
+DB_NAME = "/home/praveen/Documents/db/demo.db"
 global_arguments = None
 logger = logging.getLogger(__name__)
 
@@ -35,6 +36,7 @@ class Arguments:
     skip_unprocessed: bool = False
     log_level: str = "INFO"
     print_report: bool = False
+    csv_report: str = ""
     include_metadata: bool = True
     verify: bool = True
 
@@ -58,6 +60,7 @@ def init_db():
                     total_embedding_tokens INTEGER,
                     total_llm_cost REAL,
                     total_llm_tokens INTEGER,
+                    error_message TEXT,
                     updated_at TEXT,
                     created_at TEXT
                 )"""
@@ -73,6 +76,7 @@ def init_db():
         "total_embedding_tokens": "INTEGER",
         "total_llm_cost": "REAL",
         "total_llm_tokens": "INTEGER",
+        "error_message": "TEXT",
     }
 
     # Add missing columns
@@ -126,10 +130,14 @@ def update_db(
     total_embedding_tokens = None
     total_llm_cost = None
     total_llm_tokens = None
+    error_message = None
 
     if result is not None:
         total_embedding_cost, total_llm_cost, total_embedding_tokens, total_llm_tokens = calculate_cost_and_tokens(result)
 
+    if execution_status is "ERROR":
+        error_message = extract_error_message(result)
+
     conn = sqlite3.connect(DB_NAME)
     conn.set_trace_callback(
         lambda x: (
@@ -142,8 +150,8 @@ def update_db(
     now = datetime.now().isoformat()
     c.execute(
         """
-        INSERT OR REPLACE INTO file_status (file_name, execution_status, result, time_taken, status_code, status_api_endpoint, total_embedding_cost, total_embedding_tokens, total_llm_cost, total_llm_tokens, updated_at, created_at)
-        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, COALESCE((SELECT created_at FROM file_status WHERE file_name = ?), ?))
+        INSERT OR REPLACE INTO file_status (file_name, execution_status, result, time_taken, status_code, status_api_endpoint, total_embedding_cost, total_embedding_tokens, total_llm_cost, total_llm_tokens, error_message, updated_at, created_at)
+        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, COALESCE((SELECT created_at FROM file_status WHERE file_name = ?), ?))
         """,
         (
             file_name,
@@ -156,6 +164,7 @@ def update_db(
             total_embedding_tokens,
             total_llm_cost,
             total_llm_tokens,
+            error_message,
             now,
             file_name,
             now,
@@ -211,6 +220,17 @@ def calculate_cost_and_tokens(result):
         
     return total_embedding_cost, total_llm_cost, total_embedding_tokens, total_llm_tokens
 
+# Exract error message from the result JSON
+def extract_error_message(result):
+    result_data = json.loads(result)
+    # Check for error in extraction_result
+    extraction_result = result_data.get("extraction_result", [])
+    if extraction_result and isinstance(extraction_result, list):
+        for item in extraction_result:
+            if "error" in item and item["error"]:
+                return item["error"]
+    # Fallback to the parent error
+    return result_data.get("error", "No error message found")
 
 # Print final summary with count of each status and average time using a single SQL query
 def print_summary():
@@ -243,7 +263,7 @@ def print_report():
     # Fetch required fields, including total_cost and total_tokens
     c.execute(
         """
-        SELECT file_name, execution_status, time_taken, total_embedding_cost, total_embedding_tokens, total_llm_cost, total_llm_tokens
+        SELECT file_name, execution_status, time_taken, total_embedding_cost, total_embedding_tokens, total_llm_cost, total_llm_tokens, error_message
         FROM file_status
     """
     )
@@ -254,23 +274,69 @@ def print_report():
     print("\nDetailed Report:")
     if report_data:
         # Tabulate the data with column headers
-        headers = ["File Name", "Execution Status", "Time Elapsed (seconds)", "Total Embedding Cost", "Total Embedding Tokens", "Total LLM Cost", "Total LLM Tokens"]
+        headers = [
+            textwrap.fill(header, width=20) 
+            for header in [
+                "File Name", 
+                "Execution Status", 
+                "Time Elapsed (seconds)", 
+                "Total Embedding Cost", 
+                "Total Embedding Tokens", 
+                "Total LLM Cost", 
+                "Total LLM Tokens",
+                "Error Message"
+            ]
+        ]
+
 
-        # Wrap text in each column to a specific width (e.g., 30 characters for file names and 20 for others) and return None if the value is NULL
         formatted_data = []
+        # Wrap text in each column to a specific width (e.g., 30 characters for file names and 20 for others) and return None if the value is NULL
         for row in report_data:
             formatted_row = [
                 "None" if cell is None else
-                textwrap.fill(str(cell), width=30) if isinstance(cell, str) else 
-                f"{cell:.8f}" if isinstance(cell, float) else cell
-                for cell in row
-            ]
-            formatted_data.append(formatted_row)
+                textwrap.fill(str(cell), width=30) if isinstance(cell, str) else
+                cell if idx == 2 else f"{cell:.8f}" if isinstance(cell, float) else cell
+                for idx, cell in enumerate(row)
+        ]
+        formatted_data.append(formatted_row)
 
         print(tabulate(formatted_data, headers=headers, tablefmt="pretty"))
     else:
         print("No records found in the database.")
 
+def export_report_to_csv(output_path):
+    conn = sqlite3.connect(DB_NAME)
+    c = conn.cursor()
+
+    c.execute(
+        """
+        SELECT file_name, execution_status, time_taken, total_embedding_cost, total_embedding_tokens, total_llm_cost, total_llm_tokens, error_message
+        FROM file_status
+        """
+    )
+    report_data = c.fetchall()
+    conn.close()
+
+    if not report_data:
+        print("No data available to export.")
+        return
+
+    # Define the headers
+    headers = [
+        "File Name", "Execution Status", "Time Elapsed (seconds)",
+        "Total Embedding Cost", "Total Embedding Tokens",
+        "Total LLM Cost", "Total LLM Tokens", "Error Message"
+    ]
+
+    try:
+        with open(output_path, 'w', newline='') as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow(headers)  # Write headers
+            writer.writerows(report_data)  # Write data rows
+        print(f"CSV successfully exported to {output_path}")
+    except Exception as e:
+        print(f"Error exporting to CSV: {e}")
+
 
 def get_status_endpoint(file_path, client, args: Arguments):
     """Returns status_endpoint, status and response (if available)"""
@@ -523,6 +589,12 @@ def main():
         help="Disable SSL certificate verification.",
     )
 
+    parser.add_argument(
+        '--csv_report',
+        dest="csv_report",
+        type=str,
+        help='Path to export the detailed report as a CSV file',
+    )
 
     args = Arguments(**vars(parser.parse_args()))
 
@@ -543,6 +615,9 @@ def main():
             "Elapsed time calculation of a file which was resumed"
             " from pending state will not be correct"
         )
+    
+    if args.csv_report:
+        export_report_to_csv(args.csv_report)
 
 
 if __name__ == "__main__":

From fb88266ac797b80798a03b22122845bae2fe4cb1 Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Mon, 25 Nov 2024 21:18:25 +0530
Subject: [PATCH 2/3] Modified README.md to include details about csv_report
 arg and error_msg column

---
 README.md | 2 ++
 main.py   | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d9be71f..58251df 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ The script uses a local SQLite database (`file_processing.db`) with the followin
   - `total_embedding_tokens` (INTEGER): Total tokens used for embeddings.
   - `total_llm_cost` (REAL): Total cost incurred for LLM operations.
   - `total_llm_tokens` (INTEGER): Total tokens used for LLM operations.
+  - `error_message` (TEXT): Details of errors if `execution_status` is `ERROR`; otherwise NULL.
   - `updated_at` (TEXT): Last updated timestamp
   - `created_at` (TEXT): Creation timestamp
 
@@ -66,6 +67,7 @@ This will display detailed usage information.
 - `--print_report`: Print a detailed report of all processed files at the end.
 - `--exclude_metadata`: Exclude metadata on tokens consumed and the context passed to LLMs for prompt studio exported tools in the result for each file.
 - `--no_verify`: Disable SSL certificate verification. (By default, SSL verification is enabled.)
+- `--csv_report`: Path to export the detailed report as a CSV file.
 
 ## Usage Examples
 
diff --git a/main.py b/main.py
index 9b45338..333382d 100644
--- a/main.py
+++ b/main.py
@@ -16,7 +16,7 @@
 from tqdm import tqdm
 from unstract.api_deployments.client import APIDeploymentsClient
 
-DB_NAME = "/home/praveen/Documents/db/demo.db"
+DB_NAME = "file_processing.db"
 global_arguments = None
 logger = logging.getLogger(__name__)
 

From cc8d8462a789c15cab04b47f59db44fa3b6355fc Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Mon, 25 Nov 2024 21:37:17 +0530
Subject: [PATCH 3/3] Fixed condition check for execution_status

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 333382d..b4f8281 100644
--- a/main.py
+++ b/main.py
@@ -135,7 +135,7 @@ def update_db(
     if result is not None:
         total_embedding_cost, total_llm_cost, total_embedding_tokens, total_llm_tokens = calculate_cost_and_tokens(result)
 
-    if execution_status is "ERROR":
+    if execution_status == "ERROR":
         error_message = extract_error_message(result)
 
     conn = sqlite3.connect(DB_NAME)