From 16e79e217d37ee10f2bab1ebf044ed13f55f16d3 Mon Sep 17 00:00:00 2001
From: TeachMeTW <robin@robinttw.com>
Date: Mon, 23 Dec 2024 18:50:21 -0800
Subject: [PATCH] Added unit tests to verify that the stats are generated in
 both cases: (i) when there is new data and (ii) when there is no new data.

---
 emission/analysis/result/user_stat.py         |   4 +-
 .../analysisTests/intakeTests/TestUserStat.py | 108 +++++++++++++++++-
 2 files changed, 108 insertions(+), 4 deletions(-)

diff --git a/emission/analysis/result/user_stat.py b/emission/analysis/result/user_stat.py
index fa1d7ac95..27d633cec 100644
--- a/emission/analysis/result/user_stat.py
+++ b/emission/analysis/result/user_stat.py
@@ -61,9 +61,9 @@ def get_and_store_user_stats(user_id: str, trip_key: str) -> None:
         end_ts_result = ts.get_first_value_for_field(trip_key, "data.end_ts", pymongo.DESCENDING)
         end_ts = None if end_ts_result == -1 else end_ts_result
 
-        total_trips = ts.find_entries_count(key_list=["analysis/confirmed_trip"])
+        total_trips = ts.find_entries_count(key_list=[trip_key])
         labeled_trips = ts.find_entries_count(
-            key_list=["analysis/confirmed_trip"],
+            key_list=[trip_key],
             extra_query_list=[{'data.user_input': {'$ne': {}}}]
         )
 
diff --git a/emission/tests/analysisTests/intakeTests/TestUserStat.py b/emission/tests/analysisTests/intakeTests/TestUserStat.py
index 207aa0a98..8cae238b1 100644
--- a/emission/tests/analysisTests/intakeTests/TestUserStat.py
+++ b/emission/tests/analysisTests/intakeTests/TestUserStat.py
@@ -57,7 +57,7 @@ def tearDown(self):
         edb.get_analysis_timeseries_db().delete_many({"user_id": self.testUUID})
         edb.get_profile_db().delete_one({"user_id": self.testUUID})
 
-    def testGetAndStoreUserStats(self):
+    def testGetAndStoreUserStatsDefault(self):
         """
         Test get_and_store_user_stats for the user to ensure that user statistics
         are correctly aggregated and stored in the user profile.
@@ -75,7 +75,7 @@ def testGetAndStoreUserStats(self):
         self.assertIn("pipeline_range", profile, "User profile should contain 'pipeline_range'.")
         self.assertIn("last_call_ts", profile, "User profile should contain 'last_call_ts'.")
 
-        expected_total_trips = 5
+        expected_total_trips = 8
         expected_labeled_trips = 0
 
         self.assertEqual(profile["total_trips"], expected_total_trips,
@@ -115,6 +115,110 @@ def testLastCall(self):
             f"Expected last_call_ts to be {expected_last_call_ts}, got {actual_last_call_ts}"
         )
 
+    def testGetAndStoreUserStatsSecondRunNoNewData(self):
+        """
+        Case (ii): Verify total_trips and labeled_trips remain unchanged if
+        we run the pipeline again without adding new data.
+        """
+        # Snapshot the stats stored by the initial run (from setUp()).
+        initial_profile = edb.get_profile_db().find_one({"user_id": self.testUUID})
+        self.assertIsNotNone(initial_profile, "User profile should exist after first run.")
+        initial_total_trips = initial_profile["total_trips"]
+        initial_labeled_trips = initial_profile["labeled_trips"]
+
+        # Run the pipeline again, but don't add any new data
+        etc.runIntakePipeline(self.testUUID)
+
+        # Re-read the profile: both counters must match the snapshot above
+        updated_profile = edb.get_profile_db().find_one({"user_id": self.testUUID})
+        self.assertIsNotNone(updated_profile, "Profile should still exist.")
+        self.assertEqual(
+            updated_profile["total_trips"],
+            initial_total_trips,
+            f"Expected total_trips to remain {initial_total_trips}, got {updated_profile['total_trips']}"
+        )
+        self.assertEqual(
+            updated_profile["labeled_trips"],
+            initial_labeled_trips,
+            f"Expected labeled_trips to remain {initial_labeled_trips}, got {updated_profile['labeled_trips']}"
+        )
+
+
+    def testGetAndStoreUserStatsNewData(self):
+        """
+        Case (i): Verify stats are updated when new data is inserted and the
+        pipeline is rerun.
+
+        Entries from shankari_2015-aug-27 are re-keyed to the test user and
+        added without modifying the original data; total_trips is then
+        expected to grow from 8 to 18 and labeled_trips must not decrease.
+        """
+        # 1. Retrieve the initial user profile after setUp()
+        initial_profile = edb.get_profile_db().find_one({"user_id": self.testUUID})
+        self.assertIsNotNone(initial_profile, "User profile should exist after the first run.")
+
+        # 2. Assert that the initial total trips are as expected (8 trips)
+        expected_initial_trips = 8
+        self.assertEqual(
+            initial_profile["total_trips"],
+            expected_initial_trips,
+            f"Expected initial total_trips to be {expected_initial_trips}, got {initial_profile['total_trips']}"
+        )
+
+        # Keep the labeled-trip count so step 8 can check it did not drop
+        initial_labeled_trips = initial_profile["labeled_trips"]
+
+        # 3. Load and prepare new data from shankari_2015-aug-27
+        aug27_file_path = "emission/tests/data/real_examples/shankari_2015-aug-27"
+
+        # Keep only the read/parse step inside the try: those are the only
+        # operations the handlers below are meant to report on.
+        try:
+            with open(aug27_file_path) as fp:
+                # Load entries using the existing JSON wrapper
+                aug27_entries = json.load(fp, object_hook=esj.wrapped_object_hook)
+        except FileNotFoundError:
+            self.fail(f"New data file not found at path: {aug27_file_path}")
+        except json.JSONDecodeError as e:
+            self.fail(f"JSON decoding failed for file {aug27_file_path}: {e}")
+
+        new_entries = []
+        for entry in aug27_entries:
+            # Replace the user_id UUID with self.testUUID
+            entry['user_id'] = self.testUUID
+            # Remove the '_id' field to let MongoDB assign a new one
+            if '_id' in entry:
+                del entry['_id']
+            new_entries.append(entry)
+
+        # 4. Insert the new entries into the timeseries collection
+        if not new_entries:
+            self.fail("No new entries were loaded from the new data file.")
+        edb.get_timeseries_db().insert_many(new_entries)
+
+        # 5. Run the pipeline again to process the newly inserted entries
+        etc.runIntakePipeline(self.testUUID)
+
+        # 6. Retrieve the updated user profile after processing new data
+        updated_profile = edb.get_profile_db().find_one({"user_id": self.testUUID})
+        self.assertIsNotNone(updated_profile, "Profile should exist after inserting new data.")
+
+        # 7. Assert that the total trips have increased from 8 to 18
+        expected_final_trips = 18
+        self.assertEqual(
+            updated_profile["total_trips"],
+            expected_final_trips,
+            f"Expected total_trips to be {expected_final_trips}, got {updated_profile['total_trips']}"
+        )
+
+        # 8. Ensure that labeled_trips is not less than it was before
+        self.assertGreaterEqual(
+            updated_profile["labeled_trips"],
+            initial_labeled_trips,
+            f"Expected labeled_trips >= {initial_labeled_trips}, got {updated_profile['labeled_trips']}"
+        )
+
+
 if __name__ == '__main__':
     # Configure logging for the test
     etc.configLogging()