[IMPROVE] extract now works on segments instead of files - easier to debug
BenCretois committed Jan 9, 2025
1 parent cded2d6 commit d3b4b61
Showing 2 changed files with 45 additions and 35 deletions.
72 changes: 38 additions & 34 deletions src/analysefs.py
@@ -100,8 +100,8 @@ def analyzeFile(fpath: pathlib.Path):

     # Start time
     start_time = datetime.datetime.now()
-    offset = 0
-    duration = cfg.FILE_SPLITTING_DURATION
+    #offset = 0
+    #duration = cfg.FILE_SPLITTING_DURATION
     start, end = 0, cfg.SIG_LENGTH
     results = {}
     result_file_name = get_result_file_names(fpath)
@@ -113,48 +113,52 @@ def analyzeFile(fpath: pathlib.Path):
print(f"Analyzing {fpath}", flush=True)

# Process each chunk
while offset < fileLengthSeconds:
chunks = splitSignal(wave, sr, cfg.SIG_LENGTH, cfg.SIG_OVERLAP, cfg.SIG_MINLEN)
samples = []
timestamps = []
#while offset < fileLengthSeconds:
chunks = splitSignal(wave, sr, cfg.SIG_LENGTH, cfg.SIG_OVERLAP, cfg.SIG_MINLEN)
print(len(chunks))
samples = []
timestamps = []

for chunk_index, chunk in enumerate(chunks):
print(chunk_index)
# Add to batch
samples.append(chunk)
timestamps.append([start, end])

for chunk_index, chunk in enumerate(chunks):
# Add to batch
samples.append(chunk)
timestamps.append([start, end])
# Advance start and end
start += cfg.SIG_LENGTH - cfg.SIG_OVERLAP
end = start + cfg.SIG_LENGTH

# Advance start and end
start += cfg.SIG_LENGTH - cfg.SIG_OVERLAP
end = start + cfg.SIG_LENGTH
print(start)

# Check if batch is full or last chunk
if len(samples) < cfg.BATCH_SIZE and chunk_index < len(chunks) - 1:
continue
# Check if batch is full or last chunk
if len(samples) < cfg.BATCH_SIZE and chunk_index < len(chunks) - 1:
continue

# Predict
p = predict(samples)
# Predict
p = predict(samples)

# Add to results
for i in range(len(samples)):
# Get timestamp
s_start, s_end = timestamps[i]
# Add to results
for i in range(len(samples)):
# Get timestamp
s_start, s_end = timestamps[i]

# Get prediction
pred = p[i]
# Get prediction
pred = p[i]

# Assign scores to labels
p_labels = zip(cfg.LABELS, pred, strict=False)
# Assign scores to labels
p_labels = zip(cfg.LABELS, pred, strict=False)

# Sort by score
p_sorted = sorted(p_labels, key=operator.itemgetter(1), reverse=True)
# Sort by score
p_sorted = sorted(p_labels, key=operator.itemgetter(1), reverse=True)

# Store top 5 results and advance indices
results[str(s_start) + "-" + str(s_end)] = p_sorted
# Store top 5 results and advance indices
results[str(s_start) + "-" + str(s_end)] = p_sorted

# Clear batch
samples = []
timestamps = []
offset = offset + duration
# Clear batch
samples = []
timestamps = []
#offset = offset + duration

saveResultFiles(results, result_file_name, fpath, cfg.SAMPLE_RATE)
delta_time = (datetime.datetime.now() - start_time).total_seconds()
8 changes: 7 additions & 1 deletion src/global_sampler.py
@@ -13,14 +13,20 @@
 # Read the original Parquet file
 parquet_df = pd.read_parquet(config["PARQUET_DB"])
 
+# Filter for segments where start < 3600
+filtered_df = parquet_df[parquet_df["start"] < 3600]
+
 # Limit the total number of segments per species across all files
 num_segments_per_species = config["NUM_SEGMENTS"]
 sampled_df = (
-    parquet_df.groupby("species")
+    filtered_df.groupby("species")
     .apply(lambda x: x.sample(min(len(x), num_segments_per_species), random_state=None))
     .reset_index(drop=True)
 )
 
+# Add a unique row identifier
+sampled_df['rowid'] = range(len(sampled_df))
+
 # Save the globally sampled DataFrame to a new parquet file
 sampled_df.to_parquet(config["TO_EXTRACT_FILE"])
 print(f"Global sampling complete. Saved to {config['TO_EXTRACT_FILE']}")
