[IMPROVE] extract now works on segments instead of files - easier to debug
BenCretois committed Jan 9, 2025
1 parent cded2d6 commit d3b4b61
Showing 2 changed files with 45 additions and 35 deletions.
72 changes: 38 additions & 34 deletions src/analysefs.py
@@ -100,8 +100,8 @@ def analyzeFile(fpath: pathlib.Path):

     # Start time
     start_time = datetime.datetime.now()
-    offset = 0
-    duration = cfg.FILE_SPLITTING_DURATION
+    #offset = 0
+    #duration = cfg.FILE_SPLITTING_DURATION
     start, end = 0, cfg.SIG_LENGTH
     results = {}
     result_file_name = get_result_file_names(fpath)
@@ -113,48 +113,52 @@ def analyzeFile(fpath: pathlib.Path):
print(f"Analyzing {fpath}", flush=True)

# Process each chunk
while offset < fileLengthSeconds:
chunks = splitSignal(wave, sr, cfg.SIG_LENGTH, cfg.SIG_OVERLAP, cfg.SIG_MINLEN)
samples = []
timestamps = []
#while offset < fileLengthSeconds:
chunks = splitSignal(wave, sr, cfg.SIG_LENGTH, cfg.SIG_OVERLAP, cfg.SIG_MINLEN)
print(len(chunks))
samples = []
timestamps = []

for chunk_index, chunk in enumerate(chunks):
print(chunk_index)
# Add to batch
samples.append(chunk)
timestamps.append([start, end])

for chunk_index, chunk in enumerate(chunks):
# Add to batch
samples.append(chunk)
timestamps.append([start, end])
# Advance start and end
start += cfg.SIG_LENGTH - cfg.SIG_OVERLAP
end = start + cfg.SIG_LENGTH

# Advance start and end
start += cfg.SIG_LENGTH - cfg.SIG_OVERLAP
end = start + cfg.SIG_LENGTH
print(start)

# Check if batch is full or last chunk
if len(samples) < cfg.BATCH_SIZE and chunk_index < len(chunks) - 1:
continue
# Check if batch is full or last chunk
if len(samples) < cfg.BATCH_SIZE and chunk_index < len(chunks) - 1:
continue

# Predict
p = predict(samples)
# Predict
p = predict(samples)

# Add to results
for i in range(len(samples)):
# Get timestamp
s_start, s_end = timestamps[i]
# Add to results
for i in range(len(samples)):
# Get timestamp
s_start, s_end = timestamps[i]

# Get prediction
pred = p[i]
# Get prediction
pred = p[i]

# Assign scores to labels
p_labels = zip(cfg.LABELS, pred, strict=False)
# Assign scores to labels
p_labels = zip(cfg.LABELS, pred, strict=False)

# Sort by score
p_sorted = sorted(p_labels, key=operator.itemgetter(1), reverse=True)
# Sort by score
p_sorted = sorted(p_labels, key=operator.itemgetter(1), reverse=True)

# Store top 5 results and advance indices
results[str(s_start) + "-" + str(s_end)] = p_sorted
# Store top 5 results and advance indices
results[str(s_start) + "-" + str(s_end)] = p_sorted

# Clear batch
samples = []
timestamps = []
offset = offset + duration
# Clear batch
samples = []
timestamps = []
#offset = offset + duration

saveResultFiles(results, result_file_name, fpath, cfg.SAMPLE_RATE)
delta_time = (datetime.datetime.now() - start_time).total_seconds()
8 changes: 7 additions & 1 deletion src/global_sampler.py
@@ -13,14 +13,20 @@
 # Read the original Parquet file
 parquet_df = pd.read_parquet(config["PARQUET_DB"])
 
+# Filter for segments where start < 3600
+filtered_df = parquet_df[parquet_df["start"] < 3600]
+
 # Limit the total number of segments per species across all files
 num_segments_per_species = config["NUM_SEGMENTS"]
 sampled_df = (
-    parquet_df.groupby("species")
+    filtered_df.groupby("species")
     .apply(lambda x: x.sample(min(len(x), num_segments_per_species), random_state=None))
     .reset_index(drop=True)
 )
 
+# Add a unique row identifier
+sampled_df['rowid'] = range(len(sampled_df))
+
 # Save the globally sampled DataFrame to a new parquet file
 sampled_df.to_parquet(config["TO_EXTRACT_FILE"])
 print(f"Global sampling complete. Saved to {config['TO_EXTRACT_FILE']}")
