Skip to content

Commit

Permalink
Update extract-db-otm.py
Browse files: browse the repository at this point in the history
  • Loading branch information
TheAIWizard authored Sep 8, 2024
1 parent abf65d4 commit d369556
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions utils/extract-db-otm.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def sample_data(df_path: str, n_lines: str):
print(len(stratified_sample_df_cg))

# Récupérer la dernière date disponible dans la table
last_date = df['date_modification_dt'].max().strftime("%Y%m%d")
#last_date = df['date_modification_dt'].max().strftime("%Y%m%d")

# Supprimer la colonne datetime si elle existe, après traitement (pour traitement ultérieur JSON)
if 'date_modification_dt' in stratified_sample_df.columns:
Expand All @@ -96,7 +96,7 @@ def sample_data(df_path: str, n_lines: str):
# Partitionner par 'categorie_demande' et sauvegarder chaque partition dans un fichier Parquet
for categorie, partition_df in stratified_sample.groupby('categorie_demande'):
if categorie in ["CG","SOCET"]:
partition_file = os.path.join(f's3://projet-ape/label-studio/annotation-campaign-2024/rev-NAF2025/{categorie}/data-samples/queue/extrait_{categorie}_sirene_last_date_{last_date}.parquet')
partition_file = os.path.join(f's3://projet-ape/label-studio/annotation-campaign-2024/rev-NAF2025/{categorie}/data-samples/queue/extrait_{categorie}_sirene_last_date_{"unavaible_for_now"}.parquet')
pq.write_table(pa.Table.from_pandas(partition_df), partition_file, filesystem=fs)
print(f'Saved {partition_file}')

Expand Down

0 comments on commit d369556

Please sign in to comment.