ProcessJSON.py
import json
import csv
from collections import defaultdict
from pathlib import Path
import os

# Define the directories
pycefr_dir = '/pycefr'  # Path to the PyCEFR scripts
json_data_dir = os.path.join(pycefr_dir, 'DATA_JSON')  # Where JSON data is stored
output_dir = '/CompetencyScore'

# Ensure output directory exists
Path(output_dir).mkdir(parents=True, exist_ok=True)
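# A sketch of the input layout the functions below assume, inferred from how the
# data is read (the file stem as the top-level key, per-file entries whose names
# encode project/author/date/time/status, each holding a 'Levels' dict). The
# commit hash, name parts, level names, and scores are illustrative placeholders,
# not values from an actual PyCEFR run:
#
# <commit_hash>.json
# {
#     "<commit_hash>": {
#         "<prefix>_<project>_<author_id>_<date>_<time>_after_<rest>.py": {
#             "Levels": {"A1": 4, "B1": 2}
#         },
#         "<prefix>_<project>_<author_id>_<date>_<time>_before_<rest>.py": {
#             "Levels": {"A1": 3}
#         }
#     }
# }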
def process_json_files():
    """
    Processes JSON files generated by PyCEFR to extract competency levels and
    generate summary CSV and JSON files.
    """
    for json_file in Path(json_data_dir).glob('*.json'):
        process_json_file(json_file)
def process_json_file(json_file):
    """
    Processes a single JSON file generated by PyCEFR to extract competency levels and
    generate summary CSV and JSON files.
    """
    with open(json_file) as f:
        data = json.load(f)

    commit_hash = json_file.stem
    all_files_data = data.get(commit_hash, {})
    after_sum, before_sum = defaultdict(int), defaultdict(int)
    project_name = author_id = author_date_format = time_format = None

    for file_name, file_content in all_files_data.items():
        # Expected name: <prefix>_<project>_<author_id>_<date>_<time>_<status>_...
        parts = file_name.split('_')
        if len(parts) < 7:  # Adjusted for time format inclusion
            print(f"Unexpected filename structure: {file_name}")
            continue
        project_name, author_id, author_date_format, time_format, status = parts[1], parts[2], parts[3], parts[4], parts[5]
        for level, score in file_content['Levels'].items():
            if 'after' in status:
                after_sum[level] += score
            elif 'before' in status:
                before_sum[level] += score

    if project_name is None:  # No file in this commit matched the expected naming
        return

    diff = {level: after_sum[level] - before_sum.get(level, 0) for level in set(after_sum) | set(before_sum)}
    generate_summary_files(commit_hash, project_name, author_id, author_date_format, time_format, after_sum, before_sum, diff)
def generate_summary_files(commit_hash, project_name, author_id, author_date_format, time_format, after_sum, before_sum, diff):
    """
    Generates CSV and JSON summary files for competency levels, including time format.
    """
    output_csv_dir = os.path.join(output_dir, 'CSV', project_name, author_id)
    output_json_dir = os.path.join(output_dir, 'JSON', project_name, author_id)
    Path(output_csv_dir).mkdir(parents=True, exist_ok=True)
    Path(output_json_dir).mkdir(parents=True, exist_ok=True)

    summary_base = f"{commit_hash}_summary_{author_date_format}_{time_format}"
    csv_path = os.path.join(output_csv_dir, f"{summary_base}.csv")
    json_path = os.path.join(output_json_dir, f"{summary_base}.json")

    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['CommitHash', 'ProjectName', 'AuthorID', 'AuthorDateFormat', 'TimeFormat', 'Level', 'After', 'Before', 'Difference'])
        for level in sorted(set(after_sum.keys()).union(before_sum.keys())):
            writer.writerow([
                commit_hash, project_name, author_id, author_date_format, time_format,
                level, after_sum.get(level, 0), before_sum.get(level, 0), diff.get(level, 0)
            ])

    with open(json_path, 'w') as jsonfile:
        json.dump({
            'CommitHash': commit_hash,
            'ProjectName': project_name,
            'AuthorID': author_id,
            'AuthorDateFormat': author_date_format,
            'TimeFormat': time_format,
            'Levels': {
                'After': dict(after_sum),
                'Before': dict(before_sum),
                'Difference': dict(diff)
            }
        }, jsonfile, indent=4)
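# Expected outputs, sketched from generate_summary_files above (paths shown with
# placeholder names): one CSV and one JSON summary per processed commit, e.g.
#   /CompetencyScore/CSV/<project>/<author_id>/<commit>_summary_<date>_<time>.csv
#   /CompetencyScore/JSON/<project>/<author_id>/<commit>_summary_<date>_<time>.json
# The CSV holds one row per level; the JSON groups the same scores under
# 'Levels' -> 'After' / 'Before' / 'Difference'.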
# Run the processing when this file is executed as a script
if __name__ == '__main__':
    process_json_files()