From 9abd3add6e1696ef6dd91e5d3158bf30964f7701 Mon Sep 17 00:00:00 2001 From: Ramakrishna Sakhamuru Date: Mon, 22 Jan 2024 10:53:43 +0530 Subject: [PATCH 1/8] script to generate test data --- Importer/scripts/generate_test_data.py | 70 ++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 Importer/scripts/generate_test_data.py diff --git a/Importer/scripts/generate_test_data.py b/Importer/scripts/generate_test_data.py new file mode 100644 index 0000000..ee4c9ff --- /dev/null +++ b/Importer/scripts/generate_test_data.py @@ -0,0 +1,70 @@ +""" +This is useful to generate json data files by reading the data file as csv. +""" + +import csv +import json +import os +import sys +from collections import defaultdict + +def parse_header(header): + """ Parse the header to get the filename and the JSON key. """ + parts = header.split('.') + return parts[0], '.'.join(parts[1:]) if len(parts) > 1 else None + +def set_value(dct, keys, value): + """ Set a value in a nested dictionary based on a list of keys. """ + for key in keys[:-1]: + if key not in dct or not isinstance(dct[key], dict): + dct[key] = {} + dct = dct[key] + dct[keys[-1]] = value if value != '' else None + +def write_json_files(data, written_files, output_dir): + """ Write the organized data to JSON files, each JSON object in a single line. """ + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + for filename, contents in data.items(): + file_path = os.path.join(output_dir, f"{filename}.json") + mode = 'a' if filename in written_files else 'w' + with open(file_path, mode, encoding='utf-8') as json_file: + for content in contents: + json.dump(content, json_file) + json_file.write('\n') + written_files.add(filename) + +def csv_to_json(csv_file_paths, output_dir): + written_files = set() + for csv_file_path in csv_file_paths: + data = defaultdict(list) + + with open(csv_file_path, newline='', encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile) + headers = reader.fieldnames + + for row in reader: + json_objects = defaultdict(dict) + for header in headers: + if '.' in header: + filename, json_key = parse_header(header) + if json_key: + nested_keys = json_key.split('.') + set_value(json_objects[filename], nested_keys, row[header]) + + # Add the created JSON objects to the data + for filename, json_obj in json_objects.items(): + data[filename].append(json_obj) + + write_json_files(data, written_files, output_dir) + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python script.py ...") + sys.exit(1) + + csv_file_paths = sys.argv[1:] + output_dir = os.path.join("..", "test_database", "jct") # Output directory one level up + csv_to_json(csv_file_paths, output_dir) + From 132639094346629d3caf05563d93656d1be73ed2 Mon Sep 17 00:00:00 2001 From: Ramakrishna Sakhamuru Date: Tue, 23 Jan 2024 12:13:00 +0530 Subject: [PATCH 2/8] added documentation --- Importer/scripts/generate_test_data.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Importer/scripts/generate_test_data.py b/Importer/scripts/generate_test_data.py index ee4c9ff..90c2260 100644 --- a/Importer/scripts/generate_test_data.py +++ b/Importer/scripts/generate_test_data.py @@ -1,5 +1,10 @@ """ -This is useful to generate json data files by reading the data file as csv. +This script is useful to generate json data files by reading the data file as csv. +The script generates output at test_database/jct directory at one level above the scripts directory. + +Usage: + python generate_test_data.py [ ... ] + """ import csv @@ -61,7 +66,7 @@ def csv_to_json(csv_file_paths, output_dir): if __name__ == "__main__": if len(sys.argv) < 2: - print("Usage: python script.py ...") + print("Usage: python generate_test_data.py ...") sys.exit(1) csv_file_paths = sys.argv[1:] From 92d3fd9bb21a731792f84b25587dee6442f38485 Mon Sep 17 00:00:00 2001 From: Ramakrishna Sakhamuru Date: Fri, 9 Feb 2024 14:59:52 +0530 Subject: [PATCH 3/8] Updated the script to generate more data for journal.json and to create jac.json --- Importer/scripts/generate_test_data.py | 74 ++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/Importer/scripts/generate_test_data.py b/Importer/scripts/generate_test_data.py index 90c2260..2bdd092 100644 --- a/Importer/scripts/generate_test_data.py +++ b/Importer/scripts/generate_test_data.py @@ -12,12 +12,60 @@ import os import sys from collections import defaultdict +from datetime import datetime, timedelta +import random + +def calculate_issn_check_digit(issn): + """Calculate the ISSN check digit for the first seven ISSN digits.""" + sum_of_digits = sum((8 - i) * int(digit) for i, digit in enumerate(issn)) + remainder = sum_of_digits % 11 + check_digit = 0 if remainder == 0 else 11 - remainder + # Actual checksum digit would be 'X' if check_digit == 10 else str(check_digit) + # return a fake checksum so that generated issn will not match with real issn + return '1' if check_digit == 10 or check_digit == 0 else str(check_digit - 1) + + +def generate_issn(): + """Generate a valid ISSN.""" + first_seven_digits = ''.join(str(random.randint(0, 9)) for _ in range(7)) + check_digit = calculate_issn_check_digit(first_seven_digits) + return f"{first_seven_digits[:4]}-{first_seven_digits[4:]}{check_digit}" + +def generate_created_at(): + """Generate a dynamic creation datetime string in ISO 8601 format.""" + now = datetime.utcnow() - timedelta(days=random.randint(0, 365), hours=random.randint(0, 23), + minutes=random.randint(0, 59)) + return now.strftime('%Y-%m-%dT%H:%M:%SZ') def parse_header(header): """ Parse the header to get the filename and the JSON key. """ parts = header.split('.') return parts[0], '.'.join(parts[1:]) if len(parts) > 1 else None +counter = 0 + +def generate_random_title_and_publisher(): + """Generate a random title and publisher from predefined lists.""" + global counter + counter = counter + 1 + title = "Test title " + str(counter) + publisher = "Test publisher " +str(counter) + return title, publisher + +def generate_jac_entry(issn): + """Generate an entry for jac.json with test data.""" + title, publisher = generate_random_title_and_publisher() + return { + "issns": [issn], + "title": title, + "publisher": publisher, + "index": { + "issns": [issn, issn.replace("-", "")], + "title": [title.lower()], + "alts": [title.lower()] + } + } + def set_value(dct, keys, value): """ Set a value in a nested dictionary based on a list of keys. """ for key in keys[:-1]: @@ -31,15 +79,36 @@ def write_json_files(data, written_files, output_dir): if not os.path.exists(output_dir): os.makedirs(output_dir) + # Open jac.json file for writing if journal data is present and prepare it for writing + journal_filename = "journal" + jac_filename = "jac" + # Check if there's data for journal (case-insensitive) to process + journal_data_present = any(filename.lower() == journal_filename for filename in data.keys()) + if journal_data_present: + jac_file_path = os.path.join(output_dir, f"{jac_filename}.json") + mode = 'a' if jac_filename in written_files else 'w' + jac_file = open(jac_file_path, mode, encoding='utf-8') + for filename, contents in data.items(): file_path = os.path.join(output_dir, f"{filename}.json") mode = 'a' if filename in written_files else 'w' with open(file_path, mode, encoding='utf-8') as json_file: for content in contents: + if filename.lower() == journal_filename: + issn = generate_issn() + content.update({"issn": [issn], "createdAt": generate_created_at()}) + # Directly write the corresponding jac entry + jac_entry = generate_jac_entry(issn) + json.dump(jac_entry, jac_file) + jac_file.write('\n') json.dump(content, json_file) json_file.write('\n') written_files.add(filename) + if journal_filename in data: + jac_file.close() + written_files.add(jac_filename) + def csv_to_json(csv_file_paths, output_dir): written_files = set() for csv_file_path in csv_file_paths: @@ -66,10 +135,9 @@ def csv_to_json(csv_file_paths, output_dir): if __name__ == "__main__": if len(sys.argv) < 2: - print("Usage: python generate_test_data.py ...") + print("Usage: python script.py ...") sys.exit(1) csv_file_paths = sys.argv[1:] - output_dir = os.path.join("..", "test_database", "jct") # Output directory one level up + output_dir = os.path.join("..", "test_database") # Output directory one level up csv_to_json(csv_file_paths, output_dir) - From c6dee40122a985e0092f5223a35eb4dfffa8fb44 Mon Sep 17 00:00:00 2001 From: Ramakrishna Sakhamuru Date: Thu, 11 Apr 2024 18:00:15 +0530 Subject: [PATCH 4/8] Updated the code to load the data to es --- Importer/jctdata/cli.py | 18 +-- Importer/jctdata/lib/loader.py | 53 ++++++++- Importer/jctdata/settings.py | 2 + Importer/scripts/generate_test_data.py | 155 ++++++++++++++++++++----- funderdb/funders.py | 61 +++++++++- ui/funderdb.git | 2 +- ui/funderdb.sh | 6 +- 7 files changed, 252 insertions(+), 45 deletions(-) diff --git a/Importer/jctdata/cli.py b/Importer/jctdata/cli.py index 9cee1cc..cd622ee 100644 --- a/Importer/jctdata/cli.py +++ b/Importer/jctdata/cli.py @@ -13,19 +13,21 @@ @click.option("-o", "--only", "full_pipeline", flag_value=False) @click.option("-a", "--all", "full_pipeline", flag_value=True, default=True) @click.option("-f", "--force-resolve", is_flag=True) -def entry_point(mode, targets, stage=None, full_pipeline=True, force_resolve=False): - run(mode, targets, stage, full_pipeline, force_resolve) +@click.option("-t", "--test-database", is_flag=True) +def entry_point(mode, targets, stage=None, full_pipeline=True, force_resolve=False, test_database=False): + run(mode, targets, stage, full_pipeline, force_resolve, test_database) -def run(mode:str, targets:tuple, stage:str=None, full_pipeline:bool=True, force_resolve:bool=False): +def run(mode:str, targets:tuple, stage:str=None, full_pipeline:bool=True, force_resolve:bool=False, + test_database:bool=False): processor = MODE_MAP.get(mode) if not processor: return - processor(targets, stage, full_pipeline, force_resolve) + processor(targets, stage, full_pipeline, force_resolve, test_database) -def resolve(targets, stage=None, full_pipeline=True, force_resolve=False): +def resolve(targets, stage=None, full_pipeline=True, force_resolve=False, test_database=False): if targets[0] == "_all": targets = resolver.SOURCES.keys() @@ -43,7 +45,7 @@ def resolve(targets, stage=None, full_pipeline=True, force_resolve=False): getattr(datasource, stage)() -def index(targets, stage=None, full_pipeline=True, force_resolve=False): +def index(targets, stage=None, full_pipeline=True, force_resolve=False, test_database=False): if targets[0] == "_all": indexers = factory.get_all_indexers() else: @@ -62,7 +64,7 @@ def index(targets, stage=None, full_pipeline=True, force_resolve=False): getattr(indexer, stage)() -def load(targets, stage=None, full_pipeline=True, force_resolve=False): +def load(targets, stage=None, full_pipeline=True, force_resolve=False, test_database=False): if targets[0] == "_all": targets = factory.get_all_index_names() @@ -72,7 +74,7 @@ def load(targets, stage=None, full_pipeline=True, force_resolve=False): for t in targets: load_type = settings.INDEX_LOADERS[t] if load_type == "es": - loader.index_latest_with_alias(t, settings.ES_INDEX_SUFFIX) + loader.index_latest_with_alias(t, settings.ES_INDEX_SUFFIX, test_database) elif load_type == "file": loader.load_to_file(t) elif load_type == "helpdesk": diff --git a/Importer/jctdata/lib/loader.py b/Importer/jctdata/lib/loader.py index 0057f34..e5b2479 100644 --- a/Importer/jctdata/lib/loader.py +++ b/Importer/jctdata/lib/loader.py @@ -18,11 +18,15 @@ def index(infile, bulkfile, conn, index_type, mapping, alias): with open(infile, "r") as f, open(bulkfile, "w") as o: line = f.readline() while line: - d = json.loads(line) - if "id" not in d: - d["id"] = uuid.uuid4().hex - bulklines = esprit.raw.to_bulk_single_rec(d) - o.write(bulklines) + if line: + try: + d = json.loads(line) + if "id" not in d: + d["id"] = uuid.uuid4().hex + bulklines = esprit.raw.to_bulk_single_rec(d) + o.write(bulklines) + except json.JSONDecodeError: + print(f"skipped line {line}") line = f.readline() if not esprit.raw.type_exists(conn, index_type, es_version="1.7.5"): @@ -63,7 +67,38 @@ def index(infile, bulkfile, conn, index_type, mapping, alias): esprit.raw.delete(conn) -def index_latest_with_alias(target, index_suffix): +def update_with_test_data(input_file, output_file): + """ + Reads each line from the input file and appends it to a new line in the output file. + + Args: + input_file: Path to the input file. + output_file: Path to the output file. + """ + test_data_file = os.path.join(settings.TEST_DATABASE, input_file) + # Check if output file exists + if os.path.exists(test_data_file) and os.path.exists(output_file): + os.makedirs(settings.TEMP_DIR, exist_ok=True) + + # Clear existing files in temp directory (if any) + for filename in os.listdir(settings.TEMP_DIR): + file_path = os.path.join(settings.TEMP_DIR, filename) + os.remove(file_path) + + # Copy existing output file to temporary directory + output_file = shutil.copy2(output_file, settings.TEMP_DIR) + + # Open the input file in read mode and output file in append mode + with open(test_data_file, 'r') as in_file, open(output_file, 'a') as out_file: + # Read each line from the input file + for line in in_file: + # Write the line to the output file + out_file.write('\n' + line.strip()) + + return output_file + + +def index_latest_with_alias(target, index_suffix, test_database=False): target_dir = settings.INDEX_PATH[target] os.makedirs(target_dir, exist_ok=True) @@ -101,6 +136,12 @@ def index_latest_with_alias(target, index_suffix): print("LOADER: ALIAS: {x}".format(x=ALIAS)) print("LOADER: BULK: {x}".format(x=BULK_FILE)) + if test_database: + print("Appending test data") + # add test database records to the file + IN = update_with_test_data(target + ".json", IN) + print("LOADER: IN: {x}".format(x=IN)) + index(IN, BULK_FILE, CONN, INDEX_TYPE, MAPPING, ALIAS) diff --git a/Importer/jctdata/settings.py b/Importer/jctdata/settings.py index e6c97ec..40b9e7d 100644 --- a/Importer/jctdata/settings.py +++ b/Importer/jctdata/settings.py @@ -10,6 +10,8 @@ def rel2abs(file, *args): DATABASES = rel2abs(__file__, "..", "databases") RESOURCES = rel2abs(__file__, "..", "resources") +TEST_DATABASE = rel2abs(__file__, "..", "test_database") +TEMP_DIR = rel2abs(__file__, "..", "temp") DIR_DATE_FORMAT = "%Y-%m-%d_%H%M" diff --git a/Importer/scripts/generate_test_data.py b/Importer/scripts/generate_test_data.py index 2bdd092..306a928 100644 --- a/Importer/scripts/generate_test_data.py +++ b/Importer/scripts/generate_test_data.py @@ -6,7 +6,8 @@ python generate_test_data.py [ ... ] """ - +import argparse +import ast import csv import json import os @@ -14,6 +15,8 @@ from collections import defaultdict from datetime import datetime, timedelta import random +from jctdata import settings + def calculate_issn_check_digit(issn): """Calculate the ISSN check digit for the first seven ISSN digits.""" @@ -31,18 +34,23 @@ def generate_issn(): check_digit = calculate_issn_check_digit(first_seven_digits) return f"{first_seven_digits[:4]}-{first_seven_digits[4:]}{check_digit}" + def generate_created_at(): """Generate a dynamic creation datetime string in ISO 8601 format.""" now = datetime.utcnow() - timedelta(days=random.randint(0, 365), hours=random.randint(0, 23), minutes=random.randint(0, 59)) return now.strftime('%Y-%m-%dT%H:%M:%SZ') + def parse_header(header): """ Parse the header to get the filename and the JSON key. """ parts = header.split('.') return parts[0], '.'.join(parts[1:]) if len(parts) > 1 else None + counter = 0 +funder_counter = 0 + def generate_random_title_and_publisher(): """Generate a random title and publisher from predefined lists.""" @@ -52,6 +60,16 @@ def generate_random_title_and_publisher(): publisher = "Test publisher " +str(counter) return title, publisher + +def generate_funder_id_name(): + """Generate a random id and name for funder.""" + global funder_counter + funder_counter = funder_counter + 1 + funder_id = "test_funder_" + str(funder_counter) + name = "Test Funder " +str(funder_counter) + return funder_id, name + + def generate_jac_entry(issn): """Generate an entry for jac.json with test data.""" title, publisher = generate_random_title_and_publisher() @@ -66,6 +84,7 @@ def generate_jac_entry(issn): } } + def set_value(dct, keys, value): """ Set a value in a nested dictionary based on a list of keys. """ for key in keys[:-1]: @@ -74,13 +93,62 @@ def set_value(dct, keys, value): dct = dct[key] dct[keys[-1]] = value if value != '' else None -def write_json_files(data, written_files, output_dir): + +def get_funder_data(route, data): + FUNDER_ROW = {"routes": {"self_archiving": {"calculate": True, "rights_retention": "2022-11-01T00:00:00Z", + "license": ["cc-by", "cc-by-sa", "cc0"], "embargo": 0}, "fully_oa": {"calculate": True, + "license": ["cc-by", "cc-by-sa", "cc0"]}, "hybrid": {"calculate": False, + "license": ["cc-by", "cc-by-sa", "cc0"]}, "ta": {"calculate": True, + "license": ["cc-by", "cc-by-sa", "cc0"]}, "tj": {"calculate": True}}, + "card_order": ["fully_oa", "fully_oa_by_exception", "ta", "ta_aq", "tj", "self_archiving", "sa_rr", + "journal_non_compliant", "funder_non_compliant", "institution_non_compliant", + "rights_retention_non_compliant"], + "cards": [{"id": "fully_oa", "compliant": True, "match_routes": {"must": ["fully_oa"]}, + "match_qualifications": {"not": ["fully_oa.oa_exception_caveat"]}, "preferred": False}, + {"id": "fully_oa_by_exception", "compliant": True, "match_routes": {"must": ["fully_oa"]}, + "match_qualifications": {"must": ["fully_oa.oa_exception_caveat"]}, "preferred": False}, + {"id": "sa_rr", "compliant": True, "match_routes": {"must": ["self_archiving"], + "not": ["fully_oa"]}, "match_qualifications": + {"must": ["self_archiving.rights_retention_author_advice"]}, "preferred": False, + "modal": "sa_rr"}, {"id": "self_archiving", "compliant": True, "match_routes": + {"must": ["self_archiving"], "not": ["fully_oa"]}, + "match_qualifications": {"not": ["self_archiving.rights_retention_author_advice"]}, + "preferred": False, "modal": "sa"}, {"id": "ta", "compliant": True, "match_routes": + {"must": ["ta"]}, "match_qualifications": {"not": ["ta.corresponding_authors"]}, + "preferred": False}, {"id": "ta_aq", "compliant": True, "match_routes": {"must": ["ta"]}, + "match_qualifications": {"must": ["ta.corresponding_authors"]}, "preferred": False}, + {"id": "tj", "compliant": True, "match_routes": {"must": ["tj"]}, "preferred": False, + "modal": "tj"}, {"id": "journal_non_compliant", "compliant": False, "match_routes": + {"not": ["self_archiving", "fully_oa", "ta", "tj"]}, "preferred": False}, + {"id": "funder_non_compliant", "compliant": False, "match_routes": + {"not": ["self_archiving", "fully_oa", "ta", "tj"]}, "preferred": False}, + {"id": "institution_non_compliant", "compliant": False, "match_routes": {"not": + ["self_archiving", "fully_oa", "ta", "tj"]}, "preferred": False}, + {"id": "rights_retention_non_compliant", "compliant": False, "match_routes": + {"not": ["self_archiving", "fully_oa", "ta", "tj"]}, "preferred": False}], + "id": "testid", "name": "Test Funder", "abbr": "SNSF", "plan_s": "2022-11-01T00:00:00Z", + "apc": {"tj": False}} + + for key, value in data.items(): + if value: + FUNDER_ROW["routes"][route][key] = value + funder_id, name = generate_funder_id_name() + FUNDER_ROW["id"] = funder_id + FUNDER_ROW["name"] = name + else: + return None + + return FUNDER_ROW + + +def write_json_files(route, data, written_files, output_dir): """ Write the organized data to JSON files, each JSON object in a single line. """ if not os.path.exists(output_dir): os.makedirs(output_dir) # Open jac.json file for writing if journal data is present and prepare it for writing journal_filename = "journal" + funder_filename = "funder_config" jac_filename = "jac" # Check if there's data for journal (case-insensitive) to process journal_data_present = any(filename.lower() == journal_filename for filename in data.keys()) @@ -88,6 +156,8 @@ def write_json_files(data, written_files, output_dir): jac_file_path = os.path.join(output_dir, f"{jac_filename}.json") mode = 'a' if jac_filename in written_files else 'w' jac_file = open(jac_file_path, mode, encoding='utf-8') + if mode == 'w': + written_files.add(jac_filename) for filename, contents in data.items(): file_path = os.path.join(output_dir, f"{filename}.json") @@ -97,47 +167,78 @@ def write_json_files(data, written_files, output_dir): if filename.lower() == journal_filename: issn = generate_issn() content.update({"issn": [issn], "createdAt": generate_created_at()}) - # Directly write the corresponding jac entry + # write the corresponding jac entry jac_entry = generate_jac_entry(issn) json.dump(jac_entry, jac_file) jac_file.write('\n') - json.dump(content, json_file) - json_file.write('\n') + if filename.lower() == funder_filename: + funder_data = get_funder_data(route, content) + # write data to funder config the funder data available + if funder_data: + json.dump(funder_data, json_file) + json_file.write('\n') + else: + json.dump(content, json_file) + json_file.write('\n') written_files.add(filename) if journal_filename in data: jac_file.close() written_files.add(jac_filename) -def csv_to_json(csv_file_paths, output_dir): - written_files = set() - for csv_file_path in csv_file_paths: - data = defaultdict(list) - with open(csv_file_path, newline='', encoding='utf-8') as csvfile: - reader = csv.DictReader(csvfile) - headers = reader.fieldnames +def csv_to_json(route, csv_file_path, output_dir, written_files): + + data = defaultdict(list) + + with open(csv_file_path, newline='', encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile) + headers = reader.fieldnames + + for row in reader: + json_objects = defaultdict(dict) + for header in headers: + if '.' in header: + filename, json_key = parse_header(header) + if json_key: + nested_keys = json_key.split('.') + if row[header]: - for row in reader: - json_objects = defaultdict(dict) - for header in headers: - if '.' in header: - filename, json_key = parse_header(header) - if json_key: - nested_keys = json_key.split('.') - set_value(json_objects[filename], nested_keys, row[header]) + try: + set_value(json_objects[filename], nested_keys, ast.literal_eval(row[header])) + except: + print("route : " + route + " header : " + header + " value : " + row[header]) + set_value(json_objects[filename], nested_keys, row[header]) - # Add the created JSON objects to the data - for filename, json_obj in json_objects.items(): - data[filename].append(json_obj) + # Add the created JSON objects to the data + for filename, json_obj in json_objects.items(): + data[filename].append(json_obj) + + write_json_files(route, data, written_files, output_dir) - write_json_files(data, written_files, output_dir) if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: python script.py ...") sys.exit(1) - csv_file_paths = sys.argv[1:] - output_dir = os.path.join("..", "test_database") # Output directory one level up - csv_to_json(csv_file_paths, output_dir) + parser = argparse.ArgumentParser(description="Generate JSON files from specified CSV inputs.") + parser.add_argument("--self_archiving", help="Path to the self_archiving CSV file", type=str) + parser.add_argument("--fully_oa", help="Path to the fully_oa CSV file", type=str) + parser.add_argument("--hybrid", help="Path to the hybrid CSV file", type=str) + parser.add_argument("--ta", help="Path to the ta CSV file", type=str) + parser.add_argument("--tj", help="Path to the tj CSV file", type=str) + + args = parser.parse_args() + + written_files = set() + output_dir = settings.TEST_DATABASE # Output directory + funderdb_funders_dir = os.path.join("..", "..", "funderdb", "funders") + + # Handle each specified CSV file + file_paths = [{"self_archiving": args.self_archiving}, {"fully_oa": args.fully_oa}, + {"hybrid": args.hybrid}, {"ta": args.ta}, {"tj": args.tj}] # Add other files as needed + for file_path in file_paths: + for key, value in file_path.items(): + if value: + csv_to_json(key, value, output_dir, written_files) diff --git a/funderdb/funders.py b/funderdb/funders.py index b15301f..4e533e6 100644 --- a/funderdb/funders.py +++ b/funderdb/funders.py @@ -1,10 +1,58 @@ +import argparse import os import shutil import yaml import json -def list_funders(path, out_dir): +def rel2abs(file, *args): + file = os.path.realpath(file) + if os.path.isfile(file): + file = os.path.dirname(file) + return os.path.abspath(os.path.join(file, *args)) + + +def append_test_funders_to_list(funders): + """ + Reads a test JSON file and append contents to the funders. + + Args: + funders: funders list. + """ + + funder_config_file = rel2abs(__file__, "..", "Importer", "test_database", "funder_config.json") + + print(f"Test config file : {funder_config_file}") + + # Check if the file exists + if not os.path.exists(funder_config_file): + print(f"Error: File '{funder_config_file}' does not exist.") + return + + with open(funder_config_file, 'r') as f: + for line in f: + funder_cfg = json.loads(line.strip()) + + if funder_cfg: + funders.append({ + "id": funder_cfg.get("id"), + "name": funder_cfg.get("name"), + "abbr": funder_cfg.get("abbr"), + "country": funder_cfg.get("country"), + "primary": True + }) + + for aka in funder_cfg.get("aka", []): + funders.append({ + "id": funder_cfg.get("id"), + "name": aka.get("name"), + "abbr": aka.get("abbr"), + "country": funder_cfg.get("country"), + "primary": False + }) + + +def list_funders(path, out_dir, append_test_data:bool): if os.path.exists(out_dir): shutil.rmtree(out_dir) os.makedirs(out_dir) @@ -41,6 +89,8 @@ def list_funders(path, out_dir): # write a js version which can be imported in a script tag with open(os.path.join(out_dir, "funders.js"), "w") as o: + if append_test_data: + append_test_funders_to_list(funders) o.write("jct.funderlist=" + json.dumps(funders)) # write a markdown fragment which can be used in the API documentation @@ -51,4 +101,11 @@ def list_funders(path, out_dir): if __name__ == "__main__": - list_funders("funders", "autocomplete") \ No newline at end of file + parser = argparse.ArgumentParser(description="Generate funders data.") + parser.add_argument("-t", "--test_data", action="store_true", default=False, help="Appends test data") + args = parser.parse_args() + + if args.test_data: + print("Test mode - Appends test data") + + list_funders("funders", "autocomplete", args.test_data) \ No newline at end of file diff --git a/ui/funderdb.git b/ui/funderdb.git index ab8fdbd..4d6c1ad 100644 --- a/ui/funderdb.git +++ b/ui/funderdb.git @@ -1 +1 @@ -249d09e \ No newline at end of file +92d3fd9 \ No newline at end of file diff --git a/ui/funderdb.sh b/ui/funderdb.sh index 9362375..ef50856 100755 --- a/ui/funderdb.sh +++ b/ui/funderdb.sh @@ -3,7 +3,11 @@ FUNDERDB=../funderdb FDB2HERE=../ui -(cd $FUNDERDB || exit; python3 funders.py) +# This script takes and argument '-t'. When argument '-t' specified, funders data will be appended with test data +# sh funders.sh -t to append test funders data (usually for development and testing) +# sh funders.sh without testdata (for production) + +(cd $FUNDERDB || exit; python3 funders.py $1) (cd $FUNDERDB || exit; git log --pretty=format:'%h' -n 1 > $FDB2HERE/funderdb.git) cp $FUNDERDB/autocomplete/funders.js static/js From 51cd6eba0bfd39bfae917ea57a125dec6c7567b8 Mon Sep 17 00:00:00 2001 From: Ramakrishna Sakhamuru Date: Fri, 17 May 2024 15:28:57 +0530 Subject: [PATCH 5/8] Updated code to write test records --- Importer/jctdata/cli.py | 2 + Importer/jctdata/lib/loader.py | 2 +- Importer/scripts/generate_test_data.py | 100 +++++++++++++++++++++---- 3 files changed, 88 insertions(+), 16 deletions(-) diff --git a/Importer/jctdata/cli.py b/Importer/jctdata/cli.py index cd622ee..e1f6510 100644 --- a/Importer/jctdata/cli.py +++ b/Importer/jctdata/cli.py @@ -15,6 +15,8 @@ @click.option("-f", "--force-resolve", is_flag=True) @click.option("-t", "--test-database", is_flag=True) def entry_point(mode, targets, stage=None, full_pipeline=True, force_resolve=False, test_database=False): + if test_database: + full_pipeline = False run(mode, targets, stage, full_pipeline, force_resolve, test_database) diff --git a/Importer/jctdata/lib/loader.py b/Importer/jctdata/lib/loader.py index e5b2479..b46d1af 100644 --- a/Importer/jctdata/lib/loader.py +++ b/Importer/jctdata/lib/loader.py @@ -93,7 +93,7 @@ def update_with_test_data(input_file, output_file): # Read each line from the input file for line in in_file: # Write the line to the output file - out_file.write('\n' + line.strip()) + out_file.write(line.strip()+"\n") return output_file diff --git a/Importer/scripts/generate_test_data.py b/Importer/scripts/generate_test_data.py index 306a928..e91a6cb 100644 --- a/Importer/scripts/generate_test_data.py +++ b/Importer/scripts/generate_test_data.py @@ -17,6 +17,18 @@ import random from jctdata import settings +journal_filename = "journal" +funder_filename = "funder_config" +jac_filename = "jac" +iac_filename = "iac" +institution_filename = "institution" +testrecords = defaultdict(list) + +counter = 0 +institution_counter = 0 +funder_counter = 0 +countries = ["United States", "Canada", "United Kingdom", "France", "Germany", "Japan", "Australia"] + def calculate_issn_check_digit(issn): """Calculate the ISSN check digit for the first seven ISSN digits.""" @@ -35,6 +47,11 @@ def generate_issn(): return f"{first_seven_digits[:4]}-{first_seven_digits[4:]}{check_digit}" +def generate_ror_id(): + ror_id = "".join(random.choice('0123456789abcdef') for _ in range(9)) + return ror_id + + def generate_created_at(): """Generate a dynamic creation datetime string in ISO 8601 format.""" now = datetime.utcnow() - timedelta(days=random.randint(0, 365), hours=random.randint(0, 23), @@ -48,19 +65,24 @@ def parse_header(header): return parts[0], '.'.join(parts[1:]) if len(parts) > 1 else None -counter = 0 -funder_counter = 0 - - def generate_random_title_and_publisher(): """Generate a random title and publisher from predefined lists.""" global counter counter = counter + 1 title = "Test title " + str(counter) - publisher = "Test publisher " +str(counter) + publisher = "Test publisher " + str(counter) return title, publisher +def generate_random_title_and_country(): + """Generate a random title and country from predefined lists for institution.""" + global institution_counter + institution_counter = institution_counter + 1 + title = "Test University " + str(institution_counter) + country = random.choice(countries) + return title, country + + def generate_funder_id_name(): """Generate a random id and name for funder.""" global funder_counter @@ -85,6 +107,20 @@ def generate_jac_entry(issn): } +def generate_iac_entry(ror): + """Generate an entry for jac.json with test data.""" + title, country = generate_random_title_and_country() + return { + "ror": ror, + "title": title, + "country": country, + "index": { + "title": [title.lower()], + "ror": ror + } + } + + def set_value(dct, keys, value): """ Set a value in a nested dictionary based on a list of keys. """ for key in keys[:-1]: @@ -132,9 +168,9 @@ def get_funder_data(route, data): for key, value in data.items(): if value: FUNDER_ROW["routes"][route][key] = value - funder_id, name = generate_funder_id_name() - FUNDER_ROW["id"] = funder_id - FUNDER_ROW["name"] = name + # funder_id, name = generate_funder_id_name() + FUNDER_ROW["id"] = data["id"] + FUNDER_ROW["name"] = data["name"] else: return None @@ -146,29 +182,35 @@ def write_json_files(route, data, written_files, output_dir): if not os.path.exists(output_dir): os.makedirs(output_dir) - # Open jac.json file for writing if journal data is present and prepare it for writing - journal_filename = "journal" - funder_filename = "funder_config" - jac_filename = "jac" # Check if there's data for journal (case-insensitive) to process journal_data_present = any(filename.lower() == journal_filename for filename in data.keys()) if journal_data_present: + # Open jac.json file for writing if journal data is present and prepare it for writing jac_file_path = os.path.join(output_dir, f"{jac_filename}.json") mode = 'a' if jac_filename in written_files else 'w' jac_file = open(jac_file_path, mode, encoding='utf-8') if mode == 'w': written_files.add(jac_filename) + # Check if there's data for institution (case-insensitive) to process + institution_data_present = any(filename.lower() == institution_filename for filename in data.keys()) + if institution_data_present: + # Open jac.json file for writing if journal data is present and prepare it for writing + iac_file_path = os.path.join(output_dir, f"{iac_filename}.json") + mode = 'a' if iac_filename in written_files else 'w' + iac_file = open(iac_file_path, mode, encoding='utf-8') + if mode == 'w': + written_files.add(iac_filename) + for filename, contents in data.items(): file_path = os.path.join(output_dir, f"{filename}.json") mode = 'a' if filename in written_files else 'w' with open(file_path, mode, encoding='utf-8') as json_file: for content in contents: if filename.lower() == journal_filename: - issn = generate_issn() - content.update({"issn": [issn], "createdAt": generate_created_at()}) + content.update({"createdAt": generate_created_at()}) # write the corresponding jac entry - jac_entry = generate_jac_entry(issn) + jac_entry = generate_jac_entry(content["issn"]) json.dump(jac_entry, jac_file) jac_file.write('\n') if filename.lower() == funder_filename: @@ -177,6 +219,11 @@ def write_json_files(route, data, written_files, output_dir): if funder_data: json.dump(funder_data, json_file) json_file.write('\n') + if filename.lower() == institution_filename: + # write the corresponding jac entry + iac_entry = generate_iac_entry(content["ror"]) + json.dump(iac_entry, iac_file) + iac_file.write('\n') else: json.dump(content, json_file) json_file.write('\n') @@ -196,6 +243,10 @@ def csv_to_json(route, csv_file_path, output_dir, written_files): headers = reader.fieldnames for row in reader: + journal_id = "" + funder_id = "" + institution_id = "" + json_objects = defaultdict(dict) for header in headers: if '.' in header: @@ -209,14 +260,32 @@ def csv_to_json(route, csv_file_path, output_dir, written_files): except: print("route : " + route + " header : " + header + " value : " + row[header]) set_value(json_objects[filename], nested_keys, row[header]) + if filename.lower() == journal_filename: + journal_id = generate_issn() + set_value(json_objects[filename], ["issn"], journal_id) + if filename.lower() == institution_filename: + institution_id = generate_ror_id() + set_value(json_objects[filename], ["ror"], institution_id) + if filename.lower() == funder_filename: + funder_id, name = generate_funder_id_name() + set_value(json_objects[filename], ["id"], funder_id) + set_value(json_objects[filename], ["name"], name) # Add the created JSON objects to the data for filename, json_obj in json_objects.items(): data[filename].append(json_obj) + testrecords[row["Test"]] = dict(journal=journal_id, institution=institution_id, funder=funder_id) + write_json_files(route, data, written_files, output_dir) +def write_test_records(): + file_path = os.path.join(output_dir, "test_records.json") + with open(file_path, "w", encoding='utf-8') as records_file: + json.dump(testrecords, records_file) + + if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: python script.py ...") @@ -242,3 +311,4 @@ def csv_to_json(route, csv_file_path, output_dir, written_files): for key, value in file_path.items(): if value: csv_to_json(key, value, output_dir, written_files) + write_test_records() From 8db7226e84f02744d2cc933498fbcfa4e1991bbb Mon Sep 17 00:00:00 2001 From: Ramakrishna Sakhamuru Date: Mon, 12 Aug 2024 12:04:53 +0530 Subject: [PATCH 6/8] Fixed few issues generating test data --- Importer/scripts/generate_test_data.py | 97 +++++++++++++++++++------- 1 file changed, 72 insertions(+), 25 deletions(-) diff --git a/Importer/scripts/generate_test_data.py b/Importer/scripts/generate_test_data.py index e91a6cb..4f43d78 100644 --- a/Importer/scripts/generate_test_data.py +++ b/Importer/scripts/generate_test_data.py @@ -16,9 +16,11 @@ from datetime import datetime, timedelta import random from jctdata import settings +import constants journal_filename = "journal" funder_filename = "funder_config" +funder_lang_filename = "funder_language" jac_filename = "jac" iac_filename = "iac" institution_filename = "institution" @@ -62,7 +64,7 @@ def generate_created_at(): def parse_header(header): """ Parse the header to get the filename and the JSON key. """ parts = header.split('.') - return parts[0], '.'.join(parts[1:]) if len(parts) > 1 else None + return parts[0].lower(), '.'.join(parts[1:]) if len(parts) > 1 else None def generate_random_title_and_publisher(): @@ -169,14 +171,29 @@ def get_funder_data(route, data): if value: FUNDER_ROW["routes"][route][key] = value # funder_id, name = generate_funder_id_name() - FUNDER_ROW["id"] = data["id"] - FUNDER_ROW["name"] = data["name"] - else: - return None + elif key.lower() == 'rights_retention': + FUNDER_ROW['routes']['self_archiving']['rights_retention'] = '2050-01-17T00:00:00Z' + + FUNDER_ROW["id"] = data["id"] + FUNDER_ROW["name"] = data["name"] return FUNDER_ROW +def get_lang_data(lang, data): + lang = lang.lower() + lang_data = None + if lang == "en": + lang_data = constants.FUNDER_LANG_EN + elif lang == "fr": + lang_data = constants.FUNDER_LANG_FR + + if lang_data: + lang_data["id"] = data["id"] + "__" + lang + + return lang_data + + def write_json_files(route, data, written_files, output_dir): """ Write the organized data to JSON files, each JSON object in a single line. """ if not os.path.exists(output_dir): @@ -202,6 +219,16 @@ def write_json_files(route, data, written_files, output_dir): if mode == 'w': written_files.add(iac_filename) + # Check if there's data for institution (case-insensitive) to process + funder_config_present = any(filename.lower() == funder_filename for filename in data.keys()) + if funder_config_present: + # Open funder_language.json file for writing if journal data is present and prepare it for writing + funder_lang_file_path = os.path.join(output_dir, f"{funder_lang_filename}.json") + mode = 'a' if funder_lang_file_path in written_files else 'w' + funder_lang_file = open(funder_lang_file_path, mode, encoding='utf-8') + if mode == 'w': + written_files.add(funder_lang_filename) + for filename, contents in data.items(): file_path = os.path.join(output_dir, f"{filename}.json") mode = 'a' if filename in written_files else 'w' @@ -213,13 +240,24 @@ def write_json_files(route, data, written_files, output_dir): jac_entry = generate_jac_entry(content["issn"]) json.dump(jac_entry, jac_file) jac_file.write('\n') - if filename.lower() == funder_filename: + elif filename.lower() == funder_filename: funder_data = get_funder_data(route, content) # write data to funder config the funder data available if funder_data: json.dump(funder_data, json_file) json_file.write('\n') - if filename.lower() == institution_filename: + + # Write funder language data + funder_lang_data_en = get_lang_data("en", content) + funder_lang_data_fr = get_lang_data("fr", content) + if funder_lang_data_en: + json.dump(funder_lang_data_en, funder_lang_file) + funder_lang_file.write('\n') + if funder_lang_data_fr: + json.dump(funder_lang_data_fr, funder_lang_file) + funder_lang_file.write('\n') + + elif filename.lower() == institution_filename: # write the corresponding jac entry iac_entry = generate_iac_entry(content["ror"]) json.dump(iac_entry, iac_file) @@ -234,6 +272,15 @@ def write_json_files(route, data, written_files, output_dir): written_files.add(jac_filename) +def parse_value(value: str): + if value.lower() == "true": + return True + elif value.lower() == "false": + return False + else: + return ast.literal_eval(value) + + def csv_to_json(route, csv_file_path, output_dir, written_files): data = defaultdict(list) @@ -253,23 +300,23 @@ def csv_to_json(route, csv_file_path, output_dir, written_files): filename, json_key = parse_header(header) if json_key: nested_keys = json_key.split('.') - if row[header]: - - try: - set_value(json_objects[filename], nested_keys, ast.literal_eval(row[header])) - except: - print("route : " + route + " header : " + header + " value : " + row[header]) - set_value(json_objects[filename], nested_keys, row[header]) - if filename.lower() == journal_filename: - journal_id = generate_issn() - set_value(json_objects[filename], ["issn"], journal_id) - if filename.lower() == institution_filename: - institution_id = generate_ror_id() - set_value(json_objects[filename], ["ror"], institution_id) - if filename.lower() == funder_filename: - funder_id, name = generate_funder_id_name() - set_value(json_objects[filename], ["id"], funder_id) - set_value(json_objects[filename], ["name"], name) + + try: + set_value(json_objects[filename], nested_keys, parse_value(row[header])) + except: + print("route : " + route + " header : " + header + " value : " + row[header]) + set_value(json_objects[filename], nested_keys, row[header]) + + if journal_filename in json_objects: + journal_id = generate_issn() + set_value(json_objects[journal_filename], ["issn"], journal_id) + if institution_filename in json_objects: + institution_id = generate_ror_id() + set_value(json_objects[institution_filename], ["ror"], institution_id) + if funder_filename in json_objects: + funder_id, name = generate_funder_id_name() + set_value(json_objects[funder_filename], ["id"], funder_id) + set_value(json_objects[funder_filename], ["name"], name) # Add the created JSON objects to the data for filename, json_obj in json_objects.items(): @@ -283,7 +330,7 @@ def csv_to_json(route, csv_file_path, output_dir, written_files): def write_test_records(): file_path = os.path.join(output_dir, "test_records.json") with open(file_path, "w", encoding='utf-8') as records_file: - json.dump(testrecords, records_file) + json.dump(testrecords, records_file, indent=2) if __name__ == "__main__": From debf0cdd4477ef1db5936aac809e9cbaf69e639b Mon Sep 17 00:00:00 2001 From: Ramakrishna Sakhamuru Date: Thu, 29 Aug 2024 14:06:45 +0530 Subject: [PATCH 7/8] corrected writing the json file for journal --- Importer/scripts/generate_test_data.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Importer/scripts/generate_test_data.py b/Importer/scripts/generate_test_data.py index 4f43d78..d734bcf 100644 --- a/Importer/scripts/generate_test_data.py +++ b/Importer/scripts/generate_test_data.py @@ -240,6 +240,10 @@ def write_json_files(route, data, written_files, output_dir): jac_entry = generate_jac_entry(content["issn"]) json.dump(jac_entry, jac_file) jac_file.write('\n') + + json.dump(content, json_file) + json_file.write('\n') + elif filename.lower() == funder_filename: funder_data = get_funder_data(route, content) # write data to funder config the funder data available @@ -262,6 +266,9 @@ def write_json_files(route, data, written_files, output_dir): iac_entry = generate_iac_entry(content["ror"]) json.dump(iac_entry, iac_file) iac_file.write('\n') + + json.dump(content, json_file) + json_file.write('\n') else: json.dump(content, json_file) json_file.write('\n') From 7eacf21216e3fde5747cf65561bb59ab80b186c1 Mon Sep 17 00:00:00 2001 From: Ramakrishna Sakhamuru Date: Mon, 30 Sep 2024 16:30:29 +0530 Subject: [PATCH 8/8] Added OA permissions --- Importer/scripts/generate_test_data.py | 15 +++++++++++++++ api/server/service/jct/api.coffee | 14 ++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/Importer/scripts/generate_test_data.py b/Importer/scripts/generate_test_data.py index d734bcf..3fcbc5b 100644 --- a/Importer/scripts/generate_test_data.py +++ b/Importer/scripts/generate_test_data.py @@ -14,6 +14,7 @@ import sys from collections import defaultdict from datetime import datetime, timedelta +from copy import deepcopy import random from jctdata import settings import constants @@ -241,6 +242,7 @@ def write_json_files(route, data, written_files, output_dir): json.dump(jac_entry, jac_file) jac_file.write('\n') + add_oa_works_permissions(content) json.dump(content, json_file) json_file.write('\n') @@ -279,6 +281,19 @@ def write_json_files(route, data, written_files, output_dir): written_files.add(jac_filename) +def add_oa_works_permissions(journal_json): + if "all_permissions" in journal_json and journal_json["all_permissions"]: + perms = deepcopy(constants.OA_WORKS_PERMISSIONS) + if "oabcompliant" in journal_json and not journal_json["oabcompliant"]: + perms["best_permission"]["licences"] = [] + perms["best_permission"]["licence"] = "" + for p in perms["all_permissions"]: + p["licences"] = [] + p["licence"] = "" + + journal_json["oa_works_permissions"] = perms + + def parse_value(value: str): if value.lower() == "true": return True diff --git a/api/server/service/jct/api.coffee b/api/server/service/jct/api.coffee index 5af7554..216776d 100644 --- a/api/server/service/jct/api.coffee +++ b/api/server/service/jct/api.coffee @@ -490,9 +490,19 @@ API.service.jct.calculate = (params={}, refresh) -> hascompliant = false allcached = true _results = [] + oa_permissions = undefined + + if journal + issn = journal.split(',') if typeof journal is 'string' + issn_qr = 'issn.exact:"' + issn.join('" OR issn.exact:"') + '"' + journal_rec = jct_journal.find issn_qr + if journal_rec and journal_rec.oa_works_permissions + oa_permissions = journal_rec.oa_works_permissions + + if oa_permissions is undefined + # get data from oa.works + oa_permissions = API.service.jct.oa_works (issnsets[journal] ? journal), (if institution? then institution else undefined) - # get data from oa.works - oa_permissions = API.service.jct.oa_works (issnsets[journal] ? journal), (if institution? then institution else undefined) # get funder config funder_config = API.service.jct.funder_config funder, undefined