Skip to content

Commit

Permalink
norm duplicated cits
Browse files Browse the repository at this point in the history
  • Loading branch information
ivanhb committed Dec 22, 2023
1 parent 964af2e commit 7fba15c
Show file tree
Hide file tree
Showing 2 changed files with 186 additions and 0 deletions.
185 changes: 185 additions & 0 deletions scripts/norm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
#!python
# Copyright (c) 2022 The OpenCitations Index Authors.
#
# Permission to use, copy, modify, and/or distribute this software for any purpose
# with or without fee is hereby granted, provided that the above copyright notice
# and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
# SOFTWARE.

import multiprocessing
import os
import time
import csv
import redis
from zipfile import ZipFile
import json
import io
import re

from tqdm import tqdm
from argparse import ArgumentParser
from urllib.parse import quote
from datetime import datetime, timezone
from collections import defaultdict

from oc.index.parsing.base import CitationParser
from oc.index.utils.logging import get_logger
from oc.index.utils.config import get_config
from oc.index.finder.base import ResourceFinderHandler
from oc.index.finder.base import OMIDResourceFinder
from oc.index.finder.orcid import ORCIDResourceFinder
from oc.index.finder.crossref import CrossrefResourceFinder
from oc.index.finder.datacite import DataCiteResourceFinder
from oc.index.oci.citation import Citation, OCIManager
from oc.index.oci.storer import CitationStorer
from oc.index.glob.redis import RedisDataSource
from oc.index.glob.csv import CSVDataSource

# Shared configuration object, loaded once when the module is imported and
# read by normalize_dump() below.
_config = get_config()

def wr_with_buffer(data, f_out, buffer, force = False):
    """Append ``data`` to a CSV file once the buffer threshold is reached.

    Each element of ``data`` is written as its own single-column row.

    Args:
        data: the buffered values to write.
        f_out: path of the CSV file to append to (created if missing).
        buffer: minimum number of buffered values that triggers a write.
        force: when True, write regardless of how many values are buffered.

    Returns:
        The number of values written, or 0 when nothing was written because
        the threshold was not reached. The caller is responsible for clearing
        ``data`` after a successful write.
    """
    if len(data) < buffer and not force:
        return 0

    # newline="" is required by the csv module: without it, platforms that
    # translate "\n" would turn the writer's "\r\n" terminator into "\r\r\n".
    with open(f_out, 'a', newline='') as f:
        csv.writer(f).writerows([e] for e in data)
    return len(data)

def get_from_redis(data, redis_db, buffer, force = False):
    """Return the keys of ``data`` that have no value in ``redis_db``.

    The lookup is only performed once the buffer threshold is reached (or
    when ``force`` is True); otherwise an empty list is returned and the
    caller is expected to keep buffering.

    Args:
        data: the buffered keys to look up.
        redis_db: an object exposing ``mget(keys)``, returning one value per
            key with ``None`` for missing keys (e.g. a ``redis.Redis`` client).
        buffer: minimum number of buffered keys that triggers a lookup.
        force: when True, look up regardless of how many keys are buffered.

    Returns:
        The list of keys, in their original order, for which ``mget``
        returned ``None``.
    """
    # An empty key list must never reach MGET: the command requires at least
    # one key and the server rejects the call otherwise.
    if not data:
        return []
    if len(data) < buffer and not force:
        return []

    return [k for k, val in zip(data, redis_db.mget(data)) if val is None]


def _load_omid_mapper(mapping_file):
    """Read the duplicates CSV into a ``{duplicated_omid: correct_omid}`` dict.

    The first row is skipped as a header. In every other row the first column
    holds the URL of the OMID to keep and the second column holds the URLs of
    its duplicates separated by "; ". Rows or entries that do not carry the
    ``https://w3id.org/oc/meta/br/`` prefix are ignored.
    """
    prefix = "https://w3id.org/oc/meta/br/"
    omid_mapper = dict()
    with open(mapping_file, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header; tolerate an empty file
        for row in reader:
            if len(row) < 2 or prefix not in row[0]:
                continue
            correct_omid = row[0].split(prefix)[1]
            for duplicated_omid in row[1].split("; "):
                if prefix in duplicated_omid:
                    omid_mapper[duplicated_omid.split(prefix)[1]] = correct_omid
    return omid_mapper


def _flush_citations(cits, f_out, buffer, logger, label, force = False):
    """Write ``cits`` via wr_with_buffer and return the list to keep buffering.

    Returns a fresh empty list when the buffer was written (after logging the
    number of rows followed by ``label``), otherwise ``cits`` itself.
    """
    if not cits:
        return cits
    written = wr_with_buffer(cits, f_out, buffer, force)
    if written > 0:
        logger.info("> " + str(written) + label)
        return []
    return cits


def normalize_dump(input_dir, output_dir, mapping_file):
    """Rewrite the OCIs of a TTL dump so that duplicated OMIDs are merged.

    Every ``.ttl`` file in ``input_dir`` is scanned for OCIs of the form
    ``https://w3id.org/oc/index/ci/<citing>-<cited>``. When either OMID is
    listed as a duplicate in ``mapping_file`` the OCI is rewritten with the
    correct OMIDs, and:

    * the original OCI is appended to ``invalidated_cits.csv``;
    * the rewritten OCI is appended to ``new_cits.csv`` if it has no value in
      the Redis database configured as ``cnc``/``db_cits``.

    Args:
        input_dir: directory containing the ``.ttl`` files to process.
        output_dir: directory where the two CSV files are appended to.
        mapping_file: CSV file mapping each correct OMID to its duplicates.
    """
    logger = get_logger()

    redis_cits = redis.Redis(
        host="127.0.0.1",
        port=6379,
        db=_config.get("cnc", "db_cits")
    )

    WR_BUFFER = 10000
    BUFFER_REDIS = 10000
    invalidated_label = " citations have been invalidated!"
    new_label = " new citations!"
    invalidated_path = os.path.join(output_dir, 'invalidated_cits.csv')
    new_path = os.path.join(output_dir, 'new_cits.csv')

    # Compiled once: the pattern is applied to every line of every file.
    oci_pattern = re.compile(r"https://w3id.org/oc/index/ci/(\d{1,}-\d{1,})")

    omid_mapper = _load_omid_mapper(mapping_file)

    invalidated_cits = []  # original OCIs that have been rewritten
    valid_cits = []        # rewritten OCIs still to be checked against Redis
    new_cits = []          # rewritten OCIs that Redis does not know yet

    files_to_process = [file for file in os.listdir(input_dir) if file.endswith('.ttl')]
    for file in files_to_process:
        with open(os.path.join(input_dir, file), 'r') as ttl_file:
            # Stream the file line by line instead of loading it in memory.
            for line in ttl_file:
                oci = oci_pattern.search(line)
                if oci is None:
                    continue
                oci = oci.group(1)
                citing_omid, cited_omid = oci.split("-")

                new_oci = (
                    omid_mapper.get(citing_omid, citing_omid)
                    + "-"
                    + omid_mapper.get(cited_omid, cited_omid)
                )

                # only citations touched by the mapping need any action
                if new_oci == oci:
                    continue
                invalidated_cits.append(oci)
                valid_cits.append(new_oci)

                invalidated_cits = _flush_citations(
                    invalidated_cits, invalidated_path, WR_BUFFER, logger, invalidated_label
                )

                # Each rewritten OCI is sent to Redis exactly once: the batch
                # is dropped right after the lookup and only the misses are
                # kept in their own write buffer.
                if len(valid_cits) >= BUFFER_REDIS:
                    new_cits.extend(get_from_redis(valid_cits, redis_cits, BUFFER_REDIS))
                    valid_cits = []
                    new_cits = _flush_citations(
                        new_cits, new_path, WR_BUFFER, logger, new_label
                    )

    # Final flush, performed unconditionally after all files so that leftovers
    # are written even when the last file contains no rewritten citation.
    _flush_citations(
        invalidated_cits, invalidated_path, WR_BUFFER, logger, invalidated_label, True
    )
    if valid_cits:
        new_cits.extend(get_from_redis(valid_cits, redis_cits, BUFFER_REDIS, True))
    _flush_citations(new_cits, new_path, WR_BUFFER, logger, new_label, True)


def main():
    """Command-line entry point: parse the arguments and run normalize_dump.

    Options:
        -i / --input:  directory containing the TTL files (required).
        -m / --map:    CSV file mapping correct OMIDs to duplicates (required).
        -o / --output: destination directory, created if missing
                       (default: "out").
    """
    logger = get_logger()

    arg_parser = ArgumentParser(description="Normalize the data of OpenCitations Index")
    arg_parser.add_argument(
        "-i",
        "--input",
        required=True,
        help="The input directory containing the original files in TTL",
    )
    arg_parser.add_argument(
        "-m",
        "--map",
        required=True,
        help="CSV file to map duplicates",
    )
    arg_parser.add_argument(
        "-o",
        "--output",
        default="out",
        help="The destination directory to save outputs",
    )

    args = arg_parser.parse_args()

    # input directory: joining with "" appends a trailing separator only when
    # one is missing, and does not fail on an empty string
    input_dir = os.path.join(args.input, "")

    # output directory: exist_ok avoids the check-then-create race
    output_dir = os.path.join(args.output, "")
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # mapping file
    mapping_file = args.map

    # call the normalize_dump function
    normalize_dump(input_dir, output_dir, mapping_file)

    logger.info("Done !!")
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
exclude_package_data={"": ["./test_data"]},
entry_points={
"console_scripts": [
"oc.index.norm=oc.index.scripts.norm:main",
"oc.index.calc_stats=oc.index.scripts.calc_stats:main",
"oc.index.edit_rdf=oc.index.scripts.edit_rdf:main",
"oc.index.meta2redis=oc.index.scripts.meta2redis:main",
Expand Down

0 comments on commit 7fba15c

Please sign in to comment.