Skip to content

Commit

Permalink
citations and references
Browse files Browse the repository at this point in the history
  • Loading branch information
ivanhb committed Sep 12, 2023
1 parent 39d3837 commit 0e652a8
Show file tree
Hide file tree
Showing 2 changed files with 222 additions and 32 deletions.
70 changes: 69 additions & 1 deletion index_v1.hf
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ SELECT ?val (count(?oci) as ?citation_count) (GROUP_CONCAT(?citing; separator=";
#type operation
#doi str(10\..+)
#preprocess lower(doi) --> doi2omid(doi)
#postprocess citations_info(citing,cited)
#postprocess citations_info(oci,citing,cited)
#method get
#description This operation retrieves the citation data for all the references appearing in the reference lists of other citing works to the bibliographic entity identified by the input DOI, that constitute the incoming citations of that identified bibliographic entity.

Expand Down Expand Up @@ -583,3 +583,71 @@ WHERE {
BIND(<https://w3id.org/oc/meta/br/[[doi]]> as ?cited) .
}
LIMIT 50000

#url /references-uniindex/{doi}
#type operation
#doi str(10\..+)
#preprocess lower(doi) --> doi2omid(doi)
#postprocess citations_info(oci,citing,cited)
#method get
#description This operation retrieves the citation data for all the outgoing references to other cited works appearing in the reference list of the bibliographic entity identified by the input DOI.

The fields returned by this operation are:

* *oci*: the Open Citation Identifier (OCI) of the citation in consideration;
* *citing*: the DOI of the citing entity;
* *cited*: the DOI of the cited entity;
* *creation*: the creation date of the citation according to the [ISO date format](https://en.wikipedia.org/wiki/ISO_8601) `YYYY-MM-DD`, which corresponds to the publication date of the citing entity;
* *timespan*: the interval between the publication date of the cited entity and the publication date of the citing entity, expressed using the [XSD duration format](https://www.w3.org/TR/xmlschema11-2/#duration) `PnYnMnD`;
* *journal_sc*: it records whether the citation is a journal self-citation (i.e. the citing and the cited entities are published in the same journal);
* *author_sc*: it records whether the citation is an author self-citation (i.e. the citing and the cited entities have at least one author in common).

The values of all the fields are prefixed with `[index name] => `, so as to clearly identify from where the related data is coming, and can contain one or more pieces of information, separated by `; `. This is particularly useful when a citation is actually contained in two or more OpenCitations Indexes. In this case, only one row will be returned, and the prefix used in the various data allows one to understand the source Index of such data.
#call /references/10.1186/1756-8722-6-59
#output_json [
{
"oci": "02001010806360107050663080702026306630509-0200101080636102703040309",
"citing": "10.1186/1756-8722-6-59",
"cited": "10.1186/ar3439",
"creation": "2013",
"timespan": "P2Y",
"journal_sc": "no",
"author_sc": "no"
},
{
"oci": "02001010806360107050663080702026306630509-0200101080636102704000806",
"citing": "10.1186/1756-8722-6-59",
"cited": "10.1186/ar4086",
"creation": "2013",
"timespan": "P1Y",
"journal_sc": "no",
"author_sc": "no"
},
{
"oci": "02001010806360107050663080702026306630509-020010200003619122437020001023704023707090006",
"citing": "10.1186/1756-8722-6-59",
"cited": "10.1200/jco.2012.42.7906",
"creation": "2013",
"timespan": "P0Y",
"journal_sc": "no",
"author_sc": "no"
},
{
"oci": "02001010806360107050663080702026306630509-02003010009360008080300010805370200010237060604070907",
"citing": "10.1186/1756-8722-6-59",
"cited": "10.3109/08830185.2012.664797",
"creation": "2013",
"timespan": "P0Y",
"journal_sc": "no",
"author_sc": "no"
}
]
#field_type str(oci) str(citing) str(cited) datetime(creation) duration(timespan) str(?journal_sc) str(?author_sc)
#sparql PREFIX cito: <http://purl.org/spar/cito/>
SELECT ?oci ?citing ?cited
WHERE {
?oci cito:hasCitingEntity <https://w3id.org/oc/meta/br/[[doi]]> .
?oci cito:hasCitedEntity ?cited .
BIND(<https://w3id.org/oc/meta/br/[[doi]]> as ?citing) .
}
LIMIT 50000
184 changes: 153 additions & 31 deletions indexapi_v1.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
from rdflib import Graph, URIRef
from re import sub,findall
from json import loads
from datetime import datetime
from dateutil.relativedelta import relativedelta
from dateutil.parser import parse


def lower(s):
Expand Down Expand Up @@ -161,38 +164,138 @@ def metadata(res, *args):

# args must contain, in this order, the [[oci]], [[citing]] and [[cited]] column names
def citations_info(res, *args):

def ___get_omid_str(val):
return "omid:"+val.split("oc/meta/")[1]

header = res[0]
fields = [header.index(args[0]), header.index(args[1])]
oci_idx = header.index(args[0]);
citing_idx = header.index(args[1])
cited_idx = header.index(args[2])
index_meta = {}

additional_fields = ["creation", "timespan", "journal_sc","author_sc"]
header.extend(additional_fields)

all_entities = set()
#all_entities = ["omid:br/06101068294","omid:br/0610123167","omid:br/06101494166"]
res_entities = {}
all_entities = []
if len(res) > 1:
for row in res[1:]:
for f in fields:
entity = row[f][1].split("oc/meta/")[1][:-1]
all_entities.add(str(entity))
for idx, row in enumerate(res[1:]):
res_entities[idx] = []
res_entities[idx] += [___get_omid_str(row[citing_idx][1]), ___get_omid_str(row[cited_idx][1])]
all_entities += [___get_omid_str(row[citing_idx][1]), ___get_omid_str(row[cited_idx][1])]

# ["id", "author", "year", "pub_date", "title", "source_title", "volume", "issue", "page", "source_id"]
all_entities = {"br/06101068294"}
r = __ocmeta_parser(list(all_entities),"omid")
# delete the oci, citing and cited columns
res = [[elem for idx, elem in enumerate(row) if idx != oci_idx and idx != citing_idx and idx != cited_idx] for row in res]

additional_fields = ["oci", "citing", "cited", "creation", "timespan", "journal_sc","author_sc"]
header = res[0]
header.extend(additional_fields)

r = {}
# call __ocmeta_parser on batches of STEP entities at a time
STEP = 8
all_entities = list(set(all_entities))
for i in range(0, len(all_entities), STEP):
r.update(__ocmeta_parser(all_entities[i:i + STEP]))
# if i > 200:
# break
r.update(__ocmeta_parser(all_entities[i:]))

# process and elaborate additional fields
#creation = entities_data["citing"][1]
for row in res[1:]:
row.extend([
"__".join(r.keys()),
"",
"",
""
])

#row.extend(["","","",""])
if len(res) > 1:
for idx, row in enumerate(res[1:]):

citing_entity = res_entities[idx][0]
cited_entity = res_entities[idx][1]

oci_val = citing_entity.replace("omid:br/","") +"-"+ cited_entity.replace("omid:br/","")
if citing_entity in r and cited_entity in r:
row.extend([
# oci value
oci_val,
# citing
r[citing_entity]["id"].replace("doi:",""),
# cited
r[cited_entity]["id"].replace("doi:",""),
# creation = citing[pub_date]
r[citing_entity]["pub_date"],
# timespan = citing[pub_date] - cited[pub_date]
__cit_duration(r[citing_entity]["pub_date"], r[cited_entity]["pub_date"]),
# journal_sc = compare citing[source_id] and cited[source_id]
__cit_journal_sc(r[citing_entity]["all_source_ids"], r[cited_entity]["all_source_ids"]),
# author_sc = compare citing[authors_orcid] and cited[authors_orcid]
__cit_author_sc(r[citing_entity]["authors_orcid"], r[cited_entity]["authors_orcid"]),
])
else:
row.extend([oci_val,"","","","","",""])

return res, True

def __cit_journal_sc(citing_source_ids, cited_source_ids):
    """Tell whether a citation is a journal self-citation.

    The two arguments are the venue identifiers of the citing and of the
    cited entity. In this file they are produced by splitting the venue's
    bracketed id group on spaces, so the first token may still start with
    "[" and the last one may still end with "]". Those brackets are stripped
    before comparing; otherwise the same identifier would not match when the
    two venues list their ids in a different order.

    :param citing_source_ids: iterable of venue id strings of the citing
        entity (``None`` is treated as empty).
    :param cited_source_ids: iterable of venue id strings of the cited
        entity (``None`` is treated as empty).
    :return: ``"yes"`` if the two venues share at least one identifier,
        ``"no"`` otherwise.
    """

    def ___clean(source_ids):
        # normalise every token and drop the ones that end up empty
        cleaned = {str(s).strip("[] \t") for s in (source_ids or [])}
        cleaned.discard("")
        return cleaned

    if ___clean(citing_source_ids) & ___clean(cited_source_ids):
        return "yes"
    return "no"

def __cit_author_sc(citing_authors, cited_authors):
    """Tell whether a citation is an author self-citation.

    :param citing_authors: iterable of author identifiers of the citing
        entity (the caller in this file passes the collected ORCID list).
    :param cited_authors: iterable of author identifiers of the cited entity.
    :return: ``"yes"`` if at least one identifier appears in both iterables,
        ``"no"`` otherwise.
    """
    shared_authors = set(citing_authors) & set(cited_authors)
    return "yes" if shared_authors else "no"

def __cit_duration(citing_complete_pub_date, cited_complete_pub_date):
    """Return the timespan between two publication dates as a duration string.

    The result has the form ``[-]PnY[nM[nD]]``: the year component is always
    present, the month component only when both dates carry a month
    (length >= 7, i.e. ``YYYY-MM``), and the day component only when both
    dates carry a day (length >= 10, i.e. ``YYYY-MM-DD``). A leading ``-``
    marks a citing date that precedes the cited date at the considered
    precision.

    :param citing_complete_pub_date: publication date string of the citing
        entity (``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD``), or ``None``/``""``
        when unknown.
    :param cited_complete_pub_date: publication date string of the cited
        entity, same conventions.
    :return: the duration string, or ``""`` when either date is missing or
        has no year, since no meaningful timespan can be computed then.
    :raises ValueError: if a date is still unparsable after the day has been
        clamped to the 28th.
    """

    def ___contains_years(date):
        return date is not None and len(date) >= 4

    def ___contains_months(date):
        return date is not None and len(date) >= 7

    def ___contains_days(date):
        return date is not None and len(date) >= 10

    # Without this guard a missing date would reach the parser: None is not
    # a string, and "" would fall into the "-28" fallback below and yield a
    # date unrelated to the publication.
    if not (
        ___contains_years(citing_complete_pub_date)
        and ___contains_years(cited_complete_pub_date)
    ):
        return ""

    # Fills in the components (month, day) that a partial date does not give
    DEFAULT_DATE = datetime(1970, 1, 1, 0, 0)

    def ___to_datetime(date):
        try:
            return parse(date, default=DEFAULT_DATE)
        except ValueError:
            # e.g. a 29th of February in a year that is not a leap year:
            # retry with the day clamped to the 28th
            return parse(date[:7] + "-28", default=DEFAULT_DATE)

    consider_months = ___contains_months(
        citing_complete_pub_date
    ) and ___contains_months(cited_complete_pub_date)
    consider_days = ___contains_days(
        citing_complete_pub_date
    ) and ___contains_days(cited_complete_pub_date)

    delta = relativedelta(
        ___to_datetime(citing_complete_pub_date),
        ___to_datetime(cited_complete_pub_date),
    )

    # The duration is negative only if the first non-zero component that is
    # actually reported (years, then months, then days) is negative
    is_negative = (
        delta.years < 0
        or (delta.years == 0 and delta.months < 0 and consider_months)
        or (
            delta.years == 0
            and delta.months == 0
            and delta.days < 0
            and consider_days
        )
    )

    result = "-" if is_negative else ""
    result += "P%sY" % abs(delta.years)

    if consider_months:
        result += "%sM" % abs(delta.months)

    if consider_days:
        result += "%sD" % abs(delta.days)

    return result

def __get_issn(body):
cur_id = ""
Expand Down Expand Up @@ -241,11 +344,10 @@ def __normalise(o):
s = str(o)
return sub("\s+", " ", s).strip()

def __ocmeta_parser(dois,pre="doi"):
api = "http://127.0.0.1/meta/api/v1/metadata/"
str_dois = "__".join([pre+":" + d for d in dois])
def __ocmeta_parser(ids, pre="doi"):
api = "https://test.opencitations.net/meta/api/v1/metadata/"

r = get(api + str_dois, headers={"User-Agent": "INDEX REST API (via OpenCitations - http://opencitations.net; mailto:[email protected])"}, timeout=60)
r = get(api + "__".join(ids), headers={"User-Agent": "INDEX REST API (via OpenCitations - http://opencitations.net; mailto:[email protected])"}, timeout=60)

f_res = {}
if r.status_code == 200:
Expand All @@ -254,14 +356,21 @@ def __ocmeta_parser(dois,pre="doi"):

for body in json_res:

id = ""
id = None
omid = None
if "id" in body:
for p_id in body["id"].split(" "):
if str(p_id).startswith(pre):
id = str(p_id)
break
if str(p_id).startswith("omid"):
omid = str(p_id)

if omid == None:
continue

authors = []
l_authors_id = []
authors_orcid = []
if "author" in body:
if body["author"] != "":
for author in body["author"].split(";"):
Expand All @@ -271,19 +380,23 @@ def __ocmeta_parser(dois,pre="doi"):
if len(author_ids) > 0:
author_string = author.replace(author_ids[0],"").strip()
if len(author_orcid) > 0:
authors_orcid.append(author_orcid[0].strip())
author_string = author_string+", "+author_orcid[0].strip()
if author_string is not None:
authors.append(__normalise(author_string))

source_title = ""
source_id = ""
all_source_ids = []
if "venue" in body:
if body["venue"] != "":
source_title_string = body["venue"]

source_issn = findall(r"(issn\:[\d\-^\]]{1,})",source_title_string)
source_isbn = findall(r"(isbn\:[\d\-^\]]{1,})",source_title_string)
source_ids = findall(r"\[.{1,}\]",source_title_string)
if len(source_ids) > 0:
all_source_ids = source_ids[0].split(" ")
source_title_string = source_title_string.replace(source_ids[0],"").strip()
if len(source_issn) > 0:
source_id = source_issn[0]
Expand Down Expand Up @@ -314,13 +427,22 @@ def __ocmeta_parser(dois,pre="doi"):
if "page" in body:
page = __normalise(body["page"])

# ["id", "author", "year", "pub_date", "title", "source_title", "volume", "issue", "page", "source_id"]
f_res[id] = ["; ".join(authors),year,pub_date,title,source_title,source_id,volume,issue,page]
f_res[omid] = {
"id": id,
"authors_str": "; ".join(authors),
"authors_orcid": authors_orcid,
"pub_date": pub_date,
"title": title,
"source_title": source_title,
"source_id": source_id,
"all_source_ids": all_source_ids,
"volume": volume,
"issue": issue,
"page":page
}

return f_res


# ["id", "author", "year", "pub_date", "title", "source_title", "volume", "issue", "page", "source_id"]
return f_res

def __crossref_parser(doi):
Expand Down

0 comments on commit 0e652a8

Please sign in to comment.