citations and references

opencitations · Sep 12, 2023 · 0e652a8 · 0e652a8
1 parent 39d3837
commit 0e652a8
Show file tree

Hide file tree

Showing 2 changed files with 222 additions and 32 deletions.
diff --git a/index_v1.hf b/index_v1.hf
@@ -529,7 +529,7 @@ SELECT ?val (count(?oci) as ?citation_count) (GROUP_CONCAT(?citing; separator=";
 #type operation
 #doi str(10\..+)
 #preprocess lower(doi) --> doi2omid(doi)
-#postprocess citations_info(citing,cited)
+#postprocess citations_info(oci,citing,cited)
 #method get
 #description This operation retrieves the citation data for all the references appearing in the reference lists of other citing works to the bibliographic entity identified by the input DOI, that constitute the incoming citations of that identified bibliographic entity.
 
@@ -583,3 +583,71 @@ WHERE {
       BIND(<https://w3id.org/oc/meta/br/[[doi]]> as ?cited) .
 }
 LIMIT 50000
+
+#url /references-uniindex/{doi}
+#type operation
+#doi str(10\..+)
+#preprocess lower(doi) --> doi2omid(doi)
+#postprocess citations_info(oci,citing,cited)
+#method get
+#description This operation retrieves the citation data for all the outgoing references to other cited works appearing in the reference list of the bibliographic entity identified by the input DOI.
+
+The fields returned by this operation are:
+
+* *oci*: the Open Citation Identifier (OCI) of the citation in consideration;
+* *citing*: the DOI of the citing entity;
+* *cited*: the DOI of the cited entity;
+* *creation*: the creation date of the citation according to the [ISO date format](https://en.wikipedia.org/wiki/ISO_8601) `YYYY-MM-DD`, which corresponds to the publication date of the citing entity;
+* *timespan*: the interval between the publication date of the cited entity and the publication date of the citing entity, expressed using the [XSD duration format](https://www.w3.org/TR/xmlschema11-2/#duration) `PnYnMnD`;
+* *journal_sc*: it records whether the citation is a journal self-citation (i.e. the citing and the cited entities are published in the same journal);
+* *author_sc*: it records whether the citation is an author self-citation (i.e. the citing and the cited entities have at least one author in common).
+
+The values of all the fields are prefixed with `[index name] => `, so as to clearly identify where the related data comes from, and can contain one or more pieces of information, separated by `; `. This is particularly useful when a citation is actually contained in two or more OpenCitations Indexes. In this case, only one row will be returned, and the prefix used in the various data allows one to understand the source Index of such data.
+#call /references/10.1186/1756-8722-6-59
+#output_json [
+      {
+          "oci": "02001010806360107050663080702026306630509-0200101080636102703040309",
+          "citing": "10.1186/1756-8722-6-59",
+          "cited": "10.1186/ar3439",
+          "creation": "2013",
+          "timespan": "P2Y",
+          "journal_sc": "no",
+          "author_sc": "no"
+      },
+      {
+          "oci": "02001010806360107050663080702026306630509-0200101080636102704000806",
+          "citing": "10.1186/1756-8722-6-59",
+          "cited": "10.1186/ar4086",
+          "creation": "2013",
+          "timespan": "P1Y",
+          "journal_sc": "no",
+          "author_sc": "no"
+      },
+      {
+          "oci": "02001010806360107050663080702026306630509-020010200003619122437020001023704023707090006",
+          "citing": "10.1186/1756-8722-6-59",
+          "cited": "10.1200/jco.2012.42.7906",
+          "creation": "2013",
+          "timespan": "P0Y",
+          "journal_sc": "no",
+          "author_sc": "no"
+      },
+      {
+          "oci": "02001010806360107050663080702026306630509-02003010009360008080300010805370200010237060604070907",
+          "citing": "10.1186/1756-8722-6-59",
+          "cited": "10.3109/08830185.2012.664797",
+          "creation": "2013",
+          "timespan": "P0Y",
+          "journal_sc": "no",
+          "author_sc": "no"
+      }
+]
+#field_type str(oci) str(citing) str(cited) datetime(creation) duration(timespan) str(?journal_sc) str(?author_sc)
+#sparql PREFIX cito: <http://purl.org/spar/cito/>
+SELECT ?oci ?citing ?cited
+WHERE {
+      ?oci cito:hasCitingEntity <https://w3id.org/oc/meta/br/[[doi]]> .
+      ?oci cito:hasCitedEntity ?cited .
+      BIND(<https://w3id.org/oc/meta/br/[[doi]]> as ?citing) .
+}
+LIMIT 50000
diff --git a/indexapi_v1.py b/indexapi_v1.py
@@ -20,6 +20,9 @@
 from rdflib import Graph, URIRef
 from re import sub,findall
 from json import loads
+from datetime import datetime
+from dateutil.relativedelta import relativedelta
+from dateutil.parser import parse
 
 
 def lower(s):
@@ -161,38 +164,138 @@ def metadata(res, *args):
 
 # args must contain the [[citing]] and [[cited]]
 def citations_info(res, *args):
+
+    def ___get_omid_str(val):
+        return "omid:"+val.split("oc/meta/")[1]
+
     header = res[0]
-    fields = [header.index(args[0]), header.index(args[1])]
+    oci_idx = header.index(args[0]);
+    citing_idx = header.index(args[1])
+    cited_idx = header.index(args[2])
     index_meta = {}
 
-    additional_fields = ["creation", "timespan", "journal_sc","author_sc"]
-    header.extend(additional_fields)
-
-    all_entities = set()
+    #all_entities = ["omid:br/06101068294","omid:br/0610123167","omid:br/06101494166"]
+    res_entities = {}
+    all_entities = []
     if len(res) > 1:
-        for row in res[1:]:
-            for f in fields:
-                entity = row[f][1].split("oc/meta/")[1][:-1]
-                all_entities.add(str(entity))
+        for idx, row in enumerate(res[1:]):
+            res_entities[idx] = []
+            res_entities[idx] += [___get_omid_str(row[citing_idx][1]), ___get_omid_str(row[cited_idx][1])]
+            all_entities += [___get_omid_str(row[citing_idx][1]), ___get_omid_str(row[cited_idx][1])]
 
-    # ["id", "author", "year", "pub_date", "title", "source_title", "volume", "issue", "page", "source_id"]
-    all_entities = {"br/06101068294"}
-    r = __ocmeta_parser(list(all_entities),"omid")
+    # delete the oci, citing and cited columns
+    res = [[elem for idx, elem in enumerate(row) if idx != oci_idx and idx != citing_idx and idx != cited_idx] for row in res]
+
+    additional_fields = ["oci", "citing", "cited", "creation", "timespan", "journal_sc","author_sc"]
+    header = res[0]
+    header.extend(additional_fields)
+
+    r = {}
+    # call __ocmeta_parser on batches of at most STEP entities at a time
+    STEP = 8
+    all_entities = list(set(all_entities))
+    for i in range(0, len(all_entities), STEP):
+        r.update(__ocmeta_parser(all_entities[i:i + STEP]))
+        # if i > 200:
+        #     break
+    r.update(__ocmeta_parser(all_entities[i:]))
 
     # process and elaborate additional fields
     #creation = entities_data["citing"][1]
-    for row in res[1:]:
-        row.extend([
-            "__".join(r.keys()),
-            "",
-            "",
-            ""
-        ])
-
-        #row.extend(["","","",""])
+    if len(res) > 1:
+        for idx, row in enumerate(res[1:]):
+
+            citing_entity = res_entities[idx][0]
+            cited_entity = res_entities[idx][1]
+
+            oci_val = citing_entity.replace("omid:br/","") +"-"+ cited_entity.replace("omid:br/","")
+            if citing_entity in r and cited_entity in r:
+                row.extend([
+                    # oci value
+                    oci_val,
+                    # citing
+                    r[citing_entity]["id"].replace("doi:",""),
+                    # cited
+                    r[cited_entity]["id"].replace("doi:",""),
+                    # creation = citing[pub_date]
+                    r[citing_entity]["pub_date"],
+                    # timespan = citing[pub_date] - cited[pub_date]
+                    __cit_duration(r[citing_entity]["pub_date"], r[cited_entity]["pub_date"]),
+                    # journal_sc = compare citing[all_source_ids] and cited[all_source_ids]
+                    __cit_journal_sc(r[citing_entity]["all_source_ids"], r[cited_entity]["all_source_ids"]),
+                    # author_sc = compare citing[authors_orcid] and cited[authors_orcid]
+                    __cit_author_sc(r[citing_entity]["authors_orcid"], r[cited_entity]["authors_orcid"]),
+                ])
+            else:
+                row.extend([oci_val,"","","","","",""])
 
     return res, True
 
+def __cit_journal_sc(citing_source_ids, cited_source_ids):
+    if len(set(citing_source_ids).intersection(set(cited_source_ids))) > 0:
+        return "yes"
+    return "no"
+
+def __cit_author_sc(citing_authors, cited_authors):
+    if len(set(citing_authors).intersection(set(cited_authors))) > 0:
+        return "yes"
+    return "no"
+
+def __cit_duration(citing_complete_pub_date, cited_complete_pub_date):
+
+    def ___contains_years(date):
+        return date is not None and len(date) >= 4
+
+    def ___contains_months(date):
+        return date is not None and len(date) >= 7
+
+    def ___contains_days(date):
+        return date is not None and len(date) >= 10
+
+    DEFAULT_DATE = datetime(1970, 1, 1, 0, 0)
+    consider_months = ___contains_months(citing_complete_pub_date) and ___contains_months(cited_complete_pub_date)
+    consider_days = ___contains_days(citing_complete_pub_date) and ___contains_days(cited_complete_pub_date)
+
+    try:
+        citing_pub_datetime = parse(
+            citing_complete_pub_date, default=DEFAULT_DATE
+        )
+    except ValueError:  # It is not a leap year
+        citing_pub_datetime = parse(
+            citing_complete_pub_date[:7] + "-28", default=DEFAULT_DATE
+        )
+    try:
+        cited_pub_datetime = parse(
+            cited_complete_pub_date, default=DEFAULT_DATE
+        )
+    except ValueError:  # It is not a leap year
+        cited_pub_datetime = parse(
+            cited_complete_pub_date[:7] + "-28", default=DEFAULT_DATE
+        )
+
+    delta = relativedelta(citing_pub_datetime, cited_pub_datetime)
+
+    result = ""
+    if (
+        delta.years < 0
+        or (delta.years == 0 and delta.months < 0 and consider_months)
+        or (
+            delta.years == 0
+            and delta.months == 0
+            and delta.days < 0
+            and consider_days
+        )
+    ):
+        result += "-"
+    result += "P%sY" % abs(delta.years)
+
+    if consider_months:
+        result += "%sM" % abs(delta.months)
+
+    if consider_days:
+        result += "%sD" % abs(delta.days)
+
+    return result
 
 def __get_issn(body):
     cur_id = ""
@@ -241,11 +344,10 @@ def __normalise(o):
         s = str(o)
     return sub("\s+", " ", s).strip()
 
-def __ocmeta_parser(dois,pre="doi"):
-    api = "http://127.0.0.1/meta/api/v1/metadata/"
-    str_dois = "__".join([pre+":" + d for d in dois])
+def __ocmeta_parser(ids, pre="doi"):
+    api = "https://test.opencitations.net/meta/api/v1/metadata/"
 
-    r = get(api + str_dois, headers={"User-Agent": "INDEX REST API (via OpenCitations - http://opencitations.net; mailto:[email protected])"}, timeout=60)
+    r = get(api + "__".join(ids), headers={"User-Agent": "INDEX REST API (via OpenCitations - http://opencitations.net; mailto:[email protected])"}, timeout=60)
 
     f_res = {}
     if r.status_code == 200:
@@ -254,14 +356,21 @@ def __ocmeta_parser(dois,pre="doi"):
 
             for body in json_res:
 
-                id = ""
+                id = None
+                omid = None
                 if "id" in body:
                     for p_id in body["id"].split(" "):
                         if str(p_id).startswith(pre):
                             id = str(p_id)
-                            break
+                        if str(p_id).startswith("omid"):
+                            omid = str(p_id)
+
+                if omid == None:
+                    continue
 
                 authors = []
+                l_authors_id = []
+                authors_orcid = []
                 if "author" in body:
                     if body["author"] != "":
                         for author in body["author"].split(";"):
@@ -271,19 +380,23 @@ def __ocmeta_parser(dois,pre="doi"):
                             if len(author_ids) > 0:
                                 author_string = author.replace(author_ids[0],"").strip()
                                 if len(author_orcid) > 0:
+                                    authors_orcid.append(author_orcid[0].strip())
                                     author_string = author_string+", "+author_orcid[0].strip()
                             if author_string is not None:
                                 authors.append(__normalise(author_string))
 
                 source_title = ""
                 source_id = ""
+                all_source_ids = []
                 if "venue" in body:
                     if body["venue"] != "":
                         source_title_string = body["venue"]
+
                         source_issn = findall(r"(issn\:[\d\-^\]]{1,})",source_title_string)
                         source_isbn = findall(r"(isbn\:[\d\-^\]]{1,})",source_title_string)
                         source_ids = findall(r"\[.{1,}\]",source_title_string)
                         if len(source_ids) > 0:
+                            all_source_ids = source_ids[0].split(" ")
                             source_title_string = source_title_string.replace(source_ids[0],"").strip()
                         if len(source_issn) > 0:
                             source_id = source_issn[0]
@@ -314,13 +427,22 @@ def __ocmeta_parser(dois,pre="doi"):
                 if "page" in body:
                     page = __normalise(body["page"])
 
-                # ["id", "author", "year", "pub_date", "title", "source_title", "volume", "issue", "page", "source_id"]
-                f_res[id] = ["; ".join(authors),year,pub_date,title,source_title,source_id,volume,issue,page]
+                f_res[omid] = {
+                    "id": id,
+                    "authors_str": "; ".join(authors),
+                    "authors_orcid": authors_orcid,
+                    "pub_date": pub_date,
+                    "title": title,
+                    "source_title": source_title,
+                    "source_id": source_id,
+                    "all_source_ids": all_source_ids,
+                    "volume": volume,
+                    "issue": issue,
+                    "page":page
+                }
 
         return f_res
 
-
-    # ["id", "author", "year", "pub_date", "title", "source_title", "volume", "issue", "page", "source_id"]
     return f_res
 
 def __crossref_parser(doi):