Merge pull request #88 from coreofscience/scopus-support

Scopus support
coreofscience · Oct 16, 2020 · 6e9ae17 · 6e9ae17
2 parents ff6cd97 + 1bb6e2d
commit 6e9ae17
Show file tree

Hide file tree

Showing 19 changed files with 36,085 additions and 158 deletions.
diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       max-parallel: 4
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: [3.7, 3.8, 3.9]
 
     steps:
     - uses: actions/checkout@v1

diff --git a/.zenodo.json b/.zenodo.json
@@ -2,7 +2,7 @@
     "description": "Translates isi web of knowledge files into python objects.",
     "license": "MIT",
     "title": "coreofscience/python-wostools",
-    "version": "v2.0.7",
+    "version": "v3.0.0",
     "upload_type": "software",
     "publication_date": "2018-08-13",
     "creators": [
@@ -25,7 +25,7 @@
     "related_identifiers": [
         {
             "scheme": "url",
-            "identifier": "https://github.com/coreofscience/python-wostools/tree/v2.0.7",
+            "identifier": "https://github.com/coreofscience/python-wostools/tree/v3.0.0",
             "relation": "isSupplementTo"
         },
         {

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -110,9 +110,6 @@ Before you submit a pull request, check that it meets these guidelines:
 2.  If the pull request adds functionality, the docs should be updated.
     Put your new functionality into a function with a docstring, and add
     the feature to the list in README.md.
-3.  The pull request should work for Python 3.6, and for PyPy. Check
-    <https://travis-ci.org/coreofscience/python-wostools/pull_requests>
-    and make sure that the tests pass for all supported Python versions.
 
 ## Tips
 

diff --git a/HISTORY.md b/HISTORY.md
@@ -1,5 +1,13 @@
 # History
 
+## 3.0.0 (2020-10-15)
+
+- (!) Adds scopus RIS format support.
+- Drops support for `LazyCollection`.
+- Adds documented support for Python 3.8 and 3.9.
+- Drops documented support for Python 3.6.
+- Improves article matching in collections.
+
 ## 2.0.7 (2020-08-23)
 
 - Remove from the collection those documents whose label is unknown or conflicting.

diff --git a/README.md b/README.md
@@ -18,8 +18,8 @@ Say you want to grab the title of all the articles in an ISI file, you
 can grab [this example file](docs/examples/bit-pattern-savedrecs.txt).
 
 ```python
->>> from wostools import CachedCollection
->>> collection = CachedCollection.from_filenames('docs/examples/bit-pattern-savedrecs.txt')
+>>> from wostools import Collection
+>>> collection = Collection.from_filenames('docs/examples/bit-pattern-savedrecs.txt')
 >>> for article in collection:
 ...     print(article.title)
 In situ grazing incidence small-angle X-ray scattering study of solvent vapor annealing in lamellae-forming block copolymer thin films: Trade-off of defects in deswelling
@@ -40,16 +40,17 @@ $ wostools to-json docs/examples/bit-pattern-savedrecs.txt --output=document.jso
 
 ## Features
 
--   Free software: MIT license
--   Just parses an ISI Web of Knowledge file and produces a native
-    python object.
--   Through the `CollectionLazy` object it can do this using the minimum
-    amount of memory it can possibly do.
--   It has a cli to extract documents and citation pairs for you :smile:
+- Free software: MIT license
+- Parses an ISI Web of Knowledge file and produces a native python object.
+- Parses RIS scopus files and produces a native python object.
+- Merges ISI and RIS files into enriched collections.
+- It has a cli to extract documents and citation pairs for you :smile:
 
 ## Credits
 
 This package was created with
 [Cookiecutter](https://github.com/audreyr/cookiecutter) and the
 [audreyr/cookiecutter-pypackage](https://github.com/audreyr/cookiecutter-pypackage)
 project template.
+
+Development of this package is supported by [Core of Science](https://coreofscience.com).
diff --git a/docs/examples/scopus.ris b/docs/examples/scopus.ris
diff --git a/requirements_dev.txt b/requirements_dev.txt
@@ -1,9 +1,9 @@
-flake8==3.8.3
-coverage==5.2.1
+flake8==3.8.4
+coverage==5.3
 
-pytest==6.0.1
+pytest==6.1.1
 pytest-runner==5.2
 pytest-watch==4.2.0
-pytest-bdd==3.4.0
+pytest-bdd==4.0.1
 
 dataclasses==0.7; python_version < "3.7"
diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
 with open("HISTORY.md") as history_file:
     history = history_file.read()
 
-requirements = ["Click>=7.0"]
+requirements = ["Click>=7.0,<8"]
 
 setup_requirements = ["pytest-runner"]
 
@@ -26,8 +26,9 @@
         "License :: OSI Approved :: MIT License",
         "Natural Language :: English",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
     ],
     entry_points={"console_scripts": ["wostools=wostools.cli:main"]},
     description="Translates isi web of knowledge files into python objects.",
@@ -42,7 +43,7 @@
     test_suite="tests",
     tests_require=test_requirements,
     url="https://github.com/coreofscience/python-wostools",
-    version="2.0.7",
+    version="3.0.0",
     zip_safe=False,
     long_description_content_type="text/markdown",
 )
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -2,7 +2,7 @@
 Configuration file for python-wostools tests.
 """
 
-from wostools import Article, LazyCollection, CachedCollection
+from wostools import Article
 
 import pytest
 import io
@@ -102,13 +102,3 @@ def filename_single_document():
 @pytest.fixture
 def filename_many_documents():
     return "docs/examples/bit-pattern-savedrecs.txt"
-
-
-@pytest.fixture(params=[CachedCollection, LazyCollection])
-def collection_single_document(request, filename_single_document):
-    return request.param.from_filenames(filename_single_document)
-
-
-@pytest.fixture(params=[CachedCollection, LazyCollection])
-def collection_many_documents(request, filename_many_documents):
-    return request.param.from_filenames(filename_many_documents)
diff --git a/wostools/__init__.py b/wostools/__init__.py
@@ -2,10 +2,10 @@
 
 __author__ = """Core of Science"""
 __email__ = "[email protected]"
-__version__ = "2.0.7"
+__version__ = "3.0.0"
 
 from wostools.article import Article
-from wostools.lazy import LazyCollection
 from wostools.cached import CachedCollection
+from wostools.cached import CachedCollection as Collection
 
-__all__ = ["CachedCollection", "LazyCollection", "Article"]
+__all__ = ["CachedCollection", "Collection", "Article"]
diff --git a/wostools/article.py b/wostools/article.py
@@ -14,11 +14,11 @@
 )
 
 ISI_CITATION_PATTERN = re.compile(
-    r"""^(?P<AU>[^,]+)?,[ ]         # First author
-        (?P<PY>\d{4})?,[ ]          # Publication year
-        (?P<J9>[^,]+)?              # Journal
+    r"""^(?P<AU>[^,]+),[ ]          # First author
+        (?P<PY>\d{4}),[ ]           # Publication year
+        (?P<J9>[^,]+)               # Journal
         (,[ ]V(?P<VL>[\w\d-]+))?    # Volume
-        (,[ ][Pp](?P<BP>\d+))?      # Start page
+        (,[ ][Pp](?P<BP>\w+))?      # Start page
         (,[ ]DOI[ ](?P<DI>.+))?     # The all important DOI
         """,
     re.X,
@@ -55,19 +55,37 @@ def __init__(
         self.extra: Mapping[str, Any] = extra or {}
 
     @property
-    def label(self):
+    def label(self) -> str:
+        if self.doi:
+            return self.doi
+        return self._label()
+
+    def _label(self, exclude_doi=False, lower_p=False) -> str:
         if not (self.authors and self.year and self.journal):
             raise MissingLabelFields(self)
+        page_prefix = "p" if lower_p else "P"
         pieces = {
             "AU": self.authors[0].replace(",", ""),
             "PY": str(self.year),
             "J9": str(self.journal),
             "VL": f"V{self.volume}" if self.volume else None,
-            "BP": f"P{self.page}" if self.page else None,
+            "BP": f"{page_prefix}{self.page}" if self.page else None,
             "DI": f"DOI {self.doi}" if self.doi else None,
         }
         return ", ".join(value for value in pieces.values() if value)
 
+    @property
+    def labels(self) -> Set[str]:
+        if not self.doi:
+            return {self.label, self._label(lower_p=True)}
+        return {
+            self.doi,
+            self.label,
+            self._label(exclude_doi=True),
+            self._label(lower_p=True),
+            self._label(exclude_doi=True, lower_p=True),
+        }
+
     def to_dict(self, simplified=True):
         """
         Transform the article into some key value pairs for easy transportation.
@@ -95,12 +113,12 @@ def to_dict(self, simplified=True):
         }
 
     def merge(self, other: "Article") -> "Article":
-        if self.label != other.label:
+        if other.label not in self.labels:
             logger.warning(
                 "\n".join(
                     [
                         "Mixing articles with different labels might result in tragedy",
-                        f"  mine:   {self.label}",
+                        f"  mine:   {self.labels}",
                         f"  others: {other.label}",
                     ]
                 )

diff --git a/wostools/base.py b/wostools/base.py
@@ -4,10 +4,11 @@
 
 import glob
 import logging
-from typing import Iterable, Iterator, Tuple
+from typing import Iterable, Iterator, TextIO, Tuple
 
 from wostools.article import Article
-from wostools.exceptions import InvalidReference
+from wostools.exceptions import InvalidReference, WosToolsError
+from wostools.sources import isi, scopus
 
 logger = logging.getLogger(__name__)
 
@@ -30,7 +31,7 @@ def from_glob(cls, pattern):
             pattern (str): String with the pattern to be passed to glob.
 
         Returns:
-            CollectionLazy: Collection with the articles by using the pattern.
+            BaseCollection: Collection with the articles by using the pattern.
         """
         return cls.from_filenames(*glob.glob(pattern))
 
@@ -42,34 +43,30 @@ def from_filenames(cls, *filenames):
             filenames (str): String with the filename.
 
         Returns:
-            CollectionLazy: Collection with the articles by reading the
+            BaseCollection: Collection with the articles by reading the
                 filenames.
         """
         files = [open(filename, encoding="utf-8-sig") for filename in filenames]
         return cls(*files)
 
     @property
-    def _article_texts(self) -> Iterable[str]:
+    def _iter_files(self) -> Iterable[TextIO]:
         """Iterates over all the single article texts in the colection.
 
         Returns:
             generator: A generator of strings with the text articles.
         """
         for filehandle in self._files:
             filehandle.seek(0)
-            data = filehandle.read()
+            yield filehandle
             filehandle.seek(0)
-            for article_text in data.split("\n\n"):
-                if article_text != "EF":
-                    yield article_text
 
     def _articles(self) -> Iterable[Article]:
-        """
-        Should iterate over all the articles in the ISI file, excluding references.
-        """
-        raise NotImplementedError(
-            "Sub classes should know how to iterate over articles"
-        )
+        for file in self._iter_files:
+            try:
+                yield from isi.parse_file(file)
+            except WosToolsError:
+                yield from scopus.parse_file(file)
 
     def __iter__(self) -> Iterator[Article]:
         """

diff --git a/wostools/cached.py b/wostools/cached.py
@@ -5,7 +5,7 @@
 import itertools
 import logging
 from contextlib import suppress
-from typing import Dict, Iterable, Iterator, Tuple
+from typing import Dict, Iterable, Iterator, Set, Tuple
 
 from wostools.article import Article
 from wostools.base import BaseCollection
@@ -23,17 +23,29 @@ def __init__(self, *files):
         super().__init__(*files)
         self._cache_key = None
         self._cache: Dict[str, Article] = {}
+        self._labels: Dict[str, Set[str]] = {}
+        self._refs: Dict[str, str] = {}
         self._preheat()
 
-    def _articles(self) -> Iterable[Article]:
-        for article_text in self._article_texts:
-            yield Article.from_isi_text(article_text)
-
-    def _add_article(self, article):
-        label = article.label
-        if label in self._cache:
-            article = article.merge(self._cache[label])
-        self._cache[label] = article
+    def _add_article(self, article: Article):
+        existing_labels = {
+            alias
+            for label in article.labels
+            for alias in self._labels.get(label, set())
+        }
+        all_labels = existing_labels | article.labels
+        existing_refs = {
+            self._refs[label] for label in all_labels if label in self._refs
+        }
+        for ref in existing_refs:
+            other = self._cache.pop(ref, None)
+            if other is not None:
+                article = article.merge(other)
+
+        self._cache[article.label] = article
+        for label in all_labels:
+            self._labels[label] = all_labels
+            self._refs[label] = article.label
 
     def _preheat(self):
         # Preheat our cache
@@ -94,7 +106,8 @@ def citation_pairs(self) -> Iterable[Tuple[Article, Article]]:
             labels, where the first element is the article which cites the
             second element.
         """
-        for article in self._cache.values():
+        for article in self:
             for reference in article.references:
-                if reference in self._cache:
-                    yield (article, self._cache[reference])
+                if reference in self._refs:
+                    label = self._refs[reference]
+                    yield (article, self._cache[label])
diff --git a/wostools/cli.py b/wostools/cli.py
@@ -41,7 +41,7 @@ def citation_pairs(sources, output):
     json.dump(pairs, output, indent=2)
 
 
-@main.command("to-dict")
+@main.command("to-json")
 @click.argument("sources", type=click.File("r"), nargs=-1)
 @click.option(
     "--output",
@@ -58,7 +58,7 @@ def citation_pairs(sources, output):
     default=False,
     help="Add extra info to the output",
 )
-def to_dict(sources, output, more):
+def to_json(sources, output, more):
     """
     Build a collection by using the sources and print the citation pairs in json
     format or dumps them in the `output`.

diff --git a/wostools/exceptions.py b/wostools/exceptions.py
@@ -13,6 +13,11 @@ def __init__(self, reference: str):
         super().__init__(f"{reference} does not look like an ISI citation")
 
 
+class InvalidScopusFile(WosToolsError, ValueError):
+    def __init__(self):
+        super().__init__("The file does not look like a valid bib file")
+
+
 class InvalidIsiLine(WosToolsError, ValueError):
     """
     Raised when we encounter an invalid line when processing an ISI file.