Skip to content

Commit

Permalink
Merge pull request #185 from surfedushare/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
fako authored Sep 18, 2023
2 parents fac5b7b + e2c3c0a commit 4e4c211
Show file tree
Hide file tree
Showing 15 changed files with 71 additions and 36 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ If you want to run the project outside of a container you'll need to add the fol
127.0.0.1 harvester
127.0.0.1 service
127.0.0.1 redis
127.0.0.1 tika
```

This way you can reach these containers by name from outside the container network.
Expand Down
3 changes: 3 additions & 0 deletions environments/acceptance/invoke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ opensearch:
dutch: "analyzers/F20018998"
domain_name: "surfpol-main"

tika:
host: "http://localhost:9998"

aws:
is_aws: true
load_secrets: true
3 changes: 3 additions & 0 deletions environments/development/invoke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ opensearch:
dutch: "analyzers/F133444250"
domain_name: "surfpol-main"

tika:
host: "http://localhost:9998"

aws:
is_aws: true
load_secrets: true
3 changes: 3 additions & 0 deletions environments/localhost/invoke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ opensearch:
dutch: "decompound_word_list.nl.txt"
domain_name: null

tika:
host: "http://tika:9998"

aws:
is_aws: false
load_secrets: true
Expand Down
3 changes: 3 additions & 0 deletions environments/production/invoke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ opensearch:
dutch: "analyzers/F131123737"
domain_name: "surfpol-main"

tika:
host: "http://localhost:9998"

aws:
is_aws: true
load_secrets: true
4 changes: 2 additions & 2 deletions harvester/core/fixtures/resources-basic-delta.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"model": "core.httptikaresource",
"pk": 10,
"fields": {
"uri": "localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fsurfsharekit.nl%2Fobjectstore%2F182216be-31a2-43c3-b7de-e5dd355b09f7&fetcherName=http",
"uri": "tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fsurfsharekit.nl%2Fobjectstore%2F182216be-31a2-43c3-b7de-e5dd355b09f7&fetcherName=http",
"status": 200,
"config": "{\"_namespace\": \"global\", \"_private\": [\"_private\", \"_defaults\", \"_namespace\"]}",
"created_at": "2023-01-12T15:26:56.538Z",
Expand All @@ -13,7 +13,7 @@
"retainer_id": null,
"data_hash": "",
"request": {
"url": "http://localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fsurfsharekit.nl%2Fobjectstore%2F182216be-31a2-43c3-b7de-e5dd355b09f7&fetcherName=http",
"url": "http://tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fsurfsharekit.nl%2Fobjectstore%2F182216be-31a2-43c3-b7de-e5dd355b09f7&fetcherName=http",
"args": [
"https://surfsharekit.nl/objectstore/182216be-31a2-43c3-b7de-e5dd355b09f7"
],
Expand Down
36 changes: 18 additions & 18 deletions harvester/core/fixtures/resources-basic-initial.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"model": "core.httptikaresource",
"pk": 1,
"fields": {
"uri": "localhost:9998/rmeta/text?fetchKey=http%3A%2F%2Fonline.codarts.nl%2Fmod%2Fpage%2Fview.php%3Fid%3D313&fetcherName=http",
"uri": "tika:9998/rmeta/text?fetchKey=http%3A%2F%2Fonline.codarts.nl%2Fmod%2Fpage%2Fview.php%3Fid%3D313&fetcherName=http",
"status": 500,
"config": "{\"_namespace\": \"global\", \"_private\": [\"_private\", \"_defaults\", \"_namespace\"]}",
"created_at": "2023-01-12T15:26:55.240Z",
Expand All @@ -13,7 +13,7 @@
"retainer_id": null,
"data_hash": "",
"request": {
"url": "http://localhost:9998/rmeta/text?fetchKey=http%3A%2F%2Fonline.codarts.nl%2Fmod%2Fpage%2Fview.php%3Fid%3D313&fetcherName=http",
"url": "http://tika:9998/rmeta/text?fetchKey=http%3A%2F%2Fonline.codarts.nl%2Fmod%2Fpage%2Fview.php%3Fid%3D313&fetcherName=http",
"args": [
"http://online.codarts.nl/mod/page/view.php?id=313"
],
Expand All @@ -40,7 +40,7 @@
"model": "core.httptikaresource",
"pk": 2,
"fields": {
"uri": "localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fsurfsharekit.nl%2Fobjectstore%2F182216be-31a2-43c3-b7de-e5dd355b09f7&fetcherName=http",
"uri": "tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fsurfsharekit.nl%2Fobjectstore%2F182216be-31a2-43c3-b7de-e5dd355b09f7&fetcherName=http",
"status": 404,
"config": "{\"_namespace\": \"global\", \"_private\": [\"_private\", \"_defaults\", \"_namespace\"]}",
"created_at": "2023-01-12T15:26:56.538Z",
Expand All @@ -50,7 +50,7 @@
"retainer_id": null,
"data_hash": "",
"request": {
"url": "http://localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fsurfsharekit.nl%2Fobjectstore%2F182216be-31a2-43c3-b7de-e5dd355b09f7&fetcherName=http",
"url": "http://tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fsurfsharekit.nl%2Fobjectstore%2F182216be-31a2-43c3-b7de-e5dd355b09f7&fetcherName=http",
"args": [
"https://surfsharekit.nl/objectstore/182216be-31a2-43c3-b7de-e5dd355b09f7"
],
Expand Down Expand Up @@ -80,7 +80,7 @@
"model": "core.httptikaresource",
"pk": 3,
"fields": {
"uri": "localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DZl59P5ZNX3M&fetcherName=http",
"uri": "tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DZl59P5ZNX3M&fetcherName=http",
"status": 200,
"config": "{\"_namespace\": \"global\", \"_private\": [\"_private\", \"_defaults\", \"_namespace\"]}",
"created_at": "2023-01-12T15:26:57.146Z",
Expand All @@ -90,7 +90,7 @@
"retainer_id": null,
"data_hash": "",
"request": {
"url": "http://localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DZl59P5ZNX3M&fetcherName=http",
"url": "http://tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DZl59P5ZNX3M&fetcherName=http",
"args": [
"https://www.youtube.com/watch?v=Zl59P5ZNX3M"
],
Expand Down Expand Up @@ -120,7 +120,7 @@
"model": "core.httptikaresource",
"pk": 4,
"fields": {
"uri": "localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fmaken.wikiwijs.nl%2F94812%2FMacro_meso_micro&fetcherName=http#!page-2935729",
"uri": "tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fmaken.wikiwijs.nl%2F94812%2FMacro_meso_micro&fetcherName=http#!page-2935729",
"status": 200,
"config": "{\"_namespace\": \"global\", \"_private\": [\"_private\", \"_defaults\", \"_namespace\"]}",
"created_at": "2023-01-12T15:26:57.318Z",
Expand All @@ -130,7 +130,7 @@
"retainer_id": null,
"data_hash": "",
"request": {
"url": "http://localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fmaken.wikiwijs.nl%2F94812%2FMacro_meso_micro&fetcherName=http#!page-2935729",
"url": "http://tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fmaken.wikiwijs.nl%2F94812%2FMacro_meso_micro&fetcherName=http#!page-2935729",
"args": [
"https://maken.wikiwijs.nl/94812/Macro_meso_micro#!page-2935729"
],
Expand Down Expand Up @@ -160,7 +160,7 @@
"model": "core.httptikaresource",
"pk": 5,
"fields": {
"uri": "localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fyoutu.be%2FD6_p6y3dvXw&fetcherName=http",
"uri": "tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fyoutu.be%2FD6_p6y3dvXw&fetcherName=http",
"status": 200,
"config": "{\"_namespace\": \"global\", \"_private\": [\"_private\", \"_defaults\", \"_namespace\"]}",
"created_at": "2023-01-12T15:26:57.899Z",
Expand All @@ -170,7 +170,7 @@
"retainer_id": null,
"data_hash": "",
"request": {
"url": "http://localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fyoutu.be%2FD6_p6y3dvXw&fetcherName=http",
"url": "http://tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fyoutu.be%2FD6_p6y3dvXw&fetcherName=http",
"args": [
"https://youtu.be/D6_p6y3dvXw"
],
Expand Down Expand Up @@ -200,7 +200,7 @@
"model": "core.httptikaresource",
"pk": 6,
"fields": {
"uri": "localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DwVxZDVriTZc&fetcherName=http",
"uri": "tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DwVxZDVriTZc&fetcherName=http",
"status": 200,
"config": "{\"_namespace\": \"global\", \"_private\": [\"_private\", \"_defaults\", \"_namespace\"]}",
"created_at": "2023-01-12T15:26:58.453Z",
Expand All @@ -210,7 +210,7 @@
"retainer_id": null,
"data_hash": "",
"request": {
"url": "http://localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DwVxZDVriTZc&fetcherName=http",
"url": "http://tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DwVxZDVriTZc&fetcherName=http",
"args": [
"https://www.youtube.com/watch?v=wVxZDVriTZc"
],
Expand Down Expand Up @@ -240,7 +240,7 @@
"model": "core.httptikaresource",
"pk": 7,
"fields": {
"uri": "localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DHx0kUX7MZ7g&fetcherName=http",
"uri": "tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DHx0kUX7MZ7g&fetcherName=http",
"status": 200,
"config": "{\"_namespace\": \"global\", \"_private\": [\"_private\", \"_defaults\", \"_namespace\"]}",
"created_at": "2023-01-12T15:26:58.950Z",
Expand All @@ -250,7 +250,7 @@
"retainer_id": null,
"data_hash": "",
"request": {
"url": "http://localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DHx0kUX7MZ7g&fetcherName=http",
"url": "http://tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DHx0kUX7MZ7g&fetcherName=http",
"args": [
"https://www.youtube.com/watch?v=Hx0kUX7MZ7g"
],
Expand Down Expand Up @@ -280,7 +280,7 @@
"model": "core.httptikaresource",
"pk": 8,
"fields": {
"uri": "localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DD4w-EClY664&fetcherName=http&t=8s",
"uri": "tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DD4w-EClY664&fetcherName=http&t=8s",
"status": 200,
"config": "{\"_namespace\": \"global\", \"_private\": [\"_private\", \"_defaults\", \"_namespace\"]}",
"created_at": "2023-01-12T15:26:59.537Z",
Expand All @@ -290,7 +290,7 @@
"retainer_id": null,
"data_hash": "",
"request": {
"url": "http://localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DD4w-EClY664&t=8s&fetcherName=http",
"url": "http://tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DD4w-EClY664&t=8s&fetcherName=http",
"args": [
"https://www.youtube.com/watch?v=D4w-EClY664&t=8s"
],
Expand Down Expand Up @@ -320,7 +320,7 @@
"model": "core.httptikaresource",
"pk": 9,
"fields": {
"uri": "localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3D21v9IYUcruI&fetcherName=http",
"uri": "tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3D21v9IYUcruI&fetcherName=http",
"status": 200,
"config": "{\"_namespace\": \"global\", \"_private\": [\"_private\", \"_defaults\", \"_namespace\"]}",
"created_at": "2023-01-12T15:27:00.148Z",
Expand All @@ -330,7 +330,7 @@
"retainer_id": null,
"data_hash": "",
"request": {
"url": "http://localhost:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3D21v9IYUcruI&fetcherName=http",
"url": "http://tika:9998/rmeta/text?fetchKey=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3D21v9IYUcruI&fetcherName=http",
"args": [
"https://www.youtube.com/watch?v=21v9IYUcruI"
],
Expand Down
16 changes: 8 additions & 8 deletions harvester/core/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,14 @@ def report_material(self, external_id, title=None, url=None, pipeline=None, stat
extra = self._get_extra_info(phase="report", material=material_info)
documents.info(f"Report: {external_id}", extra=extra)

def _get_document_counts(self, document_set):
total = document_set.count()
inactive_educational_level_count = document_set \
def _get_document_counts(self, document_queryset):
total = document_queryset.count()
inactive_educational_level_count = document_queryset \
.filter(properties__state="inactive", properties__lowest_educational_level__lt=1) \
.count()
inactive_copyright_count = \
document_set.filter(properties__state="inactive").count() - inactive_educational_level_count
deleted_count = document_set.filter(properties__state="deleted").count()
document_queryset.filter(properties__state="inactive").count() - inactive_educational_level_count
deleted_count = document_queryset.filter(properties__state="deleted").count()
return {
"total": total - inactive_educational_level_count - inactive_copyright_count - deleted_count,
"inactive_educational_level_count": inactive_educational_level_count,
Expand All @@ -126,7 +126,7 @@ def _get_document_counts(self, document_set):
}

def report_collection(self, collection, repository):
document_counts = self._get_document_counts(collection.document_set)
document_counts = self._get_document_counts(collection.documents)
extra = self._get_extra_info(result={
"source": collection.name,
"repository": repository,
Expand All @@ -142,14 +142,14 @@ def report_collection(self, collection, repository):
def report_dataset_version(self, dataset_version):
collection_names = set()
collection_ids = set()
for collection in dataset_version.collection_set.all():
for collection in dataset_version.sets.all():
if collection.name in collection_names:
continue
collection_names.add(collection.name)
collection_ids.add(collection.id)
self.report_collection(collection, None)
document_counts = self._get_document_counts(
dataset_version.document_set.filter(collection__id__in=collection_ids)
dataset_version.documents.filter(collection__id__in=collection_ids)
)
extra = self._get_extra_info(result={
"source": str(dataset_version),
Expand Down
8 changes: 8 additions & 0 deletions harvester/core/models/legacy/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,14 @@ class DatasetVersion(models.Model):
created_at = models.DateTimeField(auto_now_add=True)
version = models.CharField(max_length=50, null=False, blank=True)

@property
def documents(self):
    """
    Expose ``document_set`` under the name ``documents``.

    The logging code reads ``dataset_version.documents``; this property lets
    that attribute name resolve on this model.
    NOTE(review): presumably this mirrors the attribute name used by newer
    (non-legacy) models -- confirm.
    """
    return self.document_set

@property
def sets(self):
    """
    Expose ``collection_set`` under the name ``sets``.

    The logging code iterates ``dataset_version.sets.all()``; this property
    lets that attribute name resolve on this model.
    NOTE(review): presumably this mirrors the attribute name used by newer
    (non-legacy) models -- confirm.
    """
    return self.collection_set

def __str__(self):
    """Render the dataset name followed by this version's label and id."""
    return f"{self.dataset.name} (v={self.version}, id={self.id})"

Expand Down
6 changes: 6 additions & 0 deletions harvester/edurep/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
logger = logging.getLogger("harvester")


# Threshold used when determining record state: records whose lowest
# educational level is below this value are reported as "inactive".
LOWEST_EDUCATIONAL_LEVEL = 2  # HBO


class EdurepDataExtraction(object):

youtube_regex = re.compile(r".*(youtube\.com|youtu\.be).*", re.IGNORECASE)
Expand Down Expand Up @@ -61,6 +64,9 @@ def get_oaipmh_external_id(cls, soup, el):

@classmethod
def get_oaipmh_record_state(cls, soup, el):
    """
    Determine the state of a single OAI-PMH record.

    :param soup: the parsed response the record belongs to
    :param el: the record element; its ``header`` child carries the status
    :return: "deleted" when the header flags the record as deleted,
        "inactive" when the lowest educational level of the record is below
        LOWEST_EDUCATIONAL_LEVEL, otherwise the header status
        (defaulting to "active")
    """
    header = el.find('header')
    status = header.get("status", "active")
    # Deletion must win over the educational level check. Otherwise a record
    # flagged as deleted would be reported as "inactive" whenever its level
    # resolves below the threshold.
    # NOTE(review): deleted OAI-PMH records usually carry no metadata, so the
    # level lookup presumably yields a low default for them -- confirm.
    if status == "deleted":
        return status
    lowest_educational_level = cls.get_lowest_educational_level(soup, el)
    if lowest_educational_level < LOWEST_EDUCATIONAL_LEVEL:
        return "inactive"
    return status

Expand Down
4 changes: 3 additions & 1 deletion harvester/files/models/resources/metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
from json import JSONDecodeError

from django.conf import settings

from datagrowth.resources import HttpResource, URLResource
import extruct

Expand All @@ -10,7 +12,7 @@

class HttpTikaResourceBase(HttpResource):

URI_TEMPLATE = "http://localhost:9998/rmeta/text?fetchKey={}"
URI_TEMPLATE = settings.TIKA_HOST + "/rmeta/text?fetchKey={}"
PARAMETERS = {
"fetcherName": "http"
}
Expand Down
5 changes: 5 additions & 0 deletions harvester/harvester/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,11 @@
OPENSEARCH_ALIAS_PREFIX = environment.opensearch.alias_prefix


# Tika

TIKA_HOST = environment.tika.host


# Logging
# https://docs.djangoproject.com/en/2.2/topics/logging/
# https://docs.sentry.io/
Expand Down
7 changes: 1 addition & 6 deletions harvester/harvester/utils/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,8 @@ def prepare_seed(seed):
seed["language"] = {"metadata": language} if language else None
if seed["state"] == "deleted":
return
if not settings.ALLOW_CLOSED_ACCESS_DOCUMENTS and \
(not seed["copyright"] or seed["copyright"] in ["yes", "closed-access", "unknown"]):
seed["state"] = "inactive"
if seed["lowest_educational_level"] < settings.LOWEST_EDUCATIONAL_LEVEL:
seed["state"] = "inactive"
if settings.SHAREKIT_TEST_ORGANIZATION in seed["publishers"] and \
settings.ENVIRONMENT in ["acceptance", "production"]:
settings.ENVIRONMENT in ["production"]:
seed["state"] = "skipped"


Expand Down
2 changes: 1 addition & 1 deletion harvester/package.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
PACKAGE = {
"version": "1.39.10",
"version": "1.39.11",
"name": "harvester",
"cpu": "2048",
}
6 changes: 6 additions & 0 deletions harvester/sources/extraction/edurep.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
from django.utils.text import slugify


# Threshold used when determining record state: records whose lowest
# educational level is below this value are reported as "inactive".
LOWEST_EDUCATIONAL_LEVEL = 2  # HBO


class EdurepMetadataExtraction(ExtractProcessor):

youtube_regex = re.compile(r".*(youtube\.com|youtu\.be).*", re.IGNORECASE)
Expand All @@ -17,6 +20,9 @@ class EdurepMetadataExtraction(ExtractProcessor):

@classmethod
def get_record_state(cls, node):
    """
    Derive the record state from the lowest educational level of ``node``.

    :return: "inactive" when that level is below LOWEST_EDUCATIONAL_LEVEL,
        "active" otherwise
    """
    is_below_threshold = cls.get_lowest_educational_level(node) < LOWEST_EDUCATIONAL_LEVEL
    return "inactive" if is_below_threshold else "active"

#############################
Expand Down

0 comments on commit 4e4c211

Please sign in to comment.