-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathobsahator.py
104 lines (79 loc) · 4.37 KB
/
obsahator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# OBSAHATOR
#
# processes the digitized cover pages and table-of-contents (TOC) pages of
# newly bought documents. Cover and TOC pages are used to enhance the document records in the library OPAC.
#
import workflow
import config
import os
import re
from datetime import datetime
from modules import utility
from modules import catalogue
# Documents that were handed to a workflow without an IOError, keyed by folder name.
batch_dict = {}

# Identifier types routed to the monograph and to the periodical workflow.
_MONOGRAPH_ID_TYPES = ('isbn', 'cnb', 'sysno', 'ocolc')
_PERIODICAL_ID_TYPES = ('issn',)


def _timestamp():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return format(datetime.now(), '%Y-%m-%d %H:%M:%S')


def _describe_document(doc_path):
    """Build the dictionary describing one document folder.

    The result has the keys ``id`` (folder name without its first nine
    characters -- the 8-digit prefix plus, presumably, one separator
    character; TODO confirm the naming scheme), ``name`` (folder name),
    ``path`` (the folder path as given), ``toc`` (paths of regular files
    whose name starts with ``toc-``) and ``cover`` (paths of regular files
    whose name starts with neither ``toc-`` nor a dot).
    """
    name = os.path.basename(doc_path)
    # List the folder once and derive both file groups from the same snapshot.
    files = [f for f in os.listdir(doc_path)
             if os.path.isfile(os.path.join(doc_path, f))]
    return {
        'id': name[9:],
        'name': name,
        'path': doc_path,
        'toc': [os.path.join(doc_path, f) for f in files
                if f.startswith('toc-')],
        # Hidden (dot-prefixed) files are excluded from the cover pages.
        'cover': [os.path.join(doc_path, f) for f in files
                  if not f.startswith(('toc-', '.'))],
    }


def _fail(doc_path, message):
    """Print an ERROR log line and pass the same line to ``utility.set_fail``."""
    # Build the line once so the printed and the recorded text are identical.
    line = f"{_timestamp()} ERROR {message}"
    print(line)
    utility.set_fail(doc_path, [line])


def _process_document(doc_path):
    """Identify one document folder and run the matching workflow on it.

    Steps: describe the folder, determine its identifier, resolve the
    identifier to a sysno through the catalogue, then call the monograph or
    periodical workflow depending on the identifier type.  When the workflow
    returns, the document dictionary (extended with ``status`` and ``error``)
    is stored in ``batch_dict`` under the folder name.  Every failure path
    returns early without touching ``batch_dict``.
    """
    doc_dict = _describe_document(doc_path)
    print(f"{_timestamp()} INFO starting work on th folder: {doc_dict['name']}")

    # determine_identifier signals failure with the id_type 'fail'.
    id_type, id_value = utility.determine_identifier(doc_dict['id'])
    if id_type == 'fail':
        _fail(doc_path, f"unable to determine identifier for: {doc_dict['name']}")
        return
    doc_dict.update({'id_type': id_type, 'id_value': id_value})

    try:
        sysno = catalogue.resolve_id_to_sysno(id_type, id_value)
    except IOError as e:
        # NOTE(review): the exception object itself is passed here, while the
        # other set_fail calls pass formatted strings -- confirm set_fail copes.
        utility.set_fail(doc_path, [e])
        return
    if sysno is None:
        _fail(doc_path, f"unable to determine sysno for: {doc_dict['name']}")
        return
    doc_dict['sysno'] = sysno

    if id_type in _MONOGRAPH_ID_TYPES:
        process = workflow.process_doc_monograph
    elif id_type in _PERIODICAL_ID_TYPES:
        process = workflow.process_doc_periodical
    else:
        # Unsupported identifier type: the folder is renamed with the fail prefix.
        print(f"{_timestamp()} ERROR unable to determine identifier for: {doc_dict['name']}")
        utility.rename_document(original_path=doc_dict['path'],
                                new_name=config.FAIL_PREFIX + doc_dict['name'])
        print(">" * 79)
        return

    try:
        status, errors = process(doc_dict)
        doc_dict['status'] = status
        doc_dict['error'] = errors
        # store the document information into the batch dictionary
        batch_dict[os.path.basename(doc_path)] = doc_dict
        print(f"{_timestamp()} INFO (OBSAHATOR): Processing {doc_dict['name']} finished with {len(errors)} errors...")
        print(f"{_timestamp()} INFO (OBSAHATOR): List of errors:")
        print(errors)
    except IOError as e:
        print(f"{_timestamp()} ERROR (OBSAHATOR): Processing error in {doc_dict['name']} : {e}")
    # NOTE(review): separator assumed to close every processed document, as in
    # the unsupported-identifier branch above -- confirm against the original layout.
    print(">" * 79)


# Scan the input folder for new documents: sub-folders whose name starts with eight digits.
docs = [os.path.join(config.INPUT_DIR, d) for d in os.listdir(config.INPUT_DIR)
        if os.path.isdir(os.path.join(config.INPUT_DIR, d)) and re.match(r'^\d{8}', d)]

# Process each document folder in turn; a failure in one does not stop the batch.
for doc_path in docs:
    _process_document(doc_path)