bsbang-index.py
#!/usr/bin/env python3
"""Index JSON-LD stored in the crawl database into Solr."""

import argparse
import contextlib
import json
import logging
import os
import sqlite3

import bioschemas.indexers

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# FUNCTIONS
def index_row(_row):
    """Post a single crawl-DB row (url, jsonld) to the Solr indexer."""
    indexer.index(_row['url'], json.loads(_row['jsonld']))


# MAIN
parser = argparse.ArgumentParser(description='Index extracted JSONLD into Solr.')
parser.add_argument('path_to_crawl_db', help='Path to the database used to store crawl information.')
parser.add_argument('url_to_index', nargs='?', help='URL to index using only data from the crawl DB')
parser.add_argument('-s', '--solr-core-url', nargs='?', help='URL of the Solr core endpoint')
args = parser.parse_args()

# Default to a local Solr core when no endpoint is given on the command line.
if args.solr_core_url is None:
    endpoint = 'http://localhost:8983/solr/bsbang/'
else:
    endpoint = str(args.solr_core_url)

logger.info('Indexing at Solr core %s', endpoint)

if not os.path.exists(args.path_to_crawl_db):
    logger.error('Crawl database %s does not exist', args.path_to_crawl_db)
    exit(1)

# Start from the library defaults and point the indexer at the chosen Solr core.
config = bioschemas.DEFAULT_CONFIG.copy()
config.update({
    'post_to_solr': True,
    'solr_json_doc_update_url': endpoint + 'update/json/docs',
    'solr_query_url': endpoint + 'select'
})

indexer = bioschemas.indexers.SolrIndexer(config)
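
# The queries below assume the crawl DB exposes a table along these lines
# (a sketch inferred from the SELECT statements; the real schema may differ):
#
#   CREATE TABLE jsonld (
#       url TEXT,     -- page the JSON-LD was extracted from
#       jsonld TEXT   -- the extracted JSON-LD document, serialised as a string
#   );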
with sqlite3.connect(args.path_to_crawl_db) as conn:
    # Wait up to 30 s on locks in case the crawler is still writing to the DB.
    conn.execute("PRAGMA busy_timeout = 30000")
    conn.row_factory = sqlite3.Row

    with contextlib.closing(conn.cursor()) as curs:
        if args.url_to_index is not None:
            # Index only the requested URL.
            curs.execute('SELECT jsonld, url FROM jsonld WHERE url=?', (args.url_to_index,))
            index_row(curs.fetchone())
        else:
            # Index every JSON-LD document in the crawl DB.
            curs.execute('SELECT COUNT(*) FROM jsonld')
            count = int(curs.fetchone()[0])

            i = 1
            for row in curs.execute('SELECT jsonld, url FROM jsonld'):
                logger.info('Indexing %s (%d of %d)', row['url'], i, count)
                index_row(row)
                i += 1
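
# Example invocations (a sketch; the database filename and Solr core URL are
# illustrative, not fixed by this script):
#
#   ./bsbang-index.py bsbang.db
#       Index every JSON-LD document in bsbang.db into the default local core
#       at http://localhost:8983/solr/bsbang/.
#
#   ./bsbang-index.py bsbang.db https://example.org/some-page -s http://solr.example.org:8983/solr/bsbang/
#       Index a single URL from the crawl DB against an explicit Solr core.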