Commit

fix the bug in search.py
cl117 committed Aug 28, 2024
1 parent 0418a64 commit ccad037
Showing 2 changed files with 171 additions and 145 deletions.
218 changes: 96 additions & 122 deletions flask/index.py
@@ -5,222 +5,198 @@
 import json
 from logger import Logger
 
+# Load config and initialize managers once
 config_manager = ConfigManager()
+config = config_manager.load_config()
 elasticsearch_manager = ElasticsearchManager(config_manager)
 logger_ = Logger()

 def add_pagerank(parts_response, uri2rank):
     """
-    Adds the pagerank score for each part
+    Adds the pagerank score for each part.
     Arguments:
         parts_response {List} -- List containing all parts from the SPARQL query
-        uri2rank {List} -- List of each part and its calculated pagerank score
+        uri2rank {Dict} -- Dictionary of each part and its calculated pagerank score
     """
 
     for part in parts_response:
-        subject = part['subject']
-
-        if subject in uri2rank:
-            part['pagerank'] = uri2rank[subject]
-        else:
-            part['pagerank'] = 1
+        part['pagerank'] = uri2rank.get(part['subject'], 1)

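The dict.get() rewrite is behavior-preserving: a subject missing from uri2rank still falls back to a pagerank of 1. A minimal sketch, with illustrative URIs:

    uri2rank = {'https://synbiohub.org/part/A': 2.7}
    parts = [{'subject': 'https://synbiohub.org/part/A'},
             {'subject': 'https://synbiohub.org/part/B'}]
    add_pagerank(parts, uri2rank)
    assert parts[0]['pagerank'] == 2.7  # ranked part keeps its score
    assert parts[1]['pagerank'] == 1    # unranked part gets the default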

 def add_keywords(parts_response):
     """
-    Adds the displayId to the 'keyword' category
+    Adds the displayId to the 'keyword' category.
     Arguments:
         parts_response {List} -- List containing all parts from the SPARQL query
     """
 
     for part in parts_response:
-        keywords = []
-
-        displayId = part.get('displayId')
-        if displayId is not None:
-            keywords.extend(displayId.split('_'))
-
-        part['keywords'] = ' '.join(keywords)
+        display_id = part.get('displayId')
+        if display_id:
+            part['keywords'] = ' '.join(display_id.split('_'))
+        else:
+            part['keywords'] = ''

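The intermediate keywords list is gone, but the outputs match the old code: a missing or empty displayId yields an empty keyword string, and underscores become spaces otherwise. A quick check with made-up values:

    parts = [{'displayId': 'BBa_K123456_GFP'}, {}]
    add_keywords(parts)
    assert parts[0]['keywords'] == 'BBa K123456 GFP'
    assert parts[1]['keywords'] == ''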
-def add_roles(parts_response):
+def add_roles(parts_response, term_list):
     """
-    Adds the synonyms from the SO-Ontologies list to each part's keyword category
+    Adds the synonyms from the SO-Ontologies list to each part's keyword category.
     Arguments:
         parts_response {List} -- List containing all parts from the SPARQL query
+        term_list {List} -- List of terms from the SO-Ontologies
     """
-    with open('so-simplified.json','r') as so_json:
-        term_list = json.load(so_json)
-
-    for part in parts_response:
-        # Split the CSV of roles from sparql
-        role = part.get('role')
-
-        if role is not None and 'identifiers.org' in role:
-            keywords_list = []
-            so_term = role[-10:]
-            so_term = so_term.replace(':','_')
-
-            for term in term_list:
-                if so_term in term['id']:
-                    keywords_list.append(term['lbl'])
-
-                    if 'synonyms' in term and term['synonyms'] is not None:
-                        for synonym in term['synonyms']:
-
-                            # remove the annoying header from the synonyms
-                            if 'INSDC' in synonym:
-                                synonym = synonym.replace('INSDC_qualifier:', '')
-
-                            if synonym not in keywords_list:
-                                keywords_list.append(synonym)
-
-            for keyword in keywords_list:
-                part['keywords'] += ' ' + keyword
+    print("parts_response: ", len(parts_response))
+    print("term_list: ", len(term_list))
+    for part in parts_response:
+        # Split the CSV of roles from sparql
+        role = part.get('role')
+        if role and 'identifiers.org' in role:
+            keywords_list = []
+            so_term = role[-10:].replace(':','_')
+
+            for term in term_list:
+                if so_term in term['id']:
+                    keywords_list.append(term['lbl'])
+                    synonyms = term.get('synonyms', [])
+                    for synonym in synonyms:
+                        # remove the annoying header from the synonyms
+                        if 'INSDC' in synonym:
+                            synonym = synonym.replace('INSDC_qualifier:', '')
+                        if synonym not in keywords_list:
+                            keywords_list.append(synonym)
+
+            part['keywords'] += ' ' + ' '.join(keywords_list)

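The role[-10:] slice assumes every identifiers.org role URI ends in a 10-character SO accession, which the colon-to-underscore replacement then aligns with the 'id' field of so-simplified.json. For an illustrative promoter role:

    role = 'http://identifiers.org/so/SO:0000167'  # promoter (illustrative value)
    so_term = role[-10:].replace(':', '_')
    assert so_term == 'SO_0000167'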
 def add_sbol_type(parts_response):
     for part in parts_response:
         sbol_type = part.get('sboltype')
 
-        if sbol_type is not None and 'http://www.biopax.org/release/biopax-level3.owl#' in sbol_type:
-            type = sbol_type[48:]
-
-            if 'region' in type:
-                type = type.replace('Region','')
-
-            part['keywords'] += ' ' + type
+        if sbol_type and 'http://www.biopax.org/release/biopax-level3.owl#' in sbol_type:
+            type_ = sbol_type[48:]
+            if 'region' in type_:
+                type_ = type_.replace('Region','')
+            part['keywords'] += ' ' + type_

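The magic number 48 is simply the length of the BioPAX prefix, so the slice strips exactly that prefix. Note that the membership check is case-sensitive, so lowercase 'region' does not match a value like 'DnaRegion' and the replace branch is skipped for it:

    prefix = 'http://www.biopax.org/release/biopax-level3.owl#'
    assert len(prefix) == 48                 # why the [48:] slice works
    type_ = (prefix + 'DnaRegion')[48:]      # -> 'DnaRegion'
    assert 'region' not in type_             # replace branch not taken here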
 def create_parts_index(index_name):
     """
-    Creates a new index
+    Creates a new index.
     Arguments:
         index_name {String} -- Name of the new index
     """
 
-    if elasticsearch_manager.get_es().indices.exists(index_name):
+    es = elasticsearch_manager.get_es()
+    if es.indices.exists(index_name):
         logger_.log('Index already exists -> deleting', True)
-        elasticsearch_manager.get_es().indices.delete(index=index_name)
+        es.indices.delete(index=index_name)
 
     body = {
         'mappings': {
-            index_name: {
-                'properties': {
-                    'subject': {
-                        'type': 'keyword'
-                    },
-                    'graph': {
-                        'type': 'keyword'
-                    }
-                }
-            }
+            'properties': {
+                'subject': {'type': 'keyword'},
+                'graph': {'type': 'keyword'}
+            }
         },
-        "settings": {
-            "number_of_shards": 1
+        'settings': {
+            'number_of_shards': 1
         }
-
     }
-    elasticsearch_manager.get_es().indices.create(index=index_name, body=body)
+    es.indices.create(index=index_name, body=body)
     logger_.log('Index created', True)

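Dropping the index_name level from 'mappings' matches the typeless mapping format of Elasticsearch 7+, which removed mapping types. Assuming a 7.x client, the applied mapping can be confirmed after creation with:

    mapping = es.indices.get_mapping(index=index_name)
    print(mapping[index_name]['mappings']['properties'])
    # {'graph': {'type': 'keyword'}, 'subject': {'type': 'keyword'}}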

 def bulk_index_parts(parts_response, index_name):
     """
-    Adds each part as a document to the index
+    Adds each part as a document to the index.
     Arguments:
         parts_response {List} -- List containing all parts from the SPARQL query
         index_name {String} -- Name of the index
     Raises:
         Exception -- Indexing fails
     """
 
-    actions = []
-    for i in range(len(parts_response)):
-        action = {
-            '_index': index_name,
-            '_type': index_name,
-            '_id': parts_response[i].get('subject'),
-            '_source': parts_response[i]
-        }
-
-        actions.append(action)
+    es = elasticsearch_manager.get_es()
+
+    def actions():
+        for part in parts_response:
+            yield {
+                '_index': index_name,
+                '_id': part['subject'],
+                '_source': part
+            }
 
     logger_.log('Bulk indexing', True)
     try:
-        stats = helpers.bulk(elasticsearch_manager.get_es(), actions)
+        helpers.bulk(es, actions())
         logger_.log('Bulk indexing complete', True)
-    except:
-        logger_.log('[ERROR] Error_messages: ' + '\n'.join(stats[1]), True)
-        raise Exception("Bulk indexing failed")
+    except Exception as e:
+        # Note: 'stats' is unbound if helpers.bulk raises, so log the exception itself
+        logger_.log(f'[ERROR] Error during bulk indexing: {e}', True)
+        raise

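The generator streams actions to helpers.bulk instead of materializing a list, and the removed '_type' field again reflects the move to typeless indices. If per-document failures should be collected rather than raised, helpers.bulk supports that directly; a sketch, not the code above:

    successes, errors = helpers.bulk(es, actions(), raise_on_error=False)
    if errors:
        logger_.log('[ERROR] %d bulk failures, first: %s' % (len(errors), errors[0]), True)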

 def update_index(uri2rank):
     """
-    Main method
-    Args:
-        uri2rank: List of pageranks for each URI
-    Returns:
+    Main method to update the index.
+    Args:
+        uri2rank: Dictionary of pageranks for each URI
     """
-    index_name = config_manager.load_config()['elasticsearch_index_name']
+    index_name = config['elasticsearch_index_name']
 
     logger_.log('------------ Updating index ------------', True)
 
     logger_.log('******** Query for parts ********', True)
-    parts_response = query.query_parts(indexing = True)
+    parts_response = query.query_parts(indexing=True)
     logger_.log('******** Query for parts complete ********', True)
 
     logger_.log('******** Adding parts to new index ********', True)
     add_pagerank(parts_response, uri2rank)
     add_keywords(parts_response)
-    add_roles(parts_response)
+
+    # Load the SO-Ontologies list once
+    with open('so-simplified.json', 'r') as so_json:
+        term_list = json.load(so_json)
+    add_roles(parts_response, term_list)
+
     add_sbol_type(parts_response)
     create_parts_index(index_name)
     bulk_index_parts(parts_response, index_name)
 
-    logger_.log('******** Finished adding ' + str(len(parts_response)) + ' parts to index ********', True)
-
+    logger_.log(f'******** Finished adding {len(parts_response)} parts to index ********', True)
     logger_.log('------------ Successfully updated index ------------\n', True)


 def delete_subject(subject):
     """
-    Delete part for incremental indexing
-    Args:
-        subject:
-    Returns:
+    Delete part for incremental indexing.
+    Args:
+        subject: The subject to delete from the index.
     """
-    index_name = config_manager.load_config()['elasticsearch_index_name']
+    index_name = config['elasticsearch_index_name']
+    es = elasticsearch_manager.get_es()
 
     body = {
         'query': {
             'bool': {
                 'must': [
-                    {'ids': {'values': subject}}
+                    {'ids': {'values': [subject]}}
                 ]
             }
         },
         'conflicts': 'proceed'
     }
-    elasticsearch_manager.get_es().delete_by_query(index=index_name, doc_type=index_name, body=body)
+    es.delete_by_query(index=index_name, body=body)

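One genuine behavior fix: Elasticsearch's ids query expects a list under 'values', so passing the bare subject string made delete_by_query fail. Stripped of the bool wrapper, the fixed body amounts to this sketch (illustrative id):

    body = {
        'query': {'ids': {'values': ['https://synbiohub.org/part/A']}},
        'conflicts': 'proceed'
    }
    # es.delete_by_query(index=index_name, body=body)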

 def index_part(part):
     delete_subject(part['subject'])
-    index_name = config_manager.load_config()['elasticsearch_index_name']
-    elasticsearch_manager.get_es().index(index=index_name, doc_type=index_name, id=part['subject'], body=part)
+    index_name = config['elasticsearch_index_name']
+    es = elasticsearch_manager.get_es()
+    es.index(index=index_name, id=part['subject'], body=part)


 def refresh_index(subject, uri2rank):
     delete_subject(subject)
 
-    part_response = query.query_parts('', 'FILTER (?subject = <' + subject + '>)', True)
+    part_response = query.query_parts('', f'FILTER (?subject = <{subject}>)', True)
 
     if len(part_response) == 1:
         add_pagerank(part_response, uri2rank)
@@ -246,18 +222,16 @@ def incremental_remove(subject):


 def incremental_remove_collection(subject, uri_prefix):
-    collection_membership_query = '''
+    collection_membership_query = f'''
     SELECT
         ?s
-    WHERE {
-        <''' + subject + '''> sbol2:member ?s .
-        FILTER(STRSTARTS(str(?s),''' + "'" + uri_prefix + "'" + '''))
-    }
+    WHERE {{
+        <{subject}> sbol2:member ?s .
+        FILTER(STRSTARTS(str(?s), '{uri_prefix}'))
+    }}
     '''
     members = query.query_sparql(collection_membership_query)
 
     delete_subject(subject)
     for member in members:
         delete_subject(member['s'])

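In the f-string form, doubled braces render as literal braces, so the generated SPARQL still contains a plain WHERE { ... } block. A quick check with placeholder values:

    subject = 'urn:s'  # placeholder
    q = f'WHERE {{ <{subject}> sbol2:member ?s }}'
    assert q == 'WHERE { <urn:s> sbol2:member ?s }'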
