Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/suggest 2 #2108

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
CK_EDITOR = "CK_EDITOR"
QUILL_EDITOR = "QUILL_EDITOR"
TEXT = "TEXT"
TIPTAP = "TIPTAP"

RH_COMMENT_CONTENT_TYPES = (
(CK_EDITOR, CK_EDITOR),
(QUILL_EDITOR, QUILL_EDITOR),
(TEXT, TEXT),
(TIPTAP, TIPTAP),
)
7 changes: 1 addition & 6 deletions src/search/documents/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,7 @@ class UserDocument(BaseDocument):
)
created_date = es_fields.DateField()

author_profile = es_fields.ObjectField(
properties={
"profile_img": es_fields.TextField(),
"id": es_fields.IntegerField(),
},
)
author_profile = es_fields.ObjectField()

class Index:
name = "user"
Expand Down
209 changes: 150 additions & 59 deletions src/search/views/suggest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
from rest_framework.views import APIView

from search.documents.paper import PaperDocument
from search.documents.person import PersonDocument
from search.documents.post import PostDocument
from search.documents.user import UserDocument
from utils.doi import DOI
from utils.openalex import OpenAlex

Expand All @@ -14,17 +17,58 @@ class SuggestView(APIView):
permission_classes = [AllowAny]
renderer_classes = [JSONRenderer]

# Map of index names to their document classes and transform functions
INDEX_MAP = {
"paper": {
"document": PaperDocument,
"transform": lambda self, result: self.transform_es_result(result),
"external_search": True, # Indicates if OpenAlex search should be included
},
"author": {
"document": PersonDocument,
"transform": lambda self, result: {
"entity_type": "person",
"id": result.get("_source", {}).get("id"),
"display_name": result.get("_source", {}).get("full_name", ""),
"profile_image": result.get("_source", {}).get("profile_image"),
"headline": result.get("_source", {}).get("headline", {}),
"source": "researchhub",
},
},
"user": {
"document": UserDocument,
"transform": lambda self, result: {
"entity_type": "user",
"id": result.get("_source", {}).get("id"),
"display_name": result.get("_source", {}).get("full_name", ""),
"source": "researchhub",
"author_profile": result.get("_source", {}).get("author_profile", {}),
},
},
"post": {
"document": PostDocument,
"transform": lambda self, result: {
"entity_type": "post",
"id": result.get("_source", {}).get("id"),
"display_name": result.get("_source", {}).get("title", ""),
"document_type": result.get("_source", {}).get("document_type"),
"created_date": result.get("_source", {}).get("created_date"),
"authors": result.get("_source", {}).get("authors", []),
"source": "researchhub",
},
},
}

def transform_openalex_result(self, result):
normalized_doi = DOI.normalize_doi(result.get("external_id"))
return {
"entity_type": "work",
"entity_type": "paper",
"doi": (f"{normalized_doi}" if normalized_doi else None),
"normalized_doi": normalized_doi, # Used for comparison
"display_name": result.get("display_name", ""),
"authors": (
result.get("hint", "").split(", ") if result.get("hint") else []
),
"_score": result.get("cited_by_count", 0),
"citations": result.get("cited_by_count", 0),
"source": "openalex",
"openalex_id": result.get("id"), # OpenAlex ID from result
Expand All @@ -34,7 +78,7 @@ def transform_es_result(self, result):
source = result.get("_source", {})
normalized_doi = DOI.normalize_doi(source.get("doi"))
return {
"entity_type": "work",
"entity_type": "paper",
"id": source.get("id"), # ResearchHub paper ID
"doi": (f"{normalized_doi}" if normalized_doi else None),
"normalized_doi": normalized_doi, # Used for comparison
Expand All @@ -44,7 +88,6 @@ def transform_es_result(self, result):
for author in source.get("raw_authors", [])
if author.get("full_name")
],
"_score": result.get("_score", 0),
"citations": source.get("citations", 0),
"source": "researchhub",
"openalex_id": source.get("openalex_id"), # OpenAlex ID if exists in ES
Expand All @@ -55,6 +98,8 @@ def get(self, request):
Combined autocomplete search using both OpenAlex and local Elasticsearch.
Query params:
- q: search query (required)
- index: index(es) to search in (optional, defaults to 'paper')
Can be a single index or comma-separated list (e.g. 'user,person')
"""
query = request.query_params.get("q", None)
if not query:
Expand All @@ -63,62 +108,108 @@ def get(self, request):
status=status.HTTP_400_BAD_REQUEST,
)

try:
# Get OpenAlex results
openalex = OpenAlex()
openalex_response = openalex.autocomplete_works(query)
openalex_results = [
self.transform_openalex_result(result)
for result in openalex_response.get("results", [])
if result.get(
"external_id"
) # Only include results with external_id (DOI)
]

# Get local Elasticsearch results
search = Search(index=PaperDocument._index._name)
suggest = search.suggest(
"suggestions", query, completion={"field": "suggestion_phrases"}
# Parse indexes from the query parameter
index_param = request.query_params.get("index", "paper")
indexes = [idx.strip() for idx in index_param.split(",")]

# Validate all indexes
invalid_indexes = [idx for idx in indexes if idx not in self.INDEX_MAP]
if invalid_indexes:
available_indexes = ", ".join(self.INDEX_MAP.keys())
return Response(
{
"error": (
f"Invalid indexes: {', '.join(invalid_indexes)}. "
f"Available indexes: {available_indexes}"
)
},
status=status.HTTP_400_BAD_REQUEST,
)
es_response = suggest.execute().suggest.to_dict()
es_results = []
for suggestion in es_response.get("suggestions", []):
es_results.extend(
[
self.transform_es_result(option)
for option in suggestion.get("options", [])[:3]
if option.get("_source", {}).get(
"doi"
) # Only include results with DOI
]
)

# Combine results and deduplicate by DOI
seen_dois = set()
unique_results = []

# First add ResearchHub results (all should have DOIs at this point)
for result in sorted(
es_results, key=lambda x: x.get("_score", 0), reverse=True
):
doi = result.get("normalized_doi")
seen_dois.add(doi)
unique_results.append(result)

# Then add OpenAlex results if DOI not already seen
for result in sorted(
openalex_results, key=lambda x: x.get("_score", 0), reverse=True
):
doi = result.get("normalized_doi")
if doi not in seen_dois:
seen_dois.add(doi)
unique_results.append(result)

# Remove normalized_doi from results before sending response
for result in unique_results:
del result["normalized_doi"]

return Response(unique_results, status=status.HTTP_200_OK)

try:
all_results = []
seen_dois = set() # For deduplicating paper results

# Process each index
for index in indexes:
results = []
index_config = self.INDEX_MAP[index]
document = index_config["document"]
transform_func = index_config["transform"]

# Get OpenAlex results if enabled for this index
if index_config.get("external_search", False):
openalex = OpenAlex()
openalex_response = openalex.autocomplete_works(query)
results.extend(
[
self.transform_openalex_result(result)
for result in openalex_response.get("results", [])
if result.get(
"external_id"
) # Only include results with external_id (DOI)
]
)

# Get local Elasticsearch results
search = Search(index=document._index._name)
if hasattr(document, "_doc_type"):
suggest_field = "suggestion_phrases"
if index == "user":
suggest_field = "full_name_suggest"
elif index in ["hub", "journal"]:
suggest_field = "name_suggest"

suggest = search.suggest(
"suggestions", query, completion={"field": suggest_field}
)
es_response = suggest.execute().suggest.to_dict()
es_results = []
for suggestion in es_response.get("suggestions", []):
es_results.extend(
[
transform_func(self, option)
for option in suggestion.get("options", [])[:3]
]
)
results.extend(es_results)

# Handle paper-specific deduplication
if index == "paper":
unique_results = []

# First add ResearchHub results
for result in sorted(
[r for r in results if r["source"] == "researchhub"],
key=lambda x: x.get("_score", 0),
reverse=True,
):
doi = result.get("normalized_doi")
if doi and doi not in seen_dois:
seen_dois.add(doi)
del result["normalized_doi"]
unique_results.append(result)

# Then add OpenAlex results if DOI not already seen
for result in sorted(
[r for r in results if r["source"] == "openalex"],
key=lambda x: x.get("_score", 0),
reverse=True,
):
doi = result.get("normalized_doi")
if doi and doi not in seen_dois:
seen_dois.add(doi)
del result["normalized_doi"]
unique_results.append(result)

all_results.extend(unique_results)
else:
# For other indexes, just sort by score and add all results
all_results.extend(
sorted(results, key=lambda x: x.get("_score", 0), reverse=True)
)

return Response(all_results, status=status.HTTP_200_OK)

except Exception as e:
return Response(
Expand Down
Loading