Skip to content

Commit

Permalink
feat: add wiki title search (#453)
Browse files Browse the repository at this point in the history
* feat: add wiki title search

* fix: fix update title index

* chore: optimize code

* chore: optimize code

* fix index title in trash bin

* chore: optimize code

* chore: optimize code

* chore: optimize code

* chore: optimize code
  • Loading branch information
cir9no authored Jan 3, 2025
1 parent 79fcc7b commit e74cfe0
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 16 deletions.
5 changes: 2 additions & 3 deletions seafevent_server/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,15 @@
from seafevents.repo_metadata.constants import ZERO_OBJ_ID
from seafevents.repo_data import repo_data
from seafevents.utils.md2sdoc import md2sdoc
from seafevents.utils.constants import WIKI_PAGES_DIR, WIKI_CONFIG_PATH, \
WIKI_CONFIG_FILE_NAME

from seaserv import get_org_id_by_repo_id, seafile_api, get_commit
from seafobj import CommitDiffer, commit_mgr, fs_mgr
from seafobj.exceptions import GetObjectError

logger = logging.getLogger('seafevents')

WIKI_PAGES_DIR = '/wiki-pages'
WIKI_CONFIG_PATH = '_Internal/Wiki'
WIKI_CONFIG_FILE_NAME = 'index.json'
WIKI_FILE_TMP_DIR = '/tmp'
SYS_DIR_PATHS = ['images']

Expand Down
126 changes: 113 additions & 13 deletions seasearch/index_store/wiki_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from seafevents.seasearch.utils import get_library_diff_files, is_wiki_page, extract_sdoc_text, md5
from seafevents.seasearch.utils.constants import ZERO_OBJ_ID, WIKI_INDEX_PREFIX
from seafevents.utils.constants import WIKI_PAGES_DIR, WIKI_CONFIG_PATH, WIKI_CONFIG_FILE_NAME
from seafobj import fs_mgr, commit_mgr
from seaserv import seafile_api

Expand All @@ -13,9 +14,6 @@

SEASEARCH_WIKI_BULK_ADD_LIMIT = 10
SEASEARCH_WIKI_BULK_DELETE_LIMIT = 50
WIKI_PAGES_DIR = '/wiki-pages'
WIKI_CONFIG_PATH = '_Internal/Wiki'
WIKI_CONFIG_FILE_NAME = 'index.json'


class WikiIndex(object):
Expand All @@ -24,9 +22,20 @@ class WikiIndex(object):
'wiki_id': {
'type': 'keyword',
},
'doc_uuid':{
'doc_uuid': {
'type': 'keyword',
},
'title': {
'type': 'text',
'highlightable': True,
'fields': {
'ngram': {
'type': 'text',
'index': True,
'analyzer': 'seafile_wiki_ngram_analyzer',
},
},
},
'content': {
'type': 'text',
'highlightable': True,
Expand Down Expand Up @@ -83,6 +92,7 @@ def _make_match_query(field, key_word, **kw):

searches = []
searches.append(_make_match_query('content', keyword, **match_query_kwargs))
searches.append(_make_match_query('title', keyword, **match_query_kwargs))
searches.append({
'match': {
'content.ngram': {
Expand All @@ -91,6 +101,14 @@ def _make_match_query(field, key_word, **kw):
}
}
})
searches.append({
'match': {
'title.ngram': {
'query': keyword,
'minimum_should_match': '80%',
}
}
})
return searches

def create_index_if_missing(self, index_name):
Expand Down Expand Up @@ -131,6 +149,27 @@ def get_wiki_conf(self, wiki_id, commit_id=None):
f = fs_mgr.load_seafile(wiki_id, 1, file_id)
return json.loads(f.get_content().decode())

def get_updated_title_uuids(self, old_conf, new_conf, excluded_uuids):
    """Return doc_uuids of pages whose title changed between two wiki configs.

    A page counts as updated when it exists only in ``new_conf``, or when
    its ``name`` differs from the name recorded for the same page id in
    ``old_conf``.  Results are always taken from ``new_conf``.

    Args:
        old_conf: previous config dict from ``get_wiki_conf`` (may be None).
        new_conf: current config dict from ``get_wiki_conf`` (may be None).
        excluded_uuids: doc_uuids to leave out of the result (e.g. pages
            currently in the trash bin).

    Returns:
        set: doc_uuids of pages whose title was added or renamed.
    """
    # Use .get('pages', []) so a conf without a 'pages' key degrades to
    # "no pages" instead of raising KeyError — consistent with how pages
    # are read elsewhere in this module.
    old_pages = {page['id']: page for page in old_conf.get('pages', [])} if old_conf else {}
    new_pages = {page['id']: page for page in new_conf.get('pages', [])} if new_conf else {}

    doc_uuids = set()
    for new_id, new_page in new_pages.items():
        old_page = old_pages.get(new_id)
        if old_page is None or new_page['name'] != old_page['name']:
            if new_page['docUuid'] not in excluded_uuids:
                doc_uuids.add(new_page['docUuid'])
    return doc_uuids

def get_uuid_path_mapping(self, config):
"""Determine the UUID-PATH mapping for extracting unremoved or deleted wiki pages
"""
Expand All @@ -153,20 +192,40 @@ def extract_ids_from_navigation(navigation_items, navigation_ids):

return uuid_to_path, rm_uuid_to_path

def add_files(self, index_name, wiki_id, files, uuid_path, commit_id):
def add_files(
self,
index_name,
wiki_id,
files,
uuid_path,
commit_id,
updated_title_uuids,
title_info,
):
"""Add wiki files to the index
Args:
index_name: str
wiki_id: str
files: list
uuid_path: dict
commit_id: str
updated_title_uuids: set
title_info: dict: {doc_uuid: (name, path)}"""

bulk_add_params = []

def bulk_add():
if bulk_add_params:
self.seasearch_api.bulk(index_name, bulk_add_params)
bulk_add_params.clear()

def process_file(doc_uuid, content):
def process_file(doc_uuid, content, title):
index_info = {'index': {'_index': index_name, '_id': doc_uuid}}
doc_info = {
'wiki_id': wiki_id,
'doc_uuid': doc_uuid,
'content': content
'content': content,
'title': title,
}
bulk_add_params.extend([index_info, doc_info])
if len(bulk_add_params) >= SEASEARCH_WIKI_BULK_ADD_LIMIT:
Expand All @@ -179,14 +238,27 @@ def process_file(doc_uuid, content):
if self.size_cap is not None and int(size) >= int(self.size_cap):
continue
doc_uuid = path.split('/')[2]
if content := self.get_wiki_content(wiki_id, obj_id):
process_file(doc_uuid, content)
# remove docuuid from updated_title_uuids if it is in the need updated files
# this is for the case: both the title and content are updated
updated_title_uuids.discard(doc_uuid)
content = self.get_wiki_content(wiki_id, obj_id)
title = title_info.get(doc_uuid)[0]
process_file(doc_uuid, content, title)

# Recovered files
for doc_uuid, path in uuid_path.items():
file_id = seafile_api.get_file_id_by_commit_and_path(wiki_id, commit_id, path)
if content := self.get_wiki_content(wiki_id, file_id):
process_file(doc_uuid, content)
title = title_info.get(doc_uuid)[0]
content = self.get_wiki_content(wiki_id, file_id)
process_file(doc_uuid, content, title)

# For the case: only title is updated
for doc_uuid in updated_title_uuids:
f_path = title_info.get(doc_uuid)[1]
file_id = seafile_api.get_file_id_by_commit_and_path(wiki_id, commit_id, f_path)
content = self.get_wiki_content(wiki_id, file_id)
title = title_info.get(doc_uuid)[0]
process_file(doc_uuid, content,title)
bulk_add()

def delete_files(self, index_name, dirs, doc_uuids):
Expand Down Expand Up @@ -231,13 +303,39 @@ def update(self, index_name, wiki_id, old_commit_id, new_commit_id):

need_added_files = added_files + modified_files

# Check whether wiki title is changed
# This is a necessary but not sufficient condition judgment.
wiki_conf_path = posixpath.join(WIKI_CONFIG_PATH, WIKI_CONFIG_FILE_NAME)
is_wiki_conf_modified = any(wiki_conf_path == tup[0].lstrip('/') for tup in need_added_files)

if is_wiki_conf_modified:
need_updated_title_uuids = self.get_updated_title_uuids(
old_cfg, new_cfg, excluded_uuids=curr_recycled_uuid_paths.keys()
)
else:
need_updated_title_uuids = set()

recently_restore_uuid_to_path = {
uuid: path
for uuid, path in curr_uuid_paths.items()
if uuid in prev_recycled_uuid_paths
}
get_title_name_path_by_conf = lambda conf: {
page['docUuid']: (page.get('name'), page.get('path'))
for page in conf.get('pages', [])
if 'docUuid' in page
}
# {doc_uuid: (name, path)}
title_info = get_title_name_path_by_conf(new_cfg)

self.add_files(
index_name, wiki_id, need_added_files, recently_restore_uuid_to_path, new_commit_id
index_name,
wiki_id,
need_added_files,
recently_restore_uuid_to_path,
new_commit_id,
need_updated_title_uuids,
title_info
)

def search_wiki(self, wiki, keyword, start=0, size=10):
Expand All @@ -256,7 +354,7 @@ def search_wiki(self, wiki, keyword, start=0, size=10):
"highlight": {
"pre_tags": ["<mark>"],
"post_tags": ["</mark>"],
"fields": {"content": {}},
"fields": {"content": {}, "title": {}},
},
}
index_name = WIKI_INDEX_PREFIX + wiki
Expand Down Expand Up @@ -285,6 +383,8 @@ def search_wiki(self, wiki, keyword, start=0, size=10):
}
if highlight_content := hit.get('highlight', {}).get('content', [None])[0]:
r.update(content=highlight_content)
if highlight_title := hit.get('highlight', {}).get('title', [None])[0]:
r.update(title=highlight_title)
wiki_content.append(r)

return wiki_content
Expand Down
3 changes: 3 additions & 0 deletions utils/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Shared wiki path constants, imported by both seafevent_server/utils.py
# and seasearch/index_store/wiki_index.py (previously duplicated in each).
WIKI_PAGES_DIR = '/wiki-pages'  # library directory holding wiki page files
WIKI_CONFIG_PATH = '_Internal/Wiki'  # directory containing the wiki config file
WIKI_CONFIG_FILE_NAME = 'index.json'  # wiki navigation/config file name

0 comments on commit e74cfe0

Please sign in to comment.