diff --git a/seafevent_server/utils.py b/seafevent_server/utils.py
index 612159dd..ddf46a89 100644
--- a/seafevent_server/utils.py
+++ b/seafevent_server/utils.py
@@ -22,6 +22,8 @@
 from seafevents.repo_metadata.constants import ZERO_OBJ_ID
 from seafevents.repo_data import repo_data
 from seafevents.utils.md2sdoc import md2sdoc
+from seafevents.utils.constants import WIKI_PAGES_DIR, WIKI_CONFIG_PATH, \
+    WIKI_CONFIG_FILE_NAME
 from seaserv import get_org_id_by_repo_id, seafile_api, get_commit
 from seafobj import CommitDiffer, commit_mgr, fs_mgr

@@ -29,9 +31,6 @@

 logger = logging.getLogger('seafevents')

-WIKI_PAGES_DIR = '/wiki-pages'
-WIKI_CONFIG_PATH = '_Internal/Wiki'
-WIKI_CONFIG_FILE_NAME = 'index.json'
 WIKI_FILE_TMP_DIR = '/tmp'

 SYS_DIR_PATHS = ['images']
diff --git a/seasearch/index_store/wiki_index.py b/seasearch/index_store/wiki_index.py
index e19f8b71..32e3912c 100644
--- a/seasearch/index_store/wiki_index.py
+++ b/seasearch/index_store/wiki_index.py
@@ -4,6 +4,7 @@
 from seafevents.seasearch.utils import get_library_diff_files, is_wiki_page, extract_sdoc_text, md5
 from seafevents.seasearch.utils.constants import ZERO_OBJ_ID, WIKI_INDEX_PREFIX
+from seafevents.utils.constants import WIKI_PAGES_DIR, WIKI_CONFIG_PATH, WIKI_CONFIG_FILE_NAME
 from seafobj import fs_mgr, commit_mgr
 from seaserv import seafile_api

@@ -13,9 +14,6 @@
 SEASEARCH_WIKI_BULK_ADD_LIMIT = 10
 SEASEARCH_WIKI_BULK_DELETE_LIMIT = 50

-WIKI_PAGES_DIR = '/wiki-pages'
-WIKI_CONFIG_PATH = '_Internal/Wiki'
-WIKI_CONFIG_FILE_NAME = 'index.json'


 class WikiIndex(object):
@@ -24,9 +22,20 @@ class WikiIndex(object):
             'wiki_id': {
                 'type': 'keyword',
             },
-            'doc_uuid':{
+            'doc_uuid': {
                 'type': 'keyword',
             },
+            'title': {
+                'type': 'text',
+                'highlightable': True,
+                'fields': {
+                    'ngram': {
+                        'type': 'text',
+                        'index': True,
+                        'analyzer': 'seafile_wiki_ngram_analyzer',
+                    },
+                },
+            },
             'content': {
                 'type': 'text',
                 'highlightable': True,
@@ -83,6 +92,7 @@ def _make_match_query(field, key_word, **kw):

         searches = []
         searches.append(_make_match_query('content', keyword, **match_query_kwargs))
+        searches.append(_make_match_query('title', keyword, **match_query_kwargs))
         searches.append({
             'match': {
                 'content.ngram': {
@@ -91,6 +101,14 @@
                     'query': keyword,
                     'minimum_should_match': '80%',
                 }
             }
         })
+        searches.append({
+            'match': {
+                'title.ngram': {
+                    'query': keyword,
+                    'minimum_should_match': '80%',
+                }
+            }
+        })
         return searches

     def create_index_if_missing(self, index_name):
@@ -131,6 +149,27 @@ def get_wiki_conf(self, wiki_id, commit_id=None):
         f = fs_mgr.load_seafile(wiki_id, 1, file_id)
         return json.loads(f.get_content().decode())

+    def get_updated_title_uuids(self, old_conf, new_conf, excluded_uuids):
+        """Calculate the items that are in new_conf but not in old_conf,
+        or whose names in new_conf differ from the names in old_conf.
+        The result is based on new_conf data.
+        Args:
+            old_conf: obtained from get_wiki_conf
+            new_conf: obtained from get_wiki_conf
+            excluded_uuids: set of doc_uuids that should be excluded from the result
+        Returns:
+            set: A set of doc_uuids for updated titles."""
+
+        old_pages = {page['id']: page for page in old_conf['pages']} if old_conf else {}
+        new_pages = {page['id']: page for page in new_conf['pages']} if new_conf else {}
+
+        doc_uuids = set()
+        for new_id, new_page in new_pages.items():
+            if new_id not in old_pages or new_page['name'] != old_pages[new_id]['name']:
+                if new_page['docUuid'] not in excluded_uuids:
+                    doc_uuids.add(new_page['docUuid'])
+        return doc_uuids
+
     def get_uuid_path_mapping(self, config):
         """Determine the UUID-PATH mapping for extracting unremoved or deleted wiki pages
         """
@@ -153,7 +192,26 @@ def extract_ids_from_navigation(navigation_items, navigation_ids):

         return uuid_to_path, rm_uuid_to_path

-    def add_files(self, index_name, wiki_id, files, uuid_path, commit_id):
+    def add_files(
+        self,
+        index_name,
+        wiki_id,
+        files,
+        uuid_path,
+        commit_id,
+        updated_title_uuids,
+        title_info,
+    ):
+        """Add wiki files to the index
+        Args:
+            index_name: str
+            wiki_id: str
+            files: list
+            uuid_path: dict
+            commit_id: str
+            updated_title_uuids: set
+            title_info: dict: {doc_uuid: (name, path)}"""
+
         bulk_add_params = []

         def bulk_add():
@@ -161,12 +219,13 @@ def bulk_add():
             self.seasearch_api.bulk(index_name, bulk_add_params)
             bulk_add_params.clear()

-        def process_file(doc_uuid, content):
+        def process_file(doc_uuid, content, title):
             index_info = {'index': {'_index': index_name, '_id': doc_uuid}}
             doc_info = {
                 'wiki_id': wiki_id,
                 'doc_uuid': doc_uuid,
-                'content': content
+                'content': content,
+                'title': title,
             }
             bulk_add_params.extend([index_info, doc_info])
             if len(bulk_add_params) >= SEASEARCH_WIKI_BULK_ADD_LIMIT:
@@ -179,14 +238,27 @@ def process_file(doc_uuid, content):
             if self.size_cap is not None and int(size) >= int(self.size_cap):
                 continue
             doc_uuid = path.split('/')[2]
-            if content := self.get_wiki_content(wiki_id, obj_id):
-                process_file(doc_uuid, content)
+            # remove the doc_uuid from updated_title_uuids if it is among the files to be indexed;
+            # this covers the case where both the title and the content are updated
+            updated_title_uuids.discard(doc_uuid)
+            content = self.get_wiki_content(wiki_id, obj_id)
+            title = title_info.get(doc_uuid)[0]
+            process_file(doc_uuid, content, title)

         # Recovered files
         for doc_uuid, path in uuid_path.items():
             file_id = seafile_api.get_file_id_by_commit_and_path(wiki_id, commit_id, path)
-            if content := self.get_wiki_content(wiki_id, file_id):
-                process_file(doc_uuid, content)
+            title = title_info.get(doc_uuid)[0]
+            content = self.get_wiki_content(wiki_id, file_id)
+            process_file(doc_uuid, content, title)
+
+        # For the case where only the title is updated
+        for doc_uuid in updated_title_uuids:
+            f_path = title_info.get(doc_uuid)[1]
+            file_id = seafile_api.get_file_id_by_commit_and_path(wiki_id, commit_id, f_path)
+            content = self.get_wiki_content(wiki_id, file_id)
+            title = title_info.get(doc_uuid)[0]
+            process_file(doc_uuid, content, title)

         bulk_add()

     def delete_files(self, index_name, dirs, doc_uuids):
@@ -231,13 +303,39 @@ def update(self, index_name, wiki_id, old_commit_id, new_commit_id):

         need_added_files = added_files + modified_files

+        # Check whether a wiki page title has changed.
+        # A modified index.json is a necessary but not sufficient condition for a title change.
+        wiki_conf_path = posixpath.join(WIKI_CONFIG_PATH, WIKI_CONFIG_FILE_NAME)
+        is_wiki_conf_modified = any(wiki_conf_path == tup[0].lstrip('/') for tup in need_added_files)
+
+        if is_wiki_conf_modified:
+            need_updated_title_uuids = self.get_updated_title_uuids(
+                old_cfg, new_cfg, excluded_uuids=curr_recycled_uuid_paths.keys()
+            )
+        else:
+            need_updated_title_uuids = set()
+
         recently_restore_uuid_to_path = {
             uuid: path
             for uuid, path in curr_uuid_paths.items()
             if uuid in prev_recycled_uuid_paths
         }

+        get_title_name_path_by_conf = lambda conf: {
+            page['docUuid']: (page.get('name'), page.get('path'))
+            for page in conf.get('pages', [])
+            if 'docUuid' in page
+        }
+        # {doc_uuid: (name, path)}
+        title_info = get_title_name_path_by_conf(new_cfg)
+
         self.add_files(
-            index_name, wiki_id, need_added_files, recently_restore_uuid_to_path, new_commit_id
+            index_name,
+            wiki_id,
+            need_added_files,
+            recently_restore_uuid_to_path,
+            new_commit_id,
+            need_updated_title_uuids,
+            title_info
         )

     def search_wiki(self, wiki, keyword, start=0, size=10):
@@ -256,7 +354,7 @@ def search_wiki(self, wiki, keyword, start=0, size=10):
             "highlight": {
                 "pre_tags": [""],
                 "post_tags": [""],
-                "fields": {"content": {}},
+                "fields": {"content": {}, "title": {}},
             },
         }
         index_name = WIKI_INDEX_PREFIX + wiki
@@ -285,6 +383,8 @@
             }
             if highlight_content := hit.get('highlight', {}).get('content', [None])[0]:
                 r.update(content=highlight_content)
+            if highlight_title := hit.get('highlight', {}).get('title', [None])[0]:
+                r.update(title=highlight_title)
             wiki_content.append(r)

         return wiki_content
diff --git a/utils/constants.py b/utils/constants.py
new file mode 100644
index 00000000..cfc0cde0
--- /dev/null
+++ b/utils/constants.py
@@ -0,0 +1,3 @@
+WIKI_PAGES_DIR = '/wiki-pages'
+WIKI_CONFIG_PATH = '_Internal/Wiki'
+WIKI_CONFIG_FILE_NAME = 'index.json'
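
Illustrative note (not part of the patch): with the new 'title' mapping and the extra match clauses above, a wiki search is expected to query both fields and their ngram subfields and ask SeaSearch to highlight both. A minimal sketch of such a request body, assuming the clauses are combined in a bool/should wrapper like the pre-existing content-only query (that wrapper is outside this diff) and using a made-up keyword:

    # Sketch only: the bool/should wrapper and the example keyword are assumptions;
    # the field names, the '80%' minimum_should_match and the highlight settings
    # come from the patch above.
    keyword = 'release notes'
    query_map = {
        'query': {
            'bool': {
                'should': [
                    {'match': {'content': {'query': keyword}}},
                    {'match': {'title': {'query': keyword}}},
                    {'match': {'content.ngram': {'query': keyword, 'minimum_should_match': '80%'}}},
                    {'match': {'title.ngram': {'query': keyword, 'minimum_should_match': '80%'}}},
                ],
            },
        },
        'highlight': {
            'pre_tags': [''],
            'post_tags': [''],
            'fields': {'content': {}, 'title': {}},
        },
    }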