Split long documents into smaller parts

Closes #143.
cppalliance · Apr 19, 2024 · 48a9056 · 48a9056
1 parent 672bfbf
commit 48a9056
Show file tree

Hide file tree

Showing 4 changed files with 9 additions and 6 deletions.
diff --git a/.github/workflows/index_on_algolia.yml b/.github/workflows/index_on_algolia.yml
@@ -5,6 +5,7 @@ on:
     branches: [develop, ci-*]
     paths:
       - config/**
+      - gecko/**
       - .github/workflows/index_on_algolia.yml
 
 jobs:

diff --git a/config/config.yaml b/config/config.yaml
@@ -22,7 +22,6 @@ algolia:
       - content
     numericAttributesToIndex: null
     attributesToRetrieve:
-      - boost_version
       - hierarchy.lvl0.path
       - hierarchy.lvl1.path
       - hierarchy.lvl2.path
@@ -32,8 +31,6 @@ algolia:
       - hierarchy.lvl6.path
       - library_key
       - library_name
-      - type
-      - url_without_anchor
     allowTyposOnNumericTokens: false
     ignorePlurals: true
     camelCaseAttributes:
@@ -57,7 +54,7 @@ algolia:
       - hierarchy.lvl5.title
       - hierarchy.lvl6.title
     paginationLimitedTo: 1000
-    attributeForDistinct: url
+    attributeForDistinct: path
     exactOnSingleWordQuery: attribute
     ranking:
       - words

diff --git a/gecko/extract_records.py b/gecko/extract_records.py
@@ -64,6 +64,7 @@ def create_algolia_records(library_key: str, sections: dict, boost_root: str):
                 'level': 100 - len(section['lvls']) * 10,
                 'position': 0
             },
+            'path': section['lvls'][-1]['path'] if len(section['lvls']) > 0 else None,
             'hierarchy': {
                 'lvl0': section['lvls'][0] if len(section['lvls']) > 0 else None,
                 'lvl1': section['lvls'][1] if len(section['lvls']) > 1 else None,

diff --git a/gecko/index_on_algolia.py b/gecko/index_on_algolia.py
@@ -23,9 +23,13 @@
             # Delete the existing records for this library.
             index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})
 
+            # Split long documents into smaller parts.
             for record in records:
-                # TODO do something about truncation of long contents
-                record['content'] = record['content'][:90000]
+                if len(record['content']) > 5000:
+                    new_record = dict(record)  # shallow copy; aliasing `record` would clobber its own head
+                    new_record['content'] = record['content'][4900:]  # tail, overlapping the head by 100 chars
+                    record['content'] = record['content'][:5000]
+                    records.append(new_record)  # reached later by this same loop, so long tails split again
 
             records = [record for record in records if not (
                 record['content'] == '' and not record['hierarchy']['lvl0'])]