Skip to content

Commit

Permalink
Split long documents into smaller parts
Browse files Browse the repository at this point in the history
Closes #143.
  • Loading branch information
ashtum committed Apr 19, 2024
1 parent 672bfbf commit 3aae8f5
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 13 deletions.
1 change: 1 addition & 0 deletions .github/workflows/index_on_algolia.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ on:
branches: [develop, ci-*]
paths:
- config/**
- gecko/**
- .github/workflows/index_on_algolia.yml

jobs:
Expand Down
19 changes: 8 additions & 11 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,15 @@ algolia:
- content
numericAttributesToIndex: null
attributesToRetrieve:
- boost_version
- hierarchy.lvl0.path
- hierarchy.lvl1.path
- hierarchy.lvl2.path
- hierarchy.lvl3.path
- hierarchy.lvl4.path
- hierarchy.lvl5.path
- hierarchy.lvl6.path
- hierarchy.lvl0
- hierarchy.lvl1
- hierarchy.lvl2
- hierarchy.lvl3
- hierarchy.lvl4
- hierarchy.lvl5
- hierarchy.lvl6
- library_key
- library_name
- type
- url_without_anchor
allowTyposOnNumericTokens: false
ignorePlurals: true
camelCaseAttributes:
Expand All @@ -57,7 +54,7 @@ algolia:
- hierarchy.lvl5.title
- hierarchy.lvl6.title
paginationLimitedTo: 1000
attributeForDistinct: url
attributeForDistinct: path
exactOnSingleWordQuery: attribute
ranking:
- words
Expand Down
1 change: 1 addition & 0 deletions gecko/extract_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def create_algolia_records(library_key: str, sections: dict, boost_root: str):
'level': 100 - len(section['lvls']) * 10,
'position': 0
},
'path': section['lvls'][-1]['path'] if len(section['lvls']) > 0 else None,
'hierarchy': {
'lvl0': section['lvls'][0] if len(section['lvls']) > 0 else None,
'lvl1': section['lvls'][1] if len(section['lvls']) > 1 else None,
Expand Down
8 changes: 6 additions & 2 deletions gecko/index_on_algolia.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,13 @@
# Delete the existing records for this library.
index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})

# Split long documents into smaller parts so that no record carries more than
# MAX_CONTENT_LENGTH characters of content. Nothing is truncated: a long
# document becomes several records that differ only in their 'content' slice.
#
# Each part is built from its own shallow copy of the source record. Binding a
# second name to the same dict (`new_record = record`) would make both list
# entries share a single 'content' value, dropping text and indexing the same
# record twice. A fresh list is built instead of appending to `records` while
# iterating over it.
MAX_CONTENT_LENGTH = 5000
split_records = []
for record in records:
    content = record['content']
    if len(content) <= MAX_CONTENT_LENGTH:
        # Short enough already: keep the record as it is.
        split_records.append(record)
        continue
    for start in range(0, len(content), MAX_CONTENT_LENGTH):
        # Shallow copy: every field except 'content' is shared unchanged.
        part = dict(record)
        part['content'] = content[start:start + MAX_CONTENT_LENGTH]
        split_records.append(part)
records = split_records

records = [record for record in records if not (
record['content'] == '' and not record['hierarchy']['lvl0'])]
Expand Down

0 comments on commit 3aae8f5

Please sign in to comment.