Skip to content

Commit

Permalink
Extract search records for learn sections
Browse files Browse the repository at this point in the history
  • Loading branch information
ashtum committed Jun 29, 2024
1 parent 3c1ee1a commit 7e5308d
Show file tree
Hide file tree
Showing 8 changed files with 157 additions and 19 deletions.
28 changes: 19 additions & 9 deletions .github/workflows/index_on_algolia.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: Index on Algolia

on:
pull_request:
push:
branches: [develop, ci-*]
paths:
Expand All @@ -26,18 +27,27 @@ jobs:
- name: Install dependencies
run: pip install -r requirements.txt

- name: Download and extract boost release archive
- name: Clone and build website-v2-docs
run: |
BOOST_VERSION=$(sed -n 's/.*version: "\(.*\)"/\1/p' config/config.yaml)
BOOST_VERSION_MAIN=$(echo $BOOST_VERSION | sed -E 's/([0-9]+)_([0-9]+)_([0-9]+)(.*)/\1.\2.\3/g')
wget --no-verbose https://boostorg.jfrog.io/artifactory/main/release/$BOOST_VERSION_MAIN/source/boost_$BOOST_VERSION.tar.gz
tar -xzf boost_$BOOST_VERSION.tar.gz -C ../
git clone --depth=1 --branch=master https://github.com/boostorg/website-v2-docs.git ../website-v2-docs
cd ../website-v2-docs
./build.sh
- name: Extract records
run: python -m gecko.extract_records
# - name: Download and extract boost release archive
# run: |
# BOOST_VERSION=$(sed -n 's/.*version: "\(.*\)"/\1/p' config/config.yaml)
# BOOST_VERSION_MAIN=$(echo $BOOST_VERSION | sed -E 's/([0-9]+)_([0-9]+)_([0-9]+)(.*)/\1.\2.\3/g')
# wget --no-verbose https://boostorg.jfrog.io/artifactory/main/release/$BOOST_VERSION_MAIN/source/boost_$BOOST_VERSION.tar.gz
# tar -xzf boost_$BOOST_VERSION.tar.gz -C ../

- name: Check validity of records
run: python -m gecko.sanitizer check
- name: Extract learn records
run: python -m gecko.extract_learn_records

# - name: Extract libraries records
# run: python -m gecko.extract_libraries_records

# - name: Check validity of records
# run: python -m gecko.sanitizer check

- name: Index on Algolia
env:
Expand Down
File renamed without changes.
1 change: 1 addition & 0 deletions algolia_records/libraries/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.json
14 changes: 12 additions & 2 deletions config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
boost:
version: "1_85_0"
root: "../boost_1_85_0"
version: '1_85_0'
root: '../boost_1_85_0'

website-v2-docs:
root: '../website-v2-docs/build'
sections:
- key: 'contributor-guide'
name: 'Contributor Guide'
- key: 'formal-reviews'
name: 'Formal Reviews'
- key: 'user-guide'
name: 'User Guide'

algolia:
app-id: D7O1MLLTAF
Expand Down
11 changes: 10 additions & 1 deletion gecko/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,19 @@
'version': str,
'root': os.path.exists
},
'website-v2-docs': {
'root': os.path.exists,
'sections': [
{
'key': str,
'name': str
}
]
},
'algolia': {
'app-id': str,
'api-key': str,
'settings':dict
'settings': dict
},
'crawlers': [
{
Expand Down
96 changes: 96 additions & 0 deletions gecko/extract_learn_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import re
import json
from pathlib import Path
from urllib.parse import urljoin
from bs4 import BeautifulSoup, Tag

from .crawlers.helpers import has_class
from .config import config


class AntoraDoc():
    """Crawler that splits rendered HTML documentation pages into sections.

    Each page is expected to expose a ``nav.breadcrumbs`` trail and an
    ``article.doc`` element whose nested ``sect*`` elements each start with a
    heading (``h1``..``h6``).
    """

    def crawl(self, doc_root: Path) -> dict:
        """Parse every ``*.html`` file under *doc_root* and collect its sections.

        Returns a dict mapping a section path (the file path, plus ``#<id>``
        for sub-headings) to ``{'content': str, 'lvls': list}``. ``lvls`` is
        the page's breadcrumb trail followed by the chain of enclosing
        sub-headings, each as ``{'title': ..., 'path': ...}``.
        """
        sections = {}
        doc_root = doc_root.resolve()

        for file_path in doc_root.rglob('*.html'):
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                soup = BeautifulSoup(file.read(), 'html.parser')

            # Breadcrumb links are relative to the page, so resolve them
            # against the page's own path.
            lvls = [
                {'title': link.text, 'path': urljoin(str(file_path), link.get('href'))}
                for link in soup.select('body nav.breadcrumbs ul li a')
            ]

            sect1 = soup.select_one('body article.doc')
            if sect1:
                self._extract_section_n(str(file_path), sections, sect1, lvls)

        return sections

    def _extract_section_n(self, file_path: str, sections: dict, sect: Tag, lvls: list = None):
        """Record the text of *sect* in *sections* and recurse into sub-sections.

        *lvls* is the hierarchy leading to *sect*; it is never mutated, a new
        list is built when a sub-heading is appended. Elements that contain no
        heading are skipped because there is nothing to key the record on.
        """
        # ``None`` replaces the former shared mutable default ``[]``.
        lvls = [] if lvls is None else lvls

        header = sect.select_one('h1, h2, h3, h4, h5, h6')
        if header is None:
            # select_one() returns None when nothing matches; dereferencing it
            # used to abort the whole crawl with an AttributeError.
            return

        if header.name == 'h1':
            # The page title is already represented by the breadcrumb trail.
            path = file_path
        else:
            # NOTE(review): assumes every sub-heading carries an ``id``
            # attribute; a heading without one would still fail here.
            path = file_path + '#' + header.get('id')
            lvls = lvls + [{'title': header.text, 'path': path}]

        # The section's content is either wrapped in a ``sectionbody`` element
        # right after the heading, or it simply follows the heading.
        body = header.find_next_sibling()
        if body and has_class(body, 'sectionbody'):
            siblings = body.find_all(recursive=False)
        else:
            siblings = header.next_siblings

        parts = []
        for sibling in siblings:
            is_subsection = (
                isinstance(sibling, Tag)
                and sibling.has_attr('class')
                and any(name.startswith('sect') for name in sibling.get('class'))
            )
            if is_subsection:
                # Nested sections get their own record instead of being
                # folded into the parent's content.
                self._extract_section_n(file_path, sections, sibling, lvls)
                continue
            parts.append(sibling.get_text() + ' ')

        sections[path] = {'content': ''.join(parts), 'lvls': lvls}


def create_algolia_records(section_key: str, section_name: str, doc_root: Path, sections: dict):
    """Write the Algolia records for one learn section to a JSON file.

    Args:
        section_key: Identifier of the section; also the output file stem.
        section_name: Human-readable section name stored on every record.
        doc_root: Root of the built documentation; this prefix is stripped
            from every hierarchy path so records hold root-relative paths.
        sections: Mapping of section path to ``{'content': str, 'lvls': list}``
            where each level is a dict with at least a ``'path'`` key.

    The records are written to ``./algolia_records/learn/<section_key>.json``
    (the directory is created when missing). *sections* is left untouched.
    """
    prefix = str(doc_root.resolve()) + '/'
    records = []

    for section in sections.values():
        # Build fresh level dicts rather than rewriting the caller's data in
        # place; level dicts are shared between sections of the same page.
        lvls = [{**lvl, 'path': lvl['path'].replace(prefix, '')} for lvl in section['lvls']]

        records.append({
            'type': 'content',
            'section_key': section_key,
            'section_name': section_name,
            # Collapse all whitespace runs so the indexed text is compact.
            'content': re.sub(r'\s+', ' ', section['content']).strip(),
            'weight': {
                'pageRank': 0,
                # Deeper sections get a lower level weight.
                'level': 100 - len(lvls) * 10,
                'position': 0
            },
            'path': lvls[-1]['path'] if lvls else None,
            # Always emit lvl0..lvl6, padding missing levels with None.
            'hierarchy': {
                'lvl{}'.format(depth): lvls[depth] if depth < len(lvls) else None
                for depth in range(7)
            }})

    out_dir = Path('./algolia_records/learn')
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / (section_key + '.json'), 'w', encoding='utf-8') as outfile:
        json.dump(records, outfile, indent=4)


if __name__ == "__main__":
    # Crawl each configured learn section and dump its records to JSON.
    docs_config = config['website-v2-docs']
    build_root = Path(docs_config['root'])
    crawler = AntoraDoc()

    for entry in docs_config['sections']:
        crawled = crawler.crawl(build_root / entry['key'])
        create_algolia_records(entry['key'], entry['name'], build_root, crawled)
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def create_algolia_records(library_key: str, sections: dict, boost_root: str):
'lvl6': section['lvls'][6] if len(section['lvls']) > 6 else None
}})

with open('./algolia_records/' + library_key.replace('/', '_') + '.json', 'w', encoding='utf-8') as outfile:
with open('./algolia_records/libraries/' + library_key.replace('/', '_') + '.json', 'w', encoding='utf-8') as outfile:
json.dump(records, outfile, indent=4)


Expand Down
24 changes: 18 additions & 6 deletions gecko/index_on_algolia.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,19 @@
client = SearchClient.create(config['algolia']['app-id'], config['algolia']['api-key'])

print('Initializing {} index ...'.format(config['boost']['version']))
index = client.init_index(config['boost']['version'])
libraries_index = client.init_index(config['boost']['version'])

print('Setting settings for {} index ...'.format(config['boost']['version']))
index.set_settings(config['algolia']['settings'])
libraries_index.set_settings(config['algolia']['settings'])

for path in Path('./algolia_records').glob('*.json'):
for path in Path('./algolia_records/libraries').glob('*.json'):
print('uploading records for {}...'.format(path.stem))

with open(path, 'r', encoding='utf-8') as f:
records = json.load(f)

# Delete the existing records for this library.
index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})
libraries_index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})

# Split long documents into smaller parts.
for record in records:
Expand All @@ -34,5 +34,17 @@
records = [record for record in records if not (
record['content'] == '' and not record['hierarchy']['lvl0'])]

# TODO instead of using autoGenerateObjectIDIfNotExist we might create a hash out of hierarchy items
index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})
libraries_index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})

learn_index = client.init_index('learn')

for path in Path('./algolia_records/learn').glob('*.json'):
    print('uploading records for {}...'.format(path.stem))

    with open(path, 'r', encoding='utf-8') as f:
        records = json.load(f)

    # An empty records file has no section_key to filter on and nothing to
    # upload; indexing records[0] would raise IndexError.
    if not records:
        continue

    # Delete the existing records for this section. This must target the
    # learn index: going through libraries_index left 'learn' empty and put
    # the learn records into the Boost-version index instead.
    # NOTE(review): delete_by with a section_key filter presumably needs
    # section_key configured as a facet on the 'learn' index - confirm.
    learn_index.delete_by({'filters': 'section_key:{}'.format(records[0]['section_key'])})

    learn_index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})

0 comments on commit 7e5308d

Please sign in to comment.