Extract search records for learn sections #147

Closed
wants to merge 2 commits
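This PR wires the Learn documentation (website-v2-docs) into the Algolia indexing pipeline: the workflow clones and builds website-v2-docs, a new crawler in gecko/extract_learn_records.py walks the built Antora HTML and splits each page into per-section records, library records move from algolia_records/ into algolia_records/libraries/, and gecko/index_on_algolia.py uploads the learn records to a separate 'learn' index.

For illustration, a single learn record as written to algolia_records/learn/user-guide.json might look roughly like the following (values are hypothetical; the exact shape is whatever create_algolia_records in gecko/extract_learn_records.py builds):

```python
# Hypothetical example of one learn record; values are illustrative only.
example_record = {
    'type': 'content',
    'section_key': 'user-guide',
    'section_name': 'User Guide',
    'content': 'Boost libraries are intended to be widely useful ...',
    'weight': {'pageRank': 0, 'level': 80, 'position': 0},
    'path': 'user-guide/intro.html#what-is-boost',
    'hierarchy': {
        'lvl0': {'title': 'User Guide', 'path': 'user-guide/index.html'},
        'lvl1': {'title': 'Introduction', 'path': 'user-guide/intro.html#what-is-boost'},
        'lvl2': None, 'lvl3': None, 'lvl4': None, 'lvl5': None, 'lvl6': None
    }
}
```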
18 changes: 14 additions & 4 deletions .github/workflows/index_on_algolia.yml
@@ -1,6 +1,7 @@
name: Index on Algolia

on:
pull_request:
push:
branches: [develop, ci-*]
paths:
@@ -26,18 +27,27 @@ jobs:
- name: Install dependencies
run: pip install -r requirements.txt

- name: Clone and build website-v2-docs
run: |
git clone --depth=1 --branch=master [email protected]:boostorg/website-v2-docs.git ../website-v2-docs
cd ../website-v2-docs
./build.sh

- name: Download and extract boost release archive
run: |
BOOST_VERSION=$(sed -n 's/.*version: "\(.*\)"/\1/p' config/config.yaml)
BOOST_VERSION_MAIN=$(echo $BOOST_VERSION | sed -E 's/([0-9]+)_([0-9]+)_([0-9]+)(.*)/\1.\2.\3/g')
wget --no-verbose https://boostorg.jfrog.io/artifactory/main/release/$BOOST_VERSION_MAIN/source/boost_$BOOST_VERSION.tar.gz
tar -xzf boost_$BOOST_VERSION.tar.gz -C ../

- name: Extract records
run: python -m gecko.extract_records
- name: Extract learn records
run: python -m gecko.extract_learn_records

# - name: Extract libraries records
# run: python -m gecko.extract_libraries_records

- name: Check validity of records
run: python -m gecko.sanitizer check
# - name: Check validity of records
# run: python -m gecko.sanitizer check

- name: Index on Algolia
env:
File renamed without changes.
1 change: 1 addition & 0 deletions algolia_records/libraries/.gitignore
@@ -0,0 +1 @@
*.json
24 changes: 17 additions & 7 deletions config/config.yaml
@@ -1,6 +1,16 @@
boost:
version: "1_85_0"
root: "../boost_1_85_0"
version: '1_85_0'
root: '../boost_1_85_0'

website-v2-docs:
root: '../website-v2-docs/build'
sections:
- key: 'contributor-guide'
name: 'Contributor Guide'
- key: 'formal-reviews'
name: 'Formal Reviews'
- key: 'user-guide'
name: 'User Guide'

algolia:
app-id: D7O1MLLTAF
@@ -307,6 +317,11 @@ crawlers:
last-words: 4059
last-lvls: 204

- key: cobalt
last-records: 131
last-words: 14546
last-lvls: 319

- key: compat
last-records: 46
last-words: 1079
@@ -382,11 +397,6 @@ crawlers:
last-words: 6161
last-lvls: 127

- key: cobalt
last-records: 131
last-words: 14546
last-lvls: 319

- name: QuickBook
libraries:
- key: accumulators
11 changes: 10 additions & 1 deletion gecko/config.py
@@ -7,10 +7,19 @@
'version': str,
'root': os.path.exists
},
'website-v2-docs': {
'root': os.path.exists,
'sections': [
{
'key': str,
'name': str
}
]
},
'algolia': {
'app-id': str,
'api-key': str,
'settings':dict
'settings': dict
},
'crawlers': [
{
96 changes: 96 additions & 0 deletions gecko/extract_learn_records.py
@@ -0,0 +1,96 @@
import re
import json
from pathlib import Path
from urllib.parse import urljoin
from bs4 import BeautifulSoup, Tag

from .crawlers.helpers import has_class
from .config import config


class AntoraDoc():
def crawl(self, doc_root: Path) -> dict:
sections = {}
doc_root = doc_root.resolve()

for file_path in doc_root.rglob('*.html'):
with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
soup = BeautifulSoup(file.read(), 'html.parser')

lvls = []
for link in soup.select('body nav.breadcrumbs ul li a'):
lvls = lvls + [{'title': link.text, 'path': urljoin(str(file_path), link.get('href'))}]

sect1 = soup.select_one('body article.doc')
if sect1:
self._extract_section_n(str(file_path), sections, sect1, lvls)

return sections

def _extract_section_n(self, file_path: str, sections: dict, sect: Tag, lvls: list = []):
header = sect.select_one('h1, h2, h3, h4, h5, h6')

if header.name == 'h1':
path = file_path
else:
title = header.text
path = file_path + '#' + header.get('id')
lvls = lvls + [{'title': title, 'path': path}]

if header.find_next_sibling() and has_class(header.find_next_sibling(), 'sectionbody'):
siblings = header.find_next_sibling().find_all(recursive=False)
else:
siblings = header.next_siblings

content = ''
for sibling in siblings:
if isinstance(sibling, Tag) and sibling.has_attr('class') and len([i for i in sibling.get('class') if i.startswith('sect')]) > 0:
self._extract_section_n(file_path, sections, sibling, lvls)
continue
content += sibling.get_text() + ' '

sections[path] = {'content': content, 'lvls': lvls}


def create_algolia_records(section_key: str, section_name: str, doc_root: Path, sections: dict):
doc_root = doc_root.resolve()
records = []

for _, section in sections.items():
for lvl in section['lvls']:
lvl['path'] = lvl['path'].replace(str(doc_root) + '/', '')

records.append({
'type': 'content',
'section_key': section_key,
'section_name': section_name,
'content': re.sub(r'\s+', ' ', section['content']).strip(),
'weight': {
'pageRank': 0,
'level': 100 - len(section['lvls']) * 10,
'position': 0
},
'path': section['lvls'][-1]['path'] if len(section['lvls']) > 0 else None,
'hierarchy': {
'lvl0': section['lvls'][0] if len(section['lvls']) > 0 else None,
'lvl1': section['lvls'][1] if len(section['lvls']) > 1 else None,
'lvl2': section['lvls'][2] if len(section['lvls']) > 2 else None,
'lvl3': section['lvls'][3] if len(section['lvls']) > 3 else None,
'lvl4': section['lvls'][4] if len(section['lvls']) > 4 else None,
'lvl5': section['lvls'][5] if len(section['lvls']) > 5 else None,
'lvl6': section['lvls'][6] if len(section['lvls']) > 6 else None
}})

with open('./algolia_records/learn/' + section_key + '.json', 'w', encoding='utf-8') as outfile:
json.dump(records, outfile, indent=4)


if __name__ == "__main__":
crawler = AntoraDoc()

for section in config['website-v2-docs']['sections']:
sections = crawler.crawl(Path(config['website-v2-docs']['root']) / section['key'])
create_algolia_records(section['key'],
section['name'],
Path(config['website-v2-docs']['root']),
sections)
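A minimal way to sanity-check the new crawler locally, assuming website-v2-docs has already been built into ../website-v2-docs/build as in the workflow step above (the paths here are assumptions, not part of this PR):

```python
# Hypothetical local smoke test for AntoraDoc; adjust the build path as needed.
from pathlib import Path
from gecko.extract_learn_records import AntoraDoc

crawler = AntoraDoc()
sections = crawler.crawl(Path('../website-v2-docs/build/user-guide'))
for path, section in list(sections.items())[:3]:
    print(path, '->', [lvl['title'] for lvl in section['lvls']])
```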
@@ -75,7 +75,7 @@ def create_algolia_records(library_key: str, sections: dict, boost_root: str):
'lvl6': section['lvls'][6] if len(section['lvls']) > 6 else None
}})

with open('./algolia_records/' + library_key.replace('/', '_') + '.json', 'w', encoding='utf-8') as outfile:
with open('./algolia_records/libraries/' + library_key.replace('/', '_') + '.json', 'w', encoding='utf-8') as outfile:
json.dump(records, outfile, indent=4)


24 changes: 18 additions & 6 deletions gecko/index_on_algolia.py
@@ -9,19 +9,19 @@
client = SearchClient.create(config['algolia']['app-id'], config['algolia']['api-key'])

print('Initializing {} index ...'.format(config['boost']['version']))
index = client.init_index(config['boost']['version'])
libraries_index = client.init_index(config['boost']['version'])

print('Setting settings for {} index ...'.format(config['boost']['version']))
index.set_settings(config['algolia']['settings'])
libraries_index.set_settings(config['algolia']['settings'])

for path in Path('./algolia_records').glob('*.json'):
for path in Path('./algolia_records/libraries').glob('*.json'):
print('uploading records for {}...'.format(path.stem))

with open(path, 'r', encoding='utf-8') as f:
records = json.load(f)

# Delete the existing records for this library.
index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})
libraries_index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})

# Split long documents into smaller parts.
for record in records:
@@ -34,5 +34,17 @@
records = [record for record in records if not (
record['content'] == '' and not record['hierarchy']['lvl0'])]

# TODO instead of using autoGenerateObjectIDIfNotExist we might create a hash out of hierarchy items
index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})
libraries_index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})

learn_index = client.init_index('learn')

for path in Path('./algolia_records/learn').glob('*.json'):
print('uploading records for {}...'.format(path.stem))

with open(path, 'r', encoding='utf-8') as f:
records = json.load(f)

    # Delete the existing records for this section.
    learn_index.delete_by({'filters': 'section_key:{}'.format(records[0]['section_key'])})

    learn_index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})