Skip to content

Commit

Permalink
Merge pull request #265 from openzim/restore-search-functionality
Browse files Browse the repository at this point in the history
Restore search functionality by indexing content in JSON files
  • Loading branch information
benoit74 authored Jul 15, 2024
2 parents bf12db2 + 435dadb commit fda7798
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 7 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ repos:
hooks:
- id: ruff
- repo: https://github.com/RobertCraigie/pyright-python
rev: v1.1.368
rev: v1.1.371
hooks:
- id: pyright
name: pyright (system)
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Validate if ZIM cannot be created at given output location (#204)
- Add videos, subtitles, thumbnails and channel branding to the ZIM "on the fly" (#209)
- Remove `--no-zim`, `--keep` CLI arguments
- Add support to index content from `zimui` JSON files in the ZIM using custom `IndexData` (#224)

## [2.3.0] - 2024-05-22

Expand Down
2 changes: 1 addition & 1 deletion scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ additional-keywords = ["youtube"]
[project.optional-dependencies]
scripts = ["invoke==2.2.0"]
lint = ["black==24.4.2", "ruff==0.4.10"]
check = ["pyright==1.1.368"]
check = ["pyright==1.1.371"]
test = ["pytest==8.2.2", "coverage==7.5.3"]
dev = [
"pre-commit==3.7.1",
Expand Down
75 changes: 70 additions & 5 deletions scraper/src/youtube2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import yt_dlp
from kiwixstorage import KiwixStorage
from libzim.writer import IndexData # type: ignore
from pif import get_public_ip
from zimscraperlib.download import stream_file
from zimscraperlib.filesystem import delete_callback
Expand All @@ -31,7 +32,7 @@
from zimscraperlib.image.transformation import resize_image
from zimscraperlib.inputs import compute_descriptions
from zimscraperlib.video.presets import VideoMp4Low, VideoWebmLow
from zimscraperlib.zim import Creator
from zimscraperlib.zim import Creator, StaticItem
from zimscraperlib.zim.filesystem import validate_zimfile_creatable
from zimscraperlib.zim.metadata import (
validate_description,
Expand Down Expand Up @@ -84,6 +85,29 @@
MAXIMUM_YOUTUBEID_LENGTH = 24


class CustomIndexData(IndexData):
    """IndexData whose title and content are supplied directly by the caller."""

    def __init__(self, title: str, content: str):
        # Kept as plain attributes; the getters below simply hand them back.
        self.title, self.content = title, content

    def has_indexdata(self):
        """Always report that index data is available."""
        return True

    def get_title(self):
        """Return the title given at construction."""
        return self.title

    def get_content(self):
        """Return the content given at construction."""
        return self.content

    def get_keywords(self):
        """No keywords are provided: always an empty string."""
        return ""

    def get_wordcount(self):
        """Return the number of whitespace-separated words in the content.

        Falsy content (e.g. an empty string) counts as zero words.
        """
        if not self.content:
            return 0
        words = self.content.split()
        return len(words)


class Youtube2Zim:
def __init__(
self,
Expand Down Expand Up @@ -1066,11 +1090,22 @@ def get_video_slug(video) -> str:
def generate_playlist_object(playlist) -> Playlist:
channel_data = get_channel_json(playlist.creator_id)
videos = get_videos_list(playlist)
playlist_videos = [generate_video_preview_object(video) for video in videos]

# add videos to ZIM index
for idx, video_obj in enumerate(playlist_videos):
self.add_custom_item_to_zim_index(
video_obj.title,
videos[idx]["snippet"]["description"],
video_obj.slug,
f"watch/{video_obj.slug}?list={get_playlist_slug(playlist)}",
)

return Playlist(
id=playlist.playlist_id,
title=playlist.title,
description=playlist.description,
videos=[generate_video_preview_object(video) for video in videos],
videos=playlist_videos,
publication_date=playlist.published_at,
author=Author(
channel_id=playlist.creator_id,
Expand Down Expand Up @@ -1141,16 +1176,23 @@ def get_playlist_slug(playlist) -> str:
playlist_slug # set uploads playlist as main playlist
)

playlist_obj = generate_playlist_object(playlist)
self.zim_file.add_item_for(
path=playlist_path,
title=playlist.title,
content=generate_playlist_object(playlist).model_dump_json(
by_alias=True, indent=2
),
content=playlist_obj.model_dump_json(by_alias=True, indent=2),
mimetype="application/json",
is_front=False,
)

# add playlist to ZIM index
self.add_custom_item_to_zim_index(
playlist_obj.title,
playlist_obj.description,
playlist_slug,
f"playlist/{playlist_slug}",
)

# write playlists.json file
self.zim_file.add_item_for(
path="playlists.json",
Expand Down Expand Up @@ -1215,3 +1257,26 @@ def add_file_to_zim(
fpath=fpath,
callback=callback,
)

def add_custom_item_to_zim_index(
    self, title: str, content: str, fname: str, zimui_redirect: str
):
    """Add a custom item to the ZIM index.

    A small HTML stub is stored at ``index/<fname>``. Its only job is to
    redirect (via a ``meta refresh``) to ``../index.html#/<zimui_redirect>``.
    The item's ``get_indexdata`` is overridden to return a
    ``CustomIndexData`` built from ``title`` and ``content``, so the index
    data comes from those values rather than from the stub's HTML.

    Args:
        title: Title of the entry; also used as the stub's ``<title>``.
        content: Text exposed as the entry's index content.
        fname: Last path segment of the stub, placed under ``index/``.
        zimui_redirect: Route appended after ``../index.html#/``.
    """
    # Local import: keeps this fix self-contained without touching the
    # file-level import block.
    from html import escape

    redirect_url = f"../index.html#/{zimui_redirect}"

    # Titles come from remote metadata and may contain `<`, `>` or `&`;
    # escape them so they cannot break or inject markup in the stub page.
    safe_title = escape(title, quote=False)
    # The URL sits inside a quoted attribute value, so quotes are escaped
    # too. Values without special characters are left unchanged.
    safe_redirect_url = escape(redirect_url, quote=True)

    html_content = (
        f"<html><head><title>{safe_title}</title>"
        f'<meta http-equiv="refresh" content="0;URL=\'{safe_redirect_url}\'" />'
        f"</head><body></body></html>"
    )

    item = StaticItem(
        title=title,
        path="index/" + fname,
        content=bytes(html_content, "utf-8"),
        mimetype="text/html",
    )
    # Index the caller-supplied title/content (unescaped plain text).
    item.get_indexdata = lambda: CustomIndexData(title, content)

    logger.debug(f"Adding {fname} to ZIM index")
    self.zim_file.add_item(item)

0 comments on commit fda7798

Please sign in to comment.