From 811f3cd8afb2603ff9c0801386749bd9a9e00b16 Mon Sep 17 00:00:00 2001 From: Faraaz Biyabani Date: Sun, 6 Feb 2022 00:28:43 +0530 Subject: [PATCH 1/2] Store message content as HTML Store the messages as HTML so that all formatting is preserved. Telegram links in a message to other messages of the group or channel are replaced with site links. --- tgarchive/build.py | 19 ++++++++++++++++--- tgarchive/example/rss_template.html | 2 +- tgarchive/example/template.html | 2 +- tgarchive/sync.py | 3 ++- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/tgarchive/build.py b/tgarchive/build.py index 0a73b32..adac56d 100644 --- a/tgarchive/build.py +++ b/tgarchive/build.py @@ -119,7 +119,8 @@ def _render_page(self, messages, month, dayline, fname, page, total_pages): pagination={"current": page, "total": total_pages}, make_filename=self.make_filename, - nl2br=self._nl2br) + nl2br=self._nl2br, + replace_msg_link=self._replace_msg_link) with open(os.path.join(self.config["publish_dir"], fname), "w", encoding="utf8") as f: f.write(html) @@ -165,7 +166,8 @@ def _make_abstract(self, m, media_mime): m=m, media_mime=media_mime, page_ids=self.page_ids, - nl2br=self._nl2br) + nl2br=self._nl2br, + replace_msg_link=self._replace_msg_link) out = m.content if not out and m.media: out = m.media.title @@ -174,7 +176,18 @@ def _make_abstract(self, m, media_mime): def _nl2br(self, s) -> str: # There has to be a \n before
<br> so as to not break # Jinja's automatic hyperlinking of URLs. - return _NL2BR.sub("\n\n", s).replace("\n", "\n<br />
") + return _NL2BR.sub("\n\n", str(s)).replace("\n", "\n<br />
") + + def _replace_msg_link(self, s) -> str: + # Replace Telegram message links with site links + result = re.sub(r"".format(self.config["group"]), + self._sub_msg_link, s) + return result + + def _sub_msg_link(self, match): + if self.page_ids.get(int(match.group(2))) is None: + return match.group(0) + return match.group(0).replace(match.group(1), self.page_ids[int(match.group(2))] + "#") def _create_publish_dir(self): pubdir = self.config["publish_dir"] diff --git a/tgarchive/example/rss_template.html b/tgarchive/example/rss_template.html index 867dace..bd1ea05 100644 --- a/tgarchive/example/rss_template.html +++ b/tgarchive/example/rss_template.html @@ -31,7 +31,7 @@
{% if m.type == "message" %} - {{ nl2br(m.content | escape) | safe | urlize }} + {{ replace_msg_link(nl2br(m.content)) }} {% else %} {% if m.type == "user_joined" %} Joined. diff --git a/tgarchive/example/template.html b/tgarchive/example/template.html index a77233c..d213786 100644 --- a/tgarchive/example/template.html +++ b/tgarchive/example/template.html @@ -123,7 +123,7 @@

{{ year }}

{% if m.type == "message" %} - {{ nl2br(m.content | escape) | safe | urlize }} + {{ replace_msg_link(nl2br(m.content)) }} {% else %} {% if m.type == "user_joined" %} Joined. diff --git a/tgarchive/sync.py b/tgarchive/sync.py index 0274303..16c7ddf 100644 --- a/tgarchive/sync.py +++ b/tgarchive/sync.py @@ -97,6 +97,7 @@ def sync(self, ids=None, from_id=None): def new_client(self, session, config): client = TelegramClient(session, config["api_id"], config["api_hash"]) client.start() + client.parse_mode = 'html' if config.get("use_takeout", False): for retry in range(3): try: @@ -160,7 +161,7 @@ def _get_messages(self, group, offset_id, ids=None) -> Message: id=m.id, date=m.date, edit_date=m.edit_date, - content=sticker if sticker else m.raw_text, + content=sticker if sticker else m.text, reply_to=m.reply_to_msg_id if m.reply_to and m.reply_to.reply_to_msg_id else None, user=self._get_user(m.sender), media=med From ad060e11469741021611ebb11adaccb5be62ad32 Mon Sep 17 00:00:00 2001 From: Faraaz Biyabani Date: Thu, 17 Feb 2022 01:28:30 +0530 Subject: [PATCH 2/2] Add 'html_messages' config. option New sites will preserve message formatting by default. Fix hyperlinks not rendering on existing sites. 
--- tgarchive/build.py | 15 +++++++++++---- tgarchive/example/config.yaml | 3 +++ tgarchive/example/rss_template.html | 2 +- tgarchive/example/template.html | 2 +- tgarchive/sync.py | 3 ++- 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/tgarchive/build.py b/tgarchive/build.py index adac56d..1d0ebe1 100644 --- a/tgarchive/build.py +++ b/tgarchive/build.py @@ -10,6 +10,7 @@ from feedgen.feed import FeedGenerator from jinja2 import Template +from jinja2.filters import urlize, escape from .db import User, Message @@ -110,6 +111,7 @@ def make_filename(self, month, page) -> str: return fname def _render_page(self, messages, month, dayline, fname, page, total_pages): + urlizer = self._urlize if self.config.get("html_messages") else self._urlize_raw html = self.template.render(config=self.config, timeline=self.timeline, dayline=dayline, @@ -120,7 +122,7 @@ def _render_page(self, messages, month, dayline, fname, page, total_pages): "total": total_pages}, make_filename=self.make_filename, nl2br=self._nl2br, - replace_msg_link=self._replace_msg_link) + urlize=urlizer) with open(os.path.join(self.config["publish_dir"], fname), "w", encoding="utf8") as f: f.write(html) @@ -161,13 +163,14 @@ def _build_rss(self, messages, rss_file, atom_file): f.atom_file(os.path.join(self.config["publish_dir"], "index.atom")) def _make_abstract(self, m, media_mime): + urlizer = self._urlize if self.config.get("html_messages") else self._urlize_raw if self.rss_template: return self.rss_template.render(config=self.config, m=m, media_mime=media_mime, page_ids=self.page_ids, nl2br=self._nl2br, - replace_msg_link=self._replace_msg_link) + urlize=urlizer) out = m.content if not out and m.media: out = m.media.title @@ -178,10 +181,14 @@ def _nl2br(self, s) -> str: # Jinja's automatic hyperlinking of URLs. return _NL2BR.sub("\n\n", str(s)).replace("\n", "\n
") - def _replace_msg_link(self, s) -> str: + def _urlize_raw(self, s) -> str: + # Escape raw text, apply jinja urlize and finally _urlize + return self._urlize(urlize(escape(s))) + + def _urlize(self, s) -> str: # Replace Telegram message links with site links result = re.sub(r"".format(self.config["group"]), - self._sub_msg_link, s) + self._sub_msg_link, str(s)) return result def _sub_msg_link(self, match): diff --git a/tgarchive/example/config.yaml b/tgarchive/example/config.yaml index 412653e..85d186d 100644 --- a/tgarchive/example/config.yaml +++ b/tgarchive/example/config.yaml @@ -14,6 +14,9 @@ download_avatars: True avatar_size: [64, 64] # Width, Height. media_dir: "media" +# Preserve formatting in messages (inline links, bold, italic, underline, etc.). +html_messages: True + # Takeout mode allows you to fetch messages at a higher rate than the standard mode. # It is the method used in the desktop client to export data. # You can use a larger fetch_batch_size. Set this as False to use the standard mode. diff --git a/tgarchive/example/rss_template.html b/tgarchive/example/rss_template.html index bd1ea05..55bb202 100644 --- a/tgarchive/example/rss_template.html +++ b/tgarchive/example/rss_template.html @@ -31,7 +31,7 @@
{% if m.type == "message" %} - {{ replace_msg_link(nl2br(m.content)) }} + {{ nl2br(urlize(m.content)) }} {% else %} {% if m.type == "user_joined" %} Joined. diff --git a/tgarchive/example/template.html b/tgarchive/example/template.html index d213786..7fe35c3 100644 --- a/tgarchive/example/template.html +++ b/tgarchive/example/template.html @@ -123,7 +123,7 @@

{{ year }}

{% if m.type == "message" %} - {{ replace_msg_link(nl2br(m.content)) }} + {{ nl2br(urlize(m.content)) }} {% else %} {% if m.type == "user_joined" %} Joined. diff --git a/tgarchive/sync.py b/tgarchive/sync.py index 16c7ddf..8026914 100644 --- a/tgarchive/sync.py +++ b/tgarchive/sync.py @@ -125,6 +125,7 @@ def finish_takeout(self): self.client.__exit__(None, None, None) def _get_messages(self, group, offset_id, ids=None) -> Message: + msg_text_type = "text" if self.config.get("html_messages") else "raw_text" messages = self._fetch_messages(group, offset_id, ids) # https://docs.telethon.dev/en/latest/quick-references/objects-reference.html#message for m in messages: @@ -161,7 +162,7 @@ def _get_messages(self, group, offset_id, ids=None) -> Message: id=m.id, date=m.date, edit_date=m.edit_date, - content=sticker if sticker else m.text, + content=sticker if sticker else getattr(m, msg_text_type), reply_to=m.reply_to_msg_id if m.reply_to and m.reply_to.reply_to_msg_id else None, user=self._get_user(m.sender), media=med