Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store message content as HTML #50

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions tgarchive/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from feedgen.feed import FeedGenerator
from jinja2 import Template
from jinja2.filters import urlize, escape

from .db import User, Message

Expand Down Expand Up @@ -114,6 +115,7 @@ def make_filename(self, month, page) -> str:
return fname

def _render_page(self, messages, month, dayline, fname, page, total_pages):
urlizer = self._urlize if self.config.get("html_messages") else self._urlize_raw
html = self.template.render(config=self.config,
timeline=self.timeline,
dayline=dayline,
Expand All @@ -123,7 +125,8 @@ def _render_page(self, messages, month, dayline, fname, page, total_pages):
pagination={"current": page,
"total": total_pages},
make_filename=self.make_filename,
nl2br=self._nl2br)
nl2br=self._nl2br,
urlize=urlizer)

with open(os.path.join(self.config["publish_dir"], fname), "w", encoding="utf8") as f:
f.write(html)
Expand Down Expand Up @@ -164,12 +167,14 @@ def _build_rss(self, messages, rss_file, atom_file):
f.atom_file(os.path.join(self.config["publish_dir"], "index.atom"))

def _make_abstract(self, m, media_mime):
urlizer = self._urlize if self.config.get("html_messages") else self._urlize_raw
if self.rss_template:
return self.rss_template.render(config=self.config,
m=m,
media_mime=media_mime,
page_ids=self.page_ids,
nl2br=self._nl2br)
nl2br=self._nl2br,
urlize=urlizer)
out = m.content
if not out and m.media:
out = m.media.title
Expand All @@ -178,7 +183,22 @@ def _make_abstract(self, m, media_mime):
def _nl2br(self, s) -> str:
    """Convert the newlines of a message body into HTML line breaks.

    ``s`` is cast to ``str`` before substitution: messages without any
    content arrive as ``None`` and regex substitution only accepts
    strings. Every run matched by the ``_NL2BR`` pattern (defined
    outside this block) is first replaced by two newlines, then each
    newline gets a ``<br />`` appended.

    Returns:
        The text with ``<br />`` inserted after every newline.
    """
    # There has to be a \n before <br> so as to not break
    # Jinja's automatic hyperlinking of URLs.
    return _NL2BR.sub("\n\n", str(s)).replace("\n", "\n<br />")
Comment on lines -181 to +186
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we use this? Whether the s variable is a string or not?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it can be removed now as urlize() does a cast already.
The re.sub() method expects a string and there are NULL/None messages which need to be cast to string. This was indirectly taken care of by the escape filter in the master branch. Since I changed the way filters work in the first commit, I had to make this cast.


def _urlize_raw(self, s) -> str:
    """Turn raw message text into HTML with rewritten message links.

    The text goes through three steps, in order: ``escape`` to
    neutralise HTML, Jinja's ``urlize`` to hyperlink URLs, and finally
    ``self._urlize`` to rewrite Telegram message links.
    """
    escaped_text = escape(s)
    hyperlinked = urlize(escaped_text)
    return self._urlize(hyperlinked)

def _urlize(self, s) -> str:
    """Replace Telegram message links with site links.

    Looks for anchor tags of the exact form
    ``<a href="https://t.me/<group>/<digits>">`` in ``str(s)``, where
    ``<group>`` is ``self.config["group"]``, and passes each match to
    ``self._sub_msg_link`` to produce the replacement tag.

    Args:
        s: HTML to process; it is cast to ``str`` first, so non-string
            values (including ``None``) are accepted.

    Returns:
        The HTML with every matching anchor rewritten by
        ``self._sub_msg_link``.
    """
    # The group name becomes part of a regex, so it must be escaped:
    # an unescaped metacharacter (e.g. ".") would match other groups'
    # links or make the pattern invalid.
    group = re.escape(str(self.config["group"]))
    pattern = r"<a href=\"(https://t\.me/{}/)(\d+)\">".format(group)
    return re.sub(pattern, self._sub_msg_link, str(s))

def _sub_msg_link(self, match):
    """Build the replacement for one matched Telegram message link.

    Used as the substitution callback of ``re.sub``. Group 1 of
    ``match`` is the URL prefix and group 2 the numeric message id.

    Returns:
        The matched text unchanged when ``self.page_ids`` has no entry
        (or a ``None`` entry) for the message id. Otherwise the matched
        text with the URL prefix replaced by that entry followed by
        ``#``, so the message id becomes the fragment.
    """
    original_tag = match.group(0)
    message_id = int(match.group(2))
    # presumably the filename of the page holding the message -- confirm
    page = self.page_ids.get(message_id)
    if page is None:
        return original_tag
    return original_tag.replace(match.group(1), page + "#")

def _create_publish_dir(self):
pubdir = self.config["publish_dir"]
Expand Down
3 changes: 3 additions & 0 deletions tgarchive/example/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ media_dir: "media"
# If left empty, files of all types are downloaded.
media_mime_types: []

# Preserve formatting in messages (inline links, bold, italic, underline, etc.).
html_messages: True

# Takeout mode allows you to fetch messages at a higher rate than the standard mode.
# It is the method used in the desktop client to export data.
# You can use a larger fetch_batch_size. Set this as False to use the standard mode.
Expand Down
2 changes: 1 addition & 1 deletion tgarchive/example/rss_template.html
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
</div>
<div class="text">
{% if m.type == "message" %}
{{ nl2br(m.content | escape) | safe | urlize }}
{{ nl2br(urlize(m.content)) }}
{% else %}
{% if m.type == "user_joined" %}
Joined.
Expand Down
2 changes: 1 addition & 1 deletion tgarchive/example/template.html
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ <h3 class="year"><a href="{{ months[0].slug }}.html">{{ year }}</a></h3>
</div>
<div class="text">
{% if m.type == "message" %}
{{ nl2br(m.content | escape) | safe | urlize }}
{{ nl2br(urlize(m.content)) }}
{% else %}
{% if m.type == "user_joined" %}
Joined.
Expand Down
4 changes: 3 additions & 1 deletion tgarchive/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def sync(self, ids=None, from_id=None):
def new_client(self, session, config):
client = TelegramClient(session, config["api_id"], config["api_hash"])
client.start()
client.parse_mode = 'html'
if config.get("use_takeout", False):
for retry in range(3):
try:
Expand Down Expand Up @@ -124,6 +125,7 @@ def finish_takeout(self):
self.client.__exit__(None, None, None)

def _get_messages(self, group, offset_id, ids=None) -> Message:
msg_text_type = "text" if self.config.get("html_messages") else "raw_text"
messages = self._fetch_messages(group, offset_id, ids)
# https://docs.telethon.dev/en/latest/quick-references/objects-reference.html#message
for m in messages:
Expand Down Expand Up @@ -160,7 +162,7 @@ def _get_messages(self, group, offset_id, ids=None) -> Message:
id=m.id,
date=m.date,
edit_date=m.edit_date,
content=sticker if sticker else m.raw_text,
content=sticker if sticker else getattr(m, msg_text_type),
reply_to=m.reply_to_msg_id if m.reply_to and m.reply_to.reply_to_msg_id else None,
user=self._get_user(m.sender),
media=med
Expand Down