diff --git a/tgarchive/build.py b/tgarchive/build.py index 2630065..7a713a3 100644 --- a/tgarchive/build.py +++ b/tgarchive/build.py @@ -10,6 +10,7 @@ from feedgen.feed import FeedGenerator from jinja2 import Template +from jinja2.filters import urlize, escape from .db import User, Message @@ -114,6 +115,7 @@ def make_filename(self, month, page) -> str: return fname def _render_page(self, messages, month, dayline, fname, page, total_pages): + urlizer = self._urlize if self.config.get("html_messages") else self._urlize_raw html = self.template.render(config=self.config, timeline=self.timeline, dayline=dayline, @@ -123,7 +125,8 @@ def _render_page(self, messages, month, dayline, fname, page, total_pages): pagination={"current": page, "total": total_pages}, make_filename=self.make_filename, - nl2br=self._nl2br) + nl2br=self._nl2br, + urlize=urlizer) with open(os.path.join(self.config["publish_dir"], fname), "w", encoding="utf8") as f: f.write(html) @@ -164,12 +167,14 @@ def _build_rss(self, messages, rss_file, atom_file): f.atom_file(os.path.join(self.config["publish_dir"], "index.atom")) def _make_abstract(self, m, media_mime): + urlizer = self._urlize if self.config.get("html_messages") else self._urlize_raw if self.rss_template: return self.rss_template.render(config=self.config, m=m, media_mime=media_mime, page_ids=self.page_ids, - nl2br=self._nl2br) + nl2br=self._nl2br, + urlize=urlizer) out = m.content if not out and m.media: out = m.media.title @@ -178,7 +183,22 @@ def _make_abstract(self, m, media_mime): def _nl2br(self, s) -> str: # There has to be a \n before
<br> so as to not break # Jinja's automatic hyperlinking of URLs. - return _NL2BR.sub("\n\n", s).replace("\n", "\n<br />") + return _NL2BR.sub("\n\n", str(s)).replace("\n", "\n<br />
") + + def _urlize_raw(self, s) -> str: + # Escape raw text, apply jinja urlize and finally _urlize + return self._urlize(urlize(escape(s))) + + def _urlize(self, s) -> str: + # Replace Telegram message links with site links + result = re.sub(r"".format(self.config["group"]), + self._sub_msg_link, str(s)) + return result + + def _sub_msg_link(self, match): + if self.page_ids.get(int(match.group(2))) is None: + return match.group(0) + return match.group(0).replace(match.group(1), self.page_ids[int(match.group(2))] + "#") def _create_publish_dir(self): pubdir = self.config["publish_dir"] diff --git a/tgarchive/example/config.yaml b/tgarchive/example/config.yaml index 4f7a695..77310b7 100644 --- a/tgarchive/example/config.yaml +++ b/tgarchive/example/config.yaml @@ -19,6 +19,9 @@ media_dir: "media" # If left empty, files of all types are downloaded. media_mime_types: [] +# Preserve formatting in messages (inline links, bold, italic, underline, etc.). +html_messages: True + # Takeout mode allows you to fetch messages at a higher rate than the standard mode. # It is the method used in the desktop client to export data. # You can use a larger fetch_batch_size. Set this as False to use the standard mode. diff --git a/tgarchive/example/rss_template.html b/tgarchive/example/rss_template.html index 867dace..55bb202 100644 --- a/tgarchive/example/rss_template.html +++ b/tgarchive/example/rss_template.html @@ -31,7 +31,7 @@
{% if m.type == "message" %} - {{ nl2br(m.content | escape) | safe | urlize }} + {{ nl2br(urlize(m.content)) }} {% else %} {% if m.type == "user_joined" %} Joined. diff --git a/tgarchive/example/template.html b/tgarchive/example/template.html index a77233c..7fe35c3 100644 --- a/tgarchive/example/template.html +++ b/tgarchive/example/template.html @@ -123,7 +123,7 @@

{{ year }}

{% if m.type == "message" %} - {{ nl2br(m.content | escape) | safe | urlize }} + {{ nl2br(urlize(m.content)) }} {% else %} {% if m.type == "user_joined" %} Joined. diff --git a/tgarchive/sync.py b/tgarchive/sync.py index e014485..a695e93 100644 --- a/tgarchive/sync.py +++ b/tgarchive/sync.py @@ -97,6 +97,7 @@ def sync(self, ids=None, from_id=None): def new_client(self, session, config): client = TelegramClient(session, config["api_id"], config["api_hash"]) client.start() + client.parse_mode = 'html' if config.get("use_takeout", False): for retry in range(3): try: @@ -124,6 +125,7 @@ def finish_takeout(self): self.client.__exit__(None, None, None) def _get_messages(self, group, offset_id, ids=None) -> Message: + msg_text_type = "text" if self.config.get("html_messages") else "raw_text" messages = self._fetch_messages(group, offset_id, ids) # https://docs.telethon.dev/en/latest/quick-references/objects-reference.html#message for m in messages: @@ -160,7 +162,7 @@ def _get_messages(self, group, offset_id, ids=None) -> Message: id=m.id, date=m.date, edit_date=m.edit_date, - content=sticker if sticker else m.raw_text, + content=sticker if sticker else getattr(m, msg_text_type), reply_to=m.reply_to_msg_id if m.reply_to and m.reply_to.reply_to_msg_id else None, user=self._get_user(m.sender), media=med