diff --git a/mautrix_telegram/formatter/__init__.py b/mautrix_telegram/formatter/__init__.py new file mode 100644 index 00000000..cf46d796 --- /dev/null +++ b/mautrix_telegram/formatter/__init__.py @@ -0,0 +1,2 @@ +from .from_matrix import matrix_reply_to_telegram, matrix_to_telegram +from .from_telegram import telegram_reply_to_matrix, telegram_to_matrix diff --git a/mautrix_telegram/formatter/from_matrix.py b/mautrix_telegram/formatter/from_matrix.py new file mode 100644 index 00000000..6c00838f --- /dev/null +++ b/mautrix_telegram/formatter/from_matrix.py @@ -0,0 +1,182 @@ +# -*- coding: future_fstrings -*- +# mautrix-telegram - A Matrix-Telegram puppeting bridge +# Copyright (C) 2018 Tulir Asokan +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +from html import unescape +from html.parser import HTMLParser +from collections import deque +import re +import logging + +from telethon.tl.types import * + +from .. import user as u, puppet as p +from ..db import Message as DBMessage + +TEMP_ENC = "utf-16-le" + +log = logging.getLogger("mau.fmt.mx") + + +class MatrixParser(HTMLParser): + mention_regex = re.compile("https://matrix.to/#/(@.+)") + + def __init__(self): + super().__init__() + self.text = "" + self.entities = [] + self._building_entities = {} + self._list_counter = 0 + self._open_tags = deque() + self._open_tags_meta = deque() + self._previous_ended_line = True + + def handle_starttag(self, tag, attrs): + self._open_tags.appendleft(tag) + self._open_tags_meta.appendleft(0) + attrs = dict(attrs) + entity_type = None + args = {} + if tag == "strong" or tag == "b": + entity_type = MessageEntityBold + elif tag == "em" or tag == "i": + entity_type = MessageEntityItalic + elif tag == "code": + try: + pre = self._building_entities["pre"] + try: + pre.language = attrs["class"][len("language-"):] + except KeyError: + pass + except KeyError: + entity_type = MessageEntityCode + elif tag == "pre": + entity_type = MessageEntityPre + args["language"] = "" + elif tag == "a": + try: + url = attrs["href"] + except KeyError: + return + mention = self.mention_regex.search(url) + if mention: + mxid = mention.group(1) + user = p.Puppet.get_by_mxid(mxid, create=False) + if not user: + user = u.User.get_by_mxid(mxid, create=False) + if not user: + return + if user.username: + entity_type = MessageEntityMention + url = f"@{user.username}" + else: + entity_type = MessageEntityMentionName + args["user_id"] = user.tgid + elif url.startswith("mailto:"): + url = url[len("mailto:"):] + entity_type = MessageEntityEmail + else: + if self.get_starttag_text() == url: + entity_type = MessageEntityUrl + else: + entity_type = MessageEntityTextUrl + args["url"] = url + url = None + self._open_tags_meta.popleft() + self._open_tags_meta.appendleft(url) + + if entity_type and tag not in self._building_entities: + # See "TEXT LEN EXPLANATION" near start of file + offset = int(len(self.text.encode(TEMP_ENC)) / 2) + self._building_entities[tag] = entity_type(offset=offset, length=0, **args) + + def _list_depth(self): + depth = 0 + for tag in self._open_tags: + if tag == "ol" or tag == "ul": + depth += 1 + return depth + + def handle_data(self, text): + text = unescape(text) + previous_tag = self._open_tags[0] if len(self._open_tags) > 0 else "" + list_format_offset = 0 + if previous_tag == "a": + url = self._open_tags_meta[0] + if url: + text = url + elif len(self._open_tags) > 1 and self._previous_ended_line and previous_tag == "li": + list_type = self._open_tags[1] + indent = (self._list_depth() - 1) * 4 * " " + text = text.strip("\n") + if len(text) == 0: + return + elif list_type == "ul": + text = f"{indent}* {text}" + list_format_offset = len(indent) + 2 + elif list_type == "ol": + n = self._open_tags_meta[1] + n += 1 + self._open_tags_meta[1] = n + text = f"{indent}{n}. {text}" + list_format_offset = len(indent) + 3 + for tag, entity in self._building_entities.items(): + # See "TEXT LEN EXPLANATION" near start of file + entity.length += int(len(text.strip("\n").encode(TEMP_ENC)) / 2) + entity.offset += list_format_offset + + if text.endswith("\n"): + self._previous_ended_line = True + else: + self._previous_ended_line = False + + self.text += text + + def handle_endtag(self, tag): + try: + self._open_tags.popleft() + self._open_tags_meta.popleft() + except IndexError: + pass + if (tag == "ul" or tag == "ol") and self.text.endswith("\n"): + self.text = self.text[:-1] + entity = self._building_entities.pop(tag, None) + if entity: + self.entities.append(entity) + + +def matrix_to_telegram(html): + try: + parser = MatrixParser() + parser.feed(html) + return parser.text, parser.entities + except Exception: + log.exception("Failed to convert Matrix format:\nhtml=%s", html) + + +def matrix_reply_to_telegram(content, tg_space, room_id=None): + try: + reply = content["m.relates_to"]["m.in_reply_to"] + room_id = room_id or reply["room_id"] + event_id = reply["event_id"] + message = DBMessage.query.filter(DBMessage.mxid == event_id, + DBMessage.tg_space == tg_space, + DBMessage.mx_room == room_id).one_or_none() + if message: + return message.tgid + except KeyError: + pass + return None + diff --git a/mautrix_telegram/formatter.py b/mautrix_telegram/formatter/from_telegram.py similarity index 53% rename from mautrix_telegram/formatter.py rename to mautrix_telegram/formatter/from_telegram.py index 1a21d68a..6dbf78b3 100644 --- a/mautrix_telegram/formatter.py +++ b/mautrix_telegram/formatter/from_telegram.py @@ -14,193 +14,19 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . -from html import escape, unescape -from html.parser import HTMLParser -from collections import deque -import re +from html import escape import logging +from telethon.tl.types import * from mautrix_appservice import MatrixRequestError -from telethon.tl.types import * +from .. import user as u, puppet as p +from ..db import Message as DBMessage -from . import user as u, puppet as p -from .db import Message as DBMessage - -log = logging.getLogger("mau.formatter") - -# TEXT LEN EXPLANATION: -# Telegram formatting counts two bytes in an UTF-16 string as one character. -# -# For Telegram -> Matrix formatting, we get the same counting mechanism by encoding the input -# text as UTF-16 Little Endian and doubling all the offsets and lengths given by Telegram. With -# those doubled values, we process the input entities and text. The text is converted back to -# native str format before it's inserted into the output HTML. -# -# For Matrix -> Telegram formatting, do the same input encoding, but divide the length by two -# instead of multiplying when generating the lengths and offsets of Telegram entities. -# -# The endianness doesn't matter, but it has to be specified to avoid the two BOM bits messing -# everything up. TEMP_ENC = "utf-16-le" +log = logging.getLogger("mau.fmt.tg") -# region Matrix to Telegram - - -class MatrixParser(HTMLParser): - mention_regex = re.compile("https://matrix.to/#/(@.+)") - - def __init__(self): - super().__init__() - self.text = "" - self.entities = [] - self._building_entities = {} - self._list_counter = 0 - self._open_tags = deque() - self._open_tags_meta = deque() - self._previous_ended_line = True - - def handle_starttag(self, tag, attrs): - self._open_tags.appendleft(tag) - self._open_tags_meta.appendleft(0) - attrs = dict(attrs) - entity_type = None - args = {} - if tag == "strong" or tag == "b": - entity_type = MessageEntityBold - elif tag == "em" or tag == "i": - entity_type = MessageEntityItalic - elif tag == "code": - try: - pre = self._building_entities["pre"] - try: - pre.language = attrs["class"][len("language-"):] - except KeyError: - pass - except KeyError: - entity_type = MessageEntityCode - elif tag == "pre": - entity_type = MessageEntityPre - args["language"] = "" - elif tag == "a": - try: - url = attrs["href"] - except KeyError: - return - mention = self.mention_regex.search(url) - if mention: - mxid = mention.group(1) - user = p.Puppet.get_by_mxid(mxid, create=False) - if not user: - user = u.User.get_by_mxid(mxid, create=False) - if not user: - return - if user.username: - entity_type = MessageEntityMention - url = f"@{user.username}" - else: - entity_type = MessageEntityMentionName - args["user_id"] = user.tgid - elif url.startswith("mailto:"): - url = url[len("mailto:"):] - entity_type = MessageEntityEmail - else: - if self.get_starttag_text() == url: - entity_type = MessageEntityUrl - else: - entity_type = MessageEntityTextUrl - args["url"] = url - url = None - self._open_tags_meta.popleft() - self._open_tags_meta.appendleft(url) - - if entity_type and tag not in self._building_entities: - # See "TEXT LEN EXPLANATION" near start of file - offset = int(len(self.text.encode(TEMP_ENC)) / 2) - self._building_entities[tag] = entity_type(offset=offset, length=0, **args) - - def _list_depth(self): - depth = 0 - for tag in self._open_tags: - if tag == "ol" or tag == "ul": - depth += 1 - return depth - - def handle_data(self, text): - text = unescape(text) - previous_tag = self._open_tags[0] if len(self._open_tags) > 0 else "" - list_format_offset = 0 - if previous_tag == "a": - url = self._open_tags_meta[0] - if url: - text = url - elif len(self._open_tags) > 1 and self._previous_ended_line and previous_tag == "li": - list_type = self._open_tags[1] - indent = (self._list_depth() - 1) * 4 * " " - text = text.strip("\n") - if len(text) == 0: - return - elif list_type == "ul": - text = f"{indent}* {text}" - list_format_offset = len(indent) + 2 - elif list_type == "ol": - n = self._open_tags_meta[1] - n += 1 - self._open_tags_meta[1] = n - text = f"{indent}{n}. {text}" - list_format_offset = len(indent) + 3 - for tag, entity in self._building_entities.items(): - # See "TEXT LEN EXPLANATION" near start of file - entity.length += int(len(text.strip("\n").encode(TEMP_ENC)) / 2) - entity.offset += list_format_offset - - if text.endswith("\n"): - self._previous_ended_line = True - else: - self._previous_ended_line = False - - self.text += text - - def handle_endtag(self, tag): - try: - self._open_tags.popleft() - self._open_tags_meta.popleft() - except IndexError: - pass - if (tag == "ul" or tag == "ol") and self.text.endswith("\n"): - self.text = self.text[:-1] - entity = self._building_entities.pop(tag, None) - if entity: - self.entities.append(entity) - - -def matrix_to_telegram(html): - try: - parser = MatrixParser() - parser.feed(html) - return parser.text, parser.entities - except Exception: - log.exception("Failed to convert Matrix format:\nhtml=%s", html) - - -def matrix_reply_to_telegram(content, tg_space, room_id=None): - try: - reply = content["m.relates_to"]["m.in_reply_to"] - room_id = room_id or reply["room_id"] - event_id = reply["event_id"] - message = DBMessage.query.filter(DBMessage.mxid == event_id, - DBMessage.tg_space == tg_space, - DBMessage.mx_room == room_id).one_or_none() - if message: - return message.tgid - except KeyError: - pass - return None - - -# endregion -# region Telegram to Matrix def telegram_reply_to_matrix(evt, source): if evt.reply_to_msg_id: @@ -218,10 +44,10 @@ def telegram_reply_to_matrix(evt, source): return {} -async def telegram_event_to_matrix(evt, source, native_replies=False, message_link_in_reply=False, - main_intent=None, reply_text="Reply"): +async def telegram_to_matrix(evt, source, native_replies=False, message_link_in_reply=False, + main_intent=None, reply_text="Reply"): text = evt.message - html = telegram_to_matrix(evt.message, evt.entities) if evt.entities else None + html = _telegram_entities_to_matrix_catch(evt.message, evt.entities) if evt.entities else None relates_to = {} if evt.fwd_from: @@ -293,9 +119,9 @@ async def telegram_event_to_matrix(evt, source, native_replies=False, message_li return text, html, relates_to -def telegram_to_matrix(text, entities): +def _telegram_entities_to_matrix_catch(text, entities): try: - return _telegram_to_matrix(text, entities) + return _telegram_entities_to_matrix(text, entities) except Exception: log.exception("Failed to convert Telegram format:\n" "message=%s\n" @@ -303,7 +129,7 @@ def telegram_to_matrix(text, entities): text, entities) -def _telegram_to_matrix(text, entities): +def _telegram_entities_to_matrix(text, entities): if not entities: return text # See "TEXT LEN EXPLANATION" near start of file @@ -376,5 +202,3 @@ def _telegram_to_matrix(text, entities): html.append(text[last_offset:].decode(TEMP_ENC)) return "".join(html) - -# endregion diff --git a/mautrix_telegram/portal.py b/mautrix_telegram/portal.py index 88ead3c8..eccdf091 100644 --- a/mautrix_telegram/portal.py +++ b/mautrix_telegram/portal.py @@ -842,7 +842,7 @@ class Portal: async def handle_telegram_text(self, source, intent, evt): self.log.debug(f"Sending {evt.message} to {self.mxid} by {intent.mxid}") - text, html, relates_to = await formatter.telegram_event_to_matrix( + text, html, relates_to = await formatter.telegram_to_matrix( evt, source, config["bridge.native_replies"], config["bridge.link_in_reply"], @@ -870,7 +870,7 @@ class Portal: return evt.reply_to_msg_id = evt.id - text, html, relates_to = await formatter.telegram_event_to_matrix( + text, html, relates_to = await formatter.telegram_to_matrix( evt, source, config["bridge.native_replies"], config["bridge.link_in_reply"],