diff --git a/mautrix_telegram/formatter/from_matrix/__init__.py b/mautrix_telegram/formatter/from_matrix/__init__.py new file mode 100644 index 00000000..99a9a70a --- /dev/null +++ b/mautrix_telegram/formatter/from_matrix/__init__.py @@ -0,0 +1,156 @@ +# -*- coding: future_fstrings -*- +# mautrix-telegram - A Matrix-Telegram puppeting bridge +# Copyright (C) 2018 Tulir Asokan +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +from typing import Optional, List, Tuple, Callable, Pattern, Match, TYPE_CHECKING +import re +import logging + +from telethon.tl.types import (MessageEntityMention, MessageEntityMentionName, MessageEntityItalic, + TypeMessageEntity) + +from ... import puppet as pu +from ...db import Message as DBMessage +from ..util import (add_surrogates, remove_surrogates, trim_reply_fallback_html, + trim_reply_fallback_text) +from .parser_common import ParsedMessage + +try: + from mautrix_telegram.formatter.from_matrix.parser_lxml import parse_html +except ImportError: + from mautrix_telegram.formatter.from_matrix.parser_htmlparser import parse_html + +if TYPE_CHECKING: + from ...context import Context + +log = logging.getLogger("mau.fmt.mx") # type: logging.Logger +should_bridge_plaintext_highlights = False # type: bool + +command_regex = re.compile(r"^!([A-Za-z0-9@]+)") # type: Pattern +not_command_regex = re.compile(r"^\\(![A-Za-z0-9@]+)") # type: Pattern +plain_mention_regex = None # type: Pattern + + +def plain_mention_to_html(match: Match) -> str: + puppet = pu.Puppet.find_by_displayname(match.group(2)) + if puppet: + return (f"{match.group(1)}" + f"" + f"{puppet.displayname}" + "") + return "".join(match.groups()) + + +def cut_long_message(message: str, entities: List[TypeMessageEntity]) -> ParsedMessage: + if len(message) > 4096: + message = message[0:4082] + " [message cut]" + new_entities = [] + for entity in entities: + if entity.offset > 4082: + continue + if entity.offset + entity.length > 4082: + entity.length = 4082 - entity.offset + new_entities.append(entity) + new_entities.append(MessageEntityItalic(4082, len(" [message cut]"))) + entities = new_entities + return message, entities + + +class FormatError(Exception): + pass + + +def matrix_to_telegram(html: str) -> ParsedMessage: + try: + html = command_regex.sub(r"\1", html) + html = html.replace("\t", " " * 4) + html = not_command_regex.sub(r"\1", html) + if should_bridge_plaintext_highlights: + html = plain_mention_regex.sub(plain_mention_to_html, html) + + html = add_surrogates(html) + text, entities = parse_html(add_surrogates(html)) + text = remove_surrogates(text.strip()) + text, entities = cut_long_message(text, entities) + + return text, entities + except Exception as e: + raise FormatError(f"Failed to convert Matrix format: {html}") from e + + +def matrix_reply_to_telegram(content: dict, tg_space: int, room_id: Optional[str] = None + ) -> Optional[int]: + try: + reply = content["m.relates_to"]["m.in_reply_to"] + room_id = room_id or reply["room_id"] + event_id = reply["event_id"] + + try: + if content["format"] == "org.matrix.custom.html": + content["formatted_body"] = trim_reply_fallback_html(content["formatted_body"]) + except KeyError: + pass + content["body"] = trim_reply_fallback_text(content["body"]) + + message = DBMessage.query.filter(DBMessage.mxid == event_id, + DBMessage.tg_space == tg_space, + DBMessage.mx_room == room_id).one_or_none() + if message: + return message.tgid + except KeyError: + pass + return None + + +def matrix_text_to_telegram(text: str) -> ParsedMessage: + text = command_regex.sub(r"/\1", text) + text = text.replace("\t", " " * 4) + text = not_command_regex.sub(r"\1", text) + if should_bridge_plaintext_highlights: + entities, pmr_replacer = plain_mention_to_text() + text = plain_mention_regex.sub(pmr_replacer, text) + else: + entities = [] + return text, entities + + +def plain_mention_to_text() -> Tuple[List[TypeMessageEntity], Callable[[str], str]]: + entities = [] + + def replacer(match) -> str: + puppet = pu.Puppet.find_by_displayname(match.group(2)) + if puppet: + offset = match.start() + length = match.end() - offset + if puppet.username: + entity = MessageEntityMention(offset, length) + text = f"@{puppet.username}" + else: + entity = MessageEntityMentionName(offset, length, user_id=puppet.tgid) + text = puppet.displayname + entities.append(entity) + return text + return "".join(match.groups()) + + return entities, replacer + + +def init_mx(context: "Context"): + global plain_mention_regex, should_bridge_plaintext_highlights + config = context.config + dn_template = config.get("bridge.displayname_template", "{displayname} (Telegram)") + dn_template = re.escape(dn_template).replace(re.escape("{displayname}"), "[^>]+") + plain_mention_regex = re.compile(f"(\s|^)({dn_template})") + should_bridge_plaintext_highlights = config["bridge.plaintext_highlights"] or False diff --git a/mautrix_telegram/formatter/from_matrix/parser_common.py b/mautrix_telegram/formatter/from_matrix/parser_common.py new file mode 100644 index 00000000..6db8dc3a --- /dev/null +++ b/mautrix_telegram/formatter/from_matrix/parser_common.py @@ -0,0 +1,31 @@ +# -*- coding: future_fstrings -*- +# mautrix-telegram - A Matrix-Telegram puppeting bridge +# Copyright (C) 2018 Tulir Asokan +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +import re +from typing import List, Tuple, Pattern +from telethon.tl.types import TypeMessageEntity + + +class MatrixParserCommon: + mention_regex = re.compile("https://matrix.to/#/(@.+:.+)") # type: Pattern + room_regex = re.compile("https://matrix.to/#/(#.+:.+)") # type: Pattern + block_tags = ("br", "p", "pre", "blockquote", + "ol", "ul", "li", + "h1", "h2", "h3", "h4", "h5", "h6", + "div", "hr", "table") # type: Tuple[str, ...] + + +ParsedMessage = Tuple[str, List[TypeMessageEntity]] diff --git a/mautrix_telegram/formatter/from_matrix.py b/mautrix_telegram/formatter/from_matrix/parser_htmlparser.py similarity index 63% rename from mautrix_telegram/formatter/from_matrix.py rename to mautrix_telegram/formatter/from_matrix/parser_htmlparser.py index 6619ef02..4de0cba1 100644 --- a/mautrix_telegram/formatter/from_matrix.py +++ b/mautrix_telegram/formatter/from_matrix/parser_htmlparser.py @@ -14,41 +14,31 @@ # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -from typing import (Optional, List, Tuple, Type, Callable, Dict, Any, Pattern, Deque, Match, TYPE_CHECKING) +from typing import (Optional, List, Tuple, Type, Dict, Any, Deque, Match) from html import unescape from html.parser import HTMLParser from collections import deque import math -import re -import logging from telethon.tl.types import (MessageEntityMention, MessageEntityMentionName, MessageEntityEmail, MessageEntityUrl, MessageEntityTextUrl, MessageEntityBold, MessageEntityItalic, MessageEntityCode, MessageEntityPre, MessageEntityBotCommand, TypeMessageEntity) -from .. import user as u, puppet as pu, portal as po -from ..db import Message as DBMessage -from .util import (add_surrogates, remove_surrogates, trim_reply_fallback_html, - trim_reply_fallback_text, html_to_unicode) - -if TYPE_CHECKING: - from ..context import Context - -log = logging.getLogger("mau.fmt.mx") # type: logging.Logger -should_bridge_plaintext_highlights = False # type: bool +from ... import user as u, puppet as pu, portal as po +from ..util import html_to_unicode +from .parser_common import MatrixParserCommon, ParsedMessage -class MatrixParser(HTMLParser): - mention_regex = re.compile("https://matrix.to/#/(@.+:.+)") # type: Pattern - room_regex = re.compile("https://matrix.to/#/(#.+:.+)") # type: Pattern - block_tags = ("br", "p", "pre", "blockquote", - "ol", "ul", "li", - "h1", "h2", "h3", "h4", "h5", "h6", - "div", "hr", "table") # type: Tuple[str, ...] +def parse_html(html: str) -> ParsedMessage: + parser = MatrixParser() + parser.feed(html) + return parser.text, parser.entities + +class MatrixParser(HTMLParser, MatrixParserCommon): def __init__(self): - super().__init__() + super(MatrixParser, self).__init__() self.text = "" # type: str self.entities = [] # type: List[TypeMessageEntity] self._building_entities = {} # type: Dict[str, TypeMessageEntity] @@ -244,120 +234,3 @@ class MatrixParser(HTMLParser): if tag in self.block_tags and tag != "br" and "blockquote" not in self._open_tags: self._newline(allow_multi=tag == "br") - - -command_regex = re.compile(r"^!([A-Za-z0-9@]+)") # type: Pattern -not_command_regex = re.compile(r"^\\(![A-Za-z0-9@]+)") # type: Pattern -plain_mention_regex = None # type: Pattern - - -def plain_mention_to_html(match: Match) -> str: - puppet = pu.Puppet.find_by_displayname(match.group(2)) - if puppet: - return (f"{match.group(1)}" - f"" - f"{puppet.displayname}" - "") - return "".join(match.groups()) - - -def cut_long_message(message: str, entities: List[TypeMessageEntity] - ) -> Tuple[str, List[TypeMessageEntity]]: - if len(message) > 4096: - message = message[0:4082] + " [message cut]" - new_entities = [] - for entity in entities: - if entity.offset > 4082: - continue - if entity.offset + entity.length > 4082: - entity.length = 4082 - entity.offset - new_entities.append(entity) - new_entities.append(MessageEntityItalic(4082, len(" [message cut]"))) - entities = new_entities - return message, entities - - -def matrix_to_telegram(html: str) -> Tuple[str, List[TypeMessageEntity]]: - try: - parser = MatrixParser() - html = command_regex.sub(r"\1", html) - html = html.replace("\t", " " * 4) - html = not_command_regex.sub(r"\1", html) - if should_bridge_plaintext_highlights: - html = plain_mention_regex.sub(plain_mention_to_html, html) - parser.feed(add_surrogates(html)) - - message_text = remove_surrogates(parser.text.strip()) - message_entities = parser.entities - - message_text, message_entities = cut_long_message(message_text, message_entities) - - return message_text, message_entities - except Exception: - log.exception("Failed to convert Matrix format:\nhtml=%s", html) - - -def matrix_reply_to_telegram(content: dict, tg_space: int, room_id: Optional[str] = None - ) -> Optional[int]: - try: - reply = content["m.relates_to"]["m.in_reply_to"] - room_id = room_id or reply["room_id"] - event_id = reply["event_id"] - - try: - if content["format"] == "org.matrix.custom.html": - content["formatted_body"] = trim_reply_fallback_html(content["formatted_body"]) - except KeyError: - pass - content["body"] = trim_reply_fallback_text(content["body"]) - - message = DBMessage.query.filter(DBMessage.mxid == event_id, - DBMessage.tg_space == tg_space, - DBMessage.mx_room == room_id).one_or_none() - if message: - return message.tgid - except KeyError: - pass - return None - - -def matrix_text_to_telegram(text: str) -> Tuple[str, List[TypeMessageEntity]]: - text = command_regex.sub(r"/\1", text) - text = text.replace("\t", " " * 4) - text = not_command_regex.sub(r"\1", text) - if should_bridge_plaintext_highlights: - entities, pmr_replacer = plain_mention_to_text() - text = plain_mention_regex.sub(pmr_replacer, text) - else: - entities = [] - return text, entities - - -def plain_mention_to_text() -> Tuple[List[TypeMessageEntity], Callable[[str], str]]: - entities = [] - - def replacer(match): - puppet = pu.Puppet.find_by_displayname(match.group(2)) - if puppet: - offset = match.start() - length = match.end() - offset - if puppet.username: - entity = MessageEntityMention(offset, length) - text = f"@{puppet.username}" - else: - entity = MessageEntityMentionName(offset, length, user_id=puppet.tgid) - text = puppet.displayname - entities.append(entity) - return text - return "".join(match.groups()) - - return entities, replacer - - -def init_mx(context: "Context"): - global plain_mention_regex, should_bridge_plaintext_highlights - config = context.config - dn_template = config.get("bridge.displayname_template", "{displayname} (Telegram)") - dn_template = re.escape(dn_template).replace(re.escape("{displayname}"), "[^>]+") - plain_mention_regex = re.compile(f"(\s|^)({dn_template})") - should_bridge_plaintext_highlights = config["bridge.plaintext_highlights"] or False diff --git a/mautrix_telegram/formatter/from_matrix/parser_lxml.py b/mautrix_telegram/formatter/from_matrix/parser_lxml.py new file mode 100644 index 00000000..41b135ff --- /dev/null +++ b/mautrix_telegram/formatter/from_matrix/parser_lxml.py @@ -0,0 +1,337 @@ +# -*- coding: future_fstrings -*- +# mautrix-telegram - A Matrix-Telegram puppeting bridge +# Copyright (C) 2018 Tulir Asokan +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +from typing import Optional, List, Tuple, Union, Callable +from lxml import html + +from telethon.tl.types import (MessageEntityMention as Mention, + MessageEntityMentionName as MentionName, MessageEntityEmail as Email, + MessageEntityUrl as URL, MessageEntityTextUrl as TextURL, + MessageEntityBold as Bold, MessageEntityItalic as Italic, + MessageEntityCode as Code, MessageEntityPre as Pre, + MessageEntityBotCommand as Command, TypeMessageEntity, + InputMessageEntityMentionName as InputMentionName) + +from ... import user as u, puppet as pu, portal as po +from ..util import html_to_unicode +from .parser_common import MatrixParserCommon, ParsedMessage + + +def parse_html(html: str) -> ParsedMessage: + return MatrixParser.parse(html) + + +class Entity: + @staticmethod + def copy(entity: TypeMessageEntity) -> TypeMessageEntity: + kwargs = { + "offset": entity.offset, + "length": entity.length, + } + if isinstance(entity, Pre): + kwargs["language"] = entity.language + elif isinstance(entity, TextURL): + kwargs["url"] = entity.url + elif isinstance(entity, (MentionName, InputMentionName)): + kwargs["user_id"] = entity.user_id + return entity.__class__(**kwargs) + + @classmethod + def adjust(cls, entity: Union[TypeMessageEntity, List[TypeMessageEntity]], + func: Callable[[TypeMessageEntity], None] + ) -> Union[TypeMessageEntity, List[TypeMessageEntity]]: + if isinstance(entity, list): + return [Entity.adjust(element, func) for element in entity] + entity = cls.copy(entity) + func(entity) + if entity.offset < 0: + entity.length += entity.offset + entity.offset = 0 + return entity + + +def offset_diff(amount: int): + def func(entity: TypeMessageEntity): + entity.offset += amount + + return func + + +def offset_length_multiply(amount: int): + def func(entity: TypeMessageEntity): + entity.offset *= amount + entity.length *= amount + + return func + + +class TelegramMessage: + def __init__(self, text: str = "", entities: Optional[List[TypeMessageEntity]] = None): + self.text = text # type: str + self.entities = entities or [] # type: List[TypeMessageEntity] + + def offset_entities(self, offset: int) -> "TelegramMessage": + def apply_offset(entity: TypeMessageEntity, inner_offset: int): + entity = Entity.copy(entity) + entity.offset += inner_offset + if entity.offset < 0: + entity.offset = 0 + elif entity.offset > len(self.text): + return None + elif entity.offset + entity.length > len(self.text): + entity.length = len(self.text) - entity.offset + return entity + + self.entities = [apply_offset(entity, offset) for entity in self.entities if entity] + return self + + def append(self, *args: Union[str, "TelegramMessage"]) -> "TelegramMessage": + for msg in args: + if isinstance(msg, str): + msg = TelegramMessage(text=msg) + self.entities += Entity.adjust(msg.entities, offset_diff(len(self.text))) + self.text += msg.text + return self + + def prepend(self, *args: Union[str, "TelegramMessage"]) -> "TelegramMessage": + for msg in args: + if isinstance(msg, str): + msg = TelegramMessage(text=msg) + self.entities = msg.entities + Entity.adjust(self.entities, offset_diff(len(self.text))) + self.text = msg.text + self.text + return self + + def format(self, entity_type: type(TypeMessageEntity), offset: int = None, length: int = None, + **kwargs) -> "TelegramMessage": + self.entities.append(entity_type(offset=offset or 0, + length=length if length is not None else len(self.text), + **kwargs)) + return self + + def concat(self, *args: Union[str, "TelegramMessage"]) -> "TelegramMessage": + return TelegramMessage().append(self, *args) + + def trim(self) -> "TelegramMessage": + orig_len = len(self.text) + self.text = self.text.lstrip() + diff = orig_len - len(self.text) + self.text = self.text.rstrip() + self.offset_entities(-diff) + return self + + def split(self, separator, max_items: int = 0) -> List["TelegramMessage"]: + text_parts = self.text.split(separator, max_items - 1) + output = [] # type: List[TelegramMessage] + + offset = 0 + for part in text_parts: + msg = TelegramMessage(part) + for entity in self.entities: + start_in_range = len(part) > entity.offset - offset >= 0 + end_in_range = len(part) >= entity.offset - offset + entity.length > 0 + if start_in_range and end_in_range: + msg.entities.append(Entity.adjust(entity, offset_diff(-offset))) + output.append(msg) + + offset += len(part) + offset += len(separator) + + return output + + @staticmethod + def join(items: List[Union[str, "TelegramMessage"]], separator: str = " ") -> "TelegramMessage": + main = TelegramMessage() + for msg in items: + if isinstance(msg, str): + msg = TelegramMessage(text=msg) + main.entities += Entity.adjust(msg.entities, offset_diff(len(main.text))) + main.text += msg.text + separator + main.text = main.text[:-len(separator)] + return main + + +class MatrixParser(MatrixParserCommon): + @classmethod + def list_to_tmessage(cls, node: html.HtmlElement, strip_linebreaks) -> TelegramMessage: + ordered = node.tag == "ol" + tagged_children = cls.node_to_tagged_tmessages(node, strip_linebreaks) + counter = 1 + indent_length = 0 + if ordered: + try: + counter = int(node.attrib.get("start", "1")) + except ValueError: + counter = 1 + + longest_index = counter - 1 + len(tagged_children) + indent_length = len(str(longest_index)) + indent = (indent_length + 4) * " " + children = [] # type: List[TelegramMessage] + for child, tag in tagged_children: + if tag != "li": + continue + + if ordered: + prefix = f"{counter}. " + counter += 1 + else: + prefix = "● " + child = child.prepend(prefix) + parts = child.split("\n") + parts = parts[:1] + [part.prepend(indent) for part in parts[1:]] + child = TelegramMessage.join(parts, "\n") + children.append(child) + return TelegramMessage.join(children, "\n") + + @classmethod + def blockquote_to_tmessage(cls, node: html.HtmlElement, strip_linebreaks) -> TelegramMessage: + msg = cls.tag_aware_parse_node(node, strip_linebreaks) + children = msg.trim().split("\n") + children = [child.prepend("> ") for child in children] + return TelegramMessage.join(children, "\n") + + @classmethod + def header_to_tmessage(cls, node: html.HtmlElement, strip_linebreaks) -> TelegramMessage: + children = cls.node_to_tmessages(node, strip_linebreaks) + length = int(node.tag[1]) + prefix = "#" * length + " " + return TelegramMessage.join(children, "").prepend(prefix) + + @classmethod + def basic_format_to_tmessage(cls, node: html.HtmlElement, strip_linebreaks) -> TelegramMessage: + msg = cls.tag_aware_parse_node(node, strip_linebreaks) + if node.tag in ("b", "strong"): + msg.format(Bold) + elif node.tag in ("i", "em"): + msg.format(Italic) + elif node.tag == "command": + msg.format(Command) + elif node.tag in ("s", "del"): + msg.text = html_to_unicode(msg.text, "\u0336") + elif node.tag in ("u", "ins"): + msg.text = html_to_unicode(msg.text, "\u0332") + + if node.tag in ("s", "del", "u", "ins"): + msg.entities = Entity.adjust(msg.entities, offset_length_multiply(2)) + + return msg + + @classmethod + def link_to_tstring(cls, node: html.HtmlElement, strip_linebreaks) -> TelegramMessage: + msg = cls.tag_aware_parse_node(node, strip_linebreaks) + href = node.attrib.get("href", "") + if not href: + return msg + + if href.startswith("mailto:"): + return TelegramMessage(href[len("mailto:"):]).format(Email) + + mention = cls.mention_regex.match(href) + if mention: + mxid = mention.group(1) + user = (pu.Puppet.get_by_mxid(mxid) + or u.User.get_by_mxid(mxid, create=False)) + if not user: + return msg + if user.username: + return TelegramMessage(f"@{user.username}").format(Mention) + elif user.tgid: + return TelegramMessage(user.displayname or msg.text).format(MentionName, + user_id=user.tgid) + return msg + + room = cls.room_regex.match(href) + if room: + username = po.Portal.get_username_from_mx_alias(room.group(1)) + portal = po.Portal.find_by_username(username) + if portal and portal.username: + return TelegramMessage(f"@{portal.username}").format(Mention) + + return (msg.format(URL) + if msg.text == href + else msg.format(TextURL, url=href)) + + @classmethod + def node_to_tmessage(cls, node: html.HtmlElement, strip_linebreaks) -> TelegramMessage: + if node.tag == "blockquote": + return cls.blockquote_to_tmessage(node, strip_linebreaks) + elif node.tag in ("ol", "ul"): + return cls.list_to_tmessage(node, strip_linebreaks) + elif node.tag in ("h1", "h2", "h3", "h4", "h5", "h6"): + return cls.header_to_tmessage(node, strip_linebreaks) + elif node.tag == "br": + return TelegramMessage("\n") + elif node.tag in ("b", "strong", "i", "em", "s", "del", "u", "ins", "command"): + return cls.basic_format_to_tmessage(node, strip_linebreaks) + elif node.tag == "a": + return cls.link_to_tstring(node, strip_linebreaks) + elif node.tag == "p": + return cls.tag_aware_parse_node(node, strip_linebreaks).append("\n") + elif node.tag == "pre": + lang = "" + try: + if node[0].tag == "code": + lang = node[0].attrib["class"][len("language-"):] + node = node[0] + except (IndexError, KeyError): + pass + return cls.parse_node(node, strip_linebreaks=False).format(Pre, language=lang) + elif node.tag == "code": + return cls.parse_node(node, strip_linebreaks=False).format(Code) + return cls.tag_aware_parse_node(node, strip_linebreaks) + + @staticmethod + def text_to_tmessage(text: str, strip_linebreaks: bool = True) -> TelegramMessage: + if strip_linebreaks: + text = text.replace("\n", "") + return TelegramMessage(text) + + @classmethod + def node_to_tagged_tmessages(cls, node: html.HtmlElement, strip_linebreaks: bool = True + ) -> List[Tuple[TelegramMessage, str]]: + output = [] + + if node.text: + output.append((cls.text_to_tmessage(node.text, strip_linebreaks), "text")) + for child in node: + output.append((cls.node_to_tmessage(child, strip_linebreaks), child.tag)) + if child.tail: + output.append((cls.text_to_tmessage(child.tail, strip_linebreaks), "text")) + return output + + @classmethod + def node_to_tmessages(cls, node: html.HtmlElement, strip_linebreaks) -> List[TelegramMessage]: + return [msg for (msg, tag) in cls.node_to_tagged_tmessages(node, strip_linebreaks)] + + @classmethod + def tag_aware_parse_node(cls, node: html.HtmlElement, strip_linebreaks) -> TelegramMessage: + msgs = cls.node_to_tagged_tmessages(node, strip_linebreaks) + output = TelegramMessage() + for msg, tag in msgs: + if tag in cls.block_tags: + msg = msg.append("\n").prepend("\n") + output = output.append(msg) + return output.trim() + + @classmethod + def parse_node(cls, node: html.HtmlElement, strip_linebreaks) -> TelegramMessage: + return TelegramMessage.join(cls.node_to_tmessages(node, strip_linebreaks)) + + @classmethod + def parse(cls, data: str) -> ParsedMessage: + document = html.fromstring(f"{data}") + msg = cls.parse_node(document, strip_linebreaks=True) + return msg.text, msg.entities diff --git a/mautrix_telegram/portal.py b/mautrix_telegram/portal.py index a4f80776..a0488b91 100644 --- a/mautrix_telegram/portal.py +++ b/mautrix_telegram/portal.py @@ -789,9 +789,9 @@ class Portal: def _matrix_event_to_entities(event: dict) -> Tuple[str, Optional[List[TypeMessageEntity]]]: try: if event.get("format", None) == "org.matrix.custom.html": - message, entities = formatter.matrix_to_telegram(event["formatted_body"]) + message, entities = formatter.matrix_to_telegram(event.get("formatted_body", "")) else: - message, entities = formatter.matrix_text_to_telegram(event["body"]) + message, entities = formatter.matrix_text_to_telegram(event.get("body", "")) except KeyError: message, entities = None, None return message, entities diff --git a/setup.py b/setup.py index c2d72e38..05276bb5 100644 --- a/setup.py +++ b/setup.py @@ -4,11 +4,12 @@ import mautrix_telegram extras = { "highlight_edits": ["lxml>=4.1.1,<5"], + "better_formatter": ["lxml>=4.1.1,<5"], "fast_crypto": ["cryptg>=0.1,<0.2"], "webp_convert": ["Pillow>=5.0.0,<6"], "hq_thumbnails": ["moviepy>=0.2,<0.3"], } -extras["all"] = [deps[0] for deps in extras.values()] +extras["all"] = list(set(deps[0] for deps in extras.values())) setuptools.setup( name="mautrix-telegram",