From c39d24ccdc917b4da6d6831a6f4e3b03f2a937e2 Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Wed, 28 Nov 2018 02:25:28 +0200 Subject: [PATCH] Add HTMLParser compatibility to recursive Matrix parser and remove old parser --- .../formatter/from_matrix/__init__.py | 7 +- .../formatter/from_matrix/html_reader.py | 4 + .../formatter/from_matrix/html_reader.pyi | 11 + .../from_matrix/html_reader_htmlparser.py | 58 +++++ .../{parser_common.py => html_reader_lxml.py} | 23 +- .../from_matrix/{parser_lxml.py => parser.py} | 52 ++-- .../from_matrix/parser_htmlparser.py | 241 ------------------ .../formatter/from_matrix/telegram_message.py | 3 +- 8 files changed, 114 insertions(+), 285 deletions(-) create mode 100644 mautrix_telegram/formatter/from_matrix/html_reader.py create mode 100644 mautrix_telegram/formatter/from_matrix/html_reader.pyi create mode 100644 mautrix_telegram/formatter/from_matrix/html_reader_htmlparser.py rename mautrix_telegram/formatter/from_matrix/{parser_common.py => html_reader_lxml.py} (51%) rename mautrix_telegram/formatter/from_matrix/{parser_lxml.py => parser.py} (83%) delete mode 100644 mautrix_telegram/formatter/from_matrix/parser_htmlparser.py diff --git a/mautrix_telegram/formatter/from_matrix/__init__.py b/mautrix_telegram/formatter/from_matrix/__init__.py index 206165cf..e9861b44 100644 --- a/mautrix_telegram/formatter/from_matrix/__init__.py +++ b/mautrix_telegram/formatter/from_matrix/__init__.py @@ -26,12 +26,7 @@ from ...types import TelegramID, MatrixRoomID from ...db import Message as DBMessage from ..util import (add_surrogates, remove_surrogates, trim_reply_fallback_html, trim_reply_fallback_text) -from .parser_common import ParsedMessage - -try: - from mautrix_telegram.formatter.from_matrix.parser_lxml import parse_html -except ImportError: - from mautrix_telegram.formatter.from_matrix.parser_htmlparser import parse_html +from .parser import ParsedMessage, parse_html if TYPE_CHECKING: from ...context import Context diff --git a/mautrix_telegram/formatter/from_matrix/html_reader.py b/mautrix_telegram/formatter/from_matrix/html_reader.py new file mode 100644 index 00000000..f1e4b178 --- /dev/null +++ b/mautrix_telegram/formatter/from_matrix/html_reader.py @@ -0,0 +1,4 @@ +try: + from .html_reader_lxml import HTMLNode, read_html +except ImportError: + from .html_reader_htmlparser import HTMLNode, read_html diff --git a/mautrix_telegram/formatter/from_matrix/html_reader.pyi b/mautrix_telegram/formatter/from_matrix/html_reader.pyi new file mode 100644 index 00000000..d292ff3c --- /dev/null +++ b/mautrix_telegram/formatter/from_matrix/html_reader.pyi @@ -0,0 +1,11 @@ +from typing import Dict, List + + +class HTMLNode(List['HTMLNode']): + tag: str + text: str + tail: str + attrib: Dict[str, str] + + +def read_html(data: str) -> HTMLNode: ... diff --git a/mautrix_telegram/formatter/from_matrix/html_reader_htmlparser.py b/mautrix_telegram/formatter/from_matrix/html_reader_htmlparser.py new file mode 100644 index 00000000..9ac10cf2 --- /dev/null +++ b/mautrix_telegram/formatter/from_matrix/html_reader_htmlparser.py @@ -0,0 +1,58 @@ +# -*- coding: future_fstrings -*- +# mautrix-telegram - A Matrix-Telegram puppeting bridge +# Copyright (C) 2018 Tulir Asokan +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +from typing import Dict, List, Tuple + +from html.parser import HTMLParser + + +class HTMLNode(list): + def __init__(self, tag: str, attrs: List[Tuple[str, str]]): + super().__init__() + self.tag = tag # type: str + self.text = "" # type: str + self.tail = "" # type: str + self.attrib = dict(attrs) # type: Dict[str, str] + + +class NodeifyingParser(HTMLParser): + def __init__(self): + super().__init__() + self.stack = [HTMLNode("html", [])] # type: List[HTMLNode] + + def handle_starttag(self, tag, attrs): + node = HTMLNode(tag, attrs) + self.stack[-1].append(node) + self.stack.append(node) + + def handle_endtag(self, tag): + if tag == self.stack[-1].tag: + self.stack.pop() + + def handle_data(self, data): + if len(self.stack[-1]) > 0: + self.stack[-1][-1].tail += data + else: + self.stack[-1].text += data + + def error(self, message): + pass + + +def read_html(data: str) -> HTMLNode: + parser = NodeifyingParser() + parser.feed(data) + return parser.stack[0] diff --git a/mautrix_telegram/formatter/from_matrix/parser_common.py b/mautrix_telegram/formatter/from_matrix/html_reader_lxml.py similarity index 51% rename from mautrix_telegram/formatter/from_matrix/parser_common.py rename to mautrix_telegram/formatter/from_matrix/html_reader_lxml.py index 9b04b026..36d6d56e 100644 --- a/mautrix_telegram/formatter/from_matrix/parser_common.py +++ b/mautrix_telegram/formatter/from_matrix/html_reader_lxml.py @@ -14,23 +14,10 @@ # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -import re -from typing import List, Tuple, Pattern -from telethon.tl.types import TypeMessageEntity +from lxml import html + +HTMLNode = html.HtmlElement -class MatrixParserCommon: - mention_regex = re.compile("https://matrix.to/#/(@.+:.+)") # type: Pattern - room_regex = re.compile("https://matrix.to/#/(#.+:.+)") # type: Pattern - block_tags = ("p", "pre", "blockquote", - "ol", "ul", "li", - "h1", "h2", "h3", "h4", "h5", "h6", - "div", "hr", "table") # type: Tuple[str, ...] - list_bullets = ("●", "○", "■", "‣") # type: Tuple[str, ...] - - @classmethod - def list_bullet(cls, depth: int) -> str: - return cls.list_bullets[(depth - 1) % len(cls.list_bullets)] + " " - - -ParsedMessage = Tuple[str, List[TypeMessageEntity]] +def read_html(data: str) -> HTMLNode: + return html.fromstring(data) diff --git a/mautrix_telegram/formatter/from_matrix/parser_lxml.py b/mautrix_telegram/formatter/from_matrix/parser.py similarity index 83% rename from mautrix_telegram/formatter/from_matrix/parser_lxml.py rename to mautrix_telegram/formatter/from_matrix/parser.py index 70057245..e53c816b 100644 --- a/mautrix_telegram/formatter/from_matrix/parser_lxml.py +++ b/mautrix_telegram/formatter/from_matrix/parser.py @@ -14,21 +14,26 @@ # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -from typing import List, Tuple -from lxml import html +from typing import List, Tuple, Pattern +import re from telethon.tl.types import (MessageEntityMention as Mention, MessageEntityBotCommand as Command, MessageEntityMentionName as MentionName, MessageEntityEmail as Email, MessageEntityUrl as URL, MessageEntityTextUrl as TextURL, MessageEntityBold as Bold, MessageEntityItalic as Italic, - MessageEntityCode as Code, MessageEntityPre as Pre) + MessageEntityCode as Code, MessageEntityPre as Pre, + TypeMessageEntity) from ... import user as u, puppet as pu, portal as po from ...types import MatrixUserID from ..util import html_to_unicode -from .parser_common import MatrixParserCommon, ParsedMessage from .telegram_message import TelegramMessage, Entity, offset_length_multiply +from .html_reader import HTMLNode, read_html + + +ParsedMessage = Tuple[str, List[TypeMessageEntity]] + def parse_html(input_html: str) -> ParsedMessage: return MatrixParser.parse(input_html) @@ -52,9 +57,21 @@ class RecursionContext: return RecursionContext(strip_linebreaks=False, ul_depth=self.ul_depth) -class MatrixParser(MatrixParserCommon): +class MatrixParser: + mention_regex = re.compile("https://matrix.to/#/(@.+:.+)") # type: Pattern + room_regex = re.compile("https://matrix.to/#/(#.+:.+)") # type: Pattern + block_tags = ("p", "pre", "blockquote", + "ol", "ul", "li", + "h1", "h2", "h3", "h4", "h5", "h6", + "div", "hr", "table") # type: Tuple[str, ...] + list_bullets = ("●", "○", "■", "‣") # type: Tuple[str, ...] + @classmethod - def list_to_tmessage(cls, node: html.HtmlElement, ctx: RecursionContext) -> TelegramMessage: + def list_bullet(cls, depth: int) -> str: + return cls.list_bullets[(depth - 1) % len(cls.list_bullets)] + " " + + @classmethod + def list_to_tmessage(cls, node: HTMLNode, ctx: RecursionContext) -> TelegramMessage: ordered = node.tag == "ol" tagged_children = cls.node_to_tagged_tmessages(node, ctx) counter = 1 @@ -86,23 +103,21 @@ class MatrixParser(MatrixParserCommon): return TelegramMessage.join(children, "\n") @classmethod - def blockquote_to_tmessage(cls, node: html.HtmlElement, ctx: RecursionContext - ) -> TelegramMessage: + def blockquote_to_tmessage(cls, node: HTMLNode, ctx: RecursionContext) -> TelegramMessage: msg = cls.tag_aware_parse_node(node, ctx) children = msg.trim().split("\n") children = [child.prepend("> ") for child in children] return TelegramMessage.join(children, "\n") @classmethod - def header_to_tmessage(cls, node: html.HtmlElement, ctx: RecursionContext) -> TelegramMessage: + def header_to_tmessage(cls, node: HTMLNode, ctx: RecursionContext) -> TelegramMessage: children = cls.node_to_tmessages(node, ctx) length = int(node.tag[1]) prefix = "#" * length + " " return TelegramMessage.join(children, "").prepend(prefix).format(Bold) @classmethod - def basic_format_to_tmessage(cls, node: html.HtmlElement, ctx: RecursionContext - ) -> TelegramMessage: + def basic_format_to_tmessage(cls, node: HTMLNode, ctx: RecursionContext) -> TelegramMessage: msg = cls.tag_aware_parse_node(node, ctx) if node.tag in ("b", "strong"): msg.format(Bold) @@ -121,7 +136,7 @@ class MatrixParser(MatrixParserCommon): return msg @classmethod - def link_to_tstring(cls, node: html.HtmlElement, ctx: RecursionContext) -> TelegramMessage: + def link_to_tstring(cls, node: HTMLNode, ctx: RecursionContext) -> TelegramMessage: msg = cls.tag_aware_parse_node(node, ctx) href = node.attrib.get("href", "") if not href: @@ -156,7 +171,7 @@ class MatrixParser(MatrixParserCommon): else msg.format(TextURL, url=href)) @classmethod - def node_to_tmessage(cls, node: html.HtmlElement, ctx: RecursionContext) -> TelegramMessage: + def node_to_tmessage(cls, node: HTMLNode, ctx: RecursionContext) -> TelegramMessage: if node.tag == "blockquote": return cls.blockquote_to_tmessage(node, ctx) elif node.tag == "ol": @@ -193,7 +208,7 @@ class MatrixParser(MatrixParserCommon): return TelegramMessage(text) @classmethod - def node_to_tagged_tmessages(cls, node: html.HtmlElement, ctx: RecursionContext + def node_to_tagged_tmessages(cls, node: HTMLNode, ctx: RecursionContext ) -> List[Tuple[TelegramMessage, str]]: output = [] @@ -206,12 +221,12 @@ class MatrixParser(MatrixParserCommon): return output @classmethod - def node_to_tmessages(cls, node: html.HtmlElement, ctx: RecursionContext + def node_to_tmessages(cls, node: HTMLNode, ctx: RecursionContext ) -> List[TelegramMessage]: return [msg for (msg, tag) in cls.node_to_tagged_tmessages(node, ctx)] @classmethod - def tag_aware_parse_node(cls, node: html.HtmlElement, ctx: RecursionContext + def tag_aware_parse_node(cls, node: HTMLNode, ctx: RecursionContext ) -> TelegramMessage: msgs = cls.node_to_tagged_tmessages(node, ctx) output = TelegramMessage() @@ -226,11 +241,10 @@ class MatrixParser(MatrixParserCommon): return output.trim() @classmethod - def parse_node(cls, node: html.HtmlElement, ctx: RecursionContext) -> TelegramMessage: + def parse_node(cls, node: HTMLNode, ctx: RecursionContext) -> TelegramMessage: return TelegramMessage.join(cls.node_to_tmessages(node, ctx)) @classmethod def parse(cls, data: str) -> ParsedMessage: - document = html.fromstring(f"{data}") - msg = cls.parse_node(document, RecursionContext()) + msg = cls.node_to_tmessage(read_html(f"{data}"), RecursionContext()) return msg.text, msg.entities diff --git a/mautrix_telegram/formatter/from_matrix/parser_htmlparser.py b/mautrix_telegram/formatter/from_matrix/parser_htmlparser.py deleted file mode 100644 index 8988228b..00000000 --- a/mautrix_telegram/formatter/from_matrix/parser_htmlparser.py +++ /dev/null @@ -1,241 +0,0 @@ -# -*- coding: future_fstrings -*- -# mautrix-telegram - A Matrix-Telegram puppeting bridge -# Copyright (C) 2018 Tulir Asokan -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -from typing import (Optional, List, Tuple, Type, Dict, Any, TYPE_CHECKING, Match) -from html import unescape -from html.parser import HTMLParser -from collections import deque -import math - -from telethon.tl.types import (MessageEntityMention, MessageEntityMentionName, MessageEntityEmail, - MessageEntityUrl, MessageEntityTextUrl, MessageEntityBold, - MessageEntityItalic, MessageEntityCode, MessageEntityPre, - MessageEntityBotCommand, TypeMessageEntity) - -from ... import user as u, puppet as pu, portal as po -from ...types import MatrixUserID -from ..util import html_to_unicode -from .parser_common import MatrixParserCommon, ParsedMessage - -if TYPE_CHECKING: - from typing import Deque - - -def parse_html(html: str) -> ParsedMessage: - parser = MatrixParser() - parser.feed(html) - return parser.text, parser.entities - - -class MatrixParser(HTMLParser, MatrixParserCommon): - def __init__(self): - super(MatrixParser, self).__init__() - self.text = "" # type: str - self.entities = [] # type: List[TypeMessageEntity] - self._building_entities = {} # type: Dict[str, TypeMessageEntity] - self._list_counter = 0 # type: int - self._open_tags = deque() # type: Deque[str] - self._open_tags_meta = deque() # type: Deque[Any] - self._line_is_new = True # type: bool - self._list_entry_is_new = False # type: bool - - def _parse_url(self, url: str, args: Dict[str, Any] - ) -> Tuple[Optional[Type[TypeMessageEntity]], Optional[str]]: - mention = self.mention_regex.match(url) # type: Match - if mention: - mxid = MatrixUserID(mention.group(1)) - user = (pu.Puppet.get_by_mxid(mxid) - or u.User.get_by_mxid(mxid, create=False)) - if not user: - return None, None - if user.username: - return MessageEntityMention, f"@{user.username}" - elif user.tgid: - args["user_id"] = user.tgid - return MessageEntityMentionName, user.displayname or None - else: - return None, None - - room = self.room_regex.match(url) # type: Match - if room: - username = po.Portal.get_username_from_mx_alias(room.group(1)) - portal = po.Portal.find_by_username(username) - if portal and portal.username: - return MessageEntityMention, f"@{portal.username}" - - if url.startswith("mailto:"): - return MessageEntityEmail, url[len("mailto:"):] - elif self.get_starttag_text() == url: - return MessageEntityUrl, url - else: - args["url"] = url - return MessageEntityTextUrl, None - - def handle_starttag(self, tag: str, attrs_list: List[Tuple[str, str]]): - self._open_tags.appendleft(tag) - self._open_tags_meta.appendleft(0) - - attrs = dict(attrs_list) - entity_type = None # type: Optional[Type[TypeMessageEntity]] - args = {} # type: Dict[str, Any] - if tag in ("strong", "b"): - entity_type = MessageEntityBold - elif tag in ("em", "i"): - entity_type = MessageEntityItalic - elif tag == "code": - try: - pre = self._building_entities["pre"] - try: - # Pre tag and language found, add language to MessageEntityPre - pre.language = attrs["class"][len("language-"):] - except KeyError: - # Pre tag found, but language not found, keep pre as-is - pass - except KeyError: - # No pre tag found, this is inline code - entity_type = MessageEntityCode - elif tag == "pre": - entity_type = MessageEntityPre - args["language"] = "" - elif tag == "command": - entity_type = MessageEntityBotCommand - elif tag == "li": - self._list_entry_is_new = True - elif tag == "a": - try: - url = attrs["href"] - except KeyError: - return - entity_type, url = self._parse_url(url, args) - self._open_tags_meta.popleft() - self._open_tags_meta.appendleft(url) - - if (tag in self.block_tags and ("blockquote" not in self._open_tags)) or tag == "br": - self._newline() - - if entity_type and tag not in self._building_entities: - offset = len(self.text) - self._building_entities[tag] = entity_type(offset=offset, length=0, **args) - - @property - def _list_indent(self) -> int: - indent = 0 - first_skipped = False - for index, tag in enumerate(self._open_tags): - if not first_skipped and tag in ("ol", "ul"): - # The first list level isn't indented, so skip it. - first_skipped = True - continue - if tag == "ol": - n = self._open_tags_meta[index] - extra_length_for_long_index = (int(math.log(n, 10)) - 1) * 3 - indent += 4 + extra_length_for_long_index - elif tag == "ul": - indent += 3 - return indent - - def _newline(self, allow_multi: bool = False): - if self._line_is_new and not allow_multi: - return - self.text += "\n" - self._line_is_new = True - for entity in self._building_entities.values(): - entity.length += 1 - - def _handle_special_previous_tags(self, text: str) -> str: - if "pre" not in self._open_tags and "code" not in self._open_tags: - text = text.replace("\n", "") - else: - text = text.strip() - - previous_tag = self._open_tags[0] if len(self._open_tags) > 0 else "" - if previous_tag == "a": - url = self._open_tags_meta[0] - if url: - text = url - elif previous_tag == "command": - text = f"/{text}" - return text - - def _html_to_unicode(self, text: str) -> str: - strikethrough, underline = "del" in self._open_tags, "u" in self._open_tags - if strikethrough and underline: - text = html_to_unicode(text, "\u0336\u0332") - elif strikethrough: - text = html_to_unicode(text, "\u0336") - elif underline: - text = html_to_unicode(text, "\u0332") - return text - - def _handle_tags_for_data(self, text: str) -> Tuple[str, int]: - extra_offset = 0 - list_entry_handled_once = False - # In order to maintain order of things like blockquotes in lists or lists in blockquotes, - # we can't just have ifs/elses and we need to actually loop through the open tags in order. - for index, tag in enumerate(self._open_tags): - if tag == "blockquote" and self._line_is_new: - text = f"> {text}" - extra_offset += 2 - elif tag == "li" and not list_entry_handled_once: - list_type_index = index + 1 - list_type = self._open_tags[list_type_index] - indent = self._list_indent * " " if self._line_is_new else "" - if list_type == "ol": - n = self._open_tags_meta[list_type_index] - if self._list_entry_is_new: - n += 1 - self._open_tags_meta[list_type_index] = n - prefix = f"{n}. " - else: - prefix = int(math.log(n, 10)) * 3 * " " + 4 * " " - else: - prefix = (self.list_bullet(self._open_tags.count('ul')) - if self._list_entry_is_new else 3 * " ") - if not self._list_entry_is_new and not self._line_is_new: - prefix = "" - extra_offset += len(indent) + len(prefix) - text = indent + prefix + text - self._list_entry_is_new = False - list_entry_handled_once = True - return text, extra_offset - - def _extend_entities_in_construction(self, text: str, extra_offset: int): - for tag, entity in self._building_entities.items(): - entity.length += len(text) - extra_offset - entity.offset += extra_offset - - def handle_data(self, text: str): - text = unescape(text) - text = self._handle_special_previous_tags(text) - text = self._html_to_unicode(text) - text, extra_offset = self._handle_tags_for_data(text) - self._extend_entities_in_construction(text, extra_offset) - self._line_is_new = False - self.text += text - - def handle_endtag(self, tag: str): - try: - self._open_tags.popleft() - self._open_tags_meta.popleft() - except IndexError: - pass - - entity = self._building_entities.pop(tag, None) - if entity: - self.entities.append(entity) - - if tag in self.block_tags and tag != "br" and "blockquote" not in self._open_tags: - self._newline(allow_multi=tag == "br") diff --git a/mautrix_telegram/formatter/from_matrix/telegram_message.py b/mautrix_telegram/formatter/from_matrix/telegram_message.py index c849cc00..f78af066 100644 --- a/mautrix_telegram/formatter/from_matrix/telegram_message.py +++ b/mautrix_telegram/formatter/from_matrix/telegram_message.py @@ -153,5 +153,6 @@ class TelegramMessage: msg = TelegramMessage(text=msg) main.entities += Entity.adjust(msg.entities, offset_diff(len(main.text))) main.text += msg.text + separator - main.text = main.text[:-len(separator)] + if len(separator) > 0: + main.text = main.text[:-len(separator)] return main