diff --git a/mautrix_telegram/formatter/from_matrix/__init__.py b/mautrix_telegram/formatter/from_matrix/__init__.py index 6d988557..f71da094 100644 --- a/mautrix_telegram/formatter/from_matrix/__init__.py +++ b/mautrix_telegram/formatter/from_matrix/__init__.py @@ -76,7 +76,6 @@ def matrix_to_telegram(html: str) -> ParsedMessage: if should_bridge_plaintext_highlights: html = plain_mention_regex.sub(plain_mention_to_html, html) - html = add_surrogates(html) text, entities = parse_html(add_surrogates(html)) text = remove_surrogates(text.strip()) text, entities = cut_long_message(text, entities) diff --git a/mautrix_telegram/formatter/from_matrix/html_reader.py b/mautrix_telegram/formatter/from_matrix/html_reader.py index f1e4b178..9ac10cf2 100644 --- a/mautrix_telegram/formatter/from_matrix/html_reader.py +++ b/mautrix_telegram/formatter/from_matrix/html_reader.py @@ -1,4 +1,58 @@ -try: - from .html_reader_lxml import HTMLNode, read_html -except ImportError: - from .html_reader_htmlparser import HTMLNode, read_html +# -*- coding: future_fstrings -*- +# mautrix-telegram - A Matrix-Telegram puppeting bridge +# Copyright (C) 2018 Tulir Asokan +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>.
from typing import Dict, List, Tuple

from html.parser import HTMLParser


class HTMLNode(list):
    """A parsed HTML element; the list items are its child ``HTMLNode`` objects.

    The attribute names (``tag``, ``text``, ``tail``, ``attrib``) follow the
    lxml/ElementTree element convention:

    * ``text`` is the character data between the start tag and the first child
      (or the end tag when there are no children),
    * ``tail`` is the character data that follows this element's end tag, up
      to the next sibling or the parent's end tag.
    """

    def __init__(self, tag: str, attrs: List[Tuple[str, str]]):
        """Create an empty node.

        :param tag: Element name as reported by ``HTMLParser``.
        :param attrs: ``(name, value)`` pairs as reported by ``HTMLParser``;
                      stored as a dict, so a repeated name keeps its last value.
        """
        super().__init__()
        self.tag = tag  # type: str
        self.text = ""  # type: str
        self.tail = ""  # type: str
        self.attrib = dict(attrs)  # type: Dict[str, str]


class NodeifyingParser(HTMLParser):
    """``HTMLParser`` subclass that builds an ``HTMLNode`` tree.

    ``stack`` holds the chain of currently open elements. ``stack[0]`` is a
    synthetic ``html`` root that is always present, so the finished tree can be
    read from it once parsing is done.
    """

    # Void elements never have content or an end tag, so they must not be left
    # open on the stack: otherwise everything after e.g. ``<br>`` would be
    # nested inside it and later end tags would stop matching.
    void_tags = frozenset((
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link",
        "meta", "param", "source", "track", "wbr",
    ))

    def __init__(self):
        super().__init__()
        self.stack = [HTMLNode("html", [])]  # type: List[HTMLNode]

    def handle_starttag(self, tag, attrs):
        """Attach a new node to the innermost open element and open it."""
        node = HTMLNode(tag, attrs)
        self.stack[-1].append(node)
        if tag not in self.void_tags:
            self.stack.append(node)

    def handle_endtag(self, tag):
        """Close the innermost open element if ``tag`` matches it.

        Non-matching end tags are ignored (best-effort parsing). The synthetic
        root is never popped, so a stray ``</html>`` in the input cannot empty
        the stack.
        """
        if len(self.stack) > 1 and tag == self.stack[-1].tag:
            self.stack.pop()

    def handle_data(self, data):
        """Append character data to the right ``text``/``tail`` slot."""
        parent = self.stack[-1]
        if parent:
            # Data after a child element belongs to that child's tail.
            parent[-1].tail += data
        else:
            parent.text += data

    def error(self, message):
        # Parsing is deliberately best-effort: swallow parser errors reported
        # through this hook instead of aborting on malformed input.
        pass


def read_html(data: str) -> HTMLNode:
    """Parse an HTML fragment and return the synthetic root node of its tree.

    :param data: The HTML source text.
    :return: The root ``HTMLNode`` (tag ``html``) whose children are the
             top-level elements of ``data``.
    """
    parser = NodeifyingParser()
    parser.feed(data)
    # close() flushes text the parser is still buffering (for example a
    # trailing "&..." that might have been the start of a character reference).
    parser.close()
    return parser.stack[0]
-# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -from typing import Dict, List, Tuple - -from html.parser import HTMLParser - - -class HTMLNode(list): - def __init__(self, tag: str, attrs: List[Tuple[str, str]]): - super().__init__() - self.tag = tag # type: str - self.text = "" # type: str - self.tail = "" # type: str - self.attrib = dict(attrs) # type: Dict[str, str] - - -class NodeifyingParser(HTMLParser): - def __init__(self): - super().__init__() - self.stack = [HTMLNode("html", [])] # type: List[HTMLNode] - - def handle_starttag(self, tag, attrs): - node = HTMLNode(tag, attrs) - self.stack[-1].append(node) - self.stack.append(node) - - def handle_endtag(self, tag): - if tag == self.stack[-1].tag: - self.stack.pop() - - def handle_data(self, data): - if len(self.stack[-1]) > 0: - self.stack[-1][-1].tail += data - else: - self.stack[-1].text += data - - def error(self, message): - pass - - -def read_html(data: str) -> HTMLNode: - parser = NodeifyingParser() - parser.feed(data) - return parser.stack[0] diff --git a/mautrix_telegram/formatter/from_matrix/html_reader_lxml.py b/mautrix_telegram/formatter/from_matrix/html_reader_lxml.py deleted file mode 100644 index 36d6d56e..00000000 --- a/mautrix_telegram/formatter/from_matrix/html_reader_lxml.py +++ /dev/null @@ -1,23 +0,0 @@ -# -*- coding: future_fstrings -*- -# mautrix-telegram - A Matrix-Telegram puppeting bridge -# Copyright (C) 2018 Tulir Asokan -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -from lxml import html - -HTMLNode = html.HtmlElement - - -def read_html(data: str) -> HTMLNode: - return html.fromstring(data)