Fix and refactor Matrix->Telegram formatter

This commit is contained in:
Tulir Asokan
2018-03-10 09:39:53 +02:00
parent 77c57eb64b
commit 2d63c5b3ce
3 changed files with 68 additions and 51 deletions
+66 -49
View File
@@ -151,17 +151,22 @@ class MatrixParser(HTMLParser):
for entity in self._building_entities.values():
entity.length += 1
def handle_data(self, text):
text = unescape(text)
def _handle_special_previous_tags(self, text):
    """Normalise whitespace in *text* and apply tag-specific rewrites.

    Outside of ``pre``/``code`` every newline is removed; inside either of
    them the text is only stripped of leading/trailing whitespace.

    The tag at ``self._open_tags[0]`` then decides a final rewrite:

    * ``a``: the text is replaced by ``self._open_tags_meta[0]`` when that
      value is truthy (the code treats it as the link URL).
    * ``command``: the text is prefixed with ``/``.

    NOTE(review): index 0 is presumably the most recently opened tag --
    confirm against where ``_open_tags`` is filled.

    :param text: The character data to process.
    :return: The rewritten text.
    """
    if "pre" not in self._open_tags and "code" not in self._open_tags:
        text = text.replace("\n", "")
    else:
        text = text.strip()

    previous_tag = self._open_tags[0] if self._open_tags else ""
    if previous_tag == "a":
        url = self._open_tags_meta[0]
        if url:
            text = url
    elif previous_tag == "command":
        text = f"/{text}"
    return text
def _html_to_unicode(self, text):
strikethrough, underline = "del" in self._open_tags, "u" in self._open_tags
if strikethrough and underline:
text = html_to_unicode(text, "\u0336\u0332")
@@ -169,7 +174,10 @@ class MatrixParser(HTMLParser):
text = html_to_unicode(text, "\u0336")
elif underline:
text = html_to_unicode(text, "\u0332")
return text
def _handle_tags_for_data(self, text):
extra_offset = 0
list_entry_handled_once = False
# In order to maintain order of things like blockquotes in lists or lists in blockquotes,
# we can't just have ifs/elses and we need to actually loop through the open tags in order.
@@ -197,10 +205,19 @@ class MatrixParser(HTMLParser):
text = indent + prefix + text
self._list_entry_is_new = False
list_entry_handled_once = True
return text, extra_offset
def _extend_entities_in_construction(self, text, extra_offset):
    """Account for newly added *text* in every entity still being built.

    Each entity in ``self._building_entities`` has its ``offset`` moved
    forward by *extra_offset* and its ``length`` grown by the remainder of
    the text, i.e. ``len(text) - extra_offset``.

    :param text: The text that is about to be appended to the output.
    :param extra_offset: How many characters of *text* should shift the
        entity instead of lengthening it.
    """
    # Only the entity objects are needed; the dict keys are not used here.
    for entity in self._building_entities.values():
        entity.length += len(text) - extra_offset
        entity.offset += extra_offset
def handle_data(self, text):
text = unescape(text)
text = self._handle_special_previous_tags(text)
text = self._html_to_unicode(text)
text, extra_offset = self._handle_tags_for_data(text)
self._extend_entities_in_construction(text, extra_offset)
self._line_is_new = False
self.text += text
@@ -223,6 +240,52 @@ command_regex = re.compile("(\s|^)!([A-Za-z0-9@]+)")
plain_mention_regex = None
def plain_mention_to_html(match):
    """Regex substitution callback turning a plaintext mention into a link.

    Group 2 of *match* is looked up as a puppet displayname. When a puppet
    is found, the result is group 1 followed by a ``matrix.to`` anchor for
    the puppet; otherwise all groups are joined back together unchanged.

    :param match: The regex match object passed in by ``re.sub``.
    :return: The replacement string.
    """
    puppet = pu.Puppet.find_by_displayname(match.group(2))
    if not puppet:
        # No puppet with that displayname: put the matched pieces back as-is.
        return "".join(match.groups())
    prefix = match.group(1)
    return f"{prefix}<a href='https://matrix.to/#/{puppet.mxid}'>{puppet.displayname}</a>"
def matrix_to_telegram(html):
    """Convert Matrix HTML into text plus entities via ``MatrixParser``.

    Before parsing, every ``command_regex`` match is rewritten so that its
    second group is wrapped in a ``<command>`` element, and, when
    ``should_bridge_plaintext_highlights`` is set, plaintext mentions are
    replaced through ``plain_mention_to_html``.

    :param html: The HTML string to convert.
    :return: A ``(text, entities)`` tuple on success. If anything raises,
        the exception is logged and ``None`` is returned instead, so callers
        that unpack the result must be prepared for that.
    """
    try:
        parser = MatrixParser()
        html = command_regex.sub(r"\1<command>\2</command>", html)
        if should_bridge_plaintext_highlights:
            html = plain_mention_regex.sub(plain_mention_to_html, html)
        # The input is passed through add_surrogates() before parsing and the
        # output through remove_surrogates() afterwards.
        # NOTE(review): presumably so offsets are counted in UTF-16 code
        # units -- confirm against those helpers.
        parser.feed(add_surrogates(html))
        return remove_surrogates(parser.text.strip()), parser.entities
    except Exception:
        log.exception("Failed to convert Matrix format:\nhtml=%s", html)
    return None
def matrix_reply_to_telegram(content, tg_space, room_id=None):
    """Resolve the message a Matrix event replies to and trim its fallback.

    Reads ``content["m.relates_to"]["m.in_reply_to"]``, trims the reply
    fallback from ``content["body"]`` (and from ``content["formatted_body"]``
    when the format is ``org.matrix.custom.html``) in place, then looks the
    replied-to event up in ``DBMessage``.

    :param content: The event content dict; it is modified in place.
    :param tg_space: Value matched against ``DBMessage.tg_space``.
    :param room_id: Room to search in; falls back to the ``room_id`` given in
        the reply relation when falsy.
    :return: The ``tgid`` of the stored message, or ``None`` when a required
        key is missing or no message is found.
    """
    try:
        reply_to = content["m.relates_to"]["m.in_reply_to"]
        room_id = room_id or reply_to["room_id"]
        target_event_id = reply_to["event_id"]

        # The formatted body is optional, so missing keys here are ignored.
        try:
            is_html = content["format"] == "org.matrix.custom.html"
            if is_html:
                content["formatted_body"] = trim_reply_fallback_html(content["formatted_body"])
        except KeyError:
            pass
        content["body"] = trim_reply_fallback_text(content["body"])

        row = DBMessage.query.filter(DBMessage.mxid == target_event_id,
                                     DBMessage.tg_space == tg_space,
                                     DBMessage.mx_room == room_id).one_or_none()
        return row.tgid if row else None
    except KeyError:
        return None
def matrix_text_to_telegram(text):
text = command_regex.sub(r"\1/\2", text)
if should_bridge_plaintext_highlights:
@@ -255,52 +318,6 @@ def plain_mention_to_text():
return entities, replacer
def plain_mention_to_html(match):
    """Regex substitution callback turning a plaintext mention into a link.

    Group 2 of *match* is looked up as a puppet displayname. When a puppet
    is found, the result is group 1 followed by a ``matrix.to`` anchor for
    the puppet; otherwise all groups are joined back together unchanged.

    :param match: The regex match object passed in by ``re.sub``.
    :return: The replacement string.
    """
    puppet = pu.Puppet.find_by_displayname(match.group(2))
    if not puppet:
        # No puppet with that displayname: put the matched pieces back as-is.
        return "".join(match.groups())
    prefix = match.group(1)
    return f"{prefix}<a href='https://matrix.to/#/{puppet.mxid}'>{puppet.displayname}</a>"
def matrix_to_telegram(html):
    """Convert Matrix HTML into text plus entities via ``MatrixParser``.

    All newlines are removed from the input first. Then every
    ``command_regex`` match has its second group wrapped in a ``<command>``
    element and, when ``should_bridge_plaintext_highlights`` is set,
    plaintext mentions are replaced through ``plain_mention_to_html``.

    :param html: The HTML string to convert.
    :return: A ``(text, entities)`` tuple on success; if anything raises,
        the exception is logged and ``None`` is returned.
    """
    try:
        converter = MatrixParser()
        html = html.replace("\n", "")
        html = command_regex.sub(r"\1<command>\2</command>", html)
        if should_bridge_plaintext_highlights:
            html = plain_mention_regex.sub(plain_mention_to_html, html)
        converter.feed(add_surrogates(html))
        plain_text = remove_surrogates(converter.text.strip())
        return plain_text, converter.entities
    except Exception:
        log.exception("Failed to convert Matrix format:\nhtml=%s", html)
    return None
def matrix_reply_to_telegram(content, tg_space, room_id=None):
    """Resolve the message a Matrix event replies to and trim its fallback.

    Reads ``content["m.relates_to"]["m.in_reply_to"]``, trims the reply
    fallback from ``content["body"]`` (and from ``content["formatted_body"]``
    when the format is ``org.matrix.custom.html``) in place, then looks the
    replied-to event up in ``DBMessage``.

    :param content: The event content dict; it is modified in place.
    :param tg_space: Value matched against ``DBMessage.tg_space``.
    :param room_id: Room to search in; falls back to the ``room_id`` given in
        the reply relation when falsy.
    :return: The ``tgid`` of the stored message, or ``None`` when a required
        key is missing or no message is found.
    """
    try:
        reply_to = content["m.relates_to"]["m.in_reply_to"]
        room_id = room_id or reply_to["room_id"]
        target_event_id = reply_to["event_id"]

        # The formatted body is optional, so missing keys here are ignored.
        try:
            is_html = content["format"] == "org.matrix.custom.html"
            if is_html:
                content["formatted_body"] = trim_reply_fallback_html(content["formatted_body"])
        except KeyError:
            pass
        content["body"] = trim_reply_fallback_text(content["body"])

        row = DBMessage.query.filter(DBMessage.mxid == target_event_id,
                                     DBMessage.tg_space == tg_space,
                                     DBMessage.mx_room == room_id).one_or_none()
        return row.tgid if row else None
    except KeyError:
        return None
def init_mx(context):
global plain_mention_regex, should_bridge_plaintext_highlights
config = context.config
+1 -1
View File
@@ -132,7 +132,7 @@ async def _add_reply_header(source, text, html, evt, relates_to, main_intent, is
r_keyword = "In reply to" if not is_edit else "Edit to"
r_msg_link = f"<a href='https://matrix.to/#/{msg.mx_room}/{msg.mxid}'>{r_keyword}</a>"
html = (f"<blockquote data-mx-reply>{r_msg_link} {r_sender_link} {r_html_body}</blockquote>"
html = (f"<blockquote data-mx-reply>{r_msg_link} {r_sender_link}\n{r_html_body}</blockquote>"
+ (html or escape(text)))
lines = r_text_body.strip().split("\n")
+1 -1
View File
@@ -594,7 +594,7 @@ class Portal:
entity.user_id = await client.get_input_entity(entity.user_id.user_id)
else:
message, entities = formatter.matrix_text_to_telegram(message["body"])
return await client.send_message(self.peer, message, reply_to=reply_to)
return await client.send_message(self.peer, message, entities=entities, reply_to=reply_to)
async def _handle_matrix_file(self, client, message, reply_to):
file = await self.main_intent.download_file(message["url"])