From 092b80ad027726c57060f6c8597701d17eae035a Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Mon, 19 Feb 2018 20:53:37 +0200 Subject: [PATCH] Handle surrogates in a non-hacky way --- mautrix_telegram/formatter/from_matrix.py | 14 +++++--------- mautrix_telegram/formatter/from_telegram.py | 19 +++++++------------ mautrix_telegram/formatter/util.py | 16 ++++++++++++++++ 3 files changed, 28 insertions(+), 21 deletions(-) create mode 100644 mautrix_telegram/formatter/util.py diff --git a/mautrix_telegram/formatter/from_matrix.py b/mautrix_telegram/formatter/from_matrix.py index 6c00838f..dfbe490b 100644 --- a/mautrix_telegram/formatter/from_matrix.py +++ b/mautrix_telegram/formatter/from_matrix.py @@ -24,8 +24,7 @@ from telethon.tl.types import * from .. import user as u, puppet as p from ..db import Message as DBMessage - -TEMP_ENC = "utf-16-le" +from .util import add_surrogates, remove_surrogates log = logging.getLogger("mau.fmt.mx") @@ -98,8 +97,7 @@ class MatrixParser(HTMLParser): self._open_tags_meta.appendleft(url) if entity_type and tag not in self._building_entities: - # See "TEXT LEN EXPLANATION" near start of file - offset = int(len(self.text.encode(TEMP_ENC)) / 2) + offset = len(self.text) self._building_entities[tag] = entity_type(offset=offset, length=0, **args) def _list_depth(self): @@ -133,8 +131,7 @@ class MatrixParser(HTMLParser): text = f"{indent}{n}. {text}" list_format_offset = len(indent) + 3 for tag, entity in self._building_entities.items(): - # See "TEXT LEN EXPLANATION" near start of file - entity.length += int(len(text.strip("\n").encode(TEMP_ENC)) / 2) + entity.length += len(text.strip("\n")) entity.offset += list_format_offset if text.endswith("\n"): @@ -160,8 +157,8 @@ class MatrixParser(HTMLParser): def matrix_to_telegram(html): try: parser = MatrixParser() - parser.feed(html) - return parser.text, parser.entities + parser.feed(add_surrogates(html)) + return remove_surrogates(parser.text), parser.entities except Exception: log.exception("Failed to convert Matrix format:\nhtml=%s", html) @@ -179,4 +176,3 @@ def matrix_reply_to_telegram(content, tg_space, room_id=None): except KeyError: pass return None - diff --git a/mautrix_telegram/formatter/from_telegram.py b/mautrix_telegram/formatter/from_telegram.py index 6dbf78b3..aa74e3fa 100644 --- a/mautrix_telegram/formatter/from_telegram.py +++ b/mautrix_telegram/formatter/from_telegram.py @@ -22,8 +22,7 @@ from mautrix_appservice import MatrixRequestError from .. import user as u, puppet as p from ..db import Message as DBMessage - -TEMP_ENC = "utf-16-le" +from .util import add_surrogates, remove_surrogates log = logging.getLogger("mau.fmt.tg") @@ -46,8 +45,8 @@ def telegram_reply_to_matrix(evt, source): async def telegram_to_matrix(evt, source, native_replies=False, message_link_in_reply=False, main_intent=None, reply_text="Reply"): - text = evt.message - html = _telegram_entities_to_matrix_catch(evt.message, evt.entities) if evt.entities else None + text = add_surrogates(evt.message) + html = _telegram_entities_to_matrix_catch(text, evt.entities) if evt.entities else None relates_to = {} if evt.fwd_from: @@ -116,7 +115,7 @@ async def telegram_to_matrix(evt, source, native_replies=False, message_link_in_ if html: html = html.replace("\n", "
") - return text, html, relates_to + return remove_surrogates(text), remove_surrogates(html), relates_to def _telegram_entities_to_matrix_catch(text, entities): @@ -132,20 +131,16 @@ def _telegram_entities_to_matrix_catch(text, entities): def _telegram_entities_to_matrix(text, entities): if not entities: return text - # See "TEXT LEN EXPLANATION" near start of file - text = text.encode(TEMP_ENC) html = [] last_offset = 0 for entity in entities: - entity.offset *= 2 - entity.length *= 2 if entity.offset > last_offset: - html.append(escape(text[last_offset:entity.offset].decode(TEMP_ENC))) + html.append(escape(text[last_offset:entity.offset])) elif entity.offset < last_offset: continue skip_entity = False - entity_text = escape(text[entity.offset:entity.offset + entity.length].decode(TEMP_ENC)) + entity_text = escape(text[entity.offset:entity.offset + entity.length]) entity_type = type(entity) if entity_type == MessageEntityBold: @@ -199,6 +194,6 @@ def _telegram_entities_to_matrix(text, entities): else: skip_entity = True last_offset = entity.offset + (0 if skip_entity else entity.length) - html.append(text[last_offset:].decode(TEMP_ENC)) + html.append(text[last_offset:]) return "".join(html) diff --git a/mautrix_telegram/formatter/util.py b/mautrix_telegram/formatter/util.py new file mode 100644 index 00000000..ff35519d --- /dev/null +++ b/mautrix_telegram/formatter/util.py @@ -0,0 +1,16 @@ +# Unicode surrogate handling +# From https://github.com/LonamiWebs/Telethon/blob/master/telethon/extensions/markdown.py +import struct + + +def add_surrogates(text): + if text is None: + return None + return "".join("".join(chr(y) for y in struct.unpack("