diff --git a/mautrix_telegram/formatter/from_matrix.py b/mautrix_telegram/formatter/from_matrix.py
index 6c00838f..dfbe490b 100644
--- a/mautrix_telegram/formatter/from_matrix.py
+++ b/mautrix_telegram/formatter/from_matrix.py
@@ -24,8 +24,7 @@ from telethon.tl.types import *
from .. import user as u, puppet as p
from ..db import Message as DBMessage
-
-TEMP_ENC = "utf-16-le"
+from .util import add_surrogates, remove_surrogates
log = logging.getLogger("mau.fmt.mx")
@@ -98,8 +97,7 @@ class MatrixParser(HTMLParser):
self._open_tags_meta.appendleft(url)
if entity_type and tag not in self._building_entities:
- # See "TEXT LEN EXPLANATION" near start of file
- offset = int(len(self.text.encode(TEMP_ENC)) / 2)
+ offset = len(self.text)
self._building_entities[tag] = entity_type(offset=offset, length=0, **args)
def _list_depth(self):
@@ -133,8 +131,7 @@ class MatrixParser(HTMLParser):
text = f"{indent}{n}. {text}"
list_format_offset = len(indent) + 3
for tag, entity in self._building_entities.items():
- # See "TEXT LEN EXPLANATION" near start of file
- entity.length += int(len(text.strip("\n").encode(TEMP_ENC)) / 2)
+ entity.length += len(text.strip("\n"))
entity.offset += list_format_offset
if text.endswith("\n"):
@@ -160,8 +157,8 @@ class MatrixParser(HTMLParser):
def matrix_to_telegram(html):
try:
parser = MatrixParser()
- parser.feed(html)
- return parser.text, parser.entities
+ parser.feed(add_surrogates(html))
+ return remove_surrogates(parser.text), parser.entities
except Exception:
log.exception("Failed to convert Matrix format:\nhtml=%s", html)
@@ -179,4 +176,3 @@ def matrix_reply_to_telegram(content, tg_space, room_id=None):
except KeyError:
pass
return None
-
diff --git a/mautrix_telegram/formatter/from_telegram.py b/mautrix_telegram/formatter/from_telegram.py
index 6dbf78b3..aa74e3fa 100644
--- a/mautrix_telegram/formatter/from_telegram.py
+++ b/mautrix_telegram/formatter/from_telegram.py
@@ -22,8 +22,7 @@ from mautrix_appservice import MatrixRequestError
from .. import user as u, puppet as p
from ..db import Message as DBMessage
-
-TEMP_ENC = "utf-16-le"
+from .util import add_surrogates, remove_surrogates
log = logging.getLogger("mau.fmt.tg")
@@ -46,8 +45,8 @@ def telegram_reply_to_matrix(evt, source):
async def telegram_to_matrix(evt, source, native_replies=False, message_link_in_reply=False,
main_intent=None, reply_text="Reply"):
- text = evt.message
- html = _telegram_entities_to_matrix_catch(evt.message, evt.entities) if evt.entities else None
+ text = add_surrogates(evt.message)
+ html = _telegram_entities_to_matrix_catch(text, evt.entities) if evt.entities else None
relates_to = {}
if evt.fwd_from:
@@ -116,7 +115,7 @@ async def telegram_to_matrix(evt, source, native_replies=False, message_link_in_
if html:
html = html.replace("\n", "<br/>")
- return text, html, relates_to
+ return remove_surrogates(text), remove_surrogates(html), relates_to
def _telegram_entities_to_matrix_catch(text, entities):
@@ -132,20 +131,16 @@ def _telegram_entities_to_matrix_catch(text, entities):
def _telegram_entities_to_matrix(text, entities):
if not entities:
return text
- # See "TEXT LEN EXPLANATION" near start of file
- text = text.encode(TEMP_ENC)
html = []
last_offset = 0
for entity in entities:
- entity.offset *= 2
- entity.length *= 2
if entity.offset > last_offset:
- html.append(escape(text[last_offset:entity.offset].decode(TEMP_ENC)))
+ html.append(escape(text[last_offset:entity.offset]))
elif entity.offset < last_offset:
continue
skip_entity = False
- entity_text = escape(text[entity.offset:entity.offset + entity.length].decode(TEMP_ENC))
+ entity_text = escape(text[entity.offset:entity.offset + entity.length])
entity_type = type(entity)
if entity_type == MessageEntityBold:
@@ -199,6 +194,6 @@ def _telegram_entities_to_matrix(text, entities):
else:
skip_entity = True
last_offset = entity.offset + (0 if skip_entity else entity.length)
- html.append(text[last_offset:].decode(TEMP_ENC))
+ html.append(text[last_offset:])
return "".join(html)
diff --git a/mautrix_telegram/formatter/util.py b/mautrix_telegram/formatter/util.py
new file mode 100644
index 00000000..ff35519d
--- /dev/null
+++ b/mautrix_telegram/formatter/util.py
@@ -0,0 +1,16 @@
+# Unicode surrogate handling
+# From https://github.com/LonamiWebs/Telethon/blob/master/telethon/extensions/markdown.py
+import struct
+
+
+def add_surrogates(text):
+ if text is None:
+ return None
+ return "".join("".join(chr(y) for y in struct.unpack("