diff --git a/mautrix_telegram/formatter/from_telegram.py b/mautrix_telegram/formatter/from_telegram.py index 40ce1403..b82b1c4d 100644 --- a/mautrix_telegram/formatter/from_telegram.py +++ b/mautrix_telegram/formatter/from_telegram.py @@ -20,7 +20,7 @@ import logging import re from telethon.errors import RPCError -from telethon.helpers import add_surrogate, del_surrogate +from telethon.helpers import add_surrogate, del_surrogate, within_surrogate from telethon.tl.custom import Message from telethon.tl.types import ( MessageEntityBlockquote, @@ -249,7 +249,7 @@ async def _telegram_entities_to_matrix( html = [] last_offset = 0 for i, entity in enumerate(entities): - if entity.offset > offset + length: + if entity.offset >= offset + length: break relative_offset = entity.offset - offset if relative_offset > last_offset: @@ -257,6 +257,11 @@ async def _telegram_entities_to_matrix( elif relative_offset < last_offset: continue + while within_surrogate(text, relative_offset, length=length): + relative_offset += 1 + while within_surrogate(text, relative_offset + length, length=length): + entity.length += 1 + skip_entity = False is_code_entity = isinstance(entity, (MessageEntityCode, MessageEntityPre)) entity_text = await _telegram_entities_to_matrix( @@ -294,7 +299,7 @@ async def _telegram_entities_to_matrix( elif entity_type == MessageEntityEmail: html.append(f"{entity_text}") elif entity_type in (MessageEntityTextUrl, MessageEntityUrl): - skip_entity = await _parse_url( + await _parse_url( html, entity_text, entity.url if entity_type == MessageEntityTextUrl else None ) elif entity_type in ( @@ -374,7 +379,7 @@ message_link_regex = re.compile( ) -async def _parse_url(html: list[str], entity_text: str, url: str) -> bool: +async def _parse_url(html: list[str], entity_text: str, url: str): url = escape(url) if url else entity_text if not url.startswith(("https://", "http://", "ftp://", "magnet://")): url = "http://" + url @@ -394,4 +399,3 @@ async def _parse_url(html: list[str], entity_text: str, url: str) -> bool: url = f"https://matrix.to/#/{portal.mxid}/{message.mxid}" html.append(f"{entity_text}") - return False