Fix bugs in Telegram entity parser

This commit is contained in:
Tulir Asokan
2022-03-31 01:53:51 +03:00
parent 58f8ee2ee2
commit 8abb9c3884
+9 -5
View File
@@ -20,7 +20,7 @@ import logging
import re
from telethon.errors import RPCError
from telethon.helpers import add_surrogate, del_surrogate
from telethon.helpers import add_surrogate, del_surrogate, within_surrogate
from telethon.tl.custom import Message
from telethon.tl.types import (
MessageEntityBlockquote,
@@ -249,7 +249,7 @@ async def _telegram_entities_to_matrix(
html = []
last_offset = 0
for i, entity in enumerate(entities):
if entity.offset > offset + length:
if entity.offset >= offset + length:
break
relative_offset = entity.offset - offset
if relative_offset > last_offset:
@@ -257,6 +257,11 @@ async def _telegram_entities_to_matrix(
elif relative_offset < last_offset:
continue
while within_surrogate(text, relative_offset, length=length):
relative_offset += 1
while within_surrogate(text, relative_offset + length, length=length):
entity.length += 1
skip_entity = False
is_code_entity = isinstance(entity, (MessageEntityCode, MessageEntityPre))
entity_text = await _telegram_entities_to_matrix(
@@ -294,7 +299,7 @@ async def _telegram_entities_to_matrix(
elif entity_type == MessageEntityEmail:
html.append(f"<a href='mailto:{entity_text}'>{entity_text}</a>")
elif entity_type in (MessageEntityTextUrl, MessageEntityUrl):
skip_entity = await _parse_url(
await _parse_url(
html, entity_text, entity.url if entity_type == MessageEntityTextUrl else None
)
elif entity_type in (
@@ -374,7 +379,7 @@ message_link_regex = re.compile(
)
async def _parse_url(html: list[str], entity_text: str, url: str) -> bool:
async def _parse_url(html: list[str], entity_text: str, url: str):
url = escape(url) if url else entity_text
if not url.startswith(("https://", "http://", "ftp://", "magnet://")):
url = "http://" + url
@@ -394,4 +399,3 @@ async def _parse_url(html: list[str], entity_text: str, url: str) -> bool:
url = f"https://matrix.to/#/{portal.mxid}/{message.mxid}"
html.append(f"<a href='{url}'>{entity_text}</a>")
return False