From 72b8a25cecb7f693825be9041d0ab70ccfbc0821 Mon Sep 17 00:00:00 2001
From: Tulir Asokan
Date: Tue, 6 Feb 2018 13:49:03 +0200
Subject: [PATCH] Implement message deduplication. Fixes #5

---
NOTE(review): this patch was rebuilt from a copy whose line breaks and
indentation had been collapsed. Context-line whitespace and the licence URL
in the first hunk are reconstructed and must be checked against portal.py
before applying. The author address was missing from that copy, and the
post-image blob id on the index line still refers to the original commit.

 mautrix_telegram/portal.py | 72 +++++++++++++++++++++++++++++++++++---
 1 file changed, 67 insertions(+), 5 deletions(-)

diff --git a/mautrix_telegram/portal.py b/mautrix_telegram/portal.py
index bef409b7..435a6f5b 100644
--- a/mautrix_telegram/portal.py
+++ b/mautrix_telegram/portal.py
@@ -14,15 +14,20 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from io import BytesIO
+from collections import deque
+from datetime import datetime
+import mimetypes
+import hashlib
+
+from PIL import Image
+import magic
+
 from telethon.tl.functions.messages import *
 from telethon.tl.functions.channels import *
 from telethon.errors.rpc_error_list import *
 from telethon.tl.types import *
-from PIL import Image
-from io import BytesIO
-from datetime import datetime
-import mimetypes
-import magic
+
 from .db import Portal as DBPortal, Message as DBMessage
 from . import puppet as p, user as u, formatter
 
@@ -50,6 +55,9 @@ class Portal:
         self.photo_id = photo_id
         self._main_intent = None
 
+        # Keys of the most recently seen events; maxlen drops the oldest key automatically.
+        self._dedup = deque(maxlen=20)
+
         if tgid:
             self.by_tgid[self.tgid_full] = self
         if mxid:
@@ -74,6 +82,55 @@ class Portal:
         elif self.peer_type == "channel":
             return PeerChannel(channel_id=self.tgid)
 
+    def _hash_event(self, event):
+        """Return a deduplication key for a Telegram message event.
+
+        Channel messages are keyed by their message ID. Every other peer type is
+        keyed by the MD5 hex digest of the timestamp, sender and text, plus the
+        forward origin or media identifiers when present.
+        """
+        if self.peer_type == "channel":
+            # Message IDs are unique per-channel
+            return event.id
+
+        # Non-channel messages are unique per-user (wtf telegram), so we have no other choice than
+        # to deduplicate based on a hash of the message content.
+
+        # The timestamp is only accurate to the second, so we can't rely on solely that either.
+        hash_content = [str(event.date.timestamp()), event.from_id, event.message]
+        if event.fwd_from:
+            hash_content += [event.fwd_from.from_id, event.fwd_from.channel_id]
+        elif event.media:
+            try:
+                hash_content += {
+                    MessageMediaContact: lambda media: [media.user_id],
+                    MessageMediaDocument: lambda media: [media.document.id, media.caption],
+                    MessageMediaPhoto: lambda media: [media.photo.id, media.caption],
+                    MessageMediaGeo: lambda media: [media.geo.long, media.geo.lat],
+                }[type(event.media)](event.media)
+            except KeyError:
+                # Other media types contribute nothing extra to the hash.
+                pass
+
+        joined = "-".join(str(a) for a in hash_content)
+        return hashlib.md5(joined.encode("utf-8")).hexdigest()
+
+    def is_duplicate(self, event):
+        """Check whether ``event`` was seen recently, recording it if not.
+
+        Only the 20 most recent event keys are remembered.
+
+        :param event: The Telegram message event to check.
+        :return: True if the event's key is already in the recent history,
+                 False otherwise.
+        """
+        event_hash = self._hash_event(event)
+        if event_hash in self._dedup:
+            return True
+        # self._dedup is created with maxlen=20, so this evicts the oldest key.
+        self._dedup.append(event_hash)
+        return False
+
     def get_input_entity(self, user):
         return user.client.get_input_entity(self.peer)
 
@@ -365,6 +422,8 @@ class Portal:
         else:
             self.log.debug("Unhandled Matrix event: %s", message)
             return
+        # Record the sent message in the dedup history; the return value is not needed here.
+        self.is_duplicate(response)
         self.db.add(
             DBMessage(tgid=response.id, mx_room=self.mxid, mxid=event_id, user=sender.tgid))
         self.db.commit()
@@ -631,6 +690,9 @@ class Portal:
         if not self.mxid:
             self.create_matrix_room(source, invites=[source.mxid])
 
+        if self.is_duplicate(evt):
+            return
+
         if evt.message:
             response = self.handle_telegram_text(source, sender, evt)
         elif evt.media: