diff --git a/go.mod b/go.mod
index 1bdc3e35..63887313 100644
--- a/go.mod
+++ b/go.mod
@@ -10,6 +10,7 @@ require (
go.mau.fi/zerozap v0.1.1
go.uber.org/zap v1.27.0
golang.org/x/exp v0.0.0-20240716175740-e3f259677ff7
+ golang.org/x/net v0.27.0
maunium.net/go/mautrix v0.19.1-0.20240719130542-cc5f225bc61c
)
@@ -44,7 +45,6 @@ require (
go.uber.org/atomic v1.11.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/crypto v0.25.0 // indirect
- golang.org/x/net v0.27.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.22.0 // indirect
gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect
diff --git a/pkg/connector/matrix.go b/pkg/connector/matrix.go
index faa27945..06dca68e 100644
--- a/pkg/connector/matrix.go
+++ b/pkg/connector/matrix.go
@@ -4,13 +4,12 @@ import (
"context"
"crypto/sha256"
"fmt"
+ "math/rand"
"strconv"
"strings"
"time"
"github.com/gotd/td/telegram/message"
- "github.com/gotd/td/telegram/message/html"
- "github.com/gotd/td/telegram/message/styling"
"github.com/gotd/td/telegram/uploader"
"github.com/gotd/td/tg"
"github.com/rs/zerolog"
@@ -24,20 +23,57 @@ import (
"go.mau.fi/mautrix-telegram/pkg/connector/emojis"
"go.mau.fi/mautrix-telegram/pkg/connector/ids"
+ "go.mau.fi/mautrix-telegram/pkg/connector/matrixfmt"
"go.mau.fi/mautrix-telegram/pkg/connector/waveform"
)
-func getMediaFilenameAndCaption(content *event.MessageEventContent) (filename, caption string) {
+// getMediaFilename returns the filename to use for a media message:
+// content.FileName when it is set, otherwise content.Body.
+func getMediaFilename(content *event.MessageEventContent) string {
if content.FileName != "" {
- filename = content.FileName
- caption = content.FormattedBody
- if caption == "" {
- caption = content.Body
- }
+ return content.FileName
} else {
- filename = content.Body
+ return content.Body
}
- return
+}
+
+// transferMediaToTelegram downloads the media attached to a Matrix message
+// and uploads it to Telegram, returning the input media to put in a send or
+// edit request.
+//
+// Images become an uploaded photo. Everything else becomes an uploaded
+// document carrying a filename attribute, plus an audio attribute (voice flag,
+// duration and waveform when present) for audio messages.
+//
+// It returns a wrapped error if the download or the upload fails.
+func (t *TelegramClient) transferMediaToTelegram(ctx context.Context, content *event.MessageEventContent) (tg.InputMediaClass, error) {
+	filename := getMediaFilename(content)
+	fileData, err := t.main.Bridge.Bot.DownloadMedia(ctx, content.URL, content.File)
+	if err != nil {
+		return nil, fmt.Errorf("failed to download media from Matrix: %w", err)
+	}
+	var upload tg.InputFileClass
+	upload, err = uploader.NewUploader(t.client.API()).FromBytes(ctx, filename, fileData)
+	if err != nil {
+		return nil, fmt.Errorf("failed to upload media to Telegram: %w", err)
+	}
+
+	if content.MsgType == event.MsgImage {
+		return &tg.InputMediaUploadedPhoto{File: upload}, nil
+	}
+
+	attributes := []tg.DocumentAttributeClass{
+		&tg.DocumentAttributeFilename{FileName: filename},
+	}
+
+	if content.MsgType == event.MsgAudio {
+		audioAttr := tg.DocumentAttributeAudio{
+			Voice: content.MSC3245Voice != nil,
+		}
+		if content.MSC1767Audio != nil {
+			// The Matrix duration is divided by 1000 before being handed to Telegram.
+			audioAttr.Duration = content.MSC1767Audio.Duration / 1000
+			if len(content.MSC1767Audio.Waveform) > 0 {
+				audioAttr.Waveform = waveform.Encode(content.MSC1767Audio.Waveform)
+			}
+		}
+		attributes = append(attributes, &audioAttr)
+	}
+
+	document := &tg.InputMediaUploadedDocument{
+		File:       upload,
+		Attributes: attributes,
+	}
+	// Info is optional on Matrix events; dereferencing it unconditionally
+	// would panic for messages that omit it.
+	if content.Info != nil {
+		document.MimeType = content.Info.MimeType
+	}
+	return document, nil
}
func (t *TelegramClient) HandleMatrixMessage(ctx context.Context, msg *bridgev2.MatrixMessage) (resp *bridgev2.MatrixMessageResponse, err error) {
@@ -45,94 +81,71 @@ func (t *TelegramClient) HandleMatrixMessage(ctx context.Context, msg *bridgev2.
if err != nil {
return nil, err
}
- builder := message.NewSender(t.client.API()).To(peer)
var contentURI id.ContentURIString
// TODO handle sticker
+ noWebpage := msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0
+
+ message, entities := matrixfmt.Parse(ctx, &matrixfmt.HTMLParser{
+ ParseGhostMXID: t.main.Bridge.Matrix.ParseGhostMXID,
+ }, msg.Content)
+
+ var replyTo tg.InputReplyToClass
+ if msg.ReplyTo != nil {
+ messageID, err := ids.ParseMessageID(msg.ReplyTo.ID)
+ if err != nil {
+ return nil, err
+ }
+ replyTo = &tg.InputReplyToMessage{ReplyToMsgID: messageID}
+ }
+
var updates tg.UpdatesClass
switch msg.Content.MsgType {
- case event.MsgText:
- // TODO unify with edits?
- if msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0 {
- builder.NoWebpage()
- }
- updates, err = builder.Text(ctx, msg.Content.Body)
+ case event.MsgText, event.MsgNotice:
+ updates, err = t.client.API().MessagesSendMessage(ctx, &tg.MessagesSendMessageRequest{
+ Peer: peer,
+ NoWebpage: noWebpage,
+ Message: message,
+ Entities: entities,
+ ReplyTo: replyTo,
+ RandomID: rand.Int63(),
+ })
case event.MsgImage, event.MsgFile, event.MsgAudio, event.MsgVideo:
- filename, caption := getMediaFilenameAndCaption(msg.Content)
-
- var fileData []byte
- fileData, err = t.main.Bridge.Bot.DownloadMedia(ctx, msg.Content.URL, msg.Content.File)
+ var media tg.InputMediaClass
+ media, err = t.transferMediaToTelegram(ctx, msg.Content)
if err != nil {
- return nil, fmt.Errorf("failed to download media from Matrix: %w", err)
- }
- contentURI = msg.Content.URL
- if contentURI == "" {
- contentURI = msg.Content.File.URL
- }
-
- uploader := uploader.NewUploader(t.client.API())
- var upload tg.InputFileClass
- upload, err = uploader.FromBytes(ctx, filename, fileData)
- if err != nil {
- return nil, fmt.Errorf("failed to upload media to Telegram: %w", err)
- }
- var styling []styling.StyledTextOption
- if caption != "" {
- // TODO resolver?
- // TODO HTML
- styling = append(styling, html.String(nil, caption))
- }
-
- if msg.Content.MsgType == event.MsgImage {
- updates, err = builder.Media(ctx, message.UploadedPhoto(upload, styling...))
- break
- } else {
- document := message.UploadedDocument(upload, styling...).Filename(filename)
- if msg.Content.Info != nil {
- document.MIME(msg.Content.Info.MimeType)
- }
-
- var media message.MediaOption
-
- switch msg.Content.MsgType {
- case event.MsgAudio:
- audioBuilder := document.Audio()
- if msg.Content.MSC1767Audio != nil {
- audioBuilder.Duration(time.Duration(msg.Content.MSC1767Audio.Duration) * time.Millisecond)
- if len(msg.Content.MSC1767Audio.Waveform) > 0 {
- audioBuilder.Waveform(waveform.Encode(msg.Content.MSC1767Audio.Waveform))
- }
- }
- if msg.Content.MSC3245Voice != nil {
- audioBuilder.Voice()
- }
- media = audioBuilder
- default:
- media = document
- }
- updates, err = builder.Media(ctx, media)
+ return nil, err
}
+ updates, err = t.client.API().MessagesSendMedia(ctx, &tg.MessagesSendMediaRequest{
+ Peer: peer,
+ Message: message,
+ Entities: entities,
+ Media: media,
+ ReplyTo: replyTo,
+ RandomID: rand.Int63(),
+ })
case event.MsgLocation:
var uri GeoURI
uri, err = ParseGeoURI(msg.Content.GeoURI)
if err != nil {
return nil, err
}
- var styling []styling.StyledTextOption
+ message = ""
if location, ok := msg.Event.Content.Raw["org.matrix.msc3488.location"].(map[string]any); ok {
if desc, ok := location["description"].(string); ok {
- // TODO resolver?
- // TODO HTML
- styling = append(styling, html.String(nil, desc))
+ message = desc
}
}
- updates, err = builder.Media(ctx, message.Media(&tg.InputMediaGeoPoint{
- GeoPoint: &tg.InputGeoPoint{
- Lat: uri.Lat,
- Long: uri.Long,
+ updates, err = t.client.API().MessagesSendMedia(ctx, &tg.MessagesSendMediaRequest{
+ Peer: peer,
+ Message: message,
+ Media: &tg.InputMediaGeoPoint{
+ GeoPoint: &tg.InputGeoPoint{Lat: uri.Lat, Long: uri.Long},
},
- }, styling...))
+ ReplyTo: replyTo,
+ RandomID: rand.Int63(),
+ })
default:
return nil, fmt.Errorf("unsupported message type %s", msg.Content.MsgType)
}
@@ -194,88 +207,41 @@ func (t *TelegramClient) HandleMatrixEdit(ctx context.Context, msg *bridgev2.Mat
return err
}
- b := message.NewSender(t.client.API()).To(peer)
- if msg.Content.MsgType == event.MsgText && msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0 {
- b.NoWebpage()
- }
-
targetID, err := ids.ParseMessageID(msg.EditTarget.ID)
if err != nil {
return err
}
- builder := b.Edit(targetID)
+
+ message, entities := matrixfmt.Parse(ctx, &matrixfmt.HTMLParser{
+ ParseGhostMXID: t.main.Bridge.Matrix.ParseGhostMXID,
+ }, msg.Content)
var newContentURI id.ContentURIString
- var updates tg.UpdatesClass
- switch msg.Content.MsgType {
- case event.MsgText:
- updates, err = builder.Text(ctx, msg.Content.Body)
- case event.MsgImage, event.MsgFile, event.MsgAudio, event.MsgVideo:
- filename, caption := getMediaFilenameAndCaption(msg.Content)
-
- var styling []styling.StyledTextOption
- if caption != "" {
- // TODO resolver?
- // TODO HTML
- styling = append(styling, html.String(nil, caption))
- }
-
+ req := tg.MessagesEditMessageRequest{
+ Peer: peer,
+ NoWebpage: msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0,
+ Message: message,
+ Entities: entities,
+ ID: targetID,
+ }
+ if msg.Content.MsgType.IsMedia() {
newContentURI = msg.Content.URL
if newContentURI == "" {
newContentURI = msg.Content.File.URL
}
if msg.EditTarget.Metadata.(*MessageMetadata).ContentURI == newContentURI {
log.Info().Msg("media URI unchanged, skipping re-upload, just editing text")
- updates, err = builder.StyledText(ctx, styling...)
- break
- }
-
- log.Info().Msg("media URI changed, re-uploading media")
-
- var fileData []byte
- fileData, err = t.main.Bridge.Bot.DownloadMedia(ctx, msg.Content.URL, msg.Content.File)
- if err != nil {
- return fmt.Errorf("failed to download media from Matrix: %w", err)
- }
- uploader := uploader.NewUploader(t.client.API())
- var upload tg.InputFileClass
- upload, err = uploader.FromBytes(ctx, filename, fileData)
- if err != nil {
- return fmt.Errorf("failed to upload media to Telegram: %w", err)
- }
-
- if msg.Content.MsgType == event.MsgImage {
- updates, err = builder.Media(ctx, message.UploadedPhoto(upload, styling...))
- break
} else {
- document := message.UploadedDocument(upload, styling...).Filename(filename)
- if msg.Content.Info != nil {
- document.MIME(msg.Content.Info.MimeType)
+ log.Info().Msg("media URI changed, re-uploading media")
+ req.Media, err = t.transferMediaToTelegram(ctx, msg.Content)
+ if err != nil {
+ return err
}
-
- var media message.MediaOption
-
- switch msg.Content.MsgType {
- case event.MsgAudio:
- audioBuilder := document.Audio()
- if msg.Content.MSC1767Audio != nil {
- audioBuilder.Duration(time.Duration(msg.Content.MSC1767Audio.Duration) * time.Millisecond)
- if len(msg.Content.MSC1767Audio.Waveform) > 0 {
- audioBuilder.Waveform(waveform.Encode(msg.Content.MSC1767Audio.Waveform))
- }
- }
- if msg.Content.MSC3245Voice != nil {
- audioBuilder.Voice()
- }
- media = audioBuilder
- default:
- media = document
- }
- updates, err = builder.Media(ctx, media)
}
- default:
- return fmt.Errorf("unsupported message type %s", msg.Content.MsgType)
+ } else if !msg.Content.MsgType.IsText() {
+ return fmt.Errorf("editing message type %s is unsupported", msg.Content.MsgType)
}
+ updates, err := t.client.API().MessagesEditMessage(ctx, &req)
if err != nil {
return err
}
diff --git a/pkg/connector/matrixfmt/convert.go b/pkg/connector/matrixfmt/convert.go
new file mode 100644
index 00000000..40c4e6d5
--- /dev/null
+++ b/pkg/connector/matrixfmt/convert.go
@@ -0,0 +1,103 @@
+// mautrix-telegram - A Matrix-Telegram puppeting bridge.
+// Copyright (C) 2024 Sumner Evans
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+package matrixfmt
+
+import (
+ "context"
+
+ "github.com/gotd/td/tg"
+ "maunium.net/go/mautrix/event"
+
+ "go.mau.fi/mautrix-telegram/pkg/connector/ids"
+ "go.mau.fi/mautrix-telegram/pkg/connector/telegramfmt"
+)
+
+// toTelegramEntity converts a single parsed body range into the matching
+// Telegram message entity, copying the range's start and length.
+//
+// It panics when the range value is neither a telegramfmt.Mention nor a
+// telegramfmt.Style, or when the style type has no Telegram counterpart.
+func toTelegramEntity(br telegramfmt.BodyRange) tg.MessageEntityClass {
+	offset, length := br.Start, br.Length
+
+	if mention, isMention := br.Value.(telegramfmt.Mention); isMention {
+		// The parse error is discarded; the entity carries whatever ID
+		// ParseUserID returned.
+		userID, _ := ids.ParseUserID(mention.UserID)
+		return &tg.MessageEntityMentionName{
+			Offset: offset,
+			Length: length,
+			UserID: userID,
+		}
+	}
+
+	style, isStyle := br.Value.(telegramfmt.Style)
+	if !isStyle {
+		panic("unknown body range value")
+	}
+
+	switch style.Type {
+	case telegramfmt.StyleBold:
+		return &tg.MessageEntityBold{Offset: offset, Length: length}
+	case telegramfmt.StyleItalic:
+		return &tg.MessageEntityItalic{Offset: offset, Length: length}
+	case telegramfmt.StyleUnderline:
+		return &tg.MessageEntityUnderline{Offset: offset, Length: length}
+	case telegramfmt.StyleStrikethrough:
+		return &tg.MessageEntityStrike{Offset: offset, Length: length}
+	case telegramfmt.StyleBlockquote:
+		return &tg.MessageEntityBlockquote{Offset: offset, Length: length}
+	case telegramfmt.StyleCode:
+		return &tg.MessageEntityCode{Offset: offset, Length: length}
+	case telegramfmt.StylePre:
+		return &tg.MessageEntityPre{Offset: offset, Length: length, Language: style.Language}
+	case telegramfmt.StyleEmail:
+		return &tg.MessageEntityEmail{Offset: offset, Length: length}
+	case telegramfmt.StyleTextURL:
+		return &tg.MessageEntityTextURL{Offset: offset, Length: length, URL: style.URL}
+	case telegramfmt.StyleURL:
+		return &tg.MessageEntityURL{Offset: offset, Length: length}
+	case telegramfmt.StyleBotCommand:
+		return &tg.MessageEntityBotCommand{Offset: offset, Length: length}
+	case telegramfmt.StyleHashtag:
+		return &tg.MessageEntityHashtag{Offset: offset, Length: length}
+	case telegramfmt.StyleCashtag:
+		return &tg.MessageEntityCashtag{Offset: offset, Length: length}
+	case telegramfmt.StylePhone:
+		return &tg.MessageEntityPhone{Offset: offset, Length: length}
+	case telegramfmt.StyleSpoiler:
+		return &tg.MessageEntitySpoiler{Offset: offset, Length: length}
+	case telegramfmt.StyleBankCard:
+		return &tg.MessageEntityBankCard{Offset: offset, Length: length}
+	}
+	panic("unsupported style type")
+}
+
+// Parse turns Matrix message content into Telegram message text plus the
+// entities describing its formatting.
+//
+// Media messages without a separate FileName yield an empty text, because
+// their body is the filename. Content that is not HTML-formatted is returned
+// as the plain body with no entities. Otherwise the formatted body is run
+// through parser, with mentions limited to content.Mentions.
+func Parse(ctx context.Context, parser *HTMLParser, content *event.MessageEventContent) (string, []tg.MessageEntityClass) {
+	switch {
+	case content.MsgType.IsMedia() && content.FileName == "":
+		// The body is the filename.
+		return "", nil
+	case content.Format != event.FormatHTML:
+		return content.Body, nil
+	}
+
+	parseCtx := NewContext(ctx)
+	parseCtx.AllowedMentions = content.Mentions
+	parsed := parser.Parse(content.FormattedBody, parseCtx)
+	if parsed == nil {
+		return "", nil
+	}
+	if len(parsed.Entities) == 0 {
+		return parsed.String.String(), nil
+	}
+
+	converted := make([]tg.MessageEntityClass, 0, len(parsed.Entities))
+	for _, ent := range parsed.Entities {
+		converted = append(converted, toTelegramEntity(ent))
+	}
+	return parsed.String.String(), converted
+}
diff --git a/pkg/connector/matrixfmt/html.go b/pkg/connector/matrixfmt/html.go
new file mode 100644
index 00000000..49565877
--- /dev/null
+++ b/pkg/connector/matrixfmt/html.go
@@ -0,0 +1,490 @@
+// mautrix-telegram - A Matrix-Telegram puppeting bridge.
+// Copyright (C) 2024 Sumner Evans
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+package matrixfmt
+
+import (
+ "context"
+ "fmt"
+ "math"
+ "strconv"
+ "strings"
+
+ "golang.org/x/exp/slices"
+ "golang.org/x/net/html"
+ "maunium.net/go/mautrix/bridgev2/networkid"
+ "maunium.net/go/mautrix/event"
+ "maunium.net/go/mautrix/id"
+
+ "go.mau.fi/mautrix-telegram/pkg/connector/telegramfmt"
+)
+
+// EntityString is a UTF-16 string together with the formatting ranges that
+// apply to it. The helpers in this file express entity positions as indexes
+// into String (for example, Format uses len(String) as the range length).
+type EntityString struct {
+ String telegramfmt.UTF16String
+ Entities telegramfmt.BodyRangeList
+}
+
+// DebugLog is called by the EntityString helpers to trace their inputs and
+// results. It does nothing by default; assign a different function to see the
+// trace output.
+var DebugLog = func(format string, args ...any) {}
+
+// NewEntityString wraps val in an EntityString that has no entities.
+func NewEntityString(val string) *EntityString {
+	DebugLog("NEW %q\n", val)
+	es := new(EntityString)
+	es.String = telegramfmt.NewUTF16String(val)
+	return es
+}
+
+// Split cuts the string at every occurrence of the code unit at and returns
+// the pieces without the separator. Entities are clipped to each piece and
+// shifted so that their offsets are relative to the piece's start.
+//
+// A nil receiver yields an empty slice. When at does not occur, the result is
+// a one-element slice holding the receiver itself. A separator at the very
+// end of the string does not produce a trailing empty piece.
+//
+// Split panics if at is not an ASCII character.
+func (es *EntityString) Split(at uint16) []*EntityString {
+ if at > 0x7F {
+ panic("cannot split at non-ASCII character")
+ }
+ if es == nil {
+ return []*EntityString{}
+ }
+ DebugLog("SPLIT %q %q %+v\n", es.String, rune(at), es.Entities)
+ var output []*EntityString
+ prevSplit := 0
+ // doSplit builds the piece spanning [prevSplit, i). It reads prevSplit
+ // through the closure, so the caller must advance prevSplit only after
+ // calling it.
+ doSplit := func(i int) *EntityString {
+ newES := &EntityString{
+ String: es.String[prevSplit:i],
+ }
+ for _, entity := range es.Entities {
+ // NOTE(review): with prevSplit <= i both halves of this condition look
+ // like they always hold, so the Length > 0 check after truncation is
+ // what actually filters out entities outside the piece. Confirm against
+ // BodyRange.TruncateStart/TruncateEnd.
+ if (entity.End() <= i || entity.End() > prevSplit) && (entity.Start >= prevSplit || entity.Start < i) {
+ entity = *entity.TruncateStart(prevSplit).TruncateEnd(i).Offset(-prevSplit)
+ if entity.Length > 0 {
+ newES.Entities = append(newES.Entities, entity)
+ }
+ }
+ }
+ return newES
+ }
+ for i, chr := range es.String {
+ if chr != at {
+ continue
+ }
+ newES := doSplit(i)
+ output = append(output, newES)
+ DebugLog(" -> %q %+v\n", newES.String, newES.Entities)
+ // Skip over the separator itself.
+ prevSplit = i + 1
+ }
+ if prevSplit == 0 {
+ DebugLog(" -> NOOP\n")
+ return []*EntityString{es}
+ }
+ // Emit whatever follows the last separator, unless the string ended with it.
+ if prevSplit != len(es.String) {
+ newES := doSplit(len(es.String))
+ output = append(output, newES)
+ DebugLog(" -> %q %+v\n", newES.String, newES.Entities)
+ }
+ DebugLog("SPLITEND\n")
+ return output
+}
+
+// TrimSpace removes leading and trailing whitespace (tab, newline, vertical
+// tab, form feed, carriage return, space, U+0085 and U+00A0) from the string
+// in place and returns the receiver. A nil receiver returns nil.
+//
+// Entities are clipped to the surviving part of the string and shifted to the
+// new start, using the same TruncateStart/TruncateEnd/Offset chain as Split.
+// Entities left without any length are dropped. A string consisting only of
+// whitespace becomes empty instead of panicking.
+func (es *EntityString) TrimSpace() *EntityString {
+	if es == nil {
+		return nil
+	}
+	DebugLog("TRIMSPACE %q %+v\n", es.String, es.Entities)
+	isSpace := func(chr uint16) bool {
+		switch chr {
+		case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
+			return true
+		}
+		return false
+	}
+	// Both cursors are bounded by each other so that cutStart <= cutEnd
+	// always holds, even when every code unit is whitespace.
+	cutStart, cutEnd := 0, len(es.String)
+	for cutStart < cutEnd && isSpace(es.String[cutStart]) {
+		cutStart++
+	}
+	for cutEnd > cutStart && isSpace(es.String[cutEnd-1]) {
+		cutEnd--
+	}
+	if cutStart == 0 && cutEnd == len(es.String) {
+		DebugLog(" -> NOOP\n")
+		return es
+	}
+	// Filter in place; the backing array of es.Entities is reused.
+	newEntities := es.Entities[:0]
+	for _, ent := range es.Entities {
+		// Clip in the original coordinates first, then shift. Shifting first
+		// would compare against cutEnd in the wrong coordinate space and
+		// could leave a negative start.
+		ent = *ent.TruncateStart(cutStart).TruncateEnd(cutEnd).Offset(-cutStart)
+		if ent.Length > 0 {
+			newEntities = append(newEntities, ent)
+		}
+	}
+	es.String = es.String[cutStart:cutEnd]
+	es.Entities = newEntities
+	DebugLog(" -> %q %+v\n", es.String, es.Entities)
+	return es
+}
+
+// JoinEntityString concatenates the given strings into a new EntityString,
+// writing the separator with after every piece that is included (so a
+// non-empty result also ends with the separator). Nil and empty pieces are
+// skipped entirely. Entity offsets are shifted to their position in the
+// joined string.
+func JoinEntityString(with string, strings ...*EntityString) *EntityString {
+	withUTF16 := telegramfmt.NewUTF16String(with)
+	totalLen := 0
+	totalEntities := 0
+	for _, s := range strings {
+		// Nil pieces are tolerated below, so they must not be dereferenced
+		// while sizing the buffers either.
+		if s == nil {
+			continue
+		}
+		totalLen += len(s.String)
+		totalEntities += len(s.Entities)
+	}
+	str := make(telegramfmt.UTF16String, 0, totalLen+len(strings)*len(withUTF16))
+	entities := make(telegramfmt.BodyRangeList, 0, totalEntities)
+	DebugLog("JOIN %q %d\n", with, len(strings))
+	for _, s := range strings {
+		if s == nil || len(s.String) == 0 {
+			continue
+		}
+		DebugLog(" + %q %+v\n", s.String, s.Entities)
+		for _, entity := range s.Entities {
+			entity.Start += len(str)
+			entities = append(entities, entity)
+		}
+		str = append(str, s.String...)
+		str = append(str, withUTF16...)
+	}
+	DebugLog(" -> %q %+v\n", str, entities)
+	return &EntityString{
+		String:   str,
+		Entities: entities,
+	}
+}
+
+// Format applies value to the whole string by prepending an entity that
+// spans from offset 0 to the string's length, and returns the receiver.
+//
+// A nil receiver returns nil. An empty string is returned unchanged: a range
+// of length zero formats nothing, and Split and TrimSpace already discard
+// such ranges.
+func (es *EntityString) Format(value telegramfmt.BodyRangeValue) *EntityString {
+	if es == nil {
+		return nil
+	}
+	if len(es.String) == 0 {
+		return es
+	}
+	newEntity := telegramfmt.BodyRange{
+		Start:  0,
+		Length: len(es.String),
+		Value:  value,
+	}
+	// The new range goes first so that it precedes the ranges nested in it.
+	es.Entities = append(telegramfmt.BodyRangeList{newEntity}, es.Entities...)
+	DebugLog("FORMAT %v %q %+v\n", value, es.String, es.Entities)
+	return es
+}
+
+// Append adds other to the end of the receiver in place and returns the
+// receiver. The entities of other are shifted by the receiver's previous
+// length. If either side is nil, the other one is returned untouched.
+func (es *EntityString) Append(other *EntityString) *EntityString {
+	switch {
+	case es == nil:
+		return other
+	case other == nil:
+		return es
+	}
+	DebugLog("APPEND %q %+v\n + %q %+v\n", es.String, es.Entities, other.String, other.Entities)
+	shift := len(es.String)
+	for i := range other.Entities {
+		moved := other.Entities[i]
+		moved.Start += shift
+		es.Entities = append(es.Entities, moved)
+	}
+	es.String = append(es.String, other.String...)
+	DebugLog(" -> %q %+v\n", es.String, es.Entities)
+	return es
+}
+
+// AppendString adds plain text without entities to the end of the receiver
+// in place and returns the receiver. A nil receiver produces a fresh
+// EntityString holding other.
+func (es *EntityString) AppendString(other string) *EntityString {
+	if es == nil {
+		return NewEntityString(other)
+	}
+	if other == "" {
+		return es
+	}
+	DebugLog("APPENDSTRING %q %+v\n + %q\n", es.String, es.Entities, other)
+	suffix := telegramfmt.NewUTF16String(other)
+	es.String = append(es.String, suffix...)
+	DebugLog(" -> %q %+v\n", es.String, es.Entities)
+	return es
+}
+
+// TagStack is a list of HTML tag names, with the innermost tag last.
+type TagStack []string
+
+// Index returns the position of the last occurrence of tag in the stack, or
+// -1 if the tag is not present.
+func (ts TagStack) Index(tag string) int {
+	pos := len(ts)
+	for pos > 0 {
+		pos--
+		if ts[pos] == tag {
+			return pos
+		}
+	}
+	return -1
+}
+
+// Has reports whether tag occurs anywhere in the stack.
+func (ts TagStack) Has(tag string) bool {
+	return ts.Index(tag) != -1
+}
+
+// Context carries the state that is passed down while walking the HTML tree.
+// It is passed by value; WithTag and WithWhitespace return modified copies.
+type Context struct {
+ // Ctx is the context.Context given to NewContext.
+ Ctx context.Context
+ // AllowedMentions, when non-nil, limits which Matrix users linkToString
+ // converts into mentions to those listed in its UserIDs.
+ AllowedMentions *event.Mentions
+ // TagStack holds the names of the elements enclosing the current node.
+ TagStack TagStack
+ // PreserveWhitespace keeps newlines in text nodes; when false they are
+ // removed.
+ PreserveWhitespace bool
+}
+
+// NewContext returns a Context wrapping ctx with an empty tag stack that has
+// room for four tags before it needs to grow.
+func NewContext(ctx context.Context) Context {
+	var parseCtx Context
+	parseCtx.Ctx = ctx
+	parseCtx.TagStack = make(TagStack, 0, 4)
+	return parseCtx
+}
+
+// WithTag returns a copy of the context whose tag stack has tag pushed on
+// top. The receiver's own stack is left untouched.
+func (ctx Context) WithTag(tag string) Context {
+	// Capping the capacity at the current length forces append to copy.
+	// Otherwise two contexts derived from the same parent would share a
+	// backing array and overwrite each other's top entry.
+	depth := len(ctx.TagStack)
+	ctx.TagStack = append(ctx.TagStack[:depth:depth], tag)
+	return ctx
+}
+
+// WithWhitespace returns a copy of the context with PreserveWhitespace
+// switched on.
+func (ctx Context) WithWhitespace() Context {
+	preserving := ctx
+	preserving.PreserveWhitespace = true
+	return preserving
+}
+
+// HTMLParser is a somewhat customizable Matrix HTML parser.
+type HTMLParser struct {
+ // ParseGhostMXID maps a Matrix user ID to a remote-network user ID; the
+ // boolean reports whether the mapping succeeded. linkToString calls it for
+ // links that point at Matrix users, so it must be set before such links
+ // are parsed.
+ ParseGhostMXID func(id.UserID) (networkid.UserID, bool)
+}
+
+// TaggedString is a string that also contains a HTML tag.
+//
+// For element nodes tag is the element name; singleNodeToString uses "text",
+// "html" and "unknown" for text nodes, document nodes and everything else.
+type TaggedString struct {
+ *EntityString
+ tag string
+}
+
+// maybeGetAttribute looks up attribute on node and returns the value of the
+// first match. The boolean is false when the node has no such attribute.
+func (parser *HTMLParser) maybeGetAttribute(node *html.Node, attribute string) (string, bool) {
+	for i := range node.Attr {
+		if candidate := &node.Attr[i]; candidate.Key == attribute {
+			return candidate.Val, true
+		}
+	}
+	return "", false
+}
+
+// getAttribute returns the value of attribute on node, or an empty string
+// when the node has no such attribute.
+func (parser *HTMLParser) getAttribute(node *html.Node, attribute string) string {
+	if value, found := parser.maybeGetAttribute(node, attribute); found {
+		return value
+	}
+	return ""
+}
+
+// Digits counts the number of digits (and the sign, if negative) in an integer.
+//
+// The count is computed with integer division so that it is exact for every
+// int, including exact powers of ten and math.MinInt.
+func Digits(num int) int {
+	switch {
+	case num == 0:
+		return 1
+	case num == math.MinInt:
+		// -num overflows back to num, so negating would recurse forever.
+		// Peel off the last digit while the value is still negative instead.
+		return Digits(num/10) + 1
+	case num < 0:
+		return Digits(-num) + 1
+	}
+	count := 0
+	for ; num > 0; num /= 10 {
+		count++
+	}
+	return count
+}
+
+// listToString renders an <ol> or <ul> element as plain text with one line
+// per list item. Ordered items are prefixed with "N. " (padded so that the
+// item text lines up), unordered items with "* ". Continuation lines of a
+// multi-line item are indented to the width of the prefix. Children that are
+// not <li> elements are ignored.
+//
+// Numbering of an ordered list starts at the value of its start attribute.
+// If the attribute is missing or is not a valid integer, it starts at 1.
+func (parser *HTMLParser) listToString(node *html.Node, ctx Context) *EntityString {
+	ordered := node.Data == "ol"
+	taggedChildren := parser.nodeToTaggedStrings(node.FirstChild, ctx)
+	counter := 1
+	indentLength := 0
+	if ordered {
+		start := parser.getAttribute(node, "start")
+		if len(start) > 0 {
+			// Only accept the attribute when it parses; assigning the result
+			// unconditionally would reset the counter to 0 on invalid input.
+			if parsed, err := strconv.Atoi(start); err == nil {
+				counter = parsed
+			}
+		}
+
+		longestIndex := (counter - 1) + len(taggedChildren)
+		indentLength = Digits(longestIndex)
+	}
+	indent := strings.Repeat(" ", indentLength+2)
+	var children []*EntityString
+	for _, child := range taggedChildren {
+		if child.tag != "li" {
+			continue
+		}
+		var prefix string
+		if ordered {
+			indexPadding := indentLength - Digits(counter)
+			if indexPadding < 0 {
+				// This will happen on negative start indexes where longestIndex is usually wrong, otherwise shouldn't happen
+				indexPadding = 0
+			}
+			prefix = fmt.Sprintf("%d. %s", counter, strings.Repeat(" ", indexPadding))
+		} else {
+			prefix = "* "
+		}
+		es := NewEntityString(prefix).Append(child.EntityString)
+		counter++
+		parts := es.Split('\n')
+		// The first line already carries the prefix; indent the rest.
+		for i, part := range parts[1:] {
+			parts[i+1] = NewEntityString(indent).Append(part)
+		}
+		children = append(children, parts...)
+	}
+	return JoinEntityString("\n", children...)
+}
+
+// basicFormatToString renders the children of an inline formatting element
+// and applies the style matching its tag: bold, italic, strikethrough,
+// underline or code. Any other tag yields the children unformatted.
+func (parser *HTMLParser) basicFormatToString(node *html.Node, ctx Context) *EntityString {
+	inner := parser.nodeToTagAwareString(node.FirstChild, ctx)
+	var style telegramfmt.Style
+	switch node.Data {
+	case "b", "strong":
+		style.Type = telegramfmt.StyleBold
+	case "i", "em":
+		style.Type = telegramfmt.StyleItalic
+	case "s", "del", "strike":
+		style.Type = telegramfmt.StyleStrikethrough
+	case "u", "ins":
+		style.Type = telegramfmt.StyleUnderline
+	case "tt", "code":
+		style.Type = telegramfmt.StyleCode
+	default:
+		return inner
+	}
+	return inner.Format(style)
+}
+
+// spanToString renders the children of a <span> or <font> element. A <span>
+// that carries a data-mx-spoiler attribute is marked as a spoiler; everything
+// else is returned without extra formatting.
+func (parser *HTMLParser) spanToString(node *html.Node, ctx Context) *EntityString {
+	inner := parser.nodeToTagAwareString(node.FirstChild, ctx)
+	if node.Data != "span" {
+		return inner
+	}
+	if _, isSpoiler := parser.maybeGetAttribute(node, "data-mx-spoiler"); !isSpoiler {
+		return inner
+	}
+	return inner.Format(telegramfmt.Style{Type: telegramfmt.StyleSpoiler})
+}
+
+// headerToString renders a heading element as bold text prefixed with one
+// "#" per heading level and a space. The level is read from the second
+// character of the tag name.
+func (parser *HTMLParser) headerToString(node *html.Node, ctx Context) *EntityString {
+	level := int(node.Data[1] - '0')
+	heading := NewEntityString(strings.Repeat("#", level) + " ")
+	heading = heading.Append(parser.nodeToString(node.FirstChild, ctx))
+	return heading.Format(telegramfmt.Style{Type: telegramfmt.StyleBold})
+}
+
+// linkToString renders an <a> element.
+//
+// Links without content or without an href are returned as their rendered
+// children (nil when there are none). Links to Matrix users become mentions
+// when the user is allowed by ctx.AllowedMentions and ParseGhostMXID
+// recognises the user; otherwise the link text is kept as-is. All other
+// links become a URL entity when the text equals the href and a text-URL
+// entity otherwise.
+//
+// Mention and URL results are rebuilt from the plain link text, so
+// formatting nested inside the link is not carried over.
+func (parser *HTMLParser) linkToString(node *html.Node, ctx Context) *EntityString {
+	str := parser.nodeToTagAwareString(node.FirstChild, ctx)
+	href := parser.getAttribute(node, "href")
+	// str is nil for an element without children; there is nothing to
+	// attach a link to, and reading str.String below would panic.
+	if str == nil || len(href) == 0 {
+		return str
+	}
+	text := str.String.String()
+	ent := NewEntityString(text)
+
+	parsedMatrix, err := id.ParseMatrixURIOrMatrixToURL(href)
+	if err == nil && parsedMatrix != nil && parsedMatrix.Sigil1 == '@' {
+		mxid := parsedMatrix.UserID()
+		if ctx.AllowedMentions != nil && !slices.Contains(ctx.AllowedMentions.UserIDs, mxid) {
+			// Mention not allowed, use name as-is
+			return str
+		}
+		userID, ok := parser.ParseGhostMXID(mxid)
+		if !ok {
+			return str
+		}
+		return ent.Format(telegramfmt.Mention{UserID: userID})
+	}
+	if text == href {
+		return ent.Format(telegramfmt.Style{Type: telegramfmt.StyleURL, URL: href})
+	}
+	return ent.Format(telegramfmt.Style{Type: telegramfmt.StyleTextURL, URL: href})
+}
+
+// tagToString renders a single HTML element, dispatching on its tag name.
+// The tag is pushed onto the context's tag stack before the children are
+// rendered. Tags without special handling, including <p>, are rendered as
+// their children.
+func (parser *HTMLParser) tagToString(node *html.Node, ctx Context) *EntityString {
+	ctx = ctx.WithTag(node.Data)
+	switch node.Data {
+	case "br":
+		return NewEntityString("\n")
+	case "hr":
+		return NewEntityString("---")
+	case "a":
+		return parser.linkToString(node, ctx)
+	case "ol", "ul":
+		return parser.listToString(node, ctx)
+	case "span", "font":
+		return parser.spanToString(node, ctx)
+	case "h1", "h2", "h3", "h4", "h5", "h6":
+		return parser.headerToString(node, ctx)
+	case "b", "strong", "i", "em", "s", "strike", "del", "u", "ins", "tt", "code":
+		return parser.basicFormatToString(node, ctx)
+	case "blockquote":
+		quoted := parser.nodeToTagAwareString(node.FirstChild, ctx)
+		return quoted.Format(telegramfmt.Style{Type: telegramfmt.StyleBlockquote})
+	case "pre":
+		// A <pre> whose first child is <code> takes its language from the
+		// code element's "language-" class and renders that element's
+		// children; otherwise the <pre>'s own children are rendered.
+		content := node.FirstChild
+		language := ""
+		if code := node.FirstChild; code != nil && code.Type == html.ElementNode && code.Data == "code" {
+			if class := parser.getAttribute(code, "class"); strings.HasPrefix(class, "language-") {
+				language = strings.TrimPrefix(class, "language-")
+			}
+			content = code.FirstChild
+		}
+		block := parser.nodeToString(content, ctx.WithWhitespace())
+		return block.Format(telegramfmt.Style{Type: telegramfmt.StylePre, Language: language})
+	}
+	return parser.nodeToTagAwareString(node.FirstChild, ctx)
+}
+
+// singleNodeToString renders one node, without its siblings, and labels the
+// result: the element name for elements, "text" for text nodes, "html" for
+// document nodes and "unknown" (with an empty string) for anything else.
+//
+// Unless the context preserves whitespace, newlines are removed from text
+// nodes; the node's Data is updated in place when that happens.
+func (parser *HTMLParser) singleNodeToString(node *html.Node, ctx Context) TaggedString {
+	switch node.Type {
+	case html.ElementNode:
+		return TaggedString{EntityString: parser.tagToString(node, ctx), tag: node.Data}
+	case html.DocumentNode:
+		return TaggedString{EntityString: parser.nodeToTagAwareString(node.FirstChild, ctx), tag: "html"}
+	case html.TextNode:
+		if !ctx.PreserveWhitespace {
+			node.Data = strings.ReplaceAll(node.Data, "\n", "")
+		}
+		return TaggedString{EntityString: NewEntityString(node.Data), tag: "text"}
+	}
+	return TaggedString{EntityString: &EntityString{}, tag: "unknown"}
+}
+
+// nodeToTaggedStrings renders node and all of its following siblings, in
+// order, keeping the tag label of each. A nil node yields a nil slice.
+func (parser *HTMLParser) nodeToTaggedStrings(node *html.Node, ctx Context) (strs []TaggedString) {
+	for cur := node; cur != nil; cur = cur.NextSibling {
+		rendered := parser.singleNodeToString(cur, ctx)
+		strs = append(strs, rendered)
+	}
+	return strs
+}
+
+// BlockTags lists the element names that isBlockTag treats as block-level.
+// nodeToTagAwareString surrounds the output of these elements with newlines.
+var BlockTags = []string{"p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "blockquote", "div", "hr", "table"}
+
+// isBlockTag reports whether tag is one of the names listed in BlockTags.
+func (parser *HTMLParser) isBlockTag(tag string) bool {
+	return slices.Contains(BlockTags, tag)
+}
+
+// nodeToTagAwareString renders node and its following siblings into one
+// string. The output of block-level elements is wrapped in newlines, and the
+// combined result has surrounding whitespace trimmed. It returns nil when
+// there is nothing to render.
+func (parser *HTMLParser) nodeToTagAwareString(node *html.Node, ctx Context) *EntityString {
+	var combined *EntityString
+	for _, tagged := range parser.nodeToTaggedStrings(node, ctx) {
+		piece := tagged.EntityString
+		if parser.isBlockTag(tagged.tag) {
+			piece = NewEntityString("\n").Append(piece).AppendString("\n")
+		}
+		// Append on a nil receiver returns its argument, so the first piece
+		// simply becomes the accumulator.
+		combined = combined.Append(piece)
+	}
+	return combined.TrimSpace()
+}
+
+// nodeToStrings renders node and all of its following siblings, in order,
+// discarding the tag labels. Entries may be nil for elements that render to
+// nothing.
+func (parser *HTMLParser) nodeToStrings(node *html.Node, ctx Context) (strs []*EntityString) {
+	for cur := node; cur != nil; cur = cur.NextSibling {
+		rendered := parser.singleNodeToString(cur, ctx)
+		strs = append(strs, rendered.EntityString)
+	}
+	return strs
+}
+
+// nodeToString renders node and its following siblings and concatenates the
+// results without a separator, without block-tag newlines and without
+// trimming.
+func (parser *HTMLParser) nodeToString(node *html.Node, ctx Context) *EntityString {
+	pieces := parser.nodeToStrings(node, ctx)
+	return JoinEntityString("", pieces...)
+}
+
+// Parse converts Matrix HTML into text using the settings in this parser.
+//
+// The error from the HTML parser is discarded. A nil document renders to a
+// nil result, as does input that contains nothing to render.
+func (parser *HTMLParser) Parse(htmlData string, ctx Context) *EntityString {
+	reader := strings.NewReader(htmlData)
+	root, _ := html.Parse(reader)
+	return parser.nodeToTagAwareString(root, ctx)
+}