diff --git a/go.mod b/go.mod index 1bdc3e35..63887313 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( go.mau.fi/zerozap v0.1.1 go.uber.org/zap v1.27.0 golang.org/x/exp v0.0.0-20240716175740-e3f259677ff7 + golang.org/x/net v0.27.0 maunium.net/go/mautrix v0.19.1-0.20240719130542-cc5f225bc61c ) @@ -44,7 +45,6 @@ require ( go.uber.org/atomic v1.11.0 // indirect go.uber.org/multierr v1.11.0 // indirect golang.org/x/crypto v0.25.0 // indirect - golang.org/x/net v0.27.0 // indirect golang.org/x/sync v0.7.0 // indirect golang.org/x/sys v0.22.0 // indirect gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect diff --git a/pkg/connector/matrix.go b/pkg/connector/matrix.go index faa27945..06dca68e 100644 --- a/pkg/connector/matrix.go +++ b/pkg/connector/matrix.go @@ -4,13 +4,12 @@ import ( "context" "crypto/sha256" "fmt" + "math/rand" "strconv" "strings" "time" "github.com/gotd/td/telegram/message" - "github.com/gotd/td/telegram/message/html" - "github.com/gotd/td/telegram/message/styling" "github.com/gotd/td/telegram/uploader" "github.com/gotd/td/tg" "github.com/rs/zerolog" @@ -24,20 +23,57 @@ import ( "go.mau.fi/mautrix-telegram/pkg/connector/emojis" "go.mau.fi/mautrix-telegram/pkg/connector/ids" + "go.mau.fi/mautrix-telegram/pkg/connector/matrixfmt" "go.mau.fi/mautrix-telegram/pkg/connector/waveform" ) -func getMediaFilenameAndCaption(content *event.MessageEventContent) (filename, caption string) { +func getMediaFilename(content *event.MessageEventContent) string { if content.FileName != "" { - filename = content.FileName - caption = content.FormattedBody - if caption == "" { - caption = content.Body - } + return content.FileName } else { - filename = content.Body + return content.Body } - return +} + +func (t *TelegramClient) transferMediaToTelegram(ctx context.Context, content *event.MessageEventContent) (tg.InputMediaClass, error) { + filename := getMediaFilename(content) + var fileData []byte + fileData, err := t.main.Bridge.Bot.DownloadMedia(ctx, content.URL, content.File) + if err != nil { + return nil, fmt.Errorf("failed to download media from Matrix: %w", err) + } + uploader := uploader.NewUploader(t.client.API()) + var upload tg.InputFileClass + upload, err = uploader.FromBytes(ctx, filename, fileData) + if err != nil { + return nil, fmt.Errorf("failed to upload media to Telegram: %w", err) + } + + if content.MsgType == event.MsgImage { + return &tg.InputMediaUploadedPhoto{File: upload}, nil + } + + var attributes []tg.DocumentAttributeClass + attributes = append(attributes, &tg.DocumentAttributeFilename{FileName: filename}) + + if content.MsgType == event.MsgAudio { + audioAttr := tg.DocumentAttributeAudio{ + Voice: content.MSC3245Voice != nil, + } + if content.MSC1767Audio != nil { + audioAttr.Duration = content.MSC1767Audio.Duration / 1000 + if len(content.MSC1767Audio.Waveform) > 0 { + audioAttr.Waveform = waveform.Encode(content.MSC1767Audio.Waveform) + } + } + attributes = append(attributes, &audioAttr) + } + + return &tg.InputMediaUploadedDocument{ + File: upload, + MimeType: content.Info.MimeType, + Attributes: attributes, + }, nil } func (t *TelegramClient) HandleMatrixMessage(ctx context.Context, msg *bridgev2.MatrixMessage) (resp *bridgev2.MatrixMessageResponse, err error) { @@ -45,94 +81,71 @@ func (t *TelegramClient) HandleMatrixMessage(ctx context.Context, msg *bridgev2. if err != nil { return nil, err } - builder := message.NewSender(t.client.API()).To(peer) var contentURI id.ContentURIString // TODO handle sticker + noWebpage := msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0 + + message, entities := matrixfmt.Parse(ctx, &matrixfmt.HTMLParser{ + ParseGhostMXID: t.main.Bridge.Matrix.ParseGhostMXID, + }, msg.Content) + + var replyTo tg.InputReplyToClass + if msg.ReplyTo != nil { + messageID, err := ids.ParseMessageID(msg.ReplyTo.ID) + if err != nil { + return nil, err + } + replyTo = &tg.InputReplyToMessage{ReplyToMsgID: messageID} + } + var updates tg.UpdatesClass switch msg.Content.MsgType { - case event.MsgText: - // TODO unify with edits? - if msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0 { - builder.NoWebpage() - } - updates, err = builder.Text(ctx, msg.Content.Body) + case event.MsgText, event.MsgNotice: + updates, err = t.client.API().MessagesSendMessage(ctx, &tg.MessagesSendMessageRequest{ + Peer: peer, + NoWebpage: noWebpage, + Message: message, + Entities: entities, + ReplyTo: replyTo, + RandomID: rand.Int63(), + }) case event.MsgImage, event.MsgFile, event.MsgAudio, event.MsgVideo: - filename, caption := getMediaFilenameAndCaption(msg.Content) - - var fileData []byte - fileData, err = t.main.Bridge.Bot.DownloadMedia(ctx, msg.Content.URL, msg.Content.File) + var media tg.InputMediaClass + media, err = t.transferMediaToTelegram(ctx, msg.Content) if err != nil { - return nil, fmt.Errorf("failed to download media from Matrix: %w", err) - } - contentURI = msg.Content.URL - if contentURI == "" { - contentURI = msg.Content.File.URL - } - - uploader := uploader.NewUploader(t.client.API()) - var upload tg.InputFileClass - upload, err = uploader.FromBytes(ctx, filename, fileData) - if err != nil { - return nil, fmt.Errorf("failed to upload media to Telegram: %w", err) - } - var styling []styling.StyledTextOption - if caption != "" { - // TODO resolver? - // TODO HTML - styling = append(styling, html.String(nil, caption)) - } - - if msg.Content.MsgType == event.MsgImage { - updates, err = builder.Media(ctx, message.UploadedPhoto(upload, styling...)) - break - } else { - document := message.UploadedDocument(upload, styling...).Filename(filename) - if msg.Content.Info != nil { - document.MIME(msg.Content.Info.MimeType) - } - - var media message.MediaOption - - switch msg.Content.MsgType { - case event.MsgAudio: - audioBuilder := document.Audio() - if msg.Content.MSC1767Audio != nil { - audioBuilder.Duration(time.Duration(msg.Content.MSC1767Audio.Duration) * time.Millisecond) - if len(msg.Content.MSC1767Audio.Waveform) > 0 { - audioBuilder.Waveform(waveform.Encode(msg.Content.MSC1767Audio.Waveform)) - } - } - if msg.Content.MSC3245Voice != nil { - audioBuilder.Voice() - } - media = audioBuilder - default: - media = document - } - updates, err = builder.Media(ctx, media) + return nil, err } + updates, err = t.client.API().MessagesSendMedia(ctx, &tg.MessagesSendMediaRequest{ + Peer: peer, + Message: message, + Entities: entities, + Media: media, + ReplyTo: replyTo, + RandomID: rand.Int63(), + }) case event.MsgLocation: var uri GeoURI uri, err = ParseGeoURI(msg.Content.GeoURI) if err != nil { return nil, err } - var styling []styling.StyledTextOption + message = "" if location, ok := msg.Event.Content.Raw["org.matrix.msc3488.location"].(map[string]any); ok { if desc, ok := location["description"].(string); ok { - // TODO resolver? - // TODO HTML - styling = append(styling, html.String(nil, desc)) + message = desc } } - updates, err = builder.Media(ctx, message.Media(&tg.InputMediaGeoPoint{ - GeoPoint: &tg.InputGeoPoint{ - Lat: uri.Lat, - Long: uri.Long, + updates, err = t.client.API().MessagesSendMedia(ctx, &tg.MessagesSendMediaRequest{ + Peer: peer, + Message: message, + Media: &tg.InputMediaGeoPoint{ + GeoPoint: &tg.InputGeoPoint{Lat: uri.Lat, Long: uri.Long}, }, - }, styling...)) + ReplyTo: replyTo, + RandomID: rand.Int63(), + }) default: return nil, fmt.Errorf("unsupported message type %s", msg.Content.MsgType) } @@ -194,88 +207,41 @@ func (t *TelegramClient) HandleMatrixEdit(ctx context.Context, msg *bridgev2.Mat return err } - b := message.NewSender(t.client.API()).To(peer) - if msg.Content.MsgType == event.MsgText && msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0 { - b.NoWebpage() - } - targetID, err := ids.ParseMessageID(msg.EditTarget.ID) if err != nil { return err } - builder := b.Edit(targetID) + + message, entities := matrixfmt.Parse(ctx, &matrixfmt.HTMLParser{ + ParseGhostMXID: t.main.Bridge.Matrix.ParseGhostMXID, + }, msg.Content) var newContentURI id.ContentURIString - var updates tg.UpdatesClass - switch msg.Content.MsgType { - case event.MsgText: - updates, err = builder.Text(ctx, msg.Content.Body) - case event.MsgImage, event.MsgFile, event.MsgAudio, event.MsgVideo: - filename, caption := getMediaFilenameAndCaption(msg.Content) - - var styling []styling.StyledTextOption - if caption != "" { - // TODO resolver? - // TODO HTML - styling = append(styling, html.String(nil, caption)) - } - + req := tg.MessagesEditMessageRequest{ + Peer: peer, + NoWebpage: msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0, + Message: message, + Entities: entities, + ID: targetID, + } + if msg.Content.MsgType.IsMedia() { newContentURI = msg.Content.URL if newContentURI == "" { newContentURI = msg.Content.File.URL } if msg.EditTarget.Metadata.(*MessageMetadata).ContentURI == newContentURI { log.Info().Msg("media URI unchanged, skipping re-upload, just editing text") - updates, err = builder.StyledText(ctx, styling...) - break - } - - log.Info().Msg("media URI changed, re-uploading media") - - var fileData []byte - fileData, err = t.main.Bridge.Bot.DownloadMedia(ctx, msg.Content.URL, msg.Content.File) - if err != nil { - return fmt.Errorf("failed to download media from Matrix: %w", err) - } - uploader := uploader.NewUploader(t.client.API()) - var upload tg.InputFileClass - upload, err = uploader.FromBytes(ctx, filename, fileData) - if err != nil { - return fmt.Errorf("failed to upload media to Telegram: %w", err) - } - - if msg.Content.MsgType == event.MsgImage { - updates, err = builder.Media(ctx, message.UploadedPhoto(upload, styling...)) - break } else { - document := message.UploadedDocument(upload, styling...).Filename(filename) - if msg.Content.Info != nil { - document.MIME(msg.Content.Info.MimeType) + log.Info().Msg("media URI changed, re-uploading media") + req.Media, err = t.transferMediaToTelegram(ctx, msg.Content) + if err != nil { + return err } - - var media message.MediaOption - - switch msg.Content.MsgType { - case event.MsgAudio: - audioBuilder := document.Audio() - if msg.Content.MSC1767Audio != nil { - audioBuilder.Duration(time.Duration(msg.Content.MSC1767Audio.Duration) * time.Millisecond) - if len(msg.Content.MSC1767Audio.Waveform) > 0 { - audioBuilder.Waveform(waveform.Encode(msg.Content.MSC1767Audio.Waveform)) - } - } - if msg.Content.MSC3245Voice != nil { - audioBuilder.Voice() - } - media = audioBuilder - default: - media = document - } - updates, err = builder.Media(ctx, media) } - default: - return fmt.Errorf("unsupported message type %s", msg.Content.MsgType) + } else if !msg.Content.MsgType.IsText() { + return fmt.Errorf("editing message type %s is unsupported", msg.Content.MsgType) } + updates, err := t.client.API().MessagesEditMessage(ctx, &req) if err != nil { return err } diff --git a/pkg/connector/matrixfmt/convert.go b/pkg/connector/matrixfmt/convert.go new file mode 100644 index 00000000..40c4e6d5 --- /dev/null +++ b/pkg/connector/matrixfmt/convert.go @@ -0,0 +1,103 @@ +// mautrix-telegram - A Matrix-Telegram puppeting bridge. +// Copyright (C) 2024 Sumner Evans +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package matrixfmt + +import ( + "context" + + "github.com/gotd/td/tg" + "maunium.net/go/mautrix/event" + + "go.mau.fi/mautrix-telegram/pkg/connector/ids" + "go.mau.fi/mautrix-telegram/pkg/connector/telegramfmt" +) + +func toTelegramEntity(br telegramfmt.BodyRange) tg.MessageEntityClass { + switch val := br.Value.(type) { + case telegramfmt.Mention: + userID, _ := ids.ParseUserID(val.UserID) + return &tg.MessageEntityMentionName{ + Offset: br.Start, + Length: br.Length, + UserID: userID, + } + case telegramfmt.Style: + switch val.Type { + case telegramfmt.StyleBold: + return &tg.MessageEntityBold{Offset: br.Start, Length: br.Length} + case telegramfmt.StyleItalic: + return &tg.MessageEntityItalic{Offset: br.Start, Length: br.Length} + case telegramfmt.StyleUnderline: + return &tg.MessageEntityUnderline{Offset: br.Start, Length: br.Length} + case telegramfmt.StyleStrikethrough: + return &tg.MessageEntityStrike{Offset: br.Start, Length: br.Length} + case telegramfmt.StyleBlockquote: + return &tg.MessageEntityBlockquote{Offset: br.Start, Length: br.Length} + case telegramfmt.StyleCode: + return &tg.MessageEntityCode{Offset: br.Start, Length: br.Length} + case telegramfmt.StylePre: + return &tg.MessageEntityPre{Offset: br.Start, Length: br.Length, Language: val.Language} + case telegramfmt.StyleEmail: + return &tg.MessageEntityEmail{Offset: br.Start, Length: br.Length} + case telegramfmt.StyleTextURL: + return &tg.MessageEntityTextURL{Offset: br.Start, Length: br.Length, URL: val.URL} + case telegramfmt.StyleURL: + return &tg.MessageEntityURL{Offset: br.Start, Length: br.Length} + case telegramfmt.StyleBotCommand: + return &tg.MessageEntityBotCommand{Offset: br.Start, Length: br.Length} + case telegramfmt.StyleHashtag: + return &tg.MessageEntityHashtag{Offset: br.Start, Length: br.Length} + case telegramfmt.StyleCashtag: + return &tg.MessageEntityCashtag{Offset: br.Start, Length: br.Length} + case telegramfmt.StylePhone: + return &tg.MessageEntityPhone{Offset: br.Start, Length: br.Length} + case telegramfmt.StyleSpoiler: + return &tg.MessageEntitySpoiler{Offset: br.Start, Length: br.Length} + case telegramfmt.StyleBankCard: + return &tg.MessageEntityBankCard{Offset: br.Start, Length: br.Length} + default: + panic("unsupported style type") + } + default: + panic("unknown body range value") + } +} + +func Parse(ctx context.Context, parser *HTMLParser, content *event.MessageEventContent) (string, []tg.MessageEntityClass) { + if content.MsgType.IsMedia() && content.FileName == "" { + // The body is the filename. + return "", nil + } + + if content.Format != event.FormatHTML { + return content.Body, nil + } + parseCtx := NewContext(ctx) + parseCtx.AllowedMentions = content.Mentions + parsed := parser.Parse(content.FormattedBody, parseCtx) + if parsed == nil { + return "", nil + } + var bodyRanges []tg.MessageEntityClass + if len(parsed.Entities) > 0 { + bodyRanges = make([]tg.MessageEntityClass, len(parsed.Entities)) + for i, ent := range parsed.Entities { + bodyRanges[i] = toTelegramEntity(ent) + } + } + return parsed.String.String(), bodyRanges +} diff --git a/pkg/connector/matrixfmt/html.go b/pkg/connector/matrixfmt/html.go new file mode 100644 index 00000000..49565877 --- /dev/null +++ b/pkg/connector/matrixfmt/html.go @@ -0,0 +1,490 @@ +// mautrix-telegram - A Matrix-Telegram puppeting bridge. +// Copyright (C) 2024 Sumner Evans +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package matrixfmt + +import ( + "context" + "fmt" + "math" + "strconv" + "strings" + + "golang.org/x/exp/slices" + "golang.org/x/net/html" + "maunium.net/go/mautrix/bridgev2/networkid" + "maunium.net/go/mautrix/event" + "maunium.net/go/mautrix/id" + + "go.mau.fi/mautrix-telegram/pkg/connector/telegramfmt" +) + +type EntityString struct { + String telegramfmt.UTF16String + Entities telegramfmt.BodyRangeList +} + +var DebugLog = func(format string, args ...any) {} + +func NewEntityString(val string) *EntityString { + DebugLog("NEW %q\n", val) + return &EntityString{ + String: telegramfmt.NewUTF16String(val), + } +} + +func (es *EntityString) Split(at uint16) []*EntityString { + if at > 0x7F { + panic("cannot split at non-ASCII character") + } + if es == nil { + return []*EntityString{} + } + DebugLog("SPLIT %q %q %+v\n", es.String, rune(at), es.Entities) + var output []*EntityString + prevSplit := 0 + doSplit := func(i int) *EntityString { + newES := &EntityString{ + String: es.String[prevSplit:i], + } + for _, entity := range es.Entities { + if (entity.End() <= i || entity.End() > prevSplit) && (entity.Start >= prevSplit || entity.Start < i) { + entity = *entity.TruncateStart(prevSplit).TruncateEnd(i).Offset(-prevSplit) + if entity.Length > 0 { + newES.Entities = append(newES.Entities, entity) + } + } + } + return newES + } + for i, chr := range es.String { + if chr != at { + continue + } + newES := doSplit(i) + output = append(output, newES) + DebugLog(" -> %q %+v\n", newES.String, newES.Entities) + prevSplit = i + 1 + } + if prevSplit == 0 { + DebugLog(" -> NOOP\n") + return []*EntityString{es} + } + if prevSplit != len(es.String) { + newES := doSplit(len(es.String)) + output = append(output, newES) + DebugLog(" -> %q %+v\n", newES.String, newES.Entities) + } + DebugLog("SPLITEND\n") + return output +} + +func (es *EntityString) TrimSpace() *EntityString { + if es == nil { + return nil + } + DebugLog("TRIMSPACE %q %+v\n", es.String, es.Entities) + var cutEnd, cutStart int + for cutStart = 0; cutStart < len(es.String); cutStart++ { + switch es.String[cutStart] { + case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0: + continue + } + break + } + for cutEnd = len(es.String) - 1; cutEnd >= 0; cutEnd-- { + switch es.String[cutEnd] { + case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0: + continue + } + break + } + cutEnd++ + if cutStart == 0 && cutEnd == len(es.String) { + DebugLog(" -> NOOP\n") + return es + } + newEntities := es.Entities[:0] + for _, ent := range es.Entities { + ent = *ent.Offset(-cutStart).TruncateEnd(cutEnd) + if ent.Length > 0 { + newEntities = append(newEntities, ent) + } + } + es.String = es.String[cutStart:cutEnd] + es.Entities = newEntities + DebugLog(" -> %q %+v\n", es.String, es.Entities) + return es +} + +func JoinEntityString(with string, strings ...*EntityString) *EntityString { + withUTF16 := telegramfmt.NewUTF16String(with) + totalLen := 0 + totalEntities := 0 + for _, s := range strings { + totalLen += len(s.String) + totalEntities += len(s.Entities) + } + str := make(telegramfmt.UTF16String, 0, totalLen+len(strings)*len(withUTF16)) + entities := make(telegramfmt.BodyRangeList, 0, totalEntities) + DebugLog("JOIN %q %d\n", with, len(strings)) + for _, s := range strings { + if s == nil || len(s.String) == 0 { + continue + } + DebugLog(" + %q %+v\n", s.String, s.Entities) + for _, entity := range s.Entities { + entity.Start += len(str) + entities = append(entities, entity) + } + str = append(str, s.String...) + str = append(str, withUTF16...) + } + DebugLog(" -> %q %+v\n", str, entities) + return &EntityString{ + String: str, + Entities: entities, + } +} + +func (es *EntityString) Format(value telegramfmt.BodyRangeValue) *EntityString { + if es == nil { + return nil + } + newEntity := telegramfmt.BodyRange{ + Start: 0, + Length: len(es.String), + Value: value, + } + es.Entities = append(telegramfmt.BodyRangeList{newEntity}, es.Entities...) + DebugLog("FORMAT %v %q %+v\n", value, es.String, es.Entities) + return es +} + +func (es *EntityString) Append(other *EntityString) *EntityString { + if es == nil { + return other + } else if other == nil { + return es + } + DebugLog("APPEND %q %+v\n + %q %+v\n", es.String, es.Entities, other.String, other.Entities) + for _, entity := range other.Entities { + entity.Start += len(es.String) + es.Entities = append(es.Entities, entity) + } + es.String = append(es.String, other.String...) + DebugLog(" -> %q %+v\n", es.String, es.Entities) + return es +} + +func (es *EntityString) AppendString(other string) *EntityString { + if es == nil { + return NewEntityString(other) + } else if len(other) == 0 { + return es + } + DebugLog("APPENDSTRING %q %+v\n + %q\n", es.String, es.Entities, other) + es.String = append(es.String, telegramfmt.NewUTF16String(other)...) + DebugLog(" -> %q %+v\n", es.String, es.Entities) + return es +} + +type TagStack []string + +func (ts TagStack) Index(tag string) int { + for i := len(ts) - 1; i >= 0; i-- { + if ts[i] == tag { + return i + } + } + return -1 +} + +func (ts TagStack) Has(tag string) bool { + return ts.Index(tag) >= 0 +} + +type Context struct { + Ctx context.Context + AllowedMentions *event.Mentions + TagStack TagStack + PreserveWhitespace bool +} + +func NewContext(ctx context.Context) Context { + return Context{ + Ctx: ctx, + TagStack: make(TagStack, 0, 4), + } +} + +func (ctx Context) WithTag(tag string) Context { + ctx.TagStack = append(ctx.TagStack, tag) + return ctx +} + +func (ctx Context) WithWhitespace() Context { + ctx.PreserveWhitespace = true + return ctx +} + +// HTMLParser is a somewhat customizable Matrix HTML parser. +type HTMLParser struct { + ParseGhostMXID func(id.UserID) (networkid.UserID, bool) +} + +// TaggedString is a string that also contains a HTML tag. +type TaggedString struct { + *EntityString + tag string +} + +func (parser *HTMLParser) maybeGetAttribute(node *html.Node, attribute string) (string, bool) { + for _, attr := range node.Attr { + if attr.Key == attribute { + return attr.Val, true + } + } + return "", false +} + +func (parser *HTMLParser) getAttribute(node *html.Node, attribute string) string { + val, _ := parser.maybeGetAttribute(node, attribute) + return val +} + +// Digits counts the number of digits (and the sign, if negative) in an integer. +func Digits(num int) int { + if num == 0 { + return 1 + } else if num < 0 { + return Digits(-num) + 1 + } + return int(math.Floor(math.Log10(float64(num))) + 1) +} + +func (parser *HTMLParser) listToString(node *html.Node, ctx Context) *EntityString { + ordered := node.Data == "ol" + taggedChildren := parser.nodeToTaggedStrings(node.FirstChild, ctx) + counter := 1 + indentLength := 0 + if ordered { + start := parser.getAttribute(node, "start") + if len(start) > 0 { + counter, _ = strconv.Atoi(start) + } + + longestIndex := (counter - 1) + len(taggedChildren) + indentLength = Digits(longestIndex) + } + indent := strings.Repeat(" ", indentLength+2) + var children []*EntityString + for _, child := range taggedChildren { + if child.tag != "li" { + continue + } + var prefix string + if ordered { + indexPadding := indentLength - Digits(counter) + if indexPadding < 0 { + // This will happen on negative start indexes where longestIndex is usually wrong, otherwise shouldn't happen + indexPadding = 0 + } + prefix = fmt.Sprintf("%d. %s", counter, strings.Repeat(" ", indexPadding)) + } else { + prefix = "* " + } + es := NewEntityString(prefix).Append(child.EntityString) + counter++ + parts := es.Split('\n') + for i, part := range parts[1:] { + parts[i+1] = NewEntityString(indent).Append(part) + } + children = append(children, parts...) + } + return JoinEntityString("\n", children...) +} + +func (parser *HTMLParser) basicFormatToString(node *html.Node, ctx Context) *EntityString { + str := parser.nodeToTagAwareString(node.FirstChild, ctx) + switch node.Data { + case "b", "strong": + return str.Format(telegramfmt.Style{Type: telegramfmt.StyleBold}) + case "i", "em": + return str.Format(telegramfmt.Style{Type: telegramfmt.StyleItalic}) + case "s", "del", "strike": + return str.Format(telegramfmt.Style{Type: telegramfmt.StyleStrikethrough}) + case "u", "ins": + return str.Format(telegramfmt.Style{Type: telegramfmt.StyleUnderline}) + case "tt", "code": + return str.Format(telegramfmt.Style{Type: telegramfmt.StyleCode}) + } + return str +} + +func (parser *HTMLParser) spanToString(node *html.Node, ctx Context) *EntityString { + str := parser.nodeToTagAwareString(node.FirstChild, ctx) + if node.Data == "span" { + _, isSpoiler := parser.maybeGetAttribute(node, "data-mx-spoiler") + if isSpoiler { + str = str.Format(telegramfmt.Style{Type: telegramfmt.StyleSpoiler}) + } + } + return str +} + +func (parser *HTMLParser) headerToString(node *html.Node, ctx Context) *EntityString { + length := int(node.Data[1] - '0') + prefix := strings.Repeat("#", length) + " " + return NewEntityString(prefix).Append(parser.nodeToString(node.FirstChild, ctx)).Format(telegramfmt.Style{Type: telegramfmt.StyleBold}) +} + +func (parser *HTMLParser) linkToString(node *html.Node, ctx Context) *EntityString { + str := parser.nodeToTagAwareString(node.FirstChild, ctx) + href := parser.getAttribute(node, "href") + if len(href) == 0 { + return str + } + ent := NewEntityString(str.String.String()) + + parsedMatrix, err := id.ParseMatrixURIOrMatrixToURL(href) + if err == nil && parsedMatrix != nil && parsedMatrix.Sigil1 == '@' { + mxid := parsedMatrix.UserID() + if ctx.AllowedMentions != nil && !slices.Contains(ctx.AllowedMentions.UserIDs, mxid) { + // Mention not allowed, use name as-is + return str + } + userID, ok := parser.ParseGhostMXID(mxid) + if !ok { + return str + } + return ent.Format(telegramfmt.Mention{UserID: userID}) + } + if str.String.String() == href { + return ent.Format(telegramfmt.Style{Type: telegramfmt.StyleURL, URL: href}) + } else { + return ent.Format(telegramfmt.Style{Type: telegramfmt.StyleTextURL, URL: href}) + } +} + +func (parser *HTMLParser) tagToString(node *html.Node, ctx Context) *EntityString { + ctx = ctx.WithTag(node.Data) + switch node.Data { + case "blockquote": + return parser. + nodeToTagAwareString(node.FirstChild, ctx). + Format(telegramfmt.Style{Type: telegramfmt.StyleBlockquote}) + case "ol", "ul": + return parser.listToString(node, ctx) + case "h1", "h2", "h3", "h4", "h5", "h6": + return parser.headerToString(node, ctx) + case "br": + return NewEntityString("\n") + case "b", "strong", "i", "em", "s", "strike", "del", "u", "ins", "tt", "code": + return parser.basicFormatToString(node, ctx) + case "span", "font": + return parser.spanToString(node, ctx) + case "a": + return parser.linkToString(node, ctx) + case "p": + return parser.nodeToTagAwareString(node.FirstChild, ctx) + case "hr": + return NewEntityString("---") + case "pre": + var preStr *EntityString + var language string + if node.FirstChild != nil && node.FirstChild.Type == html.ElementNode && node.FirstChild.Data == "code" { + class := parser.getAttribute(node.FirstChild, "class") + if strings.HasPrefix(class, "language-") { + language = class[len("language-"):] + } + preStr = parser.nodeToString(node.FirstChild.FirstChild, ctx.WithWhitespace()) + } else { + preStr = parser.nodeToString(node.FirstChild, ctx.WithWhitespace()) + } + return preStr.Format(telegramfmt.Style{Type: telegramfmt.StylePre, Language: language}) + default: + return parser.nodeToTagAwareString(node.FirstChild, ctx) + } +} + +func (parser *HTMLParser) singleNodeToString(node *html.Node, ctx Context) TaggedString { + switch node.Type { + case html.TextNode: + if !ctx.PreserveWhitespace { + node.Data = strings.ReplaceAll(node.Data, "\n", "") + } + return TaggedString{NewEntityString(node.Data), "text"} + case html.ElementNode: + return TaggedString{parser.tagToString(node, ctx), node.Data} + case html.DocumentNode: + return TaggedString{parser.nodeToTagAwareString(node.FirstChild, ctx), "html"} + default: + return TaggedString{&EntityString{}, "unknown"} + } +} + +func (parser *HTMLParser) nodeToTaggedStrings(node *html.Node, ctx Context) (strs []TaggedString) { + for ; node != nil; node = node.NextSibling { + strs = append(strs, parser.singleNodeToString(node, ctx)) + } + return +} + +var BlockTags = []string{"p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "blockquote", "div", "hr", "table"} + +func (parser *HTMLParser) isBlockTag(tag string) bool { + for _, blockTag := range BlockTags { + if tag == blockTag { + return true + } + } + return false +} + +func (parser *HTMLParser) nodeToTagAwareString(node *html.Node, ctx Context) *EntityString { + strs := parser.nodeToTaggedStrings(node, ctx) + var output *EntityString + for _, str := range strs { + tstr := str.EntityString + if parser.isBlockTag(str.tag) { + tstr = NewEntityString("\n").Append(tstr).AppendString("\n") + } + if output == nil { + output = tstr + } else { + output = output.Append(tstr) + } + } + return output.TrimSpace() +} + +func (parser *HTMLParser) nodeToStrings(node *html.Node, ctx Context) (strs []*EntityString) { + for ; node != nil; node = node.NextSibling { + strs = append(strs, parser.singleNodeToString(node, ctx).EntityString) + } + return +} + +func (parser *HTMLParser) nodeToString(node *html.Node, ctx Context) *EntityString { + return JoinEntityString("", parser.nodeToStrings(node, ctx)...) +} + +// Parse converts Matrix HTML into text using the settings in this parser. +func (parser *HTMLParser) Parse(htmlData string, ctx Context) *EntityString { + node, _ := html.Parse(strings.NewReader(htmlData)) + return parser.nodeToTagAwareString(node, ctx) +}