matrixfmt: text formatting Matrix -> TG

Signed-off-by: Sumner Evans <sumner.evans@automattic.com>
This commit is contained in:
Sumner Evans
2024-08-01 11:42:58 -06:00
parent 882582456e
commit e8b5d286dc
4 changed files with 704 additions and 145 deletions
+1 -1
View File
@@ -10,6 +10,7 @@ require (
go.mau.fi/zerozap v0.1.1
go.uber.org/zap v1.27.0
golang.org/x/exp v0.0.0-20240716175740-e3f259677ff7
golang.org/x/net v0.27.0
maunium.net/go/mautrix v0.19.1-0.20240719130542-cc5f225bc61c
)
@@ -44,7 +45,6 @@ require (
go.uber.org/atomic v1.11.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/crypto v0.25.0 // indirect
golang.org/x/net v0.27.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.22.0 // indirect
gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect
+110 -144
View File
@@ -4,13 +4,12 @@ import (
"context"
"crypto/sha256"
"fmt"
"math/rand"
"strconv"
"strings"
"time"
"github.com/gotd/td/telegram/message"
"github.com/gotd/td/telegram/message/html"
"github.com/gotd/td/telegram/message/styling"
"github.com/gotd/td/telegram/uploader"
"github.com/gotd/td/tg"
"github.com/rs/zerolog"
@@ -24,20 +23,57 @@ import (
"go.mau.fi/mautrix-telegram/pkg/connector/emojis"
"go.mau.fi/mautrix-telegram/pkg/connector/ids"
"go.mau.fi/mautrix-telegram/pkg/connector/matrixfmt"
"go.mau.fi/mautrix-telegram/pkg/connector/waveform"
)
func getMediaFilenameAndCaption(content *event.MessageEventContent) (filename, caption string) {
func getMediaFilename(content *event.MessageEventContent) string {
if content.FileName != "" {
filename = content.FileName
caption = content.FormattedBody
if caption == "" {
caption = content.Body
}
return content.FileName
} else {
filename = content.Body
return content.Body
}
return
}
// transferMediaToTelegram downloads the media referenced by a Matrix message
// and re-uploads it to Telegram, returning the input media object to attach
// to an outgoing send or edit request.
//
// Images (event.MsgImage) become uploaded photos. Every other message type
// becomes an uploaded document carrying a filename attribute; audio messages
// additionally get an audio attribute built from the MSC1767 audio and
// MSC3245 voice fields when those are present.
//
// An error is returned if the download from Matrix or the upload to Telegram
// fails.
func (t *TelegramClient) transferMediaToTelegram(ctx context.Context, content *event.MessageEventContent) (tg.InputMediaClass, error) {
	filename := getMediaFilename(content)

	fileData, err := t.main.Bridge.Bot.DownloadMedia(ctx, content.URL, content.File)
	if err != nil {
		return nil, fmt.Errorf("failed to download media from Matrix: %w", err)
	}

	// The local is deliberately not called "uploader" so that it does not
	// shadow the imported uploader package.
	var upload tg.InputFileClass
	upload, err = uploader.NewUploader(t.client.API()).FromBytes(ctx, filename, fileData)
	if err != nil {
		return nil, fmt.Errorf("failed to upload media to Telegram: %w", err)
	}

	if content.MsgType == event.MsgImage {
		return &tg.InputMediaUploadedPhoto{File: upload}, nil
	}

	attributes := []tg.DocumentAttributeClass{
		&tg.DocumentAttributeFilename{FileName: filename},
	}
	if content.MsgType == event.MsgAudio {
		audioAttr := tg.DocumentAttributeAudio{
			Voice: content.MSC3245Voice != nil,
		}
		if content.MSC1767Audio != nil {
			// NOTE(review): the division by 1000 presumably converts
			// milliseconds to seconds — confirm against the field definitions.
			audioAttr.Duration = content.MSC1767Audio.Duration / 1000
			if len(content.MSC1767Audio.Waveform) > 0 {
				audioAttr.Waveform = waveform.Encode(content.MSC1767Audio.Waveform)
			}
		}
		attributes = append(attributes, &audioAttr)
	}

	document := &tg.InputMediaUploadedDocument{
		File:       upload,
		Attributes: attributes,
	}
	// The info block may be absent on the Matrix event; only read the MIME
	// type when it exists instead of dereferencing a nil pointer.
	if content.Info != nil {
		document.MimeType = content.Info.MimeType
	}
	return document, nil
}
func (t *TelegramClient) HandleMatrixMessage(ctx context.Context, msg *bridgev2.MatrixMessage) (resp *bridgev2.MatrixMessageResponse, err error) {
@@ -45,94 +81,71 @@ func (t *TelegramClient) HandleMatrixMessage(ctx context.Context, msg *bridgev2.
if err != nil {
return nil, err
}
builder := message.NewSender(t.client.API()).To(peer)
var contentURI id.ContentURIString
// TODO handle sticker
noWebpage := msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0
message, entities := matrixfmt.Parse(ctx, &matrixfmt.HTMLParser{
ParseGhostMXID: t.main.Bridge.Matrix.ParseGhostMXID,
}, msg.Content)
var replyTo tg.InputReplyToClass
if msg.ReplyTo != nil {
messageID, err := ids.ParseMessageID(msg.ReplyTo.ID)
if err != nil {
return nil, err
}
replyTo = &tg.InputReplyToMessage{ReplyToMsgID: messageID}
}
var updates tg.UpdatesClass
switch msg.Content.MsgType {
case event.MsgText:
// TODO unify with edits?
if msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0 {
builder.NoWebpage()
}
updates, err = builder.Text(ctx, msg.Content.Body)
case event.MsgText, event.MsgNotice:
updates, err = t.client.API().MessagesSendMessage(ctx, &tg.MessagesSendMessageRequest{
Peer: peer,
NoWebpage: noWebpage,
Message: message,
Entities: entities,
ReplyTo: replyTo,
RandomID: rand.Int63(),
})
case event.MsgImage, event.MsgFile, event.MsgAudio, event.MsgVideo:
filename, caption := getMediaFilenameAndCaption(msg.Content)
var fileData []byte
fileData, err = t.main.Bridge.Bot.DownloadMedia(ctx, msg.Content.URL, msg.Content.File)
var media tg.InputMediaClass
media, err = t.transferMediaToTelegram(ctx, msg.Content)
if err != nil {
return nil, fmt.Errorf("failed to download media from Matrix: %w", err)
}
contentURI = msg.Content.URL
if contentURI == "" {
contentURI = msg.Content.File.URL
}
uploader := uploader.NewUploader(t.client.API())
var upload tg.InputFileClass
upload, err = uploader.FromBytes(ctx, filename, fileData)
if err != nil {
return nil, fmt.Errorf("failed to upload media to Telegram: %w", err)
}
var styling []styling.StyledTextOption
if caption != "" {
// TODO resolver?
// TODO HTML
styling = append(styling, html.String(nil, caption))
}
if msg.Content.MsgType == event.MsgImage {
updates, err = builder.Media(ctx, message.UploadedPhoto(upload, styling...))
break
} else {
document := message.UploadedDocument(upload, styling...).Filename(filename)
if msg.Content.Info != nil {
document.MIME(msg.Content.Info.MimeType)
}
var media message.MediaOption
switch msg.Content.MsgType {
case event.MsgAudio:
audioBuilder := document.Audio()
if msg.Content.MSC1767Audio != nil {
audioBuilder.Duration(time.Duration(msg.Content.MSC1767Audio.Duration) * time.Millisecond)
if len(msg.Content.MSC1767Audio.Waveform) > 0 {
audioBuilder.Waveform(waveform.Encode(msg.Content.MSC1767Audio.Waveform))
}
}
if msg.Content.MSC3245Voice != nil {
audioBuilder.Voice()
}
media = audioBuilder
default:
media = document
}
updates, err = builder.Media(ctx, media)
return nil, err
}
updates, err = t.client.API().MessagesSendMedia(ctx, &tg.MessagesSendMediaRequest{
Peer: peer,
Message: message,
Entities: entities,
Media: media,
ReplyTo: replyTo,
RandomID: rand.Int63(),
})
case event.MsgLocation:
var uri GeoURI
uri, err = ParseGeoURI(msg.Content.GeoURI)
if err != nil {
return nil, err
}
var styling []styling.StyledTextOption
message = ""
if location, ok := msg.Event.Content.Raw["org.matrix.msc3488.location"].(map[string]any); ok {
if desc, ok := location["description"].(string); ok {
// TODO resolver?
// TODO HTML
styling = append(styling, html.String(nil, desc))
message = desc
}
}
updates, err = builder.Media(ctx, message.Media(&tg.InputMediaGeoPoint{
GeoPoint: &tg.InputGeoPoint{
Lat: uri.Lat,
Long: uri.Long,
updates, err = t.client.API().MessagesSendMedia(ctx, &tg.MessagesSendMediaRequest{
Peer: peer,
Message: message,
Media: &tg.InputMediaGeoPoint{
GeoPoint: &tg.InputGeoPoint{Lat: uri.Lat, Long: uri.Long},
},
}, styling...))
ReplyTo: replyTo,
RandomID: rand.Int63(),
})
default:
return nil, fmt.Errorf("unsupported message type %s", msg.Content.MsgType)
}
@@ -194,88 +207,41 @@ func (t *TelegramClient) HandleMatrixEdit(ctx context.Context, msg *bridgev2.Mat
return err
}
b := message.NewSender(t.client.API()).To(peer)
if msg.Content.MsgType == event.MsgText && msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0 {
b.NoWebpage()
}
targetID, err := ids.ParseMessageID(msg.EditTarget.ID)
if err != nil {
return err
}
builder := b.Edit(targetID)
message, entities := matrixfmt.Parse(ctx, &matrixfmt.HTMLParser{
ParseGhostMXID: t.main.Bridge.Matrix.ParseGhostMXID,
}, msg.Content)
var newContentURI id.ContentURIString
var updates tg.UpdatesClass
switch msg.Content.MsgType {
case event.MsgText:
updates, err = builder.Text(ctx, msg.Content.Body)
case event.MsgImage, event.MsgFile, event.MsgAudio, event.MsgVideo:
filename, caption := getMediaFilenameAndCaption(msg.Content)
var styling []styling.StyledTextOption
if caption != "" {
// TODO resolver?
// TODO HTML
styling = append(styling, html.String(nil, caption))
}
req := tg.MessagesEditMessageRequest{
Peer: peer,
NoWebpage: msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0,
Message: message,
Entities: entities,
ID: targetID,
}
if msg.Content.MsgType.IsMedia() {
newContentURI = msg.Content.URL
if newContentURI == "" {
newContentURI = msg.Content.File.URL
}
if msg.EditTarget.Metadata.(*MessageMetadata).ContentURI == newContentURI {
log.Info().Msg("media URI unchanged, skipping re-upload, just editing text")
updates, err = builder.StyledText(ctx, styling...)
break
}
log.Info().Msg("media URI changed, re-uploading media")
var fileData []byte
fileData, err = t.main.Bridge.Bot.DownloadMedia(ctx, msg.Content.URL, msg.Content.File)
if err != nil {
return fmt.Errorf("failed to download media from Matrix: %w", err)
}
uploader := uploader.NewUploader(t.client.API())
var upload tg.InputFileClass
upload, err = uploader.FromBytes(ctx, filename, fileData)
if err != nil {
return fmt.Errorf("failed to upload media to Telegram: %w", err)
}
if msg.Content.MsgType == event.MsgImage {
updates, err = builder.Media(ctx, message.UploadedPhoto(upload, styling...))
break
} else {
document := message.UploadedDocument(upload, styling...).Filename(filename)
if msg.Content.Info != nil {
document.MIME(msg.Content.Info.MimeType)
log.Info().Msg("media URI changed, re-uploading media")
req.Media, err = t.transferMediaToTelegram(ctx, msg.Content)
if err != nil {
return err
}
var media message.MediaOption
switch msg.Content.MsgType {
case event.MsgAudio:
audioBuilder := document.Audio()
if msg.Content.MSC1767Audio != nil {
audioBuilder.Duration(time.Duration(msg.Content.MSC1767Audio.Duration) * time.Millisecond)
if len(msg.Content.MSC1767Audio.Waveform) > 0 {
audioBuilder.Waveform(waveform.Encode(msg.Content.MSC1767Audio.Waveform))
}
}
if msg.Content.MSC3245Voice != nil {
audioBuilder.Voice()
}
media = audioBuilder
default:
media = document
}
updates, err = builder.Media(ctx, media)
}
default:
return fmt.Errorf("unsupported message type %s", msg.Content.MsgType)
} else if !msg.Content.MsgType.IsText() {
return fmt.Errorf("editing message type %s is unsupported", msg.Content.MsgType)
}
updates, err := t.client.API().MessagesEditMessage(ctx, &req)
if err != nil {
return err
}
+103
View File
@@ -0,0 +1,103 @@
// mautrix-telegram - A Matrix-Telegram puppeting bridge.
// Copyright (C) 2024 Sumner Evans
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package matrixfmt
import (
"context"
"github.com/gotd/td/tg"
"maunium.net/go/mautrix/event"
"go.mau.fi/mautrix-telegram/pkg/connector/ids"
"go.mau.fi/mautrix-telegram/pkg/connector/telegramfmt"
)
// toTelegramEntity converts one parsed body range into the corresponding
// Telegram message entity, copying the range's start and length into the
// entity's Offset and Length fields.
//
// It panics when the range value is neither a telegramfmt.Mention nor a
// telegramfmt.Style, or when it is a Style whose Type is not handled below.
func toTelegramEntity(br telegramfmt.BodyRange) tg.MessageEntityClass {
	offset, length := br.Start, br.Length

	switch val := br.Value.(type) {
	case telegramfmt.Mention:
		// NOTE(review): the parse error is discarded, so a malformed ID ends
		// up as whatever value ParseUserID returns alongside its error —
		// confirm this is acceptable.
		userID, _ := ids.ParseUserID(val.UserID)
		return &tg.MessageEntityMentionName{Offset: offset, Length: length, UserID: userID}

	case telegramfmt.Style:
		switch val.Type {
		case telegramfmt.StyleBold:
			return &tg.MessageEntityBold{Offset: offset, Length: length}
		case telegramfmt.StyleItalic:
			return &tg.MessageEntityItalic{Offset: offset, Length: length}
		case telegramfmt.StyleUnderline:
			return &tg.MessageEntityUnderline{Offset: offset, Length: length}
		case telegramfmt.StyleStrikethrough:
			return &tg.MessageEntityStrike{Offset: offset, Length: length}
		case telegramfmt.StyleBlockquote:
			return &tg.MessageEntityBlockquote{Offset: offset, Length: length}
		case telegramfmt.StyleCode:
			return &tg.MessageEntityCode{Offset: offset, Length: length}
		case telegramfmt.StylePre:
			return &tg.MessageEntityPre{Offset: offset, Length: length, Language: val.Language}
		case telegramfmt.StyleEmail:
			return &tg.MessageEntityEmail{Offset: offset, Length: length}
		case telegramfmt.StyleTextURL:
			return &tg.MessageEntityTextURL{Offset: offset, Length: length, URL: val.URL}
		case telegramfmt.StyleURL:
			return &tg.MessageEntityURL{Offset: offset, Length: length}
		case telegramfmt.StyleBotCommand:
			return &tg.MessageEntityBotCommand{Offset: offset, Length: length}
		case telegramfmt.StyleHashtag:
			return &tg.MessageEntityHashtag{Offset: offset, Length: length}
		case telegramfmt.StyleCashtag:
			return &tg.MessageEntityCashtag{Offset: offset, Length: length}
		case telegramfmt.StylePhone:
			return &tg.MessageEntityPhone{Offset: offset, Length: length}
		case telegramfmt.StyleSpoiler:
			return &tg.MessageEntitySpoiler{Offset: offset, Length: length}
		case telegramfmt.StyleBankCard:
			return &tg.MessageEntityBankCard{Offset: offset, Length: length}
		}
		panic("unsupported style type")
	}

	panic("unknown body range value")
}
// Parse turns the content of a Matrix message into the plain text and entity
// list of a Telegram message.
//
//   - For media messages without a separate FileName the body is only the
//     filename, so an empty text with no entities is returned.
//   - For content whose Format is not HTML the plain Body is returned with no
//     entities.
//   - Otherwise FormattedBody is parsed with parser, honouring
//     content.Mentions as the set of allowed mentions, and each resulting
//     body range is converted to a Telegram entity.
//
// The returned entity slice is nil when there are no entities.
func Parse(ctx context.Context, parser *HTMLParser, content *event.MessageEventContent) (string, []tg.MessageEntityClass) {
	switch {
	case content.MsgType.IsMedia() && content.FileName == "":
		// The body is the filename.
		return "", nil
	case content.Format != event.FormatHTML:
		return content.Body, nil
	}

	parseCtx := NewContext(ctx)
	parseCtx.AllowedMentions = content.Mentions

	parsed := parser.Parse(content.FormattedBody, parseCtx)
	if parsed == nil {
		return "", nil
	}

	var entities []tg.MessageEntityClass
	if count := len(parsed.Entities); count > 0 {
		entities = make([]tg.MessageEntityClass, 0, count)
		for _, bodyRange := range parsed.Entities {
			entities = append(entities, toTelegramEntity(bodyRange))
		}
	}
	return parsed.String.String(), entities
}
+490
View File
@@ -0,0 +1,490 @@
// mautrix-telegram - A Matrix-Telegram puppeting bridge.
// Copyright (C) 2024 Sumner Evans
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package matrixfmt
import (
"context"
"fmt"
"math"
"strconv"
"strings"
"golang.org/x/exp/slices"
"golang.org/x/net/html"
"maunium.net/go/mautrix/bridgev2/networkid"
"maunium.net/go/mautrix/event"
"maunium.net/go/mautrix/id"
"go.mau.fi/mautrix-telegram/pkg/connector/telegramfmt"
)
// EntityString is a piece of text together with the formatting ranges that
// apply to it. Range starts and lengths index into String (for example,
// Format creates a range of length len(String) covering the whole text).
type EntityString struct {
// String is the text content.
String telegramfmt.UTF16String
// Entities are the formatting ranges and mentions applied to String.
Entities telegramfmt.BodyRangeList
}
// DebugLog is called by the EntityString helpers to trace each operation.
// It is a no-op by default; assign a printf-style function to it to see the
// trace output.
var DebugLog = func(format string, args ...any) {}
// NewEntityString creates an EntityString holding val with no entities.
func NewEntityString(val string) *EntityString {
	DebugLog("NEW %q\n", val)
	es := new(EntityString)
	es.String = telegramfmt.NewUTF16String(val)
	return es
}
// Split cuts the string at every occurrence of the code unit at and returns
// the pieces in order, without the separator. Entities are clipped to each
// piece and re-based so their starts are relative to that piece; entities
// that end up empty are dropped.
//
// Special cases:
//   - at must be ASCII (<= 0x7F); otherwise Split panics.
//   - A nil receiver yields an empty, non-nil slice.
//   - When the separator does not occur, the receiver itself is returned as
//     the only element (no copy is made).
//   - A separator at the very end does not produce a trailing empty piece,
//     whereas a leading separator does produce a leading empty piece.
//
// The returned pieces share the receiver's underlying string storage.
func (es *EntityString) Split(at uint16) []*EntityString {
	if at > 0x7F {
		panic("cannot split at non-ASCII character")
	}
	if es == nil {
		return []*EntityString{}
	}
	DebugLog("SPLIT %q %q %+v\n", es.String, rune(at), es.Entities)
	var output []*EntityString
	prevSplit := 0
	// doSplit builds the piece covering [prevSplit, i).
	doSplit := func(i int) *EntityString {
		newES := &EntityString{
			String: es.String[prevSplit:i],
		}
		for _, entity := range es.Entities {
			// Skip entities that do not overlap [prevSplit, i) at all. The
			// previous condition here was always true for prevSplit <= i, so
			// every entity was pushed through the truncation below.
			if entity.End() <= prevSplit || entity.Start >= i {
				continue
			}
			entity = *entity.TruncateStart(prevSplit).TruncateEnd(i).Offset(-prevSplit)
			if entity.Length > 0 {
				newES.Entities = append(newES.Entities, entity)
			}
		}
		return newES
	}
	for i, chr := range es.String {
		if chr != at {
			continue
		}
		newES := doSplit(i)
		output = append(output, newES)
		DebugLog("  -> %q %+v\n", newES.String, newES.Entities)
		prevSplit = i + 1
	}
	if prevSplit == 0 {
		// prevSplit only moves when a separator is found, so nothing matched.
		DebugLog("  -> NOOP\n")
		return []*EntityString{es}
	}
	if prevSplit != len(es.String) {
		// Emit whatever follows the last separator.
		newES := doSplit(len(es.String))
		output = append(output, newES)
		DebugLog("  -> %q %+v\n", newES.String, newES.Entities)
	}
	DebugLog("SPLITEND\n")
	return output
}
// TrimSpace removes leading and trailing whitespace (tab, newline, vertical
// tab, form feed, carriage return, space, U+0085 and U+00A0) from the string
// in place and returns the receiver.
//
// Entities are clipped to the kept region and re-based so that their starts
// are relative to the trimmed string; entities left empty are dropped. The
// clip-then-shift order mirrors what Split does, so an entity that covered
// trimmed whitespace can neither start before offset 0 nor extend past the
// end of the trimmed text.
//
// A nil receiver returns nil. A string consisting solely of whitespace is
// reduced to the empty string (previously this case panicked with an
// out-of-range slice expression).
func (es *EntityString) TrimSpace() *EntityString {
	if es == nil {
		return nil
	}
	DebugLog("TRIMSPACE %q %+v\n", es.String, es.Entities)
	var cutEnd, cutStart int
	// cutStart becomes the index of the first non-whitespace unit.
	for cutStart = 0; cutStart < len(es.String); cutStart++ {
		switch es.String[cutStart] {
		case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
			continue
		}
		break
	}
	// cutEnd becomes the exclusive end of the kept region. The scan stops at
	// cutStart so that an all-whitespace string yields an empty region
	// instead of an inverted one.
	for cutEnd = len(es.String); cutEnd > cutStart; cutEnd-- {
		switch es.String[cutEnd-1] {
		case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
			continue
		}
		break
	}
	if cutStart == 0 && cutEnd == len(es.String) {
		DebugLog("  -> NOOP\n")
		return es
	}
	// Filter in place, reusing the existing backing array.
	newEntities := es.Entities[:0]
	for _, ent := range es.Entities {
		// Clip in the original coordinates first, then shift.
		ent = *ent.TruncateStart(cutStart).TruncateEnd(cutEnd).Offset(-cutStart)
		if ent.Length > 0 {
			newEntities = append(newEntities, ent)
		}
	}
	es.String = es.String[cutStart:cutEnd]
	es.Entities = newEntities
	DebugLog("  -> %q %+v\n", es.String, es.Entities)
	return es
}
// JoinEntityString concatenates the given strings into a new EntityString,
// shifting each part's entities by the length of everything written before
// it.
//
// Nil and empty parts are skipped entirely. The separator with is written
// after every part that is kept, including the last one, so a non-empty
// result ends with the separator.
//
// The result is always non-nil, even when no parts are kept.
func JoinEntityString(with string, strings ...*EntityString) *EntityString {
	withUTF16 := telegramfmt.NewUTF16String(with)
	totalLen := 0
	totalEntities := 0
	for _, s := range strings {
		// Parts may be nil (the loop below skips them explicitly), so they
		// must not be dereferenced while sizing the buffers either.
		if s == nil {
			continue
		}
		totalLen += len(s.String)
		totalEntities += len(s.Entities)
	}
	str := make(telegramfmt.UTF16String, 0, totalLen+len(strings)*len(withUTF16))
	entities := make(telegramfmt.BodyRangeList, 0, totalEntities)
	DebugLog("JOIN %q %d\n", with, len(strings))
	for _, s := range strings {
		if s == nil || len(s.String) == 0 {
			continue
		}
		DebugLog("  + %q %+v\n", s.String, s.Entities)
		for _, entity := range s.Entities {
			entity.Start += len(str)
			entities = append(entities, entity)
		}
		str = append(str, s.String...)
		str = append(str, withUTF16...)
	}
	DebugLog("  -> %q %+v\n", str, entities)
	return &EntityString{
		String:   str,
		Entities: entities,
	}
}
// Format applies value to the whole string by inserting a range covering
// [0, len(String)) in front of the existing entities. The receiver is
// modified and returned; a nil receiver returns nil.
func (es *EntityString) Format(value telegramfmt.BodyRangeValue) *EntityString {
	if es == nil {
		return nil
	}
	combined := make(telegramfmt.BodyRangeList, 0, len(es.Entities)+1)
	combined = append(combined, telegramfmt.BodyRange{
		Start:  0,
		Length: len(es.String),
		Value:  value,
	})
	es.Entities = append(combined, es.Entities...)
	DebugLog("FORMAT %v %q %+v\n", value, es.String, es.Entities)
	return es
}
// Append adds other's text and entities to the end of the receiver, shifting
// other's entities by the receiver's current length. The receiver is
// modified and returned. If either side is nil, the other one is returned
// unchanged.
func (es *EntityString) Append(other *EntityString) *EntityString {
	switch {
	case es == nil:
		return other
	case other == nil:
		return es
	}
	DebugLog("APPEND %q %+v\n     + %q %+v\n", es.String, es.Entities, other.String, other.Entities)
	shift := len(es.String)
	for _, moved := range other.Entities {
		moved.Start += shift
		es.Entities = append(es.Entities, moved)
	}
	es.String = append(es.String, other.String...)
	DebugLog("  -> %q %+v\n", es.String, es.Entities)
	return es
}
// AppendString adds plain text to the end of the receiver without touching
// its entities. A nil receiver yields a fresh EntityString holding other; an
// empty other leaves the receiver untouched.
func (es *EntityString) AppendString(other string) *EntityString {
	if es == nil {
		return NewEntityString(other)
	}
	if other == "" {
		return es
	}
	DebugLog("APPENDSTRING %q %+v\n     + %q\n", es.String, es.Entities, other)
	suffix := telegramfmt.NewUTF16String(other)
	es.String = append(es.String, suffix...)
	DebugLog("  -> %q %+v\n", es.String, es.Entities)
	return es
}
// TagStack is a list of tag names, with the most recently added tag last.
type TagStack []string

// Index returns the position of the last occurrence of tag in the stack, or
// -1 if the tag is not present.
func (ts TagStack) Index(tag string) int {
	for pos := len(ts); pos > 0; pos-- {
		if ts[pos-1] == tag {
			return pos - 1
		}
	}
	return -1
}

// Has reports whether tag occurs anywhere in the stack.
func (ts TagStack) Has(tag string) bool {
	return ts.Index(tag) != -1
}
// Context carries per-parse state that is passed by value down the HTML tree.
type Context struct {
// Ctx is the context supplied to NewContext.
Ctx context.Context
// AllowedMentions, when non-nil, restricts user mentions: a link to a user
// that is not listed in its UserIDs is kept as plain text.
AllowedMentions *event.Mentions
// TagStack lists the tags enclosing the node being processed; WithTag
// appends to it.
TagStack TagStack
// PreserveWhitespace keeps newlines inside text nodes; when false they are
// stripped.
PreserveWhitespace bool
}
// NewContext returns a Context wrapping ctx with an empty tag stack that has
// room for four tags before it needs to grow.
func NewContext(ctx context.Context) Context {
	var parseCtx Context
	parseCtx.Ctx = ctx
	parseCtx.TagStack = make(TagStack, 0, 4)
	return parseCtx
}
// WithTag returns a copy of the context with tag pushed onto the tag stack.
// The receiver is a value, so the caller's Context keeps its original stack
// length. NOTE(review): append may reuse the caller's backing array when it
// has spare capacity, so sibling contexts can share storage — harmless as
// long as no stack is retained after its subtree has been processed.
func (ctx Context) WithTag(tag string) Context {
	next := ctx
	next.TagStack = append(next.TagStack, tag)
	return next
}
// WithWhitespace returns a copy of the context in which whitespace inside
// text nodes is preserved.
func (ctx Context) WithWhitespace() Context {
	preserved := ctx
	preserved.PreserveWhitespace = true
	return preserved
}
// HTMLParser is a somewhat customizable Matrix HTML parser.
type HTMLParser struct {
// ParseGhostMXID maps the Matrix user ID found in a mention link to a
// network user ID. When it returns false the link is left as plain text.
// It is called for every allowed user link, so it must be set before
// parsing HTML that may contain such links.
ParseGhostMXID func(id.UserID) (networkid.UserID, bool)
}
// TaggedString is a string that also contains a HTML tag.
type TaggedString struct {
// EntityString is the rendered content of the node; it may be nil when the
// node produced no output.
*EntityString
// tag is the element name for element nodes, or one of the pseudo-tags
// "text", "html" or "unknown" for other node types.
tag string
}
// maybeGetAttribute looks up an attribute on node by key. It returns the
// value of the first match and true, or an empty string and false when the
// node has no such attribute.
func (parser *HTMLParser) maybeGetAttribute(node *html.Node, attribute string) (string, bool) {
	for i := range node.Attr {
		if candidate := node.Attr[i]; candidate.Key == attribute {
			return candidate.Val, true
		}
	}
	return "", false
}
// getAttribute returns the value of the first attribute on node whose key
// equals attribute, or an empty string when there is none.
func (parser *HTMLParser) getAttribute(node *html.Node, attribute string) string {
	for _, attr := range node.Attr {
		if attr.Key == attribute {
			return attr.Val
		}
	}
	return ""
}
// Digits counts the number of digits (and the sign, if negative) in an integer.
//
// The count is computed with integer arithmetic only, so it is exact for the
// whole int range. math.MinInt is handled explicitly because it cannot be
// negated; the previous recursive implementation never terminated for it.
func Digits(num int) int {
	count := 1
	if num < 0 {
		count++ // account for the minus sign
		if num == math.MinInt {
			// -MinInt overflows back to MinInt, so shed one digit first.
			num /= 10
			count++
		}
		num = -num
	}
	for ; num >= 10; num /= 10 {
		count++
	}
	return count
}
// listToString renders an <ol> or <ul> node as plain text, one line per <li>
// child. Children that are not <li> elements are ignored.
//
// Unordered items are prefixed with "* ". Ordered items are prefixed with
// their number followed by ". ", padded so that the item text lines up; the
// numbering begins at the list's "start" attribute, or at 1 when that
// attribute is missing or is not a valid integer. Continuation lines of a
// multi-line item are indented to sit under the item text.
func (parser *HTMLParser) listToString(node *html.Node, ctx Context) *EntityString {
	ordered := node.Data == "ol"
	taggedChildren := parser.nodeToTaggedStrings(node.FirstChild, ctx)
	counter := 1
	indentLength := 0
	if ordered {
		start := parser.getAttribute(node, "start")
		if len(start) > 0 {
			// Keep the default of 1 when the attribute cannot be parsed
			// rather than silently starting from Atoi's zero result.
			if parsedStart, err := strconv.Atoi(start); err == nil {
				counter = parsedStart
			}
		}

		longestIndex := (counter - 1) + len(taggedChildren)
		indentLength = Digits(longestIndex)
	}
	indent := strings.Repeat(" ", indentLength+2)
	var children []*EntityString
	for _, child := range taggedChildren {
		if child.tag != "li" {
			continue
		}
		var prefix string
		if ordered {
			indexPadding := indentLength - Digits(counter)
			if indexPadding < 0 {
				// This will happen on negative start indexes where longestIndex is usually wrong, otherwise shouldn't happen
				indexPadding = 0
			}
			prefix = fmt.Sprintf("%d. %s", counter, strings.Repeat(" ", indexPadding))
		} else {
			prefix = "* "
		}
		es := NewEntityString(prefix).Append(child.EntityString)
		counter++
		parts := es.Split('\n')
		// Indent every line after the first so it aligns with the item text.
		for i, part := range parts[1:] {
			parts[i+1] = NewEntityString(indent).Append(part)
		}
		children = append(children, parts...)
	}
	return JoinEntityString("\n", children...)
}
// basicFormatToString renders the children of a simple inline formatting
// element and wraps the result in the style matching the tag: bold, italic,
// strikethrough, underline or inline code. For any other tag the rendered
// children are returned without an added style.
func (parser *HTMLParser) basicFormatToString(node *html.Node, ctx Context) *EntityString {
	content := parser.nodeToTagAwareString(node.FirstChild, ctx)

	var style telegramfmt.Style
	switch node.Data {
	case "b", "strong":
		style.Type = telegramfmt.StyleBold
	case "i", "em":
		style.Type = telegramfmt.StyleItalic
	case "s", "del", "strike":
		style.Type = telegramfmt.StyleStrikethrough
	case "u", "ins":
		style.Type = telegramfmt.StyleUnderline
	case "tt", "code":
		style.Type = telegramfmt.StyleCode
	default:
		return content
	}
	return content.Format(style)
}
// spanToString renders the children of a <span> or <font> element. A <span>
// carrying a data-mx-spoiler attribute (with any value) is marked as a
// spoiler; everything else is returned without an added style.
func (parser *HTMLParser) spanToString(node *html.Node, ctx Context) *EntityString {
	content := parser.nodeToTagAwareString(node.FirstChild, ctx)
	if node.Data != "span" {
		return content
	}
	if _, isSpoiler := parser.maybeGetAttribute(node, "data-mx-spoiler"); !isSpoiler {
		return content
	}
	return content.Format(telegramfmt.Style{Type: telegramfmt.StyleSpoiler})
}
// headerToString renders a heading element as bold text prefixed with one
// "#" per heading level and a space. The level is read from the second
// character of the tag name, so the node is expected to be h1 through h6.
func (parser *HTMLParser) headerToString(node *html.Node, ctx Context) *EntityString {
	level := int(node.Data[1] - '0')
	heading := NewEntityString(strings.Repeat("#", level) + " ")
	heading = heading.Append(parser.nodeToString(node.FirstChild, ctx))
	return heading.Format(telegramfmt.Style{Type: telegramfmt.StyleBold})
}
// linkToString renders an <a> element.
//
//   - Without an href, or when the link has no rendered content, the children
//     are returned as-is (possibly nil).
//   - A link to a Matrix user becomes a mention if the user is permitted by
//     ctx.AllowedMentions (when set) and ParseGhostMXID recognises the ID;
//     otherwise the link text is returned unchanged.
//   - Any other link becomes a URL entity when its text equals the href, and
//     a text-URL entity otherwise.
//
// When an entity is produced, the link text is rebuilt from the plain string
// of the children, so formatting nested inside the link is not carried over.
func (parser *HTMLParser) linkToString(node *html.Node, ctx Context) *EntityString {
	str := parser.nodeToTagAwareString(node.FirstChild, ctx)
	href := parser.getAttribute(node, "href")
	// An empty link (e.g. <a href="..."></a>) renders to nil; there is nothing
	// to attach an entity to, and dereferencing it below would panic.
	if str == nil || len(href) == 0 {
		return str
	}
	ent := NewEntityString(str.String.String())

	parsedMatrix, err := id.ParseMatrixURIOrMatrixToURL(href)
	if err == nil && parsedMatrix != nil && parsedMatrix.Sigil1 == '@' {
		mxid := parsedMatrix.UserID()
		if ctx.AllowedMentions != nil && !slices.Contains(ctx.AllowedMentions.UserIDs, mxid) {
			// Mention not allowed, use name as-is
			return str
		}
		userID, ok := parser.ParseGhostMXID(mxid)
		if !ok {
			return str
		}
		return ent.Format(telegramfmt.Mention{UserID: userID})
	}
	if str.String.String() == href {
		return ent.Format(telegramfmt.Style{Type: telegramfmt.StyleURL, URL: href})
	}
	return ent.Format(telegramfmt.Style{Type: telegramfmt.StyleTextURL, URL: href})
}
// tagToString renders a single element node, dispatching on its tag name.
// The tag is pushed onto the context's tag stack before the children are
// processed. Paragraphs and tags without special handling are rendered as
// the tag-aware concatenation of their children.
func (parser *HTMLParser) tagToString(node *html.Node, ctx Context) *EntityString {
	ctx = ctx.WithTag(node.Data)

	switch node.Data {
	case "br":
		return NewEntityString("\n")
	case "hr":
		return NewEntityString("---")
	case "a":
		return parser.linkToString(node, ctx)
	case "span", "font":
		return parser.spanToString(node, ctx)
	case "ol", "ul":
		return parser.listToString(node, ctx)
	case "h1", "h2", "h3", "h4", "h5", "h6":
		return parser.headerToString(node, ctx)
	case "b", "strong", "i", "em", "s", "strike", "del", "u", "ins", "tt", "code":
		return parser.basicFormatToString(node, ctx)
	case "blockquote":
		quoted := parser.nodeToTagAwareString(node.FirstChild, ctx)
		return quoted.Format(telegramfmt.Style{Type: telegramfmt.StyleBlockquote})
	case "pre":
		// A <pre> whose first child is <code> is unwrapped, and a
		// "language-xyz" class on that <code> supplies the language.
		body := node.FirstChild
		language := ""
		if body != nil && body.Type == html.ElementNode && body.Data == "code" {
			if class := parser.getAttribute(body, "class"); strings.HasPrefix(class, "language-") {
				language = class[len("language-"):]
			}
			body = body.FirstChild
		}
		preformatted := parser.nodeToString(body, ctx.WithWhitespace())
		return preformatted.Format(telegramfmt.Style{Type: telegramfmt.StylePre, Language: language})
	}

	// "p" and every unrecognised tag.
	return parser.nodeToTagAwareString(node.FirstChild, ctx)
}
// singleNodeToString renders one node, ignoring its siblings, and labels the
// result with the node's tag: the element name for elements, "text" for text
// nodes, "html" for the document node and "unknown" for anything else.
//
// Unless the context preserves whitespace, newlines are removed from text
// nodes; this rewrites node.Data in place.
func (parser *HTMLParser) singleNodeToString(node *html.Node, ctx Context) TaggedString {
	var result TaggedString
	switch node.Type {
	case html.TextNode:
		if !ctx.PreserveWhitespace {
			node.Data = strings.ReplaceAll(node.Data, "\n", "")
		}
		result.EntityString = NewEntityString(node.Data)
		result.tag = "text"
	case html.ElementNode:
		result.EntityString = parser.tagToString(node, ctx)
		result.tag = node.Data
	case html.DocumentNode:
		result.EntityString = parser.nodeToTagAwareString(node.FirstChild, ctx)
		result.tag = "html"
	default:
		result.EntityString = &EntityString{}
		result.tag = "unknown"
	}
	return result
}
// nodeToTaggedStrings renders node and each of its following siblings, in
// order, keeping the tag label of every result. It returns nil when node is
// nil.
func (parser *HTMLParser) nodeToTaggedStrings(node *html.Node, ctx Context) (strs []TaggedString) {
	for current := node; current != nil; current = current.NextSibling {
		rendered := parser.singleNodeToString(current, ctx)
		strs = append(strs, rendered)
	}
	return strs
}
// BlockTags lists the tags that are treated as block-level elements: when the
// parser concatenates sibling nodes, the output of these tags is surrounded
// by newlines.
var BlockTags = []string{"p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "blockquote", "div", "hr", "table"}
// isBlockTag reports whether tag is listed in BlockTags.
func (parser *HTMLParser) isBlockTag(tag string) bool {
	for i := 0; i < len(BlockTags); i++ {
		if BlockTags[i] == tag {
			return true
		}
	}
	return false
}
// nodeToTagAwareString renders node and all of its following siblings into a
// single string. The output of block-level tags is wrapped in newlines so
// that blocks end up on their own lines, and the final result has leading
// and trailing whitespace trimmed. It returns nil when nothing was rendered.
func (parser *HTMLParser) nodeToTagAwareString(node *html.Node, ctx Context) *EntityString {
	var combined *EntityString
	for _, tagged := range parser.nodeToTaggedStrings(node, ctx) {
		piece := tagged.EntityString
		if parser.isBlockTag(tagged.tag) {
			piece = NewEntityString("\n").Append(piece).AppendString("\n")
		}
		// Append on a nil receiver simply yields its argument, which covers
		// the first piece.
		combined = combined.Append(piece)
	}
	return combined.TrimSpace()
}
// nodeToStrings renders node and each of its following siblings, in order,
// discarding the tag labels. Individual entries may be nil when a node
// produced no output; the slice itself is nil when node is nil.
func (parser *HTMLParser) nodeToStrings(node *html.Node, ctx Context) (strs []*EntityString) {
	for current := node; current != nil; current = current.NextSibling {
		rendered := parser.singleNodeToString(current, ctx)
		strs = append(strs, rendered.EntityString)
	}
	return strs
}
// nodeToString renders node and its following siblings and concatenates the
// results with no separator and without the block-tag newline handling or
// trimming done by nodeToTagAwareString.
func (parser *HTMLParser) nodeToString(node *html.Node, ctx Context) *EntityString {
	pieces := parser.nodeToStrings(node, ctx)
	return JoinEntityString("", pieces...)
}
// Parse converts Matrix HTML into text using the settings in this parser.
// The result carries the formatting entities found in the markup and may be
// nil when the input renders to nothing.
func (parser *HTMLParser) Parse(htmlData string, ctx Context) *EntityString {
	reader := strings.NewReader(htmlData)
	// NOTE(review): the parse error is discarded; input comes from an
	// in-memory reader, so a failure is presumably not expected — confirm.
	root, _ := html.Parse(reader)
	return parser.nodeToTagAwareString(root, ctx)
}