matrixfmt: text formatting Matrix -> TG
Signed-off-by: Sumner Evans <sumner.evans@automattic.com>
This commit is contained in:
@@ -10,6 +10,7 @@ require (
|
||||
go.mau.fi/zerozap v0.1.1
|
||||
go.uber.org/zap v1.27.0
|
||||
golang.org/x/exp v0.0.0-20240716175740-e3f259677ff7
|
||||
golang.org/x/net v0.27.0
|
||||
maunium.net/go/mautrix v0.19.1-0.20240719130542-cc5f225bc61c
|
||||
)
|
||||
|
||||
@@ -44,7 +45,6 @@ require (
|
||||
go.uber.org/atomic v1.11.0 // indirect
|
||||
go.uber.org/multierr v1.11.0 // indirect
|
||||
golang.org/x/crypto v0.25.0 // indirect
|
||||
golang.org/x/net v0.27.0 // indirect
|
||||
golang.org/x/sync v0.7.0 // indirect
|
||||
golang.org/x/sys v0.22.0 // indirect
|
||||
gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect
|
||||
|
||||
+110
-144
@@ -4,13 +4,12 @@ import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/gotd/td/telegram/message"
|
||||
"github.com/gotd/td/telegram/message/html"
|
||||
"github.com/gotd/td/telegram/message/styling"
|
||||
"github.com/gotd/td/telegram/uploader"
|
||||
"github.com/gotd/td/tg"
|
||||
"github.com/rs/zerolog"
|
||||
@@ -24,20 +23,57 @@ import (
|
||||
|
||||
"go.mau.fi/mautrix-telegram/pkg/connector/emojis"
|
||||
"go.mau.fi/mautrix-telegram/pkg/connector/ids"
|
||||
"go.mau.fi/mautrix-telegram/pkg/connector/matrixfmt"
|
||||
"go.mau.fi/mautrix-telegram/pkg/connector/waveform"
|
||||
)
|
||||
|
||||
func getMediaFilenameAndCaption(content *event.MessageEventContent) (filename, caption string) {
|
||||
func getMediaFilename(content *event.MessageEventContent) string {
|
||||
if content.FileName != "" {
|
||||
filename = content.FileName
|
||||
caption = content.FormattedBody
|
||||
if caption == "" {
|
||||
caption = content.Body
|
||||
}
|
||||
return content.FileName
|
||||
} else {
|
||||
filename = content.Body
|
||||
return content.Body
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (t *TelegramClient) transferMediaToTelegram(ctx context.Context, content *event.MessageEventContent) (tg.InputMediaClass, error) {
|
||||
filename := getMediaFilename(content)
|
||||
var fileData []byte
|
||||
fileData, err := t.main.Bridge.Bot.DownloadMedia(ctx, content.URL, content.File)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to download media from Matrix: %w", err)
|
||||
}
|
||||
uploader := uploader.NewUploader(t.client.API())
|
||||
var upload tg.InputFileClass
|
||||
upload, err = uploader.FromBytes(ctx, filename, fileData)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to upload media to Telegram: %w", err)
|
||||
}
|
||||
|
||||
if content.MsgType == event.MsgImage {
|
||||
return &tg.InputMediaUploadedPhoto{File: upload}, nil
|
||||
}
|
||||
|
||||
var attributes []tg.DocumentAttributeClass
|
||||
attributes = append(attributes, &tg.DocumentAttributeFilename{FileName: filename})
|
||||
|
||||
if content.MsgType == event.MsgAudio {
|
||||
audioAttr := tg.DocumentAttributeAudio{
|
||||
Voice: content.MSC3245Voice != nil,
|
||||
}
|
||||
if content.MSC1767Audio != nil {
|
||||
audioAttr.Duration = content.MSC1767Audio.Duration / 1000
|
||||
if len(content.MSC1767Audio.Waveform) > 0 {
|
||||
audioAttr.Waveform = waveform.Encode(content.MSC1767Audio.Waveform)
|
||||
}
|
||||
}
|
||||
attributes = append(attributes, &audioAttr)
|
||||
}
|
||||
|
||||
return &tg.InputMediaUploadedDocument{
|
||||
File: upload,
|
||||
MimeType: content.Info.MimeType,
|
||||
Attributes: attributes,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (t *TelegramClient) HandleMatrixMessage(ctx context.Context, msg *bridgev2.MatrixMessage) (resp *bridgev2.MatrixMessageResponse, err error) {
|
||||
@@ -45,94 +81,71 @@ func (t *TelegramClient) HandleMatrixMessage(ctx context.Context, msg *bridgev2.
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
builder := message.NewSender(t.client.API()).To(peer)
|
||||
|
||||
var contentURI id.ContentURIString
|
||||
// TODO handle sticker
|
||||
|
||||
noWebpage := msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0
|
||||
|
||||
message, entities := matrixfmt.Parse(ctx, &matrixfmt.HTMLParser{
|
||||
ParseGhostMXID: t.main.Bridge.Matrix.ParseGhostMXID,
|
||||
}, msg.Content)
|
||||
|
||||
var replyTo tg.InputReplyToClass
|
||||
if msg.ReplyTo != nil {
|
||||
messageID, err := ids.ParseMessageID(msg.ReplyTo.ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
replyTo = &tg.InputReplyToMessage{ReplyToMsgID: messageID}
|
||||
}
|
||||
|
||||
var updates tg.UpdatesClass
|
||||
switch msg.Content.MsgType {
|
||||
case event.MsgText:
|
||||
// TODO unify with edits?
|
||||
if msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0 {
|
||||
builder.NoWebpage()
|
||||
}
|
||||
updates, err = builder.Text(ctx, msg.Content.Body)
|
||||
case event.MsgText, event.MsgNotice:
|
||||
updates, err = t.client.API().MessagesSendMessage(ctx, &tg.MessagesSendMessageRequest{
|
||||
Peer: peer,
|
||||
NoWebpage: noWebpage,
|
||||
Message: message,
|
||||
Entities: entities,
|
||||
ReplyTo: replyTo,
|
||||
RandomID: rand.Int63(),
|
||||
})
|
||||
case event.MsgImage, event.MsgFile, event.MsgAudio, event.MsgVideo:
|
||||
filename, caption := getMediaFilenameAndCaption(msg.Content)
|
||||
|
||||
var fileData []byte
|
||||
fileData, err = t.main.Bridge.Bot.DownloadMedia(ctx, msg.Content.URL, msg.Content.File)
|
||||
var media tg.InputMediaClass
|
||||
media, err = t.transferMediaToTelegram(ctx, msg.Content)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to download media from Matrix: %w", err)
|
||||
}
|
||||
contentURI = msg.Content.URL
|
||||
if contentURI == "" {
|
||||
contentURI = msg.Content.File.URL
|
||||
}
|
||||
|
||||
uploader := uploader.NewUploader(t.client.API())
|
||||
var upload tg.InputFileClass
|
||||
upload, err = uploader.FromBytes(ctx, filename, fileData)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to upload media to Telegram: %w", err)
|
||||
}
|
||||
var styling []styling.StyledTextOption
|
||||
if caption != "" {
|
||||
// TODO resolver?
|
||||
// TODO HTML
|
||||
styling = append(styling, html.String(nil, caption))
|
||||
}
|
||||
|
||||
if msg.Content.MsgType == event.MsgImage {
|
||||
updates, err = builder.Media(ctx, message.UploadedPhoto(upload, styling...))
|
||||
break
|
||||
} else {
|
||||
document := message.UploadedDocument(upload, styling...).Filename(filename)
|
||||
if msg.Content.Info != nil {
|
||||
document.MIME(msg.Content.Info.MimeType)
|
||||
}
|
||||
|
||||
var media message.MediaOption
|
||||
|
||||
switch msg.Content.MsgType {
|
||||
case event.MsgAudio:
|
||||
audioBuilder := document.Audio()
|
||||
if msg.Content.MSC1767Audio != nil {
|
||||
audioBuilder.Duration(time.Duration(msg.Content.MSC1767Audio.Duration) * time.Millisecond)
|
||||
if len(msg.Content.MSC1767Audio.Waveform) > 0 {
|
||||
audioBuilder.Waveform(waveform.Encode(msg.Content.MSC1767Audio.Waveform))
|
||||
}
|
||||
}
|
||||
if msg.Content.MSC3245Voice != nil {
|
||||
audioBuilder.Voice()
|
||||
}
|
||||
media = audioBuilder
|
||||
default:
|
||||
media = document
|
||||
}
|
||||
updates, err = builder.Media(ctx, media)
|
||||
return nil, err
|
||||
}
|
||||
updates, err = t.client.API().MessagesSendMedia(ctx, &tg.MessagesSendMediaRequest{
|
||||
Peer: peer,
|
||||
Message: message,
|
||||
Entities: entities,
|
||||
Media: media,
|
||||
ReplyTo: replyTo,
|
||||
RandomID: rand.Int63(),
|
||||
})
|
||||
case event.MsgLocation:
|
||||
var uri GeoURI
|
||||
uri, err = ParseGeoURI(msg.Content.GeoURI)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var styling []styling.StyledTextOption
|
||||
message = ""
|
||||
if location, ok := msg.Event.Content.Raw["org.matrix.msc3488.location"].(map[string]any); ok {
|
||||
if desc, ok := location["description"].(string); ok {
|
||||
// TODO resolver?
|
||||
// TODO HTML
|
||||
styling = append(styling, html.String(nil, desc))
|
||||
message = desc
|
||||
}
|
||||
}
|
||||
updates, err = builder.Media(ctx, message.Media(&tg.InputMediaGeoPoint{
|
||||
GeoPoint: &tg.InputGeoPoint{
|
||||
Lat: uri.Lat,
|
||||
Long: uri.Long,
|
||||
updates, err = t.client.API().MessagesSendMedia(ctx, &tg.MessagesSendMediaRequest{
|
||||
Peer: peer,
|
||||
Message: message,
|
||||
Media: &tg.InputMediaGeoPoint{
|
||||
GeoPoint: &tg.InputGeoPoint{Lat: uri.Lat, Long: uri.Long},
|
||||
},
|
||||
}, styling...))
|
||||
ReplyTo: replyTo,
|
||||
RandomID: rand.Int63(),
|
||||
})
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported message type %s", msg.Content.MsgType)
|
||||
}
|
||||
@@ -194,88 +207,41 @@ func (t *TelegramClient) HandleMatrixEdit(ctx context.Context, msg *bridgev2.Mat
|
||||
return err
|
||||
}
|
||||
|
||||
b := message.NewSender(t.client.API()).To(peer)
|
||||
if msg.Content.MsgType == event.MsgText && msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0 {
|
||||
b.NoWebpage()
|
||||
}
|
||||
|
||||
targetID, err := ids.ParseMessageID(msg.EditTarget.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
builder := b.Edit(targetID)
|
||||
|
||||
message, entities := matrixfmt.Parse(ctx, &matrixfmt.HTMLParser{
|
||||
ParseGhostMXID: t.main.Bridge.Matrix.ParseGhostMXID,
|
||||
}, msg.Content)
|
||||
|
||||
var newContentURI id.ContentURIString
|
||||
var updates tg.UpdatesClass
|
||||
switch msg.Content.MsgType {
|
||||
case event.MsgText:
|
||||
updates, err = builder.Text(ctx, msg.Content.Body)
|
||||
case event.MsgImage, event.MsgFile, event.MsgAudio, event.MsgVideo:
|
||||
filename, caption := getMediaFilenameAndCaption(msg.Content)
|
||||
|
||||
var styling []styling.StyledTextOption
|
||||
if caption != "" {
|
||||
// TODO resolver?
|
||||
// TODO HTML
|
||||
styling = append(styling, html.String(nil, caption))
|
||||
}
|
||||
|
||||
req := tg.MessagesEditMessageRequest{
|
||||
Peer: peer,
|
||||
NoWebpage: msg.Content.BeeperLinkPreviews != nil && len(msg.Content.BeeperLinkPreviews) == 0,
|
||||
Message: message,
|
||||
Entities: entities,
|
||||
ID: targetID,
|
||||
}
|
||||
if msg.Content.MsgType.IsMedia() {
|
||||
newContentURI = msg.Content.URL
|
||||
if newContentURI == "" {
|
||||
newContentURI = msg.Content.File.URL
|
||||
}
|
||||
if msg.EditTarget.Metadata.(*MessageMetadata).ContentURI == newContentURI {
|
||||
log.Info().Msg("media URI unchanged, skipping re-upload, just editing text")
|
||||
updates, err = builder.StyledText(ctx, styling...)
|
||||
break
|
||||
}
|
||||
|
||||
log.Info().Msg("media URI changed, re-uploading media")
|
||||
|
||||
var fileData []byte
|
||||
fileData, err = t.main.Bridge.Bot.DownloadMedia(ctx, msg.Content.URL, msg.Content.File)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to download media from Matrix: %w", err)
|
||||
}
|
||||
uploader := uploader.NewUploader(t.client.API())
|
||||
var upload tg.InputFileClass
|
||||
upload, err = uploader.FromBytes(ctx, filename, fileData)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to upload media to Telegram: %w", err)
|
||||
}
|
||||
|
||||
if msg.Content.MsgType == event.MsgImage {
|
||||
updates, err = builder.Media(ctx, message.UploadedPhoto(upload, styling...))
|
||||
break
|
||||
} else {
|
||||
document := message.UploadedDocument(upload, styling...).Filename(filename)
|
||||
if msg.Content.Info != nil {
|
||||
document.MIME(msg.Content.Info.MimeType)
|
||||
log.Info().Msg("media URI changed, re-uploading media")
|
||||
req.Media, err = t.transferMediaToTelegram(ctx, msg.Content)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var media message.MediaOption
|
||||
|
||||
switch msg.Content.MsgType {
|
||||
case event.MsgAudio:
|
||||
audioBuilder := document.Audio()
|
||||
if msg.Content.MSC1767Audio != nil {
|
||||
audioBuilder.Duration(time.Duration(msg.Content.MSC1767Audio.Duration) * time.Millisecond)
|
||||
if len(msg.Content.MSC1767Audio.Waveform) > 0 {
|
||||
audioBuilder.Waveform(waveform.Encode(msg.Content.MSC1767Audio.Waveform))
|
||||
}
|
||||
}
|
||||
if msg.Content.MSC3245Voice != nil {
|
||||
audioBuilder.Voice()
|
||||
}
|
||||
media = audioBuilder
|
||||
default:
|
||||
media = document
|
||||
}
|
||||
updates, err = builder.Media(ctx, media)
|
||||
}
|
||||
default:
|
||||
return fmt.Errorf("unsupported message type %s", msg.Content.MsgType)
|
||||
} else if !msg.Content.MsgType.IsText() {
|
||||
return fmt.Errorf("editing message type %s is unsupported", msg.Content.MsgType)
|
||||
}
|
||||
updates, err := t.client.API().MessagesEditMessage(ctx, &req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -0,0 +1,103 @@
|
||||
// mautrix-telegram - A Matrix-Telegram puppeting bridge.
|
||||
// Copyright (C) 2024 Sumner Evans
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
package matrixfmt
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/gotd/td/tg"
|
||||
"maunium.net/go/mautrix/event"
|
||||
|
||||
"go.mau.fi/mautrix-telegram/pkg/connector/ids"
|
||||
"go.mau.fi/mautrix-telegram/pkg/connector/telegramfmt"
|
||||
)
|
||||
|
||||
func toTelegramEntity(br telegramfmt.BodyRange) tg.MessageEntityClass {
|
||||
switch val := br.Value.(type) {
|
||||
case telegramfmt.Mention:
|
||||
userID, _ := ids.ParseUserID(val.UserID)
|
||||
return &tg.MessageEntityMentionName{
|
||||
Offset: br.Start,
|
||||
Length: br.Length,
|
||||
UserID: userID,
|
||||
}
|
||||
case telegramfmt.Style:
|
||||
switch val.Type {
|
||||
case telegramfmt.StyleBold:
|
||||
return &tg.MessageEntityBold{Offset: br.Start, Length: br.Length}
|
||||
case telegramfmt.StyleItalic:
|
||||
return &tg.MessageEntityItalic{Offset: br.Start, Length: br.Length}
|
||||
case telegramfmt.StyleUnderline:
|
||||
return &tg.MessageEntityUnderline{Offset: br.Start, Length: br.Length}
|
||||
case telegramfmt.StyleStrikethrough:
|
||||
return &tg.MessageEntityStrike{Offset: br.Start, Length: br.Length}
|
||||
case telegramfmt.StyleBlockquote:
|
||||
return &tg.MessageEntityBlockquote{Offset: br.Start, Length: br.Length}
|
||||
case telegramfmt.StyleCode:
|
||||
return &tg.MessageEntityCode{Offset: br.Start, Length: br.Length}
|
||||
case telegramfmt.StylePre:
|
||||
return &tg.MessageEntityPre{Offset: br.Start, Length: br.Length, Language: val.Language}
|
||||
case telegramfmt.StyleEmail:
|
||||
return &tg.MessageEntityEmail{Offset: br.Start, Length: br.Length}
|
||||
case telegramfmt.StyleTextURL:
|
||||
return &tg.MessageEntityTextURL{Offset: br.Start, Length: br.Length, URL: val.URL}
|
||||
case telegramfmt.StyleURL:
|
||||
return &tg.MessageEntityURL{Offset: br.Start, Length: br.Length}
|
||||
case telegramfmt.StyleBotCommand:
|
||||
return &tg.MessageEntityBotCommand{Offset: br.Start, Length: br.Length}
|
||||
case telegramfmt.StyleHashtag:
|
||||
return &tg.MessageEntityHashtag{Offset: br.Start, Length: br.Length}
|
||||
case telegramfmt.StyleCashtag:
|
||||
return &tg.MessageEntityCashtag{Offset: br.Start, Length: br.Length}
|
||||
case telegramfmt.StylePhone:
|
||||
return &tg.MessageEntityPhone{Offset: br.Start, Length: br.Length}
|
||||
case telegramfmt.StyleSpoiler:
|
||||
return &tg.MessageEntitySpoiler{Offset: br.Start, Length: br.Length}
|
||||
case telegramfmt.StyleBankCard:
|
||||
return &tg.MessageEntityBankCard{Offset: br.Start, Length: br.Length}
|
||||
default:
|
||||
panic("unsupported style type")
|
||||
}
|
||||
default:
|
||||
panic("unknown body range value")
|
||||
}
|
||||
}
|
||||
|
||||
func Parse(ctx context.Context, parser *HTMLParser, content *event.MessageEventContent) (string, []tg.MessageEntityClass) {
|
||||
if content.MsgType.IsMedia() && content.FileName == "" {
|
||||
// The body is the filename.
|
||||
return "", nil
|
||||
}
|
||||
|
||||
if content.Format != event.FormatHTML {
|
||||
return content.Body, nil
|
||||
}
|
||||
parseCtx := NewContext(ctx)
|
||||
parseCtx.AllowedMentions = content.Mentions
|
||||
parsed := parser.Parse(content.FormattedBody, parseCtx)
|
||||
if parsed == nil {
|
||||
return "", nil
|
||||
}
|
||||
var bodyRanges []tg.MessageEntityClass
|
||||
if len(parsed.Entities) > 0 {
|
||||
bodyRanges = make([]tg.MessageEntityClass, len(parsed.Entities))
|
||||
for i, ent := range parsed.Entities {
|
||||
bodyRanges[i] = toTelegramEntity(ent)
|
||||
}
|
||||
}
|
||||
return parsed.String.String(), bodyRanges
|
||||
}
|
||||
@@ -0,0 +1,490 @@
|
||||
// mautrix-telegram - A Matrix-Telegram puppeting bridge.
|
||||
// Copyright (C) 2024 Sumner Evans
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
package matrixfmt
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/exp/slices"
|
||||
"golang.org/x/net/html"
|
||||
"maunium.net/go/mautrix/bridgev2/networkid"
|
||||
"maunium.net/go/mautrix/event"
|
||||
"maunium.net/go/mautrix/id"
|
||||
|
||||
"go.mau.fi/mautrix-telegram/pkg/connector/telegramfmt"
|
||||
)
|
||||
|
||||
type EntityString struct {
|
||||
String telegramfmt.UTF16String
|
||||
Entities telegramfmt.BodyRangeList
|
||||
}
|
||||
|
||||
var DebugLog = func(format string, args ...any) {}
|
||||
|
||||
func NewEntityString(val string) *EntityString {
|
||||
DebugLog("NEW %q\n", val)
|
||||
return &EntityString{
|
||||
String: telegramfmt.NewUTF16String(val),
|
||||
}
|
||||
}
|
||||
|
||||
func (es *EntityString) Split(at uint16) []*EntityString {
|
||||
if at > 0x7F {
|
||||
panic("cannot split at non-ASCII character")
|
||||
}
|
||||
if es == nil {
|
||||
return []*EntityString{}
|
||||
}
|
||||
DebugLog("SPLIT %q %q %+v\n", es.String, rune(at), es.Entities)
|
||||
var output []*EntityString
|
||||
prevSplit := 0
|
||||
doSplit := func(i int) *EntityString {
|
||||
newES := &EntityString{
|
||||
String: es.String[prevSplit:i],
|
||||
}
|
||||
for _, entity := range es.Entities {
|
||||
if (entity.End() <= i || entity.End() > prevSplit) && (entity.Start >= prevSplit || entity.Start < i) {
|
||||
entity = *entity.TruncateStart(prevSplit).TruncateEnd(i).Offset(-prevSplit)
|
||||
if entity.Length > 0 {
|
||||
newES.Entities = append(newES.Entities, entity)
|
||||
}
|
||||
}
|
||||
}
|
||||
return newES
|
||||
}
|
||||
for i, chr := range es.String {
|
||||
if chr != at {
|
||||
continue
|
||||
}
|
||||
newES := doSplit(i)
|
||||
output = append(output, newES)
|
||||
DebugLog(" -> %q %+v\n", newES.String, newES.Entities)
|
||||
prevSplit = i + 1
|
||||
}
|
||||
if prevSplit == 0 {
|
||||
DebugLog(" -> NOOP\n")
|
||||
return []*EntityString{es}
|
||||
}
|
||||
if prevSplit != len(es.String) {
|
||||
newES := doSplit(len(es.String))
|
||||
output = append(output, newES)
|
||||
DebugLog(" -> %q %+v\n", newES.String, newES.Entities)
|
||||
}
|
||||
DebugLog("SPLITEND\n")
|
||||
return output
|
||||
}
|
||||
|
||||
func (es *EntityString) TrimSpace() *EntityString {
|
||||
if es == nil {
|
||||
return nil
|
||||
}
|
||||
DebugLog("TRIMSPACE %q %+v\n", es.String, es.Entities)
|
||||
var cutEnd, cutStart int
|
||||
for cutStart = 0; cutStart < len(es.String); cutStart++ {
|
||||
switch es.String[cutStart] {
|
||||
case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
for cutEnd = len(es.String) - 1; cutEnd >= 0; cutEnd-- {
|
||||
switch es.String[cutEnd] {
|
||||
case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
cutEnd++
|
||||
if cutStart == 0 && cutEnd == len(es.String) {
|
||||
DebugLog(" -> NOOP\n")
|
||||
return es
|
||||
}
|
||||
newEntities := es.Entities[:0]
|
||||
for _, ent := range es.Entities {
|
||||
ent = *ent.Offset(-cutStart).TruncateEnd(cutEnd)
|
||||
if ent.Length > 0 {
|
||||
newEntities = append(newEntities, ent)
|
||||
}
|
||||
}
|
||||
es.String = es.String[cutStart:cutEnd]
|
||||
es.Entities = newEntities
|
||||
DebugLog(" -> %q %+v\n", es.String, es.Entities)
|
||||
return es
|
||||
}
|
||||
|
||||
func JoinEntityString(with string, strings ...*EntityString) *EntityString {
|
||||
withUTF16 := telegramfmt.NewUTF16String(with)
|
||||
totalLen := 0
|
||||
totalEntities := 0
|
||||
for _, s := range strings {
|
||||
totalLen += len(s.String)
|
||||
totalEntities += len(s.Entities)
|
||||
}
|
||||
str := make(telegramfmt.UTF16String, 0, totalLen+len(strings)*len(withUTF16))
|
||||
entities := make(telegramfmt.BodyRangeList, 0, totalEntities)
|
||||
DebugLog("JOIN %q %d\n", with, len(strings))
|
||||
for _, s := range strings {
|
||||
if s == nil || len(s.String) == 0 {
|
||||
continue
|
||||
}
|
||||
DebugLog(" + %q %+v\n", s.String, s.Entities)
|
||||
for _, entity := range s.Entities {
|
||||
entity.Start += len(str)
|
||||
entities = append(entities, entity)
|
||||
}
|
||||
str = append(str, s.String...)
|
||||
str = append(str, withUTF16...)
|
||||
}
|
||||
DebugLog(" -> %q %+v\n", str, entities)
|
||||
return &EntityString{
|
||||
String: str,
|
||||
Entities: entities,
|
||||
}
|
||||
}
|
||||
|
||||
func (es *EntityString) Format(value telegramfmt.BodyRangeValue) *EntityString {
|
||||
if es == nil {
|
||||
return nil
|
||||
}
|
||||
newEntity := telegramfmt.BodyRange{
|
||||
Start: 0,
|
||||
Length: len(es.String),
|
||||
Value: value,
|
||||
}
|
||||
es.Entities = append(telegramfmt.BodyRangeList{newEntity}, es.Entities...)
|
||||
DebugLog("FORMAT %v %q %+v\n", value, es.String, es.Entities)
|
||||
return es
|
||||
}
|
||||
|
||||
func (es *EntityString) Append(other *EntityString) *EntityString {
|
||||
if es == nil {
|
||||
return other
|
||||
} else if other == nil {
|
||||
return es
|
||||
}
|
||||
DebugLog("APPEND %q %+v\n + %q %+v\n", es.String, es.Entities, other.String, other.Entities)
|
||||
for _, entity := range other.Entities {
|
||||
entity.Start += len(es.String)
|
||||
es.Entities = append(es.Entities, entity)
|
||||
}
|
||||
es.String = append(es.String, other.String...)
|
||||
DebugLog(" -> %q %+v\n", es.String, es.Entities)
|
||||
return es
|
||||
}
|
||||
|
||||
func (es *EntityString) AppendString(other string) *EntityString {
|
||||
if es == nil {
|
||||
return NewEntityString(other)
|
||||
} else if len(other) == 0 {
|
||||
return es
|
||||
}
|
||||
DebugLog("APPENDSTRING %q %+v\n + %q\n", es.String, es.Entities, other)
|
||||
es.String = append(es.String, telegramfmt.NewUTF16String(other)...)
|
||||
DebugLog(" -> %q %+v\n", es.String, es.Entities)
|
||||
return es
|
||||
}
|
||||
|
||||
// TagStack is the list of HTML tag names enclosing the node currently being
// converted, outermost first.
type TagStack []string

// Index returns the position of the innermost (last) occurrence of tag in the
// stack, or -1 if the tag is not present.
func (ts TagStack) Index(tag string) int {
	pos := len(ts)
	for pos > 0 {
		pos--
		if ts[pos] == tag {
			return pos
		}
	}
	return -1
}

// Has reports whether tag occurs anywhere in the stack.
func (ts TagStack) Has(tag string) bool {
	return ts.Index(tag) != -1
}
|
||||
|
||||
type Context struct {
|
||||
Ctx context.Context
|
||||
AllowedMentions *event.Mentions
|
||||
TagStack TagStack
|
||||
PreserveWhitespace bool
|
||||
}
|
||||
|
||||
func NewContext(ctx context.Context) Context {
|
||||
return Context{
|
||||
Ctx: ctx,
|
||||
TagStack: make(TagStack, 0, 4),
|
||||
}
|
||||
}
|
||||
|
||||
func (ctx Context) WithTag(tag string) Context {
|
||||
ctx.TagStack = append(ctx.TagStack, tag)
|
||||
return ctx
|
||||
}
|
||||
|
||||
func (ctx Context) WithWhitespace() Context {
|
||||
ctx.PreserveWhitespace = true
|
||||
return ctx
|
||||
}
|
||||
|
||||
// HTMLParser is a somewhat customizable Matrix HTML parser.
|
||||
type HTMLParser struct {
|
||||
ParseGhostMXID func(id.UserID) (networkid.UserID, bool)
|
||||
}
|
||||
|
||||
// TaggedString is a string that also contains a HTML tag.
|
||||
type TaggedString struct {
|
||||
*EntityString
|
||||
tag string
|
||||
}
|
||||
|
||||
func (parser *HTMLParser) maybeGetAttribute(node *html.Node, attribute string) (string, bool) {
|
||||
for _, attr := range node.Attr {
|
||||
if attr.Key == attribute {
|
||||
return attr.Val, true
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
||||
func (parser *HTMLParser) getAttribute(node *html.Node, attribute string) string {
|
||||
val, _ := parser.maybeGetAttribute(node, attribute)
|
||||
return val
|
||||
}
|
||||
|
||||
// Digits counts the number of digits (and the sign, if negative) in an integer.
//
// The count is done with integer division rather than floating-point
// logarithms, so it stays exact for values too large to be represented
// precisely as a float64.
func Digits(num int) int {
	count := 1
	if num < 0 {
		count++ // the minus sign
	}
	// Integer division truncates toward zero, so negative numbers can be
	// reduced directly. Negating first would overflow for the minimum int.
	for num >= 10 || num <= -10 {
		num /= 10
		count++
	}
	return count
}
|
||||
|
||||
func (parser *HTMLParser) listToString(node *html.Node, ctx Context) *EntityString {
|
||||
ordered := node.Data == "ol"
|
||||
taggedChildren := parser.nodeToTaggedStrings(node.FirstChild, ctx)
|
||||
counter := 1
|
||||
indentLength := 0
|
||||
if ordered {
|
||||
start := parser.getAttribute(node, "start")
|
||||
if len(start) > 0 {
|
||||
counter, _ = strconv.Atoi(start)
|
||||
}
|
||||
|
||||
longestIndex := (counter - 1) + len(taggedChildren)
|
||||
indentLength = Digits(longestIndex)
|
||||
}
|
||||
indent := strings.Repeat(" ", indentLength+2)
|
||||
var children []*EntityString
|
||||
for _, child := range taggedChildren {
|
||||
if child.tag != "li" {
|
||||
continue
|
||||
}
|
||||
var prefix string
|
||||
if ordered {
|
||||
indexPadding := indentLength - Digits(counter)
|
||||
if indexPadding < 0 {
|
||||
// This will happen on negative start indexes where longestIndex is usually wrong, otherwise shouldn't happen
|
||||
indexPadding = 0
|
||||
}
|
||||
prefix = fmt.Sprintf("%d. %s", counter, strings.Repeat(" ", indexPadding))
|
||||
} else {
|
||||
prefix = "* "
|
||||
}
|
||||
es := NewEntityString(prefix).Append(child.EntityString)
|
||||
counter++
|
||||
parts := es.Split('\n')
|
||||
for i, part := range parts[1:] {
|
||||
parts[i+1] = NewEntityString(indent).Append(part)
|
||||
}
|
||||
children = append(children, parts...)
|
||||
}
|
||||
return JoinEntityString("\n", children...)
|
||||
}
|
||||
|
||||
func (parser *HTMLParser) basicFormatToString(node *html.Node, ctx Context) *EntityString {
|
||||
str := parser.nodeToTagAwareString(node.FirstChild, ctx)
|
||||
switch node.Data {
|
||||
case "b", "strong":
|
||||
return str.Format(telegramfmt.Style{Type: telegramfmt.StyleBold})
|
||||
case "i", "em":
|
||||
return str.Format(telegramfmt.Style{Type: telegramfmt.StyleItalic})
|
||||
case "s", "del", "strike":
|
||||
return str.Format(telegramfmt.Style{Type: telegramfmt.StyleStrikethrough})
|
||||
case "u", "ins":
|
||||
return str.Format(telegramfmt.Style{Type: telegramfmt.StyleUnderline})
|
||||
case "tt", "code":
|
||||
return str.Format(telegramfmt.Style{Type: telegramfmt.StyleCode})
|
||||
}
|
||||
return str
|
||||
}
|
||||
|
||||
func (parser *HTMLParser) spanToString(node *html.Node, ctx Context) *EntityString {
|
||||
str := parser.nodeToTagAwareString(node.FirstChild, ctx)
|
||||
if node.Data == "span" {
|
||||
_, isSpoiler := parser.maybeGetAttribute(node, "data-mx-spoiler")
|
||||
if isSpoiler {
|
||||
str = str.Format(telegramfmt.Style{Type: telegramfmt.StyleSpoiler})
|
||||
}
|
||||
}
|
||||
return str
|
||||
}
|
||||
|
||||
func (parser *HTMLParser) headerToString(node *html.Node, ctx Context) *EntityString {
|
||||
length := int(node.Data[1] - '0')
|
||||
prefix := strings.Repeat("#", length) + " "
|
||||
return NewEntityString(prefix).Append(parser.nodeToString(node.FirstChild, ctx)).Format(telegramfmt.Style{Type: telegramfmt.StyleBold})
|
||||
}
|
||||
|
||||
func (parser *HTMLParser) linkToString(node *html.Node, ctx Context) *EntityString {
|
||||
str := parser.nodeToTagAwareString(node.FirstChild, ctx)
|
||||
href := parser.getAttribute(node, "href")
|
||||
if len(href) == 0 {
|
||||
return str
|
||||
}
|
||||
ent := NewEntityString(str.String.String())
|
||||
|
||||
parsedMatrix, err := id.ParseMatrixURIOrMatrixToURL(href)
|
||||
if err == nil && parsedMatrix != nil && parsedMatrix.Sigil1 == '@' {
|
||||
mxid := parsedMatrix.UserID()
|
||||
if ctx.AllowedMentions != nil && !slices.Contains(ctx.AllowedMentions.UserIDs, mxid) {
|
||||
// Mention not allowed, use name as-is
|
||||
return str
|
||||
}
|
||||
userID, ok := parser.ParseGhostMXID(mxid)
|
||||
if !ok {
|
||||
return str
|
||||
}
|
||||
return ent.Format(telegramfmt.Mention{UserID: userID})
|
||||
}
|
||||
if str.String.String() == href {
|
||||
return ent.Format(telegramfmt.Style{Type: telegramfmt.StyleURL, URL: href})
|
||||
} else {
|
||||
return ent.Format(telegramfmt.Style{Type: telegramfmt.StyleTextURL, URL: href})
|
||||
}
|
||||
}
|
||||
|
||||
func (parser *HTMLParser) tagToString(node *html.Node, ctx Context) *EntityString {
|
||||
ctx = ctx.WithTag(node.Data)
|
||||
switch node.Data {
|
||||
case "blockquote":
|
||||
return parser.
|
||||
nodeToTagAwareString(node.FirstChild, ctx).
|
||||
Format(telegramfmt.Style{Type: telegramfmt.StyleBlockquote})
|
||||
case "ol", "ul":
|
||||
return parser.listToString(node, ctx)
|
||||
case "h1", "h2", "h3", "h4", "h5", "h6":
|
||||
return parser.headerToString(node, ctx)
|
||||
case "br":
|
||||
return NewEntityString("\n")
|
||||
case "b", "strong", "i", "em", "s", "strike", "del", "u", "ins", "tt", "code":
|
||||
return parser.basicFormatToString(node, ctx)
|
||||
case "span", "font":
|
||||
return parser.spanToString(node, ctx)
|
||||
case "a":
|
||||
return parser.linkToString(node, ctx)
|
||||
case "p":
|
||||
return parser.nodeToTagAwareString(node.FirstChild, ctx)
|
||||
case "hr":
|
||||
return NewEntityString("---")
|
||||
case "pre":
|
||||
var preStr *EntityString
|
||||
var language string
|
||||
if node.FirstChild != nil && node.FirstChild.Type == html.ElementNode && node.FirstChild.Data == "code" {
|
||||
class := parser.getAttribute(node.FirstChild, "class")
|
||||
if strings.HasPrefix(class, "language-") {
|
||||
language = class[len("language-"):]
|
||||
}
|
||||
preStr = parser.nodeToString(node.FirstChild.FirstChild, ctx.WithWhitespace())
|
||||
} else {
|
||||
preStr = parser.nodeToString(node.FirstChild, ctx.WithWhitespace())
|
||||
}
|
||||
return preStr.Format(telegramfmt.Style{Type: telegramfmt.StylePre, Language: language})
|
||||
default:
|
||||
return parser.nodeToTagAwareString(node.FirstChild, ctx)
|
||||
}
|
||||
}
|
||||
|
||||
func (parser *HTMLParser) singleNodeToString(node *html.Node, ctx Context) TaggedString {
|
||||
switch node.Type {
|
||||
case html.TextNode:
|
||||
if !ctx.PreserveWhitespace {
|
||||
node.Data = strings.ReplaceAll(node.Data, "\n", "")
|
||||
}
|
||||
return TaggedString{NewEntityString(node.Data), "text"}
|
||||
case html.ElementNode:
|
||||
return TaggedString{parser.tagToString(node, ctx), node.Data}
|
||||
case html.DocumentNode:
|
||||
return TaggedString{parser.nodeToTagAwareString(node.FirstChild, ctx), "html"}
|
||||
default:
|
||||
return TaggedString{&EntityString{}, "unknown"}
|
||||
}
|
||||
}
|
||||
|
||||
func (parser *HTMLParser) nodeToTaggedStrings(node *html.Node, ctx Context) (strs []TaggedString) {
|
||||
for ; node != nil; node = node.NextSibling {
|
||||
strs = append(strs, parser.singleNodeToString(node, ctx))
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
var BlockTags = []string{"p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "blockquote", "div", "hr", "table"}
|
||||
|
||||
func (parser *HTMLParser) isBlockTag(tag string) bool {
|
||||
for _, blockTag := range BlockTags {
|
||||
if tag == blockTag {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (parser *HTMLParser) nodeToTagAwareString(node *html.Node, ctx Context) *EntityString {
|
||||
strs := parser.nodeToTaggedStrings(node, ctx)
|
||||
var output *EntityString
|
||||
for _, str := range strs {
|
||||
tstr := str.EntityString
|
||||
if parser.isBlockTag(str.tag) {
|
||||
tstr = NewEntityString("\n").Append(tstr).AppendString("\n")
|
||||
}
|
||||
if output == nil {
|
||||
output = tstr
|
||||
} else {
|
||||
output = output.Append(tstr)
|
||||
}
|
||||
}
|
||||
return output.TrimSpace()
|
||||
}
|
||||
|
||||
func (parser *HTMLParser) nodeToStrings(node *html.Node, ctx Context) (strs []*EntityString) {
|
||||
for ; node != nil; node = node.NextSibling {
|
||||
strs = append(strs, parser.singleNodeToString(node, ctx).EntityString)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (parser *HTMLParser) nodeToString(node *html.Node, ctx Context) *EntityString {
|
||||
return JoinEntityString("", parser.nodeToStrings(node, ctx)...)
|
||||
}
|
||||
|
||||
// Parse converts Matrix HTML into text using the settings in this parser.
|
||||
func (parser *HTMLParser) Parse(htmlData string, ctx Context) *EntityString {
|
||||
node, _ := html.Parse(strings.NewReader(htmlData))
|
||||
return parser.nodeToTagAwareString(node, ctx)
|
||||
}
|
||||
Reference in New Issue
Block a user