7a04f298d2
- update to latest telegram layer - remove some references to fields in tg.Entities that don't exist in the schema - originally added here: https://github.com/beeper/td/commit/820929062a2ba0104397bc01235ab58a9cff780e - referenced here - https://github.com/mautrix/telegramgo/commit/124f0967ed195b5a380c9bd02e170ada9710dde3 - https://github.com/mautrix/telegramgo/commit/4205047aab2e0639217148b5d125bfaab668bd8e
283 lines
5.7 KiB
Go
283 lines
5.7 KiB
Go
package html
|
|
|
|
import (
|
|
"io"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/go-faster/errors"
|
|
"golang.org/x/net/html"
|
|
|
|
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
|
|
"go.mau.fi/mautrix-telegram/pkg/gotd/tg"
|
|
)
|
|
|
|
type htmlParser struct {
|
|
tokenizer *html.Tokenizer
|
|
builder *entity.Builder
|
|
stack stack
|
|
attr map[string]string
|
|
opts Options
|
|
}
|
|
|
|
func (p *htmlParser) fillAttrs() {
|
|
// Clear old attrs.
|
|
for k := range p.attr {
|
|
delete(p.attr, k)
|
|
}
|
|
|
|
// Fill with new attributes.
|
|
for {
|
|
key, value, ok := p.tokenizer.TagAttr()
|
|
p.attr[string(key)] = string(value)
|
|
if !ok {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
const (
|
|
pre = "pre"
|
|
code = "code"
|
|
em = "em"
|
|
ins = "ins"
|
|
strike = "strike"
|
|
del = "del"
|
|
strong = "strong"
|
|
span = "span"
|
|
tgSpoiler = "tg-spoiler"
|
|
tgEmoji = "tg-emoji"
|
|
blockquote = "blockquote"
|
|
)
|
|
|
|
func (p *htmlParser) tag(tn []byte) string {
|
|
// Here we intern some well-known tags.
|
|
switch string(tn) {
|
|
case "b":
|
|
return "b"
|
|
case strong:
|
|
return strong
|
|
case "i":
|
|
return "i"
|
|
case em:
|
|
return em
|
|
case "u":
|
|
return "u"
|
|
case ins:
|
|
return ins
|
|
case "s":
|
|
return "s"
|
|
case strike:
|
|
return strike
|
|
case del:
|
|
return del
|
|
case "a":
|
|
return "a"
|
|
case pre:
|
|
return pre
|
|
case code:
|
|
return code
|
|
case span:
|
|
return span
|
|
case tgSpoiler:
|
|
return tgSpoiler
|
|
case tgEmoji:
|
|
return tgEmoji
|
|
case blockquote:
|
|
return blockquote
|
|
default:
|
|
return string(tn)
|
|
}
|
|
}
|
|
|
|
func (p *htmlParser) startTag() error {
|
|
var e stackElem
|
|
tn, hasAttr := p.tokenizer.TagName()
|
|
e.tag = p.tag(tn)
|
|
if hasAttr {
|
|
p.fillAttrs()
|
|
}
|
|
|
|
e.token = p.builder.Token()
|
|
// See https://core.telegram.org/bots/api#html-style.
|
|
switch e.tag {
|
|
case "b", strong:
|
|
e.format = entity.Bold()
|
|
case "i", em:
|
|
e.format = entity.Italic()
|
|
case "u", ins:
|
|
e.format = entity.Underline()
|
|
case "s", strike, del:
|
|
e.format = entity.Strike()
|
|
case "a":
|
|
e.attr = p.attr["href"]
|
|
if e.attr == "" {
|
|
break
|
|
}
|
|
|
|
f, err := getURLFormatter(e.attr, p.opts.UserResolver)
|
|
if err != nil {
|
|
f = nil
|
|
}
|
|
e.format = f
|
|
case code:
|
|
const langPrefix = "language-"
|
|
|
|
e.format = entity.Code()
|
|
e.attr = strings.TrimPrefix(p.attr["class"], langPrefix)
|
|
if len(p.stack) < 1 {
|
|
break
|
|
}
|
|
|
|
// BotAPI docs says:
|
|
// > Use nested <pre> and <code> tags, to define programming language for <pre> entity.
|
|
last := &p.stack[len(p.stack)-1]
|
|
if last.tag != pre {
|
|
break
|
|
}
|
|
|
|
if lang := e.attr; lang != "" {
|
|
// Set language parameter.
|
|
last.format = entity.Pre(lang)
|
|
}
|
|
case pre:
|
|
e.format = entity.Pre("")
|
|
if len(p.stack) < 1 {
|
|
break
|
|
}
|
|
|
|
last := &p.stack[len(p.stack)-1]
|
|
if last.tag != code {
|
|
break
|
|
}
|
|
|
|
if lang := last.attr; lang != "" {
|
|
// Set language parameter.
|
|
e.format = entity.Pre(lang)
|
|
}
|
|
case span:
|
|
if p.attr["class"] == "tg-spoiler" {
|
|
e.format = entity.Spoiler()
|
|
}
|
|
case tgSpoiler:
|
|
e.format = entity.Spoiler()
|
|
case tgEmoji:
|
|
if id, err := strconv.ParseInt(p.attr["emoji-id"], 10, 64); err == nil {
|
|
e.format = entity.CustomEmoji(id)
|
|
}
|
|
case blockquote:
|
|
_, collapsed := p.attr["expandable"]
|
|
e.format = entity.Blockquote(collapsed)
|
|
}
|
|
|
|
p.stack.push(e)
|
|
return nil
|
|
}
|
|
|
|
func (p *htmlParser) endTag(checkName bool) error {
|
|
tn, _ := p.tokenizer.TagName()
|
|
|
|
s, ok := p.stack.pop()
|
|
switch {
|
|
case !ok:
|
|
return errors.Errorf("unexpected end tag %q", tn)
|
|
case checkName && s.tag != string(tn):
|
|
return errors.Errorf("expected tag %q, got %q", s.tag, tn)
|
|
}
|
|
|
|
// Compute UTF-16 length of entity.
|
|
length := s.token.UTF16Length(p.builder)
|
|
|
|
switch s.tag {
|
|
case "a":
|
|
// TDLib tries to parse link from <a> body, so we should too.
|
|
if s.attr == "" {
|
|
msg := s.token.Text(p.builder)
|
|
if f, err := getURLFormatter(msg, p.opts.UserResolver); err == nil {
|
|
s.format = f
|
|
}
|
|
}
|
|
case "code":
|
|
l, ok := p.builder.LastEntity()
|
|
if !ok {
|
|
break
|
|
}
|
|
last, ok := l.(*tg.MessageEntityPre)
|
|
if !ok {
|
|
break
|
|
}
|
|
// Do not add Code entity, if last entity is Pre with same offset.
|
|
if last.GetOffset() == s.token.UTF16Offset() && last.GetLength() == length {
|
|
return nil
|
|
}
|
|
}
|
|
// Do not add empty entities.
|
|
if length == 0 || s.format == nil {
|
|
return nil
|
|
}
|
|
|
|
s.token.Apply(p.builder, s.format)
|
|
return nil
|
|
}
|
|
|
|
func (p *htmlParser) parse() error {
|
|
for {
|
|
tt := p.tokenizer.Next()
|
|
switch tt {
|
|
case html.ErrorToken:
|
|
if err := p.tokenizer.Err(); !errors.Is(err, io.EOF) {
|
|
return err
|
|
}
|
|
return nil
|
|
case html.TextToken:
|
|
var text []byte
|
|
if p.opts.DisableTelegramEscape {
|
|
text = p.tokenizer.Text()
|
|
} else {
|
|
text = telegramUnescape(p.tokenizer.Raw())
|
|
}
|
|
_, _ = p.builder.Write(text)
|
|
case html.StartTagToken:
|
|
if err := p.startTag(); err != nil {
|
|
return err
|
|
}
|
|
case html.EndTagToken:
|
|
if err := p.endTag(true); err != nil {
|
|
return err
|
|
}
|
|
case html.CommentToken:
|
|
// html.Tokenizer returns comment token for empty closing tags.
|
|
raw := p.tokenizer.Raw()
|
|
if len(raw) >= 3 && string(raw[:2]) == "</" && raw[len(raw)-1] == '>' {
|
|
if err := p.endTag(false); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// HTML parses given input from reader and adds parsed entities to given builder.
|
|
// Notice that this parser ignores unsupported tags.
|
|
//
|
|
// Parameter userResolver is used to resolve user by ID during formatting. May be nil.
|
|
// If userResolver is nil, formatter will create tg.InputUser using only ID.
|
|
// Notice that it's okay for bots, but not for users.
|
|
//
|
|
// See https://core.telegram.org/bots/api#html-style.
|
|
func HTML(r io.Reader, b *entity.Builder, opts Options) error {
|
|
opts.setDefaults()
|
|
p := htmlParser{
|
|
tokenizer: html.NewTokenizer(r),
|
|
builder: b,
|
|
attr: map[string]string{},
|
|
opts: opts,
|
|
}
|
|
|
|
if err := p.parse(); err != nil {
|
|
return errors.Wrap(err, "parse")
|
|
}
|
|
b.ShrinkPreCode()
|
|
return nil
|
|
}
|