package html

import (
	"io"
	"strconv"
	"strings"

	"github.com/go-faster/errors"
	"golang.org/x/net/html"

	"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
	"go.mau.fi/mautrix-telegram/pkg/gotd/tg"
)

// htmlParser carries the state of one parsing run: the tokenizer being read,
// the entity builder being written to, the stack of currently open tags and a
// reusable attribute map for the tag being processed.
type htmlParser struct {
	tokenizer *html.Tokenizer
	builder   *entity.Builder
	stack     stack
	// attr holds attributes of the most recently seen start tag.
	// The map is reused between tags to avoid reallocating it.
	attr map[string]string
	opts Options
}

// resetAttrs removes every entry from the shared attribute map while keeping
// the map itself for reuse.
func (p *htmlParser) resetAttrs() {
	for k := range p.attr {
		delete(p.attr, k)
	}
}

// fillAttrs replaces the contents of p.attr with the attributes of the current
// tag. It must only be called when the tokenizer reported that the tag has
// attributes; otherwise an empty key would be stored.
func (p *htmlParser) fillAttrs() {
	// Clear old attrs.
	p.resetAttrs()

	// Fill with new attributes. The third result of TagAttr tells whether
	// more attributes follow, so the current pair is stored before checking it.
	for {
		key, value, more := p.tokenizer.TagAttr()
		p.attr[string(key)] = string(value)
		if !more {
			break
		}
	}
}

// Names of the supported tags that are referenced more than once.
const (
	pre        = "pre"
	code       = "code"
	em         = "em"
	ins        = "ins"
	strike     = "strike"
	del        = "del"
	strong     = "strong"
	span       = "span"
	tgSpoiler  = "tg-spoiler"
	tgEmoji    = "tg-emoji"
	blockquote = "blockquote"
)

// tag converts a raw tag name into a string. For well-known tags a constant
// string is returned, so no new string is allocated for them.
func (p *htmlParser) tag(tn []byte) string {
	// Here we intern some well-known tags.
	switch string(tn) {
	case "b":
		return "b"
	case strong:
		return strong
	case "i":
		return "i"
	case em:
		return em
	case "u":
		return "u"
	case ins:
		return ins
	case "s":
		return "s"
	case strike:
		return strike
	case del:
		return del
	case "a":
		return "a"
	case pre:
		return pre
	case code:
		return code
	case span:
		return span
	case tgSpoiler:
		return tgSpoiler
	case tgEmoji:
		return tgEmoji
	case blockquote:
		return blockquote
	default:
		return string(tn)
	}
}

// startTag handles a start tag token: it reads the tag name and attributes,
// chooses the entity formatter for supported tags and pushes the element onto
// the stack. Unsupported tags are pushed with a nil formatter. The returned
// error is currently always nil.
func (p *htmlParser) startTag() error {
	var e stackElem

	tn, hasAttr := p.tokenizer.TagName()
	e.tag = p.tag(tn)
	if hasAttr {
		p.fillAttrs()
	} else {
		// The attribute map is shared between tags. Without this reset a tag
		// without attributes would observe attributes of the previous tag
		// (e.g. a bare <span> following <span class="tg-spoiler">).
		p.resetAttrs()
	}
	e.token = p.builder.Token()

	// See https://core.telegram.org/bots/api#html-style.
	switch e.tag {
	case "b", strong:
		e.format = entity.Bold()
	case "i", em:
		e.format = entity.Italic()
	case "u", ins:
		e.format = entity.Underline()
	case "s", strike, del:
		e.format = entity.Strike()
	case "a":
		e.attr = p.attr["href"]
		if e.attr == "" {
			// No href: the formatter may still be derived from the link body
			// when the tag is closed.
			break
		}
		f, err := getURLFormatter(e.attr, p.opts.UserResolver)
		if err != nil {
			// An unusable href leaves the element without a formatter.
			f = nil
		}
		e.format = f
	case code:
		const langPrefix = "language-"

		e.format = entity.Code()
		e.attr = strings.TrimPrefix(p.attr["class"], langPrefix)

		if len(p.stack) < 1 {
			break
		}
		// BotAPI docs says:
		// > Use nested <pre> and <code> tags, to define programming language for pre entity.
		last := &p.stack[len(p.stack)-1]
		if last.tag != pre {
			break
		}
		if lang := e.attr; lang != "" {
			// Set language parameter.
			last.format = entity.Pre(lang)
		}
	case pre:
		e.format = entity.Pre("")

		if len(p.stack) < 1 {
			break
		}
		// Reverse nesting (<code> wrapping <pre>): take the language from the
		// enclosing code element.
		last := &p.stack[len(p.stack)-1]
		if last.tag != code {
			break
		}
		if lang := last.attr; lang != "" {
			// Set language parameter.
			e.format = entity.Pre(lang)
		}
	case span:
		if p.attr["class"] == "tg-spoiler" {
			e.format = entity.Spoiler()
		}
	case tgSpoiler:
		e.format = entity.Spoiler()
	case tgEmoji:
		// A missing or non-numeric emoji-id leaves the element unformatted.
		if id, err := strconv.ParseInt(p.attr["emoji-id"], 10, 64); err == nil {
			e.format = entity.CustomEmoji(id)
		}
	case blockquote:
		// Only presence of the attribute matters, not its value.
		_, collapsed := p.attr["expandable"]
		e.format = entity.Blockquote(collapsed)
	}

	p.stack.push(e)
	return nil
}
// endTag handles a closing tag. It pops the innermost open element from the
// stack and, if that element has a formatter and covers non-empty text,
// applies the formatter through the element's token.
//
// When checkName is true, the name of the closing tag must match the popped
// element's tag, otherwise an error is returned. An error is also returned if
// there is no open element at all.
func (p *htmlParser) endTag(checkName bool) error {
	name, _ := p.tokenizer.TagName()

	elem, found := p.stack.pop()
	if !found {
		return errors.Errorf("unexpected end tag %q", name)
	}
	if checkName && elem.tag != string(name) {
		return errors.Errorf("expected tag %q, got %q", elem.tag, name)
	}

	// UTF-16 length of the span covered by this element.
	size := elem.token.UTF16Length(p.builder)

	switch {
	case elem.tag == "a" && elem.attr == "":
		// TDLib tries to parse link from body, so we should too.
		body := elem.token.Text(p.builder)
		formatter, err := getURLFormatter(body, p.opts.UserResolver)
		if err == nil {
			elem.format = formatter
		}
	case elem.tag == "code":
		// Skip the Code entity when the previously added entity is a Pre
		// covering exactly the same span.
		if prev, has := p.builder.LastEntity(); has {
			preEntity, isPre := prev.(*tg.MessageEntityPre)
			if isPre &&
				preEntity.GetOffset() == elem.token.UTF16Offset() &&
				preEntity.GetLength() == size {
				return nil
			}
		}
	}

	// Empty spans and elements without a formatter produce no entity.
	if size != 0 && elem.format != nil {
		elem.token.Apply(p.builder, elem.format)
	}
	return nil
}
// parse reads tokens from the tokenizer until it reports an error and feeds
// them to the builder:
//
//   - text tokens are written to the builder, either as decoded by the
//     tokenizer (opts.DisableTelegramEscape) or passed through telegramUnescape;
//   - start and end tags are dispatched to startTag and endTag;
//   - the empty closing tag "</>" closes whatever element is currently open;
//   - all other tokens are ignored.
//
// Reaching io.EOF is the normal way to finish and yields a nil error; any
// other tokenizer error, or an error from tag handling, is returned as is.
func (p *htmlParser) parse() error {
	for {
		tt := p.tokenizer.Next()
		switch tt {
		case html.ErrorToken:
			if err := p.tokenizer.Err(); !errors.Is(err, io.EOF) {
				return err
			}
			return nil
		case html.TextToken:
			var text []byte
			if p.opts.DisableTelegramEscape {
				text = p.tokenizer.Text()
			} else {
				text = telegramUnescape(p.tokenizer.Raw())
			}
			// NOTE(review): the write error is discarded on purpose by the
			// original code; presumably the builder's Write cannot fail — confirm.
			_, _ = p.builder.Write(text)
		case html.StartTagToken:
			if err := p.startTag(); err != nil {
				return err
			}
		case html.EndTagToken:
			if err := p.endTag(true); err != nil {
				return err
			}
		case html.CommentToken:
			// html.Tokenizer returns comment token for empty closing tags.
			// Such a token has the raw form "</>", so it is recognised by the
			// "</" prefix and the trailing '>'. The tag name is not checked
			// because there is none.
			raw := p.tokenizer.Raw()
			if len(raw) >= 3 && string(raw[:2]) == "</" && raw[len(raw)-1] == '>' {
				if err := p.endTag(false); err != nil {
					return err
				}
			}
		}
	}
}
// HTML reads Telegram-flavoured HTML from r and writes the resulting text and
// message entities to b.
// Notice that this parser ignores unsupported tags: their text is kept, but no
// entity is produced for them.
//
// Parsing behaviour is configured through opts; unset options are filled in by
// opts.setDefaults. opts.UserResolver is passed to the URL formatter used for
// links.
// NOTE(review): earlier documentation stated that with a nil resolver the
// formatter creates tg.InputUser using only the ID, which is okay for bots but
// not for users — confirm this still holds for the defaults.
//
// Any parsing failure is returned wrapped with the "parse" message.
//
// See https://core.telegram.org/bots/api#html-style.
func HTML(r io.Reader, b *entity.Builder, opts Options) error {
	opts.setDefaults()

	parser := &htmlParser{
		tokenizer: html.NewTokenizer(r),
		builder:   b,
		attr:      make(map[string]string),
		opts:      opts,
	}

	err := parser.parse()
	if err != nil {
		return errors.Wrap(err, "parse")
	}

	// Post-process entities once the whole input has been consumed.
	b.ShrinkPreCode()
	return nil
}