move gotd fork into repo. (#111)

- update to latest telegram layer
- remove some references to fields in tg.Entities that don't exist in
  the schema
  - originally added here:
    https://github.com/beeper/td/commit/820929062a2ba0104397bc01235ab58a9cff780e
  - referenced here:
    - https://github.com/mautrix/telegramgo/commit/124f0967ed195b5a380c9bd02e170ada9710dde3
    - https://github.com/mautrix/telegramgo/commit/4205047aab2e0639217148b5d125bfaab668bd8e
This commit is contained in:
Adam Van Ymeren
2025-06-27 20:03:37 -07:00
committed by GitHub
parent 0952df0244
commit 7a04f298d2
19264 changed files with 1539697 additions and 84 deletions
+50
View File
@@ -0,0 +1,50 @@
// Package html contains HTML styling options.
package html
import (
"bytes"
"fmt"
"io"
"strings"
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/styling"
"go.mau.fi/mautrix-telegram/pkg/gotd/tg"
)
// Bytes reads HTML from given byte slice and returns styling option
// to build styled text block.
//
// A fresh reader over b is created every time the returned option is applied,
// so a repeated application parses the whole input again instead of reading
// from an already drained reader. The caller must not modify b while the
// option is still in use.
func Bytes(resolver func(id int64) (tg.InputUserClass, error), b []byte) styling.StyledTextOption {
	return styling.Custom(func(eb *entity.Builder) error {
		return HTML(bytes.NewReader(b), eb, Options{
			UserResolver: resolver,
		})
	})
}
// String reads HTML from given string and returns styling option
// to build styled text block.
//
// A fresh reader over s is created every time the returned option is applied,
// so a repeated application parses the whole input again instead of reading
// from an already drained reader.
func String(resolver func(id int64) (tg.InputUserClass, error), s string) styling.StyledTextOption {
	return styling.Custom(func(eb *entity.Builder) error {
		return HTML(strings.NewReader(s), eb, Options{
			UserResolver: resolver,
		})
	})
}
// Format renders format and args with fmt, parses the result as HTML and
// returns styling option to build styled text block.
//
// Formatting happens each time the returned option is applied, not when
// Format is called.
func Format(resolver func(id int64) (tg.InputUserClass, error), format string, args ...interface{}) styling.StyledTextOption {
	render := func(eb *entity.Builder) error {
		formatted := new(bytes.Buffer)
		if _, err := fmt.Fprintf(formatted, format, args...); err != nil {
			return err
		}
		opts := Options{UserResolver: resolver}
		return HTML(formatted, eb, opts)
	}
	return styling.Custom(render)
}
// Reader returns styling option which parses HTML read from r
// to build styled text block.
//
// The same r is handed to the parser on every application of the option.
func Reader(resolver func(id int64) (tg.InputUserClass, error), r io.Reader) styling.StyledTextOption {
	parse := func(eb *entity.Builder) error {
		opts := Options{UserResolver: resolver}
		return HTML(r, eb, opts)
	}
	return styling.Custom(parse)
}
@@ -0,0 +1,52 @@
package html_test
import (
"context"
"fmt"
"os"
"os/signal"
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram"
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message"
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/html"
"go.mau.fi/mautrix-telegram/pkg/gotd/tg"
)
// sendHTML creates a client from the environment, runs it and sends one
// HTML-styled message to the current user (Self).
//
// It returns the client construction error, or whatever client.Run returns.
func sendHTML(ctx context.Context) error {
client, err := telegram.ClientFromEnvironment(telegram.Options{})
if err != nil {
return err
}
// This example creates a styled message from BotAPI examples
// and sends to your Saved Messages folder.
// See https://core.telegram.org/bots/api#html-style.
return client.Run(ctx, func(ctx context.Context) error {
// A nil resolver is passed to html.String here.
// NOTE(review): the message is a raw string literal, so the `\n` sequences
// in the blockquote lines are a backslash followed by 'n', not line breaks.
_, err := message.NewSender(tg.NewClient(client)).
Self().StyledText(ctx, html.String(nil, `<b>bold</b>, <strong>bold</strong>
<i>italic</i>, <em>italic</em>
<u>underline</u>, <ins>underline</ins>
<s>strikethrough</s>, <strike>strikethrough</strike>, <del>strikethrough</del>
<span class="tg-spoiler">spoiler</span>, <tg-spoiler>spoiler</tg-spoiler>
<b>bold <i>italic bold <s>italic bold strikethrough <span class="tg-spoiler">italic bold strikethrough spoiler</span></s> <u>underline italic bold</u></i> bold</b>
<a href="http://www.example.com/">inline URL</a>
<a href="tg://user?id=123456789">inline mention of a user</a>
<tg-emoji emoji-id="5368324170671202286">👍</tg-emoji>
<code>inline fixed-width code</code>
<pre>pre-formatted fixed-width code block</pre>
<pre><code class="language-python">pre-formatted fixed-width code block written in the Python programming language</code></pre>
<blockquote>Block quotation started\nBlock quotation continued\nThe last line of the block quotation</blockquote>
<blockquote expandable>Expandable block quotation started\nExpandable block quotation continued\nExpandable block quotation continued\nHidden by default part of the block quotation started\nExpandable block quotation continued\nThe last line of the block quotation</blockquote>`))
return err
})
}
// ExampleString shows how to send an HTML-styled message built with
// html.String. The run is cancelled when the process receives os.Interrupt.
func ExampleString() {
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
defer cancel()
if err := sendHTML(ctx); err != nil {
// Print the error to stderr and exit with status 2.
_, _ = fmt.Fprintf(os.Stderr, "%+v\n", err)
os.Exit(2)
}
}
+28
View File
@@ -0,0 +1,28 @@
package html
import (
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
"go.mau.fi/mautrix-telegram/pkg/gotd/tg"
)
// Options is options of HTML.
type Options struct {
// UserResolver is used to resolve user by ID during formatting. May be nil.
//
// If UserResolver is nil, formatter will create tg.InputUser using only ID.
// Notice that it's okay for bots, but not for users.
UserResolver entity.UserResolver
// DisableTelegramEscape disables Telegram BotAPI escaping and uses default
// golang.org/x/net/html escape.
DisableTelegramEscape bool
}
// setDefaults fills unset options with their default values.
//
// Currently only UserResolver has a default: a resolver which builds
// tg.InputUser from the bare user ID and never fails.
func (o *Options) setDefaults() {
	if o.UserResolver != nil {
		return
	}
	o.UserResolver = func(id int64) (tg.InputUserClass, error) {
		user := tg.InputUser{UserID: id}
		return &user, nil
	}
}
+282
View File
@@ -0,0 +1,282 @@
package html
import (
"io"
"strconv"
"strings"
"github.com/go-faster/errors"
"golang.org/x/net/html"
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
"go.mau.fi/mautrix-telegram/pkg/gotd/tg"
)
// htmlParser holds the state of one HTML parsing run.
type htmlParser struct {
// tokenizer produces the HTML tokens consumed by parse.
tokenizer *html.Tokenizer
// builder receives the message text and the computed entities.
builder *entity.Builder
// stack holds the currently open tags, innermost last.
stack stack
// attr is a scratch map of tag attributes, refilled by fillAttrs.
attr map[string]string
// opts are the options the parser was created with.
opts Options
}
// fillAttrs replaces the contents of p.attr with the attributes of the tag
// the tokenizer is currently positioned on.
//
// At least one attribute is always read, so callers invoke it only when the
// tokenizer reported that the tag has attributes.
func (p *htmlParser) fillAttrs() {
	// Forget attributes of the previously processed tag.
	for name := range p.attr {
		delete(p.attr, name)
	}

	// Read attributes until the tokenizer reports there are no more.
	for more := true; more; {
		var key, value []byte
		key, value, more = p.tokenizer.TagAttr()
		p.attr[string(key)] = string(value)
	}
}
// Names of the multi-letter HTML tags the parser gives a meaning to.
// Single-letter tags ("b", "i", "u", "s", "a") are spelled inline where used.
const (
pre = "pre"
code = "code"
em = "em"
ins = "ins"
strike = "strike"
del = "del"
strong = "strong"
span = "span"
tgSpoiler = "tg-spoiler"
tgEmoji = "tg-emoji"
blockquote = "blockquote"
)
// tag converts a raw tag name to a string.
//
// Well-known tag names are interned: the matching constant is returned
// instead of a fresh copy of tn. Any other name is converted as is.
func (p *htmlParser) tag(tn []byte) string {
	wellKnown := [...]string{
		"b", strong,
		"i", em,
		"u", ins,
		"s", strike, del,
		"a",
		pre, code,
		span,
		tgSpoiler, tgEmoji,
		blockquote,
	}
	for _, known := range wellKnown {
		if string(tn) == known {
			return known
		}
	}
	return string(tn)
}
// startTag handles the start tag the tokenizer is currently positioned on.
//
// It records the tag name, the current builder position and, for supported
// tags, the formatter to apply once the tag is closed, then pushes the element
// onto the stack. Unsupported tags are pushed without a formatter.
// The returned error is always nil.
func (p *htmlParser) startTag() error {
	var e stackElem

	tn, hasAttr := p.tokenizer.TagName()
	e.tag = p.tag(tn)
	// p.attr is shared between tags. Drop the previous tag's attributes even
	// when this tag has none, otherwise the second tag in
	// `<a href="x">a</a><a>b</a>` would see the first tag's href.
	for k := range p.attr {
		delete(p.attr, k)
	}
	if hasAttr {
		p.fillAttrs()
	}
	e.token = p.builder.Token()

	// See https://core.telegram.org/bots/api#html-style.
	switch e.tag {
	case "b", strong:
		e.format = entity.Bold()
	case "i", em:
		e.format = entity.Italic()
	case "u", ins:
		e.format = entity.Underline()
	case "s", strike, del:
		e.format = entity.Strike()
	case "a":
		e.attr = p.attr["href"]
		if e.attr == "" {
			// No href: endTag tries to build a link from the tag body instead.
			break
		}
		f, err := getURLFormatter(e.attr, p.opts.UserResolver)
		if err != nil {
			// An invalid link is not an error, the text is just left unstyled.
			f = nil
		}
		e.format = f
	case code:
		const langPrefix = "language-"
		e.format = entity.Code()
		e.attr = strings.TrimPrefix(p.attr["class"], langPrefix)

		if len(p.stack) < 1 {
			break
		}
		// BotAPI docs says:
		// > Use nested <pre> and <code> tags, to define programming language for <pre> entity.
		last := &p.stack[len(p.stack)-1]
		if last.tag != pre {
			break
		}
		if lang := e.attr; lang != "" {
			// Set language parameter of the enclosing <pre>.
			last.format = entity.Pre(lang)
		}
	case pre:
		e.format = entity.Pre("")

		if len(p.stack) < 1 {
			break
		}
		last := &p.stack[len(p.stack)-1]
		if last.tag != code {
			break
		}
		if lang := last.attr; lang != "" {
			// Take language parameter from the enclosing <code>.
			e.format = entity.Pre(lang)
		}
	case span:
		// <span> is meaningful only as a spoiler container.
		if p.attr["class"] == tgSpoiler {
			e.format = entity.Spoiler()
		}
	case tgSpoiler:
		e.format = entity.Spoiler()
	case tgEmoji:
		// A missing or malformed emoji-id leaves the tag unstyled.
		if id, err := strconv.ParseInt(p.attr["emoji-id"], 10, 64); err == nil {
			e.format = entity.CustomEmoji(id)
		}
	case blockquote:
		// Mere presence of the attribute matters, its value is ignored.
		_, collapsed := p.attr["expandable"]
		e.format = entity.Blockquote(collapsed)
	}
	p.stack.push(e)
	return nil
}
// endTag handles an end tag: it pops the innermost open element and applies
// its formatter to the text written since the element was opened.
//
// If checkName is true, the end tag name must match the popped element.
// An error is returned when there is no open element or the names differ.
func (p *htmlParser) endTag(checkName bool) error {
	tn, _ := p.tokenizer.TagName()
	elem, found := p.stack.pop()
	if !found {
		return errors.Errorf("unexpected end tag %q", tn)
	}
	if checkName && elem.tag != string(tn) {
		return errors.Errorf("expected tag %q, got %q", elem.tag, tn)
	}

	// Compute UTF-16 length of entity.
	size := elem.token.UTF16Length(p.builder)

	switch {
	case elem.tag == "a" && elem.attr == "":
		// TDLib tries to parse link from <a> body, so we should too.
		body := elem.token.Text(p.builder)
		if f, err := getURLFormatter(body, p.opts.UserResolver); err == nil {
			elem.format = f
		}
	case elem.tag == code:
		// Do not add Code entity, if last entity is Pre with same offset
		// and length.
		if prev, ok := p.builder.LastEntity(); ok {
			preEntity, isPre := prev.(*tg.MessageEntityPre)
			if isPre &&
				preEntity.GetOffset() == elem.token.UTF16Offset() &&
				preEntity.GetLength() == size {
				return nil
			}
		}
	}

	// Empty entities and elements without a formatter are not added.
	if size != 0 && elem.format != nil {
		elem.token.Apply(p.builder, elem.format)
	}
	return nil
}
// parse consumes tokens until the tokenizer fails, dispatching text to the
// builder and tags to startTag/endTag.
//
// Reaching io.EOF is the normal way to finish and yields a nil error; any
// other tokenizer or tag handling error is returned as is.
func (p *htmlParser) parse() error {
	for {
		var err error
		switch p.tokenizer.Next() {
		case html.ErrorToken:
			if tokErr := p.tokenizer.Err(); !errors.Is(tokErr, io.EOF) {
				return tokErr
			}
			return nil
		case html.TextToken:
			// Either use the tokenizer's own unescaping or the Telegram
			// flavoured one applied to the raw text.
			if p.opts.DisableTelegramEscape {
				_, _ = p.builder.Write(p.tokenizer.Text())
			} else {
				_, _ = p.builder.Write(telegramUnescape(p.tokenizer.Raw()))
			}
		case html.StartTagToken:
			err = p.startTag()
		case html.EndTagToken:
			err = p.endTag(true)
		case html.CommentToken:
			// html.Tokenizer returns comment token for empty closing tags.
			raw := p.tokenizer.Raw()
			emptyClose := len(raw) >= 3 && string(raw[:2]) == "</" && raw[len(raw)-1] == '>'
			if emptyClose {
				err = p.endTag(false)
			}
		}
		if err != nil {
			return err
		}
	}
}
// HTML parses given input from reader and adds parsed entities to given builder.
// Notice that this parser ignores unsupported tags.
//
// opts.UserResolver is used to resolve user by ID during formatting. May be nil.
// If it is nil, formatter will create tg.InputUser using only ID.
// Notice that it's okay for bots, but not for users.
//
// Parsing errors are returned wrapped with the "parse" prefix.
//
// See https://core.telegram.org/bots/api#html-style.
func HTML(r io.Reader, b *entity.Builder, opts Options) error {
	opts.setDefaults()

	parser := &htmlParser{
		tokenizer: html.NewTokenizer(r),
		builder:   b,
		attr:      make(map[string]string),
		opts:      opts,
	}
	err := parser.parse()
	if err != nil {
		return errors.Wrap(err, "parse")
	}

	b.ShrinkPreCode()
	return nil
}
@@ -0,0 +1,34 @@
package html
import (
"strings"
"testing"
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
)
// BenchmarkHTML measures parsing of a message which uses most of the
// supported tags, reporting allocations.
func BenchmarkHTML(b *testing.B) {
input := `<b>bold</b>, <strong>bold</strong>
<i>italic</i>, <em>italic</em>
<u>underline</u>, <ins>underline</ins>
<s>strikethrough</s>, <strike>strikethrough</strike>, <del>strikethrough</del>
<b>bold <i>italic bold <s>italic bold strikethrough</s> <u>underline italic bold</u></i> bold</b>
<a href="http://www.example.com/">inline URL</a>
<a href="tg://user?id=123456789">inline mention of a user</a>
<code>inline fixed-width code</code>
<pre>pre-formatted fixed-width code block</pre>
<pre><code class="language-python">pre-formatted fixed-width code block written in the Python programming language</code></pre>`
reader := strings.NewReader(input)
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
// Rewind the shared reader and parse into a fresh builder.
reader.Reset(input)
builder := entity.Builder{}
if err := HTML(reader, &builder, Options{}); err != nil {
b.Fatal(err)
}
}
}
@@ -0,0 +1,211 @@
package html
import (
"fmt"
"strings"
"testing"
"github.com/stretchr/testify/require"
"golang.org/x/net/html"
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
"go.mau.fi/mautrix-telegram/pkg/gotd/tg"
)
// htmlTestCase describes one HTML parsing test.
type htmlTestCase struct {
// html is the parser input.
html string
// msg is the expected plain message text; also used as subtest name.
msg string
// entities builds the expected entities from the expected message text.
// If nil, no entities are expected.
entities func(msg string) []tg.MessageEntityClass
// wantErr marks that parsing is expected to fail.
wantErr bool
// skipReason, if not empty, makes the test skip with this message.
skipReason string
}
// getEntities returns a function which applies every given formatter to the
// whole message, i.e. from offset 0 over the computed length of msg.
func getEntities(formats ...entity.Formatter) func(msg string) []tg.MessageEntityClass {
	return func(msg string) []tg.MessageEntityClass {
		n := entity.ComputeLength(msg)
		out := make([]tg.MessageEntityClass, 0, len(formats))
		for _, format := range formats {
			out = append(out, format(0, n))
		}
		return out
	}
}
// TestHTML runs the table-driven parser tests: a set of common BotAPI
// examples and the cases returned by tdlibHTMLTests.
func TestHTML(t *testing.T) {
// runTests returns a subtest running every given case. If numericName is
// true, or a case has an empty msg, the case is named by its index.
runTests := func(tests []htmlTestCase, numericName bool) func(t *testing.T) {
return func(t *testing.T) {
for i, test := range tests {
testName := test.msg
if numericName || testName == "" {
testName = fmt.Sprintf("Test%d", i+1)
}
t.Run(strings.Title(testName), func(t *testing.T) {
// Log the input of failed cases to ease debugging.
t.Cleanup(func() {
if t.Failed() {
t.Logf("Input: %q", test.html)
}
})
if test.skipReason != "" {
t.Skip(test.skipReason)
}
a := require.New(t)
b := entity.Builder{}
err := HTML(strings.NewReader(test.html), &b, Options{})
if test.wantErr {
a.Error(err)
return
}
a.NoError(err)
var (
msg string
entities []tg.MessageEntityClass
)
if strings.TrimSpace(test.msg) != test.msg {
// Complete cuts spaces and fixes entities, but TDLib test expects
// that it happens after parsing.
msg, entities = b.Raw()
entity.SortEntities(entities)
} else {
msg, entities = b.Complete()
}
a.Equal(test.msg, msg)
if test.entities != nil {
expect := test.entities(test.msg)
a.Len(entities, len(expect))
a.ElementsMatch(expect, entities)
} else {
a.Empty(entities)
}
})
}
}
}
{
tests := []htmlTestCase{
{html: "<b>bold</b>", msg: "bold", entities: getEntities(entity.Bold())},
{html: "<strong>bold</strong>", msg: "bold", entities: getEntities(entity.Bold())},
{html: "<i>italic</i>", msg: "italic", entities: getEntities(entity.Italic())},
{html: "<em>italic</em>", msg: "italic", entities: getEntities(entity.Italic())},
{html: "<u>underline</u>", msg: "underline", entities: getEntities(entity.Underline())},
{html: "<ins>underline</ins>", msg: "underline", entities: getEntities(entity.Underline())},
{html: "<s>strikethrough</s>", msg: "strikethrough", entities: getEntities(entity.Strike())},
{html: "<strike>strikethrough</strike>", msg: "strikethrough", entities: getEntities(entity.Strike())},
{html: "<del>strikethrough</del>", msg: "strikethrough", entities: getEntities(entity.Strike())},
{html: "<code>code</code>", msg: "code", entities: getEntities(entity.Code())},
{html: "<pre>abc</pre>", msg: "abc", entities: getEntities(entity.Pre(""))},
{html: `<a href="http://www.example.com/">inline URL</a>`, msg: "inline URL",
entities: getEntities(entity.TextURL("http://www.example.com/"))},
{html: `<a href="tg://user?id=123456789">inline mention of a user</a>`, msg: "inline mention of a user",
entities: getEntities(entity.MentionName(&tg.InputUser{
UserID: 123456789,
}))},
{html: `<pre><code class="language-python">python code</code></pre>`, msg: "python code",
entities: getEntities(entity.Pre("python"))},
{html: "<b>&lt;</b>", msg: "<", entities: getEntities(entity.Bold())},
{html: `<span class="tg-spoiler">spoiler</span>`, msg: "spoiler", entities: getEntities(entity.Spoiler())},
{html: "<tg-emoji emoji-id=\"5368324170671202286\">👍</tg-emoji>", msg: "👍", entities: getEntities(entity.CustomEmoji(5368324170671202286))},
{html: "<blockquote expandable>quote</blockquote>", msg: "quote", entities: getEntities(entity.Blockquote(true))},
{html: "<blockquote>quote</blockquote>", msg: "quote", entities: getEntities(entity.Blockquote(false))},
}
t.Run("Common", runTests(tests, false))
}
{
// Inputs which are expected to be rejected. They are declared but not
// run yet, see the FIXME below.
negativeTests := []htmlTestCase{
{html: "&#57311;", wantErr: true},
{html: "&#xDFDF;", wantErr: true},
{html: "&#xDFDF", wantErr: true},
{html: "🏟 🏟&lt;<abacaba", wantErr: true},
{html: "🏟 🏟&lt;<abac aba>", wantErr: true},
{html: "🏟 🏟&lt;<abac>", wantErr: true},
{html: "🏟 🏟&lt;<i =aba>", wantErr: true},
{html: "🏟 🏟&lt;<i aba>", wantErr: true},
{html: "🏟 🏟&lt;<i aba = ", wantErr: true},
{html: "🏟 🏟&lt;<i aba = 190azAz-.,", wantErr: true},
{html: "🏟 🏟&lt;<i aba = \"&lt;&gt;&quot;>", wantErr: true},
{html: "🏟 🏟&lt;<i aba = \\'&lt;&gt;&quot;>", wantErr: true},
{html: "🏟 🏟&lt;</", wantErr: true},
{html: "🏟 🏟&lt;<b></b></", wantErr: true},
{html: "🏟 🏟&lt;<i>a</i ", wantErr: true},
{html: "🏟 🏟&lt;<i>a</em >", wantErr: true},
}
// FIXME(tdakkota): sanitize HTML
_ = negativeTests
t.Run("TDLib", runTests(tdlibHTMLTests(), true))
}
}
// TestIssue525 checks entity offsets and lengths for multi-line messages
// with several entities, for both Cyrillic and Latin text.
func TestIssue525(t *testing.T) {
// test returns a subtest which parses text with a bare htmlParser (no
// options set) and compares the completed entities with expected.
test := func(text string, expected []tg.MessageEntityClass) func(t *testing.T) {
return func(t *testing.T) {
a := require.New(t)
b := entity.Builder{}
p := htmlParser{
tokenizer: html.NewTokenizer(strings.NewReader(text)),
builder: &b,
attr: map[string]string{},
}
a.NoError(p.parse())
_, entities := b.Complete()
a.Equal(expected, entities)
}
}
t.Run("Ru", test(`Строка
<i>Строка текста курсивом</i>
Обычный текст с <a href="https://google.com">Ссылкой</a> внутри, и
ещё одна ссылка - <a href="https://go.dev">Здесь</a>.
Ещё одна строка.
`,
[]tg.MessageEntityClass{
&tg.MessageEntityItalic{
Offset: 7,
Length: 22,
},
&tg.MessageEntityTextURL{
Offset: 47,
Length: 7,
URL: "https://google.com",
},
&tg.MessageEntityTextURL{
Offset: 83,
Length: 5,
URL: "https://go.dev",
},
}),
)
t.Run("En", test(`Line
<i>Italic line of text</i>
Normal line of text with <a href="https://google.com">Link</a> inside, and
another link now - <a href="https://go.dev">Here</a>.
One more line.
`,
[]tg.MessageEntityClass{
&tg.MessageEntityItalic{
Offset: 5,
Length: 19,
},
&tg.MessageEntityTextURL{
Offset: 51,
Length: 4,
URL: "https://google.com",
},
&tg.MessageEntityTextURL{
Offset: 87,
Length: 4,
URL: "https://go.dev",
},
}),
)
}
+35
View File
@@ -0,0 +1,35 @@
package html
import "go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
// stackElem is the state kept for one open tag.
type stackElem struct {
// token marks the builder position at which the tag was opened.
token entity.Token
// tag is the tag name.
tag string
// attr is a tag-specific attribute value kept for later use
// (href for <a>, language for <code>).
attr string
// format is the formatter to apply when the tag is closed.
// It is nil if the tag produces no entity.
format entity.Formatter
}
// stack is a LIFO stack of open tags; the last element is the top.
type stack []stackElem
// push puts e on top of the stack.
func (s *stack) push(e stackElem) {
	items := *s
	*s = append(items, e)
}
// last returns the top element without removing it.
// The second result is false if the stack is empty.
func (s *stack) last() (stackElem, bool) {
	items := *s
	if n := len(items); n > 0 {
		return items[n-1], true
	}
	return stackElem{}, false
}
// pop removes the top element and returns it.
// The second result is false if the stack is empty.
func (s *stack) pop() (stackElem, bool) {
	n := len(*s)
	if n == 0 {
		return stackElem{}, false
	}
	top := (*s)[n-1]
	*s = (*s)[:n-1]
	return top, true
}
@@ -0,0 +1,413 @@
package html
import "go.mau.fi/mautrix-telegram/pkg/gotd/tg"
// tdlibHTMLTests returns the parser test cases run by the "TDLib" subtest
// of TestHTML.
//
// Cases are positional htmlTestCase literals: input HTML, expected message
// text, expected entities, wantErr and skipReason.
func tdlibHTMLTests() []htmlTestCase {
// entities returns a function which ignores the message and yields
// the given entities as is.
entities := func(e ...tg.MessageEntityClass) func(msg string) []tg.MessageEntityClass {
return func(msg string) []tg.MessageEntityClass {
return e
}
}
return []htmlTestCase{
{"", "", nil, false, ""},
{"➡️ ➡️", "➡️ ➡️", nil, false, ""},
{
"&lt;&gt;&amp;&quot;&laquo;&raquo;&#12345678;",
"<>&\"&laquo;&raquo;&#12345678;",
nil,
false,
"",
},
{
"➡️ ➡️<i>➡️ ➡️</i>",
"➡️ ➡️➡️ ➡️",
entities(&tg.MessageEntityItalic{Offset: 5, Length: 5}),
false,
"",
},
{
"➡️ ➡️<em>➡️ ➡️</em>", "➡️ ➡️➡️ ➡️",
entities(&tg.MessageEntityItalic{Offset: 5, Length: 5}),
false,
"",
},
{
"➡️ ➡️<b>➡️ ➡️</b>",
"➡️ ➡️➡️ ➡️",
entities(&tg.MessageEntityBold{Offset: 5, Length: 5}),
false,
"",
},
{
"➡️ ➡️<strong>➡️ ➡️</strong>",
"➡️ ➡️➡️ ➡️",
entities(&tg.MessageEntityBold{Offset: 5, Length: 5}),
false,
"",
},
{
"➡️ ➡️<u>➡️ ➡️</u>",
"➡️ ➡️➡️ ➡️",
entities(&tg.MessageEntityUnderline{Offset: 5, Length: 5}),
false,
"",
},
{
"➡️ ➡️<ins>➡️ ➡️</ins>",
"➡️ ➡️➡️ ➡️",
entities(&tg.MessageEntityUnderline{Offset: 5, Length: 5}),
false,
"",
},
{
"➡️ ➡️<s>➡️ ➡️</s>",
"➡️ ➡️➡️ ➡️",
entities(&tg.MessageEntityStrike{Offset: 5, Length: 5}),
false,
"",
},
{
"➡️ ➡️<strike>➡️ ➡️</strike>",
"➡️ ➡️➡️ ➡️",
entities(&tg.MessageEntityStrike{Offset: 5, Length: 5}),
false,
"",
},
{
"➡️ ➡️<del>➡️ ➡️</del>",
"➡️ ➡️➡️ ➡️",
entities(&tg.MessageEntityStrike{Offset: 5, Length: 5}),
false,
"",
},
{
"➡️ ➡️<i>➡️ ➡️</i><b>➡️ ➡️</b>",
"➡️ ➡️➡️ ➡️➡️ ➡️",
entities(
&tg.MessageEntityItalic{Offset: 5, Length: 5},
&tg.MessageEntityBold{Offset: 10, Length: 5},
),
false,
"",
},
{
"🏟 🏟<i>🏟 &lt🏟</i>",
"🏟 🏟🏟 <🏟",
entities(&tg.MessageEntityItalic{Offset: 5, Length: 6}),
false,
"",
},
{
"🏟 🏟<i>🏟 &gt;<b aba = caba>&lt🏟</b></i>",
"🏟 🏟🏟 ><🏟",
entities(
&tg.MessageEntityItalic{Offset: 5, Length: 7},
&tg.MessageEntityBold{Offset: 9, Length: 3},
),
false,
"",
},
{
"🏟 🏟&lt;<i aba = 190azAz-. >a</i>",
"🏟 🏟<a",
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
false,
"",
},
{
"🏟 🏟&lt;<i aba = 190azAz-.>a</i>",
"🏟 🏟<a",
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
false,
"",
},
{
"🏟 🏟&lt;<i aba = \"&lt;&gt;&quot;\">a</i>",
"🏟 🏟<a",
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
false,
"",
},
{
"🏟 🏟&lt;<i aba = '&lt;&gt;&quot;'>a</i>",
"🏟 🏟<a",
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
false,
"",
},
{
"🏟 🏟&lt;<i aba = '&lt;&gt;&quot;'>a</>",
"🏟 🏟<a",
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
false,
"",
},
{
"🏟 🏟&lt;<i>🏟 🏟&lt;</>",
"🏟 🏟<🏟 🏟<",
entities(&tg.MessageEntityItalic{Offset: 6, Length: 6}),
false,
"",
},
{
"🏟 🏟&lt;<i>a</ >",
"🏟 🏟<a",
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
false,
"",
},
{
"🏟 🏟&lt;<i>a</i >",
"🏟 🏟<a",
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
false,
"",
},
// Empty entity.
{
"🏟 🏟&lt;<b></b>",
"🏟 🏟<",
nil,
false,
"",
},
// Space handling.
{
"<i>\t</i>",
"\t",
entities(&tg.MessageEntityItalic{Offset: 0, Length: 1}),
false,
"",
},
{
"<i>\r</i>",
"\r",
entities(&tg.MessageEntityItalic{Offset: 0, Length: 1}),
false,
"",
},
{
"<i>\n</i>",
"\n",
entities(&tg.MessageEntityItalic{Offset: 0, Length: 1}),
false,
"",
},
{
"➡️ ➡️<span class = \"tg-spoiler\">➡️ ➡️</span><b>➡️ ➡️</b>",
"➡️ ➡️➡️ ➡️➡️ ➡️",
entities(&tg.MessageEntitySpoiler{Offset: 5, Length: 5}, &tg.MessageEntityBold{Offset: 10, Length: 5}),
false,
"",
},
{
"🏟 🏟<span class=\"tg-spoiler\">🏟 &lt🏟</span>",
"🏟 🏟🏟 <🏟",
entities(&tg.MessageEntitySpoiler{Offset: 5, Length: 6}),
false,
"",
},
{
"🏟 🏟<span class=\"tg-spoiler\">🏟 &gt;<b aba = caba>&lt🏟</b></span>",
"🏟 🏟🏟 ><🏟",
entities(&tg.MessageEntitySpoiler{Offset: 5, Length: 7}, &tg.MessageEntityBold{Offset: 9, Length: 3}),
false,
"",
},
{
"➡️ ➡️<tg-spoiler>➡️ ➡️</tg-spoiler><b>➡️ ➡️</b>",
"➡️ ➡️➡️ ➡️➡️ ➡️",
entities(&tg.MessageEntitySpoiler{Offset: 5, Length: 5}, &tg.MessageEntityBold{Offset: 10, Length: 5}),
false,
"",
},
{
"🏟 🏟<tg-spoiler>🏟 &lt🏟</tg-spoiler>",
"🏟 🏟🏟 <🏟",
entities(&tg.MessageEntitySpoiler{Offset: 5, Length: 6}),
false,
"",
},
{
"🏟 🏟<tg-spoiler>🏟 &gt;<b aba = caba>&lt🏟</b></tg-spoiler>",
"🏟 🏟🏟 ><🏟",
entities(&tg.MessageEntitySpoiler{Offset: 5, Length: 7}, &tg.MessageEntityBold{Offset: 9, Length: 3}),
false,
"",
},
{
"<a href=telegram.org>\t</a>",
"\t",
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/"}),
false,
"",
},
{
"<a href=telegram.org>\r</a>",
"\r",
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/"}),
false,
"",
},
{
"<a href=telegram.org>\n</a>",
"\n",
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/"}),
false,
"",
},
{
"<code><i><b> </b></i></code><i><b><code> </code></b></i>",
" ",
entities(
&tg.MessageEntityCode{Offset: 0, Length: 1},
&tg.MessageEntityBold{Offset: 0, Length: 1},
&tg.MessageEntityItalic{Offset: 0, Length: 1},
&tg.MessageEntityCode{Offset: 1, Length: 1},
&tg.MessageEntityBold{Offset: 1, Length: 1},
&tg.MessageEntityItalic{Offset: 1, Length: 1}),
false,
"",
},
{
"<i><b> </b> <code> </code></i>",
" ",
entities(
&tg.MessageEntityItalic{Offset: 0, Length: 3},
&tg.MessageEntityBold{Offset: 0, Length: 1},
&tg.MessageEntityCode{Offset: 2, Length: 1},
),
false,
"",
},
{
"<a href=telegram.org> </a>",
" ",
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/"}),
false,
"",
},
{
"<a href =\"telegram.org\" > </a>",
" ",
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/"}),
false,
"",
},
{
"<a href= 'telegram.org' > </a>",
" ",
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/"}),
false,
"",
},
{
"<a href= 'telegram.org?&lt;' > </a>",
" ",
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/?<"}),
false,
"",
},
// URL handling
{
"<a>telegram.org </a>",
"telegram.org ",
nil,
false,
"",
},
{
"<a>telegram.org</a>", "telegram.org",
entities(&tg.MessageEntityTextURL{
Offset: 0,
Length: 12,
URL: "http://telegram.org/",
}),
false,
"",
},
{
"<a>https://telegram.org/asdsa?asdasdwe#12e3we</a>",
"https://telegram.org/asdsa?asdasdwe#12e3we",
entities(&tg.MessageEntityTextURL{
Offset: 0,
Length: 42,
URL: "https://telegram.org/asdsa?asdasdwe#12e3we",
}),
false,
"",
},
// <pre> and <code> handling
{
"🏟 🏟&lt;<pre >🏟 🏟&lt;</>",
"🏟 🏟<🏟 🏟<",
entities(&tg.MessageEntityPre{Offset: 6, Length: 6}),
false,
"",
},
{
"🏟 🏟&lt;<code >🏟 🏟&lt;</>",
"🏟 🏟<🏟 🏟<",
entities(&tg.MessageEntityCode{Offset: 6, Length: 6}),
false,
"",
},
{
"🏟 🏟&lt;<pre><code>🏟 🏟&lt;</code></>",
"🏟 🏟<🏟 🏟<",
entities(
&tg.MessageEntityPre{Offset: 6, Length: 6},
&tg.MessageEntityCode{Offset: 6, Length: 6},
),
false,
"",
},
{
"🏟 🏟&lt;<pre><code class=\"language-\">🏟 🏟&lt;</code></>",
"🏟 🏟<🏟 🏟<",
entities(
&tg.MessageEntityPre{Offset: 6, Length: 6},
&tg.MessageEntityCode{Offset: 6, Length: 6},
),
false,
"",
},
{
"🏟 🏟&lt;<pre><code class=\"language-fift\">🏟 🏟&lt;</></>",
"🏟 🏟<🏟 🏟<",
entities(&tg.MessageEntityPre{Offset: 6, Length: 6, Language: "fift"}),
false,
"",
},
{
"🏟 🏟&lt;<code class=\"language-fift\"><pre>🏟 🏟&lt;</></>",
"🏟 🏟<🏟 🏟<",
entities(&tg.MessageEntityPre{Offset: 6, Length: 6, Language: "fift"}),
false,
"",
},
{
"🏟 🏟&lt;<pre><code class=\"language-fift\">🏟 🏟&lt;</> </>",
"🏟 🏟<🏟 🏟< ",
entities(
&tg.MessageEntityPre{Offset: 6, Length: 7},
&tg.MessageEntityCode{Offset: 6, Length: 6},
),
false,
"",
},
{
"🏟 🏟&lt;<pre> <code class=\"language-fift\">🏟 🏟&lt;</></>",
"🏟 🏟< 🏟 🏟<",
entities(
&tg.MessageEntityPre{Offset: 6, Length: 7},
&tg.MessageEntityCode{Offset: 7, Length: 6},
),
false,
"",
},
}
}
@@ -0,0 +1,89 @@
package html
import (
"net"
"net/url"
"strconv"
"strings"
"unicode/utf8"
"github.com/go-faster/errors"
"go.mau.fi/mautrix-telegram/pkg/gotd/ascii"
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
)
// isIPv6 reports whether str is a textual IP address containing a colon,
// i.e. an address written in IPv6 notation.
func isIPv6(str string) bool {
	if !strings.Contains(str, ":") {
		return false
	}
	return net.ParseIP(str) != nil
}
// validateHostname checks u.Host and returns an error if it is not accepted
// as a link host.
//
// A host which is not an IPv6 address may contain only Latin letters, digits,
// characters from '&' to '.', any of "_!$~;=" and runes above utf8.RuneSelf.
func validateHostname(u *url.URL) error {
	// TODO(tdakkota): make sure that it is correct
	host := u.Host

	// NOTE(review): IPv6 addresses without a dot are rejected and those with
	// a dot are accepted; this looks like it may be inverted — confirm
	// against the intended rules before changing.
	if isIPv6(host) {
		if !strings.ContainsRune(host, '.') {
			return errors.New("wrong HTTP URL")
		}
		return nil
	}

	for _, c := range host {
		switch {
		case ascii.IsLatinLetter(c), ascii.IsDigit(c):
		case '&' <= c && c <= '.':
		case c == '_', c == '!', c == '$', c == '~', c == ';', c == '=':
		case c > utf8.RuneSelf:
		default:
			return errors.Errorf("disallowed character %c in URL host", c)
		}
	}
	return nil
}
// getURLFormatter builds a formatter for a link with the given URL.
//
// A "tg://user?id=N" URL yields a mention of the user returned by resolver
// for ID N. Any other URL yields a text URL formatter; a URL without a scheme
// is treated as a bare host and normalized to "http://host/".
//
// An error is returned if rawURL is empty or can't be parsed, if the user ID
// is not an integer, if resolver fails, or if the host is not valid.
func getURLFormatter(rawURL string, resolver entity.UserResolver) (entity.Formatter, error) {
	const defaultProtocol = "http"

	if rawURL == "" {
		return nil, errors.New("empty URL")
	}

	// FIXME(tdakkota): move normalization to deeplink package when it's done?
	u, err := url.Parse(rawURL)
	if err != nil {
		return nil, err
	}

	if u.Scheme == "tg" && u.Host == "user" {
		rawID := u.Query().Get("id")
		id, err := strconv.ParseInt(rawID, 10, 64)
		if err != nil {
			// Report the offending text: id is zero when parsing fails, and
			// %q applied to an integer would print a character literal.
			return nil, errors.Wrapf(err, "invalid user ID %q", rawID)
		}

		user, err := resolver(id)
		if err != nil {
			return nil, errors.Wrapf(err, "can't resolve user %d", id)
		}

		return entity.MentionName(user), nil
	}

	if u.Scheme == "" {
		// Without a scheme the whole input was parsed as a path;
		// treat it as the host of an HTTP URL.
		u.Scheme = defaultProtocol
		u.Host = u.Path
		u.Path = "/"
		rawURL = u.String()
	}

	if err := validateHostname(u); err != nil {
		return nil, err
	}
	return entity.TextURL(rawURL), nil
}
+129
View File
@@ -0,0 +1,129 @@
package html
import "unicode/utf8"
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
// Precondition: b[src] == '&' && dst <= src.
//
// Numeric references ("&#123;", "&#x1F3f3;") and the named entities lt, gt,
// amp and quot are decoded; the trailing semicolon is optional. A numeric
// reference which is zero or not below 0x10ffff is not decoded: only its
// leading '&' is copied. Any other named reference is copied unchanged.
//
// This is adaption of html.UnescapeString from Go sources.
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
	// Numeric references at or above this value are left undecoded.
	const limit = 0x10ffff

	// i starts at 1 because we already know that s[0] == '&'.
	i, s := 1, b[src:]

	if len(s) <= 1 {
		b[dst] = b[src]
		return dst + 1, src + 1
	}

	if s[i] == '#' {
		if len(s) <= 3 { // We need to have at least "&#.".
			b[dst] = b[src]
			return dst + 1, src + 1
		}
		i++
		c := s[i]
		base := rune(10)
		hex := false
		if c == 'x' || c == 'X' {
			hex = true
			base = 16
			i++
		}

		x := '\x00'
		for i < len(s) {
			c = s[i]
			i++
			digit := rune(-1)
			switch {
			case '0' <= c && c <= '9':
				digit = rune(c) - '0'
			case hex && 'a' <= c && c <= 'f':
				digit = rune(c) - 'a' + 10
			case hex && 'A' <= c && c <= 'F':
				digit = rune(c) - 'A' + 10
			}
			if digit >= 0 {
				x = base*x + digit
				// Saturate instead of overflowing: a long digit run must not
				// wrap around into the range of valid code points.
				if x > limit {
					x = limit
				}
				continue
			}
			if c != ';' {
				i--
			}
			break
		}

		if i <= 3 { // No characters matched.
			b[dst] = b[src]
			return dst + 1, src + 1
		}

		if x == 0 || x >= limit {
			b[dst] = b[src]
			return dst + 1, src + 1
		}

		return dst + utf8.EncodeRune(b[dst:], x), src + i
	}

	// Consume the maximum number of characters possible, with the
	// consumed characters matching one of the named references.
	for i < len(s) {
		c := s[i]
		i++
		// Lower-cased characters are more common in entities, so we check for them first.
		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
			continue
		}
		if c != ';' {
			i--
		}
		break
	}

	var x rune
	// Exclude the optional trailing semicolon from the entity name.
	tagEnd := i
	if i > 0 && s[tagEnd-1] == ';' {
		tagEnd--
	}
	switch string(s[1:tagEnd]) {
	case "lt":
		x = '<'
	case "gt":
		x = '>'
	case "amp":
		x = '&'
	case "quot":
		x = '"'
	}
	if x != 0 {
		return dst + utf8.EncodeRune(b[dst:], x), src + i
	}

	// Unknown entity: copy it unchanged.
	dst1, src1 = dst+i, src+i
	copy(b[dst:dst1], b[src:src1])
	return dst1, src1
}
// telegramUnescape implements Telegram BotAPI HTML unescape.
//
// Entities are decoded in place: the result shares its backing array with b
// and b itself is returned if it contains no '&'.
func telegramUnescape(b []byte) []byte {
	// Find the first entity candidate; everything before it stays as is.
	first := 0
	for first < len(b) && b[first] != '&' {
		first++
	}
	if first == len(b) {
		return b
	}

	dst, src := first, first
	for src < len(b) {
		if b[src] == '&' {
			dst, src = unescapeEntity(b, dst, src)
			continue
		}
		b[dst] = b[src]
		dst++
		src++
	}
	return b[:dst]
}
@@ -0,0 +1,35 @@
package html
import (
"testing"
"github.com/stretchr/testify/require"
)
// Test_telegramUnescape checks decoding of numeric and named entities,
// with and without the trailing semicolon, and inputs that are not entities.
func Test_telegramUnescape(t *testing.T) {
tests := []struct {
name string
b string
want string
}{
{"NoEscapeCode", "&", "&"},
{"NoEscapeCode", "&#", "&#"},
{"UnicodeFlag", "&#127987", string(rune(127987))},
{"UnicodeFlag", "&#127987;", string(rune(127987))},
{"UnicodeFlagHex", "&#x1F3f3", string(rune(0x1f3f3))},
{"UnicodeFlagHex", "&#x1F3f3;", string(rune(0x1f3f3))},
{"lt", "&lt;", "<"},
{"lt", "&lt", "<"},
{"gt", "&gt;", ">"},
{"gt", "&gt", ">"},
{"amp", "&amp;", "&"},
{"amp", "&amp", "&"},
{"quot", "&quot;", `"`},
{"quot", "&quot", `"`},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
require.Equal(t, []byte(tt.want), telegramUnescape([]byte(tt.b)))
})
}
}