move gotd fork into repo. (#111)
- update to latest telegram layer - remove some references to fields in tg.Entities that don't exist in the schema - originally added here: https://github.com/beeper/td/commit/820929062a2ba0104397bc01235ab58a9cff780e - referenced here - https://github.com/mautrix/telegramgo/commit/124f0967ed195b5a380c9bd02e170ada9710dde3 - https://github.com/mautrix/telegramgo/commit/4205047aab2e0639217148b5d125bfaab668bd8e
This commit is contained in:
@@ -0,0 +1,50 @@
|
||||
// Package html contains HTML styling options.
|
||||
package html
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/styling"
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/tg"
|
||||
)
|
||||
|
||||
// Bytes reads HTML from given byte slice and returns styling option
|
||||
// to build styled text block.
|
||||
func Bytes(resolver func(id int64) (tg.InputUserClass, error), b []byte) styling.StyledTextOption {
|
||||
return Reader(resolver, bytes.NewReader(b))
|
||||
}
|
||||
|
||||
// String reads HTML from given string and returns styling option
|
||||
// to build styled text block.
|
||||
func String(resolver func(id int64) (tg.InputUserClass, error), s string) styling.StyledTextOption {
|
||||
return Reader(resolver, strings.NewReader(s))
|
||||
}
|
||||
|
||||
// Format formats string using fmt, parses HTML from formatted string and returns styling option
|
||||
// to build styled text block.
|
||||
func Format(resolver func(id int64) (tg.InputUserClass, error), format string, args ...interface{}) styling.StyledTextOption {
|
||||
return styling.Custom(func(eb *entity.Builder) error {
|
||||
var buf bytes.Buffer
|
||||
_, err := fmt.Fprintf(&buf, format, args...)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return HTML(&buf, eb, Options{
|
||||
UserResolver: resolver,
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// Reader reads HTML from given reader and returns styling option
|
||||
// to build styled text block.
|
||||
func Reader(resolver func(id int64) (tg.InputUserClass, error), r io.Reader) styling.StyledTextOption {
|
||||
return styling.Custom(func(eb *entity.Builder) error {
|
||||
return HTML(r, eb, Options{
|
||||
UserResolver: resolver,
|
||||
})
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
package html_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/signal"
|
||||
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram"
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message"
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/html"
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/tg"
|
||||
)
|
||||
|
||||
func sendHTML(ctx context.Context) error {
|
||||
client, err := telegram.ClientFromEnvironment(telegram.Options{})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// This example creates a styled message from BotAPI examples
|
||||
// and sends to your Saved Messages folder.
|
||||
// See https://core.telegram.org/bots/api#html-style.
|
||||
return client.Run(ctx, func(ctx context.Context) error {
|
||||
_, err := message.NewSender(tg.NewClient(client)).
|
||||
Self().StyledText(ctx, html.String(nil, `<b>bold</b>, <strong>bold</strong>
|
||||
<i>italic</i>, <em>italic</em>
|
||||
<u>underline</u>, <ins>underline</ins>
|
||||
<s>strikethrough</s>, <strike>strikethrough</strike>, <del>strikethrough</del>
|
||||
<span class="tg-spoiler">spoiler</span>, <tg-spoiler>spoiler</tg-spoiler>
|
||||
<b>bold <i>italic bold <s>italic bold strikethrough <span class="tg-spoiler">italic bold strikethrough spoiler</span></s> <u>underline italic bold</u></i> bold</b>
|
||||
<a href="http://www.example.com/">inline URL</a>
|
||||
<a href="tg://user?id=123456789">inline mention of a user</a>
|
||||
<tg-emoji emoji-id="5368324170671202286">👍</tg-emoji>
|
||||
<code>inline fixed-width code</code>
|
||||
<pre>pre-formatted fixed-width code block</pre>
|
||||
<pre><code class="language-python">pre-formatted fixed-width code block written in the Python programming language</code></pre>
|
||||
<blockquote>Block quotation started\nBlock quotation continued\nThe last line of the block quotation</blockquote>
|
||||
<blockquote expandable>Expandable block quotation started\nExpandable block quotation continued\nExpandable block quotation continued\nHidden by default part of the block quotation started\nExpandable block quotation continued\nThe last line of the block quotation</blockquote>`))
|
||||
return err
|
||||
})
|
||||
}
|
||||
|
||||
func ExampleString() {
|
||||
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
|
||||
defer cancel()
|
||||
|
||||
if err := sendHTML(ctx); err != nil {
|
||||
_, _ = fmt.Fprintf(os.Stderr, "%+v\n", err)
|
||||
os.Exit(2)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
package html
|
||||
|
||||
import (
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/tg"
|
||||
)
|
||||
|
||||
// Options is options of HTML.
|
||||
type Options struct {
|
||||
// UserResolver is used to resolve user by ID during formatting. May be nil.
|
||||
//
|
||||
// If userResolver is nil, formatter will create tg.InputUser using only ID.
|
||||
// Notice that it's okay for bots, but not for users.
|
||||
UserResolver entity.UserResolver
|
||||
// DisableTelegramEscape disable Telegram BotAPI escaping and uses default
|
||||
// golang.org/x/net/html escape.
|
||||
DisableTelegramEscape bool
|
||||
}
|
||||
|
||||
func (o *Options) setDefaults() {
|
||||
if o.UserResolver == nil {
|
||||
o.UserResolver = func(id int64) (tg.InputUserClass, error) {
|
||||
return &tg.InputUser{
|
||||
UserID: id,
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,282 @@
|
||||
package html
|
||||
|
||||
import (
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/go-faster/errors"
|
||||
"golang.org/x/net/html"
|
||||
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/tg"
|
||||
)
|
||||
|
||||
type htmlParser struct {
|
||||
tokenizer *html.Tokenizer
|
||||
builder *entity.Builder
|
||||
stack stack
|
||||
attr map[string]string
|
||||
opts Options
|
||||
}
|
||||
|
||||
func (p *htmlParser) fillAttrs() {
|
||||
// Clear old attrs.
|
||||
for k := range p.attr {
|
||||
delete(p.attr, k)
|
||||
}
|
||||
|
||||
// Fill with new attributes.
|
||||
for {
|
||||
key, value, ok := p.tokenizer.TagAttr()
|
||||
p.attr[string(key)] = string(value)
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Names of supported tags that are longer than one letter.
const (
	// Code blocks.
	pre  = "pre"
	code = "code"

	// Text style aliases.
	em     = "em"
	ins    = "ins"
	strike = "strike"
	del    = "del"
	strong = "strong"

	// Spoilers, custom emoji and quotes.
	span       = "span"
	tgSpoiler  = "tg-spoiler"
	tgEmoji    = "tg-emoji"
	blockquote = "blockquote"
)
|
||||
|
||||
func (p *htmlParser) tag(tn []byte) string {
|
||||
// Here we intern some well-known tags.
|
||||
switch string(tn) {
|
||||
case "b":
|
||||
return "b"
|
||||
case strong:
|
||||
return strong
|
||||
case "i":
|
||||
return "i"
|
||||
case em:
|
||||
return em
|
||||
case "u":
|
||||
return "u"
|
||||
case ins:
|
||||
return ins
|
||||
case "s":
|
||||
return "s"
|
||||
case strike:
|
||||
return strike
|
||||
case del:
|
||||
return del
|
||||
case "a":
|
||||
return "a"
|
||||
case pre:
|
||||
return pre
|
||||
case code:
|
||||
return code
|
||||
case span:
|
||||
return span
|
||||
case tgSpoiler:
|
||||
return tgSpoiler
|
||||
case tgEmoji:
|
||||
return tgEmoji
|
||||
case blockquote:
|
||||
return blockquote
|
||||
default:
|
||||
return string(tn)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *htmlParser) startTag() error {
|
||||
var e stackElem
|
||||
tn, hasAttr := p.tokenizer.TagName()
|
||||
e.tag = p.tag(tn)
|
||||
if hasAttr {
|
||||
p.fillAttrs()
|
||||
}
|
||||
|
||||
e.token = p.builder.Token()
|
||||
// See https://core.telegram.org/bots/api#html-style.
|
||||
switch e.tag {
|
||||
case "b", strong:
|
||||
e.format = entity.Bold()
|
||||
case "i", em:
|
||||
e.format = entity.Italic()
|
||||
case "u", ins:
|
||||
e.format = entity.Underline()
|
||||
case "s", strike, del:
|
||||
e.format = entity.Strike()
|
||||
case "a":
|
||||
e.attr = p.attr["href"]
|
||||
if e.attr == "" {
|
||||
break
|
||||
}
|
||||
|
||||
f, err := getURLFormatter(e.attr, p.opts.UserResolver)
|
||||
if err != nil {
|
||||
f = nil
|
||||
}
|
||||
e.format = f
|
||||
case code:
|
||||
const langPrefix = "language-"
|
||||
|
||||
e.format = entity.Code()
|
||||
e.attr = strings.TrimPrefix(p.attr["class"], langPrefix)
|
||||
if len(p.stack) < 1 {
|
||||
break
|
||||
}
|
||||
|
||||
// BotAPI docs says:
|
||||
// > Use nested <pre> and <code> tags, to define programming language for <pre> entity.
|
||||
last := &p.stack[len(p.stack)-1]
|
||||
if last.tag != pre {
|
||||
break
|
||||
}
|
||||
|
||||
if lang := e.attr; lang != "" {
|
||||
// Set language parameter.
|
||||
last.format = entity.Pre(lang)
|
||||
}
|
||||
case pre:
|
||||
e.format = entity.Pre("")
|
||||
if len(p.stack) < 1 {
|
||||
break
|
||||
}
|
||||
|
||||
last := &p.stack[len(p.stack)-1]
|
||||
if last.tag != code {
|
||||
break
|
||||
}
|
||||
|
||||
if lang := last.attr; lang != "" {
|
||||
// Set language parameter.
|
||||
e.format = entity.Pre(lang)
|
||||
}
|
||||
case span:
|
||||
if p.attr["class"] == "tg-spoiler" {
|
||||
e.format = entity.Spoiler()
|
||||
}
|
||||
case tgSpoiler:
|
||||
e.format = entity.Spoiler()
|
||||
case tgEmoji:
|
||||
if id, err := strconv.ParseInt(p.attr["emoji-id"], 10, 64); err == nil {
|
||||
e.format = entity.CustomEmoji(id)
|
||||
}
|
||||
case blockquote:
|
||||
_, collapsed := p.attr["expandable"]
|
||||
e.format = entity.Blockquote(collapsed)
|
||||
}
|
||||
|
||||
p.stack.push(e)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *htmlParser) endTag(checkName bool) error {
|
||||
tn, _ := p.tokenizer.TagName()
|
||||
|
||||
s, ok := p.stack.pop()
|
||||
switch {
|
||||
case !ok:
|
||||
return errors.Errorf("unexpected end tag %q", tn)
|
||||
case checkName && s.tag != string(tn):
|
||||
return errors.Errorf("expected tag %q, got %q", s.tag, tn)
|
||||
}
|
||||
|
||||
// Compute UTF-16 length of entity.
|
||||
length := s.token.UTF16Length(p.builder)
|
||||
|
||||
switch s.tag {
|
||||
case "a":
|
||||
// TDLib tries to parse link from <a> body, so we should too.
|
||||
if s.attr == "" {
|
||||
msg := s.token.Text(p.builder)
|
||||
if f, err := getURLFormatter(msg, p.opts.UserResolver); err == nil {
|
||||
s.format = f
|
||||
}
|
||||
}
|
||||
case "code":
|
||||
l, ok := p.builder.LastEntity()
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
last, ok := l.(*tg.MessageEntityPre)
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
// Do not add Code entity, if last entity is Pre with same offset.
|
||||
if last.GetOffset() == s.token.UTF16Offset() && last.GetLength() == length {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
// Do not add empty entities.
|
||||
if length == 0 || s.format == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
s.token.Apply(p.builder, s.format)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *htmlParser) parse() error {
|
||||
for {
|
||||
tt := p.tokenizer.Next()
|
||||
switch tt {
|
||||
case html.ErrorToken:
|
||||
if err := p.tokenizer.Err(); !errors.Is(err, io.EOF) {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
case html.TextToken:
|
||||
var text []byte
|
||||
if p.opts.DisableTelegramEscape {
|
||||
text = p.tokenizer.Text()
|
||||
} else {
|
||||
text = telegramUnescape(p.tokenizer.Raw())
|
||||
}
|
||||
_, _ = p.builder.Write(text)
|
||||
case html.StartTagToken:
|
||||
if err := p.startTag(); err != nil {
|
||||
return err
|
||||
}
|
||||
case html.EndTagToken:
|
||||
if err := p.endTag(true); err != nil {
|
||||
return err
|
||||
}
|
||||
case html.CommentToken:
|
||||
// html.Tokenizer returns comment token for empty closing tags.
|
||||
raw := p.tokenizer.Raw()
|
||||
if len(raw) >= 3 && string(raw[:2]) == "</" && raw[len(raw)-1] == '>' {
|
||||
if err := p.endTag(false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// HTML parses given input from reader and adds parsed entities to given builder.
|
||||
// Notice that this parser ignores unsupported tags.
|
||||
//
|
||||
// Parameter userResolver is used to resolve user by ID during formatting. May be nil.
|
||||
// If userResolver is nil, formatter will create tg.InputUser using only ID.
|
||||
// Notice that it's okay for bots, but not for users.
|
||||
//
|
||||
// See https://core.telegram.org/bots/api#html-style.
|
||||
func HTML(r io.Reader, b *entity.Builder, opts Options) error {
|
||||
opts.setDefaults()
|
||||
p := htmlParser{
|
||||
tokenizer: html.NewTokenizer(r),
|
||||
builder: b,
|
||||
attr: map[string]string{},
|
||||
opts: opts,
|
||||
}
|
||||
|
||||
if err := p.parse(); err != nil {
|
||||
return errors.Wrap(err, "parse")
|
||||
}
|
||||
b.ShrinkPreCode()
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,34 @@
|
||||
package html
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
|
||||
)
|
||||
|
||||
func BenchmarkHTML(b *testing.B) {
|
||||
input := `<b>bold</b>, <strong>bold</strong>
|
||||
<i>italic</i>, <em>italic</em>
|
||||
<u>underline</u>, <ins>underline</ins>
|
||||
<s>strikethrough</s>, <strike>strikethrough</strike>, <del>strikethrough</del>
|
||||
<b>bold <i>italic bold <s>italic bold strikethrough</s> <u>underline italic bold</u></i> bold</b>
|
||||
<a href="http://www.example.com/">inline URL</a>
|
||||
<a href="tg://user?id=123456789">inline mention of a user</a>
|
||||
<code>inline fixed-width code</code>
|
||||
<pre>pre-formatted fixed-width code block</pre>
|
||||
<pre><code class="language-python">pre-formatted fixed-width code block written in the Python programming language</code></pre>`
|
||||
reader := strings.NewReader(input)
|
||||
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
reader.Reset(input)
|
||||
builder := entity.Builder{}
|
||||
|
||||
if err := HTML(reader, &builder, Options{}); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,211 @@
|
||||
package html
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/net/html"
|
||||
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/tg"
|
||||
)
|
||||
|
||||
type htmlTestCase struct {
|
||||
html string
|
||||
msg string
|
||||
entities func(msg string) []tg.MessageEntityClass
|
||||
wantErr bool
|
||||
skipReason string
|
||||
}
|
||||
|
||||
func getEntities(formats ...entity.Formatter) func(msg string) []tg.MessageEntityClass {
|
||||
return func(msg string) []tg.MessageEntityClass {
|
||||
length := entity.ComputeLength(msg)
|
||||
r := make([]tg.MessageEntityClass, len(formats))
|
||||
for i := range formats {
|
||||
r[i] = formats[i](0, length)
|
||||
}
|
||||
return r
|
||||
}
|
||||
}
|
||||
|
||||
func TestHTML(t *testing.T) {
|
||||
runTests := func(tests []htmlTestCase, numericName bool) func(t *testing.T) {
|
||||
return func(t *testing.T) {
|
||||
for i, test := range tests {
|
||||
testName := test.msg
|
||||
if numericName || testName == "" {
|
||||
testName = fmt.Sprintf("Test%d", i+1)
|
||||
}
|
||||
t.Run(strings.Title(testName), func(t *testing.T) {
|
||||
t.Cleanup(func() {
|
||||
if t.Failed() {
|
||||
t.Logf("Input: %q", test.html)
|
||||
}
|
||||
})
|
||||
if test.skipReason != "" {
|
||||
t.Skip(test.skipReason)
|
||||
}
|
||||
a := require.New(t)
|
||||
b := entity.Builder{}
|
||||
|
||||
err := HTML(strings.NewReader(test.html), &b, Options{})
|
||||
if test.wantErr {
|
||||
a.Error(err)
|
||||
return
|
||||
}
|
||||
a.NoError(err)
|
||||
|
||||
var (
|
||||
msg string
|
||||
entities []tg.MessageEntityClass
|
||||
)
|
||||
if strings.TrimSpace(test.msg) != test.msg {
|
||||
// Complete cuts spaces and fixes entities, but TDLib test expects
|
||||
// that it happens after parsing.
|
||||
msg, entities = b.Raw()
|
||||
entity.SortEntities(entities)
|
||||
} else {
|
||||
msg, entities = b.Complete()
|
||||
}
|
||||
|
||||
a.Equal(test.msg, msg)
|
||||
if test.entities != nil {
|
||||
expect := test.entities(test.msg)
|
||||
a.Len(entities, len(expect))
|
||||
a.ElementsMatch(expect, entities)
|
||||
} else {
|
||||
a.Empty(entities)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
tests := []htmlTestCase{
|
||||
{html: "<b>bold</b>", msg: "bold", entities: getEntities(entity.Bold())},
|
||||
{html: "<strong>bold</strong>", msg: "bold", entities: getEntities(entity.Bold())},
|
||||
{html: "<i>italic</i>", msg: "italic", entities: getEntities(entity.Italic())},
|
||||
{html: "<em>italic</em>", msg: "italic", entities: getEntities(entity.Italic())},
|
||||
{html: "<u>underline</u>", msg: "underline", entities: getEntities(entity.Underline())},
|
||||
{html: "<ins>underline</ins>", msg: "underline", entities: getEntities(entity.Underline())},
|
||||
{html: "<s>strikethrough</s>", msg: "strikethrough", entities: getEntities(entity.Strike())},
|
||||
{html: "<strike>strikethrough</strike>", msg: "strikethrough", entities: getEntities(entity.Strike())},
|
||||
{html: "<del>strikethrough</del>", msg: "strikethrough", entities: getEntities(entity.Strike())},
|
||||
{html: "<code>code</code>", msg: "code", entities: getEntities(entity.Code())},
|
||||
{html: "<pre>abc</pre>", msg: "abc", entities: getEntities(entity.Pre(""))},
|
||||
{html: `<a href="http://www.example.com/">inline URL</a>`, msg: "inline URL",
|
||||
entities: getEntities(entity.TextURL("http://www.example.com/"))},
|
||||
{html: `<a href="tg://user?id=123456789">inline mention of a user</a>`, msg: "inline mention of a user",
|
||||
entities: getEntities(entity.MentionName(&tg.InputUser{
|
||||
UserID: 123456789,
|
||||
}))},
|
||||
{html: `<pre><code class="language-python">python code</code></pre>`, msg: "python code",
|
||||
entities: getEntities(entity.Pre("python"))},
|
||||
{html: "<b><</b>", msg: "<", entities: getEntities(entity.Bold())},
|
||||
{html: `<span class="tg-spoiler">spoiler</span>`, msg: "spoiler", entities: getEntities(entity.Spoiler())},
|
||||
{html: "<tg-emoji emoji-id=\"5368324170671202286\">👍</tg-emoji>", msg: "👍", entities: getEntities(entity.CustomEmoji(5368324170671202286))},
|
||||
{html: "<blockquote expandable>quote</blockquote>", msg: "quote", entities: getEntities(entity.Blockquote(true))},
|
||||
{html: "<blockquote>quote</blockquote>", msg: "quote", entities: getEntities(entity.Blockquote(false))},
|
||||
}
|
||||
t.Run("Common", runTests(tests, false))
|
||||
}
|
||||
|
||||
{
|
||||
negativeTests := []htmlTestCase{
|
||||
{html: "�", wantErr: true},
|
||||
{html: "�", wantErr: true},
|
||||
{html: "�", wantErr: true},
|
||||
{html: "🏟 🏟<<abacaba", wantErr: true},
|
||||
{html: "🏟 🏟<<abac aba>", wantErr: true},
|
||||
{html: "🏟 🏟<<abac>", wantErr: true},
|
||||
{html: "🏟 🏟<<i =aba>", wantErr: true},
|
||||
{html: "🏟 🏟<<i aba>", wantErr: true},
|
||||
{html: "🏟 🏟<<i aba = ", wantErr: true},
|
||||
{html: "🏟 🏟<<i aba = 190azAz-.,", wantErr: true},
|
||||
{html: "🏟 🏟<<i aba = \"<>">", wantErr: true},
|
||||
{html: "🏟 🏟<<i aba = \\'<>">", wantErr: true},
|
||||
{html: "🏟 🏟<</", wantErr: true},
|
||||
{html: "🏟 🏟<<b></b></", wantErr: true},
|
||||
{html: "🏟 🏟<<i>a</i ", wantErr: true},
|
||||
{html: "🏟 🏟<<i>a</em >", wantErr: true},
|
||||
}
|
||||
// FIXME(tdakkota): sanitize HTML
|
||||
_ = negativeTests
|
||||
|
||||
t.Run("TDLib", runTests(tdlibHTMLTests(), true))
|
||||
}
|
||||
}
|
||||
|
||||
func TestIssue525(t *testing.T) {
|
||||
test := func(text string, expected []tg.MessageEntityClass) func(t *testing.T) {
|
||||
return func(t *testing.T) {
|
||||
a := require.New(t)
|
||||
|
||||
b := entity.Builder{}
|
||||
p := htmlParser{
|
||||
tokenizer: html.NewTokenizer(strings.NewReader(text)),
|
||||
builder: &b,
|
||||
attr: map[string]string{},
|
||||
}
|
||||
|
||||
a.NoError(p.parse())
|
||||
_, entities := b.Complete()
|
||||
a.Equal(expected, entities)
|
||||
}
|
||||
}
|
||||
|
||||
t.Run("Ru", test(`Строка
|
||||
<i>Строка текста курсивом</i>
|
||||
|
||||
Обычный текст с <a href="https://google.com">Ссылкой</a> внутри, и
|
||||
ещё одна ссылка - <a href="https://go.dev">Здесь</a>.
|
||||
|
||||
Ещё одна строка.
|
||||
`,
|
||||
[]tg.MessageEntityClass{
|
||||
&tg.MessageEntityItalic{
|
||||
Offset: 7,
|
||||
Length: 22,
|
||||
},
|
||||
&tg.MessageEntityTextURL{
|
||||
Offset: 47,
|
||||
Length: 7,
|
||||
URL: "https://google.com",
|
||||
},
|
||||
&tg.MessageEntityTextURL{
|
||||
Offset: 83,
|
||||
Length: 5,
|
||||
URL: "https://go.dev",
|
||||
},
|
||||
}),
|
||||
)
|
||||
t.Run("En", test(`Line
|
||||
<i>Italic line of text</i>
|
||||
|
||||
Normal line of text with <a href="https://google.com">Link</a> inside, and
|
||||
another link now - <a href="https://go.dev">Here</a>.
|
||||
|
||||
One more line.
|
||||
`,
|
||||
[]tg.MessageEntityClass{
|
||||
&tg.MessageEntityItalic{
|
||||
Offset: 5,
|
||||
Length: 19,
|
||||
},
|
||||
&tg.MessageEntityTextURL{
|
||||
Offset: 51,
|
||||
Length: 4,
|
||||
URL: "https://google.com",
|
||||
},
|
||||
&tg.MessageEntityTextURL{
|
||||
Offset: 87,
|
||||
Length: 4,
|
||||
URL: "https://go.dev",
|
||||
},
|
||||
}),
|
||||
)
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
package html
|
||||
|
||||
import "go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
|
||||
|
||||
type stackElem struct {
|
||||
token entity.Token
|
||||
tag string
|
||||
attr string
|
||||
format entity.Formatter
|
||||
}
|
||||
|
||||
type stack []stackElem
|
||||
|
||||
func (s *stack) push(e stackElem) {
|
||||
*s = append(*s, e)
|
||||
}
|
||||
|
||||
func (s *stack) last() (stackElem, bool) {
|
||||
l := len(*s)
|
||||
if l == 0 {
|
||||
return stackElem{}, false
|
||||
}
|
||||
|
||||
elem := (*s)[l-1]
|
||||
return elem, true
|
||||
}
|
||||
|
||||
func (s *stack) pop() (stackElem, bool) {
|
||||
e, ok := s.last()
|
||||
if !ok {
|
||||
return stackElem{}, false
|
||||
}
|
||||
*s = (*s)[:len(*s)-1]
|
||||
return e, true
|
||||
}
|
||||
@@ -0,0 +1,413 @@
|
||||
package html
|
||||
|
||||
import "go.mau.fi/mautrix-telegram/pkg/gotd/tg"
|
||||
|
||||
func tdlibHTMLTests() []htmlTestCase {
|
||||
entities := func(e ...tg.MessageEntityClass) func(msg string) []tg.MessageEntityClass {
|
||||
return func(msg string) []tg.MessageEntityClass {
|
||||
return e
|
||||
}
|
||||
}
|
||||
return []htmlTestCase{
|
||||
{"", "", nil, false, ""},
|
||||
{"➡️ ➡️", "➡️ ➡️", nil, false, ""},
|
||||
{
|
||||
"<>&"«»�",
|
||||
"<>&\"«»�",
|
||||
nil,
|
||||
false,
|
||||
"",
|
||||
},
|
||||
|
||||
{
|
||||
"➡️ ➡️<i>➡️ ➡️</i>",
|
||||
"➡️ ➡️➡️ ➡️",
|
||||
entities(&tg.MessageEntityItalic{Offset: 5, Length: 5}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"➡️ ➡️<em>➡️ ➡️</em>", "➡️ ➡️➡️ ➡️",
|
||||
entities(&tg.MessageEntityItalic{Offset: 5, Length: 5}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"➡️ ➡️<b>➡️ ➡️</b>",
|
||||
"➡️ ➡️➡️ ➡️",
|
||||
entities(&tg.MessageEntityBold{Offset: 5, Length: 5}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"➡️ ➡️<strong>➡️ ➡️</strong>",
|
||||
"➡️ ➡️➡️ ➡️",
|
||||
entities(&tg.MessageEntityBold{Offset: 5, Length: 5}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"➡️ ➡️<u>➡️ ➡️</u>",
|
||||
"➡️ ➡️➡️ ➡️",
|
||||
entities(&tg.MessageEntityUnderline{Offset: 5, Length: 5}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"➡️ ➡️<ins>➡️ ➡️</ins>",
|
||||
"➡️ ➡️➡️ ➡️",
|
||||
entities(&tg.MessageEntityUnderline{Offset: 5, Length: 5}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"➡️ ➡️<s>➡️ ➡️</s>",
|
||||
"➡️ ➡️➡️ ➡️",
|
||||
entities(&tg.MessageEntityStrike{Offset: 5, Length: 5}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"➡️ ➡️<strike>➡️ ➡️</strike>",
|
||||
"➡️ ➡️➡️ ➡️",
|
||||
entities(&tg.MessageEntityStrike{Offset: 5, Length: 5}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"➡️ ➡️<del>➡️ ➡️</del>",
|
||||
"➡️ ➡️➡️ ➡️",
|
||||
entities(&tg.MessageEntityStrike{Offset: 5, Length: 5}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"➡️ ➡️<i>➡️ ➡️</i><b>➡️ ➡️</b>",
|
||||
"➡️ ➡️➡️ ➡️➡️ ➡️",
|
||||
entities(
|
||||
&tg.MessageEntityItalic{Offset: 5, Length: 5},
|
||||
&tg.MessageEntityBold{Offset: 10, Length: 5},
|
||||
),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
|
||||
{
|
||||
"🏟 🏟<i>🏟 <🏟</i>",
|
||||
"🏟 🏟🏟 <🏟",
|
||||
entities(&tg.MessageEntityItalic{Offset: 5, Length: 6}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<i>🏟 ><b aba = caba><🏟</b></i>",
|
||||
"🏟 🏟🏟 ><🏟",
|
||||
entities(
|
||||
&tg.MessageEntityItalic{Offset: 5, Length: 7},
|
||||
&tg.MessageEntityBold{Offset: 9, Length: 3},
|
||||
),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<i aba = 190azAz-. >a</i>",
|
||||
"🏟 🏟<a",
|
||||
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<i aba = 190azAz-.>a</i>",
|
||||
"🏟 🏟<a",
|
||||
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<i aba = \"<>"\">a</i>",
|
||||
"🏟 🏟<a",
|
||||
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<i aba = '<>"'>a</i>",
|
||||
"🏟 🏟<a",
|
||||
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<i aba = '<>"'>a</>",
|
||||
"🏟 🏟<a",
|
||||
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<i>🏟 🏟<</>",
|
||||
"🏟 🏟<🏟 🏟<",
|
||||
entities(&tg.MessageEntityItalic{Offset: 6, Length: 6}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
|
||||
{
|
||||
"🏟 🏟<<i>a</ >",
|
||||
"🏟 🏟<a",
|
||||
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<i>a</i >",
|
||||
"🏟 🏟<a",
|
||||
entities(&tg.MessageEntityItalic{Offset: 6, Length: 1}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
// Empty entity.
|
||||
{
|
||||
"🏟 🏟<<b></b>",
|
||||
"🏟 🏟<",
|
||||
nil,
|
||||
false,
|
||||
"",
|
||||
},
|
||||
// Space handling.
|
||||
{
|
||||
"<i>\t</i>",
|
||||
"\t",
|
||||
entities(&tg.MessageEntityItalic{Offset: 0, Length: 1}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"<i>\r</i>",
|
||||
"\r",
|
||||
entities(&tg.MessageEntityItalic{Offset: 0, Length: 1}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"<i>\n</i>",
|
||||
"\n",
|
||||
entities(&tg.MessageEntityItalic{Offset: 0, Length: 1}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"➡️ ➡️<span class = \"tg-spoiler\">➡️ ➡️</span><b>➡️ ➡️</b>",
|
||||
"➡️ ➡️➡️ ➡️➡️ ➡️",
|
||||
entities(&tg.MessageEntitySpoiler{Offset: 5, Length: 5}, &tg.MessageEntityBold{Offset: 10, Length: 5}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<span class=\"tg-spoiler\">🏟 <🏟</span>",
|
||||
"🏟 🏟🏟 <🏟",
|
||||
entities(&tg.MessageEntitySpoiler{Offset: 5, Length: 6}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<span class=\"tg-spoiler\">🏟 ><b aba = caba><🏟</b></span>",
|
||||
"🏟 🏟🏟 ><🏟",
|
||||
entities(&tg.MessageEntitySpoiler{Offset: 5, Length: 7}, &tg.MessageEntityBold{Offset: 9, Length: 3}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"➡️ ➡️<tg-spoiler>➡️ ➡️</tg-spoiler><b>➡️ ➡️</b>",
|
||||
"➡️ ➡️➡️ ➡️➡️ ➡️",
|
||||
entities(&tg.MessageEntitySpoiler{Offset: 5, Length: 5}, &tg.MessageEntityBold{Offset: 10, Length: 5}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<tg-spoiler>🏟 <🏟</tg-spoiler>",
|
||||
"🏟 🏟🏟 <🏟",
|
||||
entities(&tg.MessageEntitySpoiler{Offset: 5, Length: 6}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<tg-spoiler>🏟 ><b aba = caba><🏟</b></tg-spoiler>",
|
||||
"🏟 🏟🏟 ><🏟",
|
||||
entities(&tg.MessageEntitySpoiler{Offset: 5, Length: 7}, &tg.MessageEntityBold{Offset: 9, Length: 3}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"<a href=telegram.org>\t</a>",
|
||||
"\t",
|
||||
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/"}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"<a href=telegram.org>\r</a>",
|
||||
"\r",
|
||||
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/"}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"<a href=telegram.org>\n</a>",
|
||||
"\n",
|
||||
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/"}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"<code><i><b> </b></i></code><i><b><code> </code></b></i>",
|
||||
" ",
|
||||
entities(
|
||||
&tg.MessageEntityCode{Offset: 0, Length: 1},
|
||||
&tg.MessageEntityBold{Offset: 0, Length: 1},
|
||||
&tg.MessageEntityItalic{Offset: 0, Length: 1},
|
||||
&tg.MessageEntityCode{Offset: 1, Length: 1},
|
||||
&tg.MessageEntityBold{Offset: 1, Length: 1},
|
||||
&tg.MessageEntityItalic{Offset: 1, Length: 1}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"<i><b> </b> <code> </code></i>",
|
||||
" ",
|
||||
entities(
|
||||
&tg.MessageEntityItalic{Offset: 0, Length: 3},
|
||||
&tg.MessageEntityBold{Offset: 0, Length: 1},
|
||||
&tg.MessageEntityCode{Offset: 2, Length: 1},
|
||||
),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"<a href=telegram.org> </a>",
|
||||
" ",
|
||||
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/"}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"<a href =\"telegram.org\" > </a>",
|
||||
" ",
|
||||
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/"}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"<a href= 'telegram.org' > </a>",
|
||||
" ",
|
||||
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/"}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"<a href= 'telegram.org?<' > </a>",
|
||||
" ",
|
||||
entities(&tg.MessageEntityTextURL{Offset: 0, Length: 1, URL: "http://telegram.org/?<"}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
// URL handling
|
||||
{
|
||||
"<a>telegram.org </a>",
|
||||
"telegram.org ",
|
||||
nil,
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"<a>telegram.org</a>", "telegram.org",
|
||||
entities(&tg.MessageEntityTextURL{
|
||||
Offset: 0,
|
||||
Length: 12,
|
||||
URL: "http://telegram.org/",
|
||||
}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"<a>https://telegram.org/asdsa?asdasdwe#12e3we</a>",
|
||||
"https://telegram.org/asdsa?asdasdwe#12e3we",
|
||||
entities(&tg.MessageEntityTextURL{
|
||||
Offset: 0,
|
||||
Length: 42,
|
||||
URL: "https://telegram.org/asdsa?asdasdwe#12e3we",
|
||||
}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
// <pre> and <code> handling
|
||||
{
|
||||
"🏟 🏟<<pre >🏟 🏟<</>",
|
||||
"🏟 🏟<🏟 🏟<",
|
||||
entities(&tg.MessageEntityPre{Offset: 6, Length: 6}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<code >🏟 🏟<</>",
|
||||
"🏟 🏟<🏟 🏟<",
|
||||
entities(&tg.MessageEntityCode{Offset: 6, Length: 6}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<pre><code>🏟 🏟<</code></>",
|
||||
"🏟 🏟<🏟 🏟<",
|
||||
entities(
|
||||
&tg.MessageEntityPre{Offset: 6, Length: 6},
|
||||
&tg.MessageEntityCode{Offset: 6, Length: 6},
|
||||
),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<pre><code class=\"language-\">🏟 🏟<</code></>",
|
||||
"🏟 🏟<🏟 🏟<",
|
||||
entities(
|
||||
&tg.MessageEntityPre{Offset: 6, Length: 6},
|
||||
&tg.MessageEntityCode{Offset: 6, Length: 6},
|
||||
),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<pre><code class=\"language-fift\">🏟 🏟<</></>",
|
||||
"🏟 🏟<🏟 🏟<",
|
||||
entities(&tg.MessageEntityPre{Offset: 6, Length: 6, Language: "fift"}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<code class=\"language-fift\"><pre>🏟 🏟<</></>",
|
||||
"🏟 🏟<🏟 🏟<",
|
||||
entities(&tg.MessageEntityPre{Offset: 6, Length: 6, Language: "fift"}),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<pre><code class=\"language-fift\">🏟 🏟<</> </>",
|
||||
"🏟 🏟<🏟 🏟< ",
|
||||
entities(
|
||||
&tg.MessageEntityPre{Offset: 6, Length: 7},
|
||||
&tg.MessageEntityCode{Offset: 6, Length: 6},
|
||||
),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
{
|
||||
"🏟 🏟<<pre> <code class=\"language-fift\">🏟 🏟<</></>",
|
||||
"🏟 🏟< 🏟 🏟<",
|
||||
entities(
|
||||
&tg.MessageEntityPre{Offset: 6, Length: 7},
|
||||
&tg.MessageEntityCode{Offset: 7, Length: 6},
|
||||
),
|
||||
false,
|
||||
"",
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,89 @@
|
||||
package html
|
||||
|
||||
import (
|
||||
"net"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/go-faster/errors"
|
||||
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/ascii"
|
||||
"go.mau.fi/mautrix-telegram/pkg/gotd/telegram/message/entity"
|
||||
)
|
||||
|
||||
// isIPv6 reports whether str parses as an IP address and contains a colon.
//
// net.ParseIP accepts dotted IPv4 literals as well; requiring a colon keeps
// plain IPv4 addresses from being reported.
func isIPv6(str string) bool {
	if !strings.Contains(str, ":") {
		return false
	}
	return net.ParseIP(str) != nil
}
|
||||
|
||||
func validateHostname(u *url.URL) error {
|
||||
// TODO(tdakkota): make sure that it is correct
|
||||
ipv6 := isIPv6(u.Host)
|
||||
if !strings.ContainsRune(u.Host, '.') && ipv6 {
|
||||
return errors.New("wrong HTTP URL")
|
||||
}
|
||||
if ipv6 {
|
||||
return nil
|
||||
}
|
||||
|
||||
allowedSymbol := func(c rune) bool {
|
||||
return ascii.IsLatinLetter(c) ||
|
||||
ascii.IsDigit(c) ||
|
||||
(c >= '&' && c <= '.') ||
|
||||
c == '_' ||
|
||||
c == '!' ||
|
||||
c == '$' ||
|
||||
c == '~' ||
|
||||
c == ';' ||
|
||||
c == '=' ||
|
||||
c > utf8.RuneSelf
|
||||
}
|
||||
|
||||
for _, c := range u.Host {
|
||||
if !allowedSymbol(c) {
|
||||
return errors.Errorf("disallowed character %c in URL host", c)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func getURLFormatter(rawURL string, resolver entity.UserResolver) (entity.Formatter, error) {
|
||||
const defaultProtocol = "http"
|
||||
if rawURL == "" {
|
||||
return nil, errors.New("empty URL")
|
||||
}
|
||||
|
||||
// FIXME(tdakkota): move normalization to deeplink package when it's done?
|
||||
u, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if u.Scheme == "tg" && u.Host == "user" {
|
||||
id, err := strconv.ParseInt(u.Query().Get("id"), 10, 64)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "invalid user ID %q", id)
|
||||
}
|
||||
|
||||
user, err := resolver(id)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "can't resolve user %q", id)
|
||||
}
|
||||
|
||||
return entity.MentionName(user), nil
|
||||
}
|
||||
if u.Scheme == "" {
|
||||
u.Scheme = defaultProtocol
|
||||
u.Host = u.Path
|
||||
u.Path = "/"
|
||||
rawURL = u.String()
|
||||
}
|
||||
|
||||
if err := validateHostname(u); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return entity.TextURL(rawURL), nil
|
||||
}
|
||||
@@ -0,0 +1,129 @@
|
||||
package html
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
// Precondition: b[src] == '&' && dst <= src.
//
// Decimal ("&#NNN;") and hexadecimal ("&#xHHH;") numeric references and the
// named references lt, gt, amp and quot are decoded; the terminating ';' is
// consumed when present but is not required. A numeric reference whose value
// is zero or not below 0x10ffff is not decoded: only the '&' is copied and the
// cursors advance by one. An unknown named reference is copied through as is.
//
// This is adaption of html.UnescapeString from Go sources.
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
	// limit is the smallest numeric reference value that is rejected.
	const limit = 0x10ffff

	// i starts at 1 because we already know that s[0] == '&'.
	i, s := 1, b[src:]

	if len(s) <= 1 {
		b[dst] = b[src]
		return dst + 1, src + 1
	}

	if s[i] == '#' {
		if len(s) <= 3 { // We need to have at least "&#.".
			b[dst] = b[src]
			return dst + 1, src + 1
		}
		i++
		base := rune(10)
		if c := s[i]; c == 'x' || c == 'X' {
			base = 16
			i++
		}

		x := '\x00'
		for i < len(s) {
			c := s[i]
			i++

			digit := rune(-1)
			switch {
			case '0' <= c && c <= '9':
				digit = rune(c) - '0'
			case base == 16 && 'a' <= c && c <= 'f':
				digit = rune(c) - 'a' + 10
			case base == 16 && 'A' <= c && c <= 'F':
				digit = rune(c) - 'A' + 10
			}
			if digit >= 0 {
				x = base*x + digit
				// Saturate instead of letting the 32-bit accumulator wrap:
				// a long digit run must stay out of range, not alias a valid
				// code point. base*limit+digit still fits in a rune.
				if x > limit {
					x = limit
				}
				continue
			}

			if c != ';' {
				i--
			}
			break
		}

		// i <= 3 means no characters matched after "&#".
		if i <= 3 || x == 0 || x >= limit {
			b[dst] = b[src]
			return dst + 1, src + 1
		}

		return dst + utf8.EncodeRune(b[dst:], x), src + i
	}

	// Consume the maximum number of characters possible, with the
	// consumed characters matching one of the named references.

	for i < len(s) {
		c := s[i]
		i++
		// Lower-cased characters are more common in entities, so we check for them first.
		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
			continue
		}
		if c != ';' {
			i--
		}
		break
	}

	// The name excludes the leading '&' and the optional trailing ';'.
	var x rune
	tagEnd := i
	if i > 0 && s[tagEnd-1] == ';' {
		tagEnd--
	}
	switch string(s[1:tagEnd]) {
	case "lt":
		x = '<'
	case "gt":
		x = '>'
	case "amp":
		x = '&'
	case "quot":
		x = '"'
	}
	if x != 0 {
		return dst + utf8.EncodeRune(b[dst:], x), src + i
	}

	// Unknown reference: move the consumed bytes through unchanged.
	dst1, src1 = dst+i, src+i
	copy(b[dst:dst1], b[src:src1])
	return dst1, src1
}
|
||||
|
||||
// telegramEscape implements Telegram BotAPI HTML unescape.
|
||||
func telegramUnescape(b []byte) []byte {
|
||||
for i, c := range b {
|
||||
if c == '&' {
|
||||
dst, src := unescapeEntity(b, i, i)
|
||||
for src < len(b) {
|
||||
c := b[src]
|
||||
if c == '&' {
|
||||
dst, src = unescapeEntity(b, dst, src)
|
||||
} else {
|
||||
b[dst] = c
|
||||
dst, src = dst+1, src+1
|
||||
}
|
||||
}
|
||||
return b[0:dst]
|
||||
}
|
||||
}
|
||||
return b
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
package html
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func Test_telegramUnescape(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
b string
|
||||
want string
|
||||
}{
|
||||
{"NoEscapeCode", "&", "&"},
|
||||
{"NoEscapeCode", "&#", "&#"},
|
||||
{"UnicodeFlag", "🏳", string(rune(127987))},
|
||||
{"UnicodeFlag", "🏳", string(rune(127987))},
|
||||
{"UnicodeFlagHex", "🏳", string(rune(0x1f3f3))},
|
||||
{"UnicodeFlagHex", "🏳", string(rune(0x1f3f3))},
|
||||
{"lt", "<", "<"},
|
||||
{"lt", "<", "<"},
|
||||
{"gt", ">", ">"},
|
||||
{"gt", ">", ">"},
|
||||
{"amp", "&", "&"},
|
||||
{"amp", "&", "&"},
|
||||
{"quot", """, `"`},
|
||||
{"quot", """, `"`},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
require.Equal(t, []byte(tt.want), telegramUnescape([]byte(tt.b)))
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user