7a04f298d2
- update to latest telegram layer - remove some references to fields in tg.Entities that don't exist in the schema - originally added here: https://github.com/beeper/td/commit/820929062a2ba0104397bc01235ab58a9cff780e - referenced here - https://github.com/mautrix/telegramgo/commit/124f0967ed195b5a380c9bd02e170ada9710dde3 - https://github.com/mautrix/telegramgo/commit/4205047aab2e0639217148b5d125bfaab668bd8e
130 lines
2.4 KiB
Go
130 lines
2.4 KiB
Go
package html
|
|
|
|
import "unicode/utf8"
|
|
|
|
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
// Precondition: b[src] == '&' && dst <= src.
//
// Numeric references ("&#NN", "&#xHH", "&#XHH") and the named references
// lt, gt, amp and quot are decoded; a trailing ';' is consumed when present.
// A numeric reference that is malformed, zero, or not below 0x10ffff is not
// decoded: only the '&' is copied and both cursors advance by one. Any other
// named reference is copied through unchanged.
//
// This is an adaptation of html.UnescapeString from Go sources.
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
	// Numeric values at or above limit are rejected. The accumulator also
	// saturates here, so an overly long digit run cannot wrap around int32
	// and come out as an unrelated, seemingly valid code point.
	const limit = 0x10ffff

	// i starts at 1 because we already know that s[0] == '&'.
	i, s := 1, b[src:]

	if len(s) <= 1 {
		b[dst] = b[src]
		return dst + 1, src + 1
	}

	if s[i] == '#' {
		if len(s) <= 3 { // We need to have at least "&#.".
			b[dst] = b[src]
			return dst + 1, src + 1
		}
		i++
		base := rune(10)
		if c := s[i]; c == 'x' || c == 'X' {
			base = 16
			i++
		}

		var x rune
	digits:
		for i < len(s) {
			c := s[i]
			i++

			var d rune
			switch {
			case '0' <= c && c <= '9':
				d = rune(c) - '0'
			case base == 16 && 'a' <= c && c <= 'f':
				d = rune(c) - 'a' + 10
			case base == 16 && 'A' <= c && c <= 'F':
				d = rune(c) - 'A' + 10
			default:
				// Not a digit: a ';' terminator is consumed, anything else
				// is left for the caller.
				if c != ';' {
					i--
				}
				break digits
			}

			x = base*x + d
			if x > limit {
				// Saturate; the range check below turns this into a rejection.
				x = limit
			}
		}

		if i <= 3 { // No characters matched.
			b[dst] = b[src]
			return dst + 1, src + 1
		}

		if x == 0 || x >= limit {
			b[dst] = b[src]
			return dst + 1, src + 1
		}

		return dst + utf8.EncodeRune(b[dst:], x), src + i
	}

	// Consume the maximum number of characters possible, with the
	// consumed characters matching one of the named references.

	for i < len(s) {
		c := s[i]
		i++
		// Lower-cased characters are more common in entities, so we check for them first.
		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
			continue
		}
		if c != ';' {
			i--
		}
		break
	}

	// The name excludes the leading '&' and the optional trailing ';'.
	nameEnd := i
	if s[nameEnd-1] == ';' {
		nameEnd--
	}

	var x rune
	switch string(s[1:nameEnd]) {
	case "lt":
		x = '<'
	case "gt":
		x = '>'
	case "amp":
		x = '&'
	case "quot":
		x = '"'
	}
	if x != 0 {
		return dst + utf8.EncodeRune(b[dst:], x), src + i
	}

	// Unknown name: move the scanned bytes through verbatim.
	dst1, src1 = dst+i, src+i
	copy(b[dst:dst1], b[src:src1])
	return dst1, src1
}
|
|
|
|
// telegramEscape implements Telegram BotAPI HTML unescape.
|
|
func telegramUnescape(b []byte) []byte {
|
|
for i, c := range b {
|
|
if c == '&' {
|
|
dst, src := unescapeEntity(b, i, i)
|
|
for src < len(b) {
|
|
c := b[src]
|
|
if c == '&' {
|
|
dst, src = unescapeEntity(b, dst, src)
|
|
} else {
|
|
b[dst] = c
|
|
dst, src = dst+1, src+1
|
|
}
|
|
}
|
|
return b[0:dst]
|
|
}
|
|
}
|
|
return b
|
|
}
|