vendor dependencies

This commit is contained in:
2025-03-15 20:42:37 -04:00
parent dd693fb000
commit af65c66317
84 changed files with 19478 additions and 0 deletions

View File

@@ -0,0 +1,627 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package jsonwire
import (
"io"
"math"
"slices"
"strconv"
"unicode/utf16"
"unicode/utf8"
)
type ValueFlags uint
const (
_ ValueFlags = (1 << iota) / 2 // powers of two starting with zero
stringNonVerbatim // string cannot be naively treated as valid UTF-8
stringNonCanonical // string not formatted according to RFC 8785, section 3.2.2.2.
// TODO: Track whether a number is a non-integer?
)
func (f *ValueFlags) Join(f2 ValueFlags) { *f |= f2 }
func (f ValueFlags) IsVerbatim() bool { return f&stringNonVerbatim == 0 }
func (f ValueFlags) IsCanonical() bool { return f&stringNonCanonical == 0 }
// ConsumeWhitespace consumes leading JSON whitespace per RFC 7159, section 2.
func ConsumeWhitespace(b []byte) (n int) {
// NOTE: The arguments and logic are kept simple to keep this inlinable.
for len(b) > n && (b[n] == ' ' || b[n] == '\t' || b[n] == '\r' || b[n] == '\n') {
n++
}
return n
}
// ConsumeNull consumes the next JSON null literal per RFC 7159, section 3.
// It returns 0 if it is invalid, in which case consumeLiteral should be used.
func ConsumeNull(b []byte) int {
// NOTE: The arguments and logic are kept simple to keep this inlinable.
const literal = "null"
if len(b) >= len(literal) && string(b[:len(literal)]) == literal {
return len(literal)
}
return 0
}
// ConsumeFalse consumes the next JSON false literal per RFC 7159, section 3.
// It returns 0 if it is invalid, in which case consumeLiteral should be used.
func ConsumeFalse(b []byte) int {
// NOTE: The arguments and logic are kept simple to keep this inlinable.
const literal = "false"
if len(b) >= len(literal) && string(b[:len(literal)]) == literal {
return len(literal)
}
return 0
}
// ConsumeTrue consumes the next JSON true literal per RFC 7159, section 3.
// It returns 0 if it is invalid, in which case consumeLiteral should be used.
func ConsumeTrue(b []byte) int {
// NOTE: The arguments and logic are kept simple to keep this inlinable.
const literal = "true"
if len(b) >= len(literal) && string(b[:len(literal)]) == literal {
return len(literal)
}
return 0
}
// ConsumeLiteral consumes the next JSON literal per RFC 7159, section 3.
// If the input appears truncated, it returns io.ErrUnexpectedEOF.
func ConsumeLiteral(b []byte, lit string) (n int, err error) {
for i := 0; i < len(b) && i < len(lit); i++ {
if b[i] != lit[i] {
return i, NewInvalidCharacterError(b[i:], "in literal "+lit+" (expecting "+strconv.QuoteRune(rune(lit[i]))+")")
}
}
if len(b) < len(lit) {
return len(b), io.ErrUnexpectedEOF
}
return len(lit), nil
}
// ConsumeSimpleString consumes the next JSON string per RFC 7159, section 7
// but is limited to the grammar for an ASCII string without escape sequences.
// It returns 0 if it is invalid or more complicated than a simple string,
// in which case consumeString should be called.
//
// It rejects '<', '>', and '&' for compatibility reasons since these were
// always escaped in the v1 implementation. Thus, if this function reports
// non-zero then we know that the string would be encoded the same way
// under both v1 or v2 escape semantics.
func ConsumeSimpleString(b []byte) (n int) {
// NOTE: The arguments and logic are kept simple to keep this inlinable.
if len(b) > 0 && b[0] == '"' {
n++
for len(b) > n && b[n] < utf8.RuneSelf && escapeASCII[b[n]] == 0 {
n++
}
if uint(len(b)) > uint(n) && b[n] == '"' {
n++
return n
}
}
return 0
}
// ConsumeString consumes the next JSON string per RFC 7159, section 7.
// If validateUTF8 is false, then this allows the presence of invalid UTF-8
// characters within the string itself.
// It reports the number of bytes consumed and whether an error was encountered.
// If the input appears truncated, it returns io.ErrUnexpectedEOF.
func ConsumeString(flags *ValueFlags, b []byte, validateUTF8 bool) (n int, err error) {
return ConsumeStringResumable(flags, b, 0, validateUTF8)
}
// ConsumeStringResumable is identical to consumeString but supports resuming
// from a previous call that returned io.ErrUnexpectedEOF.
func ConsumeStringResumable(flags *ValueFlags, b []byte, resumeOffset int, validateUTF8 bool) (n int, err error) {
// Consume the leading double quote.
switch {
case resumeOffset > 0:
n = resumeOffset // already handled the leading quote
case uint(len(b)) == 0:
return n, io.ErrUnexpectedEOF
case b[0] == '"':
n++
default:
return n, NewInvalidCharacterError(b[n:], `at start of string (expecting '"')`)
}
// Consume every character in the string.
for uint(len(b)) > uint(n) {
// Optimize for long sequences of unescaped characters.
noEscape := func(c byte) bool {
return c < utf8.RuneSelf && ' ' <= c && c != '\\' && c != '"'
}
for uint(len(b)) > uint(n) && noEscape(b[n]) {
n++
}
if uint(len(b)) <= uint(n) {
return n, io.ErrUnexpectedEOF
}
// Check for terminating double quote.
if b[n] == '"' {
n++
return n, nil
}
switch r, rn := utf8.DecodeRune(b[n:]); {
// Handle UTF-8 encoded byte sequence.
// Due to specialized handling of ASCII above, we know that
// all normal sequences at this point must be 2 bytes or larger.
case rn > 1:
n += rn
// Handle escape sequence.
case r == '\\':
flags.Join(stringNonVerbatim)
resumeOffset = n
if uint(len(b)) < uint(n+2) {
return resumeOffset, io.ErrUnexpectedEOF
}
switch r := b[n+1]; r {
case '/':
// Forward slash is the only character with 3 representations.
// Per RFC 8785, section 3.2.2.2., this must not be escaped.
flags.Join(stringNonCanonical)
n += 2
case '"', '\\', 'b', 'f', 'n', 'r', 't':
n += 2
case 'u':
if uint(len(b)) < uint(n+6) {
if hasEscapedUTF16Prefix(b[n:], false) {
return resumeOffset, io.ErrUnexpectedEOF
}
flags.Join(stringNonCanonical)
return n, NewInvalidEscapeSequenceError(b[n:])
}
v1, ok := parseHexUint16(b[n+2 : n+6])
if !ok {
flags.Join(stringNonCanonical)
return n, NewInvalidEscapeSequenceError(b[n : n+6])
}
// Only certain control characters can use the \uFFFF notation
// for canonical formatting (per RFC 8785, section 3.2.2.2.).
switch v1 {
// \uFFFF notation not permitted for these characters.
case '\b', '\f', '\n', '\r', '\t':
flags.Join(stringNonCanonical)
default:
// \uFFFF notation only permitted for control characters.
if v1 >= ' ' {
flags.Join(stringNonCanonical)
} else {
// \uFFFF notation must be lower case.
for _, c := range b[n+2 : n+6] {
if 'A' <= c && c <= 'F' {
flags.Join(stringNonCanonical)
}
}
}
}
n += 6
r := rune(v1)
if validateUTF8 && utf16.IsSurrogate(r) {
if uint(len(b)) < uint(n+6) {
if hasEscapedUTF16Prefix(b[n:], true) {
return resumeOffset, io.ErrUnexpectedEOF
}
flags.Join(stringNonCanonical)
return n - 6, NewInvalidEscapeSequenceError(b[n-6:])
} else if v2, ok := parseHexUint16(b[n+2 : n+6]); b[n] != '\\' || b[n+1] != 'u' || !ok {
flags.Join(stringNonCanonical)
return n - 6, NewInvalidEscapeSequenceError(b[n-6 : n+6])
} else if r = utf16.DecodeRune(rune(v1), rune(v2)); r == utf8.RuneError {
flags.Join(stringNonCanonical)
return n - 6, NewInvalidEscapeSequenceError(b[n-6 : n+6])
} else {
n += 6
}
}
default:
flags.Join(stringNonCanonical)
return n, NewInvalidEscapeSequenceError(b[n : n+2])
}
// Handle invalid UTF-8.
case r == utf8.RuneError:
if !utf8.FullRune(b[n:]) {
return n, io.ErrUnexpectedEOF
}
flags.Join(stringNonVerbatim | stringNonCanonical)
if validateUTF8 {
return n, ErrInvalidUTF8
}
n++
// Handle invalid control characters.
case r < ' ':
flags.Join(stringNonVerbatim | stringNonCanonical)
return n, NewInvalidCharacterError(b[n:], "in string (expecting non-control character)")
default:
panic("BUG: unhandled character " + QuoteRune(b[n:]))
}
}
return n, io.ErrUnexpectedEOF
}
// AppendUnquote appends the unescaped form of a JSON string in src to dst.
// Any invalid UTF-8 within the string will be replaced with utf8.RuneError,
// but the error will be specified as having encountered such an error.
// The input must be an entire JSON string with no surrounding whitespace.
func AppendUnquote[Bytes ~[]byte | ~string](dst []byte, src Bytes) (v []byte, err error) {
dst = slices.Grow(dst, len(src))
// Consume the leading double quote.
var i, n int
switch {
case uint(len(src)) == 0:
return dst, io.ErrUnexpectedEOF
case src[0] == '"':
i, n = 1, 1
default:
return dst, NewInvalidCharacterError(src, `at start of string (expecting '"')`)
}
// Consume every character in the string.
for uint(len(src)) > uint(n) {
// Optimize for long sequences of unescaped characters.
noEscape := func(c byte) bool {
return c < utf8.RuneSelf && ' ' <= c && c != '\\' && c != '"'
}
for uint(len(src)) > uint(n) && noEscape(src[n]) {
n++
}
if uint(len(src)) <= uint(n) {
dst = append(dst, src[i:n]...)
return dst, io.ErrUnexpectedEOF
}
// Check for terminating double quote.
if src[n] == '"' {
dst = append(dst, src[i:n]...)
n++
if n < len(src) {
err = NewInvalidCharacterError(src[n:], "after string value")
}
return dst, err
}
switch r, rn := utf8.DecodeRuneInString(string(truncateMaxUTF8(src[n:]))); {
// Handle UTF-8 encoded byte sequence.
// Due to specialized handling of ASCII above, we know that
// all normal sequences at this point must be 2 bytes or larger.
case rn > 1:
n += rn
// Handle escape sequence.
case r == '\\':
dst = append(dst, src[i:n]...)
// Handle escape sequence.
if uint(len(src)) < uint(n+2) {
return dst, io.ErrUnexpectedEOF
}
switch r := src[n+1]; r {
case '"', '\\', '/':
dst = append(dst, r)
n += 2
case 'b':
dst = append(dst, '\b')
n += 2
case 'f':
dst = append(dst, '\f')
n += 2
case 'n':
dst = append(dst, '\n')
n += 2
case 'r':
dst = append(dst, '\r')
n += 2
case 't':
dst = append(dst, '\t')
n += 2
case 'u':
if uint(len(src)) < uint(n+6) {
if hasEscapedUTF16Prefix(src[n:], false) {
return dst, io.ErrUnexpectedEOF
}
return dst, NewInvalidEscapeSequenceError(src[n:])
}
v1, ok := parseHexUint16(src[n+2 : n+6])
if !ok {
return dst, NewInvalidEscapeSequenceError(src[n : n+6])
}
n += 6
// Check whether this is a surrogate half.
r := rune(v1)
if utf16.IsSurrogate(r) {
r = utf8.RuneError // assume failure unless the following succeeds
if uint(len(src)) < uint(n+6) {
if hasEscapedUTF16Prefix(src[n:], true) {
return utf8.AppendRune(dst, r), io.ErrUnexpectedEOF
}
err = NewInvalidEscapeSequenceError(src[n-6:])
} else if v2, ok := parseHexUint16(src[n+2 : n+6]); src[n] != '\\' || src[n+1] != 'u' || !ok {
err = NewInvalidEscapeSequenceError(src[n-6 : n+6])
} else if r = utf16.DecodeRune(rune(v1), rune(v2)); r == utf8.RuneError {
err = NewInvalidEscapeSequenceError(src[n-6 : n+6])
} else {
n += 6
}
}
dst = utf8.AppendRune(dst, r)
default:
return dst, NewInvalidEscapeSequenceError(src[n : n+2])
}
i = n
// Handle invalid UTF-8.
case r == utf8.RuneError:
dst = append(dst, src[i:n]...)
if !utf8.FullRuneInString(string(truncateMaxUTF8(src[n:]))) {
return dst, io.ErrUnexpectedEOF
}
// NOTE: An unescaped string may be longer than the escaped string
// because invalid UTF-8 bytes are being replaced.
dst = append(dst, "\uFFFD"...)
n += rn
i = n
err = ErrInvalidUTF8
// Handle invalid control characters.
case r < ' ':
dst = append(dst, src[i:n]...)
return dst, NewInvalidCharacterError(src[n:], "in string (expecting non-control character)")
default:
panic("BUG: unhandled character " + QuoteRune(src[n:]))
}
}
dst = append(dst, src[i:n]...)
return dst, io.ErrUnexpectedEOF
}
// hasEscapedUTF16Prefix reports whether b is possibly
// the truncated prefix of a \uFFFF escape sequence.
func hasEscapedUTF16Prefix[Bytes ~[]byte | ~string](b Bytes, lowerSurrogateHalf bool) bool {
for i := range len(b) {
switch c := b[i]; {
case i == 0 && c != '\\':
return false
case i == 1 && c != 'u':
return false
case i == 2 && lowerSurrogateHalf && c != 'd' && c != 'D':
return false // not within ['\uDC00':'\uDFFF']
case i == 3 && lowerSurrogateHalf && !('c' <= c && c <= 'f') && !('C' <= c && c <= 'F'):
return false // not within ['\uDC00':'\uDFFF']
case i >= 2 && i < 6 && !('0' <= c && c <= '9') && !('a' <= c && c <= 'f') && !('A' <= c && c <= 'F'):
return false
}
}
return true
}
// UnquoteMayCopy returns the unescaped form of b.
// If there are no escaped characters, the output is simply a subslice of
// the input with the surrounding quotes removed.
// Otherwise, a new buffer is allocated for the output.
// It assumes the input is valid.
func UnquoteMayCopy(b []byte, isVerbatim bool) []byte {
// NOTE: The arguments and logic are kept simple to keep this inlinable.
if isVerbatim {
return b[len(`"`) : len(b)-len(`"`)]
}
b, _ = AppendUnquote(nil, b)
return b
}
// ConsumeSimpleNumber consumes the next JSON number per RFC 7159, section 6
// but is limited to the grammar for a positive integer.
// It returns 0 if it is invalid or more complicated than a simple integer,
// in which case consumeNumber should be called.
func ConsumeSimpleNumber(b []byte) (n int) {
// NOTE: The arguments and logic are kept simple to keep this inlinable.
if len(b) > 0 {
if b[0] == '0' {
n++
} else if '1' <= b[0] && b[0] <= '9' {
n++
for len(b) > n && ('0' <= b[n] && b[n] <= '9') {
n++
}
} else {
return 0
}
if uint(len(b)) <= uint(n) || (b[n] != '.' && b[n] != 'e' && b[n] != 'E') {
return n
}
}
return 0
}
type ConsumeNumberState uint
const (
consumeNumberInit ConsumeNumberState = iota
beforeIntegerDigits
withinIntegerDigits
beforeFractionalDigits
withinFractionalDigits
beforeExponentDigits
withinExponentDigits
)
// ConsumeNumber consumes the next JSON number per RFC 7159, section 6.
// It reports the number of bytes consumed and whether an error was encountered.
// If the input appears truncated, it returns io.ErrUnexpectedEOF.
//
// Note that JSON numbers are not self-terminating.
// If the entire input is consumed, then the caller needs to consider whether
// there may be subsequent unread data that may still be part of this number.
func ConsumeNumber(b []byte) (n int, err error) {
n, _, err = ConsumeNumberResumable(b, 0, consumeNumberInit)
return n, err
}
// ConsumeNumberResumable is identical to consumeNumber but supports resuming
// from a previous call that returned io.ErrUnexpectedEOF.
func ConsumeNumberResumable(b []byte, resumeOffset int, state ConsumeNumberState) (n int, _ ConsumeNumberState, err error) {
// Jump to the right state when resuming from a partial consumption.
n = resumeOffset
if state > consumeNumberInit {
switch state {
case withinIntegerDigits, withinFractionalDigits, withinExponentDigits:
// Consume leading digits.
for uint(len(b)) > uint(n) && ('0' <= b[n] && b[n] <= '9') {
n++
}
if uint(len(b)) <= uint(n) {
return n, state, nil // still within the same state
}
state++ // switches "withinX" to "beforeY" where Y is the state after X
}
switch state {
case beforeIntegerDigits:
goto beforeInteger
case beforeFractionalDigits:
goto beforeFractional
case beforeExponentDigits:
goto beforeExponent
default:
return n, state, nil
}
}
// Consume required integer component (with optional minus sign).
beforeInteger:
resumeOffset = n
if uint(len(b)) > 0 && b[0] == '-' {
n++
}
switch {
case uint(len(b)) <= uint(n):
return resumeOffset, beforeIntegerDigits, io.ErrUnexpectedEOF
case b[n] == '0':
n++
state = beforeFractionalDigits
case '1' <= b[n] && b[n] <= '9':
n++
for uint(len(b)) > uint(n) && ('0' <= b[n] && b[n] <= '9') {
n++
}
state = withinIntegerDigits
default:
return n, state, NewInvalidCharacterError(b[n:], "in number (expecting digit)")
}
// Consume optional fractional component.
beforeFractional:
if uint(len(b)) > uint(n) && b[n] == '.' {
resumeOffset = n
n++
switch {
case uint(len(b)) <= uint(n):
return resumeOffset, beforeFractionalDigits, io.ErrUnexpectedEOF
case '0' <= b[n] && b[n] <= '9':
n++
default:
return n, state, NewInvalidCharacterError(b[n:], "in number (expecting digit)")
}
for uint(len(b)) > uint(n) && ('0' <= b[n] && b[n] <= '9') {
n++
}
state = withinFractionalDigits
}
// Consume optional exponent component.
beforeExponent:
if uint(len(b)) > uint(n) && (b[n] == 'e' || b[n] == 'E') {
resumeOffset = n
n++
if uint(len(b)) > uint(n) && (b[n] == '-' || b[n] == '+') {
n++
}
switch {
case uint(len(b)) <= uint(n):
return resumeOffset, beforeExponentDigits, io.ErrUnexpectedEOF
case '0' <= b[n] && b[n] <= '9':
n++
default:
return n, state, NewInvalidCharacterError(b[n:], "in number (expecting digit)")
}
for uint(len(b)) > uint(n) && ('0' <= b[n] && b[n] <= '9') {
n++
}
state = withinExponentDigits
}
return n, state, nil
}
// parseHexUint16 is similar to strconv.ParseUint,
// but operates directly on []byte and is optimized for base-16.
// See https://go.dev/issue/42429.
func parseHexUint16[Bytes ~[]byte | ~string](b Bytes) (v uint16, ok bool) {
if len(b) != 4 {
return 0, false
}
for i := range 4 {
c := b[i]
switch {
case '0' <= c && c <= '9':
c = c - '0'
case 'a' <= c && c <= 'f':
c = 10 + c - 'a'
case 'A' <= c && c <= 'F':
c = 10 + c - 'A'
default:
return 0, false
}
v = v*16 + uint16(c)
}
return v, true
}
// ParseUint parses b as a decimal unsigned integer according to
// a strict subset of the JSON number grammar, returning the value if valid.
// It returns (0, false) if there is a syntax error and
// returns (math.MaxUint64, false) if there is an overflow.
func ParseUint(b []byte) (v uint64, ok bool) {
const unsafeWidth = 20 // len(fmt.Sprint(uint64(math.MaxUint64)))
var n int
for ; len(b) > n && ('0' <= b[n] && b[n] <= '9'); n++ {
v = 10*v + uint64(b[n]-'0')
}
switch {
case n == 0 || len(b) != n || (b[0] == '0' && string(b) != "0"):
return 0, false
case n >= unsafeWidth && (b[0] != '1' || v < 1e19 || n > unsafeWidth):
return math.MaxUint64, false
}
return v, true
}
// ParseFloat parses a floating point number according to the Go float grammar.
// Note that the JSON number grammar is a strict subset.
//
// If the number overflows the finite representation of a float,
// then we return MaxFloat since any finite value will always be infinitely
// more accurate at representing another finite value than an infinite value.
func ParseFloat(b []byte, bits int) (v float64, ok bool) {
fv, err := strconv.ParseFloat(string(b), bits)
if math.IsInf(fv, 0) {
switch {
case bits == 32 && math.IsInf(fv, +1):
fv = +math.MaxFloat32
case bits == 64 && math.IsInf(fv, +1):
fv = +math.MaxFloat64
case bits == 32 && math.IsInf(fv, -1):
fv = -math.MaxFloat32
case bits == 64 && math.IsInf(fv, -1):
fv = -math.MaxFloat64
}
}
return fv, err == nil
}

View File

@@ -0,0 +1,292 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package jsonwire
import (
"math"
"slices"
"strconv"
"unicode/utf16"
"unicode/utf8"
"github.com/go-json-experiment/json/internal/jsonflags"
)
// escapeASCII reports whether the ASCII character needs to be escaped.
// It conservatively assumes EscapeForHTML.
var escapeASCII = [...]uint8{
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // escape control characters
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // escape control characters
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, // escape '"' and '&'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, // escape '<' and '>'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, // escape '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}
// NeedEscape reports whether src needs escaping of any characters.
// It conservatively assumes EscapeForHTML and EscapeForJS.
// It reports true for inputs with invalid UTF-8.
func NeedEscape[Bytes ~[]byte | ~string](src Bytes) bool {
var i int
for uint(len(src)) > uint(i) {
if c := src[i]; c < utf8.RuneSelf {
if escapeASCII[c] > 0 {
return true
}
i++
} else {
r, rn := utf8.DecodeRuneInString(string(truncateMaxUTF8(src[i:])))
if r == utf8.RuneError || r == '\u2028' || r == '\u2029' {
return true
}
i += rn
}
}
return false
}
// AppendQuote appends src to dst as a JSON string per RFC 7159, section 7.
//
// It takes in flags and respects the following:
// - EscapeForHTML escapes '<', '>', and '&'.
// - EscapeForJS escapes '\u2028' and '\u2029'.
// - AllowInvalidUTF8 avoids reporting an error for invalid UTF-8.
//
// Regardless of whether AllowInvalidUTF8 is specified,
// invalid bytes are replaced with the Unicode replacement character ('\ufffd').
// If no escape flags are set, then the shortest representable form is used,
// which is also the canonical form for strings (RFC 8785, section 3.2.2.2).
func AppendQuote[Bytes ~[]byte | ~string](dst []byte, src Bytes, flags *jsonflags.Flags) ([]byte, error) {
var i, n int
var hasInvalidUTF8 bool
dst = slices.Grow(dst, len(`"`)+len(src)+len(`"`))
dst = append(dst, '"')
for uint(len(src)) > uint(n) {
if c := src[n]; c < utf8.RuneSelf {
// Handle single-byte ASCII.
n++
if escapeASCII[c] == 0 {
continue // no escaping possibly needed
}
// Handle escaping of single-byte ASCII.
if !(c == '<' || c == '>' || c == '&') || flags.Get(jsonflags.EscapeForHTML) {
dst = append(dst, src[i:n-1]...)
dst = appendEscapedASCII(dst, c)
i = n
}
} else {
// Handle multi-byte Unicode.
r, rn := utf8.DecodeRuneInString(string(truncateMaxUTF8(src[n:])))
n += rn
if r != utf8.RuneError && r != '\u2028' && r != '\u2029' {
continue // no escaping possibly needed
}
// Handle escaping of multi-byte Unicode.
switch {
case isInvalidUTF8(r, rn):
hasInvalidUTF8 = true
dst = append(dst, src[i:n-rn]...)
if flags.Get(jsonflags.EscapeInvalidUTF8) {
dst = append(dst, `\ufffd`...)
} else {
dst = append(dst, "\ufffd"...)
}
i = n
case (r == '\u2028' || r == '\u2029') && flags.Get(jsonflags.EscapeForJS):
dst = append(dst, src[i:n-rn]...)
dst = appendEscapedUnicode(dst, r)
i = n
}
}
}
dst = append(dst, src[i:n]...)
dst = append(dst, '"')
if hasInvalidUTF8 && !flags.Get(jsonflags.AllowInvalidUTF8) {
return dst, ErrInvalidUTF8
}
return dst, nil
}
func appendEscapedASCII(dst []byte, c byte) []byte {
switch c {
case '"', '\\':
dst = append(dst, '\\', c)
case '\b':
dst = append(dst, "\\b"...)
case '\f':
dst = append(dst, "\\f"...)
case '\n':
dst = append(dst, "\\n"...)
case '\r':
dst = append(dst, "\\r"...)
case '\t':
dst = append(dst, "\\t"...)
default:
dst = appendEscapedUTF16(dst, uint16(c))
}
return dst
}
func appendEscapedUnicode(dst []byte, r rune) []byte {
if r1, r2 := utf16.EncodeRune(r); r1 != '\ufffd' && r2 != '\ufffd' {
dst = appendEscapedUTF16(dst, uint16(r1))
dst = appendEscapedUTF16(dst, uint16(r2))
} else {
dst = appendEscapedUTF16(dst, uint16(r))
}
return dst
}
func appendEscapedUTF16(dst []byte, x uint16) []byte {
const hex = "0123456789abcdef"
return append(dst, '\\', 'u', hex[(x>>12)&0xf], hex[(x>>8)&0xf], hex[(x>>4)&0xf], hex[(x>>0)&0xf])
}
// ReformatString consumes a JSON string from src and appends it to dst,
// reformatting it if necessary according to the specified flags.
// It returns the appended output and the number of consumed input bytes.
func ReformatString(dst, src []byte, flags *jsonflags.Flags) ([]byte, int, error) {
// TODO: Should this update ValueFlags as input?
var valFlags ValueFlags
n, err := ConsumeString(&valFlags, src, !flags.Get(jsonflags.AllowInvalidUTF8))
if err != nil {
return dst, n, err
}
// If the output requires no special escapes, and the input
// is already in canonical form or should be preserved verbatim,
// then directly copy the input to the output.
if !flags.Get(jsonflags.AnyEscape) &&
(valFlags.IsCanonical() || flags.Get(jsonflags.PreserveRawStrings)) {
dst = append(dst, src[:n]...) // copy the string verbatim
return dst, n, nil
}
// Under [jsonflags.PreserveRawStrings], any pre-escaped sequences
// remain escaped, however we still need to respect the
// [jsonflags.EscapeForHTML] and [jsonflags.EscapeForJS] options.
if flags.Get(jsonflags.PreserveRawStrings) {
var i, lastAppendIndex int
for i < n {
if c := src[i]; c < utf8.RuneSelf {
if (c == '<' || c == '>' || c == '&') && flags.Get(jsonflags.EscapeForHTML) {
dst = append(dst, src[lastAppendIndex:i]...)
dst = appendEscapedASCII(dst, c)
lastAppendIndex = i + 1
}
i++
} else {
r, rn := utf8.DecodeRune(truncateMaxUTF8(src[i:]))
if (r == '\u2028' || r == '\u2029') && flags.Get(jsonflags.EscapeForJS) {
dst = append(dst, src[lastAppendIndex:i]...)
dst = appendEscapedUnicode(dst, r)
lastAppendIndex = i + rn
}
i += rn
}
}
return append(dst, src[lastAppendIndex:n]...), n, nil
}
// The input contains characters that might need escaping,
// unnecessary escape sequences, or invalid UTF-8.
// Perform a round-trip unquote and quote to properly reformat
// these sequences according the current flags.
b, _ := AppendUnquote(nil, src[:n])
dst, _ = AppendQuote(dst, b, flags)
return dst, n, nil
}
// AppendFloat appends src to dst as a JSON number per RFC 7159, section 6.
// It formats numbers similar to the ES6 number-to-string conversion.
// See https://go.dev/issue/14135.
//
// The output is identical to ECMA-262, 6th edition, section 7.1.12.1 and with
// RFC 8785, section 3.2.2.3 for 64-bit floating-point numbers except for -0,
// which is formatted as -0 instead of just 0.
//
// For 32-bit floating-point numbers,
// the output is a 32-bit equivalent of the algorithm.
// Note that ECMA-262 specifies no algorithm for 32-bit numbers.
func AppendFloat(dst []byte, src float64, bits int) []byte {
if bits == 32 {
src = float64(float32(src))
}
abs := math.Abs(src)
fmt := byte('f')
if abs != 0 {
if bits == 64 && (float64(abs) < 1e-6 || float64(abs) >= 1e21) ||
bits == 32 && (float32(abs) < 1e-6 || float32(abs) >= 1e21) {
fmt = 'e'
}
}
dst = strconv.AppendFloat(dst, src, fmt, -1, bits)
if fmt == 'e' {
// Clean up e-09 to e-9.
n := len(dst)
if n >= 4 && dst[n-4] == 'e' && dst[n-3] == '-' && dst[n-2] == '0' {
dst[n-2] = dst[n-1]
dst = dst[:n-1]
}
}
return dst
}
// ReformatNumber consumes a JSON string from src and appends it to dst,
// canonicalizing it if specified.
// It returns the appended output and the number of consumed input bytes.
func ReformatNumber(dst, src []byte, flags *jsonflags.Flags) ([]byte, int, error) {
n, err := ConsumeNumber(src)
if err != nil {
return dst, n, err
}
if !flags.Get(jsonflags.CanonicalizeNumbers) {
dst = append(dst, src[:n]...) // copy the number verbatim
return dst, n, nil
}
// Identify the kind of number.
var isFloat bool
for _, c := range src[:n] {
if c == '.' || c == 'e' || c == 'E' {
isFloat = true // has fraction or exponent
break
}
}
// Check if need to canonicalize this kind of number.
switch {
case string(src[:n]) == "-0":
break // canonicalize -0 as 0 regardless of kind
case isFloat:
if !flags.Get(jsonflags.CanonicalizeRawFloats) {
dst = append(dst, src[:n]...) // copy the number verbatim
return dst, n, nil
}
default:
// As an optimization, we can copy integer numbers below 2⁵³ verbatim
// since the canonical form is always identical.
const maxExactIntegerDigits = 16 // len(strconv.AppendUint(nil, 1<<53, 10))
if !flags.Get(jsonflags.CanonicalizeRawInts) || n < maxExactIntegerDigits {
dst = append(dst, src[:n]...) // copy the number verbatim
return dst, n, nil
}
}
// Parse and reformat the number (which uses a canonical format).
fv, _ := strconv.ParseFloat(string(src[:n]), 64)
switch {
case fv == 0:
fv = 0 // normalize negative zero as just zero
case math.IsInf(fv, +1):
fv = +math.MaxFloat64
case math.IsInf(fv, -1):
fv = -math.MaxFloat64
}
return AppendFloat(dst, fv, 64), n, nil
}

View File

@@ -0,0 +1,215 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package jsonwire implements stateless functionality for handling JSON text.
package jsonwire
import (
"cmp"
"errors"
"strconv"
"strings"
"unicode"
"unicode/utf16"
"unicode/utf8"
)
// TrimSuffixWhitespace trims JSON from the end of b.
func TrimSuffixWhitespace(b []byte) []byte {
// NOTE: The arguments and logic are kept simple to keep this inlinable.
n := len(b) - 1
for n >= 0 && (b[n] == ' ' || b[n] == '\t' || b[n] == '\r' || b[n] == '\n') {
n--
}
return b[:n+1]
}
// TrimSuffixString trims a valid JSON string at the end of b.
// The behavior is undefined if there is not a valid JSON string present.
func TrimSuffixString(b []byte) []byte {
// NOTE: The arguments and logic are kept simple to keep this inlinable.
if len(b) > 0 && b[len(b)-1] == '"' {
b = b[:len(b)-1]
}
for len(b) >= 2 && !(b[len(b)-1] == '"' && b[len(b)-2] != '\\') {
b = b[:len(b)-1] // trim all characters except an unescaped quote
}
if len(b) > 0 && b[len(b)-1] == '"' {
b = b[:len(b)-1]
}
return b
}
// HasSuffixByte reports whether b ends with c.
func HasSuffixByte(b []byte, c byte) bool {
// NOTE: The arguments and logic are kept simple to keep this inlinable.
return len(b) > 0 && b[len(b)-1] == c
}
// TrimSuffixByte removes c from the end of b if it is present.
func TrimSuffixByte(b []byte, c byte) []byte {
// NOTE: The arguments and logic are kept simple to keep this inlinable.
if len(b) > 0 && b[len(b)-1] == c {
return b[:len(b)-1]
}
return b
}
// QuoteRune quotes the first rune in the input.
func QuoteRune[Bytes ~[]byte | ~string](b Bytes) string {
r, n := utf8.DecodeRuneInString(string(truncateMaxUTF8(b)))
if r == utf8.RuneError && n == 1 {
return `'\x` + strconv.FormatUint(uint64(b[0]), 16) + `'`
}
return strconv.QuoteRune(r)
}
// CompareUTF16 lexicographically compares x to y according
// to the UTF-16 codepoints of the UTF-8 encoded input strings.
// This implements the ordering specified in RFC 8785, section 3.2.3.
func CompareUTF16[Bytes ~[]byte | ~string](x, y Bytes) int {
// NOTE: This is an optimized, mostly allocation-free implementation
// of CompareUTF16Simple in wire_test.go. FuzzCompareUTF16 verifies that the
// two implementations agree on the result of comparing any two strings.
isUTF16Self := func(r rune) bool {
return ('\u0000' <= r && r <= '\uD7FF') || ('\uE000' <= r && r <= '\uFFFF')
}
for {
if len(x) == 0 || len(y) == 0 {
return cmp.Compare(len(x), len(y))
}
// ASCII fast-path.
if x[0] < utf8.RuneSelf || y[0] < utf8.RuneSelf {
if x[0] != y[0] {
return cmp.Compare(x[0], y[0])
}
x, y = x[1:], y[1:]
continue
}
// Decode next pair of runes as UTF-8.
rx, nx := utf8.DecodeRuneInString(string(truncateMaxUTF8(x)))
ry, ny := utf8.DecodeRuneInString(string(truncateMaxUTF8(y)))
selfx := isUTF16Self(rx)
selfy := isUTF16Self(ry)
switch {
// The x rune is a single UTF-16 codepoint, while
// the y rune is a surrogate pair of UTF-16 codepoints.
case selfx && !selfy:
ry, _ = utf16.EncodeRune(ry)
// The y rune is a single UTF-16 codepoint, while
// the x rune is a surrogate pair of UTF-16 codepoints.
case selfy && !selfx:
rx, _ = utf16.EncodeRune(rx)
}
if rx != ry {
return cmp.Compare(rx, ry)
}
// Check for invalid UTF-8, in which case,
// we just perform a byte-for-byte comparison.
if isInvalidUTF8(rx, nx) || isInvalidUTF8(ry, ny) {
if x[0] != y[0] {
return cmp.Compare(x[0], y[0])
}
}
x, y = x[nx:], y[ny:]
}
}
// truncateMaxUTF8 truncates b such it contains at least one rune.
//
// The utf8 package currently lacks generic variants, which complicates
// generic functions that operates on either []byte or string.
// As a hack, we always call the utf8 function operating on strings,
// but always truncate the input such that the result is identical.
//
// Example usage:
//
// utf8.DecodeRuneInString(string(truncateMaxUTF8(b)))
//
// Converting a []byte to a string is stack allocated since
// truncateMaxUTF8 guarantees that the []byte is short.
func truncateMaxUTF8[Bytes ~[]byte | ~string](b Bytes) Bytes {
// TODO(https://go.dev/issue/56948): Remove this function and
// instead directly call generic utf8 functions wherever used.
if len(b) > utf8.UTFMax {
return b[:utf8.UTFMax]
}
return b
}
// TODO(https://go.dev/issue/70547): Use utf8.ErrInvalid instead.
var ErrInvalidUTF8 = errors.New("invalid UTF-8")
func NewInvalidCharacterError[Bytes ~[]byte | ~string](prefix Bytes, where string) error {
what := QuoteRune(prefix)
return errors.New("invalid character " + what + " " + where)
}
func NewInvalidEscapeSequenceError[Bytes ~[]byte | ~string](what Bytes) error {
label := "escape sequence"
if len(what) > 6 {
label = "surrogate pair"
}
needEscape := strings.IndexFunc(string(what), func(r rune) bool {
return r == '`' || r == utf8.RuneError || unicode.IsSpace(r) || !unicode.IsPrint(r)
}) >= 0
if needEscape {
return errors.New("invalid " + label + " " + strconv.Quote(string(what)) + " in string")
} else {
return errors.New("invalid " + label + " `" + string(what) + "` in string")
}
}
// TruncatePointer optionally truncates the JSON pointer,
// enforcing that the length roughly does not exceed n.
func TruncatePointer(s string, n int) string {
if len(s) <= n {
return s
}
i := n / 2
j := len(s) - n/2
// Avoid truncating a name if there are multiple names present.
if k := strings.LastIndexByte(s[:i], '/'); k > 0 {
i = k
}
if k := strings.IndexByte(s[j:], '/'); k >= 0 {
j += k + len("/")
}
// Avoid truncation in the middle of a UTF-8 rune.
for i > 0 && isInvalidUTF8(utf8.DecodeLastRuneInString(s[:i])) {
i--
}
for j < len(s) && isInvalidUTF8(utf8.DecodeRuneInString(s[j:])) {
j++
}
// Determine the right middle fragment to use.
var middle string
switch strings.Count(s[i:j], "/") {
case 0:
middle = "…"
case 1:
middle = "…/…"
default:
middle = "…/…/…"
}
if strings.HasPrefix(s[i:j], "/") && middle != "…" {
middle = strings.TrimPrefix(middle, "…")
}
if strings.HasSuffix(s[i:j], "/") && middle != "…" {
middle = strings.TrimSuffix(middle, "…")
}
return s[:i] + middle + s[j:]
}
func isInvalidUTF8(r rune, rn int) bool {
return r == utf8.RuneError && rn == 1
}