// Copyright 2014 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Package runes provide transforms for UTF-8 encoded text. package runes import ( "unicode" "unicode/utf8" "golang.org/x/text/transform" ) // A Set is a collection of runes. type Set interface { // Contains returns true if r is contained in the set. Contains(r rune) bool } type setFunc func(rune) bool func (s setFunc) Contains(r rune) bool { return s(r) } // Note: using funcs here instead of wrapping types result in cleaner // documentation and a smaller API. // In creates a Set with a Contains method that returns true for all runes in // the given RangeTable. func In(rt *unicode.RangeTable) Set { return setFunc(func(r rune) bool { return unicode.Is(rt, r) }) } // In creates a Set with a Contains method that returns true for all runes not // in the given RangeTable. func NotIn(rt *unicode.RangeTable) Set { return setFunc(func(r rune) bool { return !unicode.Is(rt, r) }) } // Predicate creates a Set with a Contains method that returns f(r). func Predicate(f func(rune) bool) Set { return setFunc(f) } // Transformer implements the transform.Transformer interface. type Transformer struct { t transform.SpanningTransformer } func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { return t.t.Transform(dst, src, atEOF) } func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) { return t.t.Span(b, atEOF) } func (t Transformer) Reset() { t.t.Reset() } // Bytes returns a new byte slice with the result of converting b using t. It // calls Reset on t. It returns nil if any error was found. This can only happen // if an error-producing Transformer is passed to If. func (t Transformer) Bytes(b []byte) []byte { b, _, err := transform.Bytes(t, b) if err != nil { return nil } return b } // String returns a string with the result of converting s using t. It calls // Reset on t. It returns the empty string if any error was found. This can only // happen if an error-producing Transformer is passed to If. func (t Transformer) String(s string) string { s, _, err := transform.String(t, s) if err != nil { return "" } return s } // TODO: // - Copy: copying strings and bytes in whole-rune units. // - Validation (maybe) // - Well-formed-ness (maybe) const runeErrorString = string(utf8.RuneError) // Remove returns a Transformer that removes runes r for which s.Contains(r). // Illegal input bytes are replaced by RuneError before being passed to f. func Remove(s Set) Transformer { if f, ok := s.(setFunc); ok { // This little trick cuts the running time of BenchmarkRemove for sets // created by Predicate roughly in half. // TODO: special-case RangeTables as well. return Transformer{remove(f)} } return Transformer{remove(s.Contains)} } // TODO: remove transform.RemoveFunc. type remove func(r rune) bool func (remove) Reset() {} // Span implements transform.Spanner. func (t remove) Span(src []byte, atEOF bool) (n int, err error) { for r, size := rune(0), 0; n < len(src); { if r = rune(src[n]); r < utf8.RuneSelf { size = 1 } else if r, size = utf8.DecodeRune(src[n:]); size == 1 { // Invalid rune. if !atEOF && !utf8.FullRune(src[n:]) { err = transform.ErrShortSrc } else { err = transform.ErrEndOfSpan } break } if t(r) { err = transform.ErrEndOfSpan break } n += size } return } // Transform implements transform.Transformer. func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { for r, size := rune(0), 0; nSrc < len(src); { if r = rune(src[nSrc]); r < utf8.RuneSelf { size = 1 } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { // Invalid rune. if !atEOF && !utf8.FullRune(src[nSrc:]) { err = transform.ErrShortSrc break } // We replace illegal bytes with RuneError. Not doing so might // otherwise turn a sequence of invalid UTF-8 into valid UTF-8. // The resulting byte sequence may subsequently contain runes // for which t(r) is true that were passed unnoticed. if !t(utf8.RuneError) { if nDst+3 > len(dst) { err = transform.ErrShortDst break } dst[nDst+0] = runeErrorString[0] dst[nDst+1] = runeErrorString[1] dst[nDst+2] = runeErrorString[2] nDst += 3 } nSrc++ continue } if t(r) { nSrc += size continue } if nDst+size > len(dst) { err = transform.ErrShortDst break } for i := 0; i < size; i++ { dst[nDst] = src[nSrc] nDst++ nSrc++ } } return } // Map returns a Transformer that maps the runes in the input using the given // mapping. Illegal bytes in the input are converted to utf8.RuneError before // being passed to the mapping func. func Map(mapping func(rune) rune) Transformer { return Transformer{mapper(mapping)} } type mapper func(rune) rune func (mapper) Reset() {} // Span implements transform.Spanner. func (t mapper) Span(src []byte, atEOF bool) (n int, err error) { for r, size := rune(0), 0; n < len(src); n += size { if r = rune(src[n]); r < utf8.RuneSelf { size = 1 } else if r, size = utf8.DecodeRune(src[n:]); size == 1 { // Invalid rune. if !atEOF && !utf8.FullRune(src[n:]) { err = transform.ErrShortSrc } else { err = transform.ErrEndOfSpan } break } if t(r) != r { err = transform.ErrEndOfSpan break } } return n, err } // Transform implements transform.Transformer. func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { var replacement rune var b [utf8.UTFMax]byte for r, size := rune(0), 0; nSrc < len(src); { if r = rune(src[nSrc]); r < utf8.RuneSelf { if replacement = t(r); replacement < utf8.RuneSelf { if nDst == len(dst) { err = transform.ErrShortDst break } dst[nDst] = byte(replacement) nDst++ nSrc++ continue } size = 1 } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { // Invalid rune. if !atEOF && !utf8.FullRune(src[nSrc:]) { err = transform.ErrShortSrc break } if replacement = t(utf8.RuneError); replacement == utf8.RuneError { if nDst+3 > len(dst) { err = transform.ErrShortDst break } dst[nDst+0] = runeErrorString[0] dst[nDst+1] = runeErrorString[1] dst[nDst+2] = runeErrorString[2] nDst += 3 nSrc++ continue } } else if replacement = t(r); replacement == r { if nDst+size > len(dst) { err = transform.ErrShortDst break } for i := 0; i < size; i++ { dst[nDst] = src[nSrc] nDst++ nSrc++ } continue } n := utf8.EncodeRune(b[:], replacement) if nDst+n > len(dst) { err = transform.ErrShortDst break } for i := 0; i < n; i++ { dst[nDst] = b[i] nDst++ } nSrc += size } return } // ReplaceIllFormed returns a transformer that replaces all input bytes that are // not part of a well-formed UTF-8 code sequence with utf8.RuneError. func ReplaceIllFormed() Transformer { return Transformer{&replaceIllFormed{}} } type replaceIllFormed struct{ transform.NopResetter } func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) { for n < len(src) { // ASCII fast path. if src[n] < utf8.RuneSelf { n++ continue } r, size := utf8.DecodeRune(src[n:]) // Look for a valid non-ASCII rune. if r != utf8.RuneError || size != 1 { n += size continue } // Look for short source data. if !atEOF && !utf8.FullRune(src[n:]) { err = transform.ErrShortSrc break } // We have an invalid rune. err = transform.ErrEndOfSpan break } return n, err } func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { for nSrc < len(src) { // ASCII fast path. if r := src[nSrc]; r < utf8.RuneSelf { if nDst == len(dst) { err = transform.ErrShortDst break } dst[nDst] = r nDst++ nSrc++ continue } // Look for a valid non-ASCII rune. if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 { if size != copy(dst[nDst:], src[nSrc:nSrc+size]) { err = transform.ErrShortDst break } nDst += size nSrc += size continue } // Look for short source data. if !atEOF && !utf8.FullRune(src[nSrc:]) { err = transform.ErrShortSrc break } // We have an invalid rune. if nDst+3 > len(dst) { err = transform.ErrShortDst break } dst[nDst+0] = runeErrorString[0] dst[nDst+1] = runeErrorString[1] dst[nDst+2] = runeErrorString[2] nDst += 3 nSrc++ } return nDst, nSrc, err }