Skip to content

Commit

Permalink
bytes, strings: add Lines, SplitSeq, SplitAfterSeq, FieldsSeq, Fields…
Browse files Browse the repository at this point in the history
…FuncSeq

Fixes golang#61901.
  • Loading branch information
aimuz committed Jul 26, 2024
1 parent d8c7230 commit 532c44e
Show file tree
Hide file tree
Showing 8 changed files with 422 additions and 2 deletions.
10 changes: 10 additions & 0 deletions api/next/61901.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
pkg bytes, func FieldsFuncSeq([]uint8, func(int32) bool) iter.Seq[[]uint8] #61901
pkg bytes, func FieldsSeq([]uint8) iter.Seq[[]uint8] #61901
pkg bytes, func Lines([]uint8) iter.Seq[[]uint8] #61901
pkg bytes, func SplitAfterSeq([]uint8, []uint8) iter.Seq[[]uint8] #61901
pkg bytes, func SplitSeq([]uint8, []uint8) iter.Seq[[]uint8] #61901
pkg strings, func FieldsFuncSeq(string, func(int32) bool) iter.Seq[string] #61901
pkg strings, func FieldsSeq(string) iter.Seq[string] #61901
pkg strings, func Lines(string) iter.Seq[string] #61901
pkg strings, func SplitAfterSeq(string, string) iter.Seq[string] #61901
pkg strings, func SplitSeq(string, string) iter.Seq[string] #61901
30 changes: 30 additions & 0 deletions doc/next/6-stdlib/3-iter.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
### Iterators

The new [iter] package provides the basic definitions for working with
user-defined iterators.

The [bytes] package adds several functions that work with iterators:
- [Lines](/pkg/bytes#Lines) returns an iterator over the
newline-terminated lines in the byte slice s.
- [SplitSeq](/pkg/bytes#SplitSeq) returns an iterator over
all subslices of s separated by sep.
- [SplitAfterSeq](/pkg/bytes#SplitAfterSeq) returns an iterator
over subslices of s split after each instance of sep.
- [FieldsSeq](/pkg/bytes#FieldsSeq) returns an iterator over
subslices of s split around runs of whitespace characters,
as defined by unicode.IsSpace.
- [FieldsFuncSeq](/pkg/bytes#FieldsFuncSeq) returns an iterator
over subslices of s split around runs of Unicode code points satisfying f(c).

The [strings] package adds several functions that work with iterators:
- [Lines](/pkg/strings#Lines) returns an iterator over
the newline-terminated lines in the string s.
- [SplitSeq](/pkg/strings#SplitSeq) returns an iterator over
all substrings of s separated by sep.
- [SplitAfterSeq](/pkg/strings#SplitAfterSeq) returns an iterator
over substrings of s split after each instance of sep.
- [FieldsSeq](/pkg/strings#FieldsSeq) returns an iterator over
substrings of s split around runs of whitespace characters,
as defined by unicode.IsSpace.
- [FieldsFuncSeq](/pkg/strings#FieldsFuncSeq) returns an iterator
over substrings of s split around runs of Unicode code points satisfying f(c).
1 change: 1 addition & 0 deletions doc/next/6-stdlib/99-minor/bytes/61901.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<!-- see ../../3-iter.md -->
1 change: 1 addition & 0 deletions doc/next/6-stdlib/99-minor/strings/61901.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<!-- see ../../3-iter.md -->
140 changes: 140 additions & 0 deletions src/bytes/bytes.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ package bytes

import (
"internal/bytealg"
"iter"
"unicode"
"unicode/utf8"
_ "unsafe" // for linkname
Expand Down Expand Up @@ -319,6 +320,28 @@ func LastIndexAny(s []byte, chars string) int {
return -1
}

// Lines returns an iterator over the newline-terminated lines in the byte slice s.
// The lines yielded by the iterator include their terminating newlines.
// If s is empty, the iterator yields no lines at all.
// If s does not end in a newline, the final yielded line will not end in a newline.
// Each yielded line has its capacity clipped to its length, so appending to a
// line reallocates instead of overwriting the bytes that follow it in s.
// It returns a single-use iterator.
func Lines(s []byte) iter.Seq[[]byte] {
	return func(yield func([]byte) bool) {
		// s is the captured parameter and is consumed as lines are yielded,
		// which is why the iterator can only be ranged over once.
		for len(s) > 0 {
			var line []byte
			if i := IndexByte(s, '\n'); i >= 0 {
				line, s = s[:i+1], s[i+1:]
			} else {
				line, s = s, nil
			}
			// Full slice expression: cap == len, so the caller cannot
			// append into the not-yet-yielded remainder of the input.
			if !yield(line[:len(line):len(line)]) {
				return
			}
		}
	}
}

// Generic split: splits after each instance of sep,
// including sepSave bytes of sep in the subslices.
func genSplit(s, sep []byte, sepSave, n int) [][]byte {
Expand Down Expand Up @@ -389,6 +412,57 @@ func SplitAfter(s, sep []byte) [][]byte {
return genSplit(s, sep, len(sep), -1)
}

// explodeSeq returns an iterator over the runes in s.
func explodeSeq(s []byte) iter.Seq[[]byte] {
return func(yield func([]byte) bool) {
for len(s) > 0 {
_, size := utf8.DecodeRune(s)
if !yield(s[:size]) {
return
}
s = s[size:]
}
}
}

// splitSeq is SplitSeq or SplitAfterSeq, configured by how many
// bytes of sep to include in the results (none or all).
// An empty sep delegates to explodeSeq. Otherwise every piece found by this
// function, including the final remainder after the last separator, is
// yielded with its capacity clipped to its length, so appending to a piece
// reallocates instead of overwriting the bytes that follow it in s.
// The returned iterator consumes its captured copy of s and is single-use.
func splitSeq(s, sep []byte, sepSave int) iter.Seq[[]byte] {
	if len(sep) == 0 {
		return explodeSeq(s)
	}
	return func(yield func([]byte) bool) {
		for {
			i := Index(s, sep)
			if i < 0 {
				break
			}
			// end marks where this piece stops: before sep (sepSave == 0)
			// or after it (sepSave == len(sep)).
			end := i + sepSave
			if !yield(s[:end:end]) {
				return
			}
			s = s[i+len(sep):]
		}
		// Whatever follows the last separator is always yielded, even if
		// it is empty.
		yield(s[:len(s):len(s)])
	}
}

// SplitSeq returns an iterator over all subslices of s separated by sep.
// The iterator yields the same subslices that would be returned by
// Split(s, sep), but without constructing the slice that holds them.
// It returns a single-use iterator.
func SplitSeq(s, sep []byte) iter.Seq[[]byte] {
	// Keep no bytes of sep: each piece stops where the separator begins.
	const sepSave = 0
	return splitSeq(s, sep, sepSave)
}

// SplitAfterSeq returns an iterator over subslices of s split after each
// instance of sep.
// The iterator yields the same subslices that would be returned by
// SplitAfter(s, sep), but without constructing the slice that holds them.
// It returns a single-use iterator.
func SplitAfterSeq(s, sep []byte) iter.Seq[[]byte] {
	// Keep all of sep: each piece ends with the separator that followed it.
	sepSave := len(sep)
	return splitSeq(s, sep, sepSave)
}

// asciiSpace is a lookup table indexed by byte value: an entry of 1 marks
// an ASCII byte that counts as whitespace, and every other entry is 0.
var asciiSpace = [256]uint8{
	' ':  1,
	'\t': 1,
	'\n': 1,
	'\v': 1,
	'\f': 1,
	'\r': 1,
}

// Fields interprets s as a sequence of UTF-8-encoded code points.
Expand Down Expand Up @@ -445,6 +519,40 @@ func Fields(s []byte) [][]byte {
return a
}

// FieldsSeq returns an iterator over subslices of s split around runs of
// whitespace characters, as defined by unicode.IsSpace.
// The iterator yields the same subslices that would be returned by Fields(s),
// but without constructing the slice that holds them.
// Each yielded field has its capacity clipped to its length, so appending to
// a field reallocates instead of overwriting the bytes that follow it in s.
func FieldsSeq(s []byte) iter.Seq[[]byte] {
	return func(yield func([]byte) bool) {
		// start is the index where the current field began, or -1 while
		// scanning whitespace between fields.
		start := -1
		for i := 0; i < len(s); {
			// Fast path: classify ASCII bytes with the lookup table and
			// only decode when the byte starts a multi-byte sequence.
			size := 1
			r := rune(s[i])
			isSpace := asciiSpace[s[i]] != 0
			if r >= utf8.RuneSelf {
				r, size = utf8.DecodeRune(s[i:])
				isSpace = unicode.IsSpace(r)
			}
			if isSpace {
				if start >= 0 {
					// Full slice expression: cap == len.
					if !yield(s[start:i:i]) {
						return
					}
					start = -1
				}
			} else if start < 0 {
				start = i
			}
			i += size
		}
		// A field that runs to the end of s has no trailing space to
		// trigger the yield above.
		if start >= 0 {
			yield(s[start:len(s):len(s)])
		}
	}
}

// FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
// It splits the slice s at each run of code points c satisfying f(c) and
// returns a slice of subslices of s. If all code points in s satisfy f(c), or
Expand Down Expand Up @@ -499,6 +607,38 @@ func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
return a
}

// FieldsFuncSeq returns an iterator over substrings of s split around runs of
// Unicode code points satisfying f(c).
// The iterator yields the same strings that would be returned by FieldsFunc(s),
// but without constructing the slice.
func FieldsFuncSeq(s []byte, f func(rune) bool) iter.Seq[[]byte] {
return func(yield func([]byte) bool) {
s := s
start := -1
for i := 0; i < len(s); {
size := 1
r := rune(s[i])
if r >= utf8.RuneSelf {
r, size = utf8.DecodeRune(s[i:])
}
if f(r) {
if start >= 0 {
if !yield(s[start:i]) {
return
}
start = -1
}
} else if start < 0 {
start = i
}
i += size
}
if start >= 0 {
yield(s[start:])
}
}
}

// Join concatenates the elements of s to create a new byte slice. The separator
// sep is placed between elements in the resulting slice.
func Join(s [][]byte, sep []byte) []byte {
Expand Down
51 changes: 51 additions & 0 deletions src/bytes/bytes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
. "bytes"
"fmt"
"internal/testenv"
"iter"
"math"
"math/rand"
"reflect"
Expand Down Expand Up @@ -758,6 +759,22 @@ func BenchmarkCountSingle(b *testing.B) {
})
}

var LinesTest = []string{
"abc\nabc\n",
"abc\r\nabc",
"abc\r\n",
"abc\n",
}

func TestLines(t *testing.T) {
for _, s := range LinesTest {
result := Join(slices.Collect(Lines([]byte(s))), []byte(""))
if string(result) != s {
t.Errorf(`Join(collect(Lines(%q)), "") = %q`, s, result)
}
}
}

type SplitTest struct {
s string
sep string
Expand Down Expand Up @@ -801,6 +818,14 @@ func TestSplit(t *testing.T) {
t.Errorf(`Split(%q, %q, %d) = %v; want %v`, tt.s, tt.sep, tt.n, result, tt.a)
continue
}

if tt.n < 0 {
result2 := sliceOfString(slices.Collect(SplitSeq([]byte(tt.s), []byte(tt.sep))))
if !slices.Equal(result2, tt.a) {
t.Errorf(`collect(SplitSeq(%q, %q)) = %v; want %v`, tt.s, tt.sep, result2, tt.a)
}
}

if tt.n == 0 || len(a) == 0 {
continue
}
Expand Down Expand Up @@ -860,6 +885,13 @@ func TestSplitAfter(t *testing.T) {
continue
}

if tt.n < 0 {
result2 := sliceOfString(slices.Collect(SplitAfterSeq([]byte(tt.s), []byte(tt.sep))))
if !slices.Equal(result2, tt.a) {
t.Errorf(`collect(SplitAfterSeq(%q, %q)) = %v; want %v`, tt.s, tt.sep, result2, tt.a)
}
}

if want := tt.a[len(tt.a)-1] + "z"; string(x) != want {
t.Errorf("last appended result was %s; want %s", x, want)
}
Expand Down Expand Up @@ -913,6 +945,11 @@ func TestFields(t *testing.T) {
continue
}

result2 := sliceOfString(collect(t, FieldsSeq([]byte(tt.s))))
if !slices.Equal(result2, tt.a) {
t.Errorf(`collect(FieldsSeq(%q)) = %v; want %v`, tt.s, result2, tt.a)
}

if string(b) != tt.s {
t.Errorf("slice changed to %s; want %s", string(b), tt.s)
}
Expand Down Expand Up @@ -955,6 +992,11 @@ func TestFieldsFunc(t *testing.T) {
t.Errorf("FieldsFunc(%q) = %v, want %v", tt.s, a, tt.a)
}

result2 := sliceOfString(collect(t, FieldsFuncSeq([]byte(tt.s), pred)))
if !slices.Equal(result2, tt.a) {
t.Errorf(`collect(FieldsFuncSeq(%q)) = %v; want %v`, tt.s, result2, tt.a)
}

if string(b) != tt.s {
t.Errorf("slice changed to %s; want %s", b, tt.s)
}
Expand Down Expand Up @@ -2255,3 +2297,12 @@ func TestClone(t *testing.T) {
}
}
}

func collect(t *testing.T, seq iter.Seq[[]byte]) [][]byte {
out := slices.Collect(seq)
out1 := slices.Collect(seq)
if !reflect.DeepEqual(out, out1) {
t.Fatalf("inconsistent seq:\n%s\n%s", out, out1)
}
return out
}
Loading

0 comments on commit 532c44e

Please sign in to comment.