-
Notifications
You must be signed in to change notification settings - Fork 797
ringbuf: add zero-copy consumer APIs #1915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -5,9 +5,12 @@ import ( | |||||
| "fmt" | ||||||
| "os" | ||||||
| "sync" | ||||||
| "sync/atomic" | ||||||
| "time" | ||||||
| "unsafe" | ||||||
|
|
||||||
| "iter" | ||||||
|
|
||||||
| "github.com/cilium/ebpf" | ||||||
| "github.com/cilium/ebpf/internal/platform" | ||||||
| "github.com/cilium/ebpf/internal/sys" | ||||||
|
|
@@ -40,6 +43,11 @@ func (rh *ringbufHeader) dataLen() int { | |||||
| } | ||||||
|
|
||||||
| type Record struct { | ||||||
| // RawSample contains the raw bytes of a ringbuf record. | ||||||
| // | ||||||
| // When obtained via [Reader.Records], RawSample is a zero-copy view into the | ||||||
| // underlying mmap. It is only valid until the iterator yields the next | ||||||
| // record or terminates. Callers must copy the data if they need to retain it. | ||||||
| RawSample []byte | ||||||
|
|
||||||
| // The minimum number of bytes remaining in the ring buffer after this Record has been read. | ||||||
|
|
@@ -144,6 +152,59 @@ func (r *Reader) Read() (Record, error) { | |||||
|
|
||||||
| // ReadInto is like Read except that it allows reusing Record and associated buffers. | ||||||
| func (r *Reader) ReadInto(rec *Record) error { | ||||||
| return r.readLocked(func() error { | ||||||
| return r.ring.readRecord(rec) | ||||||
| }) | ||||||
| } | ||||||
|
|
||||||
| // Records iterates over records in the reader until [Reader.Close] is called. | ||||||
| // | ||||||
| // Record.RawSample is only valid until the next call to the iterator. Callers | ||||||
| // must copy the data if it needs to outlive the current iteration. | ||||||
| // | ||||||
| // This convenience wrapper allocates a single Record once. To fully avoid | ||||||
| // allocations, use [Reader.RecordsInto] and pass in a reusable Record. | ||||||
| func (r *Reader) Records() iter.Seq2[*Record, error] { | ||||||
| rec := Record{} | ||||||
| return r.RecordsInto(&rec) | ||||||
| } | ||||||
|
|
||||||
| // RecordsInto is like Records but allows reusing the Record and associated buffers. | ||||||
| func (r *Reader) RecordsInto(rec *Record) iter.Seq2[*Record, error] { | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If |
||||||
| return func(yield func(*Record, error) bool) { | ||||||
| var ( | ||||||
| sample []byte | ||||||
| remaining int | ||||||
| nextCons uintptr | ||||||
| ) | ||||||
|
|
||||||
| for { | ||||||
| err := r.readLocked(func() error { | ||||||
| var err error | ||||||
| sample, remaining, nextCons, err = r.ring.readSample() | ||||||
| return err | ||||||
| }) | ||||||
| if err != nil { | ||||||
| yield(nil, err) | ||||||
| return | ||||||
| } | ||||||
|
|
||||||
| // Limit cap to len so append can't write past the record and corrupt the ring. | ||||||
| rec.RawSample = sample[:len(sample):len(sample)] | ||||||
| rec.Remaining = remaining | ||||||
|
|
||||||
| if !yield(rec, nil) { | ||||||
| atomic.StoreUintptr(r.ring.cons_pos, nextCons) | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this is safe to do. Consider two concurrent callers of Records():
I think you need to hold the lock across the yield. Maybe its as simple as moving it all into readLocked?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am aware of this concurrent read issue, my first version was holding the lock during the iterator. However, that leads to the problem of dead lock when calling reader.SetDeadline: for rec, err := range reader.Records() {
reader.SetDeadline() // dead lock
}reader.Close() also tries to acquire the reader.mu, it will become impossible to call Close() during iteration 😬
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I drafted a PR on my fork for your preview: jschwinger233#5 |
||||||
| return | ||||||
| } | ||||||
|
|
||||||
| atomic.StoreUintptr(r.ring.cons_pos, nextCons) | ||||||
| } | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // readLocked drives the polling / data-availability loop shared by Record reads. | ||||||
| func (r *Reader) readLocked(read func() error) error { | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. having read be a closure over Reader state is a bit black magic. How about
Suggested change
readLocked would be responsible for calling readSample and advancing the consumer position. Probably also easier for the compiler to turn into good code since handle doesn't have to be a closure. |
||||||
| r.mu.Lock() | ||||||
| defer r.mu.Unlock() | ||||||
|
|
||||||
|
|
@@ -171,9 +232,7 @@ func (r *Reader) ReadInto(rec *Record) error { | |||||
| } | ||||||
|
|
||||||
| for { | ||||||
| err := r.ring.readRecord(rec) | ||||||
| // Not using errors.Is which is quite a bit slower | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: please keep the comment. |
||||||
| // For a tight loop it might make a difference | ||||||
| err := read() | ||||||
| if err == errBusy { | ||||||
| continue | ||||||
| } | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -384,3 +384,34 @@ func BenchmarkReadInto(b *testing.B) { | |
| } | ||
| } | ||
| } | ||
|
|
||
| func TestRecordsIterator(t *testing.T) { | ||
| testutils.SkipOnOldKernel(t, "5.8", "BPF ring buffer") | ||
|
|
||
| prog, events := mustOutputSamplesProg(t, | ||
| sampleMessage{size: 5, flags: 0}, | ||
| sampleMessage{size: 7, flags: 0}, | ||
| ) | ||
| mustRun(t, prog) | ||
|
|
||
| rd, err := NewReader(events) | ||
| if err != nil { | ||
| t.Fatal(err) | ||
| } | ||
| defer rd.Close() | ||
|
|
||
| var seen [][]byte | ||
| for rec, err := range rd.Records() { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This needs a test that Reader.Close() stops the iterator without yielding io.ErrClosed. Using iter.Pull might be the easiest way to achieve that. |
||
| if err != nil { | ||
| t.Fatalf("iteration error: %v", err) | ||
| } | ||
| seen = append(seen, append([]byte(nil), rec.RawSample...)) | ||
| if len(seen) == 2 { | ||
| break | ||
| } | ||
| } | ||
|
|
||
| qt.Assert(t, qt.Equals(len(seen), 2)) | ||
| qt.Assert(t, qt.DeepEquals(seen[0], []byte{1, 2, 3, 4, 4})) | ||
| qt.Assert(t, qt.DeepEquals(seen[1], []byte{1, 2, 3, 4, 4, 3, 2})) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,16 +41,17 @@ func (rr *ringReader) AvailableBytes() uint64 { | |
| return uint64(prod - cons) | ||
| } | ||
|
|
||
| // Read a record from an event ring. | ||
| func (rr *ringReader) readRecord(rec *Record) error { | ||
| // readSample returns a zero-copy view into the next sample, together with the | ||
| // consumer position that should be stored to release the data. | ||
| func (rr *ringReader) readSample() (sample []byte, remaining int, nextCons uintptr, err error) { | ||
| prod := atomic.LoadUintptr(rr.prod_pos) | ||
| cons := atomic.LoadUintptr(rr.cons_pos) | ||
|
|
||
| for { | ||
| if remaining := prod - cons; remaining == 0 { | ||
| return errEOR | ||
| return nil, 0, 0, errEOR | ||
| } else if remaining < sys.BPF_RINGBUF_HDR_SZ { | ||
| return fmt.Errorf("read record header: %w", io.ErrUnexpectedEOF) | ||
| return nil, 0, 0, fmt.Errorf("read record header: %w", io.ErrUnexpectedEOF) | ||
| } | ||
|
|
||
| // read the len field of the header atomically to ensure a happens before | ||
|
|
@@ -65,15 +66,15 @@ func (rr *ringReader) readRecord(rec *Record) error { | |
| // the next sample in the ring is not committed yet so we | ||
| // exit without storing the reader/consumer position | ||
| // and start again from the same position. | ||
| return errBusy | ||
| return nil, 0, 0, errBusy | ||
| } | ||
|
|
||
| cons += sys.BPF_RINGBUF_HDR_SZ | ||
|
|
||
| // Data is always padded to 8 byte alignment. | ||
| dataLenAligned := uintptr(internal.Align(header.dataLen(), 8)) | ||
| if remaining := prod - cons; remaining < dataLenAligned { | ||
| return fmt.Errorf("read sample data: %w", io.ErrUnexpectedEOF) | ||
| return nil, 0, 0, fmt.Errorf("read sample data: %w", io.ErrUnexpectedEOF) | ||
| } | ||
|
|
||
| start = cons & rr.mask | ||
|
|
@@ -87,15 +88,26 @@ func (rr *ringReader) readRecord(rec *Record) error { | |
| continue | ||
| } | ||
|
|
||
| if n := header.dataLen(); cap(rec.RawSample) < n { | ||
| rec.RawSample = make([]byte, n) | ||
| } else { | ||
| rec.RawSample = rec.RawSample[:n] | ||
| } | ||
| end := int(start) + header.dataLen() | ||
| return rr.ring[start:end], int(prod - cons), cons, nil | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The reslicing to reduce cap could already happen here. |
||
| } | ||
| } | ||
|
|
||
| copy(rec.RawSample, rr.ring[start:]) | ||
| rec.Remaining = int(prod - cons) | ||
| atomic.StoreUintptr(rr.cons_pos, cons) | ||
| return nil | ||
| // Read a record from an event ring, copying the sample into the provided Record. | ||
| func (rr *ringReader) readRecord(rec *Record) error { | ||
| sample, remaining, nextCons, err := rr.readSample() | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| if n := len(sample); cap(rec.RawSample) < n { | ||
| rec.RawSample = make([]byte, n) | ||
| } else { | ||
| rec.RawSample = rec.RawSample[:n] | ||
| } | ||
|
|
||
| copy(rec.RawSample, sample) | ||
| rec.Remaining = remaining | ||
| atomic.StoreUintptr(rr.cons_pos, nextCons) | ||
| return nil | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Even more explicit: both Record and Record.RawSample are only valid until the next call.