1414 * limitations under the License.
1515 */
1616
17+ // Adapted from Apache Iceberg C++
18+ // https://github.com/apache/iceberg-cpp/blob/main/src/iceberg/avro/avro_stream_internal.cc
19+
1720#include " paimon/format/avro/avro_input_stream_impl.h"
1821
1922#include < algorithm>
23+ #include < memory>
2024#include < string>
2125#include < utility>
2226
@@ -39,82 +43,85 @@ AvroInputStreamImpl::AvroInputStreamImpl(const std::shared_ptr<paimon::InputStre
3943 size_t buffer_size, const uint64_t total_length,
4044 const std::shared_ptr<MemoryPool>& pool)
4145 : pool_(pool),
46+ in_ (input_stream),
4247 buffer_size_(buffer_size),
4348 total_length_(total_length),
44- buffer_(reinterpret_cast <uint8_t *>(pool_->Malloc (buffer_size))),
45- in_(input_stream),
46- byte_count_(0 ),
47- next_(buffer_),
48- available_(0 ) {}
49+ buffer_(reinterpret_cast <uint8_t *>(pool_->Malloc (buffer_size))) {}
4950
5051AvroInputStreamImpl::~AvroInputStreamImpl () {
5152 pool_->Free (buffer_, buffer_size_);
5253}
5354
54- bool AvroInputStreamImpl::next (const uint8_t ** data, size_t * size) {
55- if (available_ == 0 && !fill ()) {
56- return false ;
55+ bool AvroInputStreamImpl::next (const uint8_t ** data, size_t * len) {
56+ // Return all unconsumed data in the buffer
57+ if (buffer_pos_ < available_bytes_) {
58+ *data = buffer_ + buffer_pos_;
59+ *len = available_bytes_ - buffer_pos_;
60+ byte_count_ += available_bytes_ - buffer_pos_;
61+ buffer_pos_ = available_bytes_;
62+ return true ;
63+ }
64+
65+ // Read from the input stream when the buffer is empty
66+ uint64_t remaining = total_length_ - stream_pos_;
67+ if (remaining == 0 ) {
68+ return false ; // eof
69+ }
70+ auto read_length =
71+ in_->Read (reinterpret_cast <char *>(buffer_), std::min (buffer_size_, remaining));
72+ if (!read_length.ok ()) {
73+ throw ::avro::Exception (" Read failed: {}" , read_length.status ().ToString ());
5774 }
58- *data = next_;
59- *size = available_;
60- next_ += available_;
61- byte_count_ += available_;
62- available_ = 0 ;
75+ available_bytes_ = read_length.value ();
76+ stream_pos_ += available_bytes_;
77+ buffer_pos_ = 0 ;
78+
79+ // Return the whole buffer
80+ *data = buffer_;
81+ *len = available_bytes_;
82+ byte_count_ += available_bytes_;
83+ buffer_pos_ = available_bytes_;
84+
6385 return true ;
6486}
6587
6688void AvroInputStreamImpl::backup (size_t len) {
67- next_ -= len;
68- available_ += len;
89+ if (len > buffer_pos_) {
90+ throw ::avro::Exception (" Cannot backup {} bytes, only {} bytes available" , len,
91+ buffer_pos_);
92+ }
93+
94+ buffer_pos_ -= len;
6995 byte_count_ -= len;
7096}
7197
7298void AvroInputStreamImpl::skip (size_t len) {
73- while (len > 0 ) {
74- if (available_ == 0 ) {
75- auto s = in_->Seek (len, paimon::FS_SEEK_CUR);
76- if (!s.ok ()) {
77- throw ::avro::Exception (s.ToString ());
78- }
79- byte_count_ += len;
80- total_read_len_ += len;
81- return ;
82- }
83- size_t n = std::min (available_, len);
84- available_ -= n;
85- next_ += n;
86- len -= n;
87- byte_count_ += n;
99+ // The range to skip is within the buffer
100+ if (buffer_pos_ + len <= available_bytes_) {
101+ buffer_pos_ += len;
102+ byte_count_ += len;
103+ return ;
88104 }
105+ seek (byte_count_ + len);
89106}
90107
91- void AvroInputStreamImpl::seek (int64_t position) {
92- auto s = in_->Seek (position - byte_count_ - available_, paimon::FS_SEEK_CUR);
93- if (!s.ok ()) {
94- throw ::avro::Exception (s.ToString ());
95- }
96- byte_count_ = position;
97- total_read_len_ = position;
98- available_ = 0 ;
108+ size_t AvroInputStreamImpl::byteCount () const {
109+ return byte_count_;
99110}
100111
101- bool AvroInputStreamImpl::fill () {
102- if (static_cast <uint64_t >(total_read_len_) >= total_length_) {
103- // eof
104- return false ;
105- }
106- Result<int32_t > actual_len = in_->Read (reinterpret_cast <char *>(buffer_),
107- std::min (buffer_size_, total_length_ - total_read_len_));
108- if (!actual_len.ok ()) {
109- throw ::avro::Exception (actual_len.status ().ToString ());
112+ void AvroInputStreamImpl::seek (int64_t position) {
113+ if (static_cast <uint64_t >(position) > total_length_) {
114+ throw ::avro::Exception (" Cannot seek to {}, total length is {}" , position, total_length_);
110115 }
111- total_read_len_ += actual_len.value ();
112- if (actual_len.value () != 0 ) {
113- next_ = buffer_;
114- available_ = actual_len.value ();
115- return true ;
116+ auto status = in_->Seek (position, SeekOrigin::FS_SEEK_SET);
117+ if (!status.ok ()) {
118+ throw ::avro::Exception (" Failed to seek to {}, got {}" , position, status.ToString ());
116119 }
117- return false ;
120+
121+ stream_pos_ = position;
122+ buffer_pos_ = 0 ;
123+ available_bytes_ = 0 ;
124+ byte_count_ = position;
118125}
119126
120127} // namespace paimon::avro
0 commit comments