Skip to content

Support encoding into a bytes tensor #635

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 30 commits into from
Apr 29, 2025
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
7921558
Disable FFmpeg logs for encoder
NicolasHug Apr 7, 2025
2a19014
Merge branch 'main' of github.com:pytorch/torchcodec into loglevelenc…
NicolasHug Apr 8, 2025
73bdc85
Use c++ strings
NicolasHug Apr 8, 2025
54f5543
Merge branch 'main' of github.com:pytorch/torchcodec into loglevelenc…
NicolasHug Apr 8, 2025
24842b6
Account for frame_size being 0
NicolasHug Apr 8, 2025
c3ac80a
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_wav
NicolasHug Apr 9, 2025
5b39c8f
WIP
NicolasHug Apr 9, 2025
1f9f904
Move createSwrContext in ffmpeg file
NicolasHug Apr 9, 2025
f525848
WIP
NicolasHug Apr 9, 2025
9150137
Move convertAudioAVFrameSampleFormatAndSampleRate in ffmpeg file
NicolasHug Apr 9, 2025
872b569
Automatically find output sample format
NicolasHug Apr 9, 2025
a0dcafd
Convert sample format, update tests
NicolasHug Apr 9, 2025
f49d507
Skip wav on FFmpeg4
NicolasHug Apr 9, 2025
ee3a199
Add assertion
NicolasHug Apr 9, 2025
485ee2e
Move comment
NicolasHug Apr 9, 2025
27fdbac
Better default heuristic
NicolasHug Apr 9, 2025
8467b92
Merge branch 'main' of github.com:pytorch/torchcodec into encoding_wav
NicolasHug Apr 10, 2025
ee7a217
Support encoding into tensor
NicolasHug Apr 10, 2025
7b3847f
Merge branch 'main' of github.com:pytorch/torchcodec into avio
NicolasHug Apr 14, 2025
d85baa2
nits
NicolasHug Apr 14, 2025
42c6373
Allow output tensor re-allocation
NicolasHug Apr 14, 2025
254529f
Fix compilation on FFmpeg7?
NicolasHug Apr 14, 2025
3f0417c
Fix?
NicolasHug Apr 14, 2025
290c96e
Use int64_t consistently
NicolasHug Apr 14, 2025
5f42d15
cmake
NicolasHug Apr 14, 2025
0f415c1
Merge branch 'main' of github.com:pytorch/torchcodec into avio
NicolasHug Apr 22, 2025
2954c9b
Move type aliases, fix avioAllocContext name
NicolasHug Apr 22, 2025
29866fc
Merge branch 'main' of github.com:pytorch/torchcodec into avio
NicolasHug Apr 24, 2025
9cb31c9
Merge branch 'main' of github.com:pytorch/torchcodec into avio
NicolasHug Apr 28, 2025
ff6c1e0
Create 2 separate constructors
NicolasHug Apr 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 68 additions & 1 deletion src/torchcodec/_core/AVIOBytesContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ AVIOBytesContext::AVIOBytesContext(const void* data, int64_t dataSize)
: dataContext_{static_cast<const uint8_t*>(data), dataSize, 0} {
TORCH_CHECK(data != nullptr, "Video data buffer cannot be nullptr!");
TORCH_CHECK(dataSize > 0, "Video data size must be positive");
createAVIOContext(&read, &seek, &dataContext_);
createAVIOContext(&read, nullptr, &seek, &dataContext_);
}

// The signature of this function is defined by FFMPEG.
Expand Down Expand Up @@ -67,4 +67,71 @@ int64_t AVIOBytesContext::seek(void* opaque, int64_t offset, int whence) {
return ret;
}

// Builds an encoding sink backed by a uint8 tensor of INITIAL_TENSOR_SIZE
// elements, with the write position starting at 0. Only the write and seek
// callbacks are registered: no read callback is passed.
AVIOToTensorContext::AVIOToTensorContext()
    : dataContext_{
          /*outputTensor=*/torch::empty(
              {INITIAL_TENSOR_SIZE},
              {torch::kUInt8}),
          /*current=*/0} {
  createAVIOContext(
      /*read=*/nullptr,
      /*write=*/&write,
      /*seek=*/&seek,
      /*heldData=*/&dataContext_);
}

// The signature of this function is defined by FFMPEG.
//
// Write callback: copies `buf_size` bytes from `buf` into the output tensor at
// the current write position, then advances that position by `buf_size`.
// `opaque` must point to this context's DataContext. Returns the number of
// bytes written, which is always `buf_size`.
//
// When the bytes don't fit, the output tensor is doubled in size (repeatedly if
// needed) until they do. Growing beyond MAX_TENSOR_SIZE raises through
// TORCH_CHECK.
int AVIOToTensorContext::write(void* opaque, const uint8_t* buf, int buf_size) {
  auto dataContext = static_cast<DataContext*>(opaque);

  TORCH_CHECK(buf_size >= 0, "write() got a negative buf_size: ", buf_size);
  // seek() stores whatever offset it is given, so make sure we never memcpy
  // in front of the tensor's storage.
  TORCH_CHECK(
      dataContext->current >= 0,
      "Invalid negative write position: ",
      dataContext->current);
  if (buf_size == 0) {
    // Nothing to copy; also avoids calling memcpy with a possibly-null buf.
    return 0;
  }

  int64_t bufSize = static_cast<int64_t>(buf_size);
  // Compare against the remaining room (numel - current) rather than computing
  // current + bufSize, so the comparison cannot overflow.
  while (bufSize > dataContext->outputTensor.numel() - dataContext->current) {
    int64_t capacity = dataContext->outputTensor.numel();
    // Doubling an empty tensor would never make progress.
    TORCH_CHECK(
        capacity > 0,
        "Cannot grow an empty output tensor. ",
        "This should not happen, please report on TorchCodec bug tracker");
    TORCH_CHECK(
        capacity * 2 <= AVIOToTensorContext::MAX_TENSOR_SIZE,
        "We tried to allocate an output encoded tensor larger than ",
        AVIOToTensorContext::MAX_TENSOR_SIZE,
        " bytes. If you think this should be supported, please report.");

    // We double the size of the output tensor. Calling cat() may not be the
    // most efficient, but it's simple. The first half of the new tensor holds
    // the bytes written so far.
    dataContext->outputTensor =
        torch::cat({dataContext->outputTensor, dataContext->outputTensor});
  }

  // The loop above only exits once the bytes fit, so the copy is in bounds.
  uint8_t* outputTensorData = dataContext->outputTensor.data_ptr<uint8_t>();
  std::memcpy(outputTensorData + dataContext->current, buf, bufSize);
  dataContext->current += bufSize;
  return buf_size;
}

// The signature of this function is defined by FFMPEG.
// Note: This `seek()` implementation is very similar to that of
// AVIOBytesContext. We could consider merging both classes, or do some kind of
// refac, but this doesn't seem worth it ATM.
//
// Seek callback. `opaque` must point to this context's DataContext.
//  - AVSEEK_SIZE: returns the output tensor's element count; the write
//    position is left untouched.
//  - SEEK_SET: moves the write position to `offset` and returns it. Negative
//    offsets are rejected (-1 is returned and the position is unchanged).
//  - Any other `whence`: unsupported, returns -1.
int64_t AVIOToTensorContext::seek(void* opaque, int64_t offset, int whence) {
  auto dataContext = static_cast<DataContext*>(opaque);
  int64_t ret = -1;

  switch (whence) {
    case AVSEEK_SIZE:
      // NOTE(review): numel() is the allocated size of the tensor, which may
      // be larger than the number of bytes written so far. Confirm this is
      // what callers of AVSEEK_SIZE expect.
      ret = dataContext->outputTensor.numel();
      break;
    case SEEK_SET:
      // write() uses `current` as an offset into the tensor's storage, so a
      // negative position must never be recorded.
      if (offset >= 0) {
        dataContext->current = offset;
        ret = offset;
      }
      break;
    default:
      break;
  }

  return ret;
}

// Returns the output tensor narrowed along dim 0 to its first `current`
// elements, where `current` is the position maintained by write() and seek().
// NOTE(review): if the last operation was a seek() to an earlier position,
// `current` is smaller than the number of bytes written -- confirm muxers
// always end up positioned at the end of the data.
torch::Tensor AVIOToTensorContext::getOutputTensor() {
  const int64_t numBytes = dataContext_.current;
  auto& fullTensor = dataContext_.outputTensor;
  return fullTensor.narrow(/*dim=*/0, /*start=*/0, /*length=*/numBytes);
}

} // namespace facebook::torchcodec
26 changes: 24 additions & 2 deletions src/torchcodec/_core/AVIOBytesContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@

#pragma once

#include <torch/types.h>
#include "src/torchcodec/_core/AVIOContextHolder.h"

namespace facebook::torchcodec {

// Enables users to pass in the entire video as bytes. Our read and seek
// functions then traverse the bytes in memory.
// For Decoding: enables users to pass in the entire video or audio as bytes.
// Our read and seek functions then traverse the bytes in memory.
class AVIOBytesContext : public AVIOContextHolder {
public:
explicit AVIOBytesContext(const void* data, int64_t dataSize);
Expand All @@ -29,4 +30,25 @@ class AVIOBytesContext : public AVIOContextHolder {
DataContext dataContext_;
};

// For Encoding: an AVIOContextHolder whose write and seek callbacks target an
// in-memory uint8 (bytes) tensor instead of a file.
class AVIOToTensorContext : public AVIOContextHolder {
 public:
  explicit AVIOToTensorContext();

  // Returns the encoded bytes as a tensor.
  torch::Tensor getOutputTensor();

 private:
  // Size the output tensor is created with, and the cap enforced when it has
  // to grow.
  static constexpr int64_t INITIAL_TENSOR_SIZE = 10'000'000; // 10 MB
  static constexpr int64_t MAX_TENSOR_SIZE = 320'000'000; // 320 MB

  // State handed to the FFmpeg callbacks through their `opaque` pointer.
  struct DataContext {
    // Destination buffer for the encoded bytes.
    torch::Tensor outputTensor;
    // Position in outputTensor where the next write() lands.
    int64_t current;
  };

  // We need to expose seek() for some formats like mp3.
  static int64_t seek(void* opaque, int64_t offset, int whence);
  static int write(void* opaque, const uint8_t* buf, int buf_size);

  DataContext dataContext_;
};

} // namespace facebook::torchcodec
11 changes: 8 additions & 3 deletions src/torchcodec/_core/AVIOContextHolder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ namespace facebook::torchcodec {

void AVIOContextHolder::createAVIOContext(
AVIOReadFunction read,
AVIOWriteFunction write,
AVIOSeekFunction seek,
void* heldData,
int bufferSize) {
Expand All @@ -22,13 +23,17 @@ void AVIOContextHolder::createAVIOContext(
buffer != nullptr,
"Failed to allocate buffer of size " + std::to_string(bufferSize));

avioContext_.reset(avio_alloc_context(
TORCH_CHECK(
(seek != nullptr) && ((write != nullptr) ^ (read != nullptr)),
"seek method must be defined, and either write or read must be defined. "
"But not both!")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may relax the mutual-exclusivity check above eventually, if we implement both write and read within the same class. For now, mutual-exclusivity is assumed and enforced, because we use the existence of write to set the write_flag below.

avioContext_.reset(avioAllocContext(
buffer,
bufferSize,
0,
/*write_flag=*/write != nullptr,
heldData,
read,
nullptr, // write function; not supported yet
write,
seek));

if (!avioContext_) {
Expand Down
11 changes: 4 additions & 7 deletions src/torchcodec/_core/AVIOContextHolder.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ namespace facebook::torchcodec {
// freed.
// 2. It is a base class for AVIOContext specializations. When specializing an
// AVIOContext, we need to provide four things:
// 1. A read callback function.
// 2. A seek callback function.
// 3. A write callback function. (Not supported yet; it's for encoding.)
// 1. A read callback function, for decoding.
// 2. A seek callback function, for decoding and encoding.
// 3. A write callback function, for encoding.
// 4. A pointer to some context object that has the same lifetime as the
// AVIOContext itself. This context object holds the custom state that
// tracks the custom behavior of reading, seeking and writing. It is
Expand All @@ -44,13 +44,10 @@ class AVIOContextHolder {
// enforced by having pure virtual methods, but we don't have any.)
AVIOContextHolder() = default;

// These signatures are defined by FFmpeg.
using AVIOReadFunction = int (*)(void*, uint8_t*, int);
using AVIOSeekFunction = int64_t (*)(void*, int64_t, int);

// Deriving classes should call this function in their constructor.
void createAVIOContext(
AVIOReadFunction read,
AVIOWriteFunction write,
AVIOSeekFunction seek,
void* heldData,
int bufferSize = defaultBufferSize);
Expand Down
2 changes: 1 addition & 1 deletion src/torchcodec/_core/AVIOFileLikeContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike)
py::hasattr(fileLike, "seek"),
"File like object must implement a seek method.");
}
createAVIOContext(&read, &seek, &fileLike_);
createAVIOContext(&read, nullptr, &seek, &fileLike_);
}

int AVIOFileLikeContext::read(void* opaque, uint8_t* buf, int buf_size) {
Expand Down
3 changes: 2 additions & 1 deletion src/torchcodec/_core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,9 @@ function(make_torchcodec_libraries
set(decoder_library_name "libtorchcodec_decoder${ffmpeg_major_version}")
set(decoder_sources
AVIOContextHolder.cpp
AVIOBytesContext.cpp
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Encoder.[cpp, h] rely on AVIOToTensorContext, and specifically on the AVIOToTensorContext::getOutputTensor() method. They don't rely on the AVIOContextHolder base class, like the decoder. For this reason we have to add AVIOBytesContext.cpp to the source dependency here.

Alternatively, I think we could make getOutputTensor a virtual method of the base class? But this method wouldn't make much sense for the existing child classes (like AVIOBytesContext), so this doesn't sound like a great OOP design.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed on not making getOutputTensor() virtual on the base class, since it's only applicable to one of the derived classes. We could potentially pull AVIOToTensorContext out of AVIOBytesContext.[h|cpp] to limit what gets put into libtorchcodec_decoderN.so.

I think the issue here is that the Encoder class needs to call getOutputTensor(), which means that it must actually hold a reference to an actual AVIOToTensorContext rather than just the base AVIOContextHolder. (Which is what SingleStreamDecoder does.) A potential way around this is for AVIOToTensorContext to accept a tensor rather than creating its own. The caller, however, would still need to keep track of how many bytes it asked it to encode in order to do the final narrow on the tensor, so that's not necessarily any cleaner. Your call on what you think makes the most sense.

FFMPEGCommon.cpp
DeviceInterface.cpp
DeviceInterface.cpp
SingleStreamDecoder.cpp
# TODO: lib name should probably not be "*_decoder*" now that it also
# contains an encoder
Expand Down
49 changes: 36 additions & 13 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <sstream>

#include "src/torchcodec/_core/AVIOBytesContext.h"
#include "src/torchcodec/_core/Encoder.h"
#include "torch/types.h"

Expand Down Expand Up @@ -78,9 +79,13 @@ AudioEncoder::~AudioEncoder() {}
AudioEncoder::AudioEncoder(
const torch::Tensor wf,
int sampleRate,
std::string_view fileName,
std::optional<std::string_view> fileName,
std::optional<std::string_view> formatName,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We enforce that fileName and formatName parameters are mutually exclusive. Happy to consider alternative designs. Creating separate constructors seemed more complex and doesn't provide much value IMHO.

Note that this only exists at the C++ constructor level. The custom ops expose two distinct entry-points: encode_audio_to_file() and encode_audio_to_tensor(). I suspect that the public Python API will look similar to this, but that's still up for discussion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should follow the same pattern set out in SingleStreamDecoder: one constructor that takes the file name, and another constructor that takes the AVIOToTensorContext as an argument. That means it's the callers responsibility to create the AVIOToTensorContext object; that's how it currently works in SingleStreamDecoder. That would also require pulling all common initialization into some initializeEncoder() function that both constructors would call.

The value, to me, is that it's more clear when creating the object that you're doing it correctly, and then when reading the initialization code, it's more clear what's required for, and correct for, each path.

std::optional<int64_t> bitRate)
: wf_(wf) {
TORCH_CHECK(
fileName.has_value() ^ formatName.has_value(),
"Pass one of filename OR format, not both.");
TORCH_CHECK(
wf_.dtype() == torch::kFloat32,
"waveform must have float32 dtype, got ",
Expand All @@ -92,25 +97,32 @@ AudioEncoder::AudioEncoder(

setFFmpegLogLevel();
AVFormatContext* avFormatContext = nullptr;
auto status = avformat_alloc_output_context2(
&avFormatContext, nullptr, nullptr, fileName.data());
int status = AVSUCCESS;
if (fileName.has_value()) {
status = avformat_alloc_output_context2(
&avFormatContext, nullptr, nullptr, fileName->data());
} else {
status = avformat_alloc_output_context2(
&avFormatContext, nullptr, formatName->data(), nullptr);
}
TORCH_CHECK(
avFormatContext != nullptr,
"Couldn't allocate AVFormatContext. ",
"Check the desired extension? ",
getFFMPEGErrorStringFromErrorCode(status));
avFormatContext_.reset(avFormatContext);

// TODO-ENCODING: Should also support encoding into bytes (use
// AVIOBytesContext)
TORCH_CHECK(
!(avFormatContext->oformat->flags & AVFMT_NOFILE),
"AVFMT_NOFILE is set. We only support writing to a file.");
status = avio_open(&avFormatContext_->pb, fileName.data(), AVIO_FLAG_WRITE);
TORCH_CHECK(
status >= 0,
"avio_open failed: ",
getFFMPEGErrorStringFromErrorCode(status));
if (fileName.has_value()) {
status =
avio_open(&avFormatContext_->pb, fileName->data(), AVIO_FLAG_WRITE);
TORCH_CHECK(
status >= 0,
"avio_open failed: ",
getFFMPEGErrorStringFromErrorCode(status));
} else {
avioContextHolder_ = std::make_unique<AVIOToTensorContext>();
avFormatContext->pb = avioContextHolder_->getAVIOContext();
}

// We use the AVFormatContext's default codec for that
// specific format/container.
Expand Down Expand Up @@ -170,7 +182,18 @@ AudioEncoder::AudioEncoder(
streamIndex_ = avStream->index;
}

// Runs encode() and returns the tensor produced by the AVIOToTensorContext.
// Raises (via TORCH_CHECK) when this encoder holds no such context.
torch::Tensor AudioEncoder::encodeToTensor() {
  const bool hasTensorContext = (avioContextHolder_ != nullptr);
  TORCH_CHECK(
      hasTensorContext,
      "Cannot encode to tensor, avio context doesn't exist.");

  encode();

  auto& tensorContext = *avioContextHolder_;
  return tensorContext.getOutputTensor();
}

void AudioEncoder::encode() {
// TODO-ENCODING: Need to check, but consecutive calls to encode() are
// probably invalid. We can address this once we (re)design the public and
// private encoding APIs.
UniqueAVFrame avFrame(av_frame_alloc());
TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
// Default to 256 like in torchaudio
Expand Down
8 changes: 7 additions & 1 deletion src/torchcodec/_core/Encoder.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once
#include <torch/types.h>
#include "src/torchcodec/_core/AVIOBytesContext.h"
#include "src/torchcodec/_core/FFMPEGCommon.h"

namespace facebook::torchcodec {
Expand All @@ -19,9 +20,11 @@ class AudioEncoder {
// match this, and that's up to the user. If sample rates don't match,
// encoding will still work but audio will be distorted.
int sampleRate,
std::string_view fileName,
std::optional<std::string_view> fileName,
std::optional<std::string_view> formatName,
std::optional<int64_t> bitRate = std::nullopt);
void encode();
torch::Tensor encodeToTensor();

private:
void encodeInnerLoop(
Expand All @@ -35,5 +38,8 @@ class AudioEncoder {
UniqueSwrContext swrContext_;

const torch::Tensor wf_;

// Stores the AVIOContext for the output tensor buffer.
std::unique_ptr<AVIOToTensorContext> avioContextHolder_;
};
} // namespace facebook::torchcodec
23 changes: 23 additions & 0 deletions src/torchcodec/_core/FFMPEGCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,4 +261,27 @@ void setFFmpegLogLevel() {
av_log_set_level(logLevel);
}

// Thin wrapper around FFmpeg's avio_alloc_context() that hides a signature
// difference across FFmpeg versions: the `buf` parameter of the write callback
// is `const uint8_t*` from FFmpeg 7 on, and `uint8_t*` before. Callers always
// pass the const flavor (AVIOWriteFunction); for older FFmpeg it is cast to
// the non-const flavor. All arguments are forwarded unchanged, and the result
// of avio_alloc_context() is returned as is.
AVIOContext* avioAllocContext(
    uint8_t* buffer,
    int buffer_size,
    int write_flag,
    void* opaque,
    AVIOReadFunction read_packet,
    AVIOWriteFunction write_packet,
    AVIOSeekFunction seek) {
  return avio_alloc_context(
      buffer,
      buffer_size,
      write_flag,
      opaque,
      read_packet,
// The buf parameter of the write function is not const before FFmpeg 7.
// write_packet belongs to libavformat, so key off libavformat's own major
// version (61 ships with FFmpeg 7) rather than another library's.
#if LIBAVFORMAT_VERSION_MAJOR >= 61 // FFmpeg >= 7
      write_packet,
#else
      reinterpret_cast<AVIOWriteFunctionOld>(write_packet),
#endif
      seek);
}

} // namespace facebook::torchcodec
15 changes: 15 additions & 0 deletions src/torchcodec/_core/FFMPEGCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,4 +177,19 @@ bool canSwsScaleHandleUnalignedData();

void setFFmpegLogLevel();

// These signatures are defined by FFmpeg. They are the callback types passed
// to avioAllocContext(); each takes the opaque pointer as its first argument.
using AVIOReadFunction = int (*)(void*, uint8_t*, int);
using AVIOWriteFunction = int (*)(void*, const uint8_t*, int); // FFmpeg >= 7
using AVIOWriteFunctionOld = int (*)(void*, uint8_t*, int); // FFmpeg < 7
using AVIOSeekFunction = int64_t (*)(void*, int64_t, int);

// Version-agnostic wrapper around avio_alloc_context(). The write callback is
// always given with the const-buffer signature (AVIOWriteFunction), whatever
// the FFmpeg version.
AVIOContext* avioAllocContext(
uint8_t* buffer,
int buffer_size,
int write_flag,
void* opaque,
AVIOReadFunction read_packet,
AVIOWriteFunction write_packet,
AVIOSeekFunction seek);

} // namespace facebook::torchcodec
4 changes: 2 additions & 2 deletions src/torchcodec/_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@
_test_frame_pts_equality,
add_audio_stream,
add_video_stream,
create_audio_encoder,
create_from_bytes,
create_from_file,
create_from_file_like,
create_from_tensor,
encode_audio,
encode_audio_to_file,
encode_audio_to_tensor,
get_ffmpeg_library_versions,
get_frame_at_index,
get_frame_at_pts,
Expand Down
Loading
Loading