pytorch · NicolasHug · Apr 29, 2025 · Apr 7, 2025 · Apr 8, 2025 · Apr 8, 2025
diff --git a/src/torchcodec/_core/AVIOBytesContext.cpp b/src/torchcodec/_core/AVIOBytesContext.cpp
@@ -13,7 +13,7 @@ AVIOBytesContext::AVIOBytesContext(const void* data, int64_t dataSize)
     : dataContext_{static_cast<const uint8_t*>(data), dataSize, 0} {
   TORCH_CHECK(data != nullptr, "Video data buffer cannot be nullptr!");
   TORCH_CHECK(dataSize > 0, "Video data size must be positive");
-  createAVIOContext(&read, &seek, &dataContext_);
+  createAVIOContext(&read, nullptr, &seek, &dataContext_);
 }
 
 // The signature of this function is defined by FFMPEG.
@@ -67,4 +67,71 @@ int64_t AVIOBytesContext::seek(void* opaque, int64_t offset, int whence) {
   return ret;
 }
 
+AVIOToTensorContext::AVIOToTensorContext()
+    : dataContext_{
+          torch::empty( // uninitialized uint8 storage that write() fills in
+              {AVIOToTensorContext::INITIAL_TENSOR_SIZE},
+              {torch::kUInt8}),
+          0} { // `current`: the write position starts at offset 0
+  createAVIOContext(nullptr, &write, &seek, &dataContext_); // no read callback
+}
+
+// The signature of this function is defined by FFMPEG. Returns buf_size.
+// Copies buf into the output tensor at `current`, growing it when needed.
+int AVIOToTensorContext::write(void* opaque, const uint8_t* buf, int buf_size) {
+  auto dataContext = static_cast<DataContext*>(opaque);
+  int64_t bufSize = static_cast<int64_t>(buf_size);
+
+  // Keep doubling until the write fits: a single doubling is not guaranteed
+  // to be enough when `current` was moved by seek() or when buf is large.
+  while (dataContext->current + bufSize > dataContext->outputTensor.numel()) {
+    TORCH_CHECK(
+        dataContext->outputTensor.numel() * 2 <=
+            AVIOToTensorContext::MAX_TENSOR_SIZE,
+        "We tried to allocate an output encoded tensor larger than ",
+        AVIOToTensorContext::MAX_TENSOR_SIZE,
+        " bytes. If you think this should be supported, please report.");
+    // Doubling via cat() may not be the most efficient, but it's simple.
+    dataContext->outputTensor =
+        torch::cat({dataContext->outputTensor, dataContext->outputTensor});
+  }
+  TORCH_CHECK(
+      dataContext->current + bufSize <= dataContext->outputTensor.numel(),
+      "Re-allocation of the output tensor didn't work. ",
+      "This should not happen, please report on TorchCodec bug tracker");
+
+  uint8_t* outputTensorData = dataContext->outputTensor.data_ptr<uint8_t>();
+  std::memcpy(outputTensorData + dataContext->current, buf, bufSize);
+  dataContext->current += bufSize;
+  return buf_size;
+}
+
+// The signature of this function is defined by FFMPEG.
+// Note: This `seek()` implementation is very similar to that of
+// AVIOBytesContext. We could consider merging both classes, or do some kind of
+// refac, but this doesn't seem worth it ATM.
+// Supported values of `whence`:
+//   - AVSEEK_SIZE: returns the number of elements of the output tensor.
+//   - SEEK_SET: moves the write position to `offset` and returns `offset`.
+// Any other value returns -1.
+// NOTE(review): getOutputTensor() slices the tensor at `current`, so the
+// position left by the last SEEK_SET / write() decides how many bytes are
+// returned - confirm that FFmpeg always ends at the end of the data.
+int64_t AVIOToTensorContext::seek(void* opaque, int64_t offset, int whence) {
+  auto ctx = static_cast<DataContext*>(opaque);
+  if (whence == AVSEEK_SIZE) {
+    return ctx->outputTensor.numel();
+  }
+  if (whence == SEEK_SET) {
+    ctx->current = offset;
+    return offset;
+  }
+  return -1;
+}
+
+torch::Tensor AVIOToTensorContext::getOutputTensor() {
+  // Slice off the unused tail: keep elements [0, current) along dim 0.
+  return dataContext_.outputTensor.narrow(0, 0, dataContext_.current);
+}
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/AVIOBytesContext.h b/src/torchcodec/_core/AVIOBytesContext.h
@@ -6,12 +6,13 @@
 
 #pragma once
 
+#include <torch/types.h>
 #include "src/torchcodec/_core/AVIOContextHolder.h"
 
 namespace facebook::torchcodec {
 
-// Enables users to pass in the entire video as bytes. Our read and seek
-// functions then traverse the bytes in memory.
+// For Decoding: enables users to pass in the entire video or audio as bytes.
+// Our read and seek functions then traverse the bytes in memory.
 class AVIOBytesContext : public AVIOContextHolder {
  public:
   explicit AVIOBytesContext(const void* data, int64_t dataSize);
@@ -29,4 +30,25 @@ class AVIOBytesContext : public AVIOContextHolder {
   DataContext dataContext_;
 };
 
+// For Encoding: used to encode into an output uint8 (bytes) tensor.
+class AVIOToTensorContext : public AVIOContextHolder {
+ public:
+  explicit AVIOToTensorContext();
+  torch::Tensor getOutputTensor(); // the output tensor, narrowed to `current`
+
+ private:
+  struct DataContext {
+    torch::Tensor outputTensor; // uint8 buffer that write() copies into
+    int64_t current; // offset in outputTensor where the next write() starts
+  };
+
+  static constexpr int64_t INITIAL_TENSOR_SIZE = 10'000'000; // 10MB
+  static constexpr int64_t MAX_TENSOR_SIZE = 320'000'000; // 320 MB, growth cap
+  static int write(void* opaque, const uint8_t* buf, int buf_size);
+  // We need to expose seek() for some formats like mp3.
+  static int64_t seek(void* opaque, int64_t offset, int whence);
+
+  DataContext dataContext_; // passed to FFmpeg as the callbacks' `opaque`
+};
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/AVIOContextHolder.cpp b/src/torchcodec/_core/AVIOContextHolder.cpp
@@ -11,6 +11,7 @@ namespace facebook::torchcodec {
 
 void AVIOContextHolder::createAVIOContext(
     AVIOReadFunction read,
+    AVIOWriteFunction write,
     AVIOSeekFunction seek,
     void* heldData,
     int bufferSize) {
@@ -22,13 +23,17 @@ void AVIOContextHolder::createAVIOContext(
       buffer != nullptr,
       "Failed to allocate buffer of size " + std::to_string(bufferSize));
 
-  avioContext_.reset(avio_alloc_context(
+  TORCH_CHECK(
+      (seek != nullptr) && ((write != nullptr) ^ (read != nullptr)),
+      "seek method must be defined, and either write or read must be defined. "
+      "But not both!")
+  avioContext_.reset(avioAllocContext(
       buffer,
       bufferSize,
-      0,
+      /*write_flag=*/write != nullptr,
       heldData,
       read,
-      nullptr, // write function; not supported yet
+      write,
       seek));
 
   if (!avioContext_) {

diff --git a/src/torchcodec/_core/AVIOContextHolder.h b/src/torchcodec/_core/AVIOContextHolder.h
@@ -19,9 +19,9 @@ namespace facebook::torchcodec {
 //      freed.
 //   2. It is a base class for AVIOContext specializations. When specializing a
 //      AVIOContext, we need to provide four things:
-//        1. A read callback function.
-//        2. A seek callback function.
-//        3. A write callback function. (Not supported yet; it's for encoding.)
+//        1. A read callback function, for decoding.
+//        2. A seek callback function, for decoding and encoding.
+//        3. A write callback function, for encoding.
 //        4. A pointer to some context object that has the same lifetime as the
 //           AVIOContext itself. This context object holds the custom state that
 //           tracks the custom behavior of reading, seeking and writing. It is
@@ -44,13 +44,10 @@ class AVIOContextHolder {
   // enforced by having a pure virtual methods, but we don't have any.)
   AVIOContextHolder() = default;
 
-  // These signatures are defined by FFmpeg.
-  using AVIOReadFunction = int (*)(void*, uint8_t*, int);
-  using AVIOSeekFunction = int64_t (*)(void*, int64_t, int);
-
   // Deriving classes should call this function in their constructor.
   void createAVIOContext(
       AVIOReadFunction read,
+      AVIOWriteFunction write,
       AVIOSeekFunction seek,
       void* heldData,
       int bufferSize = defaultBufferSize);

diff --git a/src/torchcodec/_core/AVIOFileLikeContext.cpp b/src/torchcodec/_core/AVIOFileLikeContext.cpp
@@ -23,7 +23,7 @@ AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike)
         py::hasattr(fileLike, "seek"),
         "File like object must implement a seek method.");
   }
-  createAVIOContext(&read, &seek, &fileLike_);
+  createAVIOContext(&read, nullptr, &seek, &fileLike_);
 }
 
 int AVIOFileLikeContext::read(void* opaque, uint8_t* buf, int buf_size) {

diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
@@ -59,8 +59,9 @@ function(make_torchcodec_libraries
     set(decoder_library_name "libtorchcodec_decoder${ffmpeg_major_version}")
     set(decoder_sources
         AVIOContextHolder.cpp
+        AVIOBytesContext.cpp
         FFMPEGCommon.cpp
-	DeviceInterface.cpp
+        DeviceInterface.cpp
         SingleStreamDecoder.cpp
         # TODO: lib name should probably not be "*_decoder*" now that it also
         # contains an encoder

diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -1,5 +1,6 @@
 #include <sstream>
 
+#include "src/torchcodec/_core/AVIOBytesContext.h"
 #include "src/torchcodec/_core/Encoder.h"
 #include "torch/types.h"
 
@@ -78,9 +79,13 @@ AudioEncoder::~AudioEncoder() {}
 AudioEncoder::AudioEncoder(
     const torch::Tensor wf,
     int sampleRate,
-    std::string_view fileName,
+    std::optional<std::string_view> fileName,
+    std::optional<std::string_view> formatName,
     std::optional<int64_t> bitRate)
     : wf_(wf) {
+  TORCH_CHECK(
+      fileName.has_value() ^ formatName.has_value(),
+      "Pass one of filename OR format, not both.");
   TORCH_CHECK(
       wf_.dtype() == torch::kFloat32,
       "waveform must have float32 dtype, got ",
@@ -92,25 +97,32 @@ AudioEncoder::AudioEncoder(
 
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
-  auto status = avformat_alloc_output_context2(
-      &avFormatContext, nullptr, nullptr, fileName.data());
+  int status = AVSUCCESS;
+  if (fileName.has_value()) {
+    status = avformat_alloc_output_context2(
+        &avFormatContext, nullptr, nullptr, fileName->data());
+  } else {
+    status = avformat_alloc_output_context2(
+        &avFormatContext, nullptr, formatName->data(), nullptr);
+  }
   TORCH_CHECK(
       avFormatContext != nullptr,
       "Couldn't allocate AVFormatContext. ",
       "Check the desired extension? ",
       getFFMPEGErrorStringFromErrorCode(status));
   avFormatContext_.reset(avFormatContext);
 
-  // TODO-ENCODING: Should also support encoding into bytes (use
-  // AVIOBytesContext)
-  TORCH_CHECK(
-      !(avFormatContext->oformat->flags & AVFMT_NOFILE),
-      "AVFMT_NOFILE is set. We only support writing to a file.");
-  status = avio_open(&avFormatContext_->pb, fileName.data(), AVIO_FLAG_WRITE);
-  TORCH_CHECK(
-      status >= 0,
-      "avio_open failed: ",
-      getFFMPEGErrorStringFromErrorCode(status));
+  if (fileName.has_value()) {
+    status =
+        avio_open(&avFormatContext_->pb, fileName->data(), AVIO_FLAG_WRITE);
+    TORCH_CHECK(
+        status >= 0,
+        "avio_open failed: ",
+        getFFMPEGErrorStringFromErrorCode(status));
+  } else {
+    avioContextHolder_ = std::make_unique<AVIOToTensorContext>();
+    avFormatContext->pb = avioContextHolder_->getAVIOContext();
+  }
 
   // We use the AVFormatContext's default codec for that
   // specific format/container.
@@ -170,7 +182,18 @@ AudioEncoder::AudioEncoder(
   streamIndex_ = avStream->index;
 }
 
+torch::Tensor AudioEncoder::encodeToTensor() {
+  TORCH_CHECK( // the constructor creates the holder when no fileName is passed
+      avioContextHolder_ != nullptr,
+      "Cannot encode to tensor, avio context doesn't exist.");
+  encode(); // presumably writes through the pb set in the constructor - confirm
+  return avioContextHolder_->getOutputTensor();
+}
+
 void AudioEncoder::encode() {
+  // TODO-ENCODING: Need to check, but consecutive calls to encode() are
+  // probably invalid. We can address this once we (re)design the public and
+  // private encoding APIs.
   UniqueAVFrame avFrame(av_frame_alloc());
   TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
   //  Default to 256 like in torchaudio

diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -1,5 +1,6 @@
 #pragma once
 #include <torch/types.h>
+#include "src/torchcodec/_core/AVIOBytesContext.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 
 namespace facebook::torchcodec {
@@ -19,9 +20,11 @@ class AudioEncoder {
       // match this, and that's up to the user. If sample rates don't match,
       // encoding will still work but audio will be distorted.
       int sampleRate,
-      std::string_view fileName,
+      std::optional<std::string_view> fileName,
+      std::optional<std::string_view> formatName,
       std::optional<int64_t> bitRate = std::nullopt);
   void encode();
+  torch::Tensor encodeToTensor();
 
  private:
   void encodeInnerLoop(
@@ -35,5 +38,8 @@ class AudioEncoder {
   UniqueSwrContext swrContext_;
 
   const torch::Tensor wf_;
+
+  // Stores the AVIOContext for the output tensor buffer.
+  std::unique_ptr<AVIOToTensorContext> avioContextHolder_;
 };
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -261,4 +261,27 @@ void setFFmpegLogLevel() {
   av_log_set_level(logLevel);
 }
 
+AVIOContext* avioAllocContext( // forwards its arguments to avio_alloc_context()
+    uint8_t* buffer,
+    int buffer_size,
+    int write_flag,
+    void* opaque,
+    AVIOReadFunction read_packet,
+    AVIOWriteFunction write_packet,
+    AVIOSeekFunction seek) {
+  return avio_alloc_context(
+      buffer,
+      buffer_size,
+      write_flag,
+      opaque,
+      read_packet,
+// The buf parameter of the write function is not const before FFmpeg 7.
+#if LIBAVFILTER_VERSION_MAJOR >= 10 // FFmpeg >= 7
+      write_packet,
+#else
+      reinterpret_cast<AVIOWriteFunctionOld>(write_packet), // non-const buf type
+#endif
+      seek);
+}
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
@@ -177,4 +177,19 @@ bool canSwsScaleHandleUnalignedData();
 
 void setFFmpegLogLevel();
 
+// AVIO callback signatures defined by FFmpeg; the first argument is `opaque`.
+using AVIOReadFunction = int (*)(void*, uint8_t*, int);
+using AVIOWriteFunction = int (*)(void*, const uint8_t*, int); // FFmpeg >= 7
+using AVIOWriteFunctionOld = int (*)(void*, uint8_t*, int); // FFmpeg < 7
+using AVIOSeekFunction = int64_t (*)(void*, int64_t, int);
+
+AVIOContext* avioAllocContext( // avio_alloc_context() taking AVIOWriteFunction
+    uint8_t* buffer,
+    int buffer_size,
+    int write_flag,
+    void* opaque,
+    AVIOReadFunction read_packet,
+    AVIOWriteFunction write_packet,
+    AVIOSeekFunction seek);
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/__init__.py b/src/torchcodec/_core/__init__.py
@@ -18,12 +18,12 @@
     _test_frame_pts_equality,
     add_audio_stream,
     add_video_stream,
-    create_audio_encoder,
     create_from_bytes,
     create_from_file,
     create_from_file_like,
     create_from_tensor,
-    encode_audio,
+    encode_audio_to_file,
+    encode_audio_to_tensor,
     get_ffmpeg_library_versions,
     get_frame_at_index,
     get_frame_at_pts,