Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions pymnn/src/llm.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,27 @@ static PyObject* PyMNNLLM_forward(LLM *self, PyObject *args) {
return (PyObject *)logits;
}

// Run one forward pass and return every model output (logits, hidden
// states, ...) as a Python list of MNN.Var, instead of only the logits.
// Returns None for embedding models or on invalid input.
static PyObject* PyMNNLLM_forward_all(LLM *self, PyObject *args) {
    if (self->is_embedding) {
        Py_RETURN_NONE;
    }
    PyObject *input_ids = nullptr;

    // Bail out unless parsing succeeded AND the argument is a list of ints.
    // The previous `!PyArg_ParseTuple(...) && isInts(input_ids)` condition
    // short-circuited: a successful parse skipped the isInts() check, and a
    // failed parse evaluated isInts() on a still-null pointer.
    if (!PyArg_ParseTuple(args, "O", &input_ids) || !isInts(input_ids)) {
        Py_RETURN_NONE;
    }
    auto outputs = self->llm->forwardVec(toInts(input_ids));

    // Return a list of all outputs (logits, hidden_states, etc.)
    PyObject* result = PyList_New(outputs.size());
    if (result == nullptr) {
        return nullptr; // allocation failed; Python exception already set
    }
    for (size_t i = 0; i < outputs.size(); i++) {
        auto var = getVar();
        *(var->var) = outputs[i];
        // PyList_SetItem steals the reference to `var`.
        PyList_SetItem(result, i, (PyObject*)var);
    }
    return result;
}

static PyObject* PyMNNLLM_generate(LLM *self, PyObject *args) {
if (self->is_embedding) {
Py_RETURN_NONE;
Expand Down Expand Up @@ -499,6 +520,7 @@ static PyObject* PyMNNLLM_enable_collection_mode(LLM *self, PyObject *args) {
static PyMethodDef PyMNNLLM_methods[] = {
{"load", (PyCFunction)PyMNNLLM_load, METH_VARARGS, "load model."},
{"forward", (PyCFunction)PyMNNLLM_forward, METH_VARARGS, "forward `logits` by `input_ids`."},
{"forward_all", (PyCFunction)PyMNNLLM_forward_all, METH_VARARGS, "forward all outputs (logits, hidden_states) by `input_ids`."},
{"generate", (PyCFunction)PyMNNLLM_generate, METH_VARARGS, "generate `output_ids` by `input_ids`."},
{"response", (PyCFunction)PyMNNLLM_response, METH_VARARGS, "response `query` - supports both text and multimodal input."},
{"get_current_history", (PyCFunction)PyMNNLLM_getCurrentHistory, METH_VARARGS, "Get Current History."},
Expand Down
4 changes: 4 additions & 0 deletions source/backend/cpu/CPUSoftmax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,10 @@ ErrorCode CPUSoftmax::onExecute(const std::vector<Tensor *> &inputs, const std::
auto outputTensor = outputs[0];
const auto inputDataPtr = inputTensor->host<float>();
auto outputDataPtr = outputTensor->host<float>();
if (inputDataPtr == nullptr || outputDataPtr == nullptr) {
MNN_ERROR("CPUSoftmax: null input/output pointer (OOM for large tensor?)\n");
return OUT_OF_MEMORY;
}
const int batch = inputTensor->batch();
const auto dims = inputTensor->buffer().dimensions;

Expand Down
22 changes: 19 additions & 3 deletions source/backend/opencl/core/BufferPool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,17 @@ cl::Buffer* BufferPool::alloc(size_t size, bool separate) {
node->size = size;
node->buffer.reset(new cl::Buffer(mContext, mFlag, size, NULL, &ret));
if (nullptr == node->buffer.get() || ret != CL_SUCCESS) {
MNN_ERROR("Alloc Buffer %lu error, code:%d \n", size, ret);
return nullptr;
// Allocation failed: release free list to reclaim memory and retry once.
if (!mFreeList.empty()) {
releaseFreeList();
ret = CL_SUCCESS;
node->buffer.reset(new cl::Buffer(mContext, mFlag, size, NULL, &ret));
}
if (nullptr == node->buffer.get() || ret != CL_SUCCESS) {
MNN_ERROR("Alloc buffer %zu MB failed, code:%d\n", size/(1024*1024), ret);
mTotalSize -= size;
return nullptr;
}
}
mAllBuffer.insert(std::make_pair(node->buffer.get(), node));
return node->buffer.get();
Expand All @@ -51,13 +60,20 @@ void BufferPool::clear() {
}

void BufferPool::releaseFreeList() {
std::multimap<size_t, std::shared_ptr<OpenCLBufferNode>> keepList;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using this logic in an OpenCL dynamic memory pool might result in the release of memory reused by preceding layers, leading to a crash.
Do you mean to release the memory previously allocated in the DYNAMIC_IN_EXECUTION memory pool used by the Attention operator?

for(auto mf : mFreeList){
// Keep large buffers (>1GB) in free list so they can be reused by subsequent
// allocations (e.g. 6.78GB fp32 attention score tensors in 1024 edit mode).
if (mf.first > 1024UL * 1024 * 1024) {
keepList.insert(mf);
continue;
}
auto iter = mAllBuffer.find(mf.second->buffer.get());
if (iter != mAllBuffer.end()) {
mAllBuffer.erase(iter);
}
}
mFreeList.clear();
mFreeList = keepList;
}

std::shared_ptr<OpenCLBufferNode> BufferExecutionPool::alloc(size_t size, bool separate) {
Expand Down
24 changes: 19 additions & 5 deletions source/backend/opencl/core/OpenCLBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ CLRuntime::CLRuntime(const Backend::Info& info){
context_ptr = (((MNNDeviceContext*)info.user->sharedContext)->contextPtr);
}
}
// Allow overriding device selection via environment variable
const char* envDeviceId = getenv("MNN_OPENCL_DEVICE_ID");
if (envDeviceId != nullptr) {
device_id = atoi(envDeviceId);
}

if (nullptr != mInfo.user) {
mPrecision = mInfo.user->precision;
Expand All @@ -61,7 +66,7 @@ CLRuntime::CLRuntime(const Backend::Info& info){
mTunedInfo = new TuneInfo;

mImagePool.reset(new ImagePool(mOpenCLRuntime->context()));
mBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR));
mBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE));
}

CLRuntime::~CLRuntime() {
Expand Down Expand Up @@ -293,8 +298,8 @@ OpenCLBackend::OpenCLBackend(BackendConfig::PrecisionMode precision, BackendConf
}

mImagePoolFirst.reset(new ImagePool(mOpenCLRuntime->context()));
mBufferPoolFirst.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR));
mExecutionBufferPool.reset(new BufferExecutionPool(mOpenCLRuntime->context(), mOpenCLRuntime->commandQueue(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR));
mBufferPoolFirst.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE));
mExecutionBufferPool.reset(new BufferExecutionPool(mOpenCLRuntime->context(), mOpenCLRuntime->commandQueue(), CL_MEM_READ_WRITE));
mImagePool = mImagePoolFirst.get();
mBufferPool = mBufferPoolFirst.get();
}
Expand Down Expand Up @@ -397,7 +402,10 @@ class CLMemReleaseImage : public Backend::MemObj {

float OpenCLBackend::getBytes(const Tensor* tensor) {
float bytes = (float)tensor->getType().bytes();
if (mPrecision != BackendConfig::Precision_High) {// Fp16
// For OpenCL buffer mode, always use fp16 size for float tensors unless Precision_High.
// This prevents OOM when large attention score tensors (e.g. [1,24,8704,8704]=6.78GB fp32)
// are allocated for 1024 edit mode. Kernels still run in fp32 when precision is Normal.
if (mPrecision != BackendConfig::Precision_High) {// Fp16 buffer size
if (halide_type_float == tensor->getType().code) {
bytes = 2.0;
}
Expand Down Expand Up @@ -453,6 +461,12 @@ Backend::MemObj* OpenCLBackend::onAcquire(const Tensor* nativeTensor, StorageTyp
#endif
// Align when int4 memory
size = ROUND_UP(size, 2);
#ifdef MNN_DEBUG_MEMORY
if ((size_t)(size * typeSize) > 50 * 1024 * 1024) {
MNN_PRINT("[OCL] large tensor NHWC=[%d,%d,%d,%d] typeSize=%.1f size=%zu MB\n",
N, H, W, C, typeSize, (size_t)(size*typeSize)/(1024*1024));
}
#endif
if (storageType == DYNAMIC_SEPERATE) {
auto buffer = mBufferPool->alloc(size*typeSize, true);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
Expand Down Expand Up @@ -531,7 +545,7 @@ bool OpenCLBackend::onSelectDynamicAllocator(int index, int maxIndex) {
}
if (maxIndex > 1 && mImagePoolSecond.get() == nullptr) {
mImagePoolSecond.reset(new ImagePool(mOpenCLRuntime->context()));
mBufferPoolSecond.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR));
mBufferPoolSecond.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE));
}
if (index == 0) {
mImagePool = mImagePoolFirst.get();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,13 @@ ErrorCode BinaryBufExecution::onEncode(const std::vector<Tensor *> &inputs, cons
MNN_CHECK_CL_SUCCESS(ret, "setArg BinaryBufExecution");

std::string name = "binary_buf";
mLocalWorkSize = {(uint32_t)16, (uint32_t)1};
// Fix: localWorkSize must evenly divide globalWorkSize, otherwise use NullRange (0,0)
// When globalWorkSize < 16, set localWorkSize to 0 to let OpenCL runtime choose
if (mGlobalWorkSize[0] >= 16 && mGlobalWorkSize[0] % 16 == 0) {
mLocalWorkSize = {(uint32_t)16, (uint32_t)1};
} else {
mLocalWorkSize = {(uint32_t)0, (uint32_t)0}; // Let OpenCL runtime choose
}

unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]};
unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,10 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat

bool isAligned = (e % 32 == 0 && l % 4 == 0 && h % 32 == 0);
bool enoughComputation = (e >= 512 && l >= 512 && h >= 512) && (1.0 * e / 1024 * l / 1024 * h / 1024 >= 4.0);
// Disable Strassen for very large matrices: output [e,h] > 256MB fp16 causes kernel failures
// e.g. FFN [8192,3072,9216] in 1024 edit mode -> e*h*2 = 8192*9216*2 = 144MB, but sub-matrices
// accumulate and exceed GPU limits. Cap at e*h <= 128M elements.
if ((int64_t)e * h > 128 * 1024 * 1024) enoughComputation = false;

if (currentDepth >= mMaxDepth || !isAligned || !enoughComputation) {// not align or not enough computation
Unit unit;
Expand Down
14 changes: 10 additions & 4 deletions source/backend/opencl/execution/cl/binary_buf.cl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ __kernel void binary_buf(__private int global_dim0, __private int global_dim1,
if(offset + 3 >= size){
int remain = size - offset;
#ifdef INT_COMPUTE_MOD
int4 in0, in1;
int4 in0 = (int4)(0, 0, 0, 0);
int4 in1 = (int4)(0, 0, 0, 0);
int* in0_ptr = (int*)&in0;
int* in1_ptr = (int*)&in1;

Expand All @@ -42,7 +43,8 @@ __kernel void binary_buf(__private int global_dim0, __private int global_dim1,
output[offset + i] = (OUTPUT_TYPE)out_ptr[i];
}
#else
float4 in0, in1;
float4 in0 = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
float4 in1 = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
float* in0_ptr = (float*)&in0;
float* in1_ptr = (float*)&in1;

Expand All @@ -60,10 +62,14 @@ __kernel void binary_buf(__private int global_dim0, __private int global_dim1,
#endif
}
float4 out = OPERATOR;
float* out_ptr = (float*)&out;
// Note: Apply activation per-element to work around NVIDIA OpenCL compiler bug
// where fmax() on float4 in PACK_LEAVE branch causes NaN with float4 pointer operations
if(activationType == 1) {
out = fmax(out, (float4)0);
for(int j = 0; j < remain; ++j){
if(out_ptr[j] < 0.0f) out_ptr[j] = 0.0f;
}
}
float* out_ptr = (float*)&out;
for(int i = 0; i < remain; ++i){
output[offset + i] = (OUTPUT_TYPE)out_ptr[i];
}
Expand Down
14 changes: 10 additions & 4 deletions source/backend/opencl/execution/cl/binary_buf_mnn_cl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ const char* binary_buf =
" if(offset+3 >= size){\n"
" int remain=size-offset;\n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0,in1;\n"
" int4 in0=(int4)(0,0,0,0);\n"
" int4 in1=(int4)(0,0,0,0);\n"
" int* in0_ptr=(int*)&in0;\n"
" int* in1_ptr=(int*)&in1;\n"
" \n"
Expand All @@ -45,7 +46,8 @@ const char* binary_buf =
" output[offset+i]=(OUTPUT_TYPE)out_ptr[i];\n"
" }\n"
" #else\n"
" float4 in0,in1;\n"
" float4 in0=(float4)(0.0f,0.0f,0.0f,0.0f);\n"
" float4 in1=(float4)(0.0f,0.0f,0.0f,0.0f);\n"
" float* in0_ptr=(float*)&in0;\n"
" float* in1_ptr=(float*)&in1;\n"
" \n"
Expand All @@ -63,10 +65,14 @@ const char* binary_buf =
" #endif\n"
" }\n"
" float4 out=OPERATOR;\n"
" float* out_ptr=(float*)&out;\n"
" // Note: Apply activation per-element to work around NVIDIA OpenCL compiler bug\n"
" // where fmax() on float4 in PACK_LEAVE branch causes NaN with float4 pointer operations\n"
" if(activationType == 1) {\n"
" out=fmax(out,(float4)0);\n"
" for(int j=0; j<remain; ++j){\n"
" if(out_ptr[j]<0.0f) out_ptr[j]=0.0f;\n"
" }\n"
" }\n"
" float* out_ptr=(float*)&out;\n"
" for(int i=0; i<remain; ++i){\n"
" output[offset+i]=(OUTPUT_TYPE)out_ptr[i];\n"
" }\n"
Expand Down
Loading