Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions pymnn/src/llm.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,27 @@ static PyObject* PyMNNLLM_forward(LLM *self, PyObject *args) {
return (PyObject *)logits;
}

// Run one forward pass and return every model output (logits, hidden
// states, ...) as a Python list of MNN.Var, instead of only the logits.
// Returns None for embedding models or on invalid input.
static PyObject* PyMNNLLM_forward_all(LLM *self, PyObject *args) {
    if (self->is_embedding) {
        Py_RETURN_NONE;
    }
    PyObject *input_ids = nullptr;

    // Bail out unless parsing succeeded AND the argument is a list of ints.
    // The previous `!PyArg_ParseTuple(...) && isInts(input_ids)` condition
    // short-circuited: a successful parse skipped the isInts() check, and a
    // failed parse evaluated isInts() on a still-null pointer.
    if (!PyArg_ParseTuple(args, "O", &input_ids) || !isInts(input_ids)) {
        Py_RETURN_NONE;
    }
    auto outputs = self->llm->forwardVec(toInts(input_ids));

    // Return a list of all outputs (logits, hidden_states, etc.)
    PyObject* result = PyList_New(outputs.size());
    if (result == nullptr) {
        return nullptr; // allocation failed; Python exception already set
    }
    for (size_t i = 0; i < outputs.size(); i++) {
        auto var = getVar();
        *(var->var) = outputs[i];
        // PyList_SetItem steals the reference to `var`.
        PyList_SetItem(result, i, (PyObject*)var);
    }
    return result;
}

static PyObject* PyMNNLLM_generate(LLM *self, PyObject *args) {
if (self->is_embedding) {
Py_RETURN_NONE;
Expand Down Expand Up @@ -499,6 +520,7 @@ static PyObject* PyMNNLLM_enable_collection_mode(LLM *self, PyObject *args) {
static PyMethodDef PyMNNLLM_methods[] = {
{"load", (PyCFunction)PyMNNLLM_load, METH_VARARGS, "load model."},
{"forward", (PyCFunction)PyMNNLLM_forward, METH_VARARGS, "forward `logits` by `input_ids`."},
{"forward_all", (PyCFunction)PyMNNLLM_forward_all, METH_VARARGS, "forward all outputs (logits, hidden_states) by `input_ids`."},
{"generate", (PyCFunction)PyMNNLLM_generate, METH_VARARGS, "generate `output_ids` by `input_ids`."},
{"response", (PyCFunction)PyMNNLLM_response, METH_VARARGS, "response `query` - supports both text and multimodal input."},
{"get_current_history", (PyCFunction)PyMNNLLM_getCurrentHistory, METH_VARARGS, "Get Current History."},
Expand Down
4 changes: 4 additions & 0 deletions source/backend/cpu/CPUSoftmax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,10 @@ ErrorCode CPUSoftmax::onExecute(const std::vector<Tensor *> &inputs, const std::
auto outputTensor = outputs[0];
const auto inputDataPtr = inputTensor->host<float>();
auto outputDataPtr = outputTensor->host<float>();
if (inputDataPtr == nullptr || outputDataPtr == nullptr) {
MNN_ERROR("CPUSoftmax: null input/output pointer (OOM for large tensor?)\n");
return OUT_OF_MEMORY;
}
const int batch = inputTensor->batch();
const auto dims = inputTensor->buffer().dimensions;

Expand Down
22 changes: 19 additions & 3 deletions source/backend/opencl/core/BufferPool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,17 @@ cl::Buffer* BufferPool::alloc(size_t size, bool separate) {
node->size = size;
node->buffer.reset(new cl::Buffer(mContext, mFlag, size, NULL, &ret));
if (nullptr == node->buffer.get() || ret != CL_SUCCESS) {
MNN_ERROR("Alloc Buffer %lu error, code:%d \n", size, ret);
return nullptr;
// Allocation failed: release free list to reclaim memory and retry once.
if (!mFreeList.empty()) {
releaseFreeList();
ret = CL_SUCCESS;
node->buffer.reset(new cl::Buffer(mContext, mFlag, size, NULL, &ret));
}
if (nullptr == node->buffer.get() || ret != CL_SUCCESS) {
MNN_ERROR("Alloc buffer %zu MB failed, code:%d\n", size/(1024*1024), ret);
mTotalSize -= size;
return nullptr;
}
}
mAllBuffer.insert(std::make_pair(node->buffer.get(), node));
return node->buffer.get();
Expand All @@ -51,13 +60,20 @@ void BufferPool::clear() {
}

void BufferPool::releaseFreeList() {
std::multimap<size_t, std::shared_ptr<OpenCLBufferNode>> keepList;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using this logic in an OpenCL dynamic memory pool might result in the release of memory reused by preceding layers, leading to a crash.
Do you mean to release the memory previously allocated in the DYNAMIC_IN_EXECUTION memory pool used by the Attention operator?

for(auto mf : mFreeList){
// Keep large buffers (>1GB) in free list so they can be reused by subsequent
// allocations (e.g. 6.78GB fp32 attention score tensors in 1024 edit mode).
if (mf.first > 1024UL * 1024 * 1024) {
keepList.insert(mf);
continue;
}
auto iter = mAllBuffer.find(mf.second->buffer.get());
if (iter != mAllBuffer.end()) {
mAllBuffer.erase(iter);
}
}
mFreeList.clear();
mFreeList = keepList;
}

std::shared_ptr<OpenCLBufferNode> BufferExecutionPool::alloc(size_t size, bool separate) {
Expand Down
24 changes: 19 additions & 5 deletions source/backend/opencl/core/OpenCLBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ CLRuntime::CLRuntime(const Backend::Info& info){
context_ptr = (((MNNDeviceContext*)info.user->sharedContext)->contextPtr);
}
}
// Allow overriding device selection via environment variable
const char* envDeviceId = getenv("MNN_OPENCL_DEVICE_ID");
if (envDeviceId != nullptr) {
device_id = atoi(envDeviceId);
}

if (nullptr != mInfo.user) {
mPrecision = mInfo.user->precision;
Expand All @@ -61,7 +66,7 @@ CLRuntime::CLRuntime(const Backend::Info& info){
mTunedInfo = new TuneInfo;

mImagePool.reset(new ImagePool(mOpenCLRuntime->context()));
mBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR));
mBufferPool.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE));
}

CLRuntime::~CLRuntime() {
Expand Down Expand Up @@ -293,8 +298,8 @@ OpenCLBackend::OpenCLBackend(BackendConfig::PrecisionMode precision, BackendConf
}

mImagePoolFirst.reset(new ImagePool(mOpenCLRuntime->context()));
mBufferPoolFirst.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR));
mExecutionBufferPool.reset(new BufferExecutionPool(mOpenCLRuntime->context(), mOpenCLRuntime->commandQueue(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR));
mBufferPoolFirst.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE));
mExecutionBufferPool.reset(new BufferExecutionPool(mOpenCLRuntime->context(), mOpenCLRuntime->commandQueue(), CL_MEM_READ_WRITE));
mImagePool = mImagePoolFirst.get();
mBufferPool = mBufferPoolFirst.get();
}
Expand Down Expand Up @@ -397,7 +402,10 @@ class CLMemReleaseImage : public Backend::MemObj {

float OpenCLBackend::getBytes(const Tensor* tensor) {
float bytes = (float)tensor->getType().bytes();
if (mPrecision != BackendConfig::Precision_High) {// Fp16
// For OpenCL buffer mode, always use fp16 size for float tensors unless Precision_High.
// This prevents OOM when large attention score tensors (e.g. [1,24,8704,8704]=6.78GB fp32)
// are allocated for 1024 edit mode. Kernels still run in fp32 when precision is Normal.
if (mPrecision != BackendConfig::Precision_High) {// Fp16 buffer size
if (halide_type_float == tensor->getType().code) {
bytes = 2.0;
}
Expand Down Expand Up @@ -453,6 +461,12 @@ Backend::MemObj* OpenCLBackend::onAcquire(const Tensor* nativeTensor, StorageTyp
#endif
// Align when int4 memory
size = ROUND_UP(size, 2);
#ifdef MNN_DEBUG_MEMORY
if ((size_t)(size * typeSize) > 50 * 1024 * 1024) {
MNN_PRINT("[OCL] large tensor NHWC=[%d,%d,%d,%d] typeSize=%.1f size=%zu MB\n",
N, H, W, C, typeSize, (size_t)(size*typeSize)/(1024*1024));
}
#endif
if (storageType == DYNAMIC_SEPERATE) {
auto buffer = mBufferPool->alloc(size*typeSize, true);
((Tensor*)nativeTensor)->buffer().device = (uint64_t)buffer;
Expand Down Expand Up @@ -531,7 +545,7 @@ bool OpenCLBackend::onSelectDynamicAllocator(int index, int maxIndex) {
}
if (maxIndex > 1 && mImagePoolSecond.get() == nullptr) {
mImagePoolSecond.reset(new ImagePool(mOpenCLRuntime->context()));
mBufferPoolSecond.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR));
mBufferPoolSecond.reset(new BufferPool(mOpenCLRuntime->context(), CL_MEM_READ_WRITE));
}
if (index == 0) {
mImagePool = mImagePoolFirst.get();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,13 @@ ErrorCode BinaryBufExecution::onEncode(const std::vector<Tensor *> &inputs, cons
MNN_CHECK_CL_SUCCESS(ret, "setArg BinaryBufExecution");

std::string name = "binary_buf";
mLocalWorkSize = {(uint32_t)16, (uint32_t)1};
// Fix: localWorkSize must evenly divide globalWorkSize, otherwise use NullRange (0,0)
// When globalWorkSize < 16, set localWorkSize to 0 to let OpenCL runtime choose
if (mGlobalWorkSize[0] >= 16 && mGlobalWorkSize[0] % 16 == 0) {
mLocalWorkSize = {(uint32_t)16, (uint32_t)1};
} else {
mLocalWorkSize = {(uint32_t)0, (uint32_t)0}; // Let OpenCL runtime choose
}

unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]};
unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,10 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat

bool isAligned = (e % 32 == 0 && l % 4 == 0 && h % 32 == 0);
bool enoughComputation = (e >= 512 && l >= 512 && h >= 512) && (1.0 * e / 1024 * l / 1024 * h / 1024 >= 4.0);
// Disable Strassen for very large matrices: output [e,h] > 256MB fp16 causes kernel failures
// e.g. FFN [8192,3072,9216] in 1024 edit mode -> e*h*2 = 8192*9216*2 = 144MB, but sub-matrices
// accumulate and exceed GPU limits. Cap at e*h <= 128M elements.
if ((int64_t)e * h > 128 * 1024 * 1024) enoughComputation = false;

if (currentDepth >= mMaxDepth || !isAligned || !enoughComputation) {// not align or not enough computation
Unit unit;
Expand Down
14 changes: 10 additions & 4 deletions source/backend/opencl/execution/cl/binary_buf.cl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ __kernel void binary_buf(__private int global_dim0, __private int global_dim1,
if(offset + 3 >= size){
int remain = size - offset;
#ifdef INT_COMPUTE_MOD
int4 in0, in1;
int4 in0 = (int4)(0, 0, 0, 0);
int4 in1 = (int4)(0, 0, 0, 0);
int* in0_ptr = (int*)&in0;
int* in1_ptr = (int*)&in1;

Expand All @@ -42,7 +43,8 @@ __kernel void binary_buf(__private int global_dim0, __private int global_dim1,
output[offset + i] = (OUTPUT_TYPE)out_ptr[i];
}
#else
float4 in0, in1;
float4 in0 = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
float4 in1 = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
float* in0_ptr = (float*)&in0;
float* in1_ptr = (float*)&in1;

Expand All @@ -60,10 +62,14 @@ __kernel void binary_buf(__private int global_dim0, __private int global_dim1,
#endif
}
float4 out = OPERATOR;
float* out_ptr = (float*)&out;
// Note: Apply activation per-element to work around NVIDIA OpenCL compiler bug
// where fmax() on float4 in PACK_LEAVE branch causes NaN with float4 pointer operations
if(activationType == 1) {
out = fmax(out, (float4)0);
for(int j = 0; j < remain; ++j){
if(out_ptr[j] < 0.0f) out_ptr[j] = 0.0f;
}
}
float* out_ptr = (float*)&out;
for(int i = 0; i < remain; ++i){
output[offset + i] = (OUTPUT_TYPE)out_ptr[i];
}
Expand Down
14 changes: 10 additions & 4 deletions source/backend/opencl/execution/cl/binary_buf_mnn_cl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ const char* binary_buf =
" if(offset+3 >= size){\n"
" int remain=size-offset;\n"
" #ifdef INT_COMPUTE_MOD\n"
" int4 in0,in1;\n"
" int4 in0=(int4)(0,0,0,0);\n"
" int4 in1=(int4)(0,0,0,0);\n"
" int* in0_ptr=(int*)&in0;\n"
" int* in1_ptr=(int*)&in1;\n"
" \n"
Expand All @@ -45,7 +46,8 @@ const char* binary_buf =
" output[offset+i]=(OUTPUT_TYPE)out_ptr[i];\n"
" }\n"
" #else\n"
" float4 in0,in1;\n"
" float4 in0=(float4)(0.0f,0.0f,0.0f,0.0f);\n"
" float4 in1=(float4)(0.0f,0.0f,0.0f,0.0f);\n"
" float* in0_ptr=(float*)&in0;\n"
" float* in1_ptr=(float*)&in1;\n"
" \n"
Expand All @@ -63,10 +65,14 @@ const char* binary_buf =
" #endif\n"
" }\n"
" float4 out=OPERATOR;\n"
" float* out_ptr=(float*)&out;\n"
" // Note: Apply activation per-element to work around NVIDIA OpenCL compiler bug\n"
" // where fmax() on float4 in PACK_LEAVE branch causes NaN with float4 pointer operations\n"
" if(activationType == 1) {\n"
" out=fmax(out,(float4)0);\n"
" for(int j=0; j<remain; ++j){\n"
" if(out_ptr[j]<0.0f) out_ptr[j]=0.0f;\n"
" }\n"
" }\n"
" float* out_ptr=(float*)&out;\n"
" for(int i=0; i<remain; ++i){\n"
" output[offset+i]=(OUTPUT_TYPE)out_ptr[i];\n"
" }\n"
Expand Down
Loading