I compiled llama.cpp for Android and ran it on a REDMI K80 Pro, using taskset 0f (a CPU affinity mask for cores 0-3) to limit it to 4 CPU cores. The model is qwen2.5-0.5b-instruct-fp16.gguf, i.e. the weights are stored in FP16 while activations are computed in FP32.
I modified the ggml_graph_compute_thread function in ggml/src/ggml-cpu/ggml-cpu.c to log each operator's name, shape, and execution time.
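The patch itself is not reproduced here; roughly, and assuming the per-node loop in ggml_graph_compute_thread calls ggml_compute_forward for each graph node, the hook amounts to something like this sketch:

// Sketch only, not the actual patch. Each worker thread computes its slice of
// the node; thread 0's duration is taken as an approximation of the node time.
struct ggml_tensor * node = cgraph->nodes[node_n];
const int64_t t_start_us = ggml_time_us();
ggml_compute_forward(&params, node);
const int64_t t_end_us = ggml_time_us();
if (params.ith == 0 && node->op == GGML_OP_MUL_MAT) {
    // print src1 (activation) and src0 (weight) shapes, matching the log format shown below
    const struct ggml_tensor * s0 = node->src[0];
    const struct ggml_tensor * s1 = node->src[1];
    printf("X*Y (%lld %lld %lld %lld) * (%lld %lld %lld %lld): %.6f ms\n",
           (long long) s1->ne[0], (long long) s1->ne[1], (long long) s1->ne[2], (long long) s1->ne[3],
           (long long) s0->ne[0], (long long) s0->ne[1], (long long) s0->ne[2], (long long) s0->ne[3],
           (t_end_us - t_start_us) / 1000.0);
}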
I wrote a Python script to analyze the output (ana_log.py, invoked as python ana_log.py op.txt avg), which shows:
X*Y (896 1 1 1) * (896 151936 1 1): 4.616333 ms
That is, multiplying the FP32 activation tensor of shape (896 1 1 1) by the FP16 weight matrix of shape (896 151936 1 1) (the output projection from the 896-dimensional hidden state to the 151936-entry vocabulary) takes 4.616333 ms on average.
To reproduce this operation in isolation, I built a minimal test graph with GGML in tests/mytest.cpp (registered in tests/CMakeLists.txt):
#include <ggml-cpu.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/resource.h>
#include "ggml.h"
#include "time_helper.hpp"

void test_f16_matmul() {
    ggml_backend_load_all();

    struct ggml_init_params params = {
        .mem_size   = 1280 * 1024 * 1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int64_t neC[2] = {896, 896};
    const int64_t neD[2] = {896, 1};
    const int64_t neA[2] = {896, 151936};

    struct ggml_tensor * C = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, neC[0], neC[1]);
    struct ggml_tensor * D = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, neD[0], neD[1]);
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, neA[0], neA[1]);

    // Fill tensors with random values
    for (int i = 0; i < ggml_nelements(C); ++i) {
        ggml_set_f32_1d(C, i, ((float)rand() / RAND_MAX) * 2.0f - 1.0f);
    }
    for (int i = 0; i < ggml_nelements(D); ++i) {
        ggml_set_f32_1d(D, i, ((float)rand() / RAND_MAX) * 2.0f - 1.0f);
    }
    for (int i = 0; i < ggml_nelements(A); ++i) {
        ggml_set_f32_1d(A, i, ((float)rand() / RAND_MAX) * 2.0f - 1.0f);
    }

    struct ggml_tensor * B = ggml_mul_mat(ctx, C, D);
    struct ggml_tensor * E = ggml_mul_mat(ctx, A, B);

    // Build and compute the graph
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, E);
    ggml_graph_compute_with_ctx(ctx, gf, 4);

    ggml_free(ctx);
}

int main() {
    for (int i = 0; i < 6; i++) {
        test_f16_matmul();
        sleep(1);
    }
    return 0;
}
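Independently of the patched ggml-cpu.c log, the graph evaluation can also be timed from the caller's side as a cross-check. A minimal sketch (it would wrap the ggml_graph_compute_with_ctx call inside test_f16_matmul; note that it measures the small C*D matmul together with the large A*B one):

// Sketch: wall-clock timing around the whole graph computation.
// CLOCK_MONOTONIC is not affected by wall-clock adjustments; <time.h> and
// <stdio.h> are already included above.
struct timespec t0, t1;
clock_gettime(CLOCK_MONOTONIC, &t0);
ggml_graph_compute_with_ctx(ctx, gf, 4);
clock_gettime(CLOCK_MONOTONIC, &t1);
const double elapsed_ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) / 1e6;
printf("graph compute: %.3f ms\n", elapsed_ms);

Comparing this caller-side number with the per-op log shows how much of the time is spent inside the mul_mat kernels themselves versus around them.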
Run Command on Device via ADB:
cd /data/local/tmp/ggml/build-ggml && chmod +x mytest && export LD_LIBRARY_PATH=:/data/local/tmp/ggml/build-ggml && taskset 0f ./mytest > ggml_op.txt
The result from running this standalone test shows:
X*Y (896 1 1 1) * (896 151936 1 1): 9.072333 ms
Multiple runs consistently show around 8–9 ms, whereas the same operation inside llama.cpp takes only ~4.6 ms. I verified through debugging that both versions execute the same underlying code path.
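For scale, a back-of-the-envelope estimate (my own numbers, not from the logs): a single-token mat-vec has to stream essentially the whole FP16 weight matrix once, so the two timings imply rather different effective memory bandwidth:

896 × 151936 × 2 bytes ≈ 272 MB of FP16 weights
272 MB / 4.6 ms ≈ 59 GB/s        272 MB / 9.1 ms ≈ 30 GB/s

If the operation is memory-bound, as large FP16 mat-vecs usually are, the question is effectively why the standalone test reaches only about half the weight-streaming bandwidth.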
Why does the mul_mat operation in llama.cpp run significantly faster than in my standalone test?
This issue has been puzzling me for two weeks now. I would greatly appreciate any insight you can offer!
Thank you for taking the time to read through this.