Changes from all commits
3 changes: 0 additions & 3 deletions ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -1976,9 +1976,6 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
break;

case GGML_TYPE_F16:
if (!opt_experimental) {
return false;
}
break;

default:
65 changes: 37 additions & 28 deletions ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -903,7 +903,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
const float * restrict vy = (const float * restrict) y;

for (uint32_t i = 0; i < n; i++) {
rsum += vx[i] * (__fp16) vy[i];
rsum += (float)vx[i] * vy[i];
}
*s = rsum;
return;
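The one-line change above is the heart of the precision fix: the old scalar path cast the f32 operand down to __fp16 before multiplying, throwing away mantissa bits, while the new code widens the f16 operand up to f32 and accumulates in f32. A minimal standalone sketch of the corrected reference behavior (function name hypothetical):

    // Scalar reference: widen f16 -> f32 per element, accumulate in f32.
    static float dot_f16_f32_ref(int n, const __fp16 * x, const float * y) {
        float sum = 0.0f;
        for (int i = 0; i < n; i++) {
            sum += (float) x[i] * y[i]; // was: x[i] * (__fp16) y[i]
        }
        return sum;
    }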
@@ -917,7 +917,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri

// for some reason we need volatile here so that the compiler doesn't try anything funky
volatile HVX_Vector rsum = Q6_V_vsplat_R(0);

float r_sum_scalar = 0.0f;
uint32_t i = 0;

for (i = 0; i < nv0; i++) {
@@ -926,31 +926,42 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
HVX_Vector x = vx[i];
HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0

HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
// NOTE: need volatile here to prevent compiler optimization;
// the compiler does not seem to guarantee read-after-write ordering otherwise.
volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));

HVX_Vector sum = Q6_Vqf32_vadd_Vqf32Vqf32(hi, lo);
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
}

if (nv1) {
HVX_VectorPair yp = vy[i];
// HVX_VectorPair yp = vy[i];

HVX_Vector x = vx[i];
HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0
// HVX_Vector x = vx[i];
// HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0

if (nv1 >= 32) {
HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
nv1 -= 32;
}
// if (nv1 >= 32) {
// volatile HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp));
// rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, hi);
// nv1 -= 32;
// }

// rsum = hvx_vec_qf32_reduce_sum(rsum);

// if (nv1) {
// volatile HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
// HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1);
// rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
// }

// process the remainder using a scalar loop
rsum = hvx_vec_qf32_reduce_sum(rsum);
const __fp16 * restrict sx = (const __fp16 * restrict) x;
const float * restrict sy = (const float * restrict) y;

if (nv1) {
HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp));
HVX_Vector sum = hvx_vec_qf32_reduce_sum_n(lo, nv1);
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, sum);
for (uint32_t i = nv0 * 64; i < n; i++) {
r_sum_scalar += (float) sx[i] * sy[i];
}

// hvx_vec_dump_fp16("X", x);
@@ -961,7 +972,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri
rsum = hvx_vec_qf32_reduce_sum(rsum);
}

*s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum));
*s = hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(rsum)) + r_sum_scalar;

# ifdef HTP_DEBUG
{
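The restructuring above drops the masked-vector remainder path (kept as commented-out code) in favor of a scalar tail: the HVX loop covers only the nv0 whole 64-element f16 vectors, rsum is reduced to a single lane once, and the last n % 64 elements are accumulated into r_sum_scalar and folded in at the final store. A hedged sketch of the resulting control flow, with the HVX intrinsics modeled as plain arithmetic (shape inferred from the diff, not the literal kernel):

    // Whole 64-lane blocks stand in for the HVX loop; the tail runs scalar.
    static float dot_shape(uint32_t n, const __fp16 * x, const float * y) {
        const uint32_t nv0 = n / 64;   // whole HVX f16 vectors (64 lanes each)
        float vec_sum = 0.0f;          // stands in for the reduced qf32 rsum
        for (uint32_t i = 0; i < nv0 * 64; i++) {
            vec_sum += (float) x[i] * y[i];
        }
        float tail_sum = 0.0f;         // r_sum_scalar in the real code
        for (uint32_t i = nv0 * 64; i < n; i++) {
            tail_sum += (float) x[i] * y[i];
        }
        return vec_sum + tail_sum;     // *s = get_fp32(rsum) + r_sum_scalar
    }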
@@ -1498,9 +1509,6 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();

const size_t src0_row_size = sizeof(__fp16) * ne00;
const size_t src1_row_size = sizeof(float) * ne10;

assert(ne12 % ne02 == 0);
assert(ne13 % ne03 == 0);

@@ -1510,8 +1518,6 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
// This is the size of the rest of the dimensions of the result
const uint32_t nr1 = ne1 * ne2 * ne3;

uint32_t chunk_size = 64;

// distribute the thread work across the inner or outer loop based on which one is larger
uint32_t nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
uint32_t nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
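A worked example of the heuristic (numbers hypothetical): with nr0 = 4096 src0 rows, nr1 = 2, and nth = 6 worker threads, nchunk0 = 6 and nchunk1 = 1, so each thread takes a chunk of the 4096 rows instead of six threads contending over two src1 rows.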
@@ -1544,11 +1550,11 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
const uint32_t blck_0 = 64;
const uint32_t blck_1 = 64;

float tmp[32];
__attribute__((aligned(128))) float tmp[64];

for (uint32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
for (uint32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
for (uint32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1++) {
for (uint32_t ir1 = iir1; ir1 < MIN(iir1 + blck_1, ir1_end); ir1++) {
const uint32_t i13 = (ir1 / (ne12 * ne1));
const uint32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
const uint32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
@@ -1561,13 +1567,16 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
const uint32_t i2 = i12;
const uint32_t i3 = i13;

const uint8_t * restrict src0_row = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03);
const uint8_t * restrict src0_base = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03);
const uint8_t * restrict src1_col =
(const uint8_t *) src1->data + (i11 + i12 * ne11 + i13 * ne12 * ne11) * src1_row_size;
(const uint8_t *) src1->data + (i11 * nb11 + i12 * nb12 + i13 * nb13);
float * dst_col = (float *) ((uint8_t * restrict) dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));

for (uint32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0++) {
vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row + ir0 * src0_row_size, src1_col);
const uint32_t ir0_block_end = MIN(iir0 + blck_0, ir0_end);
for (uint32_t ir0 = iir0; ir0 < ir0_block_end; ir0++) {
// Use nb01 stride for non-contiguous src0 support
const uint8_t * restrict src0_row = src0_base + ir0 * nb01;
vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row, src1_col);
}

hvx_copy_fp32_ua((uint8_t *) &dst_col[iir0], (uint8_t *) tmp, MIN(iir0 + blck_0, ir0_end) - iir0);
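Two fixes land in this loop. First, tmp was declared as float tmp[32] while blck_0 is 64, so a full inner block could write up to 64 partial results past the end of the buffer; it is now sized to 64 entries and 128-byte aligned, presumably so the source side of hvx_copy_fp32_ua is HVX-vector aligned. Second, src0 rows and src1 columns are now addressed through the tensor byte strides (nb01, and nb11/nb12/nb13) instead of precomputed contiguous row sizes, which is what makes the removed src0_row_size/src1_row_size constants dead (the unused chunk_size local was dropped in the same spirit). A condensed sketch of the strided inner loop (shape taken from the diff):

    // For contiguous src0, nb01 == sizeof(__fp16) * ne00 and both forms agree;
    // for non-contiguous (e.g. permuted) src0 only the nb01 stride is correct.
    const uint32_t ir0_block_end = MIN(iir0 + blck_0, ir0_end);
    for (uint32_t ir0 = iir0; ir0 < ir0_block_end; ir0++) {
        const uint8_t * restrict src0_row = src0_base + ir0 * nb01;
        vec_dot_f16_f32(ne00, &tmp[ir0 - iir0], src0_row, src1_col);
    }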
65 changes: 65 additions & 0 deletions scripts/snapdragon/adb/run-mtmd.sh
@@ -0,0 +1,65 @@
#!/bin/sh
#

# Basedir on device
basedir=/data/local/tmp/llama.cpp

cli_opts=

branch=.
[ "$B" != "" ] && branch=$B

adbserial=
[ "$S" != "" ] && adbserial="-s $S"

model="gemma-3-4b-it-Q4_0.gguf"
[ "$M" != "" ] && model="$M"

mmproj="mmproj-F16.gguf"
[ "$MMPROJ" != "" ] && mmproj="$MMPROJ"

image=
[ "$IMG" != "" ] && image="$IMG"

device="HTP0"
[ "$D" != "" ] && device="$D"

verbose=
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V"

experimental="GGML_HEXAGON_EXPERIMENTAL=1"
[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"

sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"

profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"

opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"

nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

ndev=
[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"

# MTMD backend device for vision model (defaults to CPU if not set)
mtmd_backend=
[ "$MTMD_DEVICE" != "" ] && mtmd_backend="MTMD_BACKEND_DEVICE=$MTMD_DEVICE"

set -x

adb $adbserial shell " \
cd $basedir; ulimit -c unlimited; \
LD_LIBRARY_PATH=$basedir/$branch/lib \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$verbose $experimental $sched $opmask $profile $nhvx $ndev $mtmd_backend \
./$branch/bin/llama-mtmd-cli --no-mmap -m $basedir/../gguf/$model \
--mmproj $basedir/../gguf/$mmproj \
--image $basedir/../gguf/$image \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \
-ngl 99 --device $device -v $cli_opts $@ \
"