Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,9 @@ IF(XNNPACK_TARGET_PROCESSOR STREQUAL "arm")
IF(XNNPACK_ENABLE_ARM_DOTPROD AND XNNPACK_ENABLE_ARM_FP16_VECTOR)
LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS})
ENDIF()
IF(XNNPACK_ENABLE_ARM_BF16)
LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_NEONBF16_MICROKERNEL_SRCS})
ENDIF()
LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_ARMSIMD32_MICROKERNEL_SRCS})
LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_NEON_MICROKERNEL_SRCS})
LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_NEONFP16_MICROKERNEL_SRCS})
Expand Down Expand Up @@ -693,6 +696,9 @@ IF(XNNPACK_TARGET_PROCESSOR MATCHES "^arm64")
IF(XNNPACK_ENABLE_ARM_SME2)
LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_NEONSME2_MICROKERNEL_SRCS})
ENDIF()
IF(XNNPACK_ENABLE_ARM_BF16)
LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_NEONBF16_MICROKERNEL_SRCS})
ENDIF()
LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_NEON_MICROKERNEL_SRCS})
LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_NEONFP16_MICROKERNEL_SRCS})
LIST(APPEND NON_PROD_MICROKERNEL_SRCS ${NON_PROD_NEONFMA_MICROKERNEL_SRCS})
Expand Down
12 changes: 12 additions & 0 deletions bench/vunary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,16 @@ struct UniformDistribution<xnn_float16> {
}
};

template <>
struct UniformDistribution<xnn_bfloat16> {
std::uniform_real_distribution<float> dist{-10.0f, 10.0f};

template <class Generator>
xnn_bfloat16 operator()(Generator& g) {
return xnn_bfloat16_from_float(dist(g));
}
};

template <>
struct UniformDistribution<int8_t> {
std::uniform_int_distribution<int32_t> dist{
Expand Down Expand Up @@ -287,6 +297,8 @@ void vlrelu(benchmark::State& state, uint64_t arch_flags,
->Apply(benchmark::utils::UnaryElementwiseParameters<datatype_in, \
datatype_out>) \
->UseRealTime();
#include "src/bf16-f32-vcvt/bf16-f32-vcvt.inc"
#include "src/f32-bf16-vcvt/f32-bf16-vcvt.inc"
#include "src/f16-f32-vcvt/f16-f32-vcvt.inc"
#include "src/f16-qs8-vcvt/f16-qs8-vcvt.inc"
#include "src/f32-f16-vcvt/f32-f16-vcvt.inc"
Expand Down
8 changes: 8 additions & 0 deletions cmake/gen/neon_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@


SET(PROD_NEON_MICROKERNEL_SRCS
src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u8.c
src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u16.c
src/f32-argmaxpool/f32-argmaxpool-9p8x-neon-c4.c
src/f32-avgpool/gen/f32-avgpool-9p-minmax-neon-u4.c
src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neon-u8.c
src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-neon-2x2.c
src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-neon.c
src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-neon.c
Expand Down Expand Up @@ -185,12 +187,18 @@ SET(PROD_NEON_MICROKERNEL_SRCS

SET(NON_PROD_NEON_MICROKERNEL_SRCS
src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u8.c
src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u16.c
src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u24.c
src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u32.c
src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u24.c
src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u32.c
src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u8.c
src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u16.c
src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u24.c
src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u32.c
src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neon-u16.c
src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neon-u24.c
src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neon-u32.c
src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x4-neon-2x1.c
src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x4-neon-2x2.c
src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x8-neon-2x1.c
Expand Down
6 changes: 4 additions & 2 deletions cmake/gen/neonbf16_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
# Generator: tools/update-microkernels.py


SET(PROD_NEONBF16_MICROKERNEL_SRCS)
SET(PROD_NEONBF16_MICROKERNEL_SRCS
src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neonbf16-u16.c)

SET(NON_PROD_NEONBF16_MICROKERNEL_SRCS
src/bf16-gemm/gen/bf16-gemm-1x4c8-minmax-neonbf16-bfdot.c
Expand All @@ -25,6 +26,7 @@ SET(NON_PROD_NEONBF16_MICROKERNEL_SRCS
src/bf16-gemm/gen/bf16-gemm-5x4c8-minmax-neonbf16-bfdot.c
src/bf16-gemm/gen/bf16-gemm-5x4c8-minmax-neonbf16-bfmlal.c
src/bf16-gemm/gen/bf16-gemm-5x8c2-minmax-neonbf16-bfdot-lane-ld128.c
src/bf16-gemm/gen/bf16-gemm-6x8c2-minmax-neonbf16-bfdot-lane-ld128.c)
src/bf16-gemm/gen/bf16-gemm-6x8c2-minmax-neonbf16-bfdot-lane-ld128.c
src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neonbf16-u8.c)

SET(ALL_NEONBF16_MICROKERNEL_SRCS ${PROD_NEONBF16_MICROKERNEL_SRCS} + ${NON_PROD_NEONBF16_MICROKERNEL_SRCS})
8 changes: 8 additions & 0 deletions gen/neon_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
#

PROD_NEON_MICROKERNEL_SRCS = [
"src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u8.c",
"src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u16.c",
"src/f32-argmaxpool/f32-argmaxpool-9p8x-neon-c4.c",
"src/f32-avgpool/gen/f32-avgpool-9p-minmax-neon-u4.c",
"src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neon-u8.c",
"src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-neon-2x2.c",
"src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-neon.c",
"src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-neon.c",
Expand Down Expand Up @@ -188,6 +190,12 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [
"src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u16.c",
"src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u24.c",
"src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u32.c",
"src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u16.c",
"src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u24.c",
"src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u32.c",
"src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neon-u16.c",
"src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neon-u24.c",
"src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neon-u32.c",
"src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x4-neon-2x1.c",
"src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x4-neon-2x2.c",
"src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x8-neon-2x1.c",
Expand Down
2 changes: 2 additions & 0 deletions gen/neonbf16_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#

PROD_NEONBF16_MICROKERNEL_SRCS = [
"src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neonbf16-u16.c",
]

NON_PROD_NEONBF16_MICROKERNEL_SRCS = [
Expand All @@ -23,6 +24,7 @@ NON_PROD_NEONBF16_MICROKERNEL_SRCS = [
"src/bf16-gemm/gen/bf16-gemm-5x4c8-minmax-neonbf16-bfmlal.c",
"src/bf16-gemm/gen/bf16-gemm-5x8c2-minmax-neonbf16-bfdot-lane-ld128.c",
"src/bf16-gemm/gen/bf16-gemm-6x8c2-minmax-neonbf16-bfdot-lane-ld128.c",
"src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neonbf16-u8.c",
]

ALL_NEONBF16_MICROKERNEL_SRCS = PROD_NEONBF16_MICROKERNEL_SRCS + NON_PROD_NEONBF16_MICROKERNEL_SRCS
6 changes: 6 additions & 0 deletions scripts/generate-bf16-f32-vcvt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

################################### ARM NEON ##################################
tools/xngen src/bf16-f32-vcvt/neon.c.in -D BATCH_TILE=8 -o src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u8.c &
tools/xngen src/bf16-f32-vcvt/neon.c.in -D BATCH_TILE=16 -o src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u16.c &
tools/xngen src/bf16-f32-vcvt/neon.c.in -D BATCH_TILE=24 -o src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u24.c &
tools/xngen src/bf16-f32-vcvt/neon.c.in -D BATCH_TILE=32 -o src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u32.c &

#################################### Scalar ###################################
tools/xngen src/bf16-f32-vcvt/scalar.c.in -D BATCH_TILE=1 -o src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u1.c &
tools/xngen src/bf16-f32-vcvt/scalar.c.in -D BATCH_TILE=2 -o src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u2.c &
Expand Down
12 changes: 11 additions & 1 deletion scripts/generate-f32-bf16-vcvt.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
#!/bin/sh
# Copyright 2021 Google LLC
# Copyright 2026 Google LLC
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

################################### ARM NEON ##################################
tools/xngen src/f32-bf16-vcvt/neon.c.in -D BATCH_TILE=8 -o src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neon-u8.c &
tools/xngen src/f32-bf16-vcvt/neon.c.in -D BATCH_TILE=16 -o src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neon-u16.c &
tools/xngen src/f32-bf16-vcvt/neon.c.in -D BATCH_TILE=24 -o src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neon-u24.c &
tools/xngen src/f32-bf16-vcvt/neon.c.in -D BATCH_TILE=32 -o src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neon-u32.c &

################################# ARM NEON BF16 ###############################
tools/xngen src/f32-bf16-vcvt/neonbf16.c.in -D BATCH_TILE=8 -o src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neonbf16-u8.c &
tools/xngen src/f32-bf16-vcvt/neonbf16.c.in -D BATCH_TILE=16 -o src/f32-bf16-vcvt/gen/f32-bf16-vcvt-neonbf16-u16.c &

#################################### Scalar ###################################
tools/xngen src/f32-bf16-vcvt/scalar.c.in -D BATCH_TILE=1 -o src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u1.c &
tools/xngen src/f32-bf16-vcvt/scalar.c.in -D BATCH_TILE=2 -o src/f32-bf16-vcvt/gen/f32-bf16-vcvt-scalar-u2.c &
Expand Down
7 changes: 7 additions & 0 deletions src/bf16-f32-vcvt/bf16-f32-vcvt.inc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
XNN_UKERNEL(xnn_arch_arm_neon, xnn_bf16_f32_vcvt_ukernel__neon_u8, 8, false, xnn_bfloat16, float, void, NULL)
XNN_UKERNEL(xnn_arch_arm_neon, xnn_bf16_f32_vcvt_ukernel__neon_u16, 16, false, xnn_bfloat16, float, void, NULL)
XNN_UKERNEL(xnn_arch_arm_neon, xnn_bf16_f32_vcvt_ukernel__neon_u24, 24, false, xnn_bfloat16, float, void, NULL)
XNN_UKERNEL(xnn_arch_arm_neon, xnn_bf16_f32_vcvt_ukernel__neon_u32, 32, false, xnn_bfloat16, float, void, NULL)
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64

XNN_UKERNEL(xnn_arch_none, xnn_bf16_f32_vcvt_ukernel__scalar_u1, 1, false, xnn_bfloat16, float, void, NULL)
XNN_UKERNEL(xnn_arch_none, xnn_bf16_f32_vcvt_ukernel__scalar_u2, 2, false, xnn_bfloat16, float, void, NULL)
XNN_UKERNEL(xnn_arch_none, xnn_bf16_f32_vcvt_ukernel__scalar_u3, 3, false, xnn_bfloat16, float, void, NULL)
Expand Down
74 changes: 74 additions & 0 deletions src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u16.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/bf16-f32-vcvt/neon.c.in
// Generator: tools/xngen
//
// Copyright 2026 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <arm_neon.h>
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include "src/xnnpack/common.h"
#include "src/xnnpack/vcvt.h"


void xnn_bf16_f32_vcvt_ukernel__neon_u16(
size_t batch,
const xnn_bfloat16* input,
float* output,
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(xnn_bfloat16) == 0);
assert(input != NULL);
assert(output != NULL);

const uint16_t* i = (const uint16_t*) input;
for (; batch >= 16 * sizeof(xnn_bfloat16); batch -= 16 * sizeof(xnn_bfloat16)) {
const uint16x8_t vbf0 = vld1q_u16(i); i += 8;
const uint16x8_t vbf1 = vld1q_u16(i); i += 8;

const float32x4_t vf0 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(vbf0), 16));
const float32x4_t vf1 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(vbf0), 16));
const float32x4_t vf2 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(vbf1), 16));
const float32x4_t vf3 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(vbf1), 16));

vst1q_f32(output, vf0); output += 4;
vst1q_f32(output, vf1); output += 4;
vst1q_f32(output, vf2); output += 4;
vst1q_f32(output, vf3); output += 4;
}
for (; batch >= 8 * sizeof(xnn_bfloat16); batch -= 8 * sizeof(xnn_bfloat16)) {
const uint16x8_t vbf = vld1q_u16(i); i += 8;

const float32x4_t vf_lo = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(vbf), 16));
const float32x4_t vf_hi = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(vbf), 16));

vst1q_f32(output, vf_lo); output += 4;
vst1q_f32(output, vf_hi); output += 4;
}
if XNN_UNLIKELY(batch != 0) {
assert(batch >= 1 * sizeof(xnn_bfloat16));
assert(batch <= 7 * sizeof(xnn_bfloat16));
const uint16x8_t vbf = vld1q_u16(i);

float32x4_t vf = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(vbf), 16));
if (batch & (4 * sizeof(xnn_bfloat16))) {
vst1q_f32(output, vf); output += 4;
vf = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(vbf), 16));
}
float32x2_t vf_lo = vget_low_f32(vf);
if (batch & (2 * sizeof(xnn_bfloat16))) {
vst1_f32(output, vf_lo); output += 2;
vf_lo = vget_high_f32(vf);
}
if (batch & (1 * sizeof(xnn_bfloat16))) {
vst1_lane_f32(output, vf_lo, 0);
}
}
}
79 changes: 79 additions & 0 deletions src/bf16-f32-vcvt/gen/bf16-f32-vcvt-neon-u24.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/bf16-f32-vcvt/neon.c.in
// Generator: tools/xngen
//
// Copyright 2026 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <arm_neon.h>
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include "src/xnnpack/common.h"
#include "src/xnnpack/vcvt.h"


void xnn_bf16_f32_vcvt_ukernel__neon_u24(
size_t batch,
const xnn_bfloat16* input,
float* output,
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(xnn_bfloat16) == 0);
assert(input != NULL);
assert(output != NULL);

const uint16_t* i = (const uint16_t*) input;
for (; batch >= 24 * sizeof(xnn_bfloat16); batch -= 24 * sizeof(xnn_bfloat16)) {
const uint16x8_t vbf0 = vld1q_u16(i); i += 8;
const uint16x8_t vbf1 = vld1q_u16(i); i += 8;
const uint16x8_t vbf2 = vld1q_u16(i); i += 8;

const float32x4_t vf0 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(vbf0), 16));
const float32x4_t vf1 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(vbf0), 16));
const float32x4_t vf2 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(vbf1), 16));
const float32x4_t vf3 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(vbf1), 16));
const float32x4_t vf4 = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(vbf2), 16));
const float32x4_t vf5 = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(vbf2), 16));

vst1q_f32(output, vf0); output += 4;
vst1q_f32(output, vf1); output += 4;
vst1q_f32(output, vf2); output += 4;
vst1q_f32(output, vf3); output += 4;
vst1q_f32(output, vf4); output += 4;
vst1q_f32(output, vf5); output += 4;
}
for (; batch >= 8 * sizeof(xnn_bfloat16); batch -= 8 * sizeof(xnn_bfloat16)) {
const uint16x8_t vbf = vld1q_u16(i); i += 8;

const float32x4_t vf_lo = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(vbf), 16));
const float32x4_t vf_hi = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(vbf), 16));

vst1q_f32(output, vf_lo); output += 4;
vst1q_f32(output, vf_hi); output += 4;
}
if XNN_UNLIKELY(batch != 0) {
assert(batch >= 1 * sizeof(xnn_bfloat16));
assert(batch <= 7 * sizeof(xnn_bfloat16));
const uint16x8_t vbf = vld1q_u16(i);

float32x4_t vf = vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(vbf), 16));
if (batch & (4 * sizeof(xnn_bfloat16))) {
vst1q_f32(output, vf); output += 4;
vf = vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(vbf), 16));
}
float32x2_t vf_lo = vget_low_f32(vf);
if (batch & (2 * sizeof(xnn_bfloat16))) {
vst1_f32(output, vf_lo); output += 2;
vf_lo = vget_high_f32(vf);
}
if (batch & (1 * sizeof(xnn_bfloat16))) {
vst1_lane_f32(output, vf_lo, 0);
}
}
}
Loading
Loading