src/CodeGen_Internal.cpp

#include "CodeGen_Internal.h"
#include "CSE.h"
#include "Debug.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "IntegerDivisionTable.h"
#include "LLVM_Headers.h"
#include "Simplify.h"
#include "runtime/constants.h"

namespace Halide {
namespace Internal {

using std::string;

using namespace llvm;

// Returns the element type of `t` when `t` is a vector type; otherwise
// returns `t` itself.
llvm::Type *get_vector_element_type(llvm::Type *t) {
    if (t->isVectorTy()) {
        // isVectorTy() already established the dynamic type, so use the
        // asserting cast<> instead of dyn_cast<>, whose possibly-null result
        // was being dereferenced without a check.
        return llvm::cast<llvm::VectorType>(t)->getElementType();
    }
    return t;
}

// Reports whether `name` is one of the Halide runtime functions whose first
// parameter is a user_context pointer.
bool function_takes_user_context(const std::string &name) {
    // Every halide_error_* function takes a user context, so that family is
    // matched by prefix rather than being listed one by one.
    if (starts_with(name, "halide_error_")) {
        return true;
    }

    // The remaining functions are matched by exact name.
    static const char *const known_funcs[] = {
        "halide_buffer_copy",
        "halide_copy_to_host",
        "halide_copy_to_device",
        "halide_current_time_ns",
        "halide_debug_to_file",
        "halide_device_free",
        "halide_device_host_nop_free",
        "halide_device_free_as_destructor",
        "halide_device_and_host_free",
        "halide_device_and_host_free_as_destructor",
        "halide_device_malloc",
        "halide_device_and_host_malloc",
        "halide_device_sync",
        "halide_device_sync_global",
        "halide_do_par_for",
        "halide_do_loop_task",
        "halide_do_task",
        "halide_do_async_consumer",
        "halide_error",
        "halide_free",
        "halide_malloc",
        "halide_print",
        "halide_profiler_memory_allocate",
        "halide_profiler_memory_free",
        "halide_profiler_instance_start",
        "halide_profiler_instance_end",
        "halide_profiler_stack_peak_update",
        "halide_spawn_thread",
        "halide_device_release",
        "halide_start_clock",
        "halide_trace",
        "halide_trace_helper",
        "halide_memoization_cache_lookup",
        "halide_memoization_cache_store",
        "halide_memoization_cache_release",
        "halide_cuda_run",
        "halide_opencl_run",
        "halide_metal_run",
        "halide_d3d12compute_run",
        "halide_vulkan_run",
        "halide_webgpu_run",
        "halide_msan_annotate_buffer_is_initialized_as_destructor",
        "halide_msan_annotate_buffer_is_initialized",
        "halide_msan_annotate_memory_is_initialized",
        "halide_msan_check_buffer_is_initialized",
        "halide_msan_check_memory_is_initialized",
        "halide_hexagon_initialize_kernels",
        "halide_hexagon_run",
        "halide_hexagon_device_release",
        "halide_hexagon_get_module_state",
        "halide_hexagon_power_hvx_on",
        "halide_hexagon_power_hvx_on_mode",
        "halide_hexagon_power_hvx_on_perf",
        "halide_hexagon_power_hvx_off",
        "halide_hexagon_power_hvx_off_as_destructor",
        "halide_qurt_hvx_lock",
        "halide_qurt_hvx_unlock",
        "halide_qurt_hvx_unlock_as_destructor",
        "halide_vtcm_malloc",
        "halide_vtcm_free",
        "halide_cuda_initialize_kernels",
        "halide_opencl_initialize_kernels",
        "halide_metal_initialize_kernels",
        "halide_d3d12compute_initialize_kernels",
        "halide_vulkan_initialize_kernels",
        "halide_webgpu_initialize_kernels",
        "halide_get_gpu_device",
        "_halide_buffer_crop",
        "_halide_buffer_retire_crop_after_extern_stage",
        "_halide_buffer_retire_crops_after_extern_stage",
        "_halide_hexagon_do_par_for",
    };
    const size_t num_known = sizeof(known_funcs) / sizeof(known_funcs[0]);
    for (size_t i = 0; i < num_known; i++) {
        if (name == known_funcs[i]) {
            return true;
        }
    }
    return false;
}

// Returns true when an allocation of `size` bytes does not exceed the
// runtime's maximum_stack_allocation_bytes constant. A non-positive `size`
// is reported through user_assert.
bool can_allocation_fit_on_stack(int64_t size) {
    user_assert(size > 0) << "Allocation size should be a positive number\n";
    const int64_t stack_limit =
        (int64_t)Runtime::Internal::Constants::maximum_stack_allocation_bytes;
    return size <= stack_limit;
}

// Lowers the integer division a / b (a and b must have the same,
// non-floating-point type) into cheaper operations where possible:
//   - a constant power-of-two divisor becomes a right shift;
//   - a small constant divisor (2..255, or 2..127 for int8) of an 8-, 16- or
//     32-bit type becomes a multiply-and-shift driven by the
//     IntegerDivision tables;
//   - otherwise the result is a div_round_to_zero intrinsic (when
//     round_to_zero is set) or whatever lower_euclidean_div produces.
// round_to_zero selects rounding towards zero instead of the default
// rounding used by the non-round_to_zero paths.
Expr lower_int_uint_div(const Expr &a, const Expr &b, bool round_to_zero) {
    // Detect if it's a small int division
    internal_assert(a.type() == b.type());
    auto const_int_divisor = as_const_int(b);
    auto const_uint_divisor = as_const_uint(b);

    Type t = a.type();
    internal_assert(!t.is_float())
        << "lower_int_uint_div is not meant to handle floating-point case.\n";

    if (auto shift_amount = is_const_power_of_two_integer(b)) {
        // The rounding fudge below is only meaningful for signed values.
        // For unsigned values a plain right shift is already exact, and the
        // fudge would add one to any value with the top bit set (the shift
        // by bits - 1 yields 1 rather than an all-ones mask), producing a
        // wrong quotient or wrapping around.
        if (round_to_zero && t.is_int()) {
            Expr result = a;
            // Normally a right-shift isn't right for division rounding to
            // zero. It does the wrong thing for negative values. Add a fudge so
            // that a right-shift becomes correct.
            result += (result >> (t.bits() - 1)) & (b - 1);
            return result >> *shift_amount;
        } else {
            return a >> make_const(UInt(a.type().bits()), *shift_amount);
        }
    } else if (const_int_divisor &&
               t.is_int() &&
               (t.bits() == 8 || t.bits() == 16 || t.bits() == 32) &&
               *const_int_divisor > 1 &&
               ((t.bits() > 8 && *const_int_divisor < 256) || *const_int_divisor < 128)) {

        // Look up the multiplier and shift for this divisor, bit width and
        // rounding mode.
        int64_t multiplier;
        int shift;
        if (t.bits() == 32) {
            if (round_to_zero) {
                multiplier = IntegerDivision::table_srz32[*const_int_divisor][2];
                shift = IntegerDivision::table_srz32[*const_int_divisor][3];
            } else {
                multiplier = IntegerDivision::table_s32[*const_int_divisor][2];
                shift = IntegerDivision::table_s32[*const_int_divisor][3];
            }
        } else if (t.bits() == 16) {
            if (round_to_zero) {
                multiplier = IntegerDivision::table_srz16[*const_int_divisor][2];
                shift = IntegerDivision::table_srz16[*const_int_divisor][3];
            } else {
                multiplier = IntegerDivision::table_s16[*const_int_divisor][2];
                shift = IntegerDivision::table_s16[*const_int_divisor][3];
            }
        } else {
            // 8 bit
            if (round_to_zero) {
                multiplier = IntegerDivision::table_srz8[*const_int_divisor][2];
                shift = IntegerDivision::table_srz8[*const_int_divisor][3];
            } else {
                multiplier = IntegerDivision::table_s8[*const_int_divisor][2];
                shift = IntegerDivision::table_s8[*const_int_divisor][3];
            }
        }
        Expr num = a;

        // Make an all-ones mask if the numerator is negative
        Type num_as_uint_t = num.type().with_code(Type::UInt);
        Expr sign = cast(num_as_uint_t, num >> make_const(UInt(t.bits()), t.bits() - 1));

        // If the numerator is negative, we want to either flip the bits (when
        // rounding to negative infinity), or negate the numerator (when
        // rounding to zero).
        if (round_to_zero) {
            num = abs(num);
        } else {
            // Flip the numerator bits if the mask is high.
            num = cast(num_as_uint_t, num);
            num = num ^ sign;
        }

        // Multiply and keep the high half of the
        // result, and then apply the shift.
        internal_assert(num.type().can_represent(multiplier));
        Expr mult = make_const(num.type(), multiplier);
        num = mul_shift_right(num, mult, shift + num.type().bits());

        // Maybe flip the bits back or negate again.
        num = cast(a.type(), num ^ sign);
        if (round_to_zero) {
            num -= sign;
        }

        return num;
    } else if (const_uint_divisor &&
               t.is_uint() &&
               (t.bits() == 8 || t.bits() == 16 || t.bits() == 32) &&
               *const_uint_divisor > 1 && *const_uint_divisor < 256) {

        // Look up the method, multiplier and shift for this divisor and bit
        // width.
        int64_t method, multiplier, shift;
        if (t.bits() == 32) {
            method = IntegerDivision::table_u32[*const_uint_divisor][1];
            multiplier = IntegerDivision::table_u32[*const_uint_divisor][2];
            shift = IntegerDivision::table_u32[*const_uint_divisor][3];
        } else if (t.bits() == 16) {
            method = IntegerDivision::table_u16[*const_uint_divisor][1];
            multiplier = IntegerDivision::table_u16[*const_uint_divisor][2];
            shift = IntegerDivision::table_u16[*const_uint_divisor][3];
        } else {
            method = IntegerDivision::table_u8[*const_uint_divisor][1];
            multiplier = IntegerDivision::table_u8[*const_uint_divisor][2];
            shift = IntegerDivision::table_u8[*const_uint_divisor][3];
        }

        internal_assert(method != 0)
            << "method 0 division is for powers of two and should have been handled elsewhere\n";
        const Expr &num = a;

        // Widen, multiply, narrow
        Expr mult = make_const(num.type(), multiplier);
        Expr val = mul_shift_right(num, mult, (method == 1 ? shift : 0) + num.type().bits());

        if (method == 2) {
            // Average with original numerator.
            val = Call::make(val.type(), Call::sorted_avg, {val, num}, Call::PureIntrinsic);
        } else if (method == 3) {
            // Average with original numerator, rounding up. This
            // method exists because this is cheaper than averaging
            // with the original numerator on x86, where there's an
            // average-round-up instruction (pavg), but no
            // average-round-down instruction. Using method 2,
            // sorted_avg lowers to three instructions on x86.
            //
            // On ARM and other architectures with both
            // average-round-up and average-round-down instructions
            // there's no reason to prefer either method 2 or method 3
            // over the other.
            val = rounding_halving_add(val, num);
        }

        // Do the final shift
        if (shift && (method == 2 || method == 3)) {
            val = val >> make_const(UInt(t.bits()), shift);
        }

        return val;
    } else if (round_to_zero) {
        // Return the input division unchanged.
        return Call::make(a.type(), Call::div_round_to_zero, {a, b}, Call::PureIntrinsic);
    } else {
        return lower_euclidean_div(a, b);
    }
}

// Lowers the integer modulus a % b (non-floating-point types only):
//   - a constant power-of-two divisor becomes a bitwise AND with b - 1;
//   - a small constant divisor (2..255, or 2..127 for int8) of an 8-, 16- or
//     32-bit type is rewritten as a - (a / b) * b;
//   - anything else goes through lower_euclidean_mod.
Expr lower_int_uint_mod(const Expr &a, const Expr &b) {
    // Detect if it's a small int modulus
    auto const_int_divisor = as_const_int(b);
    auto const_uint_divisor = as_const_uint(b);

    Type t = a.type();
    // The message previously named lower_int_uint_div (copy-paste), which
    // pointed at the wrong function when this assertion fired.
    internal_assert(!t.is_float())
        << "lower_int_uint_mod is not meant to handle floating-point case.\n";

    if (is_const_power_of_two_integer(b)) {
        return a & simplify(b - 1);
    } else if (const_int_divisor &&
               t.is_int() &&
               (t.bits() == 8 || t.bits() == 16 || t.bits() == 32) &&
               *const_int_divisor > 1 &&
               ((t.bits() > 8 && *const_int_divisor < 256) || *const_int_divisor < 128)) {
        // We can use our fast signed integer division
        return common_subexpression_elimination(a - (a / b) * b);
    } else if (const_uint_divisor &&
               t.is_uint() &&
               (t.bits() == 8 || t.bits() == 16 || t.bits() == 32) &&
               *const_uint_divisor > 1 && *const_uint_divisor < 256) {
        // We can use our fast unsigned integer division
        return common_subexpression_elimination(a - (a / b) * b);
    } else {
        // To match our definition of division, mod should be between 0
        // and |b|.
        return lower_euclidean_mod(a, b);
    }
}

namespace {
std::pair<Expr, Expr> unsigned_long_div_mod_round_to_zero(Expr &num, const Expr &den,
                                                          std::optional<uint64_t> upper_bound) {
    internal_assert(num.type() == den.type());
    internal_assert(num.type().is_uint());
    Type ty = num.type();
    Expr q = make_zero(ty);
    Expr leading_zeros = cast(ty, count_leading_zeros(den));
    // Each iteration of the loop below checks for a bit in the result.
    const int times = ty.bits();
    int start = 1;
    if (upper_bound) {
        // Set start to times - (index of most significant bit in max_val)
        // as for each iteration:
        //     (1 << shift) <= upper_bound
        start = times;
        uint64_t max_val = *upper_bound;
        while (max_val >>= 1) {
            --start;
        }
        debug(1) << "Max value for long division: " << *upper_bound
                 << ". Evaluate only first " << 1 + times - start << " bits.\n";
    }
    Expr r = num;
    for (int i = start; i <= times; i++) {
        // Check if the bit at 'shift' index should be set in the result.
        int shift = times - i;
        Expr shift_expr = make_const(ty, shift);
        Expr new_r = r - (den << shift_expr);
        // Don't drop any set bits from den after shift. The bit is set if
        // den << shift is no more than remainder.
        Expr bit_set = ((shift_expr <= leading_zeros) && r >= (den << shift_expr));
        // Update the  and the quotient.
        r = select(bit_set, new_r, r);
        q = select(bit_set, make_const(ty, uint64_t(1) << shift) | q, q);
    }
    return {q, r};
}
}  // namespace

std::pair<Expr, Expr> long_div_mod_round_to_zero(const Expr &num, const Expr &den,
                                                 std::optional<uint64_t> max_abs) {
    debug(1) << "Using long div: (num: " << num << "); (den: " << den << ")\n";
    internal_assert(num.type() == den.type());
    Expr abs_num = (num.type().is_int()) ? abs(num) : num;
    Expr abs_den = (den.type().is_int()) ? abs(den) : den;
    std::pair<Expr, Expr> qr = unsigned_long_div_mod_round_to_zero(abs_num, abs_den, max_abs);
    Expr q = qr.first;
    Expr r = qr.second;
    // Correct the signs for quotient and remainder for signed integer division.
    if (num.type().is_int()) {
        Expr num_neg = num >> make_const(UInt(num.type().bits()), (num.type().bits() - 1));
        Expr den_neg = den >> make_const(UInt(num.type().bits()), (num.type().bits() - 1));
        q = cast(num.type(), q) * ((num_neg ^ den_neg) | 1);
        r = cast(num.type(), r) * (num_neg | 1);
    }
    q = simplify(common_subexpression_elimination(q));
    r = simplify(common_subexpression_elimination(r));
    return {q, r};
}

// Lowers the division a / b (a and b must have the same integer type) to a
// div_round_to_zero intrinsic plus fix-up arithmetic. Division by zero
// yields zero. The result is passed through common-subexpression
// elimination and the simplifier before being returned.
Expr lower_euclidean_div(Expr a, Expr b) {
    internal_assert(a.type() == b.type());

    Expr q;

    if (a.type().is_uint()) {
        // IROperator's div_round_to_zero will replace this with a / b for
        // unsigned ops, so create the intrinsic directly.
        Expr b_is_const_zero = (b == 0);
        // Unless b is provably non-zero, OR in the (b == 0) flag so the
        // divisor handed to the intrinsic is one rather than zero.
        if (!can_prove(!b_is_const_zero)) {
            b = b | cast(a.type(), b_is_const_zero);
        }
        q = Call::make(a.type(), Call::div_round_to_zero, {a, b}, Call::PureIntrinsic);
        // Division by zero is defined to produce zero.
        q = select(b_is_const_zero, 0, q);
    } else {
        internal_assert(a.type().is_int());

        // Signed integer division sucks. It should be defined such
        // that it satisfies (a/b)*b + a%b = a, where 0 <= a%b < |b|,
        // i.e. Euclidean division.
        //
        // We additionally define division by zero to be zero, and
        // division of the most negative integer by -1 to be the most
        // negative integer.

        // See div_imp in IROperator.h for the C code we're trying to match.

        Expr zero = make_zero(a.type());
        Expr minus_one = make_const(a.type(), -1);

        // Masks that are all-ones (-1) when the condition holds and zero
        // otherwise: a is negative, b is negative, b is zero.
        Expr a_neg = a >> make_const(UInt(a.type().bits()), (a.type().bits() - 1));
        Expr b_neg = b >> make_const(UInt(b.type().bits()), (b.type().bits() - 1));
        Expr b_zero = select(b == zero, minus_one, zero);

        // Give the simplifier the chance to skip some of this nonsense
        if (can_prove(b != zero)) {
            b_zero = zero;
        }
        if (can_prove(a >= zero)) {
            a_neg = zero;
        } else if (can_prove(a < zero)) {
            a_neg = minus_one;
        }
        if (can_prove(b >= zero)) {
            b_neg = zero;
        } else if (can_prove(b < zero)) {
            b_neg = minus_one;
        }

        // If b is zero, set it to one instead to avoid faulting
        b -= b_zero;
        // If a is negative, add one to it to get the rounding to work out.
        a -= a_neg;
        // Do the C-style division
        q = Call::make(a.type(), Call::div_round_to_zero, {a, b}, Call::PureIntrinsic);
        // If a is negative, either add or subtract one, depending on
        // the sign of b, to fix the rounding. This can't overflow,
        // because we move the result towards zero in either case (we
        // add zero or one when q is negative, and subtract zero or
        // one when it's positive).
        q += a_neg & (~b_neg - b_neg);
        // Set the result to zero when b is zero
        q = q & ~b_zero;
    }

    q = simplify(common_subexpression_elimination(q));

    return q;
}

// Lowers the modulus a % b for integer types to a mod_round_to_zero
// intrinsic plus fix-up arithmetic. A zero divisor yields zero. The result
// is passed through common-subexpression elimination and the simplifier
// before being returned.
Expr lower_euclidean_mod(Expr a, Expr b) {
    Expr q;

    if (a.type().is_uint()) {
        Expr b_is_const_zero = (b == 0);
        // Unless b is provably non-zero, OR in the (b == 0) flag so the
        // divisor handed to the intrinsic is one rather than zero.
        if (!can_prove(!b_is_const_zero)) {
            b = b | cast(a.type(), b_is_const_zero);
        }
        q = Call::make(a.type(), Call::mod_round_to_zero, {a, b}, Call::PureIntrinsic);
        // Modulus by zero is defined to produce zero.
        q = select(b_is_const_zero, make_zero(a.type()), q);
    } else {
        internal_assert(a.type().is_int());

        Expr zero = make_zero(a.type());
        Expr minus_one = make_const(a.type(), -1);

        // Masks that are all-ones (-1) when the condition holds and zero
        // otherwise: a is negative, b is negative, b is zero.
        Expr a_neg = a >> make_const(UInt(a.type().bits()), (a.type().bits() - 1));
        Expr b_neg = b >> make_const(UInt(a.type().bits()), (a.type().bits() - 1));
        Expr b_zero = select(b == zero, minus_one, zero);

        // Give the simplifier the chance to skip some of this nonsense
        if (can_prove(b != zero)) {
            b_zero = zero;
        }
        if (can_prove(a >= zero)) {
            a_neg = zero;
        } else if (can_prove(a < zero)) {
            a_neg = minus_one;
        }
        if (can_prove(b >= zero)) {
            b_neg = zero;
        } else if (can_prove(b < zero)) {
            b_neg = minus_one;
        }

        // If a is negative, add one to get the rounding to work out
        a -= a_neg;
        // Do the mod, avoiding taking mod by zero
        q = Call::make(a.type(), Call::mod_round_to_zero, {a, (b | b_zero)}, Call::PureIntrinsic);
        // If a is negative, we either need to add b - 1 to the
        // result, or -b - 1, depending on the sign of b.
        q += (a_neg & ((b ^ b_neg) + ~b_neg));
        // If b is zero, return zero by masking off the current result.
        q = q & ~b_zero;
    }

    q = simplify(common_subexpression_elimination(q));

    return q;
}

// Lowers a left shift of `a` by a signed amount `b`: a non-negative amount
// shifts left, a negative amount shifts right by its magnitude. A constant
// `b` is resolved here; otherwise a select over both directions is built.
Expr lower_signed_shift_left(const Expr &a, const Expr &b) {
    internal_assert(b.type().is_int());
    if (auto const_int_b = as_const_int(b)) {
        const int64_t amount = *const_int_b;
        const bool shift_is_negative = amount < 0;
        // Take the magnitude in unsigned arithmetic. std::abs on the most
        // negative int64_t overflows (undefined behaviour); unsigned
        // negation is well defined for every input.
        const uint64_t magnitude =
            shift_is_negative ? (uint64_t(0) - (uint64_t)amount) : (uint64_t)amount;
        const Expr shift = make_const(UInt(a.type().bits()), magnitude);
        Expr val = shift_is_negative ? (a >> shift) : (a << shift);
        return common_subexpression_elimination(val);
    } else {
        // The abs() below uses Halide's abs operator. This eliminates the overflow
        // case for the most negative value because its result is unsigned.
        Expr b_unsigned = abs(b);
        Expr val = select(b >= 0, a << b_unsigned, a >> b_unsigned);
        return common_subexpression_elimination(val);
    }
}

// Lowers a right shift of `a` by a signed amount `b`: a non-negative amount
// shifts right, a negative amount shifts left by its magnitude. A constant
// `b` is resolved here; otherwise a select over both directions is built.
Expr lower_signed_shift_right(const Expr &a, const Expr &b) {
    internal_assert(b.type().is_int());
    if (auto const_int_b = as_const_int(b)) {
        const int64_t amount = *const_int_b;
        const bool shift_is_negative = amount < 0;
        // Take the magnitude in unsigned arithmetic. std::abs on the most
        // negative int64_t overflows (undefined behaviour); unsigned
        // negation is well defined for every input.
        const uint64_t magnitude =
            shift_is_negative ? (uint64_t(0) - (uint64_t)amount) : (uint64_t)amount;
        const Expr shift = make_const(UInt(a.type().bits()), magnitude);
        Expr val = shift_is_negative ? (a << shift) : (a >> shift);
        return common_subexpression_elimination(val);
    } else {
        // The abs() below uses Halide's abs operator. This eliminates the overflow
        // case for the most negative value because its result is unsigned.
        Expr b_unsigned = abs(b);
        Expr val = select(b >= 0, a >> b_unsigned, a << b_unsigned);
        return common_subexpression_elimination(val);
    }
}

// Lowers a mux call into a chain of selects. args[0] is the index and
// args[1..] are the candidate values; the last candidate is the fallback
// when no index comparison matches.
Expr lower_mux(const Call *mux) {
    internal_assert(mux->args.size() >= 2);

    // Compare against the scalar when the index is a broadcast.
    Expr selector = mux->args[0];
    if (const Broadcast *bc = selector.as<Broadcast>()) {
        selector = bc->value;
    }

    Expr chain = mux->args.back();
    const int num_candidates = (int)mux->args.size() - 1;
    // Walk the candidates from last to first so that the first candidate
    // ends up as the outermost select. Candidate k - 1 is stored in args[k].
    for (int k = num_candidates; k > 0; k--) {
        Expr matches = (selector == make_const(selector.type(), k - 1));
        chain = select(matches, mux->args[k], chain);
    }
    return chain;
}

// Round-to-nearest-integer with ties going to even, used to implement
// Halide::round. It deliberately avoids every C standard library call so
// that it vectorizes cleanly and works as a fallback on all platforms.
Expr lower_round_to_nearest_ties_to_even(const Expr &x) {
    const Type float_type = x.type();
    const Type bits_type = float_type.with_code(halide_type_uint);
    const Type int_type = float_type.with_code(halide_type_int);

    // Isolate the sign bit of x so constants can be given the same sign.
    Expr sign_bit = reinterpret(bits_type, x) & (cast(bits_type, 1) << (float_type.bits() - 1));

    // 0.5 with x's sign, and the value one bit pattern below it.
    Expr half_bits = reinterpret(bits_type, cast(float_type, 0.5f)) | sign_bit;
    Expr one_half = reinterpret(float_type, half_bits);
    Expr just_under_one_half = reinterpret(float_type, half_bits - 1);

    // The bit pattern of 1 with x's sign.
    Expr one_bits = reinterpret(bits_type, cast(float_type, 1)) | sign_bit;

    // Round to nearest, with ties going towards zero.
    Expr ix = cast(int_type, x + just_under_one_half);
    Expr rounded = cast(float_type, ix);

    // How far the rounded value is from the input.
    Expr residual = rounded - x;

    // All-ones masks for "the rounded value is odd" and "the input was a tie".
    Expr odd_mask = -cast(bits_type, ix & 1);
    Expr tie_mask = select(residual == one_half, cast(bits_type, -1), cast(bits_type, 0));

    // An odd result on a tie means we should have rounded the other way, so
    // take one (with x's sign) back off.
    Expr correction = reinterpret(float_type, odd_mask & tie_mask & one_bits);
    return common_subexpression_elimination(rounded - correction);
}

namespace {
// Reads an integer out of a metadata node. Returns std::nullopt when `value`
// is null, is not a ConstantAsMetadata, or does not wrap a ConstantInt.
std::optional<int64_t> get_md_int(llvm::Metadata *value) {
    if (!value) {
        return std::nullopt;
    }
    // dyn_cast<> returns null on a type mismatch. The previous cast<>
    // asserts instead and never returns null, which made the checks below
    // dead code and left a mismatched node unhandled.
    auto *cam = llvm::dyn_cast<llvm::ConstantAsMetadata>(value);
    if (!cam) {
        return std::nullopt;
    }
    auto *c = llvm::dyn_cast<llvm::ConstantInt>(cam->getValue());
    if (!c) {
        return std::nullopt;
    }
    return c->getSExtValue();
}

// Reads a boolean out of a metadata node: any non-zero integer is true.
// Returns std::nullopt whenever get_md_int does.
std::optional<bool> get_md_bool(llvm::Metadata *value) {
    if (auto r = get_md_int(value)) {
        return *r != 0;
    } else {
        return std::nullopt;
    }
}

// Reads a string out of a metadata node. Returns std::nullopt when `value`
// is null or is not an MDString.
std::optional<std::string> get_md_string(llvm::Metadata *value) {
    if (!value) {
        return std::nullopt;
    }
    llvm::MDString *c = llvm::dyn_cast<llvm::MDString>(value);
    if (c) {
        return c->getString().str();
    }
    return std::nullopt;
}
}  // namespace

// Resets `options` to a default-constructed llvm::TargetOptions and then
// fills it in from the Halide module flags stored on `module`
// (halide_use_soft_float_abi, halide_mabi and
// halide_per_instruction_fast_math_flags), plus a set of fixed settings.
void get_target_options(const llvm::Module &module, llvm::TargetOptions &options) {
    // FIXME: can this be migrated into `set_function_attributes_from_halide_target_options()`?
    const bool per_instruction_fast_math_flags =
        get_md_bool(module.getModuleFlag("halide_per_instruction_fast_math_flags")).value_or(false);
    const bool use_soft_float_abi =
        get_md_bool(module.getModuleFlag("halide_use_soft_float_abi")).value_or(false);
    const std::string mabi =
        get_md_string(module.getModuleFlag("halide_mabi")).value_or(std::string{});

    // The module-wide floating-point switches are turned on exactly when
    // fast-math flags are not carried per instruction.
    const bool module_wide_fp_flags = !per_instruction_fast_math_flags;

    options = llvm::TargetOptions();

    // Floating-point behaviour.
    options.AllowFPOpFusion =
        module_wide_fp_flags ? llvm::FPOpFusion::Fast : llvm::FPOpFusion::Strict;
    options.UnsafeFPMath = module_wide_fp_flags;
    options.NoInfsFPMath = module_wide_fp_flags;
    options.NoNaNsFPMath = module_wide_fp_flags;
    options.HonorSignDependentRoundingFPMathOption = module_wide_fp_flags;
    if (use_soft_float_abi) {
        options.FloatABIType = llvm::FloatABI::Soft;
    } else {
        options.FloatABIType = llvm::FloatABI::Hard;
    }

    // Fixed code generation settings.
    options.NoZerosInBSS = false;
    options.GuaranteedTailCallOpt = false;
    options.FunctionSections = true;
    options.UseInitArray = true;
#if LLVM_VERSION >= 190
    options.MCOptions.X86RelaxRelocations = false;
#else
    options.RelaxELFRelocations = false;
#endif
    options.MCOptions.ABIName = mabi;
}

// Copies the target triple and a fixed subset of the Halide module flags
// from `from` to `to`. A flag that is absent on `from` (or has an
// unexpected metadata kind) is not added to `to`.
void clone_target_options(const llvm::Module &from, llvm::Module &to) {
    to.setTargetTriple(from.getTargetTriple());

    llvm::LLVMContext &context = to.getContext();

    // Flags stored as booleans.
    static const char *const bool_flags[] = {
        "halide_use_soft_float_abi",
        "halide_use_pic",
    };
    for (const char *flag : bool_flags) {
        auto md = get_md_bool(from.getModuleFlag(flag));
        if (!md) {
            continue;
        }
        to.addModuleFlag(llvm::Module::Warning, flag, *md ? 1 : 0);
    }

    // Flags stored as strings.
    static const char *const string_flags[] = {
        "halide_mcpu_target",
        "halide_mcpu_tune",
        "halide_mattrs",
    };
    for (const char *flag : string_flags) {
        auto md = get_md_string(from.getModuleFlag(flag));
        if (!md) {
            continue;
        }
        to.addModuleFlag(llvm::Module::Warning, flag, llvm::MDString::get(context, *md));
    }
}

// Creates an llvm::TargetMachine for `module`, configured from the module's
// target triple and its Halide module flags. If no LLVM target matches the
// triple, the lookup error and the list of registered targets are printed
// before the internal assertion fires.
std::unique_ptr<llvm::TargetMachine> make_target_machine(const llvm::Module &module) {
    std::string lookup_error;
    const llvm::Target *llvm_target =
        llvm::TargetRegistry::lookupTarget(module.getTargetTriple(), lookup_error);
    if (llvm_target == nullptr) {
        std::cout << lookup_error << "\n";
        llvm::TargetRegistry::printRegisteredTargetsForVersion(llvm::outs());
    }
    auto triple = llvm::Triple(module.getTargetTriple());
    internal_assert(llvm_target) << "Could not create LLVM target for " << triple.str() << "\n";

    // Settings carried as module flags; each falls back to a default when
    // the flag is missing.
    const bool use_pic =
        get_md_bool(module.getModuleFlag("halide_use_pic")).value_or(true);
    const bool use_large_code_model =
        get_md_bool(module.getModuleFlag("halide_use_large_code_model")).value_or(false);
    const std::string cpu =
        get_md_string(module.getModuleFlag("halide_mcpu_target")).value_or(std::string{});
    const std::string features =
        get_md_string(module.getModuleFlag("halide_mattrs")).value_or(std::string{});

    llvm::TargetOptions options;
    get_target_options(module, options);

    const auto reloc_model = use_pic ? llvm::Reloc::PIC_ : llvm::Reloc::Static;
    const auto code_model = use_large_code_model ? llvm::CodeModel::Large : llvm::CodeModel::Small;

    // createTargetMachine takes the triple as an llvm::Triple from LLVM 21
    // onwards and as a string before that.
    std::unique_ptr<llvm::TargetMachine> machine(llvm_target->createTargetMachine(
#if LLVM_VERSION >= 210
        triple,
#else
        triple.str(),
#endif
        cpu,
        features,
        options,
        reloc_model,
        code_model,
        CodeGenOptLevel::Aggressive));
    return machine;
}

// Applies per-function attributes to `fn`, derived from the Halide module
// flags stored on the module that contains it: target CPU, tuning CPU,
// target features and, when present and non-zero, a fixed vscale range.
// Also marks the function as non-throwing and must-progress, and disables
// reciprocal estimates.
void set_function_attributes_from_halide_target_options(llvm::Function &fn) {
    llvm::Module &module = *fn.getParent();

    const int64_t effective_vscale =
        get_md_int(module.getModuleFlag("halide_effective_vscale")).value_or(0);
    const std::string features =
        get_md_string(module.getModuleFlag("halide_mattrs")).value_or(std::string{});
    const std::string tune_cpu =
        get_md_string(module.getModuleFlag("halide_mcpu_tune")).value_or(std::string{});
    const std::string target_cpu =
        get_md_string(module.getModuleFlag("halide_mcpu_target")).value_or(std::string{});

    fn.addFnAttr("target-cpu", target_cpu);
    fn.addFnAttr("tune-cpu", tune_cpu);
    fn.addFnAttr("target-features", features);

    // Halide-generated IR is not exception-safe: nothing may throw inside a
    // Halide function, nothing may unwind out of one, and everything a
    // Halide function calls must not unwind either.
    fn.setDoesNotThrow();

    // Loops without side effects are undefined, although asserts and
    // external calls *might* still abort.
    fn.setMustProgress();

    // Approximate reciprocals for division are too inaccurate even for us,
    // so switch them off.
    fn.addFnAttr("reciprocal-estimates", "none");

    // A fixed vscale, when one is asserted, is recorded as a vscale_range
    // attribute whose minimum and maximum are both that value.
    if (effective_vscale != 0) {
        fn.addFnAttr(llvm::Attribute::getWithVScaleRangeArgs(
            module.getContext(), effective_vscale, effective_vscale));
    }
}

// Embeds the bitcode of module M and the string `halide_command` into M as
// two private constant globals named llvm.embedded.module and llvm.cmdline.
// They are placed in the .llvmbc / .llvmcmd sections (or __LLVM,__bitcode /
// __LLVM,__cmdline for Mach-O), and llvm.compiler.used is rebuilt so that it
// references both along with its previous entries.
void embed_bitcode(llvm::Module *M, const string &halide_command) {
    // Save llvm.compiler.used and remove it.
    SmallVector<Constant *, 2> used_array;
    SmallVector<GlobalValue *, 4> used_globals;
    llvm::Type *used_element_type = PointerType::get(llvm::Type::getInt8Ty(M->getContext()), 0);
    GlobalVariable *used = collectUsedGlobalVariables(*M, used_globals, true);
    // Keep every previously-used global except the two this function is
    // about to recreate.
    for (auto *GV : used_globals) {
        if (GV->getName() != "llvm.embedded.module" &&
            GV->getName() != "llvm.cmdline") {
            used_array.push_back(
                ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, used_element_type));
        }
    }
    if (used) {
        used->eraseFromParent();
    }

    // Embed the bitcode for the llvm module.
    std::string data;
    Triple triple(M->getTargetTriple());
    // Create a constant that contains the bitcode.
    llvm::raw_string_ostream OS(data);
    llvm::WriteBitcodeToFile(*M, OS, /* ShouldPreserveUseListOrder */ true);
    ArrayRef<uint8_t> module_data =
        ArrayRef<uint8_t>((const uint8_t *)OS.str().data(), OS.str().size());

    llvm::Constant *module_constant =
        llvm::ConstantDataArray::get(M->getContext(), module_data);
    llvm::GlobalVariable *GV = new llvm::GlobalVariable(
        *M, module_constant->getType(), true, llvm::GlobalValue::PrivateLinkage,
        module_constant);
    GV->setSection((triple.getObjectFormat() == Triple::MachO) ? "__LLVM,__bitcode" : ".llvmbc");
    used_array.push_back(
        ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, used_element_type));
    // If the module already has an llvm.embedded.module global, the new one
    // takes over its name and the old one is erased.
    if (llvm::GlobalVariable *old =
            M->getGlobalVariable("llvm.embedded.module", true)) {
        internal_assert(old->hasOneUse()) << "llvm.embedded.module can only be used once in llvm.compiler.used";
        GV->takeName(old);
        old->eraseFromParent();
    } else {
        GV->setName("llvm.embedded.module");
    }

    // Embed command-line options.
    ArrayRef<uint8_t> command_line_data(const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(halide_command.data())),
                                        halide_command.size());
    llvm::Constant *command_line_constant =
        llvm::ConstantDataArray::get(M->getContext(), command_line_data);
    GV = new llvm::GlobalVariable(*M, command_line_constant->getType(), true,
                                  llvm::GlobalValue::PrivateLinkage,
                                  command_line_constant);

    GV->setSection((triple.getObjectFormat() == Triple::MachO) ? "__LLVM,__cmdline" : ".llvmcmd");
    used_array.push_back(
        ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, used_element_type));
    // Same replacement scheme as above, for an existing llvm.cmdline global.
    if (llvm::GlobalVariable *old =
            M->getGlobalVariable("llvm.cmdline", true)) {
        internal_assert(old->hasOneUse()) << "llvm.cmdline can only be used once in llvm.compiler.used";
        GV->takeName(old);
        old->eraseFromParent();
    } else {
        GV->setName("llvm.cmdline");
    }

    if (!used_array.empty()) {
        // Recreate llvm.compiler.used.
        ArrayType *ATy = ArrayType::get(used_element_type, used_array.size());
        auto *new_used = new GlobalVariable(
            *M, ATy, false, llvm::GlobalValue::AppendingLinkage,
            llvm::ConstantArray::get(ATy, used_array), "llvm.compiler.used");
        new_used->setSection("llvm.metadata");
    }
}

// Lowers a concat_bits intrinsic into casts, shifts and ORs. The first
// argument lands in the least significant bits of the result; each later
// argument is placed directly above the bits of the one before it.
Expr lower_concat_bits(const Call *op) {
    internal_assert(op->is_intrinsic(Call::concat_bits));
    internal_assert(!op->args.empty());

    const Type wide_type = op->type;
    Expr packed = make_zero(wide_type);
    int bit_offset = 0;
    for (size_t i = 0; i < op->args.size(); i++) {
        const Expr &piece = op->args[i];
        Expr positioned = cast(wide_type, piece) << bit_offset;
        packed = packed | positioned;
        bit_offset += piece.type().bits();
    }
    return packed;
}

// Lowers an extract_bits call: args[0] is the source value and args[1] the
// shift amount. The source is viewed as an unsigned integer, shifted right,
// converted to the unsigned form of the result type, and finally
// reinterpreted as the result type when that type is not itself unsigned.
Expr lower_extract_bits(const Call *op) {
    const Type unsigned_result_type = op->type.with_code(halide_type_uint);

    // Shifting and casting as a uint zero-fills any out-of-range bits.
    Expr source = op->args[0];
    if (!source.type().is_uint()) {
        source = reinterpret(source.type().with_code(halide_type_uint), source);
    }

    Expr extracted = cast(unsigned_result_type, source >> op->args[1]);
    if (op->type != extracted.type()) {
        extracted = reinterpret(op->type, extracted);
    }
    return simplify(extracted);
}

}  // namespace Internal
}  // namespace Halide